#define DM_TASK_QUEUE_SIZE 4

#define _dm_mtx_lock() snrt_mutex_acquire(&dm_p->mutex)
#define _dm_mtx_release() snrt_mutex_release(&dm_p->mutex)

#define DM_STATUS_COMPLETE_ID 0
#define DM_STATUS_NEXT_ID 1
#define DM_STATUS_BUSY 2
#define DM_STATUS_WOULD_BLOCK 3
#ifdef DM_DEBUG_LEVEL
#include "printf.h"
#define _DM_PRINTF(...) \
    { printf("[dm] "__VA_ARGS__); }
#define DM_PRINTF(d, ...)        \
    if (DM_DEBUG_LEVEL >= d) {   \
        _DM_PRINTF(__VA_ARGS__); \
    }
#else
#define DM_PRINTF(d, ...)
#endif
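
/*
 * Reconstructed sketch (not verbatim from this excerpt): the status-request
 * enum and the task descriptor stored in the queue below. Only the constants
 * and fields referenced elsewhere in this file are grounded; the remaining
 * names and the numeric enum values are assumptions.
 */
typedef enum en_stat {
    STAT_NONE = 0,       // no request pending (assumed)
    STAT_WAIT_IDLE = 1,  // report when the DMA engine is idle
    STAT_EXIT = 2,       // leave dm_main()
    STAT_READY = 4,      // ready handshake
} en_stat_t;

typedef struct {
    uint64_t src;    // source address
    uint64_t dst;    // destination address
    uint32_t size;   // transfer (or row) size in bytes
    uint32_t sstrd;  // source stride (2D transfers)
    uint32_t dstrd;  // destination stride (2D transfers)
    uint32_t nreps;  // number of rows (2D transfers)
    uint32_t cfg;    // DMA configuration flags
    uint32_t twod;   // nonzero for a 2D transfer (field name assumed)
} dm_task_t;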
typedef struct {
    dm_task_t queue[DM_TASK_QUEUE_SIZE];
    volatile uint32_t queue_back;
    volatile uint32_t queue_front;
    volatile uint32_t queue_fill;
    volatile uint32_t mutex;
    volatile en_stat_t stat_q;
    volatile uint32_t stat_p;
    volatile uint32_t stat_pvalid;
    volatile uint32_t dm_wfi;
} dm_t;
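
/**
 * @brief Per-thread pointer to the data-mover state in cluster L1/TCDM.
 */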
extern __thread volatile dm_t *dm_p;
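
/**
 * @brief Global pointer through which the DM core publishes the data-mover
 * state to the other cores during dm_init().
 */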
extern volatile dm_t *volatile dm_p_global;
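
/*
 * Sleep/wake primitives between the DM core and the compute cores. With
 * DM_USE_GLOBAL_CLINT the wakeup is delivered as a CLINT software interrupt;
 * otherwise the cluster-local interrupt lines are used.
 */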
#ifdef DM_USE_GLOBAL_CLINT
inline void wfi_dm(uint32_t cluster_core_idx) {
    (void)cluster_core_idx;
    // Sleep until the CLINT software interrupt fires, then clear it. The
    // original body is not shown in this excerpt; snrt_wfi() and
    // snrt_int_sw_clear() are assumed here.
    snrt_wfi();
    snrt_int_sw_clear(snrt_hartid());
}
inline void wake_dm(void) {
    uint32_t basehart = snrt_cluster_core_base_hartid();
    snrt_int_sw_set(basehart + snrt_cluster_dm_core_idx());
}
#else
inline void wfi_dm(uint32_t cluster_core_idx) {
    __atomic_add_fetch(&dm_p->dm_wfi, 1, __ATOMIC_RELAXED);
    snrt_wfi();
    snrt_int_cluster_clr(1 << cluster_core_idx);
    __atomic_add_fetch(&dm_p->dm_wfi, -1, __ATOMIC_RELAXED);
}
inline void wake_dm(void) {
    // wait until the DM core is actually sleeping before raising the
    // cluster-local interrupt
    while (!__atomic_load_n(&dm_p->dm_wfi, __ATOMIC_RELAXED))
        ;
    snrt_int_cluster_set(1 << snrt_cluster_compute_core_num());
}
#endif  // DM_USE_GLOBAL_CLINT
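
/**
 * @brief Initialize the data mover: the DM core enables its wakeup interrupt
 * and allocates the shared dm_t struct in L1, the other cores wait until that
 * struct has been published.
 */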
inline void dm_init(void) {
    if (snrt_is_dm_core()) {
#ifdef DM_USE_GLOBAL_CLINT
        snrt_interrupt_enable(IRQ_M_SOFT);
#else
        snrt_interrupt_enable(IRQ_M_CLUSTER);
#endif
        dm_p = (dm_t *)snrt_l1_alloc(sizeof(dm_t));
        snrt_memset((void *)dm_p, 0, sizeof(dm_t));
        dm_p_global = dm_p;
    } else {
        // compute cores: wait for the DM core to publish the struct
        while (!dm_p_global)
            ;
        dm_p = dm_p_global;
    }
}
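
/**
 * @brief Main loop of the DM core: pops queued transfers and issues them on
 * the DMA engine, answers status requests from the compute cores, and sleeps
 * when there is nothing to do, until an exit is requested via dm_exit().
 */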
inline void dm_main(void) {
    volatile dm_task_t *t;
    uint32_t do_exit = 0;
    uint32_t cluster_core_idx = snrt_cluster_core_idx();

    DM_PRINTF(10, "enter main\n");

    while (!do_exit) {
        // issue the next queued transfer, if any
        if (dm_p->queue_fill) {
            // wait until the DMA engine can accept another transfer
            while (__builtin_sdma_stat(DM_STATUS_WOULD_BLOCK))
                ;

            t = &dm_p->queue[dm_p->queue_back];

            if (t->twod) {
                DM_PRINTF(10, "start twod\n");
                __builtin_sdma_start_twod(t->src, t->dst, t->size, t->sstrd,
                                          t->dstrd, t->nreps, t->cfg);
            } else {
                DM_PRINTF(10, "start oned\n");
                __builtin_sdma_start_oned(t->src, t->dst, t->size, t->cfg);
            }

            // pop the task
            dm_p->queue_back = (dm_p->queue_back + 1) % DM_TASK_QUEUE_SIZE;
            __atomic_add_fetch(&dm_p->queue_fill, -1, __ATOMIC_RELAXED);
        }

        // serve pending status requests
        switch (dm_p->stat_q) {
            case STAT_WAIT_IDLE:
                // reply only once the DMA engine has drained
                if (__builtin_sdma_stat(DM_STATUS_BUSY) == 0) {
                    DM_PRINTF(50, "idle\n");
                    dm_p->stat_pvalid = 1;
                    dm_p->stat_q = STAT_NONE;
                }
                break;
            case STAT_EXIT:
                do_exit = 1;
                break;
            case STAT_READY:
                DM_PRINTF(50, "ready\n");
                dm_p->stat_pvalid = 1;
                dm_p->stat_q = STAT_NONE;
                break;
            default:
                break;
        }

        // sleep when the queue is empty and no request is pending
        if (!dm_p->queue_fill && !dm_p->stat_q) {
            wfi_dm(cluster_core_idx);
        }
    }

    DM_PRINTF(10, "dm: exit\n");
#ifdef DM_USE_GLOBAL_CLINT
    snrt_interrupt_disable(IRQ_M_SOFT);
#else
    snrt_interrupt_disable(IRQ_M_CLUSTER);
#endif
}
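
/**
 * @brief Ask the DM core to leave dm_main().
 */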
inline void dm_exit(void) {
    dm_p->stat_q = STAT_EXIT;
    // wake the DM core so it observes the exit request
    wake_dm();
}
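
/**
 * @brief Queue an asynchronous 1D copy of n bytes from src to dest. Blocks
 * only while the task queue is full; call dm_wait() to wait for completion.
 */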
inline void dm_memcpy_async(void *dest, const void *src, size_t n) {
    uint32_t s;
    volatile dm_task_t *t;

    DM_PRINTF(10, "dm_memcpy_async %#x -> %#x size %d\n", src, dest,
              (uint32_t)n);

    // wait for a free slot in the task queue
    do {
        s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
    } while (s >= DM_TASK_QUEUE_SIZE);

    _dm_mtx_lock();

    // fill the task descriptor at the front of the queue
    t = &dm_p->queue[dm_p->queue_front];
    t->src = (uint64_t)src;
    t->dst = (uint64_t)dest;
    t->size = (uint32_t)n;
    t->twod = 0;
    t->cfg = 0;

    // publish the task
    __atomic_add_fetch(&dm_p->queue_fill, 1, __ATOMIC_RELAXED);
    dm_p->queue_front = (dm_p->queue_front + 1) % DM_TASK_QUEUE_SIZE;

    _dm_mtx_release();
}
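
/**
 * @brief Queue an asynchronous 2D copy: nreps rows of size bytes each, with
 * source stride sstrd, destination stride dstrd and DMA configuration cfg.
 */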
inline void dm_memcpy2d_async(uint64_t src, uint64_t dst, uint32_t size,
                              uint32_t sstrd, uint32_t dstrd, uint32_t nreps,
                              uint32_t cfg) {
    uint32_t s;
    volatile dm_task_t *t;
    DM_PRINTF(10, "dm_memcpy2d_async %#x -> %#x size %d\n", src, dst, size);
    // wait for a free slot in the task queue
    do {
        s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
    } while (s >= DM_TASK_QUEUE_SIZE);
    _dm_mtx_lock();
    // fill and publish the 2D task descriptor
    t = &dm_p->queue[dm_p->queue_front];
    t->src = src;
    t->dst = dst;
    t->size = size;
    t->sstrd = sstrd;
    t->dstrd = dstrd;
    t->nreps = nreps;
    t->twod = 1;
    t->cfg = cfg;
    __atomic_add_fetch(&dm_p->queue_fill, 1, __ATOMIC_RELAXED);
    dm_p->queue_front = (dm_p->queue_front + 1) % DM_TASK_QUEUE_SIZE;
    _dm_mtx_release();
}
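
/**
 * @brief Wake the DM core so it starts processing queued transfers.
 */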
inline void dm_start(void) { wake_dm(); }
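
/**
 * @brief Block until the task queue has drained and the DMA engine reports
 * idle.
 */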
inline void dm_wait(void) {
    uint32_t s;
    // first wait for the task queue to drain
    do {
        s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
    } while (s != 0);
    // then request an idle notification from the DM core
    _dm_mtx_lock();
    dm_p->stat_pvalid = 0;
    dm_p->stat_q = STAT_WAIT_IDLE;
    wake_dm();
    while (!dm_p->stat_pvalid)
        ;
    _dm_mtx_release();
}
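
/**
 * @brief Handshake with the DM core: returns once it has woken up and
 * acknowledged the request.
 */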
inline void dm_wait_ready(void) {
    _dm_mtx_lock();
    dm_p->stat_pvalid = 0;
    dm_p->stat_q = STAT_READY;
    wake_dm();
    while (!dm_p->stat_pvalid)
        ;
    _dm_mtx_release();
}
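
/*
 * Example usage (illustrative sketch, not part of the original header): every
 * core calls dm_init(), the DM core then runs dm_main() while a compute core
 * queues transfers and synchronizes with dm_wait(). Buffer names and the size
 * below are made up for illustration.
 *
 *     dm_init();
 *     if (snrt_is_dm_core()) {
 *         dm_main();                              // serve transfers until dm_exit()
 *     } else if (snrt_cluster_core_idx() == 0) {
 *         dm_wait_ready();                        // make sure the DM core is up
 *         dm_memcpy_async(l1_buf, l3_buf, 1024);  // queue a 1D copy
 *         dm_start();                             // kick the DM core
 *         dm_wait();                              // block until the DMA is idle
 *         dm_exit();                              // shut the DM core down
 *     }
 */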