Snitch Runtime
Loading...
Searching...
No Matches
dma.h
Go to the documentation of this file.
1// Copyright 2023 ETH Zurich and University of Bologna.
2// Licensed under the Apache License, Version 2.0, see LICENSE for details.
3// SPDX-License-Identifier: Apache-2.0
4
5#ifndef SNRT_SUPPORTS_DMA
6#include <string.h>
7#endif
8
14#pragma once
15
16#include <math.h>
17
19typedef uint32_t snrt_dma_txid_t;
20
/**
 * @brief Start an asynchronous 1D DMA transfer with 64-bit wide pointers on
 *        a specific DMA channel.
 * @param dst Destination address.
 * @param src Source address.
 * @param size Number of bytes to copy.
 * @param channel DMA channel to use. It is bound to the asm with an "i"
 *                (immediate) constraint, so it has to be a compile-time
 *                constant where the asm statement is emitted.
 * @return The transfer ID written by `dmcpyi`, or 0 when SNRT_SUPPORTS_DMA
 *         is not defined.
 * @note Without SNRT_SUPPORTS_DMA the copy is done synchronously with
 *       memcpy() and `channel` is ignored.
 */
35static inline uint32_t snrt_dma_start_1d(uint64_t dst, uint64_t src,
36 size_t size,
37 const uint32_t channel = 0) {
38#ifdef SNRT_SUPPORTS_DMA
    // Split each 64-bit address into two 32-bit halves, one register operand
    // per half.
39 uint32_t dst_lo = dst & 0xFFFFFFFF;
40 uint32_t dst_hi = dst >> 32;
41 uint32_t src_lo = src & 0xFFFFFFFF;
42 uint32_t src_hi = src >> 32;
43 uint32_t txid;
44
    // Program source and destination, then issue the copy. The immediate of
    // `dmcpyi` carries the channel in bits [..:2]; the low bits are 0b00 here
    // (the 2D variant uses 0b10 - presumably a 1D/2D mode select, confirm
    // against the ISA documentation).
45 asm volatile(
46 "dmsrc %[src_lo], %[src_hi] \n"
47 "dmdst %[dst_lo], %[dst_hi] \n"
48 "dmcpyi %[txid], %[size], (%[channel] << 2) | 0b00 \n"
49 : [ txid ] "=r"(txid)
50 : [ src_lo ] "r"(src_lo), [ src_hi ] "r"(src_hi),
51 [ dst_lo ] "r"(dst_lo), [ dst_hi ] "r"(dst_hi), [ size ] "r"(size),
52 [ channel ] "i"(channel));
53
54 return txid;
55#else
    // Software fallback: blocking copy, always reports transfer ID 0.
56 memcpy((void *)dst, (const void *)src, size);
57 return 0;
58#endif
59}
60
67static inline uint32_t snrt_dma_start_1d(volatile void *dst, volatile void *src,
68 size_t size,
69 const uint32_t channel = 0) {
70 return snrt_dma_start_1d((uint64_t)dst, (uint64_t)src, size, channel);
71}
72
/**
 * @brief Set AW user field of the DMA's AXI interface.
 * @param field 64-bit value; it is handed to the `dmuser` instruction as a
 *              low and a high 32-bit half.
 * @note Compiles to an empty function when SNRT_SUPPORTS_DMA is not defined.
 */
80inline void snrt_dma_set_awuser(uint64_t field) {
81#ifdef SNRT_SUPPORTS_DMA
    // Split the 64-bit field into two 32-bit register operands.
82 uint32_t user_low = (uint32_t)(field);
83 uint32_t user_high = (uint32_t)(field >> 32);
84 asm volatile("dmuser %[user_low], %[user_high] \n"
85 :
86 : [ user_low ] "r"(user_low), [ user_high ] "r"(user_high));
87#endif
88}
89
/**
 * @brief Enable multicast for successive transfers.
 * @param mask Value stored in the `mask` field of the collective operation
 *             descriptor `op`.
 */
// NOTE(review): listing lines 98 and 101 are absent from this view, so the
// declaration of `op` and the statement that consumes it are not visible
// here - confirm against the original header.
97inline void snrt_dma_enable_multicast(uint64_t mask) {
99 op.f.opcode = SNRT_COLLECTIVE_MULTICAST;
100 op.f.mask = mask;
102}
103
/**
 * @brief Enable reduction operations for successive transfers.
 * @param mask Value stored in the `mask` field of the collective operation
 *             descriptor `op`.
 * @param opcode Collective opcode stored in the `opcode` field of `op`.
 */
// NOTE(review): listing lines 114 and 117 are absent from this view, so the
// declaration of `op` and the statement that consumes it are not visible
// here - confirm against the original header.
112inline void snrt_dma_enable_reduction(uint64_t mask,
113 snrt_collective_opcode_t opcode) {
115 op.f.opcode = opcode;
116 op.f.mask = mask;
118}
119
125
131
/**
 * @brief Start an asynchronous reduction 1D DMA transfer with 64-bit wide
 *        pointers.
 * @param dst Destination address.
 * @param src Source address.
 * @param size Transfer size, forwarded to snrt_dma_start_1d().
 * @param mask Mask passed to snrt_dma_enable_reduction().
 * @param opcode Collective opcode passed to snrt_dma_enable_reduction().
 * @param channel DMA channel, forwarded to snrt_dma_start_1d().
 * @return The transfer ID returned by snrt_dma_start_1d().
 */
140static inline uint32_t snrt_dma_start_1d_reduction(
141 uint64_t dst, uint64_t src, size_t size, uint64_t mask,
142 snrt_collective_opcode_t opcode, const uint32_t channel = 0) {
    // Reduction is enabled before the transfer is issued.
143 snrt_dma_enable_reduction(mask, opcode);
144 uint32_t txid = snrt_dma_start_1d(dst, src, size, channel);
    // NOTE(review): listing line 145 is absent from this view; presumably it
    // disables reduction again after issuing the transfer - confirm.
146 return txid;
147}
148
157static inline uint32_t snrt_dma_start_1d_reduction(
158 uint64_t dst, uint64_t src, size_t size, snrt_comm_t comm,
159 snrt_collective_opcode_t opcode, const uint32_t channel = 0) {
160 uint64_t mask = snrt_get_collective_mask(comm);
161 uint32_t txid =
162 snrt_dma_start_1d_reduction(dst, src, size, mask, opcode, channel);
163 return txid;
164}
165
/**
 * @brief Start an asynchronous multicast 1D DMA transfer with 64-bit wide
 *        pointers.
 * @param dst Destination address.
 * @param src Source address.
 * @param size Transfer size, forwarded to snrt_dma_start_1d().
 * @param mask Multicast mask (not referenced by the lines visible here, see
 *             the note in the body).
 * @param channel DMA channel, forwarded to snrt_dma_start_1d().
 * @return The transfer ID returned by snrt_dma_start_1d().
 */
173static inline uint32_t snrt_dma_start_1d_mcast(uint64_t dst, uint64_t src,
174 size_t size, uint64_t mask,
175 const uint32_t channel = 0) {
    // NOTE(review): listing lines 176 and 178 are absent from this view;
    // presumably they enable multicast with `mask` before the transfer and
    // disable it afterwards - confirm against the original header.
177 uint32_t txid = snrt_dma_start_1d(dst, src, size, channel);
179 return txid;
180}
181
189static inline uint32_t snrt_dma_start_1d_mcast(uint64_t dst, uint64_t src,
190 size_t size, snrt_comm_t comm,
191 const uint32_t channel = 0) {
192 uint64_t mask = snrt_get_collective_mask(comm);
193 uint32_t txid = snrt_dma_start_1d_mcast(dst, src, size, mask, channel);
194 return txid;
195}
196
205static inline uint32_t snrt_dma_start_1d_reduction(
206 volatile void *dst, volatile void *src, size_t size, uint64_t mask,
207 snrt_collective_opcode_t opcode, const uint32_t channel = 0) {
208 return snrt_dma_start_1d_reduction((uint64_t)dst, (uint64_t)src, size, mask,
209 opcode, channel);
210}
211
220static inline uint32_t snrt_dma_start_1d_mcast(volatile void *dst,
221 volatile void *src, size_t size,
222 uint64_t mask,
223 const uint32_t channel = 0) {
224 return snrt_dma_start_1d_mcast((uint64_t)dst, (uint64_t)src, size, mask,
225 channel);
226}
227
246static inline snrt_dma_txid_t snrt_dma_start_2d(uint64_t dst, uint64_t src,
247 size_t size, size_t dst_stride,
248 size_t src_stride,
249 size_t repeat,
250 const uint32_t channel = 0) {
251#ifdef SNRT_SUPPORTS_DMA
252 uint32_t dst_lo = dst & 0xFFFFFFFF;
253 uint32_t dst_hi = dst >> 32;
254 uint32_t src_lo = src & 0xFFFFFFFF;
255 uint32_t src_hi = src >> 32;
256 uint32_t txid;
257
258 asm volatile(
259 "dmsrc %[src_lo], %[src_hi] \n"
260 "dmdst %[dst_lo], %[dst_hi] \n"
261 "dmstr %[src_stride], %[dst_stride] \n"
262 "dmrep %[repeat] \n"
263 "dmcpyi %[txid], %[size], (%[channel] << 2) | 0b10 \n"
264 : [ txid ] "=r"(txid)
265 : [ src_lo ] "r"(src_lo), [ src_hi ] "r"(src_hi),
266 [ dst_lo ] "r"(dst_lo), [ dst_hi ] "r"(dst_hi),
267 [ dst_stride ] "r"(dst_stride), [ src_stride ] "r"(src_stride),
268 [ repeat ] "r"(repeat), [ size ] "r"(size), [ channel ] "i"(channel));
269
270 return txid;
271#else
272 // TODO(colluca): we can implement this as a series of memcpy calls
273 return 0;
274#endif
275}
276
284static inline uint32_t snrt_dma_start_2d(volatile void *dst, volatile void *src,
285 size_t size, size_t dst_stride,
286 size_t src_stride, size_t repeat,
287 const uint32_t channel = 0) {
288 return snrt_dma_start_2d((uint64_t)dst, (uint64_t)src, size, dst_stride,
289 src_stride, repeat, channel);
290}
291
/**
 * @brief Start an asynchronous, multicast 2D DMA transfer with 64-bit wide
 *        pointers.
 * @param dst Destination address.
 * @param src Source address.
 * @param size Size of each 1D transfer, forwarded to snrt_dma_start_2d().
 * @param dst_stride Destination stride, forwarded to snrt_dma_start_2d().
 * @param src_stride Source stride, forwarded to snrt_dma_start_2d().
 * @param repeat Number of 1D transfers, forwarded to snrt_dma_start_2d().
 * @param mask Multicast mask (not referenced by the lines visible here, see
 *             the note in the body).
 * @param channel DMA channel, forwarded to snrt_dma_start_2d().
 * @return The transfer ID returned by snrt_dma_start_2d().
 */
// NOTE(review): `mask` is uint32_t here while the 1D multicast variants take
// a uint64_t mask - check whether the narrower type is intentional.
301static inline uint32_t snrt_dma_start_2d_mcast(uint64_t dst, uint64_t src,
302 size_t size, size_t dst_stride,
303 size_t src_stride, size_t repeat,
304 uint32_t mask,
305 const uint32_t channel = 0) {
    // NOTE(review): listing lines 306 and 309 are absent from this view;
    // presumably they enable multicast with `mask` before the transfer and
    // disable it afterwards - confirm against the original header.
307 uint32_t txid = snrt_dma_start_2d(dst, src, size, dst_stride, src_stride,
308 repeat, channel);
310 return txid;
311}
312
321static inline uint32_t snrt_dma_start_2d_mcast(volatile void *dst,
322 volatile void *src, size_t size,
323 size_t dst_stride,
324 size_t src_stride, size_t repeat,
325 uint32_t mask,
326 const uint32_t channel = 0) {
327 return snrt_dma_start_2d_mcast((uint64_t)dst, (uint64_t)src, size,
328 dst_stride, src_stride, repeat, mask,
329 channel);
330}
331
/**
 * @brief Block until a DMA transfer finishes on a specific DMA channel.
 * @param txid Transfer ID to wait for.
 * @param channel DMA channel to poll. It is bound to the asm with an "i"
 *                (immediate) constraint, so it has to be a compile-time
 *                constant where the asm statement is emitted.
 * @note Compiles to an empty function when SNRT_SUPPORTS_DMA is not defined.
 */
342static inline void snrt_dma_wait(snrt_dma_txid_t txid,
343 const uint32_t channel = 0) {
344#ifdef SNRT_SUPPORTS_DMA
    // Busy-wait: read the channel status (selector 0) into t0 and loop back
    // while it is below `txid` (unsigned comparison). t0 is declared as
    // clobbered because it is used as scratch register.
345 asm volatile(
346 "1: \n"
347 "dmstati t0, (%[channel] << 2) | 0 \n"
348 "bltu t0, %[txid], 1b \n"
349 :
350 : [ txid ] "r"(txid), [ channel ] "i"(channel)
351 : "t0");
352#endif
353}
354
/**
 * @brief Block until a specific DMA channel is idle.
 * @param channel DMA channel to poll. It is bound to the asm with an "i"
 *                (immediate) constraint, so it has to be a compile-time
 *                constant where the asm statement is emitted.
 * @note Compiles to an empty function when SNRT_SUPPORTS_DMA is not defined.
 */
364static inline void snrt_dma_wait_all(const uint32_t channel = 0) {
365#ifdef SNRT_SUPPORTS_DMA
366 uint32_t busy;
    // Busy-wait: read the channel status (selector 2) and loop back for as
    // long as the value read is non-zero.
367 asm volatile(
368 "1: \n"
369 "dmstati %[busy], (%[channel] << 2) | 2 \n"
370 "bne %[busy], zero, 1b \n"
371 : [ busy ] "=r"(busy)
372 : [ channel ] "i"(channel));
373#endif
374}
375
/**
 * @brief Block until the first `num_channels` channels are idle.
 * @param num_channels Number of channels to iterate over, starting at 0.
 */
// NOTE(review): the loop body (listing line 382) is absent from this view;
// presumably it waits on channel `c` - confirm against the original header.
// NOTE(review): `c` is a signed int compared against an unsigned count.
380inline void snrt_dma_wait_all_channels(uint32_t num_channels) {
381 for (int c = 0; c < num_channels; c++) {
383 }
384}
385
393#ifdef SNRT_SUPPORTS_DMA
394 asm volatile("dmstati zero, 0 \n");
395#endif
396}
397
405#ifdef SNRT_SUPPORTS_DMA
406 asm volatile("dmstati zero, 0 \n");
407#endif
408}
409
/**
 * @brief Fast memset function performed by DMA.
 * @param ptr Start of the memory region to fill.
 * @param value Byte value to fill the region with.
 * @param len Length of the region in bytes.
 * @note The DMA is only used when `len` is a multiple of 64 and greater than
 *       64; in every other case all `len` bytes are written by the CPU loop.
 *       Without SNRT_SUPPORTS_DMA the function forwards to memset().
 */
417inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) {
418#ifdef SNRT_SUPPORTS_DMA
419 // We set the first 64 bytes to the value, and then we use the DMA to copy
420 // these into the remaining memory region. DMA is used only if len is
421 // larger than 64 bytes, and an integer multiple of 64 bytes.
422 size_t n_1d_transfers = len / 64;
423 size_t use_dma = (len % 64) == 0 && len > 64;
424 uint8_t *p = (uint8_t *)ptr;
425
    // CPU-written prefix: only the first 64 bytes when the DMA takes over,
    // otherwise the whole region.
426 uint32_t nbytes = len < 64 || !use_dma ? len : 64;
427 while (nbytes--) {
428 *p++ = value;
429 }
430
431 if (use_dma) {
        // Source stride 0 and destination stride 64: the same 64-byte chunk
        // at `ptr` is the source of every one of the n_1d_transfers rows.
432 snrt_dma_start_2d(ptr, ptr, 64, 64, 0, n_1d_transfers);
        // NOTE(review): listing line 433 is absent from this view;
        // presumably it waits for the transfer to complete - confirm.
434 }
435#else
436 memset(ptr, (int)value, len);
437#endif
438}
439
448inline snrt_dma_txid_t snrt_dma_load_1d_tile(volatile void *dst,
449 volatile void *src,
450 size_t tile_idx, size_t tile_size,
451 uint32_t prec) {
452 size_t tile_nbytes = tile_size * prec;
453 return snrt_dma_start_1d(
454 (uint64_t)dst, (uint64_t)src + tile_idx * tile_nbytes, tile_nbytes);
455}
456
467 size_t tile_idx,
468 size_t tile_size,
469 uint32_t prec,
470 uint64_t mask) {
471 size_t tile_nbytes = tile_size * prec;
472 return snrt_dma_start_1d_mcast((uintptr_t)dst,
473 (uintptr_t)src + tile_idx * tile_nbytes,
474 tile_nbytes, mask);
475}
476
488 void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec,
489 uint64_t mask, snrt_collective_opcode_t opcode) {
490 size_t tile_nbytes = tile_size * prec;
491 return snrt_dma_start_1d_reduction((uintptr_t)dst,
492 (uintptr_t)src + tile_idx * tile_nbytes,
493 tile_nbytes, mask, opcode);
494}
495
504inline snrt_dma_txid_t snrt_dma_1d_to_2d(volatile void *dst, volatile void *src,
505 size_t size, size_t row_size,
506 size_t stride) {
507 return snrt_dma_start_2d(dst, src, row_size, stride, row_size,
508 size / row_size);
509}
510
519inline snrt_dma_txid_t snrt_dma_2d_to_1d(volatile void *dst, volatile void *src,
520 size_t size, size_t row_size,
521 size_t stride) {
522 return snrt_dma_start_2d(dst, src, row_size, row_size, stride,
523 size / row_size);
524}
525
534inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src,
535 size_t tile_idx, size_t tile_size,
536 uint32_t prec) {
537 size_t tile_nbytes = tile_size * prec;
538 return snrt_dma_start_1d((uint64_t)dst + tile_idx * tile_nbytes,
539 (uint64_t)src, tile_nbytes);
540}
541
558 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
559 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
560 uint32_t prec, size_t tile_ld) {
561 size_t src_offset = 0;
562 // Advance src array in x0 and x1 dimensions, and convert to byte offset
563 src_offset += tile_x0_idx * tile_x0_size;
564 src_offset += tile_x1_idx * tile_x1_size * full_x0_size;
565 src_offset *= prec;
566 // Initiate transfer
567 return snrt_dma_start_2d((uint64_t)dst, // dst
568 (uint64_t)src + src_offset, // src
569 tile_x0_size * prec, // size
570 tile_ld, // dst_stride
571 full_x0_size * prec, // src_stride
572 tile_x1_size // repeat
573 );
574}
575
586 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
587 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
588 uint32_t prec) {
589 return snrt_dma_load_2d_tile(dst, src, tile_x1_idx, tile_x0_idx,
590 tile_x1_size, tile_x0_size, full_x0_size, prec,
591 tile_x0_size * prec);
592}
593
602 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
603 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
604 uint32_t prec, size_t tile_ld, uint32_t mask) {
605 size_t src_offset = 0;
606 // Advance src array in x0 and x1 dimensions, and convert to byte offset
607 src_offset += tile_x0_idx * tile_x0_size;
608 src_offset += tile_x1_idx * tile_x1_size * full_x0_size;
609 src_offset *= prec;
610 // Initiate transfer
611 return snrt_dma_start_2d_mcast((uint64_t)dst, // dst
612 (uint64_t)src + src_offset, // src
613 tile_x0_size * prec, // size
614 tile_ld, // dst_stride
615 full_x0_size * prec, // src_stride
616 tile_x1_size, // repeat
617 mask // mask
618 );
619}
620
631 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
632 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
633 uint32_t prec, uint32_t mask) {
634 return snrt_dma_load_2d_tile_mcast(dst, src, tile_x1_idx, tile_x0_idx,
635 tile_x1_size, tile_x0_size, full_x0_size,
636 prec, tile_x0_size * prec, mask);
637}
638
650 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
651 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
652 uint32_t prec, snrt_comm_t comm) {
653 uint64_t mask = snrt_get_collective_mask(comm);
654 return snrt_dma_load_2d_tile_mcast(dst, src, tile_x1_idx, tile_x0_idx,
655 tile_x1_size, tile_x0_size, full_x0_size,
656 prec, mask);
657}
658
676 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
677 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
678 uint32_t prec, size_t num_banks) {
679 // Calculate new tile size after reshaping the tile in the selected banks
680 size_t tile_x0_size_in_banks = (num_banks * SNRT_TCDM_BANK_WIDTH) / prec;
681 size_t tile_x1_size_in_banks =
682 ceil((tile_x1_size * tile_x0_size) / (double)tile_x0_size_in_banks);
683 size_t tile_ld = SNRT_TCDM_HYPERBANK_WIDTH;
684 return snrt_dma_load_2d_tile(dst, src, tile_x1_idx, tile_x0_idx,
685 tile_x1_size_in_banks, tile_x0_size_in_banks,
686 full_x0_size, prec, tile_ld);
687}
688
705 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
706 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
707 uint32_t prec, size_t tile_ld) {
708 size_t dst_offset = 0;
709 // Advance dst array in x0 and x1 dimensions, and convert to byte offset
710 dst_offset += tile_x0_idx * tile_x0_size;
711 dst_offset += tile_x1_idx * tile_x1_size * full_x0_size;
712 dst_offset *= prec;
713 // Initiate transfer
714 return snrt_dma_start_2d((uint64_t)dst + dst_offset, // dst
715 (uint64_t)src, // src
716 tile_x0_size * prec, // size
717 full_x0_size * prec, // dst_stride
718 tile_ld, // src_stride
719 tile_x1_size // repeat
720 );
721}
722
733 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
734 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
735 uint32_t prec) {
736 return snrt_dma_store_2d_tile(dst, src, tile_x1_idx, tile_x0_idx,
737 tile_x1_size, tile_x0_size, full_x0_size,
738 prec, tile_x0_size * prec);
739}
740
758 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
759 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
760 uint32_t prec, size_t num_banks) {
761 // Calculate new tile size after reshaping the tile in the selected banks
762 size_t tile_x0_size_in_banks = (num_banks * SNRT_TCDM_BANK_WIDTH) / prec;
763 size_t tile_x1_size_in_banks =
764 ceil((tile_x1_size * tile_x0_size) / (double)tile_x0_size_in_banks);
765 size_t tile_ld = SNRT_TCDM_HYPERBANK_WIDTH;
766 return snrt_dma_store_2d_tile(dst, src, tile_x1_idx, tile_x0_idx,
767 tile_x1_size_in_banks, tile_x0_size_in_banks,
768 full_x0_size, prec, tile_ld);
769}
void snrt_dma_disable_reduction()
Disable reduction operations for successive transfers.
Definition dma.h:130
void snrt_dma_set_awuser(uint64_t field)
Set AW user field of the DMA's AXI interface.
Definition dma.h:80
snrt_dma_txid_t snrt_dma_load_1d_tile_mcast(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec, uint64_t mask)
Load a tile of a 1D array using multicast.
Definition dma.h:466
void snrt_dma_disable_multicast()
Disable multicast for successive transfers.
Definition dma.h:124
snrt_dma_txid_t snrt_dma_load_1d_tile(volatile void *dst, volatile void *src, size_t tile_idx, size_t tile_size, uint32_t prec)
Load a tile of a 1D array.
Definition dma.h:448
snrt_dma_txid_t snrt_dma_1d_to_2d(volatile void *dst, volatile void *src, size_t size, size_t row_size, size_t stride)
Transfer and reshape a 1D array into a 2D array.
Definition dma.h:504
static snrt_dma_txid_t snrt_dma_start_2d(uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat, const uint32_t channel=0)
Start an asynchronous 2D DMA transfer with 64-bit wide pointers.
Definition dma.h:246
snrt_dma_txid_t snrt_dma_store_2d_tile_from_banks(void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec, size_t num_banks)
Store a 2D tile of a 2D array from a 1D layout occupying a subset of TCDM banks.
Definition dma.h:757
static uint32_t snrt_dma_start_1d(uint64_t dst, uint64_t src, size_t size, const uint32_t channel=0)
Start an asynchronous 1D DMA transfer with 64-bit wide pointers on a specific DMA channel.
Definition dma.h:35
uint32_t snrt_dma_txid_t
A DMA transfer identifier.
Definition dma.h:19
static uint32_t snrt_dma_start_1d_reduction(uint64_t dst, uint64_t src, size_t size, uint64_t mask, snrt_collective_opcode_t opcode, const uint32_t channel=0)
Start an asynchronous reduction 1D DMA transfer with 64-bit wide pointers.
Definition dma.h:140
snrt_dma_txid_t snrt_dma_store_2d_tile(void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec, size_t tile_ld)
Store a 2D tile to a 2D array.
Definition dma.h:704
snrt_dma_txid_t snrt_dma_load_2d_tile_in_banks(void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec, size_t num_banks)
Load a 2D tile of a 2D array and reshape it to occupy a subset of TCDM banks.
Definition dma.h:675
snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec)
Store a tile to a 1D array.
Definition dma.h:534
static uint32_t snrt_dma_start_2d_mcast(uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat, uint32_t mask, const uint32_t channel=0)
Start an asynchronous, multicast 2D DMA transfer with 64-bit wide pointers.
Definition dma.h:301
snrt_dma_txid_t snrt_dma_2d_to_1d(volatile void *dst, volatile void *src, size_t size, size_t row_size, size_t stride)
Transfer and reshape a 2D array into a 1D array.
Definition dma.h:519
void snrt_dma_enable_reduction(uint64_t mask, snrt_collective_opcode_t opcode)
Enable reduction operations for successive transfers.
Definition dma.h:112
void snrt_dma_wait_all_channels(uint32_t num_channels)
Block until the first num_channels channels are idle.
Definition dma.h:380
static uint32_t snrt_dma_start_1d_mcast(uint64_t dst, uint64_t src, size_t size, uint64_t mask, const uint32_t channel=0)
Start an asynchronous multicast 1D DMA transfer with 64-bit wide pointers.
Definition dma.h:173
snrt_dma_txid_t snrt_dma_reduction_load_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec, uint64_t mask, snrt_collective_opcode_t opcode)
Load a tile of a 1D array using a reduction operation.
Definition dma.h:487
void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len)
Fast memset function performed by DMA.
Definition dma.h:417
void snrt_dma_enable_multicast(uint64_t mask)
Enable multicast for successive transfers.
Definition dma.h:97
static void snrt_dma_wait(snrt_dma_txid_t txid, const uint32_t channel=0)
Block until a DMA transfer finishes on a specific DMA channel.
Definition dma.h:342
snrt_dma_txid_t snrt_dma_load_2d_tile(void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec, size_t tile_ld)
Load a 2D tile of a 2D array.
Definition dma.h:557
static void snrt_dma_wait_all(const uint32_t channel=0)
Block until a specific DMA channel is idle.
Definition dma.h:364
void snrt_dma_start_tracking()
Start tracking of a DMA performance region. Does not have any implications on the HW...
Definition dma.h:392
snrt_dma_txid_t snrt_dma_load_2d_tile_mcast(void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec, size_t tile_ld, uint32_t mask)
Load a 2D tile of a 2D array using multicast.
Definition dma.h:601
void snrt_dma_stop_tracking()
Stop tracking of a DMA performance region. Does not have any implications on the HW...
Definition dma.h:404
Definition sync_decls.h:14
Definition sync_decls.h:40