Snitch Runtime
Loading...
Searching...
No Matches
dma.h
Go to the documentation of this file.
1// Copyright 2023 ETH Zurich and University of Bologna.
2// Licensed under the Apache License, Version 2.0, see LICENSE for details.
3// SPDX-License-Identifier: Apache-2.0
4
10#pragma once
11
12#include <math.h>
13
// A DMA transfer identifier.
15typedef uint32_t snrt_dma_txid_t;
16
// Start an asynchronous 1D DMA transfer with 64-bit wide pointers on a
// specific DMA channel.
//
// dst, src: 64-bit destination and source addresses. Each is split into a low
//           and a high 32-bit half, which become the operands of dmdst/dmsrc.
// size:     register operand of dmcpyi (presumably the transfer length in
//           bytes -- confirm against the ISA documentation).
// channel:  encoded into the dmcpyi immediate as (channel << 2) | 0b00. It is
//           bound with an "i" constraint, so it has to be a compile-time
//           constant at the point the asm is emitted.
// Returns the value dmcpyi writes to its destination register (txid).
31static inline uint32_t snrt_dma_start_1d(uint64_t dst, uint64_t src,
32 size_t size,
33 const uint32_t channel = 0) {
34 uint32_t dst_lo = dst & 0xFFFFFFFF;
35 uint32_t dst_hi = dst >> 32;
36 uint32_t src_lo = src & 0xFFFFFFFF;
37 uint32_t src_hi = src >> 32;
38 uint32_t txid;
39
// Program source, then destination, then issue the copy.
40 asm volatile(
41 "dmsrc %[src_lo], %[src_hi] \n"
42 "dmdst %[dst_lo], %[dst_hi] \n"
43 "dmcpyi %[txid], %[size], (%[channel] << 2) | 0b00 \n"
44 : [ txid ] "=r"(txid)
45 : [ src_lo ] "r"(src_lo), [ src_hi ] "r"(src_hi),
46 [ dst_lo ] "r"(dst_lo), [ dst_hi ] "r"(dst_hi), [ size ] "r"(size),
47 [ channel ] "i"(channel));
48
49 return txid;
50}
51
// Pointer overload of snrt_dma_start_1d: casts dst and src to uint64_t and
// forwards them, together with size and channel, to the 64-bit variant.
// Returns whatever the 64-bit variant returns.
58static inline uint32_t snrt_dma_start_1d(volatile void *dst, volatile void *src,
59 size_t size,
60 const uint32_t channel = 0) {
61 return snrt_dma_start_1d((uint64_t)dst, (uint64_t)src, size, channel);
62}
63
// Enable multicast for successive transfers.
// Issues the "dmuser" instruction with mask as its first operand and the zero
// register as its second.
68inline void snrt_dma_enable_mcast(uint32_t mask) {
69 asm volatile("dmuser %[mask], zero \n" : : [ mask ] "r"(mask));
70}
71
// Disable multicast for successive transfers.
// Issues "dmuser zero, zero", i.e. the same instruction as
// snrt_dma_enable_mcast but with the zero register in place of the mask.
76inline void snrt_dma_disable_mcast() { asm volatile("dmuser zero, zero \n"); }
77
// Start an asynchronous multicast 1D DMA transfer with 64-bit wide pointers.
//
// Multicast is enabled with the given mask before the transfer is issued and
// disabled again right after, so transfers started later are not affected.
// dst, src, size, channel: forwarded unchanged to snrt_dma_start_1d.
// mask: multicast mask handed to snrt_dma_enable_mcast.
// Returns the transfer ID produced by snrt_dma_start_1d.
85static inline uint32_t snrt_dma_start_1d_mcast(uint64_t dst, uint64_t src,
86 size_t size, uint32_t mask,
87 const uint32_t channel = 0) {
88 snrt_dma_enable_mcast(mask);
89 uint32_t txid = snrt_dma_start_1d(dst, src, size, channel);
90 snrt_dma_disable_mcast();
91 return txid;
92}
93
// Pointer overload of snrt_dma_start_1d_mcast: casts dst and src to uint64_t
// and forwards them, together with size, mask and channel, to the 64-bit
// variant. Returns whatever the 64-bit variant returns.
102static inline uint32_t snrt_dma_start_1d_mcast(volatile void *dst,
103 volatile void *src, size_t size,
104 uint32_t mask,
105 const uint32_t channel = 0) {
106 return snrt_dma_start_1d_mcast((uint64_t)dst, (uint64_t)src, size, mask,
107 channel);
108}
109
// Start an asynchronous 2D DMA transfer with 64-bit wide pointers.
//
// dst, src:   64-bit destination and source addresses. Each is split into a
//             low and a high 32-bit half for dmdst/dmsrc.
// size:       register operand of dmcpyi (presumably the length of one inner
//             1D transfer -- confirm against the ISA documentation).
// dst_stride, src_stride: operands of dmstr; note that the instruction takes
//             the source stride first and the destination stride second.
// repeat:     operand of dmrep.
// channel:    encoded into the dmcpyi immediate as (channel << 2) | 0b10. It
//             is bound with an "i" constraint, so it has to be a compile-time
//             constant at the point the asm is emitted.
// Returns the value dmcpyi writes to its destination register (txid).
128static inline snrt_dma_txid_t snrt_dma_start_2d(uint64_t dst, uint64_t src,
129 size_t size, size_t dst_stride,
130 size_t src_stride,
131 size_t repeat,
132 const uint32_t channel = 0) {
133 uint32_t dst_lo = dst & 0xFFFFFFFF;
134 uint32_t dst_hi = dst >> 32;
135 uint32_t src_lo = src & 0xFFFFFFFF;
136 uint32_t src_hi = src >> 32;
137 uint32_t txid;
138
// Program source, destination, strides and repetitions, then issue the copy.
139 asm volatile(
140 "dmsrc %[src_lo], %[src_hi] \n"
141 "dmdst %[dst_lo], %[dst_hi] \n"
142 "dmstr %[src_stride], %[dst_stride] \n"
143 "dmrep %[repeat] \n"
144 "dmcpyi %[txid], %[size], (%[channel] << 2) | 0b10 \n"
145 : [ txid ] "=r"(txid)
146 : [ src_lo ] "r"(src_lo), [ src_hi ] "r"(src_hi),
147 [ dst_lo ] "r"(dst_lo), [ dst_hi ] "r"(dst_hi),
148 [ dst_stride ] "r"(dst_stride), [ src_stride ] "r"(src_stride),
149 [ repeat ] "r"(repeat), [ size ] "r"(size), [ channel ] "i"(channel));
150
151 return txid;
152}
153
// Pointer overload of snrt_dma_start_2d: casts dst and src to uint64_t and
// forwards them, together with size, both strides, repeat and channel, to the
// 64-bit variant. Returns whatever the 64-bit variant returns.
161static inline uint32_t snrt_dma_start_2d(volatile void *dst, volatile void *src,
162 size_t size, size_t dst_stride,
163 size_t src_stride, size_t repeat,
164 const uint32_t channel = 0) {
165 return snrt_dma_start_2d((uint64_t)dst, (uint64_t)src, size, dst_stride,
166 src_stride, repeat, channel);
167}
168
// Start an asynchronous, multicast 2D DMA transfer with 64-bit wide pointers.
//
// Multicast is enabled with the given mask before the transfer is issued and
// disabled again right after, so transfers started later are not affected.
// dst, src, size, dst_stride, src_stride, repeat, channel: forwarded unchanged
//       to snrt_dma_start_2d.
// mask: multicast mask handed to snrt_dma_enable_mcast.
// Returns the transfer ID produced by snrt_dma_start_2d.
178static inline uint32_t snrt_dma_start_2d_mcast(uint64_t dst, uint64_t src,
179 size_t size, size_t dst_stride,
180 size_t src_stride, size_t repeat,
181 uint32_t mask,
182 const uint32_t channel = 0) {
183 snrt_dma_enable_mcast(mask);
184 uint32_t txid = snrt_dma_start_2d(dst, src, size, dst_stride, src_stride,
185 repeat, channel);
186 snrt_dma_disable_mcast();
187 return txid;
188}
189
// Pointer overload of snrt_dma_start_2d_mcast: casts dst and src to uint64_t
// and forwards them, together with size, both strides, repeat, mask and
// channel, to the 64-bit variant. Returns whatever the 64-bit variant returns.
198static inline uint32_t snrt_dma_start_2d_mcast(volatile void *dst,
199 volatile void *src, size_t size,
200 size_t dst_stride,
201 size_t src_stride, size_t repeat,
202 uint32_t mask,
203 const uint32_t channel = 0) {
204 return snrt_dma_start_2d_mcast((uint64_t)dst, (uint64_t)src, size,
205 dst_stride, src_stride, repeat, mask,
206 channel);
207}
208
// Block until a DMA transfer finishes on a specific DMA channel.
//
// txid:    transfer ID to wait for.
// channel: encoded into the dmstati immediate as (channel << 2) | 0; bound
//          with an "i" constraint, so it must be a compile-time constant.
//
// Spins reading the channel status into t0 and loops back for as long as the
// value read is (unsigned) less than txid. t0 is declared as clobbered.
219static inline void snrt_dma_wait(snrt_dma_txid_t txid,
220 const uint32_t channel = 0) {
221 asm volatile(
222 "1: \n"
223 "dmstati t0, (%[channel] << 2) | 0 \n"
224 "bltu t0, %[txid], 1b \n"
225 :
226 : [ txid ] "r"(txid), [ channel ] "i"(channel)
227 : "t0");
228}
229
// Block until a specific DMA channel is idle.
//
// channel: encoded into the dmstati immediate as (channel << 2) | 2; bound
//          with an "i" constraint, so it must be a compile-time constant.
//
// Spins reading the channel status into busy and loops back for as long as the
// value read is non-zero.
239static inline void snrt_dma_wait_all(const uint32_t channel = 0) {
240 uint32_t busy;
241 asm volatile(
242 "1: \n"
243 "dmstati %[busy], (%[channel] << 2) | 2 \n"
244 "bne %[busy], zero, 1b \n"
245 : [ busy ] "=r"(busy)
246 : [ channel ] "i"(channel));
247}
248
// Block until the first num_channels channels are idle.
//
// Waits on channels 0 .. num_channels - 1 one after the other.
// NOTE(review): snrt_dma_wait_all binds its channel with an "i" (immediate)
// constraint, so this presumably relies on the call being inlined and the loop
// unrolled for a constant num_channels -- confirm with the build flags.
253inline void snrt_dma_wait_all_channels(uint32_t num_channels) {
254 for (int c = 0; c < num_channels; c++) {
255 snrt_dma_wait_all(c);
256 }
257}
258
// Start tracking of a DMA performance region.
// Issues "dmstati zero, 0"; the result is written to the zero register, i.e.
// discarded.
265inline void snrt_dma_start_tracking() { asm volatile("dmstati zero, 0 \n"); }
266
// Stop tracking of a DMA performance region.
// Issues "dmstati zero, 0"; the result is written to the zero register, i.e.
// discarded.
// NOTE(review): this is the exact same instruction that
// snrt_dma_start_tracking issues -- confirm that start and stop are meant to
// be indistinguishable at the instruction level.
273inline void snrt_dma_stop_tracking() { asm volatile("dmstati zero, 0 \n"); }
274
// Fast memset function performed by DMA.
//
// ptr:   start of the region to fill.
// value: byte value to write.
// len:   number of bytes to fill.
//
// The function blocks until the DMA fill has completed, so the whole region
// holds value on return.
282inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) {
283 // We set the first 64 bytes to the value, and then we use the DMA to copy
284 // these into the remaining memory region. DMA is used only if len is
285 // larger than 64 bytes, and an integer multiple of 64 bytes.
286 size_t n_1d_transfers = len / 64;
287 size_t use_dma = (len % 64) == 0 && len > 64;
288 uint8_t *p = (uint8_t *)ptr;
289
// Without DMA the whole region is written by the core; with DMA only the
// first 64-byte seed block is.
290 uint32_t nbytes = len < 64 || !use_dma ? len : 64;
291 while (nbytes--) {
292 *p++ = value;
293 }
294
295 if (use_dma) {
// Source stride 0 re-reads the seed block for every 64-byte destination row.
296 snrt_dma_start_2d(ptr, ptr, 64, 64, 0, n_1d_transfers);
297 snrt_dma_wait_all();
298 }
299}
300
// Load a tile of a 1D array.
//
// dst:       destination address; the tile is written starting here.
// src:       base address of the source array.
// tile_idx:  index of the tile; the source is offset by
//            tile_idx * tile_size * prec.
// tile_size: multiplied by prec to obtain the transfer size (presumably the
//            tile length in elements and prec the element size in bytes --
//            confirm).
// Returns the transfer ID of the 1D transfer, started on the default channel.
309inline snrt_dma_txid_t snrt_dma_load_1d_tile(volatile void *dst,
310 volatile void *src,
311 size_t tile_idx, size_t tile_size,
312 uint32_t prec) {
313 size_t tile_nbytes = tile_size * prec;
314 return snrt_dma_start_1d(
315 (uint64_t)dst, (uint64_t)src + tile_idx * tile_nbytes, tile_nbytes);
316}
317
// Load a tile of a 1D array using a multicast transfer.
//
// dst:       destination address; the tile is written starting here.
// src:       base address of the source array.
// tile_idx:  index of the tile; the source is offset by
//            tile_idx * tile_size * prec.
// tile_size: multiplied by prec to obtain the transfer size (presumably the
//            tile length in elements and prec the element size in bytes --
//            confirm).
// mcast:     multicast mask forwarded to snrt_dma_start_1d_mcast.
// Returns the transfer ID of the 1D transfer, started on the default channel.
327inline snrt_dma_txid_t snrt_dma_load_1d_tile_mcast(void *dst, void *src,
328 size_t tile_idx,
329 size_t tile_size,
330 uint32_t prec,
331 uint32_t mcast) {
332 size_t tile_nbytes = tile_size * prec;
333 return snrt_dma_start_1d_mcast((uintptr_t)dst,
334 (uintptr_t)src + tile_idx * tile_nbytes,
335 tile_nbytes, mcast);
336}
337
// Transfer and reshape a 1D array into a 2D array.
//
// Starts a 2D transfer of size / row_size rows of row_size each. The source is
// read with a stride of row_size (contiguously), the destination is written
// with the given stride.
// Returns the transfer ID of the 2D transfer, started on the default channel.
346inline snrt_dma_txid_t snrt_dma_1d_to_2d(volatile void *dst, volatile void *src,
347 size_t size, size_t row_size,
348 size_t stride) {
349 return snrt_dma_start_2d(dst, src, row_size, stride, row_size,
350 size / row_size);
351}
352
// Transfer and reshape a 2D array into a 1D array.
//
// Starts a 2D transfer of size / row_size rows of row_size each. The source is
// read with the given stride, the destination is written with a stride of
// row_size (contiguously).
// Returns the transfer ID of the 2D transfer, started on the default channel.
361inline snrt_dma_txid_t snrt_dma_2d_to_1d(volatile void *dst, volatile void *src,
362 size_t size, size_t row_size,
363 size_t stride) {
364 return snrt_dma_start_2d(dst, src, row_size, row_size, stride,
365 size / row_size);
366}
367
// Store a tile to a 1D array.
//
// dst:       base address of the destination array.
// src:       source address; the tile is read starting here.
// tile_idx:  index of the tile; the destination is offset by
//            tile_idx * tile_size * prec.
// tile_size: multiplied by prec to obtain the transfer size (presumably the
//            tile length in elements and prec the element size in bytes --
//            confirm).
// Returns the transfer ID of the 1D transfer, started on the default channel.
376inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src,
377 size_t tile_idx, size_t tile_size,
378 uint32_t prec) {
379 size_t tile_nbytes = tile_size * prec;
380 return snrt_dma_start_1d((uint64_t)dst + tile_idx * tile_nbytes,
381 (uint64_t)src, tile_nbytes);
382}
383
// Load a 2D tile of a 2D array.
//
// dst:          destination address of the tile.
// src:          base address of the full source array.
// tile_x1_idx, tile_x0_idx: tile indices along the outer (x1) and inner (x0)
//               dimension, used to compute the source offset.
// tile_x1_size: number of inner transfers (repeat count).
// tile_x0_size: multiplied by prec to obtain the size of one inner transfer.
// full_x0_size: multiplied by prec to obtain the source stride.
// prec:         scale factor applied to element counts (presumably the element
//               size in bytes -- confirm).
// tile_ld:      destination stride, used as is (not scaled by prec).
// Returns the transfer ID of the 2D transfer, started on the default channel.
399inline snrt_dma_txid_t snrt_dma_load_2d_tile(
400 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
401 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
402 uint32_t prec, size_t tile_ld) {
403 size_t src_offset = 0;
404 // Advance src array in x0 and x1 dimensions, and convert to byte offset
405 src_offset += tile_x0_idx * tile_x0_size;
406 src_offset += tile_x1_idx * tile_x1_size * full_x0_size;
407 src_offset *= prec;
408 // Initiate transfer
409 return snrt_dma_start_2d((uint64_t)dst, // dst
410 (uint64_t)src + src_offset, // src
411 tile_x0_size * prec, // size
412 tile_ld, // dst_stride
413 full_x0_size * prec, // src_stride
414 tile_x1_size // repeat
415 );
416}
417
// NOTE(review): the listing line that carried this function's header (dma.h
// line 427) is missing from this extract, so the definition starts in the
// middle of its signature. The parameter list and the call below suggest this
// is the snrt_dma_load_2d_tile overload without a tile_ld argument -- confirm
// against the original dma.h.
//
// The body forwards every argument to snrt_dma_load_2d_tile and passes
// tile_x0_size * prec as tile_ld.
428 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
429 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
430 uint32_t prec) {
431 return snrt_dma_load_2d_tile(dst, src, tile_x1_idx, tile_x0_idx,
432 tile_x1_size, tile_x0_size, full_x0_size, prec,
433 tile_x0_size * prec);
434}
435
// Load a 2D tile of a 2D array using multicast.
//
// dst:          destination address of the tile.
// src:          base address of the full source array.
// tile_x1_idx, tile_x0_idx: tile indices along the outer (x1) and inner (x0)
//               dimension, used to compute the source offset.
// tile_x1_size: number of inner transfers (repeat count).
// tile_x0_size: multiplied by prec to obtain the size of one inner transfer.
// full_x0_size: multiplied by prec to obtain the source stride.
// prec:         scale factor applied to element counts (presumably the element
//               size in bytes -- confirm).
// tile_ld:      destination stride, used as is (not scaled by prec).
// mask:         multicast mask forwarded to snrt_dma_start_2d_mcast.
// Returns the transfer ID of the 2D transfer, started on the default channel.
443inline snrt_dma_txid_t snrt_dma_load_2d_tile_mcast(
444 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
445 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
446 uint32_t prec, size_t tile_ld, uint32_t mask) {
447 size_t src_offset = 0;
448 // Advance src array in x0 and x1 dimensions, and convert to byte offset
449 src_offset += tile_x0_idx * tile_x0_size;
450 src_offset += tile_x1_idx * tile_x1_size * full_x0_size;
451 src_offset *= prec;
452 // Initiate transfer
453 return snrt_dma_start_2d_mcast((uint64_t)dst, // dst
454 (uint64_t)src + src_offset, // src
455 tile_x0_size * prec, // size
456 tile_ld, // dst_stride
457 full_x0_size * prec, // src_stride
458 tile_x1_size, // repeat
459 mask // mask
460 );
461}
462
// NOTE(review): the listing line that carried this function's header (dma.h
// line 472) is missing from this extract, so the definition starts in the
// middle of its signature. The parameter list and the call below suggest this
// is the snrt_dma_load_2d_tile_mcast overload without a tile_ld argument --
// confirm against the original dma.h.
//
// The body forwards every argument to snrt_dma_load_2d_tile_mcast and passes
// tile_x0_size * prec as tile_ld.
473 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
474 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
475 uint32_t prec, uint32_t mask) {
476 return snrt_dma_load_2d_tile_mcast(dst, src, tile_x1_idx, tile_x0_idx,
477 tile_x1_size, tile_x0_size, full_x0_size,
478 prec, tile_x0_size * prec, mask);
479}
480
// Load a 2D tile of a 2D array and reshape it to occupy a subset of TCDM
// banks.
//
// The tile is reshaped so that one row spans num_banks * SNRT_TCDM_BANK_WIDTH
// bytes (expressed in elements by dividing by prec); the number of rows is the
// total element count of the tile divided by that row length, rounded up.
// Rows are placed SNRT_TCDM_HYPERBANK_WIDTH apart in the destination.
// The reshaped sizes, together with the unchanged tile indices, full_x0_size
// and prec, are then handed to snrt_dma_load_2d_tile.
// NOTE(review): the callee therefore also derives the source offset from the
// reshaped tile sizes -- confirm this is intended for non-zero tile indices.
// Returns the transfer ID returned by snrt_dma_load_2d_tile.
497inline snrt_dma_txid_t snrt_dma_load_2d_tile_in_banks(
498 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
499 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
500 uint32_t prec, size_t num_banks) {
501 // Calculate new tile size after reshaping the tile in the selected banks
502 size_t tile_x0_size_in_banks = (num_banks * SNRT_TCDM_BANK_WIDTH) / prec;
503 size_t tile_x1_size_in_banks =
504 ceil((tile_x1_size * tile_x0_size) / (double)tile_x0_size_in_banks);
505 size_t tile_ld = SNRT_TCDM_HYPERBANK_WIDTH;
506 return snrt_dma_load_2d_tile(dst, src, tile_x1_idx, tile_x0_idx,
507 tile_x1_size_in_banks, tile_x0_size_in_banks,
508 full_x0_size, prec, tile_ld);
509}
510
// Store a 2D tile to a 2D array.
//
// dst:          base address of the full destination array.
// src:          source address of the tile.
// tile_x1_idx, tile_x0_idx: tile indices along the outer (x1) and inner (x0)
//               dimension, used to compute the destination offset.
// tile_x1_size: number of inner transfers (repeat count).
// tile_x0_size: multiplied by prec to obtain the size of one inner transfer.
// full_x0_size: multiplied by prec to obtain the destination stride.
// prec:         scale factor applied to element counts (presumably the element
//               size in bytes -- confirm).
// tile_ld:      source stride, used as is (not scaled by prec).
// Returns the transfer ID of the 2D transfer, started on the default channel.
526inline snrt_dma_txid_t snrt_dma_store_2d_tile(
527 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
528 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
529 uint32_t prec, size_t tile_ld) {
530 size_t dst_offset = 0;
531 // Advance dst array in x0 and x1 dimensions, and convert to byte offset
532 dst_offset += tile_x0_idx * tile_x0_size;
533 dst_offset += tile_x1_idx * tile_x1_size * full_x0_size;
534 dst_offset *= prec;
535 // Initiate transfer
536 return snrt_dma_start_2d((uint64_t)dst + dst_offset, // dst
537 (uint64_t)src, // src
538 tile_x0_size * prec, // size
539 full_x0_size * prec, // dst_stride
540 tile_ld, // src_stride
541 tile_x1_size // repeat
542 );
543}
544
// NOTE(review): the listing line that carried this function's header (dma.h
// line 554) is missing from this extract, so the definition starts in the
// middle of its signature. The parameter list and the call below suggest this
// is the snrt_dma_store_2d_tile overload without a tile_ld argument -- confirm
// against the original dma.h.
//
// The body forwards every argument to snrt_dma_store_2d_tile and passes
// tile_x0_size * prec as tile_ld.
555 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
556 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
557 uint32_t prec) {
558 return snrt_dma_store_2d_tile(dst, src, tile_x1_idx, tile_x0_idx,
559 tile_x1_size, tile_x0_size, full_x0_size,
560 prec, tile_x0_size * prec);
561}
562
// Store a 2D tile of a 2D array from a 1D layout occupying a subset of TCDM
// banks.
//
// The tile is treated as reshaped so that one row spans
// num_banks * SNRT_TCDM_BANK_WIDTH bytes (expressed in elements by dividing by
// prec); the number of rows is the total element count of the tile divided by
// that row length, rounded up. Rows are read SNRT_TCDM_HYPERBANK_WIDTH apart
// in the source.
// The reshaped sizes, together with the unchanged tile indices, full_x0_size
// and prec, are then handed to snrt_dma_store_2d_tile.
// NOTE(review): the callee therefore also derives the destination offset from
// the reshaped tile sizes -- confirm this is intended for non-zero tile
// indices.
// Returns the transfer ID returned by snrt_dma_store_2d_tile.
579inline snrt_dma_txid_t snrt_dma_store_2d_tile_from_banks(
580 void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
581 size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
582 uint32_t prec, size_t num_banks) {
583 // Calculate new tile size after reshaping the tile in the selected banks
584 size_t tile_x0_size_in_banks = (num_banks * SNRT_TCDM_BANK_WIDTH) / prec;
585 size_t tile_x1_size_in_banks =
586 ceil((tile_x1_size * tile_x0_size) / (double)tile_x0_size_in_banks);
587 size_t tile_ld = SNRT_TCDM_HYPERBANK_WIDTH;
588 return snrt_dma_store_2d_tile(dst, src, tile_x1_idx, tile_x0_idx,
589 tile_x1_size_in_banks, tile_x0_size_in_banks,
590 full_x0_size, prec, tile_ld);
591}
void snrt_dma_enable_mcast(uint32_t mask)
Enable multicast for successive transfers.
Definition dma.h:68
snrt_dma_txid_t snrt_dma_load_1d_tile(volatile void *dst, volatile void *src, size_t tile_idx, size_t tile_size, uint32_t prec)
Load a tile of a 1D array.
Definition dma.h:309
snrt_dma_txid_t snrt_dma_1d_to_2d(volatile void *dst, volatile void *src, size_t size, size_t row_size, size_t stride)
Transfer and reshape a 1D array into a 2D array.
Definition dma.h:346
static snrt_dma_txid_t snrt_dma_start_2d(uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat, const uint32_t channel=0)
Start an asynchronous 2D DMA transfer with 64-bit wide pointers.
Definition dma.h:128
snrt_dma_txid_t snrt_dma_store_2d_tile_from_banks(void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec, size_t num_banks)
Store a 2D tile of a 2D array from a 1D layout occupying a subset of TCDM banks.
Definition dma.h:579
static uint32_t snrt_dma_start_1d(uint64_t dst, uint64_t src, size_t size, const uint32_t channel=0)
Start an asynchronous 1D DMA transfer with 64-bit wide pointers on a specific DMA channel.
Definition dma.h:31
uint32_t snrt_dma_txid_t
A DMA transfer identifier.
Definition dma.h:15
snrt_dma_txid_t snrt_dma_store_2d_tile(void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec, size_t tile_ld)
Store a 2D tile to a 2D array.
Definition dma.h:526
snrt_dma_txid_t snrt_dma_load_2d_tile_in_banks(void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec, size_t num_banks)
Load a 2D tile of a 2D array and reshape it to occupy a subset of TCDM banks.
Definition dma.h:497
snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec)
Store a tile to a 1D array.
Definition dma.h:376
static uint32_t snrt_dma_start_2d_mcast(uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat, uint32_t mask, const uint32_t channel=0)
Start an asynchronous, multicast 2D DMA transfer with 64-bit wide pointers.
Definition dma.h:178
snrt_dma_txid_t snrt_dma_2d_to_1d(volatile void *dst, volatile void *src, size_t size, size_t row_size, size_t stride)
Transfer and reshape a 2D array into a 1D array.
Definition dma.h:361
void snrt_dma_wait_all_channels(uint32_t num_channels)
Block until the first num_channels channels are idle.
Definition dma.h:253
snrt_dma_txid_t snrt_dma_load_1d_tile_mcast(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec, uint32_t mcast)
Load a tile of a 1D array.
Definition dma.h:327
static uint32_t snrt_dma_start_1d_mcast(uint64_t dst, uint64_t src, size_t size, uint32_t mask, const uint32_t channel=0)
Start an asynchronous multicast 1D DMA transfer with 64-bit wide pointers.
Definition dma.h:85
void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len)
Fast memset function performed by DMA.
Definition dma.h:282
static void snrt_dma_wait(snrt_dma_txid_t txid, const uint32_t channel=0)
Block until a DMA transfer finishes on a specific DMA channel.
Definition dma.h:219
snrt_dma_txid_t snrt_dma_load_2d_tile(void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec, size_t tile_ld)
Load a 2D tile of a 2D array.
Definition dma.h:399
static void snrt_dma_wait_all(const uint32_t channel=0)
Block until a specific DMA channel is idle.
Definition dma.h:239
void snrt_dma_start_tracking()
Start tracking of a DMA performance region. Does not have any implications on the HW....
Definition dma.h:265
void snrt_dma_disable_mcast()
Disable multicast for successive transfers.
Definition dma.h:76
snrt_dma_txid_t snrt_dma_load_2d_tile_mcast(void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec, size_t tile_ld, uint32_t mask)
Load a 2D tile of a 2D array using multicast.
Definition dma.h:443
void snrt_dma_stop_tracking()
Stop tracking of a DMA performance region. Does not have any implications on the HW....
Definition dma.h:273