snRuntime/src/dm.h
Classes
Name | |
---|---|
struct | dm_task_t |
struct | dm_t |
Types
Name | |
---|---|
enum | en_stat { STAT_WAIT_IDLE = 1, STAT_EXIT = 2, STAT_READY = 3} |
typedef enum en_stat | en_stat_t |
Functions
Name | |
---|---|
void | wfi_dm(uint32_t cluster_core_idx)
void | wake_dm(void)
void | dm_init(void) Initialize the data mover and load a pointer to the DM struct into TLS. Must be called by the DM core itself and by all harts that want to use the dm functions.
void | dm_main(void) Data mover main function.
void | dm_exit(void) Signal the data mover to exit.
void | dm_memcpy_async(void * dest, const void * src, size_t n) Queue an asynchronous memory copy. The transfer is not started until dm_start or dm_wait is issued.
void | dm_memcpy2d_async(uint64_t src, uint64_t dst, uint32_t size, uint32_t sstrd, uint32_t dstrd, uint32_t nreps, uint32_t cfg) Queue an asynchronous 2D memory copy. The transfer is not started until dm_start or dm_wait is issued.
void | dm_start(void) Trigger the start of queued transfers and return immediately.
void | dm_wait(void) Wait for all DMA transfers to complete.
void | dm_wait_ready(void) Wait for the DM core to be ready.
Attributes
Name | |
---|---|
__thread volatile dm_t * | dm_p Per-thread (TLS) pointer to the data mover struct in TCDM, for faster access.
volatile dm_t *volatile | dm_p_global Pointer to where the DM struct in TCDM is located. |
Defines
Name | |
---|---|
DM_TASK_QUEUE_SIZE | Number of outstanding transactions to buffer. Each requires sizeof(dm_task_t) bytes. |
_dm_mtx_lock() | |
_dm_mtx_release() | |
DM_STATUS_COMPLETE_ID | |
DM_STATUS_NEXT_ID | |
DM_STATUS_BUSY | |
DM_STATUS_WOULD_BLOCK | |
DM_PRINTF(d, ...) |
Types Documentation
enum en_stat
Enumerator | Value | Description |
---|---|---|
STAT_WAIT_IDLE | 1 | Commands the DM core to wait until all transfers are complete. |
STAT_EXIT | 2 | Abort and exit. |
STAT_READY | 3 | Poll whether the DM core is ready. |
typedef en_stat_t
typedef enum en_stat en_stat_t;
Functions Documentation
function wfi_dm
inline void wfi_dm(
uint32_t cluster_core_idx
)
function wake_dm
inline void wake_dm(
void
)
function dm_init
inline void dm_init(
void
)
Initialize the data mover and load a pointer to the DM struct into TLS. Must be called by the DM core itself and by all harts that want to use the dm functions.
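A minimal usage sketch (hedged: the main() entry convention and the snrt.h include are assumptions about the surrounding snRuntime, not part of this header):

#include "snrt.h"
#include "dm.h"

int main(void) {
    // every hart calls dm_init(); the DM core allocates the struct,
    // the other harts spin until dm_p_global is published
    dm_init();
    if (snrt_is_dm_core()) {
        dm_main(); // serve queued transfers until a STAT_EXIT request arrives
        return 0;
    }
    dm_wait_ready(); // make sure the DM core is up before queueing work
    // ... queue transfers with dm_memcpy_async()/dm_memcpy2d_async() ...
    dm_wait();       // block until all queued transfers have completed
    dm_exit();       // assuming a single worker core; sends the DM core to exit
    return 0;
}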
function dm_main
inline void dm_main(
void
)
Data mover main function. The loop alternates between issuing the oldest queued transfer, if any, and serving STAT requests posted in stat_q; when the queue is empty and no request is pending, the core sleeps in wfi_dm().
function dm_exit
inline void dm_exit(
void
)
Signal the data mover to exit.
function dm_memcpy_async
inline void dm_memcpy_async(
void * dest,
const void * src,
size_t n
)
Queue an asynchronous memory copy. The transfer is not started until dm_start or dm_wait is issued.
Parameters:
- dest destination pointer
- src source pointer
- n number of bytes to copy
Blocks only if the DM queue is full.
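A hedged example of the intended overlap pattern (the helper name is illustrative, not part of the API):

// Queue a copy, kick it off, and overlap it with independent work.
static inline void copy_and_overlap(void *dst, const void *src, size_t len) {
    dm_memcpy_async(dst, src, len); // queued only; the DM core may be asleep
    dm_start();                     // wake the DM core and return immediately
    // ... do independent computation here while the DMA runs ...
    dm_wait();                      // block until all queued transfers are done
}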
function dm_memcpy2d_async
inline void dm_memcpy2d_async(
uint64_t src,
uint64_t dst,
uint32_t size,
uint32_t sstrd,
uint32_t dstrd,
uint32_t nreps,
uint32_t cfg
)
Queue an asynchronous 2D memory copy. The transfer is not started until dm_start or dm_wait is issued.
Parameters:
- src source address
- dst destination address
- size size of the inner dimension
- sstrd source stride in the outer dimension
- dstrd destination stride in the outer dimension
- nreps number of repetitions in the outer dimension
- cfg DMA configuration
Blocks only if the DM queue is full.
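A hedged sketch for loading a dense tile out of a larger row-major matrix (the helper name is illustrative; cfg = 0 is assumed to select the default DMA configuration, as dm_memcpy_async does):

// Copy `rows` rows of `row_bytes` each into a densely packed destination tile.
static inline void load_tile(uint64_t src, uint64_t dst, uint32_t row_bytes,
                             uint32_t src_pitch, uint32_t rows) {
    dm_memcpy2d_async(src, dst, row_bytes,
                      src_pitch, // source advances a full matrix row per repetition
                      row_bytes, // destination rows are packed back to back
                      rows, 0);
    dm_wait(); // or dm_start() to overlap the transfer with computation
}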
function dm_start
inline void dm_start(
void
)
Trigger the start of queued transfers and return immediately.
function dm_wait
inline void dm_wait(
void
)
Wait for all DMA transfers to complete.
function dm_wait_ready
inline void dm_wait_ready(
void
)
Wait for the DM core to be ready.
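Note: dm_wait and dm_wait_ready share the request/response handshake described in the source comments below: under the mutex, the caller clears stat_pvalid, posts a command into stat_q, wakes the DM core, and spins until the DM core acknowledges by setting stat_pvalid. dm_exit posts STAT_EXIT without waiting for a response.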
Attributes Documentation
variable dm_p
__thread volatile dm_t * dm_p;
Per-thread (TLS) pointer to the data mover struct in TCDM, for faster access.
variable dm_p_global
volatile dm_t *volatile dm_p_global;
Pointer to where the DM struct in TCDM is located.
Macros Documentation
define DM_TASK_QUEUE_SIZE
#define DM_TASK_QUEUE_SIZE 4
Number of outstanding transactions to buffer. Each requires sizeof(dm_task_t) bytes.
Note: the related setting DM_USE_GLOBAL_CLINT (commented out by default in the Settings section of the source below) selects the cluster-shared, CLINT-based SW interrupt system for synchronization. If it is not defined, the harts synchronize through the cluster-local CLINT, which is faster but only works for cluster-local synchronization; that is sufficient at the moment since the OpenMP runtime is single-cluster only.
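For scale: assuming no struct padding, sizeof(dm_task_t) is 40 bytes (two uint64_t plus six uint32_t fields), so the default queue of 4 entries occupies 160 bytes of TCDM.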
define _dm_mtx_lock
#define _dm_mtx_lock(
)
snrt_mutex_acquire(&dm_p->mutex)
define _dm_mtx_release
#define _dm_mtx_release(
)
snrt_mutex_release(&dm_p->mutex)
define DM_STATUS_COMPLETE_ID
#define DM_STATUS_COMPLETE_ID 0
Status selectors for the DMA status call (__builtin_sdma_stat); this and the following DM_STATUS_* defines form one set.
define DM_STATUS_NEXT_ID
#define DM_STATUS_NEXT_ID 1
define DM_STATUS_BUSY
#define DM_STATUS_BUSY 2
define DM_STATUS_WOULD_BLOCK
#define DM_STATUS_WOULD_BLOCK 3
define DM_PRINTF
#define DM_PRINTF(
d,
...
)
Source code
// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#ifndef DM_H
#define DM_H
//================================================================================
// Settings
//================================================================================
// #define DM_USE_GLOBAL_CLINT
#define DM_TASK_QUEUE_SIZE 4
//================================================================================
// Macros
//================================================================================
#define _dm_mtx_lock() snrt_mutex_acquire(&dm_p->mutex)
#define _dm_mtx_release() snrt_mutex_release(&dm_p->mutex)
#define DM_STATUS_COMPLETE_ID 0
#define DM_STATUS_NEXT_ID 1
#define DM_STATUS_BUSY 2
#define DM_STATUS_WOULD_BLOCK 3
//================================================================================
// Debug
//================================================================================
// #define DM_DEBUG_LEVEL 100
#ifdef DM_DEBUG_LEVEL
#include "printf.h"
#define _DM_PRINTF(...) \
if (1) { \
printf("[dm] "__VA_ARGS__); \
}
#define DM_PRINTF(d, ...) \
if (DM_DEBUG_LEVEL >= d) { \
_DM_PRINTF(__VA_ARGS__); \
}
#else
#define DM_PRINTF(d, ...)
#endif
//================================================================================
// Types
//================================================================================
typedef struct {
uint64_t src;
uint64_t dst;
uint32_t size;
uint32_t sstrd;
uint32_t dstrd;
uint32_t nreps;
uint32_t cfg;
uint32_t twod;
} dm_task_t;
// used for ultra-fine grained communication
// stat_q can be used to request a command, 0 is no command
// the response is put into stat_p and is valid iff stat_pvalid is non-zero
typedef enum en_stat {
// commands the DM core to wait until all transfers are complete
STAT_WAIT_IDLE = 1,
// abort and exit
STAT_EXIT = 2,
// poll if DM is ready
STAT_READY = 3,
} en_stat_t;
typedef struct {
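    // ring buffer of queued transfers: queue_front is the producer index,
    // queue_back the consumer index, queue_fill the current occupancy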
dm_task_t queue[DM_TASK_QUEUE_SIZE];
uint32_t queue_back;
uint32_t queue_front;
volatile uint32_t queue_fill;
volatile uint32_t mutex;
volatile en_stat_t stat_q;
volatile uint32_t stat_p;
volatile uint32_t stat_pvalid;
volatile uint32_t dm_wfi;
} dm_t;
//================================================================================
// Data
//================================================================================
extern __thread volatile dm_t *dm_p;
extern volatile dm_t *volatile dm_p_global;
//================================================================================
// Functions
//================================================================================
#ifdef DM_USE_GLOBAL_CLINT
inline void wfi_dm(uint32_t cluster_core_idx) {
(void)cluster_core_idx;
snrt_int_sw_poll();
}
inline void wake_dm(void) {
uint32_t basehart = snrt_cluster_core_base_hartid();
snrt_int_sw_set(basehart + snrt_cluster_dm_core_idx());
}
#else
inline void wfi_dm(uint32_t cluster_core_idx) {
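    // advertise that this core is about to sleep so wake_dm() knows the
    // wakeup interrupt will be observed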
__atomic_add_fetch(&dm_p->dm_wfi, 1, __ATOMIC_RELAXED);
snrt_wfi();
snrt_int_cluster_clr(1 << cluster_core_idx);
__atomic_add_fetch(&dm_p->dm_wfi, -1, __ATOMIC_RELAXED);
}
inline void wake_dm(void) {
// wait for DM to sleep before sending wakeup
while (!__atomic_load_n(&dm_p->dm_wfi, __ATOMIC_RELAXED))
;
snrt_int_cluster_set(1 << snrt_cluster_compute_core_num());
}
#endif // #ifdef DM_USE_GLOBAL_CLINT
inline void dm_init(void) {
// create a data mover instance
if (snrt_is_dm_core()) {
#ifdef DM_USE_GLOBAL_CLINT
snrt_interrupt_enable(IRQ_M_SOFT);
#else
snrt_interrupt_enable(IRQ_M_CLUSTER);
#endif
dm_p = (dm_t *)snrt_l1alloc(sizeof(dm_t));
snrt_memset((void *)dm_p, 0, sizeof(dm_t));
dm_p_global = dm_p;
} else {
while (!dm_p_global)
;
dm_p = dm_p_global;
}
}
inline void dm_main(void) {
volatile dm_task_t *t;
uint32_t do_exit = 0;
uint32_t cluster_core_idx = snrt_cluster_core_idx();
DM_PRINTF(10, "enter main\n");
while (!do_exit) {
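        // new transaction to issue?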
if (dm_p->queue_fill) {
// wait until DMA is ready
while (__builtin_sdma_stat(DM_STATUS_WOULD_BLOCK))
;
t = &dm_p->queue[dm_p->queue_back];
if (t->twod) {
DM_PRINTF(10, "start twod\n");
__builtin_sdma_start_twod(t->src, t->dst, t->size, t->sstrd,
t->dstrd, t->nreps, t->cfg);
} else {
DM_PRINTF(10, "start oned\n");
__builtin_sdma_start_oned(t->src, t->dst, t->size, t->cfg);
}
// bump
dm_p->queue_back = (dm_p->queue_back + 1) % DM_TASK_QUEUE_SIZE;
__atomic_add_fetch(&dm_p->queue_fill, -1, __ATOMIC_RELAXED);
}
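        // any STAT request pending?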
if (dm_p->stat_q) {
switch (dm_p->stat_q) {
case STAT_WAIT_IDLE:
// check status and set pvalid if DMA is idle and clear
// request
if (__builtin_sdma_stat(DM_STATUS_BUSY) == 0) {
DM_PRINTF(50, "idle\n");
dm_p->stat_pvalid = 1;
dm_p->stat_q = 0;
}
break;
case STAT_EXIT:
do_exit = 1;
break;
case STAT_READY:
DM_PRINTF(50, "ready\n");
dm_p->stat_pvalid = 1;
dm_p->stat_q = 0;
break;
}
}
// sleep if queue is empty and no stats pending
if (!dm_p->queue_fill && !dm_p->stat_q) {
wfi_dm(cluster_core_idx);
}
}
DM_PRINTF(10, "dm: exit\n");
#ifdef DM_USE_GLOBAL_CLINT
snrt_interrupt_disable(IRQ_M_SOFT);
#else
snrt_interrupt_disable(IRQ_M_CLUSTER);
#endif
return;
}
inline void dm_exit(void) {
dm_p->stat_q = STAT_EXIT;
// signal data mover
wake_dm();
}
inline void dm_memcpy_async(void *dest, const void *src, size_t n) {
uint32_t s;
volatile dm_task_t *t;
DM_PRINTF(10, "dm_memcpy_async %#x -> %#x size %d\n", src, dest,
(uint32_t)n);
// poll queue size
do {
s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
} while (s >= DM_TASK_QUEUE_SIZE);
_dm_mtx_lock();
// insert
t = &dm_p->queue[dm_p->queue_front];
t->src = (uint64_t)src;
t->dst = (uint64_t)dest;
t->size = (uint32_t)n;
t->twod = 0;
t->cfg = 0;
// bump
__atomic_add_fetch(&dm_p->queue_fill, 1, __ATOMIC_RELAXED);
dm_p->queue_front = (dm_p->queue_front + 1) % DM_TASK_QUEUE_SIZE;
_dm_mtx_release();
}
inline void dm_memcpy2d_async(uint64_t src, uint64_t dst, uint32_t size,
uint32_t sstrd, uint32_t dstrd, uint32_t nreps,
uint32_t cfg) {
uint32_t s;
volatile dm_task_t *t;
DM_PRINTF(10, "dm_memcpy2d_async %#x -> %#x size %d\n", src, dst,
(uint32_t)size);
// poll queue size
do {
s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
} while (s >= DM_TASK_QUEUE_SIZE);
_dm_mtx_lock();
// insert
t = &dm_p->queue[dm_p->queue_front];
t->src = src;
t->dst = dst;
t->size = size;
t->sstrd = sstrd;
t->dstrd = dstrd;
t->nreps = nreps;
t->twod = 1;
t->cfg = cfg;
// bump
__atomic_add_fetch(&dm_p->queue_fill, 1, __ATOMIC_RELAXED);
dm_p->queue_front = (dm_p->queue_front + 1) % DM_TASK_QUEUE_SIZE;
_dm_mtx_release();
}
inline void dm_start(void) { wake_dm(); }
inline void dm_wait(void) {
uint32_t s;
// signal data mover
wake_dm();
    // first, wait for the dm queue to drain and for any pending request to clear
do {
s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
} while (s != 0);
while (dm_p->stat_q)
;
// then, issue the STAT_WAIT_IDLE request so the DM core polls for the DMA
// to be idle
_dm_mtx_lock();
dm_p->stat_pvalid = 0;
// this is the request
dm_p->stat_q = STAT_WAIT_IDLE;
// signal data mover
wake_dm();
// whenever stat_pvalid is non-zero, the DMA has completed all transfers
while (!dm_p->stat_pvalid)
;
_dm_mtx_release();
}
inline void dm_wait_ready(void) {
_dm_mtx_lock();
dm_p->stat_pvalid = 0;
dm_p->stat_q = STAT_READY;
wake_dm();
while (!dm_p->stat_pvalid)
;
_dm_mtx_release();
}
#endif /* DM_H */
Updated on 2023-06-19 at 09:43:56 +0000