snRuntime/src/dm.h
Classes
Name | |
---|---|
struct | dm_task_t |
struct | dm_t |
Types
Name | |
---|---|
enum | en_stat { STAT_WAIT_IDLE = 1, STAT_EXIT = 2, STAT_READY = 3} |
typedef enum en_stat | en_stat_t |
Functions
Name | |
---|---|
void | wfi_dm(uint32_t cluster_core_idx)
void | wake_dm(void)
void | dm_init(void) Initialize the data mover and load a pointer to the DM struct into TLS. Must be called by the DM core itself and by all harts that want to use the dm functions.
void | dm_main(void) Data mover main function.
void | dm_exit(void) Signal the data mover to exit.
void | dm_memcpy_async(void * dest, const void * src, size_t n) Queue an asynchronous memory copy. The transfer is not started until dm_start or dm_wait is issued.
void | dm_memcpy2d_async(uint64_t src, uint64_t dst, uint32_t size, uint32_t sstrd, uint32_t dstrd, uint32_t nreps, uint32_t cfg) Queue an asynchronous 2D memory copy. The transfer is not started until dm_start or dm_wait is issued.
void | dm_start(void) Trigger the start of queued transfers and return immediately.
void | dm_wait(void) Wait for all DMA transfers to complete.
void | dm_wait_ready(void) Wait for the DM core to be ready.
Attributes
Name | |
---|---|
__thread volatile dm_t * | dm_p Per-thread (TLS) pointer to the data mover struct in TCDM, for faster access.
volatile dm_t *volatile | dm_p_global Pointer to where the DM struct in TCDM is located. |
Defines
Name | |
---|---|
DM_TASK_QUEUE_SIZE | Number of outstanding transactions to buffer. Each requires sizeof(dm_task_t) bytes. |
_dm_mtx_lock() | |
_dm_mtx_release() | |
DM_STATUS_COMPLETE_ID | |
DM_STATUS_NEXT_ID | |
DM_STATUS_BUSY | |
DM_STATUS_WOULD_BLOCK | |
DM_PRINTF(d, ...) |
Types Documentation
enum en_stat
Enumerator | Value | Description |
---|---|---|
STAT_WAIT_IDLE | 1 | Commands the DM core to wait until all transfers are complete. |
STAT_EXIT | 2 | Abort and exit. |
STAT_READY | 3 | Poll whether the DM core is ready. |
typedef en_stat_t
typedef enum en_stat en_stat_t;
Functions Documentation
function wfi_dm
inline void wfi_dm(
uint32_t cluster_core_idx
)
function wake_dm
inline void wake_dm(
void
)
function dm_init
inline void dm_init(
void
)
Initialize the data mover and load a pointer to the DM struct into TLS. Must be called by the DM core itself and by all harts that want to use the dm functions.
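A minimal usage sketch (hedged: the main() entry convention and the snrt.h include are assumptions about the surrounding snRuntime, not part of this header):

#include "snrt.h"
#include "dm.h"

int main(void) {
    // every hart calls dm_init(); the DM core allocates the struct,
    // the other harts spin until dm_p_global is published
    dm_init();
    if (snrt_is_dm_core()) {
        dm_main(); // serve queued transfers until a STAT_EXIT request arrives
        return 0;
    }
    dm_wait_ready(); // make sure the DM core is up before queueing work
    // ... queue transfers with dm_memcpy_async()/dm_memcpy2d_async() ...
    dm_wait();       // block until all queued transfers have completed
    dm_exit();       // assuming a single worker core; sends the DM core to exit
    return 0;
}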
function dm_main
inline void dm_main(
void
)
Data mover main function. The loop alternates between issuing the oldest queued transfer, if any, and serving STAT requests posted in stat_q; when the queue is empty and no request is pending, the core sleeps in wfi_dm().
function dm_exit
inline void dm_exit(
void
)
Signal the data mover to exit.
function dm_memcpy_async
inline void dm_memcpy_async(
void * dest,
const void * src,
size_t n
)
Queue an asynchronous memory copy. The transfer is not started until dm_start or dm_wait is issued.
Parameters:
- dest destination pointer
- src source pointer
- n number of bytes to copy
Blocks only if the DM queue is full.
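A hedged example of the intended overlap pattern (the helper name is illustrative, not part of the API):

// Queue a copy, kick it off, and overlap it with independent work.
static inline void copy_and_overlap(void *dst, const void *src, size_t len) {
    dm_memcpy_async(dst, src, len); // queued only; the DM core may be asleep
    dm_start();                     // wake the DM core and return immediately
    // ... do independent computation here while the DMA runs ...
    dm_wait();                      // block until all queued transfers are done
}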
function dm_memcpy2d_async
inline void dm_memcpy2d_async(
uint64_t src,
uint64_t dst,
uint32_t size,
uint32_t sstrd,
uint32_t dstrd,
uint32_t nreps,
uint32_t cfg
)
Queue an asynchronous 2D memory copy. The transfer is not started until dm_start or dm_wait is issued.
Parameters:
- src source address
- dst destination address
- size size of the inner dimension
- sstrd source stride in the outer dimension
- dstrd destination stride in the outer dimension
- nreps number of repetitions in the outer dimension
- cfg DMA configuration
Blocks only if the DM queue is full.
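A hedged sketch for loading a dense tile out of a larger row-major matrix (the helper name is illustrative; cfg = 0 is assumed to select the default DMA configuration, as dm_memcpy_async does):

// Copy `rows` rows of `row_bytes` each into a densely packed destination tile.
static inline void load_tile(uint64_t src, uint64_t dst, uint32_t row_bytes,
                             uint32_t src_pitch, uint32_t rows) {
    dm_memcpy2d_async(src, dst, row_bytes,
                      src_pitch, // source advances a full matrix row per repetition
                      row_bytes, // destination rows are packed back to back
                      rows, 0);
    dm_wait(); // or dm_start() to overlap the transfer with computation
}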
function dm_start
inline void dm_start(
void
)
Trigger the start of queued transfers and return immediately.
function dm_wait
inline void dm_wait(
void
)
Wait for all DMA transfers to complete.
function dm_wait_ready
inline void dm_wait_ready(
void
)
Wait for the DM core to be ready.
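Note: dm_wait and dm_wait_ready share the request/response handshake described in the source comments below: under the mutex, the caller clears stat_pvalid, posts a command into stat_q, wakes the DM core, and spins until the DM core acknowledges by setting stat_pvalid. dm_exit posts STAT_EXIT without waiting for a response.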
Attributes Documentation
variable dm_p
__thread volatile dm_t * dm_p;
Per-thread (TLS) pointer to the data mover struct in TCDM, for faster access.
variable dm_p_global
volatile dm_t *volatile dm_p_global;
Pointer to where the DM struct in TCDM is located.
Macros Documentation
define DM_TASK_QUEUE_SIZE
#define DM_TASK_QUEUE_SIZE 4
Number of outstanding transactions to buffer. Each requires sizeof(dm_task_t) bytes.
Note: the related setting DM_USE_GLOBAL_CLINT (commented out by default in the Settings section of the source below) selects the cluster-shared, CLINT-based SW interrupt system for synchronization. If it is not defined, the harts synchronize through the cluster-local CLINT, which is faster but only works for cluster-local synchronization; that is sufficient at the moment since the OpenMP runtime is single-cluster only.
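For scale: assuming no struct padding, sizeof(dm_task_t) is 40 bytes (two uint64_t plus six uint32_t fields), so the default queue of 4 entries occupies 160 bytes of TCDM.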
define _dm_mtx_lock
#define _dm_mtx_lock(
)
snrt_mutex_acquire(&dm_p->mutex)
define _dm_mtx_release
#define _dm_mtx_release(
)
snrt_mutex_release(&dm_p->mutex)
define DM_STATUS_COMPLETE_ID
#define DM_STATUS_COMPLETE_ID 0
Status selectors for the DMA status call (__builtin_sdma_stat); this and the following DM_STATUS_* defines form one set.
define DM_STATUS_NEXT_ID
#define DM_STATUS_NEXT_ID 1
define DM_STATUS_BUSY
#define DM_STATUS_BUSY 2
define DM_STATUS_WOULD_BLOCK
#define DM_STATUS_WOULD_BLOCK 3
define DM_PRINTF
#define DM_PRINTF(
d,
...
)
Source code
// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#ifndef DM_H
#define DM_H
//================================================================================
// Settings
//================================================================================
// #define DM_USE_GLOBAL_CLINT
#define DM_TASK_QUEUE_SIZE 4
//================================================================================
// Macros
//================================================================================
#define _dm_mtx_lock() snrt_mutex_acquire(&dm_p->mutex)
#define _dm_mtx_release() snrt_mutex_release(&dm_p->mutex)
#define DM_STATUS_COMPLETE_ID 0
#define DM_STATUS_NEXT_ID 1
#define DM_STATUS_BUSY 2
#define DM_STATUS_WOULD_BLOCK 3
//================================================================================
// Debug
//================================================================================
// #define DM_DEBUG_LEVEL 100
#ifdef DM_DEBUG_LEVEL
#include "printf.h"
#define _DM_PRINTF(...) \
if (1) { \
printf("[dm] "__VA_ARGS__); \
}
#define DM_PRINTF(d, ...) \
if (DM_DEBUG_LEVEL >= d) { \
_DM_PRINTF(__VA_ARGS__); \
}
#else
#define DM_PRINTF(d, ...)
#endif
//================================================================================
// Types
//================================================================================
typedef struct {
uint64_t src;
uint64_t dst;
uint32_t size;
uint32_t sstrd;
uint32_t dstrd;
uint32_t nreps;
uint32_t cfg;
uint32_t twod;
} dm_task_t;
// used for ultra-fine grained communication
// stat_q can be used to request a command, 0 is no command
// the response is put into stat_p and is valid iff stat_pvalid is non-zero
typedef enum en_stat {
// commands the DM core to wait until all transfers are complete
STAT_WAIT_IDLE = 1,
// abort and exit
STAT_EXIT = 2,
// poll if DM is ready
STAT_READY = 3,
} en_stat_t;
typedef struct {
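    // ring buffer of queued transfers: queue_front is the producer index,
    // queue_back the consumer index, queue_fill the current occupancy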
dm_task_t queue[DM_TASK_QUEUE_SIZE];
uint32_t queue_back;
uint32_t queue_front;
volatile uint32_t queue_fill;
volatile uint32_t mutex;
volatile en_stat_t stat_q;
volatile uint32_t stat_p;
volatile uint32_t stat_pvalid;
volatile uint32_t dm_wfi;
} dm_t;
//================================================================================
// Data
//================================================================================
extern __thread volatile dm_t *dm_p;
extern volatile dm_t *volatile dm_p_global;
//================================================================================
// Functions
//================================================================================
#ifdef DM_USE_GLOBAL_CLINT
inline void wfi_dm(uint32_t cluster_core_idx) {
(void)cluster_core_idx;
snrt_int_sw_poll();
}
inline void wake_dm(void) {
uint32_t basehart = snrt_cluster_core_base_hartid();
snrt_int_sw_set(basehart + snrt_cluster_dm_core_idx());
}
#else
inline void wfi_dm(uint32_t cluster_core_idx) {
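    // advertise that this core is about to sleep so wake_dm() knows the
    // wakeup interrupt will be observed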
__atomic_add_fetch(&dm_p->dm_wfi, 1, __ATOMIC_RELAXED);
snrt_wfi();
snrt_int_cluster_clr(1 << cluster_core_idx);
__atomic_add_fetch(&dm_p->dm_wfi, -1, __ATOMIC_RELAXED);
}
inline void wake_dm(void) {
// wait for DM to sleep before sending wakeup
while (!__atomic_load_n(&dm_p->dm_wfi, __ATOMIC_RELAXED))
;
snrt_int_cluster_set(1 << snrt_cluster_compute_core_num());
}
#endif // #ifdef DM_USE_GLOBAL_CLINT
inline void dm_init(void) {
// create a data mover instance
if (snrt_is_dm_core()) {
#ifdef DM_USE_GLOBAL_CLINT
snrt_interrupt_enable(IRQ_M_SOFT);
#else
snrt_interrupt_enable(IRQ_M_CLUSTER);
#endif
dm_p = (dm_t *)snrt_l1alloc(sizeof(dm_t));
snrt_memset((void *)dm_p, 0, sizeof(dm_t));
dm_p_global = dm_p;
} else {
while (!dm_p_global)
;
dm_p = dm_p_global;
}
}
inline void dm_main(void) {
volatile dm_task_t *t;
uint32_t do_exit = 0;
uint32_t cluster_core_idx = snrt_cluster_core_idx();
DM_PRINTF(10, "enter main\n");
while (!do_exit) {
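        // new transaction to issue?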
if (dm_p->queue_fill) {
// wait until DMA is ready
while (__builtin_sdma_stat(DM_STATUS_WOULD_BLOCK))
;
t = &dm_p->queue[dm_p->queue_back];
if (t->twod) {
DM_PRINTF(10, "start twod\n");
__builtin_sdma_start_twod(t->src, t->dst, t->size, t->sstrd,
t->dstrd, t->nreps, t->cfg);
} else {
DM_PRINTF(10, "start oned\n");
__builtin_sdma_start_oned(t->src, t->dst, t->size, t->cfg);
}
// bump
dm_p->queue_back = (dm_p->queue_back + 1) % DM_TASK_QUEUE_SIZE;
__atomic_add_fetch(&dm_p->queue_fill, -1, __ATOMIC_RELAXED);
}
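        // any STAT request pending?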
if (dm_p->stat_q) {
switch (dm_p->stat_q) {
case STAT_WAIT_IDLE:
// check status and set pvalid if DMA is idle and clear
// request
if (__builtin_sdma_stat(DM_STATUS_BUSY) == 0) {
DM_PRINTF(50, "idle\n");
dm_p->stat_pvalid = 1;
dm_p->stat_q = 0;
}
break;
case STAT_EXIT:
do_exit = 1;
break;
case STAT_READY:
DM_PRINTF(50, "ready\n");
dm_p->stat_pvalid = 1;
dm_p->stat_q = 0;
break;
}
}
// sleep if queue is empty and no stats pending
if (!dm_p->queue_fill && !dm_p->stat_q) {
wfi_dm(cluster_core_idx);
}
}
DM_PRINTF(10, "dm: exit\n");
#ifdef DM_USE_GLOBAL_CLINT
snrt_interrupt_disable(IRQ_M_SOFT);
#else
snrt_interrupt_disable(IRQ_M_CLUSTER);
#endif
return;
}
inline void dm_exit(void) {
dm_p->stat_q = STAT_EXIT;
// signal data mover
wake_dm();
}
inline void dm_memcpy_async(void *dest, const void *src, size_t n) {
uint32_t s;
volatile dm_task_t *t;
DM_PRINTF(10, "dm_memcpy_async %#x -> %#x size %d\n", src, dest,
(uint32_t)n);
// poll queue size
do {
s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
} while (s >= DM_TASK_QUEUE_SIZE);
_dm_mtx_lock();
// insert
t = &dm_p->queue[dm_p->queue_front];
t->src = (uint64_t)src;
t->dst = (uint64_t)dest;
t->size = (uint32_t)n;
t->twod = 0;
t->cfg = 0;
// bump
__atomic_add_fetch(&dm_p->queue_fill, 1, __ATOMIC_RELAXED);
dm_p->queue_front = (dm_p->queue_front + 1) % DM_TASK_QUEUE_SIZE;
_dm_mtx_release();
}
inline void dm_memcpy2d_async(uint64_t src, uint64_t dst, uint32_t size,
uint32_t sstrd, uint32_t dstrd, uint32_t nreps,
uint32_t cfg) {
uint32_t s;
volatile dm_task_t *t;
DM_PRINTF(10, "dm_memcpy2d_async %#x -> %#x size %d\n", src, dst,
(uint32_t)size);
// poll queue size
do {
s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
} while (s >= DM_TASK_QUEUE_SIZE);
_dm_mtx_lock();
// insert
t = &dm_p->queue[dm_p->queue_front];
t->src = src;
t->dst = dst;
t->size = size;
t->sstrd = sstrd;
t->dstrd = dstrd;
t->nreps = nreps;
t->twod = 1;
t->cfg = cfg;
// bump
__atomic_add_fetch(&dm_p->queue_fill, 1, __ATOMIC_RELAXED);
dm_p->queue_front = (dm_p->queue_front + 1) % DM_TASK_QUEUE_SIZE;
_dm_mtx_release();
}
inline void dm_start(void) { wake_dm(); }
inline void dm_wait(void) {
uint32_t s;
// signal data mover
wake_dm();
    // first, wait for the dm queue to drain and for any pending request to clear
do {
s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
} while (s != 0);
while (dm_p->stat_q)
;
// then, issue the STAT_WAIT_IDLE request so the DM core polls for the DMA
// to be idle
_dm_mtx_lock();
dm_p->stat_pvalid = 0;
// this is the request
dm_p->stat_q = STAT_WAIT_IDLE;
// signal data mover
wake_dm();
// whenever stat_pvalid is non-zero, the DMA has completed all transfers
while (!dm_p->stat_pvalid)
;
_dm_mtx_release();
}
inline void dm_wait_ready(void) {
_dm_mtx_lock();
dm_p->stat_pvalid = 0;
dm_p->stat_q = STAT_READY;
wake_dm();
while (!dm_p->stat_pvalid)
;
_dm_mtx_release();
}
#endif /* DM_H */
Updated on 2023-06-19 at 09:43:56 +0000