
snRuntime/src/dm.h

Classes

Name
struct dm_task_t
struct dm_t

Types

Name
enum en_stat { STAT_WAIT_IDLE = 1, STAT_EXIT = 2, STAT_READY = 3}
typedef enum en_stat en_stat_t

Functions

Name
void wfi_dm(uint32_t cluster_core_idx)
void wake_dm(void )
void dm_init(void )
Initialize the data mover and load a pointer to the DM struct into TLS. Must be called by the DM core itself and by all harts that want to use the dm_* functions.
void dm_main(void )
Data mover main function.
void dm_exit(void )
Instruct the data mover core to exit its main loop.
void dm_memcpy_async(void * dest, const void * src, size_t n)
Queue an asynchronous memory copy. The transfer is not started until dm_start or dm_wait is issued.
void dm_memcpy2d_async(uint64_t src, uint64_t dst, uint32_t size, uint32_t sstrd, uint32_t dstrd, uint32_t nreps, uint32_t cfg)
Queue an asynchronous two-dimensional (strided) memory copy. The transfer is not started until dm_start or dm_wait is issued.
void dm_start(void )
Trigger the start of queued transfers and return immediately.
void dm_wait(void )
Wait for all DMA transfers to complete.
void dm_wait_ready(void )
Wait for the DM core to be ready.

Attributes

Name
__thread volatile dm_t * dm_p
Per-thread (TLS) pointer to the data mover struct in TCDM for faster access.
volatile dm_t *volatile dm_p_global
Global pointer to the DM struct in TCDM; set by the DM core and read by all other harts.

Defines

Name
DM_TASK_QUEUE_SIZE
Number of outstanding transfers to buffer; each entry requires sizeof(dm_task_t) bytes.
_dm_mtx_lock()
_dm_mtx_release()
DM_STATUS_COMPLETE_ID
DM_STATUS_NEXT_ID
DM_STATUS_BUSY
DM_STATUS_WOULD_BLOCK
DM_PRINTF(d, ...)

Types Documentation

enum en_stat

Enumerator      Value  Description
STAT_WAIT_IDLE  1      Commands the DM core to wait until all transfers are complete.
STAT_EXIT       2      Abort and exit.
STAT_READY      3      Poll whether the DM core is ready.

typedef en_stat_t

typedef enum en_stat en_stat_t;
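
The stat_q / stat_p fields of dm_t form a minimal request/response handshake between a client hart and the DM core: stat_q carries the request (0 means no command) and the response in stat_p is valid iff stat_pvalid is non-zero. A minimal client-side sketch, mirroring dm_wait_ready from the source below:

_dm_mtx_lock();                 // one client at a time may use the handshake
dm_p->stat_pvalid = 0;          // invalidate any stale response
dm_p->stat_q = STAT_READY;      // post the request
wake_dm();                      // make sure the DM core is awake to serve it
while (!dm_p->stat_pvalid)      // spin until the DM core answers
    ;
_dm_mtx_release();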

Functions Documentation

function wfi_dm

inline void wfi_dm(
    uint32_t cluster_core_idx
)
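
Put the calling hart to sleep until a software interrupt arrives. With the cluster-local CLINT, the hart also registers itself in dm_p->dm_wfi so wake_dm can tell that it is actually asleep. Used by the DM core to idle when the queue is empty and no STAT request is pending.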

function wake_dm

inline void wake_dm(
    void 
)
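
Wake the DM core by raising a software interrupt. With the cluster-local CLINT, this first spins until the DM core has registered itself as sleeping (via dm_p->dm_wfi) so that the wakeup cannot be lost.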

function dm_init

inline void dm_init(
    void 
)

Initialize the data mover and load a pointer to the DM struct into TLS. Must be called by the DM core itself and by all harts that want to use the dm_* functions.
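
A minimal startup sketch (worker_entry and the surrounding runtime setup are hypothetical; snrt_is_dm_core is taken from the source below):

// Hypothetical entry point: every hart initializes first; the DM core
// then enters the service loop while the others go on to use the API.
void worker_entry(void) {
    dm_init();                  // all harts: publish/obtain the dm_t pointer
    if (snrt_is_dm_core()) {
        dm_main();              // DM core: serve transfers until dm_exit()
        return;
    }
    // compute cores continue here and may call dm_memcpy_async()/dm_wait()
}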

function dm_main

inline void dm_main(
    void 
)

Data mover main function. The main loop checks for a new transaction to issue and for any pending STAT request, and puts the core to sleep when the queue is empty and no request is pending.

function dm_exit

inline void dm_exit(
    void 
)

Instruct the data mover core to exit its main loop.

function dm_memcpy_async

inline void dm_memcpy_async(
    void * dest,
    const void * src,
    size_t n
)

Queue an asynchronous memory copy. The transfer is not started until dm_start or dm_wait is issued.

Parameters:

  • dest destination pointer
  • src source pointer
  • n number of bytes to copy

This call blocks only if the DM task queue is full.
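
A hedged usage sketch (dram_src and the 256-element tile size are assumed for illustration; snrt_l1alloc is taken from the source below):

// Queue a copy into TCDM, start it, overlap it with compute, synchronize.
extern const double *dram_src;     // assumed: some source buffer in DRAM
double *l1_buf = (double *)snrt_l1alloc(256 * sizeof(double));
dm_memcpy_async(l1_buf, dram_src, 256 * sizeof(double));  // queue only
dm_start();   // start the queued transfer and return immediately
// ... independent computation here ...
dm_wait();    // block until all queued transfers have completed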

function dm_memcpy2d_async

inline void dm_memcpy2d_async(
    uint64_t src,
    uint64_t dst,
    uint32_t size,
    uint32_t sstrd,
    uint32_t dstrd,
    uint32_t nreps,
    uint32_t cfg
)

Queue an asynchronous two-dimensional (strided) memory copy. The transfer is not started until dm_start or dm_wait is issued.

Parameters:

  • src source address
  • dst destination address
  • size size in inner dimension
  • sstrd outer source stride
  • dstrd outer destination stride
  • nreps number of repetitions in outer dimension
  • cfg DMA configuration

This call blocks only if the DM task queue is full.
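
A hedged usage sketch (src_matrix, l1_tile, and the sizes are assumed names and values; cfg = 0 is assumed to select the default configuration, as in the 1D path):

// Copy num_rows rows of row_bytes each from a strided source matrix into
// a densely packed destination tile (destination stride == row size).
extern uint64_t src_matrix, l1_tile;   // assumed source/destination addresses
uint32_t row_bytes = 128;              // assumed inner-dimension size
uint32_t src_pitch = 1024;             // assumed source row pitch
uint32_t num_rows = 32;                // assumed outer repetitions
dm_memcpy2d_async(src_matrix,          // source address
                  l1_tile,             // destination address
                  row_bytes,           // size in the inner dimension
                  src_pitch,           // outer source stride
                  row_bytes,           // outer destination stride: packed
                  num_rows,            // repetitions in the outer dimension
                  0);                  // cfg: assumed default configuration
dm_wait();                             // synchronize once the tile is needed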

function dm_start

inline void dm_start(
    void 
)

Trigger the start of queued transfers and return immediately.

function dm_wait

inline void dm_wait(
    void 
)

Wait for all DMA transfers to complete.

function dm_wait_ready

inline void dm_wait_ready(
    void 
)

Wait for the DM core to be ready.

Attributes Documentation

variable dm_p

__thread volatile dm_t * dm_p;

Per-thread (TLS) pointer to the data mover struct in TCDM for faster access.

variable dm_p_global

volatile dm_t *volatile dm_p_global;

Global pointer to the DM struct in TCDM; set by the DM core and read by all other harts.

Macros Documentation

define DM_TASK_QUEUE_SIZE

#define DM_TASK_QUEUE_SIZE 4

Number of outstanding transfers to buffer. Each entry requires sizeof(dm_task_t) bytes.

Note on the adjacent setting: define DM_USE_GLOBAL_CLINT to use the cluster-shared, CLINT-based SW interrupt system for synchronization. If it is not defined, the harts synchronize through the cluster-local CLINT, which is faster but only works for cluster-local synchronization; that is sufficient at the moment since the OpenMP runtime is single-cluster only.
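
With the default of 4 entries and sizeof(dm_task_t) = 40 bytes (two uint64_t plus six uint32_t fields, assuming natural alignment), the queue occupies 160 bytes of TCDM.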

define _dm_mtx_lock

#define _dm_mtx_lock(

)
snrt_mutex_acquire(&dm_p->mutex)

define _dm_mtx_release

#define _dm_mtx_release(

)
snrt_mutex_release(&dm_p->mutex)

define DM_STATUS_COMPLETE_ID

#define DM_STATUS_COMPLETE_ID 0

Status query IDs for the DMA status call (__builtin_sdma_stat).

define DM_STATUS_NEXT_ID

#define DM_STATUS_NEXT_ID 1

define DM_STATUS_BUSY

#define DM_STATUS_BUSY 2

define DM_STATUS_WOULD_BLOCK

#define DM_STATUS_WOULD_BLOCK 3

define DM_PRINTF

#define DM_PRINTF(
    d,
    ...
)

Source code

// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#ifndef DM_H
#define DM_H

//================================================================================
// Settings
//================================================================================

// #define DM_USE_GLOBAL_CLINT

#define DM_TASK_QUEUE_SIZE 4

//================================================================================
// Macros
//================================================================================

#define _dm_mtx_lock() snrt_mutex_acquire(&dm_p->mutex)
#define _dm_mtx_release() snrt_mutex_release(&dm_p->mutex)

#define DM_STATUS_COMPLETE_ID 0
#define DM_STATUS_NEXT_ID 1
#define DM_STATUS_BUSY 2
#define DM_STATUS_WOULD_BLOCK 3

//================================================================================
// Debug
//================================================================================

// #define DM_DEBUG_LEVEL 100

#ifdef DM_DEBUG_LEVEL
#include "printf.h"
#define _DM_PRINTF(...)             \
    if (1) {                        \
        printf("[dm] "__VA_ARGS__); \
    }
#define DM_PRINTF(d, ...)        \
    if (DM_DEBUG_LEVEL >= d) {   \
        _DM_PRINTF(__VA_ARGS__); \
    }
#else
#define DM_PRINTF(d, ...)
#endif

//================================================================================
// Types
//================================================================================

typedef struct {
    uint64_t src;
    uint64_t dst;
    uint32_t size;
    uint32_t sstrd;
    uint32_t dstrd;
    uint32_t nreps;
    uint32_t cfg;
    uint32_t twod;
} dm_task_t;

// used for ultra-fine grained communication
// stat_q can be used to request a command, 0 is no command
// the response is put into stat_p and is valid iff stat_pvalid is non-zero
typedef enum en_stat {
    // commands the DM core to wait until all transfers are complete
    STAT_WAIT_IDLE = 1,
    // abort and exit
    STAT_EXIT = 2,
    // poll if DM is ready
    STAT_READY = 3,
} en_stat_t;

typedef struct {
    dm_task_t queue[DM_TASK_QUEUE_SIZE];
    uint32_t queue_back;
    uint32_t queue_front;
    volatile uint32_t queue_fill;
    volatile uint32_t mutex;
    volatile en_stat_t stat_q;
    volatile uint32_t stat_p;
    volatile uint32_t stat_pvalid;
    volatile uint32_t dm_wfi;
} dm_t;

//================================================================================
// Data
//================================================================================

extern __thread volatile dm_t *dm_p;
extern volatile dm_t *volatile dm_p_global;

//================================================================================
// Functions
//================================================================================

#ifdef DM_USE_GLOBAL_CLINT
inline void wfi_dm(uint32_t cluster_core_idx) {
    (void)cluster_core_idx;
    snrt_int_sw_poll();
}
inline void wake_dm(void) {
    uint32_t basehart = snrt_cluster_core_base_hartid();
    snrt_int_sw_set(basehart + snrt_cluster_dm_core_idx());
}
#else
inline void wfi_dm(uint32_t cluster_core_idx) {
    __atomic_add_fetch(&dm_p->dm_wfi, 1, __ATOMIC_RELAXED);
    snrt_wfi();
    snrt_int_cluster_clr(1 << cluster_core_idx);
    __atomic_add_fetch(&dm_p->dm_wfi, -1, __ATOMIC_RELAXED);
}
inline void wake_dm(void) {
    // wait for DM to sleep before sending wakeup
    while (!__atomic_load_n(&dm_p->dm_wfi, __ATOMIC_RELAXED))
        ;
    snrt_int_cluster_set(1 << snrt_cluster_compute_core_num());
}
#endif  // #ifdef DM_USE_GLOBAL_CLINT

inline void dm_init(void) {
    // create a data mover instance
    if (snrt_is_dm_core()) {
#ifdef DM_USE_GLOBAL_CLINT
        snrt_interrupt_enable(IRQ_M_SOFT);
#else
        snrt_interrupt_enable(IRQ_M_CLUSTER);
#endif
        dm_p = (dm_t *)snrt_l1alloc(sizeof(dm_t));
        snrt_memset((void *)dm_p, 0, sizeof(dm_t));
        dm_p_global = dm_p;
    } else {
        while (!dm_p_global)
            ;
        dm_p = dm_p_global;
    }
}

inline void dm_main(void) {
    volatile dm_task_t *t;
    uint32_t do_exit = 0;
    uint32_t cluster_core_idx = snrt_cluster_core_idx();

    DM_PRINTF(10, "enter main\n");

    while (!do_exit) {
        if (dm_p->queue_fill) {
            // wait until DMA is ready
            while (__builtin_sdma_stat(DM_STATUS_WOULD_BLOCK))
                ;

            t = &dm_p->queue[dm_p->queue_back];

            if (t->twod) {
                DM_PRINTF(10, "start twod\n");
                __builtin_sdma_start_twod(t->src, t->dst, t->size, t->sstrd,
                                          t->dstrd, t->nreps, t->cfg);
            } else {
                DM_PRINTF(10, "start oned\n");
                __builtin_sdma_start_oned(t->src, t->dst, t->size, t->cfg);
            }

            // bump
            dm_p->queue_back = (dm_p->queue_back + 1) % DM_TASK_QUEUE_SIZE;
            __atomic_add_fetch(&dm_p->queue_fill, -1, __ATOMIC_RELAXED);
        }

        if (dm_p->stat_q) {
            switch (dm_p->stat_q) {
                case STAT_WAIT_IDLE:
                    // check status and set pvalid if DMA is idle and clear
                    // request
                    if (__builtin_sdma_stat(DM_STATUS_BUSY) == 0) {
                        DM_PRINTF(50, "idle\n");
                        dm_p->stat_pvalid = 1;
                        dm_p->stat_q = 0;
                    }
                    break;
                case STAT_EXIT:
                    do_exit = 1;
                    break;
                case STAT_READY:
                    DM_PRINTF(50, "ready\n");
                    dm_p->stat_pvalid = 1;
                    dm_p->stat_q = 0;
                    break;
            }
        }

        // sleep if queue is empty and no stats pending
        if (!dm_p->queue_fill && !dm_p->stat_q) {
            wfi_dm(cluster_core_idx);
        }
    }
    DM_PRINTF(10, "dm: exit\n");
#ifdef DM_USE_GLOBAL_CLINT
    snrt_interrupt_disable(IRQ_M_SOFT);
#else
    snrt_interrupt_disable(IRQ_M_CLUSTER);
#endif
    return;
}

inline void dm_exit(void) {
    dm_p->stat_q = STAT_EXIT;
    // signal data mover
    wake_dm();
}

inline void dm_memcpy_async(void *dest, const void *src, size_t n) {
    uint32_t s;
    volatile dm_task_t *t;

    DM_PRINTF(10, "dm_memcpy_async %#x -> %#x size %d\n", src, dest,
              (uint32_t)n);

    // poll queue size
    do {
        s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
    } while (s >= DM_TASK_QUEUE_SIZE);
    _dm_mtx_lock();

    // insert
    t = &dm_p->queue[dm_p->queue_front];
    t->src = (uint64_t)src;
    t->dst = (uint64_t)dest;
    t->size = (uint32_t)n;
    t->twod = 0;
    t->cfg = 0;

    // bump
    __atomic_add_fetch(&dm_p->queue_fill, 1, __ATOMIC_RELAXED);
    dm_p->queue_front = (dm_p->queue_front + 1) % DM_TASK_QUEUE_SIZE;

    _dm_mtx_release();
}

inline void dm_memcpy2d_async(uint64_t src, uint64_t dst, uint32_t size,
                              uint32_t sstrd, uint32_t dstrd, uint32_t nreps,
                              uint32_t cfg) {
    uint32_t s;
    volatile dm_task_t *t;

    DM_PRINTF(10, "dm_memcpy2d_async %#x -> %#x size %d\n", src, dst,
              (uint32_t)size);

    // poll queue size
    do {
        s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
    } while (s >= DM_TASK_QUEUE_SIZE);
    _dm_mtx_lock();

    // insert
    t = &dm_p->queue[dm_p->queue_front];
    t->src = src;
    t->dst = dst;
    t->size = size;
    t->sstrd = sstrd;
    t->dstrd = dstrd;
    t->nreps = nreps;
    t->twod = 1;
    t->cfg = cfg;

    // bump
    __atomic_add_fetch(&dm_p->queue_fill, 1, __ATOMIC_RELAXED);
    dm_p->queue_front = (dm_p->queue_front + 1) % DM_TASK_QUEUE_SIZE;

    _dm_mtx_release();
}

inline void dm_start(void) { wake_dm(); }

inline void dm_wait(void) {
    uint32_t s;

    // signal data mover
    wake_dm();

    // first, wait for the dm queue to be empty and no request be pending
    do {
        s = __atomic_load_n(&dm_p->queue_fill, __ATOMIC_RELAXED);
    } while (s != 0);
    while (dm_p->stat_q)
        ;

    // then, issue the STAT_WAIT_IDLE request so the DM core polls for the DMA
    // to be idle
    _dm_mtx_lock();
    dm_p->stat_pvalid = 0;
    // this is the request
    dm_p->stat_q = STAT_WAIT_IDLE;
    // signal data mover
    wake_dm();
    // whenever stat_pvalid is non-zero, the DMA has completed all transfers
    while (!dm_p->stat_pvalid)
        ;
    _dm_mtx_release();
}

inline void dm_wait_ready(void) {
    _dm_mtx_lock();
    dm_p->stat_pvalid = 0;
    dm_p->stat_q = STAT_READY;
    wake_dm();
    while (!dm_p->stat_pvalid)
        ;
    _dm_mtx_release();
}

#endif /* DM_H */

Updated on 2023-06-19 at 09:43:56 +0000