snRuntime/src/omp/eu.h
Functions
| Name | |
|---|---|
| void | wait_worker_wfi(void ) | 
| void | wake_workers(void ) When using the CLINT as wakeup. | 
| void | worker_wfi(uint32_t cluster_core_idx) | 
| void | eu_print_status() Debugging info to printf. | 
| void | eu_mutex_lock() Acquires the event unit mutex, exits only on success. | 
| void | eu_mutex_release() Releases the acquired mutex. | 
| uint32_t | eu_get_workers_in_loop() | 
| uint32_t | eu_get_workers_in_wfi() | 
| void | eu_init(void ) Initialize the event unit. | 
| void | eu_exit(uint32_t core_idx) send all workers in loop to exit() | 
| void | eu_event_loop(uint32_t cluster_core_idx) Enter the event unit loop, never exits. | 
| int | eu_dispatch_push(void(\*)(void \*, uint32_t) fn, uint32_t argc, void \* data, uint32_t nthreads) Set function to execute by nthreads number of threads. | 
| void | eu_run_empty(uint32_t core_idx) wait for all workers to idle | 
Attributes
| Name | |
|---|---|
| __thread volatile eu_t * | eu_p Pointer to the event unit struct only initialized after call to eu_init for main thread and call to eu_event_loop for worker threads. | 
| volatile eu_t *volatile | eu_p_global Pointer to where the event unit struct (eu_t) in TCDM is located. | 
Defines
| Name | |
|---|---|
| EU_PRINTF(d, ...) Define EU_USE_GLOBAL_CLINT to use the cluster-shared CLINT based SW interrupt system for synchronization. If not defined, the harts use the cluster-local CLINT to synchronize, which is faster but only works for cluster-local synchronization; this is sufficient at the moment since the OpenMP runtime is single cluster only. | 
Functions Documentation
function wait_worker_wfi
inline void wait_worker_wfi(
    void 
)
function wake_workers
inline void wake_workers(
    void 
)
When using the CLINT as wakeup.
function worker_wfi
inline void worker_wfi(
    uint32_t cluster_core_idx
)
function eu_print_status
inline void eu_print_status()
Debugging info to printf.
function eu_mutex_lock
inline void eu_mutex_lock()
Acquires the event unit mutex, exits only on success.
function eu_mutex_release
inline void eu_mutex_release()
Releases the acquired mutex.
function eu_get_workers_in_loop
inline uint32_t eu_get_workers_in_loop()
Getters
function eu_get_workers_in_wfi
inline uint32_t eu_get_workers_in_wfi()
function eu_init
inline void eu_init(
    void 
)
Initialize the event unit.
function eu_exit
inline void eu_exit(
    uint32_t core_idx
)
send all workers in loop to exit()
Parameters:
- core_idx cluster-local core index
function eu_event_loop
inline void eu_event_loop(
    uint32_t cluster_core_idx
)
Enter the event unit loop, never exits.
Parameters:
- cluster_core_idx cluster-local core index
function eu_dispatch_push
inline int eu_dispatch_push(
    void(*)(void *, uint32_t) fn,
    uint32_t argc,
    void * data,
    uint32_t nthreads
)
Set function to execute by nthreads number of threads. 
Parameters:
- fn pointer to worker function to be executed
- data pointer to function arguments
- argc number of elements in data
- nthreads number of threads that have to execute this event
function eu_run_empty
inline void eu_run_empty(
    uint32_t core_idx
)
wait for all workers to idle
Parameters:
- core_idx cluster-local core index
Attributes Documentation
variable eu_p
__thread volatile eu_t * eu_p;
Pointer to the event unit struct only initialized after call to eu_init for main thread and call to eu_event_loop for worker threads.
variable eu_p_global
volatile eu_t *volatile eu_p_global;
Pointer to where the event unit struct (eu_t) in TCDM is located.
Macros Documentation
define EU_PRINTF
#define EU_PRINTF(
    d,
    ...
)
Define EU_USE_GLOBAL_CLINT to use the cluster-shared CLINT based SW interrupt system for synchronization. If not defined, the harts use the cluster-local CLINT to synchronize, which is faster but only works for cluster-local synchronization; this is sufficient at the moment since the OpenMP runtime is single cluster only.
Source code
// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#ifndef EU_H
#define EU_H
#include "eu_decls.h"
//================================================================================
// Settings
//================================================================================
// #define EU_USE_GLOBAL_CLINT
//================================================================================
// Debug
//================================================================================
#ifdef EU_DEBUG_LEVEL
#include "printf.h"
// Unconditionally print a message prefixed with "[eu] ". The first argument
// must be a string literal because it is concatenated with the prefix.
// Wrapped in do/while(0) so the expansion is exactly one statement and can be
// used safely in an unbraced if/else.
#define _EU_PRINTF(...)              \
    do {                             \
        printf("[eu] " __VA_ARGS__); \
    } while (0)
// Print a message only when the compile-time EU_DEBUG_LEVEL is at least `d`.
#define EU_PRINTF(d, ...)            \
    do {                             \
        if (EU_DEBUG_LEVEL >= (d)) { \
            _EU_PRINTF(__VA_ARGS__); \
        }                            \
    } while (0)
#else
// Debug output disabled: expands to a single empty statement so call sites
// behave the same syntactically with and without EU_DEBUG_LEVEL.
#define EU_PRINTF(d, ...) \
    do {                  \
    } while (0)
#endif
//================================================================================
// Data
//================================================================================
// Thread-local pointer to the event unit struct. eu_init() assigns it on every
// core: core 0 allocates and zeroes the struct, the other cores copy the
// address from eu_p_global once it has been published.
extern __thread volatile eu_t *eu_p;
// Shared copy of the event unit pointer. Written by core 0 in eu_init(); the
// remaining cores spin in eu_init() until it becomes non-NULL.
extern volatile eu_t *volatile eu_p_global;
//================================================================================
// Functions
//================================================================================
// Busy-wait until the workers_wfi counter equals the number of workers that
// have entered the event loop. workers_in_loop is sampled once, before the
// spin starts.
inline void wait_worker_wfi(void) {
    const uint32_t expected = eu_p->workers_in_loop;
    uint32_t parked;
    do {
        parked = __atomic_load_n(&eu_p->workers_wfi, __ATOMIC_RELAXED);
    } while (parked != expected);
}
#ifdef EU_USE_GLOBAL_CLINT
// Wake the worker harts of this cluster through snrt_int_clint_set()
// (variant compiled when EU_USE_GLOBAL_CLINT is defined). The main thread,
// cluster hart 0, is never included in the wake-up mask.
inline void wake_workers(void) {
#ifdef OMPSTATIC_NUMTHREADS
// Bits 1..OMPSTATIC_NUMTHREADS-1 set, bit 0 (main thread) cleared. Kept
// unsigned so the shifts below are well defined.
#define WAKE_MASK (((1u << OMPSTATIC_NUMTHREADS) - 1u) & ~0x1u)
    // Fast wake-up for static number of worker threads
    const uint32_t basehart = snrt_cluster_core_base_hartid();
    const uint32_t reg = basehart / 32;
    const uint32_t bit = basehart % 32;
    snrt_int_clint_set(reg, WAKE_MASK << bit);
    if (bit + OMPSTATIC_NUMTHREADS > 32) {
        // wake-up is split over two CLINT registers: the bits that did not
        // fit into the first register go to the low bits of the next one
        snrt_int_clint_set(reg + 1, WAKE_MASK >> (32 - bit));
    }
#else
    // wake all worker cores except the main thread
    const uint32_t numcores = snrt_cluster_compute_core_num();
    const uint32_t basehart = snrt_cluster_core_base_hartid();
    uint32_t mask = 0;
    for (uint32_t hart = 1; hart < numcores; ++hart) {
        const uint32_t id = basehart + hart;
        // Bit position within the 32-bit register holding `id`; shifting by
        // the absolute hart id would be undefined for ids >= 32.
        mask |= 1u << (id % 32);
        if (id % 32 == 31) {
            // last bit of this register: flush it and start the next one
            snrt_int_clint_set(id / 32, mask);
            mask = 0;
        }
    }
    // flush the partially filled register that holds the last worker
    if (mask) snrt_int_clint_set((basehart + numcores - 1) / 32, mask);
#endif
}
inline void worker_wfi(uint32_t cluster_core_idx) {
    __atomic_add_fetch(&eu_p->workers_wfi, 1, __ATOMIC_RELAXED);
    snrt_int_sw_poll();
    __atomic_add_fetch(&eu_p->workers_wfi, -1, __ATOMIC_RELAXED);
}
#else  // #ifdef EU_USE_GLOBAL_CLINT
// Wake the worker harts through the cluster-local interrupt (variant compiled
// when EU_USE_GLOBAL_CLINT is not defined).
inline void wake_workers(void) {
    // Only raise the interrupt once every worker is parked in wfi
    wait_worker_wfi();
    // One bit per cluster-relative hart ID. Bit 0 belongs to the main thread
    // and is masked out so that it is not woken.
    const uint32_t numcores = snrt_cluster_compute_core_num();
    const uint32_t all_harts = (1 << numcores) - 1;
    const uint32_t workers = all_harts & ~0x1;
    snrt_int_cluster_set(workers);
}
inline void worker_wfi(uint32_t cluster_core_idx) {
    __atomic_add_fetch(&eu_p->workers_wfi, 1, __ATOMIC_RELAXED);
    snrt_wfi();
    snrt_int_cluster_clr(1 << cluster_core_idx);
    __atomic_add_fetch(&eu_p->workers_wfi, -1, __ATOMIC_RELAXED);
}
#endif  // #ifdef EU_USE_GLOBAL_CLINT
// Debugging info to printf: reports the workers_in_loop counter through
// EU_PRINTF at level 0. Expands to nothing unless EU_DEBUG_LEVEL is defined.
inline void eu_print_status(void) {
    EU_PRINTF(0, "workers_in_loop=%d\n", eu_p->workers_in_loop);
}
// Acquires the event unit mutex (eu_p->workers_mutex) via
// snrt_mutex_acquire(); exits only on success.
inline void eu_mutex_lock(void) { snrt_mutex_acquire(&eu_p->workers_mutex); }
// Releases the event unit mutex (eu_p->workers_mutex) via
// snrt_mutex_release().
inline void eu_mutex_release(void) { snrt_mutex_release(&eu_p->workers_mutex); }
// Getter: returns the workers_in_loop counter, which eu_event_loop()
// increments once per hart on entry. Read with a relaxed atomic load.
inline uint32_t eu_get_workers_in_loop(void) {
    return __atomic_load_n(&eu_p->workers_in_loop, __ATOMIC_RELAXED);
}
// Getter: returns the workers_wfi counter, which worker_wfi() increments
// before parking and decrements after wake-up. Read with a relaxed atomic
// load.
inline uint32_t eu_get_workers_in_wfi(void) {
    return __atomic_load_n(&eu_p->workers_wfi, __ATOMIC_RELAXED);
}
// Initialize the event unit. Cluster core 0 allocates and zeroes the eu_t
// struct and publishes its address in eu_p_global; every other core spins
// until that address is visible and then adopts it as its own eu_p.
inline void eu_init(void) {
    if (snrt_cluster_core_idx() != 0) {
        // Not the allocating core: wait for core 0 to publish the pointer
        while (!eu_p_global)
            ;
        eu_p = eu_p_global;
        return;
    }
    // Allocate the eu struct in L1 for fast access
    eu_p = snrt_l1alloc(sizeof(eu_t));
    snrt_memset((void *)eu_p, 0, sizeof(eu_t));
    // store copy of eu_p on shared memory
    eu_p_global = eu_p;
}
// Send all workers in the event loop to exit.
//
// core_idx: cluster-local core index of the caller, forwarded to
//           eu_run_empty() when a queued event still has to be drained.
inline void eu_exit(uint32_t core_idx) {
    // make sure queue is empty: a non-zero nthreads means an event is still
    // pending, so run it to completion first (eu_run_empty() itself returns
    // immediately when nthreads is zero)
    if (eu_p->e.nthreads) eu_run_empty(core_idx);
    // set exit flag and wake cores; workers check exit_flag at the top of
    // their loop after waking up
    wait_worker_wfi();
    eu_p->exit_flag = 1;
    wake_workers();
}
// Worker entry point of the event unit. Registers the calling hart in
// workers_in_loop, enables its wake-up interrupt and then alternates between
// running the published event and parking in worker_wfi(). Returns only once
// exit_flag is observed.
//
// cluster_core_idx: cluster-local core index; the hart executes the event
//                   only when this index is below e.nthreads.
inline void eu_event_loop(uint32_t cluster_core_idx) {
    // count number of workers in loop
    __atomic_add_fetch(&eu_p->workers_in_loop, 1, __ATOMIC_RELAXED);
    // enable software interrupts
#ifdef EU_USE_GLOBAL_CLINT
    snrt_interrupt_enable(IRQ_M_SOFT);
#else
    snrt_interrupt_enable(IRQ_M_CLUSTER);
#endif
    EU_PRINTF(0, "#%d entered event loop\n", cluster_core_idx);
    while (1) {
        // check for exit
        if (eu_p->exit_flag) {
#ifdef EU_USE_GLOBAL_CLINT
            snrt_interrupt_disable(IRQ_M_SOFT);
#else
            // TODO colluca: should this be "disable"?
            snrt_interrupt_enable(IRQ_M_CLUSTER);
#endif
            return;
        }
        if (cluster_core_idx < eu_p->e.nthreads) {
            EU_PRINTF(0, "run fn @ %#x (arg 0 = %#x)\n", eu_p->e.fn,
                      ((uint32_t *)eu_p->e.data)[0]);
            // call
            eu_p->e.fn(eu_p->e.data, eu_p->e.argc);
        }
        // Report completion, then enter wait for interrupt. Every worker in
        // the loop bumps fini_count, whether or not it ran the event;
        // eu_run_empty() waits for fini_count to reach workers_in_loop.
        __atomic_add_fetch(&eu_p->e.fini_count, 1, __ATOMIC_RELAXED);
        worker_wfi(cluster_core_idx);
    }
}
inline int eu_dispatch_push(void (*fn)(void *, uint32_t), uint32_t argc,
                            void *data, uint32_t nthreads) {
    // wait for workers to be in wfi before manipulating the event struct
    wait_worker_wfi();
    // fill queue
    eu_p->e.fn = fn;
    eu_p->e.data = data;
    eu_p->e.argc = argc;
    eu_p->e.nthreads = nthreads;
    EU_PRINTF(10, "eu_dispatch_push success, workers %d in loop %d\n", nthreads,
              eu_p->workers_in_loop);
    return 0;
}
// Run the queued event to completion and wait for all workers to idle.
// Returns immediately when no event is queued (e.nthreads == 0).
//
// core_idx: cluster-local core index of the caller; the caller executes the
//           event itself when this index is below e.nthreads.
inline void eu_run_empty(uint32_t core_idx) {
    const unsigned team_size = eu_p->e.nthreads;
    if (!team_size) return;
    EU_PRINTF(10, "eu_run_empty enter: q size %d\n", eu_p->e.nthreads);
    eu_p->e.fini_count = 0;
    // Workers are only woken (and later waited for) when more than one thread
    // was requested
    const int use_workers = team_size > 1;
    if (use_workers) wake_workers();
    // Am i also part of the team?
    if (core_idx < eu_p->e.nthreads) {
        // call
        EU_PRINTF(0, "run fn @ %#x (arg 0 = %#x)\n", eu_p->e.fn,
                  ((uint32_t *)eu_p->e.data)[0]);
        eu_p->e.fn(eu_p->e.data, eu_p->e.argc);
    }
    // wait for queue to be empty: every worker in the event loop increments
    // fini_count once per wake-up, so wait until the count matches
    if (use_workers) {
        const unsigned in_loop = eu_get_workers_in_loop();
        while (__atomic_load_n(&eu_p->e.fini_count, __ATOMIC_RELAXED) !=
               in_loop)
            ;
    }
    // stop workers from re-executing the task
    eu_p->e.nthreads = 0;
    EU_PRINTF(10, "eu_run_empty exit\n");
}
#endif /* EU_H */
Updated on 2023-06-19 at 09:43:56 +0000