snRuntime/src/omp/eu.h

Functions

	Name
void	wait_worker_wfi(void )
void	wake_workers(void ) When using the CLINT as wakeup.
void	worker_wfi(uint32_t cluster_core_idx)
void	eu_print_status() Debugging info to printf.
void	eu_mutex_lock() Acquires the event unit mutex, exits only on success.
void	eu_mutex_release() Releases the acquired mutex.
uint32_t	eu_get_workers_in_loop()
uint32_t	eu_get_workers_in_wfi()
void	eu_init(void ) Initialize the event unit.
void	eu_exit(uint32_t core_idx) send all workers in loop to exit()
void	eu_event_loop(uint32_t cluster_core_idx) Enter the event unit loop, never exits.
int	eu_dispatch_push(void(*)(void *, uint32_t) fn, uint32_t argc, void * data, uint32_t nthreads) Set function to execute by `nthreads` number of threads.
void	eu_run_empty(uint32_t core_idx) wait for all workers to idle

Attributes

	Name
__thread volatile eu_t *	eu_p Pointer to the event unit struct only initialized after call to eu_init for main thread and call to eu_event_loop for worker threads.
volatile eu_t *volatile	eu_p_global Pointer to where the DM struct in TCDM is located.

Defines

	Name
	EU_PRINTF(d, ...) Define EU_USE_GLOBAL_CLINT to use the cluster-shared CLINT based SW interrupt system for synchronization. If not defined, the harts use the cluster-local CLINT to synchronize, which is faster but only works for cluster-local synchronization. This is sufficient at the moment since the OpenMP runtime is single cluster only.

Functions Documentation

function wait_worker_wfi

inline void wait_worker_wfi(
    void 
)

function wake_workers

inline void wake_workers(
    void 
)

When using the CLINT as wakeup.

function worker_wfi

inline void worker_wfi(
    uint32_t cluster_core_idx
)

function eu_print_status

inline void eu_print_status()

Debugging info to printf.

function eu_mutex_lock

inline void eu_mutex_lock()

Acquires the event unit mutex, exits only on success.

function eu_mutex_release

inline void eu_mutex_release()

Releases the acquired mutex.

function eu_get_workers_in_loop

inline uint32_t eu_get_workers_in_loop()

Getters

function eu_get_workers_in_wfi

inline uint32_t eu_get_workers_in_wfi()

function eu_init

inline void eu_init(
    void 
)

Initialize the event unit.

function eu_exit

inline void eu_exit(
    uint32_t core_idx
)

send all workers in loop to exit()

Parameters:

core_idx cluster-local core index

function eu_event_loop

inline void eu_event_loop(
    uint32_t cluster_core_idx
)

Enter the event unit loop, never exits.

Parameters:

cluster_core_idx cluster-local core index

function eu_dispatch_push

inline int eu_dispatch_push(
    void(*)(void *, uint32_t) fn,
    uint32_t argc,
    void * data,
    uint32_t nthreads
)

Set function to execute by nthreads number of threads.

Parameters:

fn pointer to worker function to be executed
data pointer to function arguments
argc number of elements in data
nthreads number of threads that have to execute this event

function eu_run_empty

inline void eu_run_empty(
    uint32_t core_idx
)

wait for all workers to idle

Parameters:

core_idx cluster-local core index

Attributes Documentation

variable eu_p

__thread volatile eu_t * eu_p;

Pointer to the event unit struct only initialized after call to eu_init for main thread and call to eu_event_loop for worker threads.

variable eu_p_global

volatile eu_t *volatile eu_p_global;

Pointer to where the DM struct in TCDM is located.

Macros Documentation

define EU_PRINTF

#define EU_PRINTF(
    d,
    ...
)

Define EU_USE_GLOBAL_CLINT to use the cluster-shared CLINT based SW interrupt system for synchronization. If not defined, the harts use the cluster-local CLINT to synchronize, which is faster but only works for cluster-local synchronization. This is sufficient at the moment since the OpenMP runtime is single cluster only.

Source code

// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#ifndef EU_H
#define EU_H

#include "eu_decls.h"

//================================================================================
// Settings
//================================================================================
// #define EU_USE_GLOBAL_CLINT

//================================================================================
// Debug
//================================================================================

#ifdef EU_DEBUG_LEVEL
#include "printf.h"
// Print a message prefixed with "[eu] ". The first variadic argument must be
// a string literal, since it is concatenated with the prefix.
// Wrapped in do { } while (0) so the expansion is a single statement and is
// safe inside an unbraced if/else.
#define _EU_PRINTF(...)              \
    do {                             \
        printf("[eu] " __VA_ARGS__); \
    } while (0)
// Print a debug message if the compile-time EU_DEBUG_LEVEL is at least `d`.
// Without EU_DEBUG_LEVEL the macro expands to nothing, so its arguments are
// not evaluated.
#define EU_PRINTF(d, ...)            \
    do {                             \
        if (EU_DEBUG_LEVEL >= (d)) { \
            _EU_PRINTF(__VA_ARGS__); \
        }                            \
    } while (0)
#else
#define EU_PRINTF(d, ...)
#endif

//================================================================================
// Data
//================================================================================

// Per-thread pointer to the event unit struct. Assigned in eu_init(): cluster
// core 0 allocates the struct, every other core copies eu_p_global.
extern __thread volatile eu_t *eu_p;

// Shared copy of the event unit pointer. Written by cluster core 0 in
// eu_init(); the other cores spin in eu_init() until it is non-NULL.
extern volatile eu_t *volatile eu_p_global;

//================================================================================
// Functions
//================================================================================

// Busy-wait until the number of workers counted in `workers_wfi` equals the
// number of workers registered in the event loop. `workers_in_loop` is
// sampled once, before polling starts.
inline void wait_worker_wfi(void) {
    const uint32_t expected = eu_p->workers_in_loop;
    uint32_t parked;
    do {
        parked = __atomic_load_n(&eu_p->workers_wfi, __ATOMIC_RELAXED);
    } while (parked != expected);
}

#ifdef EU_USE_GLOBAL_CLINT

// Wake the worker harts of this cluster through the cluster-shared CLINT
// (EU_USE_GLOBAL_CLINT variant). The main thread (cluster core 0) is never
// woken. Each CLINT register covers 32 harts: register index is hartid / 32,
// bit position is hartid % 32.
inline void wake_workers(void) {
#ifdef OMPSTATIC_NUMTHREADS
// Bits 1..OMPSTATIC_NUMTHREADS-1 set; unsigned so that the shifts below are
// well defined. Assumes OMPSTATIC_NUMTHREADS < 32.
#define WAKE_MASK (((1u << OMPSTATIC_NUMTHREADS) - 1u) & ~0x1u)
    // Fast wake-up for static number of worker threads
    uint32_t basehart = snrt_cluster_core_base_hartid();
    uint32_t offset = basehart % 32;
    snrt_int_clint_set(basehart / 32, WAKE_MASK << offset);
    if (offset + OMPSTATIC_NUMTHREADS > 32) {
        // wake-up is split over two CLINT registers; offset is non-zero here
        // so the right shift is by less than 32
        snrt_int_clint_set(basehart / 32 + 1, WAKE_MASK >> (32 - offset));
    }
#else

    // wake all worker cores except the main thread
    uint32_t numcores = snrt_cluster_compute_core_num(),
             basehart = snrt_cluster_core_base_hartid();
    uint32_t mask = 0, hart = 1;
    for (; hart < numcores; ++hart) {
        // Bit position inside the current register. Reducing modulo 32 keeps
        // the shift amount below the type width when basehart >= 32.
        mask |= 1u << ((basehart + hart) % 32);
        if ((basehart + hart + 1) % 32 == 0) {
            // last hart of this register: flush and start the next one
            snrt_int_clint_set((basehart + hart) / 32, mask);
            mask = 0;
        }
    }
    // Flush the remaining bits. A non-zero mask means the last hart did not
    // end a register, so (basehart + hart) is still in the same register.
    if (mask) snrt_int_clint_set((basehart + hart) / 32, mask);
#endif
}

// Park the calling worker until a software interrupt arrives
// (EU_USE_GLOBAL_CLINT variant). `workers_wfi` is incremented before and
// decremented after the wait. `cluster_core_idx` is not used in this variant.
inline void worker_wfi(uint32_t cluster_core_idx) {
    // announce that this worker is parked
    __atomic_fetch_add(&eu_p->workers_wfi, 1, __ATOMIC_RELAXED);

    snrt_int_sw_poll();

    // this worker is running again
    __atomic_fetch_sub(&eu_p->workers_wfi, 1, __ATOMIC_RELAXED);
}

#else  // #ifdef EU_USE_GLOBAL_CLINT

// Wake the worker harts of this cluster through the cluster-local interrupt
// (variant used when EU_USE_GLOBAL_CLINT is not defined).
inline void wake_workers(void) {
    // Guard to wake only if all workers are wfi
    wait_worker_wfi();
    // Wake the cluster cores. We do this with cluster relative hart IDs and do
    // not wake hart 0 since this is the main thread
    uint32_t numcores = snrt_cluster_compute_core_num();
    // Build the mask in unsigned arithmetic. A shift by the full type width is
    // undefined, so 32 or more cores select all bits explicitly.
    uint32_t all_cores =
        (numcores >= 32) ? ~(uint32_t)0 : (((uint32_t)1 << numcores) - 1);
    snrt_int_cluster_set(all_cores & ~(uint32_t)0x1);
}
// Park the calling worker in wfi (cluster-local variant). `workers_wfi` is
// incremented before the wait and decremented after the worker has cleared
// its own bit (1 << cluster_core_idx) of the cluster interrupt.
inline void worker_wfi(uint32_t cluster_core_idx) {
    // announce that this worker is parked
    __atomic_fetch_add(&eu_p->workers_wfi, 1, __ATOMIC_RELAXED);

    snrt_wfi();

    // clear our own interrupt bit before reporting as running again
    snrt_int_cluster_clr(1 << cluster_core_idx);
    __atomic_fetch_sub(&eu_p->workers_wfi, 1, __ATOMIC_RELAXED);
}

#endif  // #ifdef EU_USE_GLOBAL_CLINT

// Debugging info to printf: reports `workers_in_loop` through EU_PRINTF at
// level 0. When EU_DEBUG_LEVEL is not defined EU_PRINTF expands to nothing,
// so this function has an empty body and does not read the struct.
inline void eu_print_status() {
    EU_PRINTF(0, "workers_in_loop=%d\n", eu_p->workers_in_loop);
}

// Acquire the event unit mutex (`workers_mutex`) via snrt_mutex_acquire().
inline void eu_mutex_lock() {
    snrt_mutex_acquire(&eu_p->workers_mutex);
}

// Release the event unit mutex (`workers_mutex`) via snrt_mutex_release().
inline void eu_mutex_release() {
    snrt_mutex_release(&eu_p->workers_mutex);
}

// Return the current value of `workers_in_loop` (relaxed atomic load), i.e.
// the counter incremented by each core entering eu_event_loop().
inline uint32_t eu_get_workers_in_loop() {
    uint32_t in_loop =
        __atomic_load_n(&eu_p->workers_in_loop, __ATOMIC_RELAXED);
    return in_loop;
}
// Return the current value of `workers_wfi` (relaxed atomic load), i.e. the
// counter maintained by worker_wfi() around each wait.
inline uint32_t eu_get_workers_in_wfi() {
    uint32_t in_wfi = __atomic_load_n(&eu_p->workers_wfi, __ATOMIC_RELAXED);
    return in_wfi;
}

// Initialize the event unit. Cluster core 0 allocates and zeroes the struct
// and publishes its address in eu_p_global; every other core spins until the
// address is published and then adopts it as its own eu_p.
inline void eu_init(void) {
    if (snrt_cluster_core_idx() != 0) {
        // wait for core 0 to publish the struct
        while (!eu_p_global) {
        }
        eu_p = eu_p_global;
        return;
    }

    // Core 0: allocate the eu struct in L1 for fast access
    eu_p = snrt_l1alloc(sizeof(eu_t));
    snrt_memset((void *)eu_p, 0, sizeof(eu_t));
    // publish only after the struct has been cleared
    eu_p_global = eu_p;
}

// Send all workers in the event loop to exit.
//
// core_idx: cluster-local core index of the caller; forwarded to
//           eu_run_empty() when an event is still pending.
inline void eu_exit(uint32_t core_idx) {
    // make sure queue is empty: a non-zero nthreads means an event was pushed
    // but not yet run, so run it first. eu_run_empty() returns at once when
    // nthreads is zero, so it is only called when there is work to flush.
    if (eu_p->e.nthreads) eu_run_empty(core_idx);
    // set exit flag and wake cores; the flag is only written once every
    // worker is parked, so each one sees it when it resumes its loop
    wait_worker_wfi();
    eu_p->exit_flag = 1;
    wake_workers();
}

// Worker-side event loop. Registers the caller in `workers_in_loop`, enables
// the wake-up interrupt and then repeatedly: checks the exit flag, runs the
// current event if this core is part of the team, reports completion through
// `e.fini_count` and parks in worker_wfi(). Returns only once `exit_flag` is
// set.
//
// cluster_core_idx: cluster-local core index of the calling worker.
inline void eu_event_loop(uint32_t cluster_core_idx) {
    // NOTE(review): `scratch` is never used and `nthds` is written but never
    // read in this function.
    uint32_t scratch;
    uint32_t nthds;

    // count number of workers in loop
    __atomic_add_fetch(&eu_p->workers_in_loop, 1, __ATOMIC_RELAXED);

    // enable software interrupts
#ifdef EU_USE_GLOBAL_CLINT
    snrt_interrupt_enable(IRQ_M_SOFT);
#else
    snrt_interrupt_enable(IRQ_M_CLUSTER);
#endif

    EU_PRINTF(0, "#%d entered event loop\n", cluster_core_idx);

    while (1) {
        // check for exit
        if (eu_p->exit_flag) {
#ifdef EU_USE_GLOBAL_CLINT
            snrt_interrupt_disable(IRQ_M_SOFT);
#else
            // TODO colluca: should this be "disable"?
            // NOTE(review): the global-CLINT branch above disables its
            // interrupt on exit; this branch enables it again.
            snrt_interrupt_enable(IRQ_M_CLUSTER);
#endif
            return;
        }

        // Only cores with an index below the team size run the event
        if (cluster_core_idx < eu_p->e.nthreads) {
            // make a local copy of nthreads to sync after work since the master
            // hart will reset eu_p->e.nthreads as soon as all workers finished
            // which might cause a race condition
            nthds = eu_p->e.nthreads;
            EU_PRINTF(0, "run fn @ %#x (arg 0 = %#x)\n", eu_p->e.fn,
                      ((uint32_t *)eu_p->e.data)[0]);
            // call
            eu_p->e.fn(eu_p->e.data, eu_p->e.argc);
        }

        // enter wait for interrupt
        // fini_count is incremented by every worker in the loop, including
        // those that did not run the event; eu_run_empty() waits for it to
        // reach workers_in_loop.
        __atomic_add_fetch(&eu_p->e.fini_count, 1, __ATOMIC_RELAXED);
        worker_wfi(cluster_core_idx);
    }
}

inline int eu_dispatch_push(void (*fn)(void *, uint32_t), uint32_t argc,
                            void *data, uint32_t nthreads) {
    // wait for workers to be in wfi before manipulating the event struct
    wait_worker_wfi();

    // fill queue
    eu_p->e.fn = fn;
    eu_p->e.data = data;
    eu_p->e.argc = argc;
    eu_p->e.nthreads = nthreads;

    EU_PRINTF(10, "eu_dispatch_push success, workers %d in loop %d\n", nthreads,
              eu_p->workers_in_loop);

    return 0;
}

// Run the pending event, if any, and wait for all workers to idle.
//
// Does nothing when no event is pending (`e.nthreads` is zero). Otherwise it
// wakes the workers when the team has more than one thread, runs the event
// on the calling core if it is part of the team, waits until every worker in
// the loop has reported completion, and finally clears `e.nthreads`.
//
// core_idx: cluster-local core index of the caller
inline void eu_run_empty(uint32_t core_idx) {
    unsigned team_size = eu_p->e.nthreads;
    if (team_size == 0) return;
    EU_PRINTF(10, "eu_run_empty enter: q size %d\n", eu_p->e.nthreads);

    const int has_workers = team_size > 1;

    // reset the completion counter before any worker can bump it
    eu_p->e.fini_count = 0;
    if (has_workers) wake_workers();

    // The caller takes part in the team if its index is below the team size
    if (core_idx < eu_p->e.nthreads) {
        EU_PRINTF(0, "run fn @ %#x (arg 0 = %#x)\n", eu_p->e.fn,
                  ((uint32_t *)eu_p->e.data)[0]);
        eu_p->e.fn(eu_p->e.data, eu_p->e.argc);
    }

    // Every worker in the loop increments fini_count once per wake-up, so
    // wait until the counter matches the number of workers in the loop
    if (has_workers) {
        unsigned expected = eu_get_workers_in_loop();
        while (__atomic_load_n(&eu_p->e.fini_count, __ATOMIC_RELAXED) !=
               expected) {
        }
    }

    // stop workers from re-executing the task
    eu_p->e.nthreads = 0;

    EU_PRINTF(10, "eu_run_empty exit\n");
}

#endif /* EU_H */

Updated on 2023-06-19 at 09:43:56 +0000