15#include "../../deps/riscv-opcodes/encoding.h"
35 snrt_comm_world = &snrt_comm_world_info;
46 void *barrier_ptr = snrt_l1_alloc_cluster_local(
sizeof(uint32_t));
47 barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), 0);
48 if (snrt_global_core_idx() == 0) {
49 *(uint32_t *)barrier_ptr = 0;
54 snrt_l1_update_next(snrt_l1_next_aligned_hyperbank());
56 snrt_global_sw_barrier();
61 snrt_comm_world->barrier_ptr = (uint32_t *)barrier_ptr;
86 void *barrier_ptr = snrt_l1_alloc_cluster_local(
sizeof(uint32_t));
87 barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), 0);
88 if (snrt_global_core_idx() == 0) *(uint32_t *)barrier_ptr = 0;
93 (*communicator)->size = size;
94 (*communicator)->base = 0;
95 (*communicator)->mask = size - 1;
96 (*communicator)->barrier_ptr = (uint32_t *)barrier_ptr;
97 (*communicator)->is_participant = snrt_cluster_idx() < size;
107inline volatile uint32_t *
snrt_mutex() {
return &_snrt_mutex; }
121 " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
122 " bnez t0,1b # Retry if previously set)\n"
138 " li t0,1 # t0 = 1\n"
140 " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
141 " bnez t0,2b # Retry if previously set)\n"
151 asm volatile(
"amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n"
169 if (comm == NULL) comm = snrt_comm_world;
171#ifdef SNRT_SUPPORTS_NARROW_MULTICAST
173 if (snrt_cluster_num() > 0) {
174 volatile snitch_cluster_t *cluster = snrt_cluster(0);
175#pragma clang diagnostic push
176#pragma clang diagnostic ignored "-Waddress-of-packed-member"
177 uint32_t *addr = (uint32_t *)&(cluster->peripheral_reg.cl_clint_set.w);
178#pragma clang diagnostic pop
179 uint32_t mcast_mask = snrt_get_collective_mask(comm);
188 uint32_t mask = comm->mask;
189 uint32_t fixed = comm->base & ~mask;
190 uint32_t submask = 0;
192 uint32_t i = fixed | submask;
193 if (snrt_cluster_idx() != i) snrt_int_cluster_set(core_mask, i);
194 submask = (submask - 1) & mask;
195 }
while (submask != 0);
205 asm volatile(
"csrr x0, barrier" :::
"memory");
208static inline void snrt_inter_cluster_sw_barrier(
snrt_comm_t comm = NULL) {
210 if (comm == NULL) comm = snrt_comm_world;
213 if (!comm->is_participant)
return;
216 uint32_t cnt = __atomic_add_fetch(comm->barrier_ptr, 1, __ATOMIC_RELAXED);
221 if (cnt == comm->size) {
222 *(comm->barrier_ptr) = 0;
243 if (comm == NULL) comm = snrt_comm_world;
246 if (!comm->is_participant)
return;
248#ifdef SNRT_SUPPORTS_NARROW_REDUCTION
250 volatile uint32_t *addr = comm->barrier_ptr;
253 uint64_t mask = snrt_get_collective_mask(comm);
263 snrt_inter_cluster_sw_barrier(comm);
267inline void snrt_global_sw_barrier(
snrt_comm_t comm) {
272 if (snrt_is_dm_core()) {
273 snrt_inter_cluster_sw_barrier(comm);
295 if (snrt_is_dm_core()) {
312 uint32_t prev_it = barr->iteration;
313 uint32_t cnt = __atomic_add_fetch(&barr->cnt, 1, __ATOMIC_RELAXED);
318 __atomic_add_fetch(&barr->iteration, 1, __ATOMIC_RELAXED);
321 while (prev_it == barr->iteration)
342 uint32_t *cluster_result = &(snrt_cls()->reduction);
343 uint32_t tmp = __atomic_fetch_add(cluster_result, value, __ATOMIC_RELAXED);
350 if (snrt_is_dm_core()) {
351 __atomic_add_fetch(&_reduction_result, *cluster_result,
354 *cluster_result = _reduction_result;
357 return *cluster_result;
383 if (comm == NULL) comm = snrt_comm_world;
386 if (comm->size > 1) {
393 int num_levels = ceil(log2(comm->size));
394 for (
unsigned int level = 0; level < num_levels; level++) {
399 uint32_t is_active = (snrt_cluster_idx() % (1 << level)) == 0;
400 uint32_t is_sender = (snrt_cluster_idx() % (1 << (level + 1))) != 0;
404 if (is_active && is_sender) {
405 if (!snrt_is_compute_core()) {
406 uint64_t dst = (uint64_t)dst_buffer -
407 (1 << level) * SNRT_CLUSTER_OFFSET;
418 if (is_active && !is_sender) {
420 if (snrt_is_compute_core()) {
421 uint32_t items_per_core =
422 len / snrt_cluster_compute_core_num();
423 uint32_t core_offset =
424 snrt_cluster_core_idx() * items_per_core;
425 for (uint32_t i = 0; i < items_per_core; i++) {
426 uint32_t abs_i = core_offset + i;
427 src_buffer[abs_i] += dst_buffer[abs_i];
450 asm volatile(
"mv %0, %0" :
"+r"(val)::);
465 write_csr(user_low, (uint32_t)(field));
466 write_csr(user_high, (uint32_t)(field >> 32));
469inline void snrt_set_awuser_low(uint32_t field) {
470 write_csr(user_low, (uint32_t)(field));
473inline uint64_t snrt_get_collective_mask(
snrt_comm_t comm) {
474 return comm->mask * SNRT_CLUSTER_OFFSET;
490 op.f.opcode = SNRT_COLLECTIVE_MULTICAST;
512 snrt_collective_opcode_t opcode) {
514 op.f.opcode = opcode;
static uint32_t snrt_dma_start_1d(uint64_t dst, uint64_t src, size_t size, const uint32_t channel=0)
Start an asynchronous 1D DMA transfer with 64-bit wide pointers on a specific DMA channel.
Definition dma.h:35
static void snrt_dma_wait_all(const uint32_t channel=0)
Block until a specific DMA channel is idle.
Definition dma.h:364
Definition sync_decls.h:9
Definition sync_decls.h:14
void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n)
Generic software barrier.
Definition sync.h:310
void snrt_set_awuser(uint64_t field)
Enable LSU AW user field.
Definition sync.h:464
void snrt_comm_create(uint32_t size, snrt_comm_t *communicator)
Creates a communicator object.
Definition sync.h:74
void snrt_wake_clusters(uint32_t core_mask, snrt_comm_t comm=NULL)
Wake the clusters belonging to a given communicator. Can only be called by a single core in the whole system.
Definition sync.h:167
void snrt_enable_reduction(uint64_t mask, snrt_collective_opcode_t opcode)
Enable LSU reduction.
Definition sync.h:511
void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx)
Acquire a mutex using a test-and-test-and-set (TTAS) spin loop, blocking.
Definition sync.h:133
void snrt_mutex_acquire(volatile uint32_t *pmtx)
Acquire a mutex, blocking.
Definition sync.h:117
void snrt_comm_init()
Initialize the world communicator.
Definition sync.h:32
volatile uint32_t * snrt_mutex()
Get a pointer to a mutex variable.
Definition sync.h:107
void snrt_wait_writeback(uint32_t val)
Ensure value is written back to the register file.
Definition sync.h:449
static void snrt_inter_cluster_barrier(snrt_comm_t comm=NULL)
Synchronize one core from every cluster with the others.
Definition sync.h:241
void snrt_global_reduction_dma(T *dst_buffer, T *src_buffer, size_t len, snrt_comm_t comm=NULL)
Perform a sum reduction among clusters, blocking.
Definition sync.h:380
void snrt_global_barrier(snrt_comm_t comm)
Synchronize all Snitch cores.
Definition sync.h:290
uint32_t snrt_global_all_to_all_reduction(uint32_t value)
Perform a global sum reduction, blocking.
Definition sync.h:340
void snrt_disable_reduction()
Disable LSU reduction.
Definition sync.h:522
void snrt_cluster_hw_barrier()
Synchronize cores in a cluster with a hardware barrier, blocking.
Definition sync.h:204
void snrt_mutex_release(volatile uint32_t *pmtx)
Release a previously-acquired mutex.
Definition sync.h:150
void snrt_disable_multicast()
Disable LSU multicast.
Definition sync.h:498
void snrt_enable_multicast(uint64_t mask)
Enable LSU multicast.
Definition sync.h:488
Definition sync_decls.h:40