Snitch Runtime
Loading...
Searching...
No Matches
sync.h File Reference

This file provides functions to synchronize Snitch cores. More...

#include "../../deps/riscv-opcodes/encoding.h"
#include <math.h>

Go to the source code of this file.

Functions

void snrt_comm_init ()
 Initialize the world communicator.
 
void snrt_comm_create (uint32_t size, snrt_comm_t *communicator)
 Creates a communicator object.
 
volatile uint32_t * snrt_mutex ()
 Get a pointer to a mutex variable.
 
void snrt_mutex_acquire (volatile uint32_t *pmtx)
 Acquire a mutex, blocking.
 
void snrt_mutex_ttas_acquire (volatile uint32_t *pmtx)
 Acquire a mutex, blocking.
 
void snrt_mutex_release (volatile uint32_t *pmtx)
 Release a previously-acquired mutex.
 
void snrt_wake_clusters (uint32_t core_mask, snrt_comm_t comm=NULL)
 Wake the clusters belonging to a given communicator. Can only be called by a single core in the whole system!
 
void snrt_cluster_hw_barrier ()
 Synchronize cores in a cluster with a hardware barrier, blocking.
 
static void snrt_inter_cluster_sw_barrier (snrt_comm_t comm=NULL)
 
static void snrt_inter_cluster_barrier (snrt_comm_t comm=NULL)
 Synchronize one core from every cluster with the others.
 
void snrt_global_sw_barrier (snrt_comm_t comm)
 
void snrt_global_barrier (snrt_comm_t comm)
 Synchronize all Snitch cores.
 
void snrt_partial_barrier (snrt_barrier_t *barr, uint32_t n)
 Generic software barrier.
 
uint32_t snrt_global_all_to_all_reduction (uint32_t value)
 Perform a global sum reduction, blocking.
 
template<typename T >
void snrt_global_reduction_dma (T *dst_buffer, T *src_buffer, size_t len, snrt_comm_t comm=NULL)
 Perform a sum reduction among clusters, blocking.
 
void snrt_wait_writeback (uint32_t val)
 Ensure value is written back to the register file.
 
void snrt_set_awuser (uint64_t field)
 Enable LSU AW user field.
 
void snrt_set_awuser_low (uint32_t field)
 
uint64_t snrt_get_collective_mask (snrt_comm_t comm)
 
void snrt_enable_multicast (uint64_t mask)
 Enable LSU multicast.
 
void snrt_disable_multicast ()
 Disable LSU multicast.
 
void snrt_enable_reduction (uint64_t mask, snrt_collective_opcode_t opcode)
 Enable LSU reduction.
 
void snrt_disable_reduction ()
 Disable LSU reduction.
 

Variables

__thread snrt_comm_info_t snrt_comm_world_info
 
__thread snrt_comm_t snrt_comm_world
 

Detailed Description

This file provides functions to synchronize Snitch cores.

Function Documentation

◆ snrt_cluster_hw_barrier()

void snrt_cluster_hw_barrier ( )
inline

Synchronize cores in a cluster with a hardware barrier, blocking.

Note
Synchronizes all (both DM and compute) cores. All cores must invoke this function, or the calling cores will stall indefinitely.
204 {
205 asm volatile("csrr x0, barrier" ::: "memory");
206}

◆ snrt_comm_create()

void snrt_comm_create ( uint32_t size,
snrt_comm_t * communicator )
inline

Creates a communicator object.

The newly created communicator object includes the first `size` clusters. All clusters, even those which are not part of the communicator, must invoke this function.

Parameters
size — The number of clusters to include in the communicator.
communicator — Pointer to the communicator object to be created.
74 {
75 // Allocate communicator struct in L1 and point to it.
76 *communicator =
77 (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t));
78
79 // Allocate barrier counter in L1. This allows us to perform global
80 // hardware barriers, as reductions are currently not supported in L3.
81 // All clusters allocate a barrier counter because we want to keep all
82 // clusters' L1 allocators aligned, but only the zero-th cluster's is
83 // actually used. So all clusters allocate one, but only the zero-th
84 // cluster's is initialized. A global barrier is then used to ensure
85 // all cores "see" the initialized value.
86 void *barrier_ptr = snrt_l1_alloc_cluster_local(sizeof(uint32_t));
87 barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), 0);
88 if (snrt_global_core_idx() == 0) *(uint32_t *)barrier_ptr = 0;
90
 91 // Initialize communicator, pointing to the newly-allocated barrier
 92 // counter in L1.
93 (*communicator)->size = size;
94 (*communicator)->base = 0;
95 (*communicator)->mask = size - 1;
96 (*communicator)->barrier_ptr = (uint32_t *)barrier_ptr;
97 (*communicator)->is_participant = snrt_cluster_idx() < size;
98}
Definition sync_decls.h:14
void snrt_global_barrier(snrt_comm_t comm)
Synchronize all Snitch cores.
Definition sync.h:290

◆ snrt_comm_init()

void snrt_comm_init ( )
inline

Initialize the world communicator.

Note
This function should be called before using any of the inter-cluster synchronization functions.
32 {
33 // Point to default-initialized communicator struct, with barrier pointer
34 // in L3.
35 snrt_comm_world = &snrt_comm_world_info;
36
37 // Allocate barrier counter in L1. This allows us to perform global
38 // hardware barriers, as reductions are currently not supported in L3.
39 // All clusters allocate a barrier counter because we want to keep all
40 // clusters' L1 allocators aligned, but only the zero-th cluster's is
41 // actually used. So all clusters allocate one, but only the zero-th
42 // cluster's is initialized. A global barrier is then used to ensure
43 // all cores "see" the initialized value. This global barrier uses the
44 // default-initialized barrier pointer in L3. It must thus be a software
45 // barrier, as we currently do not support hardware reductions in L3.
46 void *barrier_ptr = snrt_l1_alloc_cluster_local(sizeof(uint32_t));
47 barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), 0);
48 if (snrt_global_core_idx() == 0) {
49 *(uint32_t *)barrier_ptr = 0;
50 // TODO(colluca): this is a workaround that shouldn't be necessary.
51 // It seems some tests expect the next pointer at the start of the
52 // user application to be aligned to the hyperbank.
53 // > Should we get rid of the alloc_v1 API altogether and fix these?
54 snrt_l1_update_next(snrt_l1_next_aligned_hyperbank());
55 }
56 snrt_global_sw_barrier();
57
58 // Update the communicator struct, pointing to the barrier pointer in L1.
59 // This whole workaround is required because we cannot statically allocate
60 // variables in L1.
61 snrt_comm_world->barrier_ptr = (uint32_t *)barrier_ptr;
62}

◆ snrt_disable_multicast()

void snrt_disable_multicast ( )
inline

Disable LSU multicast.

498{ snrt_set_awuser(0); }
void snrt_set_awuser(uint64_t field)
Enable LSU AW user field.
Definition sync.h:464

◆ snrt_disable_reduction()

void snrt_disable_reduction ( )
inline

Disable LSU reduction.

522{ snrt_set_awuser(0); }

◆ snrt_enable_multicast()

void snrt_enable_multicast ( uint64_t mask)
inline

Enable LSU multicast.

All stores performed after this call will be multicast to all addresses specified by the address and mask pair.

Parameters
mask — Multicast mask value.
488 {
490 op.f.opcode = SNRT_COLLECTIVE_MULTICAST;
491 op.f.mask = mask;
492 snrt_set_awuser(op.w);
493}
Definition sync_decls.h:40

◆ snrt_enable_reduction()

void snrt_enable_reduction ( uint64_t mask,
snrt_collective_opcode_t opcode )
inline

Enable LSU reduction.

All stores performed after this call will be reductions

Parameters
mask — Mask defining all involved members.
opcode — Type of reduction operation.
512 {
514 op.f.opcode = opcode;
515 op.f.mask = mask;
516 snrt_set_awuser(op.w);
517}

◆ snrt_get_collective_mask()

uint64_t snrt_get_collective_mask ( snrt_comm_t comm)
inline
473 {
474 return comm->mask * SNRT_CLUSTER_OFFSET;
475}

◆ snrt_global_all_to_all_reduction()

uint32_t snrt_global_all_to_all_reduction ( uint32_t value)
inline

Perform a global sum reduction, blocking.

All cores participate in the reduction and synchronize globally to wait for the reduction to complete. The synchronization is performed via snrt_global_barrier.

Parameters
value — The value to be summed.
Returns
The result of the sum reduction.
Note
Every Snitch core must invoke this function, or the calling cores will stall indefinitely.
340 {
341 // Reduce cores within cluster in TCDM
342 uint32_t *cluster_result = &(snrt_cls()->reduction);
343 uint32_t tmp = __atomic_fetch_add(cluster_result, value, __ATOMIC_RELAXED);
344
345 // Wait for writeback to ensure AMO is seen by all cores after barrier
348
349 // Reduce DM cores across clusters in global memory
350 if (snrt_is_dm_core()) {
351 __atomic_add_fetch(&_reduction_result, *cluster_result,
352 __ATOMIC_RELAXED);
354 *cluster_result = _reduction_result;
355 }
357 return *cluster_result;
358}
void snrt_wait_writeback(uint32_t val)
Ensure value is written back to the register file.
Definition sync.h:449
static void snrt_inter_cluster_barrier(snrt_comm_t comm=NULL)
Synchronize one core from every cluster with the others.
Definition sync.h:241
void snrt_cluster_hw_barrier()
Synchronize cores in a cluster with a hardware barrier, blocking.
Definition sync.h:204

◆ snrt_global_barrier()

void snrt_global_barrier ( snrt_comm_t comm)
inline

Synchronize all Snitch cores.

Synchronization is performed hierarchically. Within a cluster, cores are synchronized through a hardware barrier (see snrt_cluster_hw_barrier). Clusters are synchronized through a software barrier (see snrt_inter_cluster_barrier).

Parameters
comm — The communicator determining which clusters synchronize.
Note
Every Snitch core must invoke this function, or the calling cores will stall indefinitely.
290 {
291 // Synchronize cores in a cluster with the HW barrier
293
294 // Synchronize all clusters
295 if (snrt_is_dm_core()) {
297 }
298
299 // Synchronize cores in a cluster with the HW barrier
301}

◆ snrt_global_reduction_dma()

template<typename T >
void snrt_global_reduction_dma ( T * dst_buffer,
T * src_buffer,
size_t len,
snrt_comm_t comm = NULL )
inline

Perform a sum reduction among clusters, blocking.

The reduction is performed in a logarithmic fashion. Half of the clusters active in every level of the binary-tree participate as senders, the other half as receivers. Senders use the DMA to send their data to the respective receiver's destination buffer. The receiver then reduces each element in its destination buffer with the respective element in its source buffer. The result is stored in the source buffer. It then proceeds to the next level in the binary tree.

Parameters
dst_buffer — The pointer to the calling cluster's destination buffer.
src_buffer — The pointer to the calling cluster's source buffer.
len — The amount of data in each buffer. Only integer multiples of the number of compute cores are supported at the moment.
comm — The communicator determining which clusters participate in the reduction.
Note
The destination buffers must lie at the same offset in every cluster's TCDM.
381 {
382 // If no communicator is given, world communicator is used as default.
383 if (comm == NULL) comm = snrt_comm_world;
384
385 // If we have a single cluster, no reduction has to be done
386 if (comm->size > 1) {
387 // DMA core will send compute cores' data, so it must wait on it
388 // to be available
389 snrt_fpu_fence();
391
392 // Iterate levels in the binary reduction tree
393 int num_levels = ceil(log2(comm->size));
394 for (unsigned int level = 0; level < num_levels; level++) {
395 // Determine whether the current cluster is an active cluster.
396 // An active cluster is a cluster that participates in the current
397 // level of the reduction tree. Every second cluster among the
398 // active ones is a sender.
399 uint32_t is_active = (snrt_cluster_idx() % (1 << level)) == 0;
400 uint32_t is_sender = (snrt_cluster_idx() % (1 << (level + 1))) != 0;
401
402 // If the cluster is a sender, it sends the data in its source
403 // buffer to the respective receiver's destination buffer
404 if (is_active && is_sender) {
405 if (!snrt_is_compute_core()) {
406 uint64_t dst = (uint64_t)dst_buffer -
407 (1 << level) * SNRT_CLUSTER_OFFSET;
408 snrt_dma_start_1d(dst, (uint64_t)src_buffer,
409 len * sizeof(T));
411 }
412 }
413
414 // Synchronize senders and receivers
416
417 // Every cluster which is not a sender performs the reduction
418 if (is_active && !is_sender) {
419 // Computation is parallelized over the compute cores
420 if (snrt_is_compute_core()) {
421 uint32_t items_per_core =
422 len / snrt_cluster_compute_core_num();
423 uint32_t core_offset =
424 snrt_cluster_core_idx() * items_per_core;
425 for (uint32_t i = 0; i < items_per_core; i++) {
426 uint32_t abs_i = core_offset + i;
427 src_buffer[abs_i] += dst_buffer[abs_i];
428 }
429 }
430 }
431
432 // Synchronize compute and DM cores for next tree level
433 snrt_fpu_fence();
435 }
436 }
437}
static uint32_t snrt_dma_start_1d(uint64_t dst, uint64_t src, size_t size, const uint32_t channel=0)
Start an asynchronous 1D DMA transfer with 64-bit wide pointers on a specific DMA channel.
Definition dma.h:35
static void snrt_dma_wait_all(const uint32_t channel=0)
Block until a specific DMA channel is idle.
Definition dma.h:364

◆ snrt_global_sw_barrier()

void snrt_global_sw_barrier ( snrt_comm_t comm)
inline
267 {
268 // Synchronize cores in a cluster with the HW barrier
270
271 // Synchronize all clusters
272 if (snrt_is_dm_core()) {
273 snrt_inter_cluster_sw_barrier(comm);
274 }
275
276 // Synchronize cores in a cluster with the HW barrier
278}

◆ snrt_inter_cluster_barrier()

static void snrt_inter_cluster_barrier ( snrt_comm_t comm = NULL)
inlinestatic

Synchronize one core from every cluster with the others.

Parameters
comm — The communicator determining which clusters synchronize. Only used when not employing HW reduction.

Implemented as a software barrier.

Note
One core per cluster participating in the barrier must invoke this function (the same across all clusters), or the calling cores will stall indefinitely.
241 {
242 // If no communicator is given, world communicator is used as default.
243 if (comm == NULL) comm = snrt_comm_world;
244
245 // If the current cluster is not a participant, return immediately.
246 if (!comm->is_participant) return;
247
248#ifdef SNRT_SUPPORTS_NARROW_REDUCTION
249 // Fetch the address for the reduction
250 volatile uint32_t *addr = comm->barrier_ptr;
251
252 // Compose collective mask
253 uint64_t mask = snrt_get_collective_mask(comm);
254
255 // Launch the reduction
256 snrt_enable_reduction(mask, SNRT_REDUCTION_BARRIER);
257 *addr = 0;
259
260 // Fence to wait until the reduction is finished
261 snrt_fence();
262#else
263 snrt_inter_cluster_sw_barrier(comm);
264#endif
265}
void snrt_enable_reduction(uint64_t mask, snrt_collective_opcode_t opcode)
Enable LSU reduction.
Definition sync.h:511
void snrt_disable_reduction()
Disable LSU reduction.
Definition sync.h:522

◆ snrt_inter_cluster_sw_barrier()

static void snrt_inter_cluster_sw_barrier ( snrt_comm_t comm = NULL)
inlinestatic
208 {
209 // If no communicator is given, world communicator is used as default.
210 if (comm == NULL) comm = snrt_comm_world;
211
212 // If the current cluster is not a participant, return immediately.
213 if (!comm->is_participant) return;
214
215 // Clusters participating in the barrier increment a shared counter.
216 uint32_t cnt = __atomic_add_fetch(comm->barrier_ptr, 1, __ATOMIC_RELAXED);
217
218 // All but the last cluster arriving on the barrier enter WFI. The last
219 // cluster resets the counter for the next barrier (if any) and multicasts
220 // an interrupt to wake up the other clusters.
221 if (cnt == comm->size) {
222 *(comm->barrier_ptr) = 0;
223 snrt_fence();
224 snrt_wake_clusters(1 << snrt_cluster_core_idx(), comm);
225 } else {
226 snrt_wfi();
227 }
228 // Clear interrupt for next barrier (interrupt arrives also at sender)
229 snrt_int_clr_mcip();
230}
void snrt_wake_clusters(uint32_t core_mask, snrt_comm_t comm=NULL)
Wake the clusters belonging to a given communicator. Can only be called by a single core in the whole...
Definition sync.h:167

◆ snrt_mutex()

volatile uint32_t * snrt_mutex ( )
inline

Get a pointer to a mutex variable.

107{ return &_snrt_mutex; }

◆ snrt_mutex_acquire()

void snrt_mutex_acquire ( volatile uint32_t * pmtx)
inline

Acquire a mutex, blocking.

Test-and-set (TAS) implementation of a lock.

Parameters
pmtx — A pointer to a variable which can be used as a mutex, i.e. to which all cores have a reference and at a memory location to which atomic accesses can be made. This can be declared e.g. as static volatile uint32_t mtx = 0;.
117 {
118 asm volatile(
119 "li t0,1 # t0 = 1\n"
120 "1:\n"
121 " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
122 " bnez t0,1b # Retry if previously set)\n"
123 : "+r"(pmtx)
124 :
125 : "t0");
126}

◆ snrt_mutex_release()

void snrt_mutex_release ( volatile uint32_t * pmtx)
inline

Release a previously-acquired mutex.

150 {
151 asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n"
152 : "+r"(pmtx));
153}

◆ snrt_mutex_ttas_acquire()

void snrt_mutex_ttas_acquire ( volatile uint32_t * pmtx)
inline

Acquire a mutex, blocking.

Same as snrt_mutex_acquire but acquires the lock using a test and test-and-set (TTAS) strategy.

133 {
134 asm volatile(
135 "1:\n"
136 " lw t0, 0(%0)\n"
137 " bnez t0, 1b\n"
138 " li t0,1 # t0 = 1\n"
139 "2:\n"
140 " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
141 " bnez t0,2b # Retry if previously set)\n"
142 : "+r"(pmtx)
143 :
144 : "t0");
145}

◆ snrt_partial_barrier()

void snrt_partial_barrier ( snrt_barrier_t * barr,
uint32_t n )
inline

Generic software barrier.

Parameters
barr — Pointer to a barrier variable.
n — Number of harts that have to enter before released.
Note
Exactly the specified number of harts must invoke this function, or the calling cores will stall indefinitely.
310 {
311 // Remember previous iteration
312 uint32_t prev_it = barr->iteration;
313 uint32_t cnt = __atomic_add_fetch(&barr->cnt, 1, __ATOMIC_RELAXED);
314
315 // Increment the barrier counter
316 if (cnt == n) {
317 barr->cnt = 0;
318 __atomic_add_fetch(&barr->iteration, 1, __ATOMIC_RELAXED);
319 } else {
320 // Some threads have not reached the barrier --> Let's wait
321 while (prev_it == barr->iteration)
322 ;
323 }
324}

◆ snrt_set_awuser()

void snrt_set_awuser ( uint64_t field)
inline

Enable LSU AW user field.

All stores performed after this call are equipped with the given AW user field

Parameters
field — Defines the AW user field for the AXI transfer.
464 {
465 write_csr(user_low, (uint32_t)(field));
466 write_csr(user_high, (uint32_t)(field >> 32));
467}

◆ snrt_set_awuser_low()

void snrt_set_awuser_low ( uint32_t field)
inline
469 {
470 write_csr(user_low, (uint32_t)(field));
471}

◆ snrt_wait_writeback()

void snrt_wait_writeback ( uint32_t val)
inline

Ensure value is written back to the register file.

This function introduces a RAW dependency on val to stall the core until val is written back to the register file.

Parameters
val — The variable we want to wait on.
449 {
450 asm volatile("mv %0, %0" : "+r"(val)::);
451}

◆ snrt_wake_clusters()

void snrt_wake_clusters ( uint32_t core_mask,
snrt_comm_t comm = NULL )
inline

Wake the clusters belonging to a given communicator. Can only be called by a single core in the whole system!

Parameters
comm — The communicator determining which clusters to wake up.
Note
When multicast is enabled the interrupt is sent also to the cluster invoking the function. As a consequence even the core invoking the function should clear its own interrupt.
167 {
168 // If no communicator is given, world communicator is used as default.
169 if (comm == NULL) comm = snrt_comm_world;
170
171#ifdef SNRT_SUPPORTS_NARROW_MULTICAST
172 // Multicast cluster interrupt to every other cluster's core
173 if (snrt_cluster_num() > 0) {
174 volatile snitch_cluster_t *cluster = snrt_cluster(0);
175#pragma clang diagnostic push
176#pragma clang diagnostic ignored "-Waddress-of-packed-member"
177 uint32_t *addr = (uint32_t *)&(cluster->peripheral_reg.cl_clint_set.w);
178#pragma clang diagnostic pop
179 uint32_t mcast_mask = snrt_get_collective_mask(comm);
180 snrt_enable_multicast(mcast_mask);
181 *addr = core_mask;
183 }
184#else
185 // Wake clusters sequentially.
186 // We find all clusters represented by the (base, mask) encoding through
187 // submask enumeration (https://codeforces.com/blog/entry/108942).
188 uint32_t mask = comm->mask;
189 uint32_t fixed = comm->base & ~mask;
190 uint32_t submask = 0;
191 do {
192 uint32_t i = fixed | submask;
193 if (snrt_cluster_idx() != i) snrt_int_cluster_set(core_mask, i);
194 submask = (submask - 1) & mask;
195 } while (submask != 0);
196#endif
197}
void snrt_disable_multicast()
Disable LSU multicast.
Definition sync.h:498
void snrt_enable_multicast(uint64_t mask)
Enable LSU multicast.
Definition sync.h:488

Variable Documentation

◆ snrt_comm_world_info

__thread snrt_comm_info_t snrt_comm_world_info
extern
14 {
15 .barrier_ptr = &(_snrt_barrier.cnt),
16 .size = SNRT_CLUSTER_NUM,
17 .mask = SNRT_CLUSTER_NUM - 1,
18 .base = 0,
19 .is_participant = 1};