Snitch Runtime
sync.h File Reference

This file provides functions to synchronize Snitch cores. More...

#include "../../deps/riscv-opcodes/encoding.h"
#include <math.h>


Macros

#define SNRT_BROADCAST_MASK   ((SNRT_CLUSTER_NUM - 1) * SNRT_CLUSTER_OFFSET)
 

Functions

void snrt_comm_init ()
 Initialize the global (world) communicator.
 
void snrt_comm_create (uint32_t size, snrt_comm_t *communicator)
 Creates a communicator object.
 
volatile uint32_t * snrt_mutex ()
 Get a pointer to a mutex variable.
 
void snrt_mutex_acquire (volatile uint32_t *pmtx)
 Acquire a mutex, blocking.
 
void snrt_mutex_ttas_acquire (volatile uint32_t *pmtx)
 Acquire a mutex, blocking.
 
void snrt_mutex_release (volatile uint32_t *pmtx)
 Release a previously-acquired mutex.
 
void snrt_wake_clusters (uint32_t core_mask, snrt_comm_t comm=NULL)
 Wake the clusters belonging to a given communicator.
 
void snrt_cluster_hw_barrier ()
 Synchronize cores in a cluster with a hardware barrier, blocking.
 
static void snrt_inter_cluster_barrier (snrt_comm_t comm=NULL)
 Synchronize one core from every cluster with the others.
 
void snrt_global_barrier (snrt_comm_t comm)
 Synchronize all Snitch cores.
 
void snrt_partial_barrier (snrt_barrier_t *barr, uint32_t n)
 Generic software barrier.
 
uint32_t snrt_global_all_to_all_reduction (uint32_t value)
 Perform a global sum reduction, blocking.
 
template<typename T >
void snrt_global_reduction_dma (T *dst_buffer, T *src_buffer, size_t len, snrt_comm_t comm=NULL)
 Perform a sum reduction among clusters, blocking.
 
void snrt_wait_writeback (uint32_t val)
 Ensure value is written back to the register file.
 
void snrt_enable_multicast (uint32_t mask)
 Enable LSU multicast.
 
void snrt_disable_multicast ()
 Disable LSU multicast.
 

Variables

__thread snrt_comm_info_t snrt_comm_world_info
 
__thread snrt_comm_t snrt_comm_world
 

Detailed Description

This file provides functions to synchronize Snitch cores.

Function Documentation

◆ snrt_cluster_hw_barrier()

void snrt_cluster_hw_barrier ( )
inline

Synchronize cores in a cluster with a hardware barrier, blocking.

Note
Synchronizes all (both DM and compute) cores. All cores must invoke this function, or the calling cores will stall indefinitely.
{
    asm volatile("csrr x0, 0x7C2" ::: "memory");
}

◆ snrt_comm_create()

void snrt_comm_create (uint32_t size, snrt_comm_t *communicator)
inline

Creates a communicator object.

The newly created communicator object includes the first size clusters, i.e. clusters 0 through size - 1. All clusters, even those which are not part of the communicator, must invoke this function.

Parameters
size: The number of clusters to include in the communicator.
communicator: Pointer to the communicator object to be created.
{
    // Allocate communicator struct in L1 and point to it.
    *communicator =
        (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t));

    // Allocate barrier counter in L3 and initialize to 0. Every core invokes
    // the allocation function to update its allocator, but only one core
    // initializes it. A global barrier is then used to ensure all cores "see"
    // the initialized value.
    uint32_t *barrier_ptr = (uint32_t *)snrt_l3_alloc_v2(sizeof(uint32_t));
    if (snrt_global_core_idx() == 0) *barrier_ptr = 0;
    snrt_global_barrier(snrt_comm_world);

    // Initialize communicator, pointing to the newly-allocated barrier
    // counter in L3.
    (*communicator)->size = size;
    (*communicator)->barrier_ptr = barrier_ptr;
    (*communicator)->is_participant = snrt_cluster_idx() < size;
}
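
For illustration, a minimal usage sketch (not part of sync.h): the main function body and the choice of two clusters are assumptions. Every cluster calls snrt_comm_create, and the resulting communicator can then be passed to snrt_global_barrier.

#include "snrt.h"

int main() {
    // Hypothetical sketch, assuming a system with at least two clusters.
    snrt_comm_init();

    // Every cluster, participant or not, must call snrt_comm_create.
    snrt_comm_t comm;
    snrt_comm_create(2, &comm);

    // ... work distributed over the first two clusters ...

    // All cores call the barrier; only the clusters in the communicator wait
    // on each other, the remaining clusters synchronize locally.
    snrt_global_barrier(comm);
    return 0;
}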

◆ snrt_comm_init()

void snrt_comm_init ( )
inline

Initialize the global (world) communicator.

This function points snrt_comm_world, the default communicator used by the synchronization functions, to the statically-initialized snrt_comm_world_info structure.

Note
This function should be called before using any of the other communicator functions.
{ snrt_comm_world = &snrt_comm_world_info; }

◆ snrt_disable_multicast()

void snrt_disable_multicast ( )
inline

Disable LSU multicast.

{ write_csr(0x7c4, 0); }

◆ snrt_enable_multicast()

void snrt_enable_multicast ( uint32_t mask)
inline

Enable LSU multicast.

All stores performed after this call will be multicast to all addresses specified by the address and mask pair.

Parameters
mask: Multicast mask value.
{ write_csr(0x7c4, mask); }
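
As an illustration, a minimal sketch of how the multicast pair might be used together with SNRT_BROADCAST_MASK: the function name, the pointer argument, and the assumption that it points into cluster-local L1 memory are hypothetical.

#include "snrt.h"

// Sketch: broadcast a word from cluster 0 to the same TCDM offset in all
// clusters covered by SNRT_BROADCAST_MASK.
void broadcast_word(volatile uint32_t *l1_word, uint32_t value) {
    if (snrt_cluster_idx() == 0 && snrt_is_dm_core()) {
        // Stores issued between enable and disable are replicated to the
        // addresses selected by the store address and the mask.
        snrt_enable_multicast(SNRT_BROADCAST_MASK);
        *l1_word = value;
        snrt_disable_multicast();
    }
}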

◆ snrt_global_all_to_all_reduction()

uint32_t snrt_global_all_to_all_reduction ( uint32_t value)
inline

Perform a global sum reduction, blocking.

All cores participate in the reduction and synchronize globally to wait for the reduction to complete. The synchronization is performed via snrt_global_barrier.

Parameters
value: The value to be summed.
Returns
The result of the sum reduction.
Note
Every Snitch core must invoke this function, or the calling cores will stall indefinitely.
{
    // Reduce cores within cluster in TCDM
    uint32_t *cluster_result = &(cls()->reduction);
    uint32_t tmp = __atomic_fetch_add(cluster_result, value, __ATOMIC_RELAXED);

    // Wait for writeback to ensure AMO is seen by all cores after barrier
    snrt_wait_writeback(tmp);
    snrt_cluster_hw_barrier();

    // Reduce DM cores across clusters in global memory
    if (snrt_is_dm_core()) {
        __atomic_add_fetch(&_reduction_result, *cluster_result,
                           __ATOMIC_RELAXED);
        snrt_inter_cluster_barrier();
        *cluster_result = _reduction_result;
    }
    snrt_cluster_hw_barrier();
    return *cluster_result;
}
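
A usage sketch (not from sync.h): the choice of contributing the core index is arbitrary and only illustrates that every core passes one value and reads back the same global sum.

#include "snrt.h"

int main() {
    // Every core contributes its own index; afterwards every core holds the
    // same global sum.
    uint32_t sum = snrt_global_all_to_all_reduction(snrt_global_core_idx());
    (void)sum;
    return 0;
}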

◆ snrt_global_barrier()

void snrt_global_barrier ( snrt_comm_t comm)
inline

Synchronize all Snitch cores.

Synchronization is performed hierarchically. Within a cluster, cores are synchronized through a hardware barrier (see snrt_cluster_hw_barrier). Clusters are synchronized through a software barrier (see snrt_inter_cluster_barrier).

Parameters
comm: The communicator determining which clusters synchronize.
Note
Every Snitch core must invoke this function, or the calling cores will stall indefinitely.
{
    snrt_cluster_hw_barrier();

    // Synchronize all DM cores in software
    if (snrt_is_dm_core()) {
        snrt_inter_cluster_barrier(comm);
    }
    // Synchronize cores in a cluster with the HW barrier
    snrt_cluster_hw_barrier();
}
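
A minimal two-phase sketch (hypothetical; the work in each phase is omitted) showing the typical use of the global barrier between producer and consumer phases.

#include "snrt.h"

int main() {
    // Phase 1: e.g. DM cores load inputs, compute cores produce partial data.

    // No core enters phase 2 before every core has finished phase 1.
    snrt_global_barrier(snrt_comm_world);

    // Phase 2: consume the data produced in phase 1.
    return 0;
}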

◆ snrt_global_reduction_dma()

template<typename T >
void snrt_global_reduction_dma (T *dst_buffer, T *src_buffer, size_t len, snrt_comm_t comm = NULL)
inline

Perform a sum reduction among clusters, blocking.

The reduction is performed in a logarithmic fashion. At every level of the binary reduction tree, half of the active clusters participate as senders and the other half as receivers. Senders use the DMA to send their data to the respective receiver's destination buffer. The receiver then reduces each element in its destination buffer with the respective element in its source buffer, stores the result in the source buffer, and proceeds to the next level in the tree.

Parameters
dst_buffer: The pointer to the calling cluster's destination buffer.
src_buffer: The pointer to the calling cluster's source buffer.
len: The number of elements in each buffer. Only integer multiples of the number of compute cores are supported at the moment.
comm: The communicator determining which clusters participate in the reduction.
Note
The destination buffers must lie at the same offset in every cluster's TCDM.
{
    // If no communicator is given, world communicator is used as default.
    if (comm == NULL) comm = snrt_comm_world;

    // If we have a single cluster, no reduction has to be done
    if (comm->size > 1) {
        // DMA core will send compute cores' data, so it must wait on it
        // to be available
        snrt_fpu_fence();
        snrt_cluster_hw_barrier();

        // Iterate levels in the binary reduction tree
        int num_levels = ceil(log2(comm->size));
        for (unsigned int level = 0; level < num_levels; level++) {
            // Determine whether the current cluster is an active cluster.
            // An active cluster is a cluster that participates in the current
            // level of the reduction tree. Every second cluster among the
            // active ones is a sender.
            uint32_t is_active = (snrt_cluster_idx() % (1 << level)) == 0;
            uint32_t is_sender = (snrt_cluster_idx() % (1 << (level + 1))) != 0;

            // If the cluster is a sender, it sends the data in its source
            // buffer to the respective receiver's destination buffer
            if (is_active && is_sender) {
                if (!snrt_is_compute_core()) {
                    uint64_t dst = (uint64_t)dst_buffer -
                                   (1 << level) * SNRT_CLUSTER_OFFSET;
                    snrt_dma_start_1d(dst, (uint64_t)src_buffer,
                                      len * sizeof(T));
                    snrt_dma_wait_all();
                }
            }

            // Synchronize senders and receivers
            snrt_global_barrier(comm);

            // Every cluster which is not a sender performs the reduction
            if (is_active && !is_sender) {
                // Computation is parallelized over the compute cores
                if (snrt_is_compute_core()) {
                    uint32_t items_per_core =
                        len / snrt_cluster_compute_core_num();
                    uint32_t core_offset =
                        snrt_cluster_core_idx() * items_per_core;
                    for (uint32_t i = 0; i < items_per_core; i++) {
                        uint32_t abs_i = core_offset + i;
                        src_buffer[abs_i] += dst_buffer[abs_i];
                    }
                }
            }

            // Synchronize compute and DM cores for next tree level
            snrt_fpu_fence();
            snrt_cluster_hw_barrier();
        }
    }
}
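
A usage sketch (hypothetical): the buffer length and the single-argument allocation call mirror the snrt_l1_alloc_cluster_local usage shown in snrt_comm_create above, and LEN is assumed to be a multiple of the compute-core count.

#include "snrt.h"

#define LEN 64  // assumed multiple of snrt_cluster_compute_core_num()

int main() {
    // Cluster-local L1 allocation places src and dst at the same TCDM offset
    // in every cluster, as required by the reduction.
    double *src = (double *)snrt_l1_alloc_cluster_local(LEN * sizeof(double));
    double *dst = (double *)snrt_l1_alloc_cluster_local(LEN * sizeof(double));

    // ... each cluster fills src with its partial results ...

    // Element-wise sum over all clusters in the world communicator; the fully
    // reduced result ends up in the receiving cluster's source buffer.
    snrt_global_reduction_dma(dst, src, (size_t)LEN);
    return 0;
}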

◆ snrt_inter_cluster_barrier()

static void snrt_inter_cluster_barrier ( snrt_comm_t comm = NULL)
inline static

Synchronize one core from every cluster with the others.

Parameters
comm: The communicator determining which clusters synchronize.

Implemented as a software barrier.

Note
One core per cluster participating in the barrier must invoke this function, or the calling cores will stall indefinitely.
{
    // If no communicator is given, world communicator is used as default.
    if (comm == NULL) comm = snrt_comm_world;

    // If the current cluster is not a participant, return immediately.
    if (!comm->is_participant) return;

    // Clusters participating in the barrier increment a shared counter.
    uint32_t cnt = __atomic_add_fetch(comm->barrier_ptr, 1, __ATOMIC_RELAXED);

    // All but the last cluster arriving on the barrier enter WFI. The last
    // cluster resets the counter for the next barrier (if any) and multicasts
    // an interrupt to wake up the other clusters.
    if (cnt == comm->size) {
        *(comm->barrier_ptr) = 0;
        snrt_wake_clusters(1 << snrt_cluster_core_idx(), comm);
    } else {
        snrt_wfi();
        // Clear interrupt for next barrier
        snrt_int_clr_mcip();
    }
}
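
A sketch of the typical calling pattern (hypothetical, mirroring the structure of snrt_global_barrier): only one core per cluster, here the DM core, enters the inter-cluster barrier, and the remaining cores rejoin through the hardware barrier.

#include "snrt.h"

int main() {
    if (snrt_is_dm_core()) {
        // ... DM core moves data between L3 and TCDM ...
        snrt_inter_cluster_barrier();  // NULL comm defaults to snrt_comm_world
    }
    // Rejoin the compute cores within each cluster.
    snrt_cluster_hw_barrier();
    return 0;
}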

◆ snrt_mutex()

volatile uint32_t * snrt_mutex ( )
inline

Get a pointer to a mutex variable.

{ return &_snrt_mutex; }

◆ snrt_mutex_acquire()

void snrt_mutex_acquire ( volatile uint32_t * pmtx)
inline

Acquire a mutex, blocking.

Test-and-set (TAS) implementation of a lock.

Parameters
pmtx: A pointer to a variable which can be used as a mutex, i.e. to which all cores have a reference and at a memory location to which atomic accesses can be made. This can be declared e.g. as static volatile uint32_t mtx = 0;
{
    asm volatile(
        "li t0,1 # t0 = 1\n"
        "1:\n"
        " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
        " bnez t0,1b # Retry if previously set\n"
        : "+r"(pmtx)
        :
        : "t0");
}
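
A usage sketch following the declaration suggested above; the shared counter and its placement are assumptions for illustration. Alternatively, the pre-allocated mutex returned by snrt_mutex() can be used in place of a user-declared variable.

#include "snrt.h"

static volatile uint32_t mtx = 0;
static volatile uint32_t shared_counter = 0;

int main() {
    snrt_mutex_acquire(&mtx);  // or snrt_mutex_ttas_acquire(&mtx)
    shared_counter += 1;       // critical section
    snrt_mutex_release(&mtx);
    return 0;
}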

◆ snrt_mutex_release()

void snrt_mutex_release ( volatile uint32_t * pmtx)
inline

Release a previously-acquired mutex.

{
    asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n"
                 : "+r"(pmtx));
}

◆ snrt_mutex_ttas_acquire()

void snrt_mutex_ttas_acquire ( volatile uint32_t * pmtx)
inline

Acquire a mutex, blocking.

Same as snrt_mutex_acquire but acquires the lock using a test and test-and-set (TTAS) strategy.

{
    asm volatile(
        "1:\n"
        " lw t0, 0(%0)\n"
        " bnez t0, 1b\n"
        " li t0,1 # t0 = 1\n"
        "2:\n"
        " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
        " bnez t0,2b # Retry if previously set\n"
        : "+r"(pmtx)
        :
        : "t0");
}

◆ snrt_partial_barrier()

void snrt_partial_barrier (snrt_barrier_t *barr, uint32_t n)
inline

Generic software barrier.

Parameters
barr: Pointer to a barrier variable.
n: Number of harts that have to enter the barrier before any is released.
Note
Exactly the specified number of harts must invoke this function, or the calling cores will stall indefinitely.
{
    // Remember previous iteration
    uint32_t prev_it = barr->iteration;
    uint32_t cnt = __atomic_add_fetch(&barr->cnt, 1, __ATOMIC_RELAXED);

    // Increment the barrier counter
    if (cnt == n) {
        barr->cnt = 0;
        __atomic_add_fetch(&barr->iteration, 1, __ATOMIC_RELAXED);
    } else {
        // Some threads have not reached the barrier --> Let's wait
        while (prev_it == barr->iteration)
            ;
    }
}
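
A sketch of one possible use (hypothetical): a software barrier among the DM cores of all clusters. The statically-allocated, zero-initialized barrier variable and its placement in a memory region supporting atomics are assumptions; snrt_barrier_t is declared in sync_decls.h.

#include "snrt.h"

static snrt_barrier_t dm_barrier;

void dm_sync_example(void) {
    if (snrt_is_dm_core()) {
        // Exactly snrt_cluster_num() harts (one DM core per cluster) enter.
        snrt_partial_barrier(&dm_barrier, snrt_cluster_num());
    }
}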

◆ snrt_wait_writeback()

void snrt_wait_writeback ( uint32_t val)
inline

Ensure value is written back to the register file.

This function introduces a RAW dependency on val to stall the core until val is written back to the register file.

Parameters
val: The variable we want to wait on.
{
    asm volatile("mv %0, %0" : "+r"(val)::);
}
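
A sketch mirroring the pattern in snrt_global_all_to_all_reduction (the function name and counter argument are hypothetical): the writeback wait ensures the atomic's result has retired before the hardware barrier orders the cores.

#include "snrt.h"

void amo_then_barrier(volatile uint32_t *counter) {
    uint32_t old = __atomic_fetch_add(counter, 1, __ATOMIC_RELAXED);
    snrt_wait_writeback(old);  // RAW dependency stalls until 'old' is written back
    snrt_cluster_hw_barrier();
}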

◆ snrt_wake_clusters()

void snrt_wake_clusters (uint32_t core_mask, snrt_comm_t comm = NULL)
inline

Wake the clusters belonging to a given communicator.

Parameters
core_mask: Mask selecting the cores to wake within each cluster (written to the cluster CLINT set register).
comm: The communicator determining which clusters to wake up.
{
    // If no communicator is given, world communicator is used as default.
    if (comm == NULL) comm = snrt_comm_world;

#ifdef SNRT_SUPPORTS_MULTICAST
    // Multicast cluster interrupt to every other cluster's core
    // Note: we need to address another cluster's address space
    // because the cluster XBAR has not been extended to support
    // multicast yet. We address the second cluster, if we are the
    // first cluster, and the first cluster otherwise.
    if (snrt_cluster_num() > 1) {
        volatile snitch_cluster_t *cluster;
        if (snrt_cluster_idx() == 0)
            cluster = snrt_cluster(1);
        else
            cluster = snrt_cluster(0);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Waddress-of-packed-member"
        uint32_t *addr = (uint32_t *)&(cluster->peripheral_reg.cl_clint_set.w);
#pragma clang diagnostic pop
        uint32_t mcast_mask = ((comm->size) - 1) * SNRT_CLUSTER_OFFSET;
        snrt_enable_multicast(mcast_mask);
        *addr = core_mask;
        snrt_disable_multicast();
    }
#else
    // Wake clusters sequentially
    for (int i = 0; i < comm->size; i++) {
        if (snrt_cluster_idx() != i) {
            snrt_cluster(i)->peripheral_reg.cl_clint_set.f.cl_clint_set =
                core_mask;
        }
    }
#endif
}

Variable Documentation

◆ snrt_comm_world_info

__thread snrt_comm_info_t snrt_comm_world_info
extern
{
    .barrier_ptr = &(_snrt_barrier.cnt),
    .size = SNRT_CLUSTER_NUM,
    .is_participant = 1};