Snitch Runtime
sync.h
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Luca Colagrande <colluca@iis.ee.ethz.ch>
// Viviane Potocnik <vivianep@iis.ee.ethz.ch>

#pragma once

#include "../../deps/riscv-opcodes/encoding.h"

#include <math.h>

#define SNRT_BROADCAST_MASK ((SNRT_CLUSTER_NUM - 1) * SNRT_CLUSTER_OFFSET)

//================================================================================
// Communicator functions
//================================================================================

extern __thread snrt_comm_info_t snrt_comm_world_info;
extern __thread snrt_comm_t snrt_comm_world;

/**
 * @brief Initialize the world communicator.
 */
inline void snrt_comm_init() { snrt_comm_world = &snrt_comm_world_info; }

/**
 * @brief Creates a communicator object.
 *
 * @param size Number of clusters participating in the communicator.
 * @param communicator Pointer through which the communicator is returned.
 */
inline void snrt_comm_create(uint32_t size, snrt_comm_t *communicator) {
    // Allocate communicator struct in L1 and point to it.
    *communicator =
        (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t));

    // Allocate barrier counter in L3 and initialize to 0. Every core invokes
    // the allocation function to update its allocator, but only one core
    // initializes it. A global barrier is then used to ensure all cores "see"
    // the initialized value.
    uint32_t *barrier_ptr = (uint32_t *)snrt_l3_alloc_v2(sizeof(uint32_t));
    if (snrt_global_core_idx() == 0) *barrier_ptr = 0;
    snrt_global_barrier();

    // Initialize communicator, pointing to the newly-allocated barrier
    // counter in L3.
    (*communicator)->size = size;
    (*communicator)->barrier_ptr = barrier_ptr;
    (*communicator)->is_participant = snrt_cluster_idx() < size;
}

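// Usage sketch (illustrative, not part of the runtime): restrict a barrier
// to the first two clusters. All cores must call snrt_comm_create(), so that
// every core's allocator state stays consistent.
//
//     snrt_comm_t comm;
//     snrt_comm_create(2, &comm);
//     snrt_global_barrier(comm);  // only clusters 0 and 1 take part in the
//                                 // inter-cluster barrier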
//================================================================================
// Mutex functions
//================================================================================

/**
 * @brief Get a pointer to a mutex variable.
 */
inline volatile uint32_t *snrt_mutex() { return &_snrt_mutex; }

/**
 * @brief Acquire a mutex, blocking.
 *
 * Spins on an atomic swap until the lock is obtained.
 */
inline void snrt_mutex_acquire(volatile uint32_t *pmtx) {
    asm volatile(
        "li t0,1 # t0 = 1\n"
        "1:\n"
        " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
        " bnez t0,1b # Retry if previously set\n"
        : "+r"(pmtx)
        :
        : "t0");
}

/**
 * @brief Acquire a mutex, blocking, using a test-and-test-and-set (TTAS)
 *        strategy: spin on a plain load until the lock appears free, then
 *        attempt the atomic swap.
 */
inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) {
    asm volatile(
        "1:\n"
        " lw t0, 0(%0)\n"
        " bnez t0, 1b\n"
        " li t0,1 # t0 = 1\n"
        "2:\n"
        " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
        " bnez t0,2b # Retry if previously set\n"
        : "+r"(pmtx)
        :
        : "t0");
}

/**
 * @brief Release a previously-acquired mutex.
 */
inline void snrt_mutex_release(volatile uint32_t *pmtx) {
    asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n"
                 : "+r"(pmtx));
}

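// Usage sketch (illustrative): serialize updates of a shared counter with
// the global mutex.
//
//     snrt_mutex_acquire(snrt_mutex());
//     shared_counter++;  // hypothetical variable in shared memory
//     snrt_mutex_release(snrt_mutex());
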
//================================================================================
// Barrier functions
//================================================================================

/**
 * @brief Wake the clusters belonging to a given communicator.
 */
inline void snrt_wake_clusters(uint32_t core_mask, snrt_comm_t comm = NULL) {
    // If no communicator is given, world communicator is used as default.
    if (comm == NULL) comm = snrt_comm_world;

#ifdef SNRT_SUPPORTS_MULTICAST
    // Multicast cluster interrupt to every other cluster's core
    // Note: we need to address another cluster's address space
    // because the cluster XBAR has not been extended to support
    // multicast yet. We address the second cluster, if we are the
    // first cluster, and the first cluster otherwise.
    if (snrt_cluster_num() > 1) {
        volatile snitch_cluster_t *cluster;
        if (snrt_cluster_idx() == 0)
            cluster = snrt_cluster(1);
        else
            cluster = snrt_cluster(0);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Waddress-of-packed-member"
        uint32_t *addr = (uint32_t *)&(cluster->peripheral_reg.cl_clint_set.w);
#pragma clang diagnostic pop
        uint32_t mcast_mask = ((comm->size) - 1) * SNRT_CLUSTER_OFFSET;
        snrt_enable_multicast(mcast_mask);
        *addr = core_mask;
        snrt_disable_multicast();
    }
#else
    // Wake clusters sequentially
    for (int i = 0; i < comm->size; i++) {
        if (snrt_cluster_idx() != i) {
            snrt_cluster(i)->peripheral_reg.cl_clint_set.f.cl_clint_set =
                core_mask;
        }
    }
#endif
}

/**
 * @brief Synchronize cores in a cluster with a hardware barrier, blocking.
 */
inline void snrt_cluster_hw_barrier() {
    asm volatile("csrr x0, 0x7C2" ::: "memory");
}

/**
 * @brief Synchronize one core from every cluster with the others.
 */
static inline void snrt_inter_cluster_barrier(snrt_comm_t comm = NULL) {
    // If no communicator is given, world communicator is used as default.
    if (comm == NULL) comm = snrt_comm_world;

    // If the current cluster is not a participant, return immediately.
    if (!comm->is_participant) return;

    // Clusters participating in the barrier increment a shared counter.
    uint32_t cnt = __atomic_add_fetch(comm->barrier_ptr, 1, __ATOMIC_RELAXED);

    // All but the last cluster arriving at the barrier enter WFI. The last
    // cluster resets the counter for the next barrier (if any) and multicasts
    // an interrupt to wake up the other clusters.
    if (cnt == comm->size) {
        *(comm->barrier_ptr) = 0;
        snrt_wake_clusters(1 << snrt_cluster_core_idx(), comm);
    } else {
        snrt_wfi();
        // Clear interrupt for next barrier
        snrt_int_clr_mcip();
    }
}

/**
 * @brief Synchronize all Snitch cores.
 */
inline void snrt_global_barrier(snrt_comm_t comm = NULL) {
    snrt_cluster_hw_barrier();

    // Synchronize all DM cores in software
    if (snrt_is_dm_core()) {
        snrt_inter_cluster_barrier(comm);
    }
    // Synchronize cores in a cluster with the HW barrier
    snrt_cluster_hw_barrier();
}

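// Usage sketch (illustrative): separate two kernel phases so that every
// cluster's phase-1 results are complete before any cluster consumes them.
//
//     compute_phase_1();      // hypothetical per-cluster work
//     snrt_global_barrier();  // world communicator by default
//     compute_phase_2();      // may now read other clusters' results
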
/**
 * @brief Generic software barrier over n participants.
 */
inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) {
    // Remember the iteration we arrive in
    uint32_t prev_it = barr->iteration;

    // Increment the barrier counter
    uint32_t cnt = __atomic_add_fetch(&barr->cnt, 1, __ATOMIC_RELAXED);

    // The last core to arrive resets the counter and advances the iteration;
    // all others spin until the iteration changes.
    if (cnt == n) {
        barr->cnt = 0;
        __atomic_add_fetch(&barr->iteration, 1, __ATOMIC_RELAXED);
    } else {
        // Some cores have not reached the barrier yet --> let's wait
        while (prev_it == barr->iteration)
            ;
    }
}

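// Usage sketch (illustrative): synchronize only the compute cores of a
// cluster, assuming `barr` is a zero-initialized snrt_barrier_t visible to
// all of them.
//
//     if (snrt_is_compute_core()) {
//         snrt_partial_barrier(&barr, snrt_cluster_compute_core_num());
//     }
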
//================================================================================
// Reduction functions
//================================================================================

/**
 * @brief Perform a global sum reduction, blocking.
 */
inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) {
    // Reduce cores within cluster in TCDM
    uint32_t *cluster_result = &(cls()->reduction);
    uint32_t tmp = __atomic_fetch_add(cluster_result, value, __ATOMIC_RELAXED);

    // Wait for writeback to ensure AMO is seen by all cores after barrier
    snrt_wait_writeback(tmp);
    snrt_cluster_hw_barrier();

    // Reduce DM cores across clusters in global memory
    if (snrt_is_dm_core()) {
        __atomic_add_fetch(&_reduction_result, *cluster_result,
                           __ATOMIC_RELAXED);
        snrt_inter_cluster_barrier();
        *cluster_result = _reduction_result;
    }
    snrt_cluster_hw_barrier();
    return *cluster_result;
}

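// Usage sketch (illustrative): every core contributes one value and all
// cores observe the global sum, e.g. to tally error counts across cores.
//
//     uint32_t local_errors = check_my_slice();  // hypothetical helper
//     uint32_t total_errors = snrt_global_all_to_all_reduction(local_errors);
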
/**
 * @brief Perform a sum reduction among clusters, blocking.
 *
 * The reduction is implemented as a binary tree over the clusters in the
 * communicator: at every level, each sending cluster DMA-copies its source
 * buffer into its partner's destination buffer, and the partner accumulates
 * it into its own source buffer.
 */
template <typename T>
inline void snrt_global_reduction_dma(T *dst_buffer, T *src_buffer, size_t len,
                                      snrt_comm_t comm = NULL) {
    // If no communicator is given, world communicator is used as default.
    if (comm == NULL) comm = snrt_comm_world;

    // If we have a single cluster, no reduction has to be done
    if (comm->size > 1) {
        // DMA core will send compute cores' data, so it must wait for it
        // to be available
        snrt_fpu_fence();
        snrt_cluster_hw_barrier();

        // Iterate levels in the binary reduction tree
        int num_levels = ceil(log2(comm->size));
        for (unsigned int level = 0; level < num_levels; level++) {
            // Determine whether the current cluster is an active cluster.
            // An active cluster is a cluster that participates in the current
            // level of the reduction tree. Every second cluster among the
            // active ones is a sender.
            uint32_t is_active = (snrt_cluster_idx() % (1 << level)) == 0;
            uint32_t is_sender = (snrt_cluster_idx() % (1 << (level + 1))) != 0;

            // If the cluster is a sender, it sends the data in its source
            // buffer to the respective receiver's destination buffer
            if (is_active && is_sender) {
                if (!snrt_is_compute_core()) {
                    uint64_t dst = (uint64_t)dst_buffer -
                                   (1 << level) * SNRT_CLUSTER_OFFSET;
                    snrt_dma_start_1d(dst, (uint64_t)src_buffer,
                                      len * sizeof(T));
                    snrt_dma_wait_all();
                }
            }

            // Synchronize senders and receivers
            snrt_global_barrier(comm);

            // Every cluster which is not a sender performs the reduction
            if (is_active && !is_sender) {
                // Computation is parallelized over the compute cores
                if (snrt_is_compute_core()) {
                    uint32_t items_per_core =
                        len / snrt_cluster_compute_core_num();
                    uint32_t core_offset =
                        snrt_cluster_core_idx() * items_per_core;
                    for (uint32_t i = 0; i < items_per_core; i++) {
                        uint32_t abs_i = core_offset + i;
                        src_buffer[abs_i] += dst_buffer[abs_i];
                    }
                }
            }

            // Synchronize compute and DM cores for next tree level
            snrt_fpu_fence();
            snrt_cluster_hw_barrier();
        }
    }
}

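// Usage sketch (illustrative): sum `len` elements per cluster across all
// clusters. `src` holds each cluster's partial results in L1 and `dst` is a
// scratch receive buffer of the same size; cluster 0's `src` ends up holding
// the total.
//
//     snrt_global_reduction_dma(dst, src, len);
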
//================================================================================
// Memory consistency
//================================================================================

/**
 * @brief Ensure a value is written back to the register file.
 *
 * Consumes the value in a dummy move so that execution stalls until the
 * instruction producing it (e.g. an AMO) has written back its result.
 */
inline void snrt_wait_writeback(uint32_t val) {
    asm volatile("mv %0, %0" : "+r"(val)::);
}

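// Usage sketch (illustrative): stall until an AMO result has landed in the
// register file before entering a barrier, as done in
// snrt_global_all_to_all_reduction() above.
//
//     uint32_t old = __atomic_fetch_add(cnt_ptr, 1, __ATOMIC_RELAXED);
//     snrt_wait_writeback(old);
//     snrt_cluster_hw_barrier();
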
//================================================================================
// Multicast functions
//================================================================================

/**
 * @brief Enable LSU multicast by writing the mask to a custom CSR.
 */
inline void snrt_enable_multicast(uint32_t mask) { write_csr(0x7c4, mask); }

/**
 * @brief Disable LSU multicast.
 */
inline void snrt_disable_multicast() { write_csr(0x7c4, 0); }
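
// Usage sketch (illustrative): replicate a store to the same L1 offset in
// every cluster, then restore normal unicast stores. SNRT_BROADCAST_MASK
// covers all clusters in the system.
//
//     snrt_enable_multicast(SNRT_BROADCAST_MASK);
//     *flag_ptr = 1;  // hypothetical pointer into cluster-local L1
//     snrt_disable_multicast();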