Snitch Runtime
sync.h
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Luca Colagrande <colluca@iis.ee.ethz.ch>
// Viviane Potocnik <vivianep@iis.ee.ethz.ch>

#pragma once

#include "../../deps/riscv-opcodes/encoding.h"

#include <math.h>

//================================================================================
// Communicator functions
//================================================================================

extern __thread snrt_comm_info_t snrt_comm_world_info;
extern __thread snrt_comm_t snrt_comm_world;
/// Initialize the world communicator.
inline void snrt_comm_init() {
    // Point to the default-initialized communicator struct, with barrier
    // pointer in L3.
    snrt_comm_world = &snrt_comm_world_info;

    // Allocate the barrier counter in L1. This allows us to perform global
    // hardware barriers, as reductions are currently not supported in L3.
    // All clusters allocate a barrier counter, to keep all clusters' L1
    // allocators aligned, but only the zero-th cluster's counter is actually
    // used and initialized. A global barrier is then used to ensure all
    // cores "see" the initialized value. This global barrier uses the
    // default-initialized barrier pointer in L3. It must thus be a software
    // barrier, as we currently do not support hardware reductions in L3.
    void *barrier_ptr = snrt_l1_alloc_cluster_local(sizeof(uint32_t));
    barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), 0);
    if (snrt_global_core_idx() == 0) {
        *(uint32_t *)barrier_ptr = 0;
        // TODO(colluca): this is a workaround that shouldn't be necessary.
        // It seems some tests expect the next pointer at the start of the
        // user application to be aligned to the hyperbank.
        // > Should we get rid of the alloc_v1 API altogether and fix these?
        snrt_l1_update_next(snrt_l1_next_aligned_hyperbank());
    }
    snrt_global_sw_barrier();

    // Update the communicator struct, pointing to the barrier pointer in L1.
    // This whole workaround is required because we cannot statically allocate
    // variables in L1.
    snrt_comm_world->barrier_ptr = (uint32_t *)barrier_ptr;
}

/// Creates a communicator object.
inline void snrt_comm_create(uint32_t size, snrt_comm_t *communicator) {
    // Allocate the communicator struct in L1 and point to it.
    *communicator =
        (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t));

    // Allocate the barrier counter in L1. This allows us to perform global
    // hardware barriers, as reductions are currently not supported in L3.
    // All clusters allocate a barrier counter, to keep all clusters' L1
    // allocators aligned, but only the zero-th cluster's counter is actually
    // used and initialized. A global barrier is then used to ensure all
    // cores "see" the initialized value.
    void *barrier_ptr = snrt_l1_alloc_cluster_local(sizeof(uint32_t));
    barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), 0);
    if (snrt_global_core_idx() == 0) *(uint32_t *)barrier_ptr = 0;
    snrt_global_barrier();

    // Initialize the communicator, pointing to the newly-allocated barrier
    // counter in L1.
    (*communicator)->size = size;
    (*communicator)->base = 0;
    (*communicator)->mask = size - 1;
    (*communicator)->barrier_ptr = (uint32_t *)barrier_ptr;
    (*communicator)->is_participant = snrt_cluster_idx() < size;
}
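
// Usage sketch (illustrative, not from the original file): carve out a
// communicator covering the first four clusters and synchronize only those.
// Like snrt_comm_init(), this must be called collectively by all clusters,
// since every cluster allocates from its L1 allocator.
//
//     snrt_comm_t comm;
//     snrt_comm_create(4, &comm);  // size is assumed to be a power of two,
//                                  // given the (base, mask) encoding
//     snrt_global_barrier(comm);   // only clusters 0..3 participate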

//================================================================================
// Mutex functions
//================================================================================

/// Get a pointer to a mutex variable.
inline volatile uint32_t *snrt_mutex() { return &_snrt_mutex; }

/// Acquire a mutex, blocking.
inline void snrt_mutex_acquire(volatile uint32_t *pmtx) {
    asm volatile(
        "li t0,1 # t0 = 1\n"
        "1:\n"
        " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
        " bnez t0,1b # Retry if previously set\n"
        : "+r"(pmtx)
        :
        : "t0");
}

/// Acquire a mutex, blocking. Test-and-test-and-set (TTAS) variant of
/// snrt_mutex_acquire().
inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) {
    asm volatile(
        "1:\n"
        " lw t0, 0(%0)\n"
        " bnez t0, 1b\n"
        " li t0,1 # t0 = 1\n"
        "2:\n"
        " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
        " bnez t0,2b # Retry if previously set\n"
        : "+r"(pmtx)
        :
        : "t0");
}

/// Release a previously-acquired mutex.
inline void snrt_mutex_release(volatile uint32_t *pmtx) {
    asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n"
                 : "+r"(pmtx));
}

//================================================================================
// Barrier functions
//================================================================================

/// Wake the clusters belonging to a given communicator.
inline void snrt_wake_clusters(uint32_t core_mask, snrt_comm_t comm = NULL) {
    // If no communicator is given, the world communicator is used as default.
    if (comm == NULL) comm = snrt_comm_world;

#ifdef SNRT_SUPPORTS_NARROW_MULTICAST
    // Multicast the cluster interrupt to every other cluster's cores
    if (snrt_cluster_num() > 0) {
        volatile snitch_cluster_t *cluster = snrt_cluster(0);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Waddress-of-packed-member"
        uint32_t *addr = (uint32_t *)&(cluster->peripheral_reg.cl_clint_set.w);
#pragma clang diagnostic pop
        uint32_t mcast_mask = snrt_get_collective_mask(comm);
        snrt_enable_multicast(mcast_mask);
        *addr = core_mask;
        snrt_disable_multicast();
    }
#else
    // Wake clusters sequentially.
    // We find all clusters represented by the (base, mask) encoding through
    // submask enumeration (https://codeforces.com/blog/entry/108942).
    uint32_t mask = comm->mask;
    uint32_t fixed = comm->base & ~mask;
    uint32_t submask = 0;
    do {
        uint32_t i = fixed | submask;
        if (snrt_cluster_idx() != i) snrt_int_cluster_set(core_mask, i);
        submask = (submask - 1) & mask;
    } while (submask != 0);
#endif
}
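
// Worked example of the submask enumeration above (illustrative): for a
// communicator with base = 4 and mask = 0b011, fixed = 0b100, and the loop
// visits submask = 0b00, 0b11, 0b10, 0b01, i.e. clusters 4, 7, 6, 5, before
// (submask - 1) & mask wraps back to 0 and the loop terminates.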

/// Synchronize cores in a cluster with a hardware barrier, blocking.
inline void snrt_cluster_hw_barrier() {
    asm volatile("csrr x0, barrier" ::: "memory");
}

static inline void snrt_inter_cluster_sw_barrier(snrt_comm_t comm = NULL) {
    // If no communicator is given, the world communicator is used as default.
    if (comm == NULL) comm = snrt_comm_world;

    // If the current cluster is not a participant, return immediately.
    if (!comm->is_participant) return;

    // Clusters participating in the barrier increment a shared counter.
    uint32_t cnt = __atomic_add_fetch(comm->barrier_ptr, 1, __ATOMIC_RELAXED);

    // All but the last cluster arriving at the barrier enter WFI. The last
    // cluster resets the counter for the next barrier (if any) and multicasts
    // an interrupt to wake up the other clusters.
    if (cnt == comm->size) {
        *(comm->barrier_ptr) = 0;
        snrt_fence();
        snrt_wake_clusters(1 << snrt_cluster_core_idx(), comm);
    } else {
        snrt_wfi();
    }
    // Clear the interrupt for the next barrier (it also arrives at the sender)
    snrt_int_clr_mcip();
}

/// Synchronize one core from every cluster with the others.
static inline void snrt_inter_cluster_barrier(snrt_comm_t comm = NULL) {
    // If no communicator is given, the world communicator is used as default.
    if (comm == NULL) comm = snrt_comm_world;

    // If the current cluster is not a participant, return immediately.
    if (!comm->is_participant) return;

#ifdef SNRT_SUPPORTS_NARROW_REDUCTION
    // Fetch the address for the reduction
    volatile uint32_t *addr = comm->barrier_ptr;

    // Compose the collective mask
    uint64_t mask = snrt_get_collective_mask(comm);

    // Launch the reduction
    snrt_enable_reduction(mask, SNRT_REDUCTION_BARRIER);
    *addr = 0;
    snrt_disable_reduction();

    // Fence to wait until the reduction has finished
    snrt_fence();
#else
    snrt_inter_cluster_sw_barrier(comm);
#endif
}

inline void snrt_global_sw_barrier(snrt_comm_t comm) {
    // Synchronize cores in a cluster with the HW barrier
    snrt_cluster_hw_barrier();

    // Synchronize all clusters
    if (snrt_is_dm_core()) {
        snrt_inter_cluster_sw_barrier(comm);
    }

    // Synchronize cores in a cluster with the HW barrier
    snrt_cluster_hw_barrier();
}

/// Synchronize all Snitch cores.
inline void snrt_global_barrier(snrt_comm_t comm) {
    // Synchronize cores in a cluster with the HW barrier
    snrt_cluster_hw_barrier();

    // Synchronize all clusters
    if (snrt_is_dm_core()) {
        snrt_inter_cluster_barrier(comm);
    }

    // Synchronize cores in a cluster with the HW barrier
    snrt_cluster_hw_barrier();
}

/// Generic software barrier.
inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) {
    // Remember the previous iteration
    uint32_t prev_it = barr->iteration;

    // Increment the barrier counter
    uint32_t cnt = __atomic_add_fetch(&barr->cnt, 1, __ATOMIC_RELAXED);

    if (cnt == n) {
        // The last thread to arrive resets the counter and starts the next
        // iteration, releasing the waiting threads.
        barr->cnt = 0;
        __atomic_add_fetch(&barr->iteration, 1, __ATOMIC_RELAXED);
    } else {
        // Some threads have not reached the barrier yet --> wait
        while (prev_it == barr->iteration)
            ;
    }
}
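
// Usage sketch (illustrative): synchronize only the compute cores of a
// cluster, assuming `barr` points to a zero-initialized snrt_barrier_t
// visible to all of them.
//
//     if (snrt_is_compute_core())
//         snrt_partial_barrier(barr, snrt_cluster_compute_core_num());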

//================================================================================
// Reduction functions
//================================================================================

/// Perform a global sum reduction, blocking.
inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) {
    // Reduce cores within the cluster in TCDM
    uint32_t *cluster_result = &(snrt_cls()->reduction);
    uint32_t tmp = __atomic_fetch_add(cluster_result, value, __ATOMIC_RELAXED);

    // Wait for the writeback to ensure the AMO is seen by all cores after
    // the barrier
    snrt_wait_writeback(tmp);
    snrt_cluster_hw_barrier();

    // Reduce DM cores across clusters in global memory
    if (snrt_is_dm_core()) {
        __atomic_add_fetch(&_reduction_result, *cluster_result,
                           __ATOMIC_RELAXED);
        snrt_inter_cluster_barrier();
        *cluster_result = _reduction_result;
    }
    snrt_cluster_hw_barrier();
    return *cluster_result;
}
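
// Usage sketch (illustrative): every core contributes one value and every
// core observes the global sum, e.g. to count how many cores satisfy a
// predicate (`predicate` is a hypothetical per-core flag):
//
//     uint32_t total = snrt_global_all_to_all_reduction(predicate ? 1 : 0);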

/// Perform a sum reduction among clusters, blocking.
template <typename T>
inline void snrt_global_reduction_dma(T *dst_buffer, T *src_buffer, size_t len,
                                      snrt_comm_t comm = NULL) {
    // If no communicator is given, the world communicator is used as default.
    if (comm == NULL) comm = snrt_comm_world;

    // If we have a single cluster, no reduction has to be done
    if (comm->size > 1) {
        // The DMA core will send the compute cores' data, so it must wait
        // until the data is available
        snrt_fpu_fence();
        snrt_cluster_hw_barrier();

        // Iterate over the levels of the binary reduction tree
        int num_levels = ceil(log2(comm->size));
        for (unsigned int level = 0; level < num_levels; level++) {
            // Determine whether the current cluster is an active cluster.
            // An active cluster is a cluster that participates in the current
            // level of the reduction tree. Every second cluster among the
            // active ones is a sender.
            uint32_t is_active = (snrt_cluster_idx() % (1 << level)) == 0;
            uint32_t is_sender = (snrt_cluster_idx() % (1 << (level + 1))) != 0;

            // If the cluster is a sender, it sends the data in its source
            // buffer to the respective receiver's destination buffer
            if (is_active && is_sender) {
                if (!snrt_is_compute_core()) {
                    uint64_t dst = (uint64_t)dst_buffer -
                                   (1 << level) * SNRT_CLUSTER_OFFSET;
                    snrt_dma_start_1d(dst, (uint64_t)src_buffer,
                                      len * sizeof(T));
                    snrt_dma_wait_all();
                }
            }

            // Synchronize senders and receivers
            snrt_global_barrier(comm);

            // Every cluster which is not a sender performs the reduction
            if (is_active && !is_sender) {
                // The computation is parallelized over the compute cores
                if (snrt_is_compute_core()) {
                    uint32_t items_per_core =
                        len / snrt_cluster_compute_core_num();
                    uint32_t core_offset =
                        snrt_cluster_core_idx() * items_per_core;
                    for (uint32_t i = 0; i < items_per_core; i++) {
                        uint32_t abs_i = core_offset + i;
                        src_buffer[abs_i] += dst_buffer[abs_i];
                    }
                }
            }

            // Synchronize compute and DM cores for the next tree level
            snrt_fpu_fence();
            snrt_cluster_hw_barrier();
        }
    }
}
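
// Usage sketch (illustrative): sum `len` elements across all clusters.
// Each cluster passes its partial results in `src` (in L1); cluster 0's
// `src` holds the final result. `dst` is a scratch receive buffer that must
// sit at the same L1 offset in every cluster, since senders derive the
// receiver's address by subtracting multiples of SNRT_CLUSTER_OFFSET.
// `len` is assumed to be divisible by the number of compute cores.
//
//     snrt_global_reduction_dma<double>(dst, src, len);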

//================================================================================
// Memory consistency
//================================================================================

/// Ensure value is written back to the register file.
inline void snrt_wait_writeback(uint32_t val) {
    asm volatile("mv %0, %0" : "+r"(val)::);
}
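
// Usage sketch (illustrative): force the result register of an AMO to be
// written back before entering a barrier, so the preceding atomic is
// guaranteed to have completed (see snrt_global_all_to_all_reduction()
// above for this pattern):
//
//     uint32_t tmp = __atomic_fetch_add(ptr, 1, __ATOMIC_RELAXED);
//     snrt_wait_writeback(tmp);
//     snrt_cluster_hw_barrier();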

//================================================================================
// User functions
//================================================================================

/// Set the LSU AW user field.
inline void snrt_set_awuser(uint64_t field) {
    write_csr(user_low, (uint32_t)(field));
    write_csr(user_high, (uint32_t)(field >> 32));
}

inline void snrt_set_awuser_low(uint32_t field) {
    write_csr(user_low, (uint32_t)(field));
}

inline uint64_t snrt_get_collective_mask(snrt_comm_t comm) {
    return comm->mask * SNRT_CLUSTER_OFFSET;
}
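
// Worked example (illustrative, assuming SNRT_CLUSTER_OFFSET = 0x100000):
// for a communicator of size 4, mask = 0b11, so the collective mask is
// 0x300000, i.e. exactly the address bits in which the participating
// clusters' base addresses differ.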

//================================================================================
// Multicast functions
//================================================================================

/// Enable LSU multicast.
inline void snrt_enable_multicast(uint64_t mask) {
    // Compose the AW user field for the collective operation
    // (type defined in sync_decls.h)
    snrt_collective_op_t op;
    op.f.opcode = SNRT_COLLECTIVE_MULTICAST;
    op.f.mask = mask;
    snrt_set_awuser(op.w);
}

/// Disable LSU multicast.
inline void snrt_disable_multicast() { snrt_set_awuser(0); }

//================================================================================
// Reduction functions
//================================================================================

/// Enable LSU reduction.
inline void snrt_enable_reduction(uint64_t mask,
                                  snrt_collective_opcode_t opcode) {
    // Compose the AW user field for the collective operation
    // (type defined in sync_decls.h)
    snrt_collective_op_t op;
    op.f.opcode = opcode;
    op.f.mask = mask;
    snrt_set_awuser(op.w);
}

/// Disable LSU reduction.
inline void snrt_disable_reduction() { snrt_set_awuser(0); }