Snitch Runtime
Loading...
Searching...
No Matches
sync.h
Go to the documentation of this file.
1// Copyright 2023 ETH Zurich and University of Bologna.
2// Licensed under the Apache License, Version 2.0, see LICENSE for details.
3// SPDX-License-Identifier: Apache-2.0
4//
5// Luca Colagrande <colluca@iis.ee.ethz.ch>
6// Viviane Potocnik <vivianep@iis.ee.ethz.ch>
7
13#pragma once
14
15#include "../../deps/riscv-opcodes/encoding.h"
16
17#include <math.h>
18
19#define SNRT_BROADCAST_MASK ((SNRT_CLUSTER_NUM - 1) * SNRT_CLUSTER_OFFSET)
20
21//================================================================================
22// Mutex functions
23//================================================================================
24
// Return a pointer to the global mutex word (declared in sync_decls.h),
// for use with the snrt_mutex_*acquire/release functions below.
inline volatile uint32_t *snrt_mutex() { return &_snrt_mutex; }
29
38inline void snrt_mutex_acquire(volatile uint32_t *pmtx) {
39 asm volatile(
40 "li t0,1 # t0 = 1\n"
41 "1:\n"
42 " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
43 " bnez t0,1b # Retry if previously set)\n"
44 : "+r"(pmtx)
45 :
46 : "t0");
47}
48
54inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) {
55 asm volatile(
56 "1:\n"
57 " lw t0, 0(%0)\n"
58 " bnez t0, 1b\n"
59 " li t0,1 # t0 = 1\n"
60 "2:\n"
61 " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
62 " bnez t0,2b # Retry if previously set)\n"
63 : "+r"(pmtx)
64 :
65 : "t0");
66}
67
71inline void snrt_mutex_release(volatile uint32_t *pmtx) {
72 asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n"
73 : "+r"(pmtx));
74}
75
76//================================================================================
77// Barrier functions
78//================================================================================
79
// Wake remote cores by writing `core_mask` into the cluster-local CLINT
// "set" register of other clusters, raising an interrupt on every core
// whose bit is set in the mask.
inline void snrt_wake_all(uint32_t core_mask) {
#ifdef SNRT_SUPPORTS_MULTICAST
    // Multicast cluster interrupt to every other cluster's core
    // Note: we need to address another cluster's address space
    // because the cluster XBAR has not been extended to support
    // multicast yet. We address the second cluster, if we are the
    // first cluster, and the first cluster otherwise.
    // NOTE(review): this guard admits snrt_cluster_num() == 1, in which
    // case snrt_cluster(1) is addressed below even though no second
    // cluster exists -- confirm whether `> 1` was intended.
    if (snrt_cluster_num() > 0) {
        volatile snitch_cluster_t *cluster;
        if (snrt_cluster_idx() == 0)
            cluster = snrt_cluster(1);
        else
            cluster = snrt_cluster(0);
        // Taking the address of a member of the packed peripheral-register
        // struct; silence the corresponding clang warning.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Waddress-of-packed-member"
        uint32_t *addr = (uint32_t *)&(cluster->peripheral_reg.cl_clint_set.w);
#pragma clang diagnostic pop
        // Replicate the single store below to all clusters selected by
        // the broadcast mask.
        snrt_enable_multicast(SNRT_BROADCAST_MASK);
        *addr = core_mask;
        // NOTE(review): multicast appears to remain enabled after the
        // store; a snrt_disable_multicast() call may be elided from this
        // view of the file -- TODO confirm against the full source.
    }
#else
    // Fallback without multicast support: store the mask to every other
    // cluster's CLINT set register, one cluster at a time.
    for (int i = 0; i < snrt_cluster_num(); i++) {
        if (snrt_cluster_idx() != i) {
            snrt_cluster(i)->peripheral_reg.cl_clint_set.f.cl_clint_set =
                core_mask;
        }
    }
#endif
}
110
117 asm volatile("csrr x0, 0x7C2" ::: "memory");
118}
119
// Synchronize clusters, blocking, via a shared counter in global memory
// plus a wake-up interrupt: the last arrival resets the counter and wakes
// the others, which wait in WFI.
// NOTE(review): assumes exactly one core per cluster calls this (the DM
// core, per snrt_global_barrier) -- confirm against callers.
static inline void snrt_inter_cluster_barrier() {
    // Everyone increments a shared counter
    uint32_t cnt =
        __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);

    // All but the last cluster enter WFI, while the last cluster resets the
    // counter for the next barrier and multicasts an interrupt to wake up the
    // other clusters.
    if (cnt == snrt_cluster_num()) {
        // Counter must be reset before waking the others so the barrier
        // is reusable for the next round.
        _snrt_barrier.cnt = 0;
        // Wake all clusters
        snrt_wake_all(1 << snrt_cluster_core_idx());
    } else {
        snrt_wfi();
        // Clear interrupt for next barrier
        snrt_int_clr_mcip();
    }
}
144
// Synchronize all Snitch cores across all clusters, blocking: the DM core
// of each cluster takes part in the inter-cluster software barrier on
// behalf of its cluster.
// NOTE(review): the cluster-level hardware-barrier calls (surrounding the
// software barrier) appear to be elided from this view of the file --
// confirm against the full source.
inline void snrt_global_barrier() {

    // Synchronize all DM cores in software
    if (snrt_is_dm_core()) {
        snrt_inter_cluster_barrier();
    }
    // Synchronize cores in a cluster with the HW barrier
}
164
172inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) {
173 // Remember previous iteration
174 uint32_t prev_it = barr->iteration;
175 uint32_t cnt = __atomic_add_fetch(&barr->cnt, 1, __ATOMIC_RELAXED);
176
177 // Increment the barrier counter
178 if (cnt == n) {
179 barr->cnt = 0;
180 __atomic_add_fetch(&barr->iteration, 1, __ATOMIC_RELAXED);
181 } else {
182 // Some threads have not reached the barrier --> Let's wait
183 while (prev_it == barr->iteration)
184 ;
185 }
186}
187
188//================================================================================
189// Reduction functions
190//================================================================================
191
// Perform a global sum reduction of `value` over all cores, blocking.
// Every core first accumulates into a cluster-local result in TCDM via an
// AMO; the DM cores then accumulate the cluster results into a global
// variable and copy the final sum back into each cluster's TCDM.
// NOTE(review): the writeback-wait and barrier calls between these phases
// (doc lines 208-209 and 218) appear to be elided from this view of the
// file -- confirm against the full source.
inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) {
    // Reduce cores within cluster in TCDM
    uint32_t *cluster_result = &(cls()->reduction);
    uint32_t tmp = __atomic_fetch_add(cluster_result, value, __ATOMIC_RELAXED);

    // Wait for writeback to ensure AMO is seen by all cores after barrier

    // Reduce DM cores across clusters in global memory
    if (snrt_is_dm_core()) {
        __atomic_add_fetch(&_reduction_result, *cluster_result,
                           __ATOMIC_RELAXED);
        snrt_inter_cluster_barrier();
        // Copy the global result back to cluster-local memory
        *cluster_result = _reduction_result;
    }
    return *cluster_result;
}
221
// Perform a sum reduction among clusters, blocking, using a binary
// reduction tree: at each level, every second active cluster DMAs its
// source buffer to its partner, which adds the received data into its
// own source buffer. The accumulated result ends up in the source buffer
// of the clusters remaining active at the last level (presumably cluster
// 0 -- TODO confirm).
// NOTE(review): the inter-cluster synchronization calls at the two
// "Synchronize ..." comments below (doc lines 265 and 283) appear to be
// elided from this view of the file -- confirm against the full source.
inline void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer,
                                      size_t len) {
    // If we have a single cluster, no reduction has to be done
    if (snrt_cluster_num() > 1) {
        // Iterate levels in the binary reduction tree
        int num_levels = ceil(log2(snrt_cluster_num()));
        for (unsigned int level = 0; level < num_levels; level++) {
            // Determine whether the current cluster is an active cluster.
            // An active cluster is a cluster that participates in the current
            // level of the reduction tree. Every second cluster among the
            // active ones is a sender.
            uint32_t is_active = (snrt_cluster_idx() % (1 << level)) == 0;
            uint32_t is_sender = (snrt_cluster_idx() % (1 << (level + 1))) != 0;

            // If the cluster is a sender, it sends the data in its source
            // buffer to the respective receiver's destination buffer
            if (is_active && is_sender) {
                // Only the DM core issues the transfer
                if (!snrt_is_compute_core()) {
                    // Receiver is (1 << level) clusters below in the
                    // cluster address space
                    uint64_t dst = (uint64_t)dst_buffer -
                                   (1 << level) * SNRT_CLUSTER_OFFSET;
                    snrt_dma_start_1d(dst, (uint64_t)src_buffer,
                                      len * sizeof(double));
                    snrt_dma_wait_all();
                }
            }

            // Synchronize senders and receivers

            // Every cluster which is not a sender performs the reduction
            if (is_active && !is_sender) {
                // Computation is parallelized over the compute cores
                // NOTE(review): assumes len is divisible by the number of
                // compute cores; any remainder elements are not reduced --
                // TODO confirm callers guarantee this.
                if (snrt_is_compute_core()) {
                    uint32_t items_per_core =
                        len / snrt_cluster_compute_core_num();
                    uint32_t core_offset =
                        snrt_cluster_core_idx() * items_per_core;
                    for (uint32_t i = 0; i < items_per_core; i++) {
                        uint32_t abs_i = core_offset + i;
                        src_buffer[abs_i] += dst_buffer[abs_i];
                    }
                }
            }

            // Synchronize compute and DM cores for next tree level
        }
    }
}
287
288//================================================================================
289// Memory consistency
290//================================================================================
291
// Ensure `val` has been written back to the register file before
// continuing: the self-move creates a read-after-write dependency on
// `val`'s register, which presumably stalls the core until any
// outstanding write (e.g. an AMO response) has retired.
inline void snrt_wait_writeback(uint32_t val) {
    asm volatile("mv %0, %0" : "+r"(val)::);
}
301
302//================================================================================
303// Multicast functions
304//================================================================================
305
// Enable LSU multicast by writing the address mask into custom CSR 0x7c4;
// subsequent stores are replicated to all addresses selected by `mask`.
inline void snrt_enable_multicast(uint32_t mask) { write_csr(0x7c4, mask); }
314
// Disable LSU multicast by clearing the multicast mask CSR (0x7c4).
inline void snrt_disable_multicast() { write_csr(0x7c4, 0); }
Definition sync_decls.h:9
void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n)
Generic software barrier.
Definition sync.h:172
void snrt_enable_multicast(uint32_t mask)
Enable LSU multicast.
Definition sync.h:313
void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx)
Acquire a mutex, blocking.
Definition sync.h:54
void snrt_mutex_acquire(volatile uint32_t *pmtx)
Acquire a mutex, blocking.
Definition sync.h:38
volatile uint32_t * snrt_mutex()
Get a pointer to a mutex variable.
Definition sync.h:28
void snrt_wait_writeback(uint32_t val)
Ensure value is written back to the register file.
Definition sync.h:298
uint32_t snrt_global_all_to_all_reduction(uint32_t value)
Perform a global sum reduction, blocking.
Definition sync.h:202
void snrt_global_barrier()
Synchronize all Snitch cores.
Definition sync.h:154
void snrt_cluster_hw_barrier()
Synchronize cores in a cluster with a hardware barrier, blocking.
Definition sync.h:116
void snrt_mutex_release(volatile uint32_t *pmtx)
Release a previously-acquired mutex.
Definition sync.h:71
void snrt_disable_multicast()
Disable LSU multicast.
Definition sync.h:318
void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer, size_t len)
Perform a sum reduction among clusters, blocking.
Definition sync.h:238