Snitch Runtime
Loading...
Searching...
No Matches
sync.h
Go to the documentation of this file.
1// Copyright 2023 ETH Zurich and University of Bologna.
2// Licensed under the Apache License, Version 2.0, see LICENSE for details.
3// SPDX-License-Identifier: Apache-2.0
4//
5// Luca Colagrande <colluca@iis.ee.ethz.ch>
6// Viviane Potocnik <vivianep@iis.ee.ethz.ch>
7
13#pragma once
14
15#include <math.h>
16
17//================================================================================
18// Mutex functions
19//================================================================================
20
24inline volatile uint32_t *snrt_mutex() { return &_snrt_mutex; }
25
/**
 * @brief Acquire a mutex, blocking.
 *
 * Spins on an atomic swap: each iteration writes 1 to the lock word and
 * reads back its previous value; the loop exits once the previous value
 * was 0, i.e. this core observed the lock free and claimed it in the
 * same atomic operation. The `.aq` (acquire) ordering prevents later
 * memory accesses from being reordered before the lock acquisition.
 *
 * @param pmtx Pointer to the lock word (0 = free, non-zero = held).
 */
inline void snrt_mutex_acquire(volatile uint32_t *pmtx) {
    asm volatile(
        "li t0,1 # t0 = 1\n"
        "1:\n"
        " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
        " bnez t0,1b # Retry if previously set)\n"
        : "+r"(pmtx)
        :
        : "t0");
}
44
/**
 * @brief Acquire a mutex, blocking (test-and-test-and-set variant).
 *
 * First spins on a plain load until the lock word reads 0, and only then
 * attempts the atomic swap; if the swap still returns non-zero (another
 * core won the race), it retries the swap. Compared to
 * snrt_mutex_acquire(), the read-only inner spin avoids hammering the
 * lock location with atomic writes while the lock is held.
 *
 * @param pmtx Pointer to the lock word (0 = free, non-zero = held).
 */
inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) {
    asm volatile(
        "1:\n"
        " lw t0, 0(%0)\n"
        " bnez t0, 1b\n"
        " li t0,1 # t0 = 1\n"
        "2:\n"
        " amoswap.w.aq t0,t0,(%0) # t0 = oldlock & lock = 1\n"
        " bnez t0,2b # Retry if previously set)\n"
        : "+r"(pmtx)
        :
        : "t0");
}
63
/**
 * @brief Release a previously-acquired mutex.
 *
 * Atomically stores 0 to the lock word. The `.rl` (release) ordering
 * prevents earlier memory accesses from being reordered after the
 * unlock, so writes made inside the critical section are visible to the
 * next acquirer.
 *
 * @param pmtx Pointer to the lock word to release.
 */
inline void snrt_mutex_release(volatile uint32_t *pmtx) {
    asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n"
                 : "+r"(pmtx));
}
71
72//================================================================================
73// Barrier functions
74//================================================================================
75
82 asm volatile("csrr x0, 0x7C2" ::: "memory");
83}
84
92 // Remember previous iteration
93 uint32_t prev_barrier_iteration = _snrt_barrier.iteration;
94 uint32_t cnt =
95 __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);
96
97 // Increment the barrier counter
98 if (cnt == snrt_cluster_num()) {
99 _snrt_barrier.cnt = 0;
100 __atomic_add_fetch(&(_snrt_barrier.iteration), 1, __ATOMIC_RELAXED);
101 } else {
102 while (prev_barrier_iteration == _snrt_barrier.iteration)
103 ;
104 }
105}
106
/**
 * @brief Synchronize all Snitch cores.
 *
 * Three-step barrier: (1) all cores in each cluster meet at the HW
 * barrier, (2) one representative per cluster (the DM core) runs the
 * inter-cluster software barrier, (3) a final HW barrier releases the
 * remaining cores of each cluster once its DM core returns.
 *
 * NOTE(review): the three call statements were dropped by the extraction
 * (gaps at original lines 117/121/124); restored here to match the
 * surviving comments and this file's symbol index — confirm against the
 * upstream source.
 */
inline void snrt_global_barrier() {
    snrt_cluster_hw_barrier();

    // Synchronize all DM cores in software
    if (snrt_is_dm_core()) {
        snrt_inter_cluster_barrier();
    }
    // Synchronize cores in a cluster with the HW barrier
    snrt_cluster_hw_barrier();
}
126
134inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) {
135 // Remember previous iteration
136 uint32_t prev_it = barr->iteration;
137 uint32_t cnt = __atomic_add_fetch(&barr->cnt, 1, __ATOMIC_RELAXED);
138
139 // Increment the barrier counter
140 if (cnt == n) {
141 barr->cnt = 0;
142 __atomic_add_fetch(&barr->iteration, 1, __ATOMIC_RELAXED);
143 } else {
144 // Some threads have not reached the barrier --> Let's wait
145 while (prev_it == barr->iteration)
146 ;
147 }
148}
149
150//================================================================================
151// Reduction functions
152//================================================================================
153
164inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) {
165 // Reduce cores within cluster in TCDM
166 uint32_t *cluster_result = &(cls()->reduction);
167 uint32_t tmp = __atomic_fetch_add(cluster_result, value, __ATOMIC_RELAXED);
168
169 // Wait for writeback to ensure AMO is seen by all cores after barrier
172
173 // Reduce DM cores across clusters in global memory
174 if (snrt_is_dm_core()) {
175 __atomic_add_fetch(&_reduction_result, *cluster_result,
176 __ATOMIC_RELAXED);
178 *cluster_result = _reduction_result;
179 }
181 return *cluster_result;
182}
183
199inline void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer,
200 size_t len) {
201 // If we have a single cluster the reduction degenerates to a memcpy
202 if (snrt_cluster_num() == 1) {
203 if (!snrt_is_compute_core()) {
204 snrt_dma_start_1d(dst_buffer, src_buffer, len * sizeof(double));
205 snrt_dma_wait_all();
206 }
208 } else {
209 // Iterate levels in the binary reduction tree
210 int num_levels = ceil(log2(snrt_cluster_num()));
211 for (unsigned int level = 0; level < num_levels; level++) {
212 // Determine whether the current cluster is an active cluster.
213 // An active cluster is a cluster that participates in the current
214 // level of the reduction tree. Every second cluster among the
215 // active ones is a sender.
216 uint32_t is_active = (snrt_cluster_idx() % (1 << level)) == 0;
217 uint32_t is_sender = (snrt_cluster_idx() % (1 << (level + 1))) != 0;
218
219 // If the cluster is a sender, it sends the data in its source
220 // buffer to the respective receiver's destination buffer
221 if (is_active && is_sender) {
222 if (!snrt_is_compute_core()) {
223 void *dst =
224 (void *)dst_buffer - (1 << level) * SNRT_CLUSTER_OFFSET;
225 snrt_dma_start_1d(dst, src_buffer, len * sizeof(double));
226 snrt_dma_wait_all();
227 }
228 }
229
230 // Synchronize senders and receivers
232
233 // Every cluster which is not a sender performs the reduction
234 if (is_active && !is_sender) {
235 // Computation is parallelized over the compute cores
236 if (snrt_is_compute_core()) {
237 uint32_t items_per_core =
238 len / snrt_cluster_compute_core_num();
239 uint32_t core_offset =
240 snrt_cluster_core_idx() * items_per_core;
241 for (uint32_t i = 0; i < items_per_core; i++) {
242 uint32_t abs_i = core_offset + i;
243 dst_buffer[abs_i] += src_buffer[abs_i];
244 }
245 }
246 }
247
248 // Synchronize compute and DM cores for next tree level
250 }
251 }
252}
253
254//================================================================================
255// Memory consistency
256//================================================================================
257
/**
 * @brief Ensure value is written back to the register file.
 *
 * The dummy self-move creates a read-after-write dependency on @p val,
 * stalling the pipeline until any in-flight write of @p val to the
 * register file (e.g. from a pending AMO response) has completed.
 *
 * @param val The value whose register writeback to wait for.
 */
inline void snrt_wait_writeback(uint32_t val) {
    asm volatile("mv %0, %0" : "+r"(val)::);
}
Definition sync_decls.h:9
void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n)
Generic software barrier.
Definition sync.h:134
void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx)
Acquire a mutex, blocking.
Definition sync.h:50
void snrt_mutex_acquire(volatile uint32_t *pmtx)
Acquire a mutex, blocking.
Definition sync.h:34
volatile uint32_t * snrt_mutex()
Get a pointer to a mutex variable.
Definition sync.h:24
void snrt_wait_writeback(uint32_t val)
Ensure value is written back to the register file.
Definition sync.h:264
void snrt_inter_cluster_barrier()
Synchronize one core from every cluster with the others.
Definition sync.h:91
uint32_t snrt_global_all_to_all_reduction(uint32_t value)
Perform a global sum reduction, blocking.
Definition sync.h:164
void snrt_global_barrier()
Synchronize all Snitch cores.
Definition sync.h:116
void snrt_cluster_hw_barrier()
Synchronize cores in a cluster with a hardware barrier, blocking.
Definition sync.h:81
void snrt_mutex_release(volatile uint32_t *pmtx)
Release a previously-acquired mutex.
Definition sync.h:67
void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer, size_t len)
Perform a sum reduction among clusters, blocking.
Definition sync.h:199