/github/workspace/src/FilteringFunctions/plp_conv_i32_parallel.c
Functions
Name | |
---|---|
void | plp_conv_i32_parallel(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, const uint8_t nPE, int32_t * pRes) Glue code for parallel convolution of 32-bit integer vectors. |
Attributes
Name | |
---|---|
HAL_CL_L1 int32_t * | resultsBuffer |
Functions Documentation
function plp_conv_i32_parallel
void plp_conv_i32_parallel(
const int32_t * pSrcA,
const uint32_t srcALen,
const int32_t * pSrcB,
const uint32_t srcBLen,
const uint8_t nPE,
int32_t * pRes
)
Glue code for parallel convolution of 32-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- nPE Number of cores to compute on
- pRes output result returned here
Return: none
Attributes Documentation
variable resultsBuffer
HAL_CL_L1 int32_t * resultsBuffer;
Source code
/* =====================================================================
* Project: PULP DSP Library
* Title: plp_conv_i32_parallel.c
* Description: 32-bit parallel integer convolution glue code
*
* $Date: 01. July 2019
* $Revision: V0
*
* Target Processor: PULP cores
* ===================================================================== */
/*
* Copyright (C) 2019 ETH Zurich and University of Bologna.
*
* Author: Moritz Scherer
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plp_math.h"
#include "rtos_hal.h"
#include <stddef.h>
/* Pointer to the scratch buffer that holds the per-core partial results of
 * plp_conv_i32_parallel(). For multi-core runs it is pointed at memory obtained
 * from hal_cl_l1_malloc().
 * NOTE(review): HAL_CL_L1 presumably places this pointer variable in cluster L1
 * memory -- confirm against rtos_hal.h. */
HAL_CL_L1 int32_t *resultsBuffer;
/* Build-time switch: when defined, plp_conv_i32_parallel() overlap-adds the
 * partial results with plain loops on the calling core instead of calling
 * plp_conv_parallel_OLA(). Disabled by default. */
//#define PLP_CONV_SEQUENTIALADDING 1
/**
 * @brief Glue code for parallel convolution of 32-bit integer vectors.
 *
 * Must be called from the cluster side; on the fabric controller the function
 * only prints a message and returns. With nPE <= 1 the work is delegated to the
 * single-core plp_conv_i32(). Otherwise the shorter input is divided into nPE
 * chunks of ceil(len / nPE) samples, plp_conv_i32p_xpulpv2 is forked on nPE
 * cores with a temporary L1 scratch buffer as output, the partial results are
 * overlap-added, and the first srcALen + srcBLen - 1 values are copied to pRes.
 * NOTE(review): the kernel itself is not visible here; it presumably convolves
 * one chunk per core into its own slice of the scratch buffer -- confirm.
 *
 * @param[in]  pSrcA   points to the first input vector
 * @param[in]  srcALen length of the first input vector
 * @param[in]  pSrcB   points to the second input vector
 * @param[in]  srcBLen length of the second input vector
 * @param[in]  nPE     number of cores to compute on (0 is treated like 1)
 * @param[out] pRes    output buffer; the parallel path writes
 *                     srcALen + srcBLen - 1 values
 * @return     none
 */
void plp_conv_i32_parallel(const int32_t *pSrcA,
                           const uint32_t srcALen,
                           const int32_t *pSrcB,
                           const uint32_t srcBLen,
                           const uint8_t nPE,
                           int32_t *pRes) {

    if (hal_cluster_id() == ARCHI_FC_CID) {
        printf("parallel processing supported only for cluster side\n");
        return;
    }

    /* A single core needs no splitting. nPE == 0 is handled the same way,
     * because the chunk-size computation below would divide by zero. */
    if (nPE <= 1) {
        plp_conv_i32(pSrcA, srcALen, pSrcB, srcBLen, pRes);
        return;
    }

    /* Nothing to convolve; the unsigned "len - 1" arithmetic below would wrap
     * around for an empty input, so bail out before computing any sizes. */
    if (srcALen == 0 || srcBLen == 0) {
        return;
    }

    /* pIn1 is always the shorter vector (the one that gets split across the
     * cores), pIn2 the longer one. */
    const int32_t *pIn1;
    const int32_t *pIn2;
    uint32_t pIn1Len;
    uint32_t pIn2Len;

    if (srcALen >= srcBLen) {
        pIn2 = pSrcA;
        pIn1 = pSrcB;
        pIn2Len = srcALen;
        pIn1Len = srcBLen;
    } else {
        pIn2 = pSrcB;
        pIn1 = pSrcA;
        pIn2Len = srcBLen;
        pIn1Len = srcALen;
    }

    /* Number of output samples of the full convolution. */
    const uint32_t outLen = srcALen + srcBLen - 1;

    /* Samples of the shorter vector handled per core (rounded up). */
    uint32_t srcAoffset = ((pIn1Len + nPE - 1) / nPE);
    /* Stride between two consecutive per-core partial results. */
    uint32_t resultsoffset = srcAoffset + pIn2Len - 1;
    /* Used part of the scratch buffer: nPE - 1 full slices plus the last,
     * possibly shorter, slice. */
    uint32_t resultsLen =
        resultsoffset * (nPE - 1) + (pIn1Len - (srcAoffset * (nPE - 1))) + pIn2Len - 1;

    const uint32_t bufLen = resultsoffset * nPE;
    const size_t bufBytes = sizeof(int32_t) * resultsoffset * nPE;

    int32_t *resBuf = (int32_t *)hal_cl_l1_malloc(bufBytes);
    if (resBuf == NULL) {
        /* No L1 memory for the scratch buffer: still produce the result, just
         * without parallelism, instead of writing through a null pointer. */
        plp_conv_i32(pSrcA, srcALen, pSrcB, srcBLen, pRes);
        return;
    }
    resultsBuffer = resBuf;

    /* Clear the tail of the buffer that lies beyond resultsLen. */
    for (uint32_t i = resultsLen; i < bufLen; i++) {
        resBuf[i] = 0;
    }

    plp_conv_instance_i32 S = { .srcALen = pIn1Len,
                                .srcBLen = pIn2Len,
                                .pSrcA = pIn1,
                                .pSrcB = pIn2,
                                .pRes = resBuf,
                                .nPE = nPE };

    hal_cl_team_fork(nPE, plp_conv_i32p_xpulpv2, (void *)&S);

#if defined(PLP_CONV_SEQUENTIALADDING)
    /* Sequential overlap-adding */

    /* First slice is copied verbatim, the rest of the output starts at zero. */
    for (uint32_t i = 0; i < resultsoffset; i++) {
        pRes[i] = resBuf[i];
    }
    for (uint32_t i = resultsoffset; i < outLen; i++) {
        pRes[i] = 0;
    }

    /* Middle slices: each one is shifted by srcAoffset in the output. */
    for (int32_t i = 1; i < nPE - 1; i++) {
        for (uint32_t j = 0; j < resultsoffset; j++) {
            pRes[i * srcAoffset + j] += resBuf[j + i * resultsoffset];
        }
    }

    /* Last slice may be shorter than the others. */
    for (uint32_t j = 0; j < resultsLen - resultsoffset * (nPE - 1); j++) {
        pRes[(nPE - 1) * srcAoffset + j] += resBuf[(nPE - 1) * resultsoffset + j];
    }
#else
    /* Parallel overlap-adding */
    plp_conv_parallel_OLA(nPE, pIn1Len, pIn2Len, resBuf);

    /* Copy the result out of the scratch buffer. Local cursors are used so
     * neither the global resultsBuffer nor the buffer base needed for the
     * free below gets modified. */
#if defined(PLP_MATH_LOOPUNROLL)
    const int32_t *src = resBuf;
    int32_t *dst = pRes;
    uint32_t k = outLen >> 1U;
    int32_t temp1, temp2;

    while (k) {
        temp1 = *src++;
        temp2 = *src++;
        *dst++ = temp1;
        *dst++ = temp2;
        k--;
    }

    k = outLen % 0x2U;
    if (k) {
        *dst++ = *src++;
    }
#else
    for (uint32_t i = 0; i < outLen; i++) {
        pRes[i] = resBuf[i];
    }
#endif
#endif

    /* Release the scratch buffer in every build configuration and do not leave
     * the global pointing at freed memory. */
    hal_cl_l1_free(resBuf, bufBytes);
    resultsBuffer = NULL;

    return;
}
Updated on 2023-03-01 at 16:16:32 +0000