/github/workspace/src/FilteringFunctions/kernels/plp_conv_i32s_xpulpv2.c
Functions
Name | |
---|---|
void | plp_conv_i32s_xpulpv2(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 32-bit integer vectors kernel for XPULPV2 extension. |
Functions Documentation
function plp_conv_i32s_xpulpv2
void plp_conv_i32s_xpulpv2(
const int32_t * pSrcA,
const uint32_t srcALen,
const int32_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution of 32-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
Source code
/* =====================================================================
* Project: PULP DSP Library
* Title: plp_conv_i32s_xpulpv2.c
* Description: 32-bit integer singlecore convolution for XPULPV2
*
* $Date: 01. July 2019
* $Revision: V0
*
* Target Processor: PULP cores
* ===================================================================== */
/*
* Copyright (C) 2019 ETH Zurich and University of Bologna.
*
* Author: Moritz Scherer, ETH Zurich
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plp_math.h"
/**
 * @brief Accumulate one convolution output sample.
 *
 * Returns pX[0]*pY[0] + pX[1]*pY[-1] + ... + pX[n-1]*pY[-(n-1)], i.e. pX is
 * walked upwards while pY is walked downwards. Only those n elements of each
 * vector are dereferenced.
 *
 * @param[in] pX  first (lowest-index) sample of input A to use
 * @param[in] pY  first (highest-index) sample of input B to use
 * @param[in] n   number of multiply-accumulates to perform; 0 yields 0
 * @return    accumulated sum
 */
static inline int32_t plp_conv_i32s_xpulpv2_mac(const int32_t *pX,
                                                const int32_t *pY,
                                                uint32_t n) {
    int32_t sum = 0;
    uint32_t k;

#if defined(PLP_MATH_LOOPUNROLL)
    /* Loop unrolling: two MACs per iteration. Every operand is loaded inside
     * the iteration that consumes it; prefetching the operands of the *next*
     * iteration would dereference pX[n] and pY[-n] on the final pass, which
     * lie outside the range this call is allowed to touch. */
    for (k = n >> 1U; k > 0U; k--) {
        int32_t x0 = pX[0];
        int32_t y0 = pY[0];
        int32_t x1 = pX[1];
        int32_t y1 = pY[-1];

        sum = __MAC(sum, x0, y0);
        sum = __MAC(sum, x1, y1);

        pX += 2;
        pY -= 2;
    }

    /* Odd n: one product is left over */
    if ((n & 1U) != 0U) {
        sum = __MAC(sum, *pX, *pY);
    }
#else
    for (k = n; k > 0U; k--) {
        sum = __MAC(sum, *pX++, *pY--);
    }
#endif /* #if defined (PLP_MATH_LOOPUNROLL) */

    return sum;
}

/**
 * @brief Convolution of 32-bit integer vectors kernel for XPULPV2 extension.
 *
 * Writes srcALen + srcBLen - 1 output samples to pRes in three stages:
 *  - stage 1 (srcBLen - 1 outputs): the number of MACs grows from 1 to srcBLen - 1,
 *  - stage 2 (srcALen - srcBLen + 1 outputs): every output uses srcBLen MACs,
 *  - stage 3 (srcBLen - 1 outputs): the number of MACs shrinks from srcBLen - 1 to 1.
 *
 * Pre-condition: srcALen >= srcBLen, established by calling function plp_conv_i32
 * Pre-condition: pRes has enough allocated memory, i.e. srcALen + srcBLen - 1u elements
 * Pre-condition: srcALen >= 2 and srcBLen >= 2, otherwise use vector dot product
 *
 * @param[in]  pSrcA   points to the first input vector
 * @param[in]  srcALen length of the first input vector
 * @param[in]  pSrcB   points to the second input vector
 * @param[in]  srcBLen length of the second input vector
 * @param[out] pRes    output result returned here
 * @return     none
 */
void plp_conv_i32s_xpulpv2(const int32_t *pSrcA,
                           const uint32_t srcALen,
                           const int32_t *pSrcB,
                           const uint32_t srcBLen,
                           int32_t *pRes) {
    const uint32_t rampLen = srcBLen - 1U;          /* outputs of stage 1 and of stage 3 */
    const uint32_t fullLen = srcALen - rampLen;     /* outputs of stage 2 */
    const int32_t *pYLast = pSrcB + (srcBLen - 1U); /* &y[srcBLen - 1] */
    const int32_t *pXTail = pSrcA + fullLen;        /* first x sample used by stage 3 */
    int32_t *pOut = pRes;                           /* output pointer */
    uint32_t count;                                 /* loop counter */

    /* ------------------------
     * Stage 1
     * ----------------------*/
    /* sum = x[0] * y[0]
     * sum = x[0] * y[1] + x[1] * y[0]
     * ....
     * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
     *
     * count is the number of MACs of the current output sample. */
    for (count = 1U; count <= rampLen; count++) {
        *pOut++ = plp_conv_i32s_xpulpv2_mac(pSrcA, pSrcB + (count - 1U), count);
    }

    /* ------------------------
     * Stage 2
     * ----------------------*/
    /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
     * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
     * ....
     * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+
     *       x[srcALen-1] * y[0]
     *
     * count is the index of the first x sample of the current output sample. */
    count = 0U;

#if defined(PLP_MATH_LOOPUNROLL)
    /* Loop unrolling: compute 4 outputs at a time. The inner do/while always
     * consumes four coefficients on its first pass, hence srcBLen >= 4. */
    if (srcBLen >= 4U) {
        uint32_t blkCnt;

        for (blkCnt = fullLen >> 2U; blkCnt > 0U; blkCnt--) {
            const int32_t *px = pSrcA + count;
            const int32_t *py = pYLast;
            int32_t acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0; /* one accumulator per output */
            int32_t x0, x1, x2, x3, c0;                     /* sliding x window, coefficient */
            uint32_t k = srcBLen >> 2U;

            /* read x[0], x[1], x[2] samples */
            x0 = *px++;
            x1 = *px++;
            x2 = *px++;

            /* Four coefficients per pass; x0..x3 rotate so that every x sample
             * is loaded exactly once per block of four outputs. */
            do {
                /* y[srcBLen - 1] against x[0..3] */
                c0 = *py--;
                x3 = *(px);
                acc0 += x0 * c0;
                acc1 += x1 * c0;
                acc2 += x2 * c0;
                acc3 += x3 * c0;

                /* y[srcBLen - 2] against x[1..4] */
                c0 = *py--;
                x0 = *(px + 1U);
                acc0 += x1 * c0;
                acc1 += x2 * c0;
                acc2 += x3 * c0;
                acc3 += x0 * c0;

                /* y[srcBLen - 3] against x[2..5] */
                c0 = *py--;
                x1 = *(px + 2U);
                acc0 += x2 * c0;
                acc1 += x3 * c0;
                acc2 += x0 * c0;
                acc3 += x1 * c0;

                /* y[srcBLen - 4] against x[3..6] */
                c0 = *py--;
                x2 = *(px + 3U);
                px += 4U;
                acc0 += x3 * c0;
                acc1 += x0 * c0;
                acc2 += x1 * c0;
                acc3 += x2 * c0;
            } while (--k);

            /* If srcBLen is not a multiple of 4, process the remaining
             * 1 to 3 coefficients one at a time. */
            for (k = srcBLen % 0x4U; k > 0U; k--) {
                c0 = *py--;
                x3 = *px++;

                acc0 += x0 * c0;
                acc1 += x1 * c0;
                acc2 += x2 * c0;
                acc3 += x3 * c0;

                /* Reuse the present samples for the next MAC */
                x0 = x1;
                x1 = x2;
                x2 = x3;
            }

            /* Store the four results in the destination buffer. */
            *pOut++ = acc0;
            *pOut++ = acc1;
            *pOut++ = acc2;
            *pOut++ = acc3;

            count += 4U;
        }
    }
#endif /* #if defined (PLP_MATH_LOOPUNROLL) */

    /* Outputs not covered by the 4-at-a-time loop: up to three of them when it
     * ran, all of stage 2 otherwise. */
    for (; count < fullLen; count++) {
        *pOut++ = plp_conv_i32s_xpulpv2_mac(pSrcA + count, pYLast, srcBLen);
    }

    /* ------------------------
     * Stage 3
     * ----------------------*/
    /* sum = x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+
     *       x[srcALen-1] * y[1]
     * sum = x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+
     *       x[srcALen-1] * y[2]
     * ....
     * sum = x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
     * sum = x[srcALen-1] * y[srcBLen-1]
     *
     * count is the number of MACs of the current output sample; the x window
     * start advances by one sample per output. */
    for (count = rampLen; count > 0U; count--) {
        *pOut++ = plp_conv_i32s_xpulpv2_mac(pXTail++, pYLast, count);
    }
}
Updated on 2023-03-01 at 16:16:32 +0000