/github/workspace/src/FilteringFunctions/kernels/plp_conv_valid_rep_i16s_xpulpv2.c
Functions
Name | |
---|---|
void | plp_conv_valid_rep_i16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const uint32_t srcAMem, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 16-bit integer vectors kernel for XPULPV2 extension. |
Defines
Name | |
---|---|
shufflemask1 | |
ymask | |
Functions Documentation
function plp_conv_valid_rep_i16s_xpulpv2
void plp_conv_valid_rep_i16s_xpulpv2(
const int16_t * pSrcA,
const uint32_t srcALen,
const uint32_t srcAMem,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution of 16-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector of the replicated data
- srcALen Number of elements in (unreplicated) vector a
- srcAMem Number of elements between each replication
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
Convolution (valid with data replication) of 16-bit integer vectors kernel for XPULPV2 extension.
Macros Documentation
define shufflemask1
#define shufflemask1 (v2s) { 1, 0 }
define ymask
#define ymask (v2s) { 0xFFFF, 0x0000 }
Source code
/* =====================================================================
* Project: PULP DSP Library
* Title: plp_conv_valid_rep_i16s_xpulpv2.c
* Description: 16-bit integer singlecore convolution (valid with data
* replication) for XPULPV2
*
* $Date: 3. May 2020
* $Revision: V0
*
* Target Processor: PULP cores
* ===================================================================== */
/*
* Copyright (C) 2020 ETH Zurich and University of Bologna.
*
* Author: Moritz Scherer, Tibor Schneider, ETH Zurich
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plp_math.h"
// Lane selector passed to __builtin_shuffle: picks lane 1 first and lane 0
// second, i.e. swaps the two lanes of a v2s vector.
#define shufflemask1 \
(v2s) { 1, 0 }
// Lane mask applied with __AND2 to keep lane 0 and clear lane 1 of a v2s
// vector. NOTE(review): __AND2 is presumably a lane-wise bitwise AND provided
// by plp_math.h - confirm.
#define ymask \
(v2s) { 0xFFFF, 0x0000 }
/*
 * Scalar reference implementation of the "valid" convolution.
 *
 * Writes res_len outputs, pRes[i] = sum_j pSrcA[i + j] * pSrcB[srcBLen - 1 - j].
 * Only the first (unshifted) replication of the input is read. The sum is
 * accumulated in unsigned arithmetic so that an overflow wraps instead of
 * being undefined behaviour.
 */
static void plp_conv_valid_rep_i16s_scalar(const int16_t *pSrcA,
                                           uint32_t res_len,
                                           const int16_t *pSrcB,
                                           uint32_t srcBLen,
                                           int32_t *pRes) {
    uint32_t i, j;

    for (i = 0U; i < res_len; i++) {
        uint32_t acc = 0U;
        for (j = 0U; j < srcBLen; j++) {
            acc += (uint32_t)((int32_t)pSrcA[i + j] * (int32_t)pSrcB[srcBLen - 1U - j]);
        }
        pRes[i] = (int32_t)acc;
    }
}

/*
 * Convolution (valid, with data replication) of 16-bit integer vectors,
 * kernel for the XPULPV2 extension.
 *
 * pSrcA   : replicated input a. This kernel reads replication 0 at pSrcA and
 *           replication 1 at pSrcA + srcAMem; replication 1 is expected to
 *           hold the same data shifted by one element, so that the pair
 *           {a[i+1], a[i+2]} starts at the same offset as {a[i], a[i+1]}.
 * srcALen : number of elements in the (unreplicated) vector a
 * srcAMem : number of elements between two replications
 * pSrcB   : second input vector
 * srcBLen : number of elements in pSrcB
 * pRes    : output, srcALen - srcBLen + 1 elements are written
 *
 * Pre-condition: srcALen >= srcBLen, to be established by the calling
 *                function. If it does not hold, or if srcBLen == 0, nothing
 *                is written.
 * Pre-condition: pRes has room for srcALen - srcBLen + 1 elements.
 * NOTE(review): the SIMD paths load whole element pairs from pSrcA, so a
 *               replication may be read up to one element past the last
 *               element that contributes to a result (that lane is multiplied
 *               by zero) - confirm that the caller pads the replicated buffer.
 *
 * srcBLen == 1 and builds without PLP_MATH_LOOPUNROLL are served by a plain
 * scalar loop.
 */
void plp_conv_valid_rep_i16s_xpulpv2(const int16_t *pSrcA,
                                     const uint32_t srcALen,
                                     const uint32_t srcAMem,
                                     const int16_t *pSrcB,
                                     const uint32_t srcBLen,
                                     int32_t *pRes) {

    uint32_t res_len; // length of output vector

    // Nothing to compute; also avoids a wrapped-around output length below.
    if (srcBLen == 0U || srcALen < srcBLen) {
        return;
    }

    res_len = srcALen - srcBLen + 1U;

#ifdef PLP_MATH_LOOPUNROLL

    const int16_t *pSrcA_iter_0 = pSrcA + 0 * srcAMem; // working pointer into a (replication 0)
    const int16_t *pSrcA_iter_1 = pSrcA + 1 * srcAMem; // working pointer into a (replication 1)
    const int16_t *pSrcB_end = pSrcB + (srcBLen - 1U); // last element of b
    const int16_t *pSrcB_iter = pSrcB_end;             // working pointer into b, walks backwards

    uint32_t k, count, blk_cnt; // Loop counters

    int32_t acc0, acc1, acc2, acc3;   // Accumulators
    v2s _x0, _x1, _x2, _x3, _x4, _x5; // local registers
    v2s _y1, _y2;                     // local registers
    int16_t _a0, _a1, _a2, _b0;       // local registers for non-simd computation

    // count is the index by which the pSrcA pointers are advanced per output block
    count = 0U;

    if (srcBLen >= 4U) {

        // compute 4 outputs at the same time
        blk_cnt = res_len >> 2U;

        while (blk_cnt > 0U) {

            // Set all accumulators to zero
            acc0 = 0;
            acc1 = 0;
            acc2 = 0;
            acc3 = 0;

            // Apply loop unrolling and consume 4 elements of b per iteration.
            k = srcBLen >> 2U;

            /* First part of the processing with loop unrolling: 4 elements of b
             * are consumed per iteration. The code after the loop handles the
             * remaining 1 to 3 elements of b.
             */
            do {
                // load the data
                _y1 = *((v2s *)(pSrcB_iter - 1)); // { y[srcBLen - 2] , y[srcBLen - 1] }
                _y2 = *((v2s *)(pSrcB_iter - 3)); // { y[srcBLen - 4] , y[srcBLen - 3] }

                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}
                _x3 = *((v2s *)(pSrcA_iter_1 + 2)); // {x[3],x[4]}
                _x4 = *((v2s *)(pSrcA_iter_0 + 4)); // {x[4],x[5]}
                _x5 = *((v2s *)(pSrcA_iter_1 + 4)); // {x[5],x[6]}

                _y1 = __builtin_shuffle(_y1, _y1,
                                        shufflemask1); // { y[srcBLen - 1] , y[srcBLen - 2] }
                _y2 = __builtin_shuffle(_y2, _y2,
                                        shufflemask1); // { y[srcBLen - 3] , y[srcBLen - 4] }

                // update pointers
                pSrcB_iter -= 4;
                pSrcA_iter_0 += 4;
                pSrcA_iter_1 += 4;

                // Perform the multiply-accumulate
                acc0 = __SUMDOTP2(_x0, _y1, acc0);
                acc1 = __SUMDOTP2(_x1, _y1, acc1);
                acc2 = __SUMDOTP2(_x2, _y1, acc2);
                acc3 = __SUMDOTP2(_x3, _y1, acc3);

                acc0 = __SUMDOTP2(_x2, _y2, acc0);
                acc1 = __SUMDOTP2(_x3, _y2, acc1);
                acc2 = __SUMDOTP2(_x4, _y2, acc2);
                acc3 = __SUMDOTP2(_x5, _y2, acc3);

            } while (--k);

            /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
             * No loop unrolling is used.
             */
            k = srcBLen % 0x4U;

            if (k == 1) {
                // Only y[0] is left. Pack it with a zero lane instead of loading
                // a pair that would start one element before pSrcB.
                _y1 = (v2s){ *pSrcB_iter, 0 }; // { y[0] , 0 }

                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}
                _x3 = *((v2s *)(pSrcA_iter_1 + 2)); // {x[3],x[4]}

                // Perform the multiply-accumulate
                acc0 = __SUMDOTP2(_x0, _y1, acc0);
                acc1 = __SUMDOTP2(_x1, _y1, acc1);
                acc2 = __SUMDOTP2(_x2, _y1, acc2);
                acc3 = __SUMDOTP2(_x3, _y1, acc3);

            } else if (k == 2) {
                // load the data
                _y1 = *((v2s *)(pSrcB_iter - 1)); // { y[0] , y[1] }

                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}
                _x3 = *((v2s *)(pSrcA_iter_1 + 2)); // {x[3],x[4]}

                _y1 = __builtin_shuffle(_y1, _y1, shufflemask1); // { y[1] , y[0] }

                // Perform the multiply-accumulate
                acc0 = __SUMDOTP2(_x0, _y1, acc0);
                acc1 = __SUMDOTP2(_x1, _y1, acc1);
                acc2 = __SUMDOTP2(_x2, _y1, acc2);
                acc3 = __SUMDOTP2(_x3, _y1, acc3);

            } else if (k == 3) {
                // load the data
                _y1 = *((v2s *)(pSrcB_iter - 1)); // { y[1] , y[2] }
                // y[0] is packed with a zero lane instead of loading a pair that
                // would start one element before pSrcB.
                _y2 = (v2s){ *(pSrcB_iter - 2), 0 }; // { y[0] , 0 }

                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}
                _x3 = *((v2s *)(pSrcA_iter_1 + 2)); // {x[3],x[4]}
                _x4 = *((v2s *)(pSrcA_iter_0 + 4)); // {x[4],x[5]}
                _x5 = *((v2s *)(pSrcA_iter_1 + 4)); // {x[5],x[6]}

                _y1 = __builtin_shuffle(_y1, _y1, shufflemask1); // { y[2] , y[1] }

                // Perform the multiply-accumulate
                acc0 = __SUMDOTP2(_x0, _y1, acc0);
                acc1 = __SUMDOTP2(_x1, _y1, acc1);
                acc2 = __SUMDOTP2(_x2, _y1, acc2);
                acc3 = __SUMDOTP2(_x3, _y1, acc3);

                acc0 = __SUMDOTP2(_x2, _y2, acc0);
                acc1 = __SUMDOTP2(_x3, _y2, acc1);
                acc2 = __SUMDOTP2(_x4, _y2, acc2);
                acc3 = __SUMDOTP2(_x5, _y2, acc3);
            }

            /* Store the result in the accumulator in the destination buffer. */
            *pRes++ = acc0;
            *pRes++ = acc1;
            *pRes++ = acc2;
            *pRes++ = acc3;

            /* Increment the pointer pSrcA index, count by 4 */
            count += 4U;

            /* Update the inputA and inputB pointers for next MAC calculation */
            pSrcA_iter_0 = pSrcA + count + 0 * srcAMem;
            pSrcA_iter_1 = pSrcA + count + 1 * srcAMem;
            pSrcB_iter = pSrcB_end;

            /* Decrement the loop counter */
            blk_cnt--;
        }

        /* If the res_len is not a multiple of 4, compute any remaining output samples here.
         * Fewer accumulators are used; the leftover elements of b are handled one by one.
         */
        blk_cnt = res_len % 0x4U;

        // only one element required, only use pSrcA_iter_0
        if (blk_cnt == 1) {

            // Set all accumulators to zero
            acc0 = 0;

            // setup the iterator
            k = srcBLen >> 2U;

            do {
                // load the data
                _y1 = *((v2s *)(pSrcB_iter - 1)); // {y[srcBLen - 2],y[srcBLen - 1]}
                _y2 = *((v2s *)(pSrcB_iter - 3)); // {y[srcBLen - 4],y[srcBLen - 3]}

                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}

                _y1 = __builtin_shuffle(_y1, _y1,
                                        shufflemask1); // { y[srcBLen - 1] , y[srcBLen - 2] }
                _y2 = __builtin_shuffle(_y2, _y2,
                                        shufflemask1); // { y[srcBLen - 3] , y[srcBLen - 4] }

                acc0 = __SUMDOTP2(_x0, _y1, acc0);
                acc0 = __SUMDOTP2(_x2, _y2, acc0);

                pSrcB_iter -= 4;
                pSrcA_iter_0 += 4;

            } while (--k);

            /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
             * No loop unrolling is used.
             */
            k = srcBLen % 0x4U;

            while (k > 0) {
                _b0 = *pSrcB_iter--;
                _a0 = *pSrcA_iter_0++;

                acc0 = __MAC(acc0, _a0, _b0);

                k--;
            }

            /* Store the result in the accumulator in the destination buffer. */
            *pRes++ = acc0;
        }

        // only two elements required, use pSrcA_iter_0 and pSrcA_iter_1
        else if (blk_cnt == 2) {

            // Set all accumulators to zero
            acc0 = 0;
            acc1 = 0;

            // setup the iterator
            k = srcBLen >> 2U;

            do {
                // load the data
                _y1 = *((v2s *)(pSrcB_iter - 1)); // {y[srcBLen - 2],y[srcBLen - 1]}
                _y2 = *((v2s *)(pSrcB_iter - 3)); // {y[srcBLen - 4],y[srcBLen - 3]}

                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}
                _x3 = *((v2s *)(pSrcA_iter_1 + 2)); // {x[3],x[4]}

                _y1 = __builtin_shuffle(_y1, _y1,
                                        shufflemask1); // { y[srcBLen - 1] , y[srcBLen - 2] }
                _y2 = __builtin_shuffle(_y2, _y2,
                                        shufflemask1); // { y[srcBLen - 3] , y[srcBLen - 4] }

                acc0 = __SUMDOTP2(_x0, _y1, acc0);
                acc1 = __SUMDOTP2(_x1, _y1, acc1);

                acc0 = __SUMDOTP2(_x2, _y2, acc0);
                acc1 = __SUMDOTP2(_x3, _y2, acc1);

                pSrcB_iter -= 4;
                pSrcA_iter_0 += 4;
                pSrcA_iter_1 += 4;

            } while (--k);

            /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
             * No loop unrolling is used.
             */
            k = srcBLen % 0x4U;

            while (k > 0) {
                _b0 = *pSrcB_iter--;
                _a0 = *pSrcA_iter_0++;
                _a1 = *pSrcA_iter_1++;

                acc0 = __MAC(acc0, _a0, _b0);
                acc1 = __MAC(acc1, _a1, _b0);

                k--;
            }

            /* Store the result in the accumulator in the destination buffer. */
            *pRes++ = acc0;
            *pRes++ = acc1;
        }

        // only three elements required; the third output reads replication 0
        // at an offset of two elements
        else if (blk_cnt == 3) {

            // Set all accumulators to zero
            acc0 = 0;
            acc1 = 0;
            acc2 = 0;

            // setup the iterator
            k = srcBLen >> 2U;

            do {
                // load the data
                _y1 = *((v2s *)(pSrcB_iter - 1)); // {y[srcBLen - 2],y[srcBLen - 1]}
                _y2 = *((v2s *)(pSrcB_iter - 3)); // {y[srcBLen - 4],y[srcBLen - 3]}

                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}
                _x3 = *((v2s *)(pSrcA_iter_1 + 2)); // {x[3],x[4]}
                _x4 = *((v2s *)(pSrcA_iter_0 + 4)); // {x[4],x[5]}
                _x5 = *((v2s *)(pSrcA_iter_1 + 4)); // {x[5],x[6]}

                _y1 = __builtin_shuffle(_y1, _y1,
                                        shufflemask1); // { y[srcBLen - 1] , y[srcBLen - 2] }
                _y2 = __builtin_shuffle(_y2, _y2,
                                        shufflemask1); // { y[srcBLen - 3] , y[srcBLen - 4] }

                acc0 = __SUMDOTP2(_x0, _y1, acc0);
                acc1 = __SUMDOTP2(_x1, _y1, acc1);
                acc2 = __SUMDOTP2(_x2, _y1, acc2);

                acc0 = __SUMDOTP2(_x2, _y2, acc0);
                acc1 = __SUMDOTP2(_x3, _y2, acc1);
                acc2 = __SUMDOTP2(_x4, _y2, acc2);

                pSrcB_iter -= 4;
                pSrcA_iter_0 += 4;
                pSrcA_iter_1 += 4;

            } while (--k);

            /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
             * No loop unrolling is used.
             */
            k = srcBLen % 0x4U;

            while (k > 0) {
                _b0 = *pSrcB_iter--;
                _a0 = *pSrcA_iter_0;
                _a1 = *pSrcA_iter_1++;
                _a2 = *((pSrcA_iter_0++) + 2);

                acc0 = __MAC(acc0, _a0, _b0);
                acc1 = __MAC(acc1, _a1, _b0);
                acc2 = __MAC(acc2, _a2, _b0);

                k--;
            }

            /* Store the result in the accumulator in the destination buffer. */
            *pRes++ = acc0;
            *pRes++ = acc1;
            *pRes++ = acc2;
        }

    } else { // case: srcBLen < 4

        blk_cnt = res_len >> 2;

        if (srcBLen == 1) {

            // A single element of b cannot fill a SIMD pair: scale a by y[0]
            // with the scalar loop.
            plp_conv_valid_rep_i16s_scalar(pSrcA, res_len, pSrcB, srcBLen, pRes);

        } else if (srcBLen == 2) {

            _y1 = *((v2s *)pSrcB);                           // { y[0] , y[1] }
            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1); // { y[1] , y[0] }

            while (blk_cnt > 0) {
                // load data
                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}
                _x3 = *((v2s *)(pSrcA_iter_1 + 2)); // {x[3],x[4]}

                // store dot product
                *pRes++ = __DOTP2(_x0, _y1);
                *pRes++ = __DOTP2(_x1, _y1);
                *pRes++ = __DOTP2(_x2, _y1);
                *pRes++ = __DOTP2(_x3, _y1);

                // update pointers
                pSrcA_iter_0 += 4;
                pSrcA_iter_1 += 4;

                // decrement iteration counter
                blk_cnt--;
            }

            // do the remaining elements without loop unrolling
            blk_cnt = res_len % 4;

            if (blk_cnt == 1) {
                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}

                *pRes++ = __DOTP2(_x0, _y1);

            } else if (blk_cnt == 2) {
                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}

                *pRes++ = __DOTP2(_x0, _y1);
                *pRes++ = __DOTP2(_x1, _y1);

            } else if (blk_cnt == 3) {
                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}

                *pRes++ = __DOTP2(_x0, _y1);
                *pRes++ = __DOTP2(_x1, _y1);
                *pRes++ = __DOTP2(_x2, _y1);
            }

        } else if (srcBLen == 3) {

            _y1 = *((v2s *)(pSrcB + 1)); // { y[1] , y[2] }
            _y2 = *((v2s *)(pSrcB + 0)); // { y[0] , y[1] }

            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1); // { y[2] , y[1] }
            _y2 = __AND2(_y2, ymask);                        // { y[0] , 0 }

            while (blk_cnt > 0) {
                // load data
                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}
                _x3 = *((v2s *)(pSrcA_iter_1 + 2)); // {x[3],x[4]}
                _x4 = *((v2s *)(pSrcA_iter_0 + 4)); // {x[4],x[5]}
                _x5 = *((v2s *)(pSrcA_iter_1 + 4)); // {x[5],x[6]}

                // compute dot product
                acc0 = __DOTP2(_x0, _y1);
                acc1 = __DOTP2(_x1, _y1);
                acc2 = __DOTP2(_x2, _y1);
                acc3 = __DOTP2(_x3, _y1);

                *pRes++ = __SUMDOTP2(_x2, _y2, acc0);
                *pRes++ = __SUMDOTP2(_x3, _y2, acc1);
                *pRes++ = __SUMDOTP2(_x4, _y2, acc2);
                *pRes++ = __SUMDOTP2(_x5, _y2, acc3);

                // update pointers
                pSrcA_iter_0 += 4;
                pSrcA_iter_1 += 4;

                // decrement iteration counter
                blk_cnt--;
            }

            // do the remaining elements without loop unrolling
            blk_cnt = res_len % 4;

            if (blk_cnt == 1) {
                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}

                acc0 = __DOTP2(_x0, _y1);

                *pRes++ = __SUMDOTP2(_x2, _y2, acc0);

            } else if (blk_cnt == 2) {
                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}
                _x3 = *((v2s *)(pSrcA_iter_1 + 2)); // {x[3],x[4]}

                acc0 = __DOTP2(_x0, _y1);
                acc1 = __DOTP2(_x1, _y1);

                *pRes++ = __SUMDOTP2(_x2, _y2, acc0);
                *pRes++ = __SUMDOTP2(_x3, _y2, acc1);

            } else if (blk_cnt == 3) {
                _x0 = *((v2s *)(pSrcA_iter_0 + 0)); // {x[0],x[1]}
                _x1 = *((v2s *)(pSrcA_iter_1 + 0)); // {x[1],x[2]}
                _x2 = *((v2s *)(pSrcA_iter_0 + 2)); // {x[2],x[3]}
                _x3 = *((v2s *)(pSrcA_iter_1 + 2)); // {x[3],x[4]}
                _x4 = *((v2s *)(pSrcA_iter_0 + 4)); // {x[4],x[5]}

                acc0 = __DOTP2(_x0, _y1);
                acc1 = __DOTP2(_x1, _y1);
                acc2 = __DOTP2(_x2, _y1);

                *pRes++ = __SUMDOTP2(_x2, _y2, acc0);
                *pRes++ = __SUMDOTP2(_x3, _y2, acc1);
                *pRes++ = __SUMDOTP2(_x4, _y2, acc2);
            }
        }
    }

#else // PLP_MATH_LOOPUNROLL

    // No SIMD path without loop unrolling: fall back to the scalar loop, which
    // only needs replication 0.
    (void)srcAMem;
    plp_conv_valid_rep_i16s_scalar(pSrcA, res_len, pSrcB, srcBLen, pRes);

#endif // PLP_MATH_LOOPUNROLL
}
Updated on 2023-03-01 at 16:16:32 +0000