/github/workspace/src/FilteringFunctions/kernels/plp_conv_i16s_xpulpv2.c

Functions

	Name
void	plp_conv_i16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 16-bit integer vectors kernel for XPULPV2 extension.

Defines

	Name
	shufflemask1
	shufflemask2

Functions Documentation

function plp_conv_i16s_xpulpv2

void plp_conv_i16s_xpulpv2(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution of 16-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

Macros Documentation

define shufflemask1

#define shufflemask1     (v2s) { 1, 0 }

define shufflemask2

#define shufflemask2     (v2s) { 1, 2 }

Source code

/* =====================================================================
 * Project:      PULP DSP Library
 * Title:        plp_conv_i16s_xpulpv2.c
 * Description:  16-bit integer singlecore convolution for XPULPV2
 *
 * $Date:        01. July 2019
 * $Revision:    V0
 *
 * Target Processor: PULP cores
 * ===================================================================== */
/*
 * Copyright (C) 2019 ETH Zurich and University of Bologna.
 *
 * Author: Moritz Scherer, ETH Zurich
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plp_math.h"

#define shufflemask1                                                                               \
    (v2s) { 1, 0 }
#define shufflemask2                                                                               \
    (v2s) { 1, 2 }

// Pre-condition: psrcALen >= psrcBLen, established by calling function plp_conv_i16
// Pre-condition: pRes has enough allocated memory, i.e. srcALen + srcBLen-1u
// Pre-condition: srcALen >= 2 and srcBLen >= 2, otherwise use vector dot product

void plp_conv_i16s_xpulpv2(const int16_t *pSrcA,
                           const uint32_t srcALen,
                           const int16_t *pSrcB,
                           const uint32_t srcBLen,
                           int32_t *pRes) {

    const int16_t *pIn1 = pSrcA;                 /* InputA pointer */
    const int16_t *pIn2 = pSrcB;                 /* InputB pointer */
    int32_t *pOut = pRes;                        /* Output pointer */
    const int16_t *px;                           /* Intermediate inputA pointer */
    const int16_t *py;                           /* Intermediate inputB pointer */
    const int16_t *pSrc1, *pSrc2;                /* Intermediate pointers */
    int32_t sum;                                 /* Accumulators */
    uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
    uint32_t j, k, count, blkCnt;                /* Loop counters */

#if defined(PLP_MATH_LOOPUNROLL)
    int32_t acc0, acc1, acc2, acc3; /* Accumulators */
    int16_t x0, x1, x2, x3, c0;     /* Temporary variables to hold state and coefficient values */
#endif

    int16_t temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;

    v2s _x1, _x2, _x3, _x4, _x5, _x6, _x7;
    v2s _y1, _y2;

    int16_t y0, y1;

    blockSize1 = srcBLen - 1U;
    blockSize2 = srcALen - (srcBLen - 1U);
    blockSize3 = blockSize1;

    /* --------------------------
     * Initializations of stage1
     * -------------------------*/

    /* sum = x[0] * y[0]
     * sum = x[0] * y[1] + x[1] * y[0]
     * ....
     * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
     */

    /* In this stage the MAC operations are increased by 1 for every iteration.
       The count variable holds the number of MAC operations performed */
    count = 1U;

    /* Working pointer of inputA */
    px = pIn1;

    /* Working pointer of inputB */
    py = pIn2;

    /* ------------------------
     * Stage1 process
     * ----------------------*/

    /* The first stage starts here */
    while (blockSize1 > 0U) {
        /* Accumulator is made zero for every iteration */

        _y1 = *((v2s *)(py - 1));
        _x1 = *((v2s *)(px));
        sum = 0;

#if defined(PLP_MATH_LOOPUNROLL)
        /* Loop unrolling: Compute 4 outputs at a time */
        k = count >> 1U;
        while (k > 0U) {
            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);
            sum = __SUMDOTP2(_x1, _y1, sum);

            px += 2;
            py -= 2;

            _y1 = *((v2s *)(py - 1));
            _x1 = *((v2s *)(px));

            k--;
        }

        /* Loop unrolling: Compute remaining outputs */
        k = count % 0x2U;

        if (k) {
            sum = __MAC(sum, _x1[0], _y1[1]);
        }
#else
        /* Initialize k with number of samples */
        k = count;

        while (k) {
            sum = __MAC(sum, *px++, *py--);
            k--;
        }

#endif /* #if defined (PLP_MATH_LOOPUNROLL) */

        /* Store the result in the accumulator in the destination buffer. */
        *pOut++ = sum;

        /* Update the inputA and inputB pointers for next MAC calculation */
        py = pIn2 + count;
        px = pIn1;

        /* Increment MAC count */
        count++;

        /* Decrement loop counter */
        blockSize1--;
    }

    /* --------------------------
     * Initializations of stage2
     * ------------------------*/

    /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
     * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen]   * y[0]
     * ....
     * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] *
     * y[0]
     */

    /* Working pointer of inputA */
    px = pIn1;

    /* Working pointer of inputB */
    pSrc2 = pIn2 + (srcBLen - 1U);
    py = pSrc2;

    /* count is index by which the pointer pIn1 to be incremented */
    count = 0U;

    /* -------------------
     * Stage2 process
     * ------------------*/

    /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
     * So, to loop unroll over blockSize2,
     * srcBLen should be greater than or equal to 4 */
    if (srcBLen >= 4U) {

#if defined(PLP_MATH_LOOPUNROLL)

        /* Loop unrolling: Compute 4 outputs at a time */
        blkCnt = blockSize2 >> 2U;
        while (blkCnt > 0U) {
            /* Set all accumulators to zero */
            acc0 = 0;
            acc1 = 0;
            acc2 = 0;
            acc3 = 0;

            /* Apply loop unrolling and compute 4 MACs simultaneously. */
            k = srcBLen >> 2U;

            /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
            ** a second loop below computes MACs for the remaining 1 to 3 samples. */
            do {

                _y1 = *((v2s *)(py - 1)); // { y[srcBLen - 2] , y[srcBLen - 1] }
                _y2 = *((v2s *)(py - 3)); // { y[srcBLen - 4] , y[srcBLen - 3] }

                _x3 = *((v2s *)(px + 2)); // {x[2], x[3]}
                _x5 = *((v2s *)(px + 4)); // {x[4], x[5]}
                _x7 = *((v2s *)(px + 6)); // {x[6], x[7]}

                _y1 = __builtin_shuffle(_y1, _y1,
                                        shufflemask1); // { y[srcBLen - 1] , y[srcBLen - 2] }

                // Load _x1 later to keep it in the register
                _x1 = *((v2s *)(px)); // {x[0], x[1]}

                _x4 = __builtin_shuffle(_x3, _x5, shufflemask2); // {x[3], x[4]}
                _x2 = __builtin_shuffle(_x1, _x3, shufflemask2); // {x[1], x[2]}

                /* acc0 +=  x[0] * y[srcBLen - 1] +  x[1] * y[srcBLen - 2] */
                acc0 = __SUMDOTP2(_x1, _y1, acc0);
                /* acc1 +=  x[1] * y[srcBLen - 1] +  x[2] * y[srcBLen - 2] */
                acc1 = __SUMDOTP2(_x2, _y1, acc1);
                /* acc2 +=  x[2] * y[srcBLen - 1] +  x[3] * y[srcBLen - 2]*/
                acc2 = __SUMDOTP2(_x3, _y1, acc2);
                /* acc3 +=  x[3] * y[srcBLen - 1] +  x[4] * y[srcBLen - 2] */
                acc3 = __SUMDOTP2(_x4, _y1, acc3);
                /* Perform the multiply-accumulate */

                // Shuffle _x6 later to make sure that _x5 and _x6 are in the register
                _x6 = __builtin_shuffle(_x5, _x7, shufflemask2); // {x[5], x[6]}

                // Shuffle _y2 later to be sure it is in the register
                _y2 = __builtin_shuffle(_y2, _y2,
                                        shufflemask1); // { y[srcBLen - 3] , y[srcBLen - 4] }

                /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
                acc0 = __SUMDOTP2(_x3, _y2, acc0);
                /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
                acc1 = __SUMDOTP2(_x4, _y2, acc1);
                /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
                acc2 = __SUMDOTP2(_x5, _y2, acc2);
                /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
                acc3 = __SUMDOTP2(_x6, _y2, acc3);

                px += 4U;
                py -= 4U;

            } while (--k);

            /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
            ** No loop unrolling is used. */
            k = srcBLen % 0x4U;

            while (k > 0U) {
                /* Read y[srcBLen - 5] sample */
                c0 = *py--;

                _x1 = *((v2s *)(px));     // { x[0] , x[1] }
                _x2 = *((v2s *)(px + 2)); // { x[2] , x[3] }

                px++;

                /* Perform the multiply-accumulate */
                /* acc0 +=  x[4] * y[srcBLen - 5] */
                acc0 += _x1[0] * c0;
                /* acc1 +=  x[5] * y[srcBLen - 5] */
                acc1 += _x1[1] * c0;
                /* acc2 +=  x[6] * y[srcBLen - 5] */
                acc2 += _x2[0] * c0;
                /* acc3 +=  x[7] * y[srcBLen - 5] */
                acc3 += _x2[1] * c0;

                /* Decrement the loop counter */
                k--;
            }

            /* Store the result in the accumulator in the destination buffer. */

            *pOut++ = acc0;
            *pOut++ = acc1;
            *pOut++ = acc2;
            *pOut++ = acc3;

            /* Increment the pointer pIn1 index, count by 4 */
            count += 4U;

            /* Update the inputA and inputB pointers for next MAC calculation */
            px = pIn1 + count;
            py = pSrc2;

            /* Decrement the loop counter */
            blkCnt--;
        }

        /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
        ** No loop unrolling is used. */
        blkCnt = blockSize2 % 0x4U;

#else

        /* Initialize blkCnt with number of samples */
        blkCnt = blockSize2;

#endif /* #if defined (PLP_MATH_LOOPUNROLL)*/

        while (blkCnt > 0U) {
            /* Accumulator is made zero for every iteration */
            _y1 = *((v2s *)(py - 1));
            _x1 = *((v2s *)(px));
            sum = 0;

#if defined(PLP_MATH_LOOPUNROLL)
            /* Loop unrolling: Compute 4 outputs at a time */
            k = srcBLen >> 1U;
            while (k > 0U) {
                /* Perform the multiply-accumulate */
                _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

                sum = __SUMDOTP2(_x1, _y1, sum);

                py -= 2;
                px += 2;

                _y1 = *((v2s *)(py - 1));
                _x1 = *((v2s *)(px));

                k--;
            }

            /* Loop unrolling: Compute remaining outputs */
            k = srcBLen % 0x2U;

            if (k) {
                sum = __MAC(sum, _x1[0], _y1[1]);
            }

#else
            /* Initialize blkCnt with number of samples */
            k = srcBLen;

            while (k) {
                sum = __MAC(sum, *px++, *py--);
                k--;
            }

#endif /* #if defined (PLP_MATH_LOOPUNROLL) */

            /* Store the result in the accumulator in the destination buffer. */
            *pOut++ = sum;

            /* Increment the MAC count */
            count++;

            /* Update the inputA and inputB pointers for next MAC calculation */
            px = pIn1 + count;
            py = pSrc2;

            /* Decrement the loop counter */
            blkCnt--;
        }
    } else {
        /* If the srcBLen is not a multiple of 4,
         * the blockSize2 loop cannot be unrolled by 4 */
        blkCnt = blockSize2;

        while (blkCnt > 0U) {
            /* Accumulator is made zero for every iteration */
            sum = 0;

            /* srcBLen number of MACS should be performed */
            k = srcBLen;

            while (k > 0U) {
                /* Perform the multiply-accumulate */
                /* sum += *px++ * *py--; */
                sum = __MAC(sum, *px++, *py--);
                /* Decrement the loop counter */
                k--;
            }

            /* Store the result in the accumulator in the destination buffer. */
            *pOut++ = sum;

            /* Increment the MAC count */
            count++;

            /* Update the inputA and inputB pointers for next MAC calculation */
            px = pIn1 + count;
            py = pSrc2;

            /* Decrement the loop counter */
            blkCnt--;
        }
    }

    /* --------------------------
     * Initializations of stage3
     * -------------------------*/

    /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+
     * x[srcALen-1] * y[1] sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] *
     * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
     * ....
     * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
     * sum +=  x[srcALen-1] * y[srcBLen-1]
     */

    /* In this stage the MAC operations are decreased by 1 for every iteration.
       The blockSize3 variable holds the number of MAC operations performed */

    /* Working pointer of inputA */
    pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
    px = pSrc1;

    /* Working pointer of inputB */
    pSrc2 = pIn2 + (srcBLen - 1U);
    py = pSrc2;

    /* -------------------
     * Stage3 process
     * ------------------*/
    while (blockSize3 > 0U) {
        /* Accumulator is made zero for every iteration */

        _y1 = *((v2s *)(py - 1));
        _x1 = *((v2s *)(px));

        sum = 0;

#if defined(PLP_MATH_LOOPUNROLL)
        /* Loop unrolling: Compute 4 outputs at a time */
        k = blockSize3 >> 1U;
        while (k > 0U) {
            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

            sum = __SUMDOTP2(_x1, _y1, sum);
            px += 2;
            py -= 2;

            _y1 = *((v2s *)(py - 1));
            _x1 = *((v2s *)(px));

            k--;
        }

        /* Loop unrolling: Compute remaining outputs */
        k = blockSize3 % 0x2U;

        if (k) {
            sum = __MAC(sum, _x1[0], _y1[1]);
        }
#else

        /* Initialize blkCnt with number of samples */
        k = blockSize3;

        while (k) {
            sum = __MAC(sum, *px++, *py--);
            k--;
        }

#endif /* defined (PLP_MATH_LOOPUNROLL)*/

        /* Store the result in the accumulator in the destination buffer. */
        *pOut++ = sum;

        /* Update the inputA and inputB pointers for next MAC calculation */
        px = ++pSrc1;
        py = pSrc2;

        /* Decrement the loop counter */
        blockSize3--;
    }
}

Updated on 2023-03-01 at 16:16:32 +0000