/github/workspace/src/FilteringFunctions/kernels/plp_conv_i8s_xpulpv2.c

Functions

	Name
void	plp_conv_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 8-bit integer vectors kernel for XPULPV2 extension.

Defines

	Name
	shufflemask1
	shufflemask2
	shufflemask3

Functions Documentation

function plp_conv_i8s_xpulpv2

void plp_conv_i8s_xpulpv2(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

Macros Documentation

define shufflemask1

#define shufflemask1     (v4s) { 3, 2, 1, 0 }

define shufflemask2

#define shufflemask2     (v4s) { 1, 2, 3, 5 }

define shufflemask3

#define shufflemask3     (v4s) { 2, 3, 5, 6 }

Source code

/* =====================================================================
 * Project:      PULP DSP Library
 * Title:        plp_conv_i8s_xpulpv2.c
 * Description:  8-bit integer singlecore convolution for XPULPV2
 *
 * $Date:        01. July 2019
 * $Revision:    V0
 *
 * Target Processor: PULP cores
 * ===================================================================== */
/*
 * Copyright (C) 2019 ETH Zurich and University of Bologna.
 *
 * Author: Moritz Scherer, ETH Zurich
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plp_math.h"

/* Lane-index masks for __builtin_shuffle on 4 x int8 vectors (v4s).
 * With two input vectors a and b, indices 0..3 pick lanes of a and
 * indices 4..7 pick lanes of b. */

/* Reverses the lane order: {a0,a1,a2,a3} -> {a3,a2,a1,a0}.
 * Used to flip a block of inputB so it can be dotted against inputA. */
#define shufflemask1                                                                               \
    (v4s) { 3, 2, 1, 0 }
/* Selects {a1,a2,a3,b1}. With a = {x[0..3]} and b = {x[3..6]} this yields
 * the window {x[1],x[2],x[3],x[4]}. */
#define shufflemask2                                                                               \
    (v4s) { 1, 2, 3, 5 }
/* Selects {a2,a3,b1,b2}. With a = {x[0..3]} and b = {x[3..6]} this yields
 * the window {x[2],x[3],x[4],x[5]}. */
#define shufflemask3                                                                               \
    (v4s) { 2, 3, 5, 6 }

/**
 * @brief Convolution of 8-bit integer vectors, kernel for the XPULPV2 extension.
 *
 * Writes srcALen + srcBLen - 1 output samples to pRes. The work is split in
 * three stages:
 *   - stage 1: the first srcBLen - 1 outputs; the number of MACs per output
 *              grows from 1 to srcBLen - 1,
 *   - stage 2: the srcALen - srcBLen + 1 outputs that each need srcBLen MACs,
 *   - stage 3: the last srcBLen - 1 outputs; the number of MACs per output
 *              shrinks from srcBLen - 1 down to 1.
 *
 * Pre-conditions:
 *   - srcALen >= srcBLen (expected to be established by the caller, plp_conv_i8),
 *   - pRes has room for srcALen + srcBLen - 1 elements,
 *   - srcALen >= 2 and srcBLen >= 2, otherwise use the vector dot product.
 *
 * @note The packed code paths load four bytes at a time and neutralise the
 *       lanes that do not belong to the current output with xmask/ymask.
 *       Some of these loads therefore touch a few bytes just before or after
 *       the input buffers; the loaded values do not contribute to the result.
 *
 * @param[in]  pSrcA    points to the first input vector
 * @param[in]  srcALen  length of the first input vector
 * @param[in]  pSrcB    points to the second input vector
 * @param[in]  srcBLen  length of the second input vector
 * @param[out] pRes     output result returned here
 */
void plp_conv_i8s_xpulpv2(const int8_t *pSrcA,
                          const uint32_t srcALen,
                          const int8_t *pSrcB,
                          const uint32_t srcBLen,
                          int32_t *pRes) {

    const int8_t *pIn1 = pSrcA;                  /* InputA pointer */
    const int8_t *pIn2 = pSrcB;                  /* InputB pointer */
    int32_t *pOut = pRes;                        /* Output pointer */
    const int8_t *px;                            /* Intermediate inputA pointer */
    const int8_t *py;                            /* Intermediate inputB pointer */
    const int8_t *pSrc1, *pSrc2;                 /* Intermediate pointers */
    int32_t sum;                                 /* Accumulator */
    uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
    uint32_t k, count, blkCnt;                   /* Loop counters */

    /* xmask[k] keeps the first k lanes of a vector and clears the others */
    v4s xmask[] = { (v4s){ 0, 0, 0, 0 }, (v4s){ 0xff, 0, 0, 0 }, (v4s){ 0xff, 0xff, 0, 0 },
                    (v4s){ 0xff, 0xff, 0xff, 0 } };

    v4s mask;

    v4s _x1;
    v4s _y1;

#if defined(PLP_MATH_LOOPUNROLL)
    int32_t acc0, acc1, acc2, acc3; /* Accumulators for four outputs computed together */
    v4s _x2, _x3, _x4;              /* Shifted windows of inputA */

    /* ymask[k] keeps the last k lanes of a vector and clears the others */
    v4s ymask[] = { (v4s){ 0, 0, 0, 0 }, (v4s){ 0, 0, 0, 0xff }, (v4s){ 0, 0, 0xff, 0xff },
                    (v4s){ 0, 0xff, 0xff, 0xff } };
#endif

    blockSize1 = srcBLen - 1U;
    blockSize2 = srcALen - (srcBLen - 1U);
    blockSize3 = blockSize1;

    /* --------------------------
     * Initializations of stage1
     * -------------------------*/

    /* sum = x[0] * y[0]
     * sum = x[0] * y[1] + x[1] * y[0]
     * ....
     * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
     */

    /* In this stage the MAC operations are increased by 1 for every iteration.
       The count variable holds the number of MAC operations performed */
    count = 1U;

    /* Working pointer of inputA */
    px = pIn1;

    /* Working pointer of inputB */
    py = pIn2;

    /* ------------------------
     * Stage1 process
     * ----------------------*/

    /* The first stage starts here */
    while (blockSize1 > 0U) {
        /* Accumulator is made zero for every iteration */
        sum = 0;

#if defined(PLP_MATH_LOOPUNROLL)
        /* Preload the first packed operands; inputB is reversed so that lane i
         * of _y1 pairs with lane i of _x1 */
        _y1 = *((v4s *)(py - 3));
        _x1 = *((v4s *)(px));
        _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

        /* Loop unrolling: Compute 4 MACs at a time */
        k = count >> 2U;
        while (k > 0U) {
            sum = __SUMDOTP4(_x1, _y1, sum);

            _y1 = *((v4s *)(py - 7));
            _x1 = *((v4s *)(px + 4));

            px += 4U;
            py -= 4U;

            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);
            k--;
        }

        /* Remaining count % 4 MACs: clear the lanes of inputA that are not
         * part of this output so they add nothing to the dot product */
        k = count % 0x4U;

        mask = xmask[k];

        _x1 = __AND4(_x1, mask);
        sum = __SUMDOTP4(_x1, _y1, sum);
#else
        /* Initialize k with number of samples */
        k = count;

        while (k) {
            sum = __MAC(sum, *px++, *py--);
            k--;
        }

#endif /* #if defined (PLP_MATH_LOOPUNROLL) */

        /* Store the result in the accumulator in the destination buffer. */
        *pOut++ = sum;

        /* Update the inputA and inputB pointers for next MAC calculation */
        py = pIn2 + count;
        px = pIn1;

        /* Increment MAC count */
        count++;

        /* Decrement loop counter */
        blockSize1--;
    }

    /* --------------------------
     * Initializations of stage2
     * ------------------------*/

    /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
     * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen]   * y[0]
     * ....
     * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+
     *       x[srcALen-1] * y[0]
     */

    /* Working pointer of inputA */
    px = pIn1;

    /* Working pointer of inputB */
    pSrc2 = pIn2 + (srcBLen - 1U);
    py = pSrc2;

    /* count is index by which the pointer pIn1 to be incremented */
    count = 0U;

    /* -------------------
     * Stage2 process
     * ------------------*/

    /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
     * So, to loop unroll over blockSize2,
     * srcBLen should be greater than or equal to 4 */
    if (srcBLen >= 4U) {

#if defined(PLP_MATH_LOOPUNROLL)

        /* Loop unrolling: Compute 4 outputs at a time */
        blkCnt = blockSize2 >> 2U;
        while (blkCnt > 0U) {
            /* Set all accumulators to zero */
            acc0 = 0;
            acc1 = 0;
            acc2 = 0;
            acc3 = 0;

            /* Apply loop unrolling and compute 4 MACs simultaneously. */
            k = srcBLen >> 2U;

            /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
            ** The block below computes MACs for the remaining 1 to 3 samples. */
            do {
                _x1 = *((v4s *)px);       // {x[0],x[1],x[2],x[3]}
                _x4 = *((v4s *)(px + 3)); // {x[3],x[4],x[5],x[6]}
                _y1 = *((v4s *)(py - 3)); // {y[srcBLen - 4],y[srcBLen - 3],y[srcBLen - 2],
                                          //  y[srcBLen - 1]}

                px += 4U;
                py -= 4U;

                _x2 = __builtin_shuffle(_x1, _x4, shufflemask2); // {x[1],x[2],x[3],x[4]}
                _x3 = __builtin_shuffle(_x1, _x4, shufflemask3); // {x[2],x[3],x[4],x[5]}

                // {y[srcBLen - 1],y[srcBLen - 2],y[srcBLen - 3],y[srcBLen - 4]}
                _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

                acc0 = __SUMDOTP4(_x1, _y1, acc0);
                acc1 = __SUMDOTP4(_x2, _y1, acc1);
                acc2 = __SUMDOTP4(_x3, _y1, acc2);
                acc3 = __SUMDOTP4(_x4, _y1, acc3);

            } while (--k);

            /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. */

            k = srcBLen % 0x4U;

            if (k > 0) {

                _x1 = *((v4s *)px);       // next four samples of inputA
                _x4 = *((v4s *)(px + 3)); // same window shifted by three samples
                _y1 = *((v4s *)(py - 3)); // only the last k lanes are still needed

                mask = ymask[k];

                _x2 = __builtin_shuffle(_x1, _x4, shufflemask2);
                _x3 = __builtin_shuffle(_x1, _x4, shufflemask3);

                /* Clear the lanes of inputB that are not needed, then reverse */
                _y1 = __AND4(_y1, mask);
                _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

                /* Perform the multiply-accumulate */

                acc0 = __SUMDOTP4(_x1, _y1, acc0);
                acc1 = __SUMDOTP4(_x2, _y1, acc1);
                acc2 = __SUMDOTP4(_x3, _y1, acc2);
                acc3 = __SUMDOTP4(_x4, _y1, acc3);
            }

            /* Store the result in the accumulator in the destination buffer. */
            *pOut++ = acc0;
            *pOut++ = acc1;
            *pOut++ = acc2;
            *pOut++ = acc3;

            /* Increment the pointer pIn1 index, count by 4 */
            count += 4U;

            /* Update the inputA and inputB pointers for next MAC calculation */
            px = pIn1 + count;
            py = pSrc2;

            /* Decrement the loop counter */
            blkCnt--;
        }

        /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here,
        ** one output per iteration. */
        blkCnt = blockSize2 % 0x4U;

#else

        /* Initialize blkCnt with number of samples */
        blkCnt = blockSize2;

#endif /* #if defined (PLP_MATH_LOOPUNROLL)*/

        while (blkCnt > 0U) {
            /* Accumulator is made zero for every iteration */
            sum = 0;

#if defined(PLP_MATH_LOOPUNROLL)
            _y1 = *((v4s *)(py - 3));
            _x1 = *((v4s *)(px));
            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

            /* Loop unrolling: Compute 4 MACs at a time */
            k = srcBLen >> 2U;
            while (k > 0U) {
                sum = __SUMDOTP4(_x1, _y1, sum);

                _y1 = *((v4s *)(py - 7));
                _x1 = *((v4s *)(px + 4));

                px += 4U;
                py -= 4U;

                _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);
                k--;
            }

            /* Remaining srcBLen % 4 MACs */
            k = srcBLen % 0x4U;

            mask = xmask[k];
            _x1 = __AND4(_x1, mask);
            sum = __SUMDOTP4(_x1, _y1, sum);

#else
            /* Initialize k with number of samples; srcBLen MACs per output,
             * same scalar loop as in stage 1 and stage 3 */
            k = srcBLen;

            while (k) {
                sum = __MAC(sum, *px++, *py--);
                k--;
            }

#endif /* #if defined (PLP_MATH_LOOPUNROLL) */

            /* Store the result in the accumulator in the destination buffer. */
            *pOut++ = sum;

            /* Increment the MAC count */
            count++;

            /* Update the inputA and inputB pointers for next MAC calculation */
            px = pIn1 + count;
            py = pSrc2;

            /* Decrement the loop counter */
            blkCnt--;
        }
    } else {
        /* srcBLen is smaller than 4: every output needs fewer than four MACs,
         * so a single masked packed dot product per output is enough */
        blkCnt = blockSize2;

        while (blkCnt > 0U) {
            /* Accumulator is made zero for every iteration */
            sum = 0;

            /* srcBLen number of MACS should be performed */
            k = srcBLen;
            mask = xmask[k];

            _y1 = *((v4s *)(py - 3));
            _x1 = *((v4s *)(px));

            _x1 = __AND4(_x1, mask);
            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

            sum = __SUMDOTP4(_x1, _y1, sum);

            /* Store the result in the accumulator in the destination buffer. */
            *pOut++ = sum;

            /* Increment the MAC count */
            count++;

            /* Update the inputA and inputB pointers for next MAC calculation */
            px = pIn1 + count;
            py = pSrc2;

            /* Decrement the loop counter */
            blkCnt--;
        }
    }

    /* --------------------------
     * Initializations of stage3
     * -------------------------*/

    /* sum = x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+
     *       x[srcALen-1] * y[1]
     * sum = x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+
     *       x[srcALen-1] * y[2]
     * ....
     * sum = x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
     * sum = x[srcALen-1] * y[srcBLen-1]
     */

    /* In this stage the MAC operations are decreased by 1 for every iteration.
       The blockSize3 variable holds the number of MAC operations performed */

    /* Working pointer of inputA */
    pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
    px = pSrc1;

    /* Working pointer of inputB */
    pSrc2 = pIn2 + (srcBLen - 1U);
    py = pSrc2;

    /* -------------------
     * Stage3 process
     * ------------------*/
    while (blockSize3 > 0U) {
        /* Accumulator is made zero for every iteration */
        sum = 0;

#if defined(PLP_MATH_LOOPUNROLL)
        _y1 = *((v4s *)(py - 3));
        _x1 = *((v4s *)(px));
        _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

        /* Loop unrolling: Compute 4 MACs at a time */
        k = blockSize3 >> 2U;

        while (k > 0U) {
            sum = __SUMDOTP4(_x1, _y1, sum);

            _y1 = *((v4s *)(py - 7));
            _x1 = *((v4s *)(px + 4));

            px += 4U;
            py -= 4U;

            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

            k--;
        }

        /* Remaining blockSize3 % 4 MACs */
        k = blockSize3 % 0x4U;

        mask = xmask[k];
        _x1 = __AND4(_x1, mask);
        sum = __SUMDOTP4(_x1, _y1, sum);

#else

        /* Initialize k with number of samples */
        k = blockSize3;

        while (k) {
            sum = __MAC(sum, *px++, *py--);
            k--;
        }

#endif /* defined (PLP_MATH_LOOPUNROLL)*/

        /* Store the result in the accumulator in the destination buffer. */
        *pOut++ = sum;

        /* Update the inputA and inputB pointers for next MAC calculation */
        px = ++pSrc1;
        py = pSrc2;

        /* Decrement the loop counter */
        blockSize3--;
    }
}

Updated on 2023-03-01 at 16:16:32 +0000