/github/workspace/src/FilteringFunctions/kernels/plp_conv_valid_i8s_xpulpv2.c

Functions

	Name
void	plp_conv_valid_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 8-bit integer vectors kernel for XPULPV2 extension.

Defines

	Name
	shufflemask1
	shufflemask2
	shufflemask3
	shufflemask4

Functions Documentation

function plp_conv_valid_i8s_xpulpv2

void plp_conv_valid_i8s_xpulpv2(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

Convolution (valid) of 8-bit integer vectors kernel for XPULPV2 extension.

Macros Documentation

define shufflemask1

#define shufflemask1     (v4s) { 3, 2, 1, 0 }

define shufflemask2

#define shufflemask2     (v4s) { 1, 2, 3, 4 }

define shufflemask3

#define shufflemask3     (v4s) { 2, 3, 4, 5 }

define shufflemask4

#define shufflemask4     (v4s) { 3, 4, 5, 6 }

Source code

/* =====================================================================
 * Project:      PULP DSP Library
 * Title:        plp_conv_valid_i8s_xpulpv2.c
 * Description:  8-bit integer singlecore convolution (valid) for XPULPV2
 *
 * $Date:        24. April 2020
 * $Revision:    V0
 *
 * Target Processor: PULP cores
 * ===================================================================== */
/*
 * Copyright (C) 2020 ETH Zurich and University of Bologna.
 *
 * Author: Moritz Scherer, Tibor Schneider, ETH Zurich
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plp_math.h"

// Index vectors for __builtin_shuffle. With two source vectors (a, b), indices
// 0..3 select elements of a and indices 4..7 select elements of b.

// Reverses the element order of a 4-element vector: {v[3], v[2], v[1], v[0]}.
#define shufflemask1                                                                               \
    (v4s) { 3, 2, 1, 0 }
// Window shifted by one element over the pair (a, b): {a[1], a[2], a[3], b[0]}.
#define shufflemask2                                                                               \
    (v4s) { 1, 2, 3, 4 }
// Window shifted by two elements over the pair (a, b): {a[2], a[3], b[0], b[1]}.
#define shufflemask3                                                                               \
    (v4s) { 2, 3, 4, 5 }
// Window shifted by three elements over the pair (a, b): {a[3], b[0], b[1], b[2]}.
#define shufflemask4                                                                               \
    (v4s) { 3, 4, 5, 6 }

// Pre-condition: srcALen >= srcBLen, established by the calling function (this comment
//                originally named plp_conv_i32; presumably the 8-bit valid-convolution
//                wrapper is meant)
// Pre-condition: pRes has enough allocated memory, i.e. srcALen - srcBLen + 1 elements
// Pre-condition: srcALen >= 2 and srcBLen >= 2, otherwise use vector dot product

/**
 * @brief Convolution (valid) of 8-bit integer vectors, kernel for the XPULPV2 extension.
 *
 * Computes the srcALen - srcBLen + 1 outputs for which the two vectors overlap completely:
 *
 *     pRes[i] = sum_{j = 0}^{srcBLen - 1} pSrcA[i + j] * pSrcB[srcBLen - 1 - j]
 *
 * If srcBLen is 0 or srcALen < srcBLen there is no fully overlapping position; the
 * function then returns without touching pRes.
 *
 * @param[in]  pSrcA    points to the first input vector
 * @param[in]  srcALen  length of the first input vector
 * @param[in]  pSrcB    points to the second input vector
 * @param[in]  srcBLen  length of the second input vector
 * @param[out] pRes     output result returned here (srcALen - srcBLen + 1 elements)
 *
 * @return none
 *
 * @note When PLP_MATH_LOOPUNROLL is defined, inputs are fetched with 4-byte vector loads
 *       that can start before pSrcB and extend past the last element of pSrcA. The
 *       surplus lanes are masked to zero before they are accumulated, but the memory
 *       around both input buffers has to be readable.
 */
void plp_conv_valid_i8s_xpulpv2(const int8_t *pSrcA,
                                const uint32_t srcALen,
                                const int8_t *pSrcB,
                                const uint32_t srcBLen,
                                int32_t *pRes) {

    const int8_t *p_a_iter; // intermediate inputA pointer
    const int8_t *p_b_iter; // intermediate inputB pointer

    // Without this guard the unsigned subtraction below wraps around, which would
    // turn the output count into a huge loop bound (or srcALen + 1 for srcBLen == 0).
    if (srcBLen == 0U || srcALen < srcBLen) {
        return;
    }

    // Number of output samples of the valid convolution
    const uint32_t res_len = srcALen - srcBLen + 1U;

#ifdef PLP_MATH_LOOPUNROLL

    const int8_t *p_b_tmp;      // Intermediate pointers
    int32_t sum;                // Accumulators
    uint32_t k, count, blk_cnt; // Loop counters

    // for loop unroll
    int32_t acc0, acc1, acc2, acc3; // Accumulators

    // xmask[n] keeps the first n lanes of a vector, ymask[n] keeps the last n lanes
    v4s xmask[] = { (v4s){ 0, 0, 0, 0 }, (v4s){ 0xff, 0, 0, 0 }, (v4s){ 0xff, 0xff, 0, 0 },
                    (v4s){ 0xff, 0xff, 0xff, 0 } };
    v4s ymask[] = { (v4s){ 0, 0, 0, 0 }, (v4s){ 0, 0, 0, 0xff }, (v4s){ 0, 0, 0xff, 0xff },
                    (v4s){ 0, 0xff, 0xff, 0xff } };
    v4s mask;

    v4s _x1, _x2, _x3, _x4; // local registers
    v4s _y1;                // local registers

    // Working pointer of inputA
    p_a_iter = pSrcA;

    // Working pointer of inputB: starts at the last element, B is traversed backwards
    p_b_tmp = pSrcB + (srcBLen - 1U);
    p_b_iter = p_b_tmp;

    // count is the index of the first output sample of the current iteration,
    // i.e. the offset by which p_a_iter is advanced from pSrcA
    count = 0U;

    if (srcBLen >= 4U) {

        // compute 4 outputs at the same time
        blk_cnt = res_len >> 2U;
        while (blk_cnt > 0U) {

            // Set all accumulators to zero
            acc0 = 0;
            acc1 = 0;
            acc2 = 0;
            acc3 = 0;

            // Apply loop unrolling and compute 4 MACs simultaneously.
            k = srcBLen >> 2U;

            /* First part of the processing with loop unrolling: every iteration consumes
             * 4 samples of B for each of the 4 outputs. The block after the loop computes
             * the MACs for the remaining 1 to 3 samples.
             */

            do {
                // Read 8 consecutive samples of A and the next 4 samples of B (from the back)
                _x1 = *((v4s *)p_a_iter);       // {x[0],x[1],x[2],x[3]}
                _x4 = *((v4s *)(p_a_iter + 4)); // {x[4],x[5],x[6],x[7]}
                _y1 = *((v4s *)(p_b_iter - 3)); // {y[srcBLen - 4],y[srcBLen - 3],y[srcBLen -
                                                // 2],y[srcBLen - 1]}

                p_a_iter += 4U;
                p_b_iter -= 4U;

                _x2 = __builtin_shuffle(_x1, _x4, shufflemask2); // {x[1],x[2],x[3],x[4]}
                _x3 = __builtin_shuffle(_x1, _x4, shufflemask3); // {x[2],x[3],x[4],x[5]}
                _x4 = __builtin_shuffle(_x1, _x4, shufflemask4); // {x[3],x[4],x[5],x[6]}

                _y1 =
                    __builtin_shuffle(_y1, _y1, shufflemask1); // {y[srcBLen - 1],y[srcBLen -
                                                               // 2],y[srcBLen - 3],y[srcBLen - 4]}

                // Perform the multiply-accumulate

                acc0 = __SUMDOTP4(_x1, _y1, acc0);
                acc1 = __SUMDOTP4(_x2, _y1, acc1);
                acc2 = __SUMDOTP4(_x3, _y1, acc2);
                acc3 = __SUMDOTP4(_x4, _y1, acc3);

            } while (--k);

            /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
             * No loop unrolling is used.
             */

            k = srcBLen % 0x4U;

            if (k > 0) {
                _x1 = *((v4s *)p_a_iter);       // {x[0],x[1],x[2],x[3]}
                _x4 = *((v4s *)(p_a_iter + 4)); // {x[4],x[5],x[6],x[7]}
                // Only the last k lanes hold remaining samples of B; the leading lanes
                // lie before pSrcB and are cleared by the mask below
                _y1 = *((v4s *)(p_b_iter - 3));

                mask = ymask[k];

                _x2 = __builtin_shuffle(_x1, _x4, shufflemask2); // {x[1],x[2],x[3],x[4]}
                _x3 = __builtin_shuffle(_x1, _x4, shufflemask3); // {x[2],x[3],x[4],x[5]}
                _x4 = __builtin_shuffle(_x1, _x4, shufflemask4); // {x[3],x[4],x[5],x[6]}

                _y1 = __AND4(_y1, mask);
                _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

                // Perform the multiply-accumulate

                acc0 = __SUMDOTP4(_x1, _y1, acc0);
                acc1 = __SUMDOTP4(_x2, _y1, acc1);
                acc2 = __SUMDOTP4(_x3, _y1, acc2);
                acc3 = __SUMDOTP4(_x4, _y1, acc3);
            }

            /* Store the result in the accumulator in the destination buffer. */
            *pRes++ = acc0;
            *pRes++ = acc1;
            *pRes++ = acc2;
            *pRes++ = acc3;

            /* Increment the pointer p_a index, count by 4 */
            count += 4U;

            /* Update the inputA and inputB pointers for next MAC calculation */
            p_a_iter = pSrcA + count;
            p_b_iter = p_b_tmp;

            /* Decrement the loop counter */
            blk_cnt--;
        }

        /* If the res_len is not a multiple of 4, compute any remaining output samples here.
         * No loop unrolling is used.
         */
        blk_cnt = res_len % 0x4U;

        while (blk_cnt > 0U) {

            // Preload the first vectors; the accumulator is made zero for every output

            _y1 = *((v4s *)(p_b_iter - 3));
            _x1 = *((v4s *)(p_a_iter));
            sum = 0;
            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

            /* Process 4 samples of B per iteration; the vectors for the next
             * iteration are loaded right after the current MAC */
            k = srcBLen >> 2U;
            while (k > 0U) {
                sum = __SUMDOTP4(_x1, _y1, sum);

                _y1 = *((v4s *)(p_b_iter - 7));
                _x1 = *((v4s *)(p_a_iter + 4));

                p_a_iter += 4U;
                p_b_iter -= 4U;

                _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);
                k--;
            }

            // Remaining srcBLen % 4 samples: keep only the first k lanes of A so that
            // the surplus lanes contribute nothing (for k == 0 the whole vector is zeroed)
            k = srcBLen % 0x4U;

            mask = xmask[k];
            _x1 = __AND4(_x1, mask);
            sum = __SUMDOTP4(_x1, _y1, sum);

            /* Store the result in the accumulator in the destination buffer. */
            *pRes++ = sum;

            /* Increment the MAC count */
            count++;

            /* Update the inputA and inputB pointers for next MAC calculation */
            p_a_iter = pSrcA + count;
            p_b_iter = p_b_tmp;

            /* Decrement the loop counter */
            blk_cnt--;
        }

    } else { // case: srcBLen < 4

        /* If the srcBLen is smaller than 4
         * the res_len loop cannot be unrolled by 4
         * TODO yes, it can!
         */
        blk_cnt = res_len;

        while (blk_cnt > 0U) {
            /* Accumulator is made zero for every iteration */
            sum = 0;

            /* srcBLen number of MACS should be performed */
            k = srcBLen;
            mask = xmask[k];

            // The vector load of B starts before pSrcB because srcBLen < 4; after the
            // reversal those lanes meet the lanes of A that are cleared by the mask
            _y1 = *((v4s *)(p_b_iter - 3));
            _x1 = *((v4s *)(p_a_iter));

            _x1 = __AND4(_x1, mask);
            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

            sum = __SUMDOTP4(_x1, _y1, sum);

            /* Store the result in the accumulator in the destination buffer. */
            *pRes++ = sum;

            /* Increment the MAC count */
            count++;

            /* Update the inputA and inputB pointers for next MAC calculation */
            p_a_iter = pSrcA + count;
            p_b_iter = p_b_tmp;

            /* Decrement the loop counter */
            blk_cnt--;
        }
    }

#else // PLP_MATH_LOOPUNROLL

    // Plain reference implementation: one output sample per outer iteration
    for (uint32_t i_out = 0; i_out < res_len; i_out++) {

        // A is walked forwards from the output index, B backwards from its last element
        p_a_iter = pSrcA + i_out;
        p_b_iter = pSrcB + srcBLen - 1;

        int32_t acc = 0;

        for (uint32_t i_in = 0; i_in < srcBLen; i_in++) {
            acc += (*(p_a_iter++)) * (*(p_b_iter--));
        }

        pRes[i_out] = acc;
    }

#endif // PLP_MATH_LOOPUNROLL
}

Updated on 2023-03-01 at 16:16:32 +0000