Skip to content

/github/workspace/src/FilteringFunctions/kernels/plp_conv_valid_i8s_xpulpv2.c

Functions

Name
void plp_conv_valid_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes)
Convolution of 8-bit integer vectors kernel for XPULPV2 extension.

Defines

Name
shufflemask1
shufflemask2
shufflemask3
shufflemask4

Functions Documentation

function plp_conv_valid_i8s_xpulpv2

void plp_conv_valid_i8s_xpulpv2(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

  • pSrcA points to the first input vector
  • srcALen Length of the first input vector
  • pSrcB points to the second input vector
  • srcBLen Length of the second input vector
  • pRes output result returned here

Return: none

Convolution (valid) of 8-bit integer vectors kernel for XPULPV2 extension.

Macros Documentation

define shufflemask1

#define shufflemask1     (v4s) { 3, 2, 1, 0 }

define shufflemask2

#define shufflemask2     (v4s) { 1, 2, 3, 4 }

define shufflemask3

#define shufflemask3     (v4s) { 2, 3, 4, 5 }

define shufflemask4

#define shufflemask4     (v4s) { 3, 4, 5, 6 }

Source code

/* =====================================================================
 * Project:      PULP DSP Library
 * Title:        plp_conv_valid_i8s_xpulpv2.c
 * Description:  8-bit integer singlecore convolution (valid) for XPULPV2
 *
 * $Date:        24. April 2020
 * $Revision:    V0
 *
 * Target Processor: PULP cores
 * ===================================================================== */
/*
 * Copyright (C) 2020 ETH Zurich and University of Bologna.
 *
 * Author: Moritz Scherer, Tibor Schneider, ETH Zurich
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plp_math.h"

// Lane-index masks passed to __builtin_shuffle in the kernel below.
//
// shufflemask1 is used with the same vector for both operands and reverses its
// four lanes (lane 3 first, lane 0 last); the kernel applies it to the y vector.
#define shufflemask1                                                                               \
    (v4s) { 3, 2, 1, 0 }
// shufflemask2..4 are used with two consecutive x vectors (_x1, _x4). Indices
// 0..3 select from the first operand and 4..7 from the second, so these pick
// the 4-lane windows starting at offset 1, 2 and 3 of the 8 loaded samples.
#define shufflemask2                                                                               \
    (v4s) { 1, 2, 3, 4 }
#define shufflemask3                                                                               \
    (v4s) { 2, 3, 4, 5 }
#define shufflemask4                                                                               \
    (v4s) { 3, 4, 5, 6 }

// Pre-condition: srcALen >= srcBLen, established by the calling function
// (named here as plp_conv_i32; presumably the 8-bit glue function is meant -- TODO confirm)
// Pre-condition: pRes has room for srcALen - srcBLen + 1 results (the number of outputs written)
// Pre-condition: srcALen >= 2 and srcBLen >= 2, otherwise use vector dot product

/**
 * @brief Valid-mode convolution of two 8-bit integer vectors (XPULPV2 kernel).
 *
 * Writes res_len = srcALen - srcBLen + 1 values to pRes. As spelled out by the
 * scalar fallback at the bottom of the function, output i is
 *
 *     pRes[i] = sum over j in [0, srcBLen) of pSrcA[i + j] * pSrcB[srcBLen - 1 - j]
 *
 * accumulated in 32 bits.
 *
 * With PLP_MATH_LOOPUNROLL defined, the same sums are built from 4-lane vector
 * loads: four outputs are produced per outer iteration while srcBLen >= 4, the
 * leftover outputs are produced one at a time, and srcBLen < 4 is handled by a
 * single masked product per output.
 *
 * NOTE(review): the vector path casts byte pointers at arbitrary offsets to
 * v4s pointers and loads whole 4-byte vectors even when fewer than four samples
 * remain, so it can read bytes before pSrcB and past the logical ends of
 * pSrcA/pSrcB. Lanes that do not belong to the product are zeroed by a mask or
 * not selected by a shuffle, so they do not reach the result, but the reads
 * themselves still happen. Presumably this relies on unaligned, slightly
 * out-of-range loads being harmless on the target -- confirm.
 *
 * @param[in]  pSrcA    points to the first input vector
 * @param[in]  srcALen  length of the first input vector
 * @param[in]  pSrcB    points to the second input vector
 * @param[in]  srcBLen  length of the second input vector
 * @param[out] pRes     output buffer; srcALen - srcBLen + 1 values are written
 */
void plp_conv_valid_i8s_xpulpv2(const int8_t *pSrcA,
                                const uint32_t srcALen,
                                const int8_t *pSrcB,
                                const uint32_t srcBLen,
                                int32_t *pRes) {

    const int8_t *p_a_iter; // running read pointer into input A
    const int8_t *p_b_iter; // running read pointer into input B (walks backwards)

    // Number of output samples of the valid convolution
    int res_len = srcALen - srcBLen + 1;

#ifdef PLP_MATH_LOOPUNROLL

    const int8_t *p_b_tmp;      // Fixed pointer to the last element of input B
    int32_t sum;                // Accumulator for the one-output-at-a-time loops
    uint32_t k, count, blk_cnt; // Loop counters

    // Accumulators for the four outputs computed together
    int32_t acc0, acc1, acc2, acc3;

    // xmask[k] keeps the first k lanes of a vector and zeroes the rest;
    // ymask[k] keeps the last k lanes and zeroes the rest (k = 0..3).
    v4s xmask[] = { (v4s){ 0, 0, 0, 0 }, (v4s){ 0xff, 0, 0, 0 }, (v4s){ 0xff, 0xff, 0, 0 },
                    (v4s){ 0xff, 0xff, 0xff, 0 } };
    v4s ymask[] = { (v4s){ 0, 0, 0, 0 }, (v4s){ 0, 0, 0, 0xff }, (v4s){ 0, 0, 0xff, 0xff },
                    (v4s){ 0, 0xff, 0xff, 0xff } };
    v4s mask;

    v4s _x1, _x2, _x3, _x4; // four shifted windows of input A
    v4s _y1;                // four samples of input B

    // Working pointer of inputA
    p_a_iter = pSrcA;

    // Working pointer of inputB: start at the last element, since B is traversed in reverse
    p_b_tmp = pSrcB + (srcBLen - 1U);
    p_b_iter = p_b_tmp;

    // count is the index of the output sample currently being computed, i.e. the
    // offset into input A at which the next window starts
    count = 0U;

    if (srcBLen >= 4U) {

        // compute 4 outputs at the same time
        blk_cnt = res_len >> 2U;
        while (blk_cnt > 0U) {

            // Set all accumulators to zero
            acc0 = 0;
            acc1 = 0;
            acc2 = 0;
            acc3 = 0;

            // Number of full groups of 4 samples of input B (at least 1 here)
            k = srcBLen >> 2U;

            /* First part of the processing with loop unrolling: consume 4 samples of
             * input B per iteration for each of the 4 outputs. The block after this
             * loop handles the remaining 1 to 3 samples of input B.
             */

            do {
                // Load 8 consecutive samples of A and the next 4 samples of B
                // (x[] indices below are relative to p_a_iter)
                _x1 = *((v4s *)p_a_iter);       // {x[0],x[1],x[2],x[3]}
                _x4 = *((v4s *)(p_a_iter + 4)); // {x[4],x[5],x[6],x[7]}
                _y1 = *((v4s *)(p_b_iter - 3)); // four B samples ending at p_b_iter, in memory
                                                // order (first pass: y[srcBLen - 4..srcBLen - 1])

                p_a_iter += 4U;
                p_b_iter -= 4U;

                _x2 = __builtin_shuffle(_x1, _x4, shufflemask2); // {x[1],x[2],x[3],x[4]}
                _x3 = __builtin_shuffle(_x1, _x4, shufflemask3); // {x[2],x[3],x[4],x[5]}
                _x4 = __builtin_shuffle(_x1, _x4, shufflemask4); // {x[3],x[4],x[5],x[6]}

                // Reverse the B samples so lane j pairs with x[j]
                // (first pass: {y[srcBLen - 1],y[srcBLen - 2],y[srcBLen - 3],y[srcBLen - 4]})
                _y1 =
                    __builtin_shuffle(_y1, _y1, shufflemask1);

                // Perform the multiply-accumulate
                // (__SUMDOTP4 presumably returns acc + 4-lane dot product -- see plp_math.h)

                acc0 = __SUMDOTP4(_x1, _y1, acc0);
                acc1 = __SUMDOTP4(_x2, _y1, acc1);
                acc2 = __SUMDOTP4(_x3, _y1, acc2);
                acc3 = __SUMDOTP4(_x4, _y1, acc3);

            } while (--k);

            /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
             * No loop unrolling is used.
             */

            k = srcBLen % 0x4U;

            if (k > 0) {
                _x1 = *((v4s *)p_a_iter);       // {x[0],x[1],x[2],x[3]}
                _x4 = *((v4s *)(p_a_iter + 4)); // {x[4],x[5],x[6],x[7]}
                // Only the last k lanes of this load are real B samples (y[0..k-1]);
                // the first 4 - k lanes lie before pSrcB and are masked off below.
                _y1 = *((v4s *)(p_b_iter - 3));

                mask = ymask[k];

                _x2 = __builtin_shuffle(_x1, _x4, shufflemask2); // {x[1],x[2],x[3],x[4]}
                _x3 = __builtin_shuffle(_x1, _x4, shufflemask3); // {x[2],x[3],x[4],x[5]}
                _x4 = __builtin_shuffle(_x1, _x4, shufflemask4); // {x[3],x[4],x[5],x[6]}

                // Zero the invalid lanes, then reverse: {y[k-1],...,y[0],0,...}
                _y1 = __AND4(_y1, mask);
                _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

                // Perform the multiply-accumulate; zeroed y lanes cancel the surplus x lanes

                acc0 = __SUMDOTP4(_x1, _y1, acc0);
                acc1 = __SUMDOTP4(_x2, _y1, acc1);
                acc2 = __SUMDOTP4(_x3, _y1, acc2);
                acc3 = __SUMDOTP4(_x4, _y1, acc3);
            }

            /* Store the result in the accumulator in the destination buffer. */
            *pRes++ = acc0;
            *pRes++ = acc1;
            *pRes++ = acc2;
            *pRes++ = acc3;

            /* Four outputs done: advance the output index by 4 */
            count += 4U;

            /* Update the inputA and inputB pointers for next MAC calculation */
            p_a_iter = pSrcA + count;
            p_b_iter = p_b_tmp;

            /* Decrement the loop counter */
            blk_cnt--;
        }

        /* If the res_len is not a multiple of 4, compute any remaining output samples here.
         * No loop unrolling is used.
         */
        blk_cnt = res_len % 0x4U;

        while (blk_cnt > 0U) {

            // Preload the first 4 samples of B (reversed) and of A; the accumulator
            // is reset for every output sample

            _y1 = *((v4s *)(p_b_iter - 3));
            _x1 = *((v4s *)(p_a_iter));
            sum = 0;
            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

            /* Accumulate 4 MACs per iteration for this single output. Each iteration
             * consumes the vectors loaded previously and preloads the next ones, so
             * after the loop _x1/_y1 hold the vectors for the remaining samples.
             */
            k = srcBLen >> 2U;
            while (k > 0U) {
                sum = __SUMDOTP4(_x1, _y1, sum);

                _y1 = *((v4s *)(p_b_iter - 7));
                _x1 = *((v4s *)(p_a_iter + 4));

                p_a_iter += 4U;
                p_b_iter -= 4U;

                _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);
                k--;
            }

            // Remaining 0 to 3 MACs: keep only the first k lanes of x, so the lanes of
            // the reversed y vector that lie before pSrcB are multiplied by zero
            k = srcBLen % 0x4U;

            mask = xmask[k];
            _x1 = __AND4(_x1, mask);
            sum = __SUMDOTP4(_x1, _y1, sum);

            /* Store the result in the accumulator in the destination buffer. */
            *pRes++ = sum;

            /* One output done: advance the output index */
            count++;

            /* Update the inputA and inputB pointers for next MAC calculation */
            p_a_iter = pSrcA + count;
            p_b_iter = p_b_tmp;

            /* Decrement the loop counter */
            blk_cnt--;
        }

    } else { // case: srcBLen < 4

        /* If the srcBLen is smaller than 4
         * the res_len loop cannot be unrolled by 4
         * TODO yes, it can!
         */
        blk_cnt = res_len;

        while (blk_cnt > 0U) {
            /* Accumulator is made zero for every iteration */
            sum = 0;

            /* srcBLen number of MACS should be performed: keep the first srcBLen lanes of x */
            k = srcBLen;
            mask = xmask[k];

            // This load starts 4 - srcBLen bytes before pSrcB; after the reversal those
            // lanes line up with the x lanes zeroed by the mask
            _y1 = *((v4s *)(p_b_iter - 3));
            _x1 = *((v4s *)(p_a_iter));

            _x1 = __AND4(_x1, mask);
            _y1 = __builtin_shuffle(_y1, _y1, shufflemask1);

            sum = __SUMDOTP4(_x1, _y1, sum);

            /* Store the result in the accumulator in the destination buffer. */
            *pRes++ = sum;

            /* One output done: advance the output index */
            count++;

            /* Update the inputA and inputB pointers for next MAC calculation */
            p_a_iter = pSrcA + count;
            p_b_iter = p_b_tmp;

            /* Decrement the loop counter */
            blk_cnt--;
        }
    }

#else // PLP_MATH_LOOPUNROLL

    // Scalar reference implementation: one output sample per outer iteration
    for (int i_out = 0; i_out < res_len; i_out++) {

        // A is read forwards from the output index, B backwards from its last element
        p_a_iter = pSrcA + i_out;
        p_b_iter = pSrcB + srcBLen - 1;

        int32_t acc = 0;

        for (int i_in = 0; i_in < srcBLen; i_in++) {
            acc += (*(p_a_iter++)) * (*(p_b_iter--));
        }

        pRes[i_out] = acc;
    }

#endif // PLP_MATH_LOOPUNROLL
}

Updated on 2023-03-01 at 16:16:32 +0000