/github/workspace/src/FilteringFunctions/plp_conv_i32.c

Functions

	Name
void	plp_conv_i32(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t *__restrict__ pRes) — Glue code for convolution of 32-bit integer vectors.

Attributes

	Name
int32_t *	_pRes1_32

Defines

	Name
	OLARATIO32

Functions Documentation

function plp_conv_i32

void plp_conv_i32(
    const int32_t * pSrcA,
    const uint32_t srcALen,
    const int32_t * pSrcB,
    const uint32_t srcBLen,
    int32_t *__restrict__ pRes
)

Glue code for convolution of 32-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

Attributes Documentation

variable _pRes1_32

static int32_t * _pRes1_32;

Macros Documentation

define OLARATIO32

#define OLARATIO32 10

Source code

/* =====================================================================
 * Project:      PULP DSP Library
 * Title:        plp_conv_i32.c
 * Description:  32-bit integer convolution glue code
 *
 * $Date:        01. July 2019
 * $Revision:    V0
 *
 * Target Processor: PULP cores
 * ===================================================================== */
/*
 * Copyright (C) 2019 ETH Zurich and University of Bologna.
 *
 * Author: Moritz Scherer
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plp_math.h"
/*
 * Numerator used by plp_conv_i32 when deciding into how many chunks the
 * shorter input vector is split: chunks = OLARATIO32 / (longLen / shortLen),
 * with a minimum of one chunk. The name presumably abbreviates
 * "overlap-add ratio" -- TODO confirm.
 * NOTE(review): the remark that used to accompany this define said "Eight is
 * optimal in terms of overhead minimization", yet the value is 10 -- confirm
 * which one is intended.
 */
#define OLARATIO32 10

/* Scratch buffer holding one partial convolution result; it is allocated and
 * released inside plp_conv_i32 on every call. */
static int32_t *_pRes1_32;

/**
 * @brief Add a partial convolution result onto a window of the output vector.
 *
 * @param[in,out] pDst     first output element the partial result overlaps
 * @param[in]     pPartial partial result produced by a convolution kernel
 * @param[in]     len      number of elements to accumulate
 */
static void plp_conv_i32_accumulate(int32_t *pDst, const int32_t *pPartial, uint32_t len) {
    for (uint32_t i = 0; i < len; i++) {
        /* Add in unsigned arithmetic so that wrap-around is well defined. */
        pDst[i] = (int32_t)((uint32_t)pDst[i] + (uint32_t)pPartial[i]);
    }
}

/**
 * @brief Run the convolution kernel matching the core this code executes on.
 *
 * @param[in]  onFc   non-zero when running on the fabric controller
 * @param[in]  pIn1   points to the longer input vector
 * @param[in]  in1Len length of the longer input vector
 * @param[in]  pIn2   points to the (chunk of the) shorter input vector
 * @param[in]  in2Len length of that chunk
 * @param[out] pDst   receives in1Len + in2Len - 1 result elements
 */
static void plp_conv_i32_kernel(int onFc,
                                const int32_t *pIn1,
                                uint32_t in1Len,
                                const int32_t *pIn2,
                                uint32_t in2Len,
                                int32_t *pDst) {
    if (onFc) {
        plp_conv_i32s_rv32im(pIn1, in1Len, pIn2, in2Len, pDst);
    } else {
        plp_conv_i32s_xpulpv2(pIn1, in1Len, pIn2, in2Len, pDst);
    }
}

/**
 * @brief Glue code for convolution of 32-bit integer vectors.
 *
 * The shorter input is cut into chunks, every chunk is convolved with the
 * longer input into a scratch buffer, and the partial results are added onto
 * the output at the chunk's offset.
 *
 * @param[in]  pSrcA   points to the first input vector
 * @param[in]  srcALen length of the first input vector
 * @param[in]  pSrcB   points to the second input vector
 * @param[in]  srcBLen length of the second input vector
 * @param[out] pRes    output result returned here; must hold
 *                     srcALen + srcBLen - 1 elements
 *
 * @return none. If either input is empty, pRes is left untouched.
 */
void plp_conv_i32(const int32_t *pSrcA,
                  const uint32_t srcALen,
                  const int32_t *pSrcB,
                  const uint32_t srcBLen,
                  int32_t *__restrict__ pRes) {

    /* An empty input would divide by zero below, and srcALen + srcBLen - 1
     * would wrap around when both lengths are zero. */
    if (srcALen == 0U || srcBLen == 0U) {
        return;
    }

    /* in1 is always the longer vector, in2 the shorter one. */
    uint32_t in1Len, in2Len;
    const int32_t *pIn1;
    const int32_t *pIn2;

    if (srcALen >= srcBLen) {
        in1Len = srcALen;
        in2Len = srcBLen;
        pIn1 = pSrcA;
        pIn2 = pSrcB;
    } else {
        in2Len = srcALen;
        in1Len = srcBLen;
        pIn2 = pSrcA;
        pIn1 = pSrcB;
    }

    /* Requested number of chunks for the shorter vector (at least one). */
    uint32_t nPE = OLARATIO32 / (in1Len / in2Len);
    if (nPE == 0U) {
        nPE = 1U;
    }

    /* Chunk length, rounded up so that nPE chunks cover the whole vector. */
    const uint32_t src2Offset = (in2Len + nPE - 1U) / nPE;

    /* Rounding up may leave fewer chunks than requested (e.g. 11 elements in
     * chunks of 2 need 6 chunks, not 10). Use the number actually needed so
     * the last chunk length cannot underflow and no chunk starts past the
     * end of the input. */
    nPE = (in2Len + src2Offset - 1U) / src2Offset;

    const uint32_t lastChunkLen = in2Len - src2Offset * (nPE - 1U);
    const uint32_t resultsoffset = src2Offset + in1Len - 1U;   /* result length of a full chunk */
    const uint32_t lastresultLen = lastChunkLen + in1Len - 1U; /* result length of the last chunk */
    const uint32_t resLen = srcALen + srcBLen - 1U;

    /* Partial results are accumulated, so the output has to start at zero. */
    for (uint32_t i = 0; i < resLen; i++) {
        pRes[i] = 0;
    }

    const int onFc = (hal_cluster_id() == ARCHI_FC_CID);

    if (onFc) {
        _pRes1_32 = hal_fc_l1_malloc(sizeof(int32_t) * (resultsoffset));
    } else {
        _pRes1_32 = hal_cl_l1_malloc(sizeof(int32_t) * (resultsoffset));
    }

    if (!_pRes1_32) {
        /* No scratch memory available: convolve the unsplit vectors straight
         * into pRes. This relies on the kernel overwriting its output, which
         * the chunked path below depends on as well. */
        plp_conv_i32_kernel(onFc, pIn1, in1Len, pIn2, in2Len, pRes);
        return;
    }

    /* All chunks but the last one have the full length src2Offset. */
    for (uint32_t i = 0; i + 1U < nPE; i++) {
        plp_conv_i32_kernel(onFc, pIn1, in1Len, pIn2 + i * src2Offset, src2Offset, _pRes1_32);
        plp_conv_i32_accumulate(pRes + i * src2Offset, _pRes1_32, resultsoffset);
    }

    /* The last chunk holds whatever remains of the shorter vector. */
    plp_conv_i32_kernel(onFc, pIn1, in1Len, pIn2 + (nPE - 1U) * src2Offset, lastChunkLen,
                        _pRes1_32);
    plp_conv_i32_accumulate(pRes + (nPE - 1U) * src2Offset, _pRes1_32, lastresultLen);

    /* NOTE(review): the buffer is released with the cluster-side free even
     * when it was obtained from hal_fc_l1_malloc -- confirm that this pairing
     * is valid on the fabric controller. */
    hal_cl_l1_free(_pRes1_32, sizeof(int32_t) * (resultsoffset));
}

Updated on 2023-03-01 at 16:16:32 +0000