/github/workspace/src/TransformFunctions/kernels/plp_cfft_q16p_xpulpv2.c

Functions

	Name
void	plp_cfft_radix4by2_q16(int16_t * pSrc, uint32_t fftLen, const int16_t * pCoef, uint32_t nPE)
void	plp_radix4_butterfly_q16(int16_t * pSrc16, uint32_t fftLen, int16_t * pCoef16, uint32_t twidCoefModifier, uint32_t nPE) Core function for the Q15 CFFT butterfly process.
void	plp_cfft_q16p_xpulpv2(void * args) Parallel quantized 16 bit complex fast fourier transform for XPULPV2.

Defines

	Name
	MAX(x, y)
	MIN(x, y)

Functions Documentation

function plp_cfft_radix4by2_q16

static void plp_cfft_radix4by2_q16(
    int16_t * pSrc,
    uint32_t fftLen,
    const int16_t * pCoef,
    uint32_t nPE
)

function plp_radix4_butterfly_q16

static void plp_radix4_butterfly_q16(
    int16_t * pSrc16,
    uint32_t fftLen,
    int16_t * pCoef16,
    uint32_t twidCoefModifier,
    uint32_t nPE
)

Core function for the Q15 CFFT butterfly process.

Parameters:

*pSrc16 points to the in-place buffer of Q15 data type.
fftLen length of the FFT.
*pCoef16 points to twiddle coefficient buffer.
twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.

Return: none.

function plp_cfft_q16p_xpulpv2

void plp_cfft_q16p_xpulpv2(
    void * args
)

Parallel quantized 16 bit complex fast fourier transform for XPULPV2.

Parameters:

args points to the plp_cfft_instance_q16_parallel

Macros Documentation

define MAX

#define MAX(
    x,
    y
)
(((x) > (y)) ? (x) : (y))

define MIN

#define MIN(
    x,
    y
)
(((x) < (y)) ? (x) : (y))

Source code

/* =====================================================================
 * Project:      PULP DSP Library
 * Title:        plp_cfft_q16s_xpulpv2.c
 * Description:  16-bit fixed point Fast Fourier Transform on Compled Input Data
 *
 * $Date:        05. August 2020
 * $Revision:    V0
 *
 * Target Processor: PULP cores
 * ===================================================================== */
/*
 * Copyright (C) 2020 ETH Zurich and University of Bologna. All rights reserved.
 *
 * Author: Michael Rogenmoser, ETH Zurich
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plp_math.h"

#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define MIN(x, y) (((x) < (y)) ? (x) : (y))

static void plp_cfft_radix4by2_q16(int16_t *pSrc, uint32_t fftLen, const int16_t *pCoef, uint32_t nPE);

static void plp_radix4_butterfly_q16(int16_t *pSrc16,
                                     uint32_t fftLen,
                                     int16_t *pCoef16,
                                     uint32_t twidCoefModifier,
                                     uint32_t nPE);

void plp_cfft_q16p_xpulpv2(void *args){
    int core_id = hal_core_id();
    plp_cfft_instance_q16_parallel *a = (plp_cfft_instance_q16_parallel *) args;

    uint32_t L = a->S->fftLen;

    if (a->ifftFlag == 0) {
        switch (L) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            plp_radix4_butterfly_q16(a->p1, L, (int16_t *)a->S->pTwiddle, 1, a->nPE);
            break;
        case 32:
        case 128:
        case 512:
        case 2048:
            plp_cfft_radix4by2_q16(a->p1, L, (int16_t *)a->S->pTwiddle, a->nPE);
            break;
        }
    }
    hal_team_barrier();
    // if (core_id == 0) {

        if (a->bitReverseFlag)
            plp_bitreversal_16p_xpulpv2((uint16_t *)a->p1, a->S->bitRevLength, a->S->pBitRevTable, a->nPE);
    // }
}

void plp_cfft_radix4by2_q16(int16_t *pSrc, uint32_t fftLen, const int16_t *pCoef, uint32_t nPE) {

    int core_id = hal_core_id();

    uint32_t i;
    uint32_t n2, nCores;
    v2s pa, pb;

    uint32_t l;
    int16_t xt, yt, cosVal, sinVal;
    v2s CoSi;
    v2s a, b, t;
    int16_t testa, testb;

    n2 = fftLen >> 1;
    if (n2 % nPE == 0) {
        nCores = n2/nPE;
    } else {
        nCores = n2/nPE + 1;
    }

    uint32_t core_offset = core_id*nCores;

    for (i = core_offset; i < MIN(n2,core_offset + nCores); i++) {
        CoSi = *(v2s *)&pCoef[i * 2];

        l = i + n2;

        a = __SRA2(*(v2s *)&pSrc[2 * i], ((v2s){ 1, 1 }));
        b = __SRA2(*(v2s *)&pSrc[2 * l], ((v2s){ 1, 1 }));
        t = __SUB2(a, b);
        *((v2s *)&pSrc[i * 2]) = __SRA2(__ADD2(a, b), ((v2s){ 1, 1 }));

        // xt = t[0];
        // yt = t[1];
        // cosVal = CoSi[0];
        // sinVal = CoSi[1];

        // pSrc[2U * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
        //           ((int16_t) (((q31_t) yt * sinVal) >> 16)));

        // pSrc[2U * l + 1U] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
        //                ((int16_t) (((q31_t) xt * sinVal) >> 16)));

        testa = (int16_t)(__DOTP2(t, CoSi) >> 16);
        testb = (int16_t)(__DOTP2(t, __PACK2(-CoSi[1], CoSi[0])) >> 16);

        *((v2s *)&pSrc[l * 2]) = __PACK2(testa, testb);
    }

    hal_team_barrier();

    if (nPE > 1){
        if (core_id < nPE/2){
            // first col
            plp_radix4_butterfly_q16(pSrc, n2, (int16_t *)pCoef, 2U, nPE/2);
        } else {
            // second col
            plp_radix4_butterfly_q16(pSrc + fftLen, n2, (int16_t *)pCoef, 2U, nPE - nPE/2);
        }
    } else {
        // first col
        plp_radix4_butterfly_q16(pSrc, n2, (int16_t *)pCoef, 2U, nPE);
        // second col
        plp_radix4_butterfly_q16(pSrc + fftLen, n2, (int16_t *)pCoef, 2U, nPE);
    }

    hal_team_barrier();

    for (i = core_offset; i < MIN((fftLen >> 1), core_offset + nCores); i++) {
        pa = *(v2s *)&pSrc[4 * i];
        pb = *(v2s *)&pSrc[4 * i + 2];

        pa = __SLL2(pa, ((v2s){ 1, 1 }));
        pb = __SLL2(pb, ((v2s){ 1, 1 }));

        *((v2s *)&pSrc[4 * i]) = pa;
        *((v2s *)&pSrc[4 * i + 2]) = pb;
    }
}

/*
* Radix-4 FFT algorithm used is :
*
* Input real and imaginary data:
* x(n) = xa + j * ya
* x(n+N/4 ) = xb + j * yb
* x(n+N/2 ) = xc + j * yc
* x(n+3N 4) = xd + j * yd
*
*
* Output real and imaginary data:
* x(4r) = xa'+ j * ya'
* x(4r+1) = xb'+ j * yb'
* x(4r+2) = xc'+ j * yc'
* x(4r+3) = xd'+ j * yd'
*
*
* Twiddle factors for radix-4 FFT:
* Wn = co1 + j * (- si1)
* W2n = co2 + j * (- si2)
* W3n = co3 + j * (- si3)

* The real and imaginary output values for the radix-4 butterfly are
* xa' = xa + xb + xc + xd
* ya' = ya + yb + yc + yd
* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
*
*/

void plp_radix4_butterfly_q16(int16_t *pSrc16,
                              uint32_t fftLen,
                              int16_t *pCoef16,
                              uint32_t twidCoefModifier,
                              uint32_t nPE) {
    int core_id = hal_core_id()%nPE;
    v2s R, S, T, U, V;
    v2s CoSi1, CoSi2, CoSi3, out;
    uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;

    /* Total process is divided into three stages */

    /* process first stage, middle stages, & last stage */

    /* Initializations for the first stage */
    n2 = fftLen;
    n1 = n2;

    /* n2 = fftLen/4 */
    n2 >>= 2U;

    /*Index for twiddle coefficient */
    // ic = 0U;

    /*Index for input read and output write */
    // i0 = 0U;
    // j = n2;

    uint32_t step;
    if (n2 % nPE == 0) {
        step = n2/nPE;
    } else {
        step = n2/nPE + 1;
    }


    /* Input is in 1.15(q15) format */

    /* start of first stage process */
    for (i0 = core_id * step; i0 < MIN(core_id * step + step, n2); i0++)
    {
        /* Butterfly implementation */

        /* index calculation for the input as, */
        /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
        i1 = i0 + n2;
        i2 = i1 + n2;
        i3 = i2 + n2;

        /*  Twiddle coefficients index modifier */
        ic = i0 * twidCoefModifier;

        /* Reading i0, i0+fftLen/2 inputs */

        /* input is down scale by 4 to avoid overflow */
        /* Read ya (real), xa (imag) input */
        T = __SRA2(*(v2s *)&pSrc16[i0 * 2U], ((v2s){ 2, 2 }));

        /* input is down scale by 4 to avoid overflow */
        /* Read yc (real), xc(imag) input */
        S = __SRA2(*(v2s *)&pSrc16[i2 * 2U], ((v2s){ 2, 2 }));

        /* R0 = (ya + yc) */
        /* R1 = (xa + xc) */
        R = __ADD2(T, S);

        /* S0 = (ya - yc) */
        /* S1 = (xa - xc) */
        S = __SUB2(T, S);

        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
        /* input is down scale by 4 to avoid overflow */
        /* Read yb (real), xb(imag) input */
        T = __SRA2(*(v2s *)&pSrc16[i1 * 2U], ((v2s){ 2, 2 }));

        /* input is down scale by 4 to avoid overflow */
        /* Read yd (real), xd(imag) input */
        U = __SRA2(*(v2s *)&pSrc16[i3 * 2U], ((v2s){ 2, 2 }));

        /* T0 = (yb + yd) */
        /* T1 = (xb + xd) */
        V = __ADD2(T, U);

        /*  writing the butterfly processed i0 sample */
        /* ya' = ya + yb + yc + yd */
        /* xa' = xa + xb + xc + xd */
        *((v2s *)&pSrc16[i0 * 2U]) = __ADD2(__SRA2(R, ((v2s){ 1, 1 })), __SRA2(V, ((v2s){ 1, 1 })));

        /* R0 = (ya + yc) - (yb + yd) */
        /* R1 = (xa + xc) - (xb + xd) */
        R = __SUB2(R, V);

        /* co2 & si2 are read from Coefficient pointer */
        CoSi2 = *(v2s *)&pCoef16[2U * ic * 2U];

        /*  Reading i0+fftLen/4 */
        /* input is down scale by 4 to avoid overflow */
        /* T0 = yb, T1 =  xb */
        // T = __SRA2(*(v2s *) &pSrc16[i1 *2U], ((v2s){2,2}));

        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        /* writing the butterfly processed i0 + fftLen/4 sample */
        /* writing output(xc', yc') in little endian format */
        *((v2s *)&pSrc16[i1 * 2U]) =
            __PACK2((int16_t)(__DOTP2(CoSi2, R) >> 16U),
                    (int16_t)(__DOTP2(__PACK2(-CoSi2[1], CoSi2[0]), R) >> 16U));

        /*  Butterfly calculations */
        /* input is down scale by 4 to avoid overflow */
        /* U0 = yd, U1 = xd */
        U = __SRA2(*(v2s *)&pSrc16[i3 * 2U], ((v2s){ 2, 2 }));

        /* T0 = yb-yd */
        /* T1 = xb-xd */
        T = __SUB2(T, U);

        /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
        R = __ADD2(S, __PACK2(-T[1], T[0]));

        /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
        S = __ADD2(S, __PACK2(T[1], -T[0]));

        /* co1 & si1 are read from Coefficient pointer */
        CoSi1 = *(v2s *)&pCoef16[ic * 2U];

        /*  Butterfly process for the i0+fftLen/2 sample */
        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
        /* writing output(xb', yb') in little endian format */
        *((v2s *)&pSrc16[i2 * 2U]) =
            __PACK2((int16_t)(__DOTP2(CoSi1, S) >> 16U),
                    (int16_t)(__DOTP2(__PACK2(-CoSi1[1], CoSi1[0]), S) >> 16U));

        /* Co3 & si3 are read from Coefficient pointer */
        CoSi3 = *(v2s *)&pCoef16[3U * (ic * 2U)];

        /*  Butterfly process for the i0+3fftLen/4 sample */
        /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
        /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3)
        /* writing output(xd', yd') in little endian format */
        *((v2s *)&pSrc16[i3 * 2U]) =
            __PACK2((int16_t)(__DOTP2(CoSi3, R) >> 16U),
                    (int16_t)(__DOTP2(__PACK2(-CoSi3[1], CoSi3[0]), R) >> 16U));

    } //while (--j>stop);}
    /* data is in 4.11(q11) format */

    /* end of first stage process */

    hal_team_barrier();

    /* start of middle stage process */

    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;

    /*  Calculation of Middle stage */
    for (k = fftLen / 4U; k > 4U; k >>= 2U) {
        /*  Initializations for the middle stage */
        n1 = n2;
        n2 >>= 2U;
        // ic = 0U;

        if (n2 % nPE == 0) {
            step = n2/nPE;
        } else {
            step = n2/nPE + 1;
        }


        for (j = core_id * step; j < MIN(core_id * step + step, n2); j++) {
            /*  index calculation for the coefficients */

            /*  Twiddle coefficients index modifier */
            ic = twidCoefModifier * j;

            /*  index calculation for the coefficients */
            CoSi1 = *(v2s *)&pCoef16[ic * 2U];
            CoSi2 = *(v2s *)&pCoef16[2U * (ic * 2U)];
            CoSi3 = *(v2s *)&pCoef16[3U * (ic * 2U)];

            /*  Butterfly implementation */
            for (i0 = j; i0 < fftLen; i0 += n1) {
                /*  index calculation for the input as, */
                /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 +
                 * 3fftLen/4] */
                i1 = i0 + n2;
                i2 = i1 + n2;
                i3 = i2 + n2;

                /*  Reading i0, i0+fftLen/2 inputs */
                /* Read ya (real), xa(imag) input */
                T = *(v2s *)&pSrc16[i0 * 2U];

                /* Read yc (real), xc(imag) input */
                S = *(v2s *)&pSrc16[i2 * 2U];

                /* R0 = (ya + yc), R1 = (xa + xc) */
                R = __ADD2(T, S);

                /* S0 = (ya - yc), S1 =(xa - xc) */
                S = __SUB2(T, S);

                /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
                /* Read yb (real), xb(imag) input */
                T = *(v2s *)&pSrc16[i1 * 2U];

                /* Read yd (real), xd(imag) input */
                U = *(v2s *)&pSrc16[i3 * 2U];

                /* T0 = (yb + yd), T1 = (xb + xd) */
                V = __ADD2(T, U);

                /*  writing the butterfly processed i0 sample */

                /* xa' = xa + xb + xc + xd */
                /* ya' = ya + yb + yc + yd */
                *((v2s *)&pSrc16[i0 * 2U]) =
                    __SRA2(__ADD2(__SRA2(R, ((v2s){ 1, 1 })), __SRA2(V, ((v2s){ 1, 1 }))),
                           ((v2s){ 1, 1 }));

                /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
                R = __SUB2(__SRA2(R, ((v2s){ 1, 1 })), __SRA2(V, ((v2s){ 1, 1 })));

                /*  Reading i0+3fftLen/4 */
                /* Read yb (real), xb(imag) input */
                // T = *(v2s *) &pSrc16[i1 * 2U];

                /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
                /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
                /*  writing the butterfly processed i0 + fftLen/4 sample */
                /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
                /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
                *((v2s *)&pSrc16[i1 * 2U]) =
                    __PACK2((int16_t)(__DOTP2(CoSi2, R) >> 16U),
                            (int16_t)(__DOTP2(__PACK2(-CoSi2[1], CoSi2[0]), R) >> 16U));

                /*  Butterfly calculations */

                /* Read yd (real), xd(imag) input */
                U = *(v2s *)&pSrc16[i3 * 2U];

                /* T0 = yb-yd, T1 = xb-xd */
                T = __SRA2(__SUB2(T, U), ((v2s){ 1, 1 }));

                /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
                R = __ADD2(__SRA2(S, ((v2s){ 1, 1 })), __PACK2(-T[1], T[0]));

                /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
                S = __ADD2(__SRA2(S, ((v2s){ 1, 1 })), __PACK2(T[1], -T[0]));

                /*  Butterfly process for the i0+fftLen/2 sample */
                /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
                /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
                *((v2s *)&pSrc16[i2 * 2U]) =
                    __PACK2((int16_t)(__DOTP2(CoSi1, S) >> 16U),
                            (int16_t)(__DOTP2(__PACK2(-CoSi1[1], CoSi1[0]), S) >> 16U));

                /*  Butterfly process for the i0+3fftLen/4 sample */
                /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
                /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
                *((v2s *)&pSrc16[i3 * 2U]) =
                    __PACK2((int16_t)(__DOTP2(CoSi3, R) >> 16U),
                            (int16_t)(__DOTP2(__PACK2(-CoSi3[1], CoSi3[0]), R) >> 16U));
            }
        }
        /*  Twiddle coefficients index modifier */
        twidCoefModifier <<= 2U;
        hal_team_barrier();
    }
    /* end of middle stage process */

    /* data is in 10.6(q6) format for the 1024 point */
    /* data is in 8.8(q8) format for the 256 point */
    /* data is in 6.10(q10) format for the 64 point */
    /* data is in 4.12(q12) format for the 16 point */
    /*  Initializations for the last stage */
    n1 = n2;
    n2 >>= 2U;

    // if (core_id == 0) {



    int steps;
    /* start of last stage process */
    steps = fftLen/n1;
    // printf("steps: %i at %i\n", steps, fftLen);
    if (steps % nPE == 0) {
        step = steps/nPE;
    } else {
        step = steps/nPE + 1;
    }


    /*  Butterfly implementation */
    for (i0 = core_id * step * n1; i0 < MIN((core_id * step + step) * n1, fftLen); i0 += n1) {
        /*  index calculation for the input as, */
        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
        i1 = i0 + n2;
        i2 = i1 + n2;
        i3 = i2 + n2;

        /*  Reading i0, i0+fftLen/2 inputs */
        /* Read ya (real), xa(imag) input */
        T = *(v2s *)&pSrc16[i0 * 2U];

        /* Read yc (real), xc(imag) input */
        S = *(v2s *)&pSrc16[i2 * 2U];

        /* R0 = (ya + yc), R1 = (xa + xc) */
        R = __ADD2(T, S);

        /* S0 = (ya - yc), S1 = (xa - xc) */
        S = __SUB2(T, S);

        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
        /* Read yb (real), xb(imag) input */
        T = *(v2s *)&pSrc16[i1 * 2U];
        /* Read yd (real), xd(imag) input */
        U = *(v2s *)&pSrc16[i3 * 2U];

        /* T0 = (yb + yd), T1 = (xb + xd)) */
        T = __ADD2(T, U);

        /*  writing the butterfly processed i0 sample */
        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        *((v2s *)&pSrc16[i0 * 2U]) = __ADD2(__SRA2(R, ((v2s){ 1, 1 })), __SRA2(T, ((v2s){ 1, 1 })));

        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
        R = __SUB2(__SRA2(R, ((v2s){ 1, 1 })), __SRA2(T, ((v2s){ 1, 1 })));

        /* Read yb (real), xb(imag) input */
        T = *(v2s *)&pSrc16[i1 * 2U];

        /*  writing the butterfly processed i0 + fftLen/4 sample */
        /* xc' = (xa-xb+xc-xd) */
        /* yc' = (ya-yb+yc-yd) */
        *((v2s *)&pSrc16[i1 * 2U]) = R;

        /* Read yd (real), xd(imag) input */
        U = *(v2s *)&pSrc16[i3 * 2U];

        /* T0 = (yb - yd), T1 = (xb - xd)  */
        T = __SUB2(T, U);

        T = __SRA2(T, ((v2s){ 1, 1 }));
        S = __SRA2(S, ((v2s){ 1, 1 }));

        /*  writing the butterfly processed i0 + fftLen/2 sample */
        /* xb' = (xa+yb-xc-yd) */
        /* yb' = (ya-xb-yc+xd) */
        *((v2s *)&pSrc16[i2 * 2U]) = __ADD2(S, __PACK2(T[1], -T[0]));

        /*  writing the butterfly processed i0 + 3fftLen/4 sample */
        /* xd' = (xa-yb-xc+yd) */
        /* yd' = (ya+xb-yc-xd) */
        *((v2s *)&pSrc16[i3 * 2U]) = __ADD2(S, __PACK2(-T[1], T[0]));
    }

    /* end of last stage process */

    /* output is in 11.5(q5) format for the 1024 point */
    /* output is in 9.7(q7) format for the 256 point   */
    /* output is in 7.9(q9) format for the 64 point  */
    /* output is in 5.11(q11) format for the 16 point  */
// }
}

Updated on 2023-03-01 at 16:16:33 +0000