Skip to content

/github/workspace/src/TransformFunctions/kernels/plp_rfft_f32s_xpulpv2.c

Functions

Name
int bit_rev_radix2(int index, int log2FFTLen)
int bit_rev_radix4(int index, int log2FFTLen)
int bit_rev_radix8(int index, int log2FFTLen)
int bit_rev_2_4(int value, int log2FFTLen)
Complex_type_f32 complex_mul(Complex_type_f32 A, Complex_type_f32 B)
Complex_type_f32 complex_mul_real(float32_t A, Complex_type_f32 B)
void plp_rfft_f32s_xpulpv2(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, float32_t *restrict pDst)
Floating-point FFT on real input data for XPULPV2 extension.
void plp_radix2_rfft_f32s_xpulpv2(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, float32_t *restrict pDst)
void process_butterfly_real_radix2(const float32_t *restrict input, Complex_type_f32 * output, int twiddle_index, int distance, Complex_type_f32 * twiddle_ptr)
void process_butterfly_radix2(Complex_type_f32 * input, Complex_type_f32 * output, int twiddle_index, int index, int distance, Complex_type_f32 * twiddle_ptr)
void process_butterfly_last_radix2(Complex_type_f32 * input, Complex_type_f32 * output, int outindex)
void plp_radix4_rfft_f32s_xpulpv2(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, float32_t *restrict pDst)
void process_butterfly_real_radix4(const float32_t *restrict input, Complex_type_f32 * output, int twiddle_index, int distance, Complex_type_f32 * twiddle_ptr)
void process_butterfly_radix4(Complex_type_f32 * input, Complex_type_f32 * output, int twiddle_index, int index, int distance, Complex_type_f32 * twiddle_ptr)
void process_butterfly_last_radix4(Complex_type_f32 * input, Complex_type_f32 * output, int outindex)
void plp_radix8_rfft_f32s_xpulpv2(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, float32_t *restrict pDst)
void process_butterfly_real_radix8(const float32_t *restrict input, Complex_type_f32 * output, int twiddle_index, int distance, Complex_type_f32 * twiddle_ptr)
void process_butterfly_radix8(Complex_type_f32 * input, Complex_type_f32 * output, int twiddle_index, int index, int distance, Complex_type_f32 * twiddle_ptr)
void process_butterfly_last_radix8(Complex_type_f32 * input, Complex_type_f32 * output, int outindex)
void plp_radix_2by4_rfft_f32s_xpulpv2(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, float32_t * pDst)
void fft_radix4_mixed(float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t FFTLength, const float32_t * pTwiddleFactors)

Attributes

Name
HAL_CL_L1 float32_t ROT_CONST

Functions Documentation

function bit_rev_radix2

int bit_rev_radix2(
    int index,
    int log2FFTLen
)

end of realFFTKernels group

function bit_rev_radix4

int bit_rev_radix4(
    int index,
    int log2FFTLen
)

function bit_rev_radix8

int bit_rev_radix8(
    int index,
    int log2FFTLen
)

function bit_rev_2_4

int bit_rev_2_4(
    int value,
    int log2FFTLen
)

function complex_mul

static inline Complex_type_f32 complex_mul(
    Complex_type_f32 A,
    Complex_type_f32 B
)

function complex_mul_real

static inline Complex_type_f32 complex_mul_real(
    float32_t A,
    Complex_type_f32 B
)

function plp_rfft_f32s_xpulpv2

void plp_rfft_f32s_xpulpv2(
    const plp_fft_instance_f32 * S,
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst
)

Floating-point FFT on real input data for XPULPV2 extension.

Parameters:

  • S points to an instance of the floating-point FFT structure
  • pSrc points to the input buffer (real data)
  • pDst points to the output buffer (complex data)

Return: none

function plp_radix2_rfft_f32s_xpulpv2

static void plp_radix2_rfft_f32s_xpulpv2(
    const plp_fft_instance_f32 * S,
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst
)

function process_butterfly_real_radix2

static inline void process_butterfly_real_radix2(
    const float32_t *__restrict__ input,
    Complex_type_f32 * output,
    int twiddle_index,
    int distance,
    Complex_type_f32 * twiddle_ptr
)

function process_butterfly_radix2

static inline void process_butterfly_radix2(
    Complex_type_f32 * input,
    Complex_type_f32 * output,
    int twiddle_index,
    int index,
    int distance,
    Complex_type_f32 * twiddle_ptr
)

function process_butterfly_last_radix2

static inline void process_butterfly_last_radix2(
    Complex_type_f32 * input,
    Complex_type_f32 * output,
    int outindex
)

function plp_radix4_rfft_f32s_xpulpv2

static void plp_radix4_rfft_f32s_xpulpv2(
    const plp_fft_instance_f32 * S,
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst
)

function process_butterfly_real_radix4

static inline void process_butterfly_real_radix4(
    const float32_t *__restrict__ input,
    Complex_type_f32 * output,
    int twiddle_index,
    int distance,
    Complex_type_f32 * twiddle_ptr
)

function process_butterfly_radix4

static inline void process_butterfly_radix4(
    Complex_type_f32 * input,
    Complex_type_f32 * output,
    int twiddle_index,
    int index,
    int distance,
    Complex_type_f32 * twiddle_ptr
)

function process_butterfly_last_radix4

static inline void process_butterfly_last_radix4(
    Complex_type_f32 * input,
    Complex_type_f32 * output,
    int outindex
)

function plp_radix8_rfft_f32s_xpulpv2

static void plp_radix8_rfft_f32s_xpulpv2(
    const plp_fft_instance_f32 * S,
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst
)

function process_butterfly_real_radix8

static inline void process_butterfly_real_radix8(
    const float32_t *__restrict__ input,
    Complex_type_f32 * output,
    int twiddle_index,
    int distance,
    Complex_type_f32 * twiddle_ptr
)

function process_butterfly_radix8

static inline void process_butterfly_radix8(
    Complex_type_f32 * input,
    Complex_type_f32 * output,
    int twiddle_index,
    int index,
    int distance,
    Complex_type_f32 * twiddle_ptr
)

function process_butterfly_last_radix8

static inline void process_butterfly_last_radix8(
    Complex_type_f32 * input,
    Complex_type_f32 * output,
    int outindex
)

function plp_radix_2by4_rfft_f32s_xpulpv2

static void plp_radix_2by4_rfft_f32s_xpulpv2(
    const plp_fft_instance_f32 * S,
    const float32_t *__restrict__ pSrc,
    float32_t * pDst
)

function fft_radix4_mixed

static void fft_radix4_mixed(
    float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    uint32_t FFTLength,
    const float32_t * pTwiddleFactors
)

Attributes Documentation

variable ROT_CONST

static HAL_CL_L1 float32_t ROT_CONST = 0.707106781f;

Source code

/* =====================================================================
 * Project:      PULP DSP Library
 * Title:        plp_rfft_f32s_xpulpv2.c
 * Description:  Floating-point FFT on real input data for XPULPV2
 *
 * $Date:        10. Aug 2020
 * $Revision:    V1
 *
 * Target Processor: PULP cores with "F" support (wolfe, vega)
 * ===================================================================== */
/*
 * Copyright (C) 2020 ETH Zurich and University of Bologna. All rights reserved.
 *
 * Author: Giuseppe Tagliavini, University of Bologna
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plp_math.h"
/* Scale factor 0.707106781f (numerically ~ 1/sqrt(2)); the radix-8 real
 * butterfly multiplies its odd-sample combinations by it.
 * NOTE(review): HAL_CL_L1 is defined outside this file; presumably a memory
 * placement attribute -- confirm. */
static HAL_CL_L1 float32_t ROT_CONST = 0.707106781f;

/* HELPER FUNCTIONS */
/* Index-reversal helpers used when reordering the FFT output.
 * log2FFTLen is the base-2 logarithm of the transform length. */
int bit_rev_radix2(int index, int log2FFTLen); /* reverses single bits */
int bit_rev_radix4(int index, int log2FFTLen); /* reverses 2-bit digits */
int bit_rev_radix8(int index, int log2FFTLen); /* reverses 3-bit digits */
int bit_rev_2_4(int value, int log2FFTLen);    /* definition is not in this part of the file */

/**
 * @brief  Multiply two single-precision complex numbers.
 * @param  A  first factor
 * @param  B  second factor
 * @return the product A * B, i.e.
 *         (A.re*B.re - A.im*B.im) + i*(A.re*B.im + A.im*B.re)
 */
static inline Complex_type_f32 complex_mul(Complex_type_f32 A, Complex_type_f32 B) {
    Complex_type_f32 product;

    product.im = A.re * B.im + A.im * B.re; /* cross terms */
    product.re = A.re * B.re - A.im * B.im; /* direct terms */

    return product;
}
/**
 * @brief  Scale a complex number by a real factor.
 * @param  A  real scale factor
 * @param  B  complex value to scale
 * @return B with both components multiplied by A
 */
static inline Complex_type_f32 complex_mul_real(float32_t A, Complex_type_f32 B) {
    Complex_type_f32 scaled;

    scaled.im = A * B.im;
    scaled.re = A * B.re;

    return scaled;
}

/* Public entry point: selects one of the radix kernels below from S->FFTLength. */
void plp_rfft_f32s_xpulpv2(const plp_fft_instance_f32 *S, const float32_t *__restrict__ pSrc, float32_t *__restrict__ pDst);

/* Each radix group has one driver plus three butterfly helpers:
 *   *_real_*  first stage, reads real samples and writes complex values
 *   (plain)   intermediate stages, complex in / complex out with twiddles
 *   *_last_*  final stage, no twiddle multiplication
 */

/* RADIX-2 */
static void plp_radix2_rfft_f32s_xpulpv2(const plp_fft_instance_f32 *S, const float32_t *__restrict__ pSrc, float32_t *__restrict__ pDst);
static inline void process_butterfly_real_radix2(const float32_t *__restrict__ input, Complex_type_f32 *output, int twiddle_index, int distance, Complex_type_f32 *twiddle_ptr);
static inline void process_butterfly_radix2(Complex_type_f32 *input, Complex_type_f32* output, int twiddle_index, int index, int distance, Complex_type_f32 *twiddle_ptr);
static inline void process_butterfly_last_radix2(Complex_type_f32 *input, Complex_type_f32 *output, int outindex);

/* RADIX-4 */
static void plp_radix4_rfft_f32s_xpulpv2(const plp_fft_instance_f32 *S, const float32_t *__restrict__ pSrc, float32_t *__restrict__ pDst);
static inline void process_butterfly_real_radix4 (const float32_t *__restrict__ input, Complex_type_f32* output, int twiddle_index, int distance, Complex_type_f32* twiddle_ptr);
static inline void process_butterfly_radix4 (Complex_type_f32* input, Complex_type_f32* output, int twiddle_index, int index, int distance, Complex_type_f32* twiddle_ptr);
static inline void process_butterfly_last_radix4 (Complex_type_f32* input, Complex_type_f32* output, int outindex );

/* RADIX-8 */
static void plp_radix8_rfft_f32s_xpulpv2(const plp_fft_instance_f32 *S, const float32_t *__restrict__ pSrc, float32_t *__restrict__ pDst);
static inline void process_butterfly_real_radix8 (const float32_t *__restrict__ input, Complex_type_f32* output, int twiddle_index, int distance, Complex_type_f32* twiddle_ptr);
static inline void process_butterfly_radix8 (Complex_type_f32* input, Complex_type_f32* output, int twiddle_index, int index, int distance, Complex_type_f32* twiddle_ptr);
static inline void process_butterfly_last_radix8 (Complex_type_f32* input, Complex_type_f32* output, int outindex );

/* RADIX-4_2 */
/* Mixed kernel, selected by the dispatcher for FFTLength == 2048. */
static void plp_radix_2by4_rfft_f32s_xpulpv2(const plp_fft_instance_f32 *S, const float32_t *__restrict__ pSrc, float32_t* pDst);
static void fft_radix4_mixed(   float32_t *__restrict__ pSrc, 
                                float32_t *__restrict__ pDst,
                                uint32_t FFTLength,
                                const float32_t *pTwiddleFactors);

/**
 * @brief      Floating-point FFT on real input data for XPULPV2 extension.
 *
 * Dispatches on S->FFTLength to the matching kernel:
 *   - 64, 512        -> radix-8
 *   - 16, 256, 1024  -> radix-4
 *   - 32, 128        -> radix-2
 *   - 2048           -> mixed radix 2-by-4
 *
 * @param[in]  S     points to an instance of the floating-point FFT structure
 * @param[in]  pSrc  points to the input buffer (real data)
 * @param[out] pDst  points to the output buffer (complex data)
 * @return     none. For any length not listed above no kernel is called and
 *             pDst is left untouched.
 */
void plp_rfft_f32s_xpulpv2( const plp_fft_instance_f32 *S,
                            const float32_t *__restrict__ pSrc,
                            float32_t *__restrict__ pDst) {
    switch( S->FFTLength ) {

        case 64:
        case 512:
            plp_radix8_rfft_f32s_xpulpv2(S, pSrc, pDst);
            break;
        case 16:
        case 256:
        case 1024:
            plp_radix4_rfft_f32s_xpulpv2(S, pSrc, pDst);
            break;
        case 32:
        case 128:
            plp_radix2_rfft_f32s_xpulpv2(S, pSrc, pDst);
            break;
        case 2048:
            plp_radix_2by4_rfft_f32s_xpulpv2(S, pSrc, pDst);
            break;
        default:
            /* Unsupported length: the void interface offers no way to report
             * an error, so deliberately do nothing. */
            break;
    }
}
/* RADIX-2 */

/**
 * @brief  Reverse the lowest log2FFTLen bits of an index.
 *
 * Bit i of index (0 <= i < log2FFTLen) is moved to bit
 * (log2FFTLen - 1 - i) of the result; higher bits of index are ignored.
 *
 * @param  index       value whose low bits are reversed
 * @param  log2FFTLen  number of bits to reverse; must not exceed the width
 *                     of unsigned int. Values <= 0 yield 0.
 * @return the bit-reversed index
 */
int bit_rev_radix2(int index, int log2FFTLen) {

    unsigned int revNum = 0;
    const unsigned int value = (unsigned int)index;

    /* The counter is signed on purpose: comparing an unsigned counter with a
     * negative log2FFTLen would convert the bound to a huge unsigned value
     * and run the shifts far out of range. */
    for (int i = 0; i < log2FFTLen; i++) {
        if ((value >> i) & 1u) {
            /* unsigned mask avoids shifting into the sign bit of an int */
            revNum |= 1u << ((log2FFTLen - 1) - i);
        }
    }
    return (int)revNum;
}

/**
 * @brief  First-stage radix-2 butterfly on real input samples.
 *
 * Reads input[0] and input[distance] and writes
 *   output[0]        = (input[0] + input[distance]) + 0i
 *   output[distance] = (input[0] - input[distance]) * twiddle_ptr[twiddle_index]
 *
 * @param[in]  input          real samples; elements 0 and distance are read
 * @param[out] output         complex results; elements 0 and distance are written
 * @param[in]  twiddle_index  index of the twiddle factor applied to the difference
 * @param[in]  distance       element offset between the two butterfly legs
 * @param[in]  twiddle_ptr    twiddle factor table
 */
static inline void process_butterfly_real_radix2(   const float32_t *__restrict__ input,
                                                    Complex_type_f32 *output,
                                                    int twiddle_index,
                                                    int distance,
                                                    Complex_type_f32 *twiddle_ptr) {
    float32_t d0 = input[0];
    float32_t d1 = input[distance];
    Complex_type_f32 r0;

    r0.re = d0 + d1;
    /* The sum of two real samples has no imaginary part. It must be written
     * explicitly: the whole struct is copied to the output and later stages
     * read output[].im, so leaving it unset propagates an indeterminate value. */
    r0.im = 0.0f;

    /* The difference is real too, so a real-by-complex multiply is enough. */
    float32_t diff = d0 - d1;
    Complex_type_f32 tw0 = twiddle_ptr[twiddle_index];

    output[0] = r0;
    output[distance] = complex_mul_real(diff, tw0);
}

/**
 * @brief  Intermediate-stage radix-2 butterfly on complex data.
 *
 * With a = input[index] and b = input[index + distance]:
 *   output[index]            = a + b
 *   output[index + distance] = twiddle_ptr[twiddle_index] * (a - b)
 * Both inputs are loaded before anything is stored, so input and output may
 * refer to the same buffer.
 *
 * @param[in]  input          complex input buffer
 * @param[out] output         complex output buffer
 * @param[in]  twiddle_index  index of the twiddle factor applied to the difference
 * @param[in]  index          position of the first butterfly leg
 * @param[in]  distance       element offset between the two legs
 * @param[in]  twiddle_ptr    twiddle factor table
 */
static inline void process_butterfly_radix2(    Complex_type_f32 *input,
                                                Complex_type_f32 *output,
                                                int twiddle_index,
                                                int index,
                                                int distance,
                                                Complex_type_f32 *twiddle_ptr) {
    const int partner = index + distance;

    /* Load both legs first. */
    const float32_t a_re = input[index].re;
    const float32_t b_re = input[partner].re;
    const float32_t a_im = input[index].im;
    const float32_t b_im = input[partner].im;

    Complex_type_f32 sum;
    Complex_type_f32 diff;

    sum.re  = a_re + b_re;
    diff.re = a_re - b_re;
    sum.im  = a_im + b_im;
    diff.im = a_im - b_im;

    const Complex_type_f32 tw = twiddle_ptr[twiddle_index];

    output[index]   = sum;
    output[partner] = complex_mul(tw, diff);
}

/**
 * @brief  Last-stage radix-2 butterfly (no twiddle multiplication).
 *
 * With a = input[0] and b = input[1]:
 *   output[outindex]     = a + b
 *   output[outindex + 1] = a - b
 * Both inputs are loaded before anything is stored.
 *
 * @param[in]  input     pointer to the two adjacent complex inputs
 * @param[out] output    complex output buffer
 * @param[in]  outindex  position in output of the first result
 */
static inline void process_butterfly_last_radix2 (  Complex_type_f32 *input,
                                                    Complex_type_f32 *output,
                                                    int outindex) {
    const float32_t a_re = input[0].re;
    const float32_t b_re = input[1].re;
    const float32_t a_im = input[0].im;
    const float32_t b_im = input[1].im;

    Complex_type_f32 sum;
    Complex_type_f32 diff;

    sum.re  = a_re + b_re;
    sum.im  = a_im + b_im;
    diff.re = a_re - b_re;
    diff.im = a_im - b_im;

    Complex_type_f32 *dst = output + outindex;
    dst[0] = sum;
    dst[1] = diff;
}

/**
 * @brief  Radix-2 FFT kernel for real input data.
 *
 * The first stage reads the real samples from pSrc and writes complex values
 * to pDst; every following stage works in place on pDst. pDst is accessed as
 * an array of S->FFTLength Complex_type_f32 elements.
 * If S->bitReverseFlag is set, the result is reordered at the end, using
 * S->pBitReverseLUT when it is non-NULL and bit_rev_radix2() otherwise.
 *
 * @param[in]  S     FFT instance (FFTLength, pTwiddleFactors, bitReverseFlag,
 *                   pBitReverseLUT are read)
 * @param[in]  pSrc  real input samples
 * @param[out] pDst  complex output buffer
 */
static void plp_radix2_rfft_f32s_xpulpv2(   const plp_fft_instance_f32 *S,
                                            const float32_t *__restrict__ pSrc,
                                            float32_t *__restrict__ pDst) {

    int j, step, d, index;

    Complex_type_f32 temp;
    int dist = S->FFTLength >> 1;       // offset between the two legs of a butterfly
    int nbutterfly = S->FFTLength >> 1; // butterflies in the first stage
    int butt = 2; // number of groups handled per stage, doubles every stage

    Complex_type_f32 *_in_ptr;
    Complex_type_f32 *_out_ptr;
    Complex_type_f32 *_tw_ptr;

    // FIRST STAGE (real input -> complex output)
    _out_ptr = (Complex_type_f32 *)pDst;
    _tw_ptr = (Complex_type_f32 *)S->pTwiddleFactors;

    for (j = 0; j < nbutterfly; j++) {
        process_butterfly_real_radix2(pSrc, _out_ptr, j, dist, _tw_ptr);
        pSrc++;
        _out_ptr++;
    } // j

    dist = dist >> 1;

    // STAGES 2 -> n-1 (in place on pDst)
    while (dist > 1) {
        step = dist << 1; // size of one group in elements
        for (j = 0; j < butt; j++) {
            _in_ptr = (Complex_type_f32 *)pDst;
            for (d = 0; d < dist; d++) {
                // effective element position is j * step + d
                process_butterfly_radix2(_in_ptr, _in_ptr, d * butt, j * step, dist, _tw_ptr);
                _in_ptr++;
            } // d
        }     // j
        dist = dist >> 1;
        butt = butt << 1;
    }

    // LAST STAGE (adjacent pairs, no twiddles)
    _in_ptr = (Complex_type_f32 *)pDst;
    index = 0;
    for (j = 0; j < (S->FFTLength >> 1); j++) {
        process_butterfly_last_radix2(_in_ptr, (Complex_type_f32 *)pDst, index);
        _in_ptr += 2;
        index += 2;
    } // j

    // ORDER VALUES
    if (S->bitReverseFlag) {
        int index1, index2, index3, index4;
        // Loop-invariant, so computed once; only needed when there is no LUT.
        // The length check keeps log2() from being evaluated for a length of 0,
        // for which the reorder loop below does not run anyway.
        int log2FFTLen = 0;
        if (!S->pBitReverseLUT && S->FFTLength > 0) {
            log2FFTLen = log2(S->FFTLength);
        }
        _out_ptr = (Complex_type_f32 *)pDst;
        for (j = 0; j < S->FFTLength; j += 4) {
            if (S->pBitReverseLUT) {
                // Two LUT entries are fetched per 32-bit load; the low half-word
                // is the entry for the lower position.
                // NOTE(review): assumes 16-bit LUT entries and little-endian
                // layout -- confirm against the LUT declaration.
                unsigned int index12 = *((unsigned int *)(&S->pBitReverseLUT[j]));
                unsigned int index34 = *((unsigned int *)(&S->pBitReverseLUT[j + 2]));
                index1 = index12 & 0x0000FFFF;
                index2 = index12 >> 16;
                index3 = index34 & 0x0000FFFF;
                index4 = index34 >> 16;
            } else {
                index1 = bit_rev_radix2(j, log2FFTLen);
                index2 = bit_rev_radix2(j + 1, log2FFTLen);
                index3 = bit_rev_radix2(j + 2, log2FFTLen);
                index4 = bit_rev_radix2(j + 3, log2FFTLen);
            }
            // Swap only when the partner lies ahead, so each pair is exchanged once.
            if (index1 > j) {
                temp = _out_ptr[j];
                _out_ptr[j] = _out_ptr[index1];
                _out_ptr[index1] = temp;
            }
            if (index2 > j + 1) {
                temp = _out_ptr[j + 1];
                _out_ptr[j + 1] = _out_ptr[index2];
                _out_ptr[index2] = temp;
            }
            if (index3 > j + 2) {
                temp = _out_ptr[j + 2];
                _out_ptr[j + 2] = _out_ptr[index3];
                _out_ptr[index3] = temp;
            }
            if (index4 > j + 3) {
                temp = _out_ptr[j + 3];
                _out_ptr[j + 3] = _out_ptr[index4];
                _out_ptr[index4] = temp;
            }
        }
    }
}

/* RADIX-4 */

/**
 * @brief  Reverse the order of the 2-bit digits of an index.
 *
 * Processes log2FFTLen / 2 digits, starting with the least significant one,
 * which ends up as the most significant digit of the result.
 *
 * @param  index       value whose base-4 digits are reversed
 * @param  log2FFTLen  base-2 logarithm of the FFT length
 * @return the digit-reversed index
 */
int bit_rev_radix4(int index, int log2FFTLen)
{
    unsigned int reversed = 0;
    int digits_left = log2FFTLen / 2;

    while (digits_left-- > 0) {
        /* make room for one digit and append the current lowest digit */
        reversed = (reversed << 2) | (index & 0x3);
        index >>= 2;
    }
    return reversed;
}

/**
 * @brief  First-stage radix-4 butterfly on real input samples.
 *
 * Reads x0..x3 = input[0], input[distance], input[2*distance],
 * input[3*distance] and forms
 *   b0 = (x0 + x1 + x2 + x3) + 0i
 *   b1 = (x0 - x2) + i(-x1 + x3)
 *   b2 = (x0 - x1 + x2 - x3)            (real)
 *   b3 = (x0 - x2) + i( x1 - x3)
 * then stores b0 unchanged and b1, b2, b3 multiplied by the twiddle factors
 * at twiddle_index, 2*twiddle_index and 3*twiddle_index.
 *
 * @param[in]  input          real samples
 * @param[out] output         complex results at offsets 0, distance, 2*distance, 3*distance
 * @param[in]  twiddle_index  base index into the twiddle table
 * @param[in]  distance       element offset between butterfly legs
 * @param[in]  twiddle_ptr    twiddle factor table
 */
static inline void process_butterfly_real_radix4 (  const float32_t *__restrict__ input, 
                                                    Complex_type_f32* output, 
                                                    int twiddle_index,
                                                    int distance, 
                                                    Complex_type_f32* twiddle_ptr) {
    const float32_t x0 = input[0];
    const float32_t x1 = input[distance];
    const float32_t x2 = input[2*distance];
    const float32_t x3 = input[3*distance];

    Complex_type_f32 b0, b1, b3;

    /* real parts */
    b0.re = x0 + x1 + x2 + x3;
    b1.re = x0 - x2;
    const float32_t b2_re = x0 - x1 + x2 - x3;
    b3.re = b1.re;

    /* imaginary parts; b0 and b2 are purely real */
    b0.im = 0.0f;
    b1.im = -x1 + x3;
    b3.im = x1 - x3;

    const Complex_type_f32 w1 = twiddle_ptr[twiddle_index];
    const Complex_type_f32 w2 = twiddle_ptr[twiddle_index*2];
    const Complex_type_f32 w3 = twiddle_ptr[twiddle_index*3];

    output[0]          = b0;
    output[distance]   = complex_mul(b1, w1);
    output[2*distance] = complex_mul_real(b2_re, w2);
    output[3*distance] = complex_mul(b3, w3);
}

/**
 * @brief  Intermediate-stage radix-4 butterfly on complex data.
 *
 * Loads four complex values at index, index+distance, index+2*distance and
 * index+3*distance, combines them, and stores the first result unchanged and
 * the other three multiplied by the twiddle factors at twiddle_index,
 * 2*twiddle_index and 3*twiddle_index. All inputs are loaded before anything
 * is stored, so input and output may refer to the same buffer.
 *
 * @param[in]  input          complex input buffer
 * @param[out] output         complex output buffer
 * @param[in]  twiddle_index  base index into the twiddle table
 * @param[in]  index          position of the first butterfly leg
 * @param[in]  distance       element offset between butterfly legs
 * @param[in]  twiddle_ptr    twiddle factor table
 */
static inline void process_butterfly_radix4 (   Complex_type_f32* input, 
                                                Complex_type_f32* output, 
                                                int twiddle_index, 
                                                int index, 
                                                int distance, 
                                                Complex_type_f32* twiddle_ptr) {
    const int leg1 = index + distance;
    const int leg2 = index + 2*distance;
    const int leg3 = index + 3*distance;

    /* real parts of the four legs */
    const float32_t a0 = input[index].re;
    const float32_t a1 = input[leg1].re;
    const float32_t a2 = input[leg2].re;
    const float32_t a3 = input[leg3].re;

    /* imaginary parts of the four legs */
    const float32_t b0 = input[index].im;
    const float32_t b1 = input[leg1].im;
    const float32_t b2 = input[leg2].im;
    const float32_t b3 = input[leg3].im;

    Complex_type_f32 out0, out1, out2, out3;

    out0.re = a0 + a1 + a2 + a3;
    out1.re = a0 - a2 - ( b3 - b1 );
    out2.re = a0 - a1 + a2 - a3;
    out3.re = a0 - a2 - ( b1 - b3 );

    out0.im = b0 + b1 + b2 + b3;
    /* the first output needs no twiddle factor and is stored right away */
    output[index] = out0;

    out1.im = -a1 + a3 + b0 - b2;
    out2.im = b0 - b1 + b2 - b3;
    out3.im = a1 - a3 + b0 - b2;

    const Complex_type_f32 w1 = twiddle_ptr[twiddle_index];
    const Complex_type_f32 w2 = twiddle_ptr[twiddle_index*2];
    const Complex_type_f32 w3 = twiddle_ptr[twiddle_index*3];

    output[leg1] = complex_mul(w1, out1);
    output[leg2] = complex_mul(w2, out2);
    output[leg3] = complex_mul(w3, out3);
}

/**
 * @brief  Last-stage radix-4 butterfly (no twiddle multiplication).
 *
 * Combines the four adjacent complex values input[0..3] and stores the four
 * results at output[outindex .. outindex+3]. All inputs are loaded before
 * anything is stored.
 *
 * @param[in]  input     pointer to four adjacent complex inputs
 * @param[out] output    complex output buffer
 * @param[in]  outindex  position in output of the first result
 */
static inline void process_butterfly_last_radix4 (  Complex_type_f32* input, 
                                                    Complex_type_f32* output, 
                                                    int outindex ) {
    /* real parts */
    const float32_t a0 = input[0].re;
    const float32_t a1 = input[1].re;
    const float32_t a2 = input[2].re;
    const float32_t a3 = input[3].re;
    /* imaginary parts */
    const float32_t b0 = input[0].im;
    const float32_t b1 = input[1].im;
    const float32_t b2 = input[2].im;
    const float32_t b3 = input[3].im;

    Complex_type_f32 out0, out1, out2, out3;

    out0.re = a0 + a1 + a2 + a3;
    out0.im = b0 + b1 + b2 + b3;

    out1.re = a0 - a2 - ( b3 - b1 );
    out1.im = -a1 + a3 + b0 - b2;

    out2.re = a0 - a1 + a2 - a3;
    out2.im = b0 - b1 + b2 - b3;

    out3.re = a0 - a2 - ( b1 - b3 );
    out3.im = a1 - a3 + b0 - b2;

    Complex_type_f32 *dst = output + outindex;
    dst[0] = out0;
    dst[1] = out1;
    dst[2] = out2;
    dst[3] = out3;
}

/**
 * @brief  Radix-4 FFT kernel for real input data.
 *
 * The first stage reads the real samples from pSrc and writes complex values
 * to pDst; every following stage works in place on pDst. pDst is accessed as
 * an array of S->FFTLength Complex_type_f32 elements.
 * If S->bitReverseFlag is set, the result is reordered at the end, using
 * S->pBitReverseLUT when it is non-NULL and bit_rev_radix4() otherwise.
 *
 * @param[in]  S     FFT instance (FFTLength, pTwiddleFactors, bitReverseFlag,
 *                   pBitReverseLUT are read)
 * @param[in]  pSrc  real input samples
 * @param[out] pDst  complex output buffer
 */
static void plp_radix4_rfft_f32s_xpulpv2(   const plp_fft_instance_f32 *S, 
                                            const float32_t *__restrict__ pSrc, 
                                            float32_t *__restrict__ pDst) {

    int j, step, d, index;

    Complex_type_f32 temp;
    int dist = S->FFTLength >> 2;       // offset between the legs of a butterfly
    int nbutterfly = S->FFTLength >> 2; // butterflies in the first stage
    int butt = 4; // number of groups handled per stage, quadruples every stage

    Complex_type_f32 *_in_ptr;
    Complex_type_f32 *_out_ptr;
    Complex_type_f32 *_tw_ptr;

    // FIRST STAGE (real input -> complex output)
    _out_ptr = (Complex_type_f32 *)pDst;
    _tw_ptr = (Complex_type_f32 *)S->pTwiddleFactors;

    for (j = 0; j < nbutterfly; j++) {
        process_butterfly_real_radix4(pSrc, _out_ptr, j, dist, _tw_ptr);
        pSrc++;
        _out_ptr++;
    } // j

    dist = dist >> 2;

    // STAGES 2 -> n-1 (in place on pDst)
    while (dist > 1) {
        step = dist << 2; // size of one group in elements
        for (j = 0; j < butt; j++) {
            _in_ptr = (Complex_type_f32 *)pDst;
            for (d = 0; d < dist; d++) {
                // effective element position is j * step + d
                process_butterfly_radix4(_in_ptr, _in_ptr, d * butt, j * step, dist, _tw_ptr);
                _in_ptr++;
            } // d
        }     // j
        dist = dist >> 2;
        butt = butt << 2;
    }

    // LAST STAGE (four adjacent values, no twiddles)
    _in_ptr = (Complex_type_f32 *)pDst;
    index = 0;
    for (j = 0; j < (S->FFTLength >> 2); j++) {
        process_butterfly_last_radix4(_in_ptr, (Complex_type_f32 *)pDst, index);
        _in_ptr += 4;
        index += 4;
    } // j

    // ORDER VALUES
    if (S->bitReverseFlag) {
        int index1, index2, index3, index4;
        // Loop-invariant, so computed once; only needed when there is no LUT.
        // The length check keeps log2() from being evaluated for a length of 0,
        // for which the reorder loop below does not run anyway.
        int log2FFTLen = 0;
        if (!S->pBitReverseLUT && S->FFTLength > 0) {
            log2FFTLen = log2(S->FFTLength);
        }
        _out_ptr = (Complex_type_f32 *)pDst;
        for (j = 0; j < S->FFTLength; j += 4) {
            if (S->pBitReverseLUT) {
                // Two LUT entries are fetched per 32-bit load; the low half-word
                // is the entry for the lower position.
                // NOTE(review): assumes 16-bit LUT entries and little-endian
                // layout -- confirm against the LUT declaration.
                unsigned int index12 = *((unsigned int *)(&S->pBitReverseLUT[j]));
                unsigned int index34 = *((unsigned int *)(&S->pBitReverseLUT[j + 2]));
                index1 = index12 & 0x0000FFFF;
                index2 = index12 >> 16;
                index3 = index34 & 0x0000FFFF;
                index4 = index34 >> 16;
            } else {
                index1 = bit_rev_radix4(j, log2FFTLen);
                index2 = bit_rev_radix4(j + 1, log2FFTLen);
                index3 = bit_rev_radix4(j + 2, log2FFTLen);
                index4 = bit_rev_radix4(j + 3, log2FFTLen);
            }
            // Swap only when the partner lies ahead, so each pair is exchanged once.
            if (index1 > j) {
                temp = _out_ptr[j];
                _out_ptr[j] = _out_ptr[index1];
                _out_ptr[index1] = temp;
            }
            if (index2 > j + 1) {
                temp = _out_ptr[j + 1];
                _out_ptr[j + 1] = _out_ptr[index2];
                _out_ptr[index2] = temp;
            }
            if (index3 > j + 2) {
                temp = _out_ptr[j + 2];
                _out_ptr[j + 2] = _out_ptr[index3];
                _out_ptr[index3] = temp;
            }
            if (index4 > j + 3) {
                temp = _out_ptr[j + 3];
                _out_ptr[j + 3] = _out_ptr[index4];
                _out_ptr[index4] = temp;
            }
        }
    }
}

/* RADIX-8 */

/**
 * @brief  Reverse the order of the 3-bit digits of an index.
 *
 * Processes log2FFTLen / 3 digits, starting with the least significant one,
 * which ends up as the most significant digit of the result.
 *
 * @param  index       value whose base-8 digits are reversed
 * @param  log2FFTLen  base-2 logarithm of the FFT length
 * @return the digit-reversed index
 */
int bit_rev_radix8(int index, int log2FFTLen)
{
    unsigned int reversed = 0;
    int digits_left = log2FFTLen / 3;

    while (digits_left-- > 0) {
        /* make room for one digit and append the current lowest digit */
        reversed = (reversed << 3) | (index & 0x7);
        index >>= 3;
    }
    return reversed;
}

/**
 * @brief  First-stage radix-8 butterfly on real input samples.
 *
 * Reads eight real samples at offsets 0, distance, ..., 7*distance, combines
 * them into eight complex values r0..r7 and stores r0 unchanged and r1..r7
 * multiplied by the twiddle factors at k*twiddle_index (k = 1..7).
 * r0 and r4 are purely real; r5, r6, r7 share the real part of r3, r2, r1
 * and carry the negated imaginary part.
 *
 * @param[in]  input          real samples
 * @param[out] output         complex results at offsets 0, distance, ..., 7*distance
 * @param[in]  twiddle_index  base index into the twiddle table
 * @param[in]  distance       element offset between butterfly legs
 * @param[in]  twiddle_ptr    twiddle factor table
 */
static inline void process_butterfly_real_radix8 (  const float32_t *__restrict__ input, 
                                                    Complex_type_f32* output, 
                                                    int twiddle_index,
                                                    int distance, 
                                                    Complex_type_f32* twiddle_ptr) {
    float32_t d0, d1, d2, d3, d4, d5, d6, d7;

    Complex_type_f32 r0, r1, r2, r3, r5, r6, r7;
    float32_t r4_re; /* r4 is purely real, only its real part is needed */
    Complex_type_f32 tw1, tw2, tw3, tw4, tw5, tw6, tw7;

    d0         = input[0];
    d1         = input[distance];
    d2         = input[2*distance];
    d3         = input[3*distance];
    d4         = input[4*distance];
    d5         = input[5*distance];
    d6         = input[6*distance];
    d7         = input[7*distance];

    // Basic butterfly rotation
    /*
    r0   1.0000 + 0.0000i   1.0000 + 0.0000i   1.0000 + 0.0000i   1.0000 + 0.0000i   1.0000 + 0.0000i   1.0000 + 0.0000i   1.0000 + 0.0000i   1.0000 + 0.0000i
    r1   1.0000 + 0.0000i   0.7071 - 0.7071i   0.0000 - 1.0000i  -0.7071 - 0.7071i  -1.0000 - 0.0000i  -0.7071 + 0.7071i   0.0000 + 1.0000i   0.7071 + 0.7071i
    r2   1.0000 + 0.0000i   0.0000 - 1.0000i  -1.0000 - 0.0000i   0.0000 + 1.0000i   1.0000 + 0.0000i   0.0000 - 1.0000i   -1.0000 - 0.0000i  0.0000 + 1.0000i
    r3   1.0000 + 0.0000i  -0.7071 - 0.7071i  -0.0000 + 1.0000i   0.7071 - 0.7071i  -1.0000 - 0.0000i   0.7071 + 0.7071i   0.0000 - 1.0000i  -0.7071 + 0.7071i
    r4   1.0000 + 0.0000i  -1.0000 - 0.0000i   1.0000 + 0.0000i  -1.0000 - 0.0000i   1.0000 + 0.0000i  -1.0000 - 0.0000i   1.0000 + 0.0000i  -1.0000 - 0.0000i
    r5   1.0000 + 0.0000i  -0.7071 + 0.7071i   0.0000 - 1.0000i   0.7071 + 0.7071i  -1.0000 - 0.0000i   0.7071 - 0.7071i  -0.0000 + 1.0000i  -0.7071 - 0.7071i
    r6   1.0000 + 0.0000i   0.0000 + 1.0000i  -1.0000 - 0.0000i   0.0000 - 1.0000i   1.0000 + 0.0000i   0.0000 + 1.0000i  -1.0000 - 0.0000i   0.0000 - 1.0000i
    r7   1.0000 + 0.0000i   0.7071 + 0.7071i  -0.0000 + 1.0000i  -0.7071 + 0.7071i  -1.0000 - 0.0000i  -0.7071 - 0.7071i   0.0000 - 1.0000i   0.7071 - 0.7071i
    */

    // Shared partial sums and differences
    float32_t d13 = d1 + d3;
    float32_t d1_3 = d1 - d3;
    float32_t d57 = d5 + d7;
    float32_t d_57 = d7 - d5;
    float32_t temp1 = ROT_CONST*(d1_3+d_57);
    float32_t temp2 = ROT_CONST*(d57-d13);
    float32_t d04   = d0 + d4;
    float32_t d0_4  = d0 - d4;
    float32_t d26  = d2 + d6;
    float32_t d_26  = d6 - d2;
    float32_t d0246 = d04 + d26;
    float32_t d1357 = d13 + d57;

    // Real parts
    r0.re = d0246 + d1357;
    r1.re = d0_4 + temp1;
    r7.re = r1.re;
    r2.re = d04 - d26;
    r6.re = r2.re;
    r3.re = d0_4 - temp1;
    r5.re = r3.re;
    r4_re = d0246 - d1357;

    // Imaginary parts
    // r0 is the plain sum of real samples, so its imaginary part is zero. It
    // must be set explicitly because the whole struct is copied to output[0]
    // and later stages read output[].im.
    r0.im = 0.0f;
    r1.im = d_26 + temp2;
    r2.im = d_57 - d1_3;
    r3.im =  temp2 - d_26;
    r5.im = -r3.im;
    r6.im = -r2.im;
    r7.im = -r1.im;

    // TWIDDLES
    tw1 = twiddle_ptr[twiddle_index*1];
    tw2 = twiddle_ptr[twiddle_index*2];
    tw3 = twiddle_ptr[twiddle_index*3];
    tw4 = twiddle_ptr[twiddle_index*4];
    tw5 = twiddle_ptr[twiddle_index*5];
    tw6 = twiddle_ptr[twiddle_index*6];
    tw7 = twiddle_ptr[twiddle_index*7];

    output[0]          = r0;
    output[distance]   = complex_mul(tw1, r1);
    output[2*distance] = complex_mul(tw2, r2);
    output[3*distance] = complex_mul(tw3, r3);
    output[4*distance] = complex_mul_real(r4_re, tw4);
    output[5*distance] = complex_mul(tw5, r5);
    output[6*distance] = complex_mul(tw6, r6);
    output[7*distance] = complex_mul(tw7, r7);
}

/**
 * @brief Radix-8 butterfly on complex samples, followed by twiddle scaling.
 *
 * Reads the eight complex values input[index + n*distance], n = 0..7, and
 * combines them with the 8x8 rotation matrix whose entry (row k, column n)
 * is exp(-2*pi*i*k*n/8).  With c = 0.7071 the rows are:
 *
 *   row 0:  1    1      1    1      1    1      1    1
 *   row 1:  1   c-ci   -i  -c-ci   -1  -c+ci   +i   c+ci
 *   row 2:  1   -i     -1   +i      1   -i     -1   +i
 *   row 3:  1  -c-ci   +i   c-ci   -1   c+ci   -i  -c+ci
 *   row 4:  1   -1      1   -1      1   -1      1   -1
 *   row 5:  1  -c+ci   -i   c+ci   -1   c-ci   +i  -c-ci
 *   row 6:  1   +i     -1   -i      1   +i     -1   -i
 *   row 7:  1   c+ci   +i  -c+ci   -1  -c-ci   -i   c-ci
 *
 * Row k of the product is multiplied by twiddle_ptr[k*twiddle_index] and
 * written to output[index + k*distance]; row 0 is written unscaled.
 * ROT_CONST is presumably the constant c = sqrt(2)/2 - confirm at its
 * definition.
 *
 * Every sample and twiddle is read before the first store, so output may be
 * the same buffer as input.
 *
 * @param input          source array of complex samples
 * @param output         destination array of complex samples
 * @param twiddle_index  stride between the twiddles used for rows 1..7
 * @param index          position of the first sample of the butterfly
 * @param distance       spacing between consecutive butterfly samples
 * @param twiddle_ptr    twiddle factor table
 */
static inline void process_butterfly_radix8 (   Complex_type_f32* input, 
                                                Complex_type_f32* output, 
                                                int twiddle_index, 
                                                int index, 
                                                int distance, 
                                                Complex_type_f32* twiddle_ptr) {
    float32_t re[8], im[8];
    Complex_type_f32 res[8];
    Complex_type_f32 tw[8]; /* tw[0] stays unused: row 0 needs no twiddle */
    int k;

    /* Gather the strided samples, split into real and imaginary parts. */
    for (k = 0; k < 8; k++) {
        re[k] = input[index + k * distance].re;
        im[k] = input[index + k * distance].im;
    }

    /* Twiddles for rows 1..7. */
    for (k = 1; k < 8; k++) {
        tw[k] = twiddle_ptr[twiddle_index * k];
    }

    /* Pairwise sums and differences shared by several output rows. */
    const float32_t re_sum13 = re[1] + re[3];
    const float32_t re_dif13 = re[1] - re[3];
    const float32_t im_sum13 = im[1] + im[3];
    const float32_t im_sum57 = im[5] + im[7];
    const float32_t im_dif13 = im[1] - im[3];
    const float32_t im_dif75 = im[7] - im[5];
    const float32_t re_sum57 = re[5] + re[7];
    const float32_t re_dif75 = re[7] - re[5];

    /* Contributions of the odd samples that carry the 0.7071 factor. */
    const float32_t rot_re_a = ROT_CONST * (re_dif13 + re_dif75);
    const float32_t rot_im_a = ROT_CONST * (im_sum57 - im_sum13);
    const float32_t rot_re_b = ROT_CONST * (re_sum57 - re_sum13);
    const float32_t rot_im_b = ROT_CONST * (im_dif13 + im_dif75);

    const float32_t re_sum04 = re[0] + re[4];
    const float32_t re_dif04 = re[0] - re[4];
    const float32_t re_sum26 = re[2] + re[6];
    const float32_t re_dif62 = re[6] - re[2];
    const float32_t re_even  = re_sum04 + re_sum26;
    const float32_t re_odd   = re_sum13 + re_sum57;
    const float32_t im_even  = im[0] + im[2] + im[4] + im[6];
    const float32_t im_dif04 = im[0] - im[4];
    const float32_t im_alt   = im[0] - im[2] + im[4] - im[6];
    const float32_t im_odd   = im_sum13 + im_sum57;
    const float32_t im_cross = im_dif75 - im_dif13;
    const float32_t im_dif26 = im[2] - im[6];
    const float32_t im_dif62 = im[6] - im[2];

    /*
     * Real parts, Re(a*b) = a.re*b.re - a.im*b.im.
     * Rows k and 8-k use conjugate matrix rows: they share the term built
     * from the real inputs and take the term built from the imaginary inputs
     * with opposite sign.
     */
    const float32_t base17_re = re_dif04 + rot_re_a;
    const float32_t adj17_re  = im_dif62 + rot_im_a;
    const float32_t base26_re = re_sum04 - re_sum26;
    const float32_t base35_re = re_dif04 - rot_re_a;
    const float32_t adj35_re  = im_dif26 + rot_im_a;

    res[0].re = re_even + re_odd;
    res[7].re = base17_re + adj17_re;
    res[1].re = base17_re - adj17_re;
    res[6].re = base26_re + im_cross;
    res[2].re = base26_re - im_cross;
    res[5].re = base35_re + adj35_re;
    res[3].re = base35_re - adj35_re;
    res[4].re = re_even - re_odd;

    /*
     * Imaginary parts, Im(a*b) = a.re*b.im + a.im*b.re.
     * Here the term built from the real inputs changes sign between rows k
     * and 8-k, while the term built from the imaginary inputs is shared.
     */
    const float32_t base17_im = re_dif62 + rot_re_b;
    const float32_t adj17_im  = im_dif04 + rot_im_b;
    const float32_t base26_im = re_dif75 - re_dif13;
    const float32_t base35_im = rot_re_b - re_dif62;
    const float32_t adj35_im  = im_dif04 - rot_im_b;

    res[0].im = im_even + im_odd;
    res[7].im = -base17_im + adj17_im;
    res[1].im =  base17_im + adj17_im;
    res[6].im = -base26_im + im_alt;
    res[2].im =  base26_im + im_alt;
    res[5].im = -base35_im + adj35_im;
    res[3].im =  base35_im + adj35_im;
    res[4].im = im_even - im_odd;

    /* Scatter: row 0 as is, rows 1..7 scaled by their twiddle. */
    output[index] = res[0];
    for (k = 1; k < 8; k++) {
        output[index + k * distance] = complex_mul(tw[k], res[k]);
    }
}

/**
 * @brief Radix-8 butterfly of the final FFT stage (no twiddle multiplication).
 *
 * Reads the eight consecutive complex values input[0..7], combines them with
 * the 8x8 rotation matrix whose entry (row k, column n) is
 * exp(-2*pi*i*k*n/8), and writes row k to output[outindex + k].  With
 * c = 0.7071 the rows are:
 *
 *   row 0:  1    1      1    1      1    1      1    1
 *   row 1:  1   c-ci   -i  -c-ci   -1  -c+ci   +i   c+ci
 *   row 2:  1   -i     -1   +i      1   -i     -1   +i
 *   row 3:  1  -c-ci   +i   c-ci   -1   c+ci   -i  -c+ci
 *   row 4:  1   -1      1   -1      1   -1      1   -1
 *   row 5:  1  -c+ci   -i   c+ci   -1   c-ci   +i  -c-ci
 *   row 6:  1   +i     -1   -i      1   +i     -1   -i
 *   row 7:  1   c+ci   +i  -c+ci   -1  -c-ci   -i   c-ci
 *
 * In the last stage all twiddle factors are 1, so the rows are stored
 * unscaled.  ROT_CONST is presumably the constant c = sqrt(2)/2 - confirm at
 * its definition.
 *
 * All eight samples are read before the first store, so output + outindex
 * may point at the same elements as input.
 *
 * @param input     first of the eight consecutive input samples
 * @param output    destination array
 * @param outindex  position in output of the first result
 */
static inline void process_butterfly_last_radix8 (  Complex_type_f32* input, 
                                                    Complex_type_f32* output, 
                                                    int outindex ) {
    float32_t re[8], im[8];
    Complex_type_f32 res[8];
    int k;

    /* Gather the samples, split into real and imaginary parts. */
    for (k = 0; k < 8; k++) {
        re[k] = input[k].re;
        im[k] = input[k].im;
    }

    /* Pairwise sums and differences shared by several output rows. */
    const float32_t re_sum13 = re[1] + re[3];
    const float32_t re_dif13 = re[1] - re[3];
    const float32_t im_sum13 = im[1] + im[3];
    const float32_t im_sum57 = im[5] + im[7];
    const float32_t im_dif13 = im[1] - im[3];
    const float32_t im_dif75 = im[7] - im[5];
    const float32_t re_sum57 = re[5] + re[7];
    const float32_t re_dif75 = re[7] - re[5];

    /* Contributions of the odd samples that carry the 0.7071 factor. */
    const float32_t rot_re_a = ROT_CONST * (re_dif13 + re_dif75);
    const float32_t rot_im_a = ROT_CONST * (im_sum57 - im_sum13);
    const float32_t rot_re_b = ROT_CONST * (re_sum57 - re_sum13);
    const float32_t rot_im_b = ROT_CONST * (im_dif13 + im_dif75);

    const float32_t re_sum04 = re[0] + re[4];
    const float32_t re_dif04 = re[0] - re[4];
    const float32_t re_sum26 = re[2] + re[6];
    const float32_t re_dif62 = re[6] - re[2];
    const float32_t re_even  = re_sum04 + re_sum26;
    const float32_t re_odd   = re_sum13 + re_sum57;
    const float32_t im_even  = im[0] + im[2] + im[4] + im[6];
    const float32_t im_dif04 = im[0] - im[4];
    const float32_t im_alt   = im[0] - im[2] + im[4] - im[6];
    const float32_t im_odd   = im_sum13 + im_sum57;
    const float32_t im_cross = im_dif75 - im_dif13;
    const float32_t im_dif26 = im[2] - im[6];
    const float32_t im_dif62 = im[6] - im[2];

    /*
     * Real parts, Re(a*b) = a.re*b.re - a.im*b.im.
     * Rows k and 8-k use conjugate matrix rows: they share the term built
     * from the real inputs and take the term built from the imaginary inputs
     * with opposite sign.
     */
    const float32_t base17_re = re_dif04 + rot_re_a;
    const float32_t adj17_re  = im_dif62 + rot_im_a;
    const float32_t base26_re = re_sum04 - re_sum26;
    const float32_t base35_re = re_dif04 - rot_re_a;
    const float32_t adj35_re  = im_dif26 + rot_im_a;

    res[0].re = re_even + re_odd;
    res[7].re = base17_re + adj17_re;
    res[1].re = base17_re - adj17_re;
    res[6].re = base26_re + im_cross;
    res[2].re = base26_re - im_cross;
    res[5].re = base35_re + adj35_re;
    res[3].re = base35_re - adj35_re;
    res[4].re = re_even - re_odd;

    /*
     * Imaginary parts, Im(a*b) = a.re*b.im + a.im*b.re.
     * Here the term built from the real inputs changes sign between rows k
     * and 8-k, while the term built from the imaginary inputs is shared.
     */
    const float32_t base17_im = re_dif62 + rot_re_b;
    const float32_t adj17_im  = im_dif04 + rot_im_b;
    const float32_t base26_im = re_dif75 - re_dif13;
    const float32_t base35_im = rot_re_b - re_dif62;
    const float32_t adj35_im  = im_dif04 - rot_im_b;

    res[0].im = im_even + im_odd;
    res[7].im = -base17_im + adj17_im;
    res[1].im =  base17_im + adj17_im;
    res[6].im = -base26_im + im_alt;
    res[2].im =  base26_im + im_alt;
    res[5].im = -base35_im + adj35_im;
    res[3].im =  base35_im + adj35_im;
    res[4].im = im_even - im_odd;

    /* Store the eight rows contiguously; no twiddle scaling in this stage. */
    for (k = 0; k < 8; k++) {
        output[outindex + k] = res[k];
    }
}

/**
 * @brief Radix-8 FFT of a real input sequence.
 *
 * The transform runs in three phases:
 *  - first stage: FFTLength/8 butterflies read the real input and write
 *    complex results to pDst;
 *  - middle stages: in-place radix-8 butterflies on pDst while the butterfly
 *    distance is larger than 1;
 *  - last stage: in-place butterflies on groups of 8 consecutive values,
 *    without twiddle multiplication.
 * If S->bitReverseFlag is set, the output is finally reordered by swapping
 * each element with its partner, taken from S->pBitReverseLUT when that
 * table is present and computed with bit_rev_radix8() otherwise.
 *
 * NOTE(review): the first and the last stage always both run, so the routine
 * appears to require FFTLength >= 64 - confirm that the dispatcher never
 * selects it for FFTLength == 8.
 *
 * @param[in]  S     FFT instance; FFTLength, pTwiddleFactors, bitReverseFlag
 *                   and pBitReverseLUT are read
 * @param[in]  pSrc  real input samples
 * @param[out] pDst  output buffer, accessed as FFTLength Complex_type_f32
 *                   elements
 */
static void plp_radix8_rfft_f32s_xpulpv2(   const plp_fft_instance_f32 *S, 
                                            const float32_t *__restrict__ pSrc, 
                                            float32_t *__restrict__ pDst) {

    int j, m, step, d, index;

    Complex_type_f32 temp;
    int dist = S->FFTLength >> 3;
    int nbutterfly = S->FFTLength >> 3;
    int butt = 8; // number of butterflies in the same group

    Complex_type_f32 *_in_ptr;
    Complex_type_f32 *_out_ptr;
    Complex_type_f32 *_tw_ptr;

    // FIRST STAGE: real input, complex output
    _out_ptr = (Complex_type_f32 *)pDst;
    _tw_ptr = (Complex_type_f32 *)S->pTwiddleFactors;

    for (j = 0; j < nbutterfly; j++) {
        process_butterfly_real_radix8(pSrc, _out_ptr, j, dist, _tw_ptr);
        pSrc++;
        _out_ptr++;
    } // j

    dist = dist >> 3;

    // STAGES 2 -> n-1: in place on pDst
    while (dist > 1) {
        step = dist << 3; // number of elements covered by one group
        for (j = 0; j < butt; j++) {
            _in_ptr = (Complex_type_f32 *)pDst;
            for (d = 0; d < dist; d++) {
                process_butterfly_radix8(_in_ptr, _in_ptr, d * butt, j * step, dist, _tw_ptr);
                _in_ptr++;
            } // d
        }     // j
        dist = dist >> 3;
        butt = butt << 3;
    }

    // LAST STAGE: groups of 8 consecutive elements, twiddles are all 1
    _in_ptr = (Complex_type_f32 *)pDst;
    index = 0;
    for (j = 0; j < (S->FFTLength >> 3); j++) {
        process_butterfly_last_radix8(_in_ptr, (Complex_type_f32 *)pDst, index);
        _in_ptr += 8;
        index += 8;
    } // j

    // ORDER VALUES
    if (S->bitReverseFlag) {
        int rev[4];
        // log2(FFTLength) is only needed when no LUT is available and does
        // not change inside the loop, so it is evaluated once up front.
        const int log2FFTLen = S->pBitReverseLUT ? 0 : (int)log2(S->FFTLength);

        _out_ptr = (Complex_type_f32 *)pDst;
        for (j = 0; j < S->FFTLength; j += 4) {
            // Partner positions of elements j .. j+3
            if (S->pBitReverseLUT) {
                // NOTE(review): each read fetches two neighbouring LUT entries
                // through an unsigned int pointer; this presumably relies on
                // 16-bit entries, little-endian layout and suitable
                // alignment - confirm against the LUT declaration.
                unsigned int index12 = *((unsigned int *)(&S->pBitReverseLUT[j]));
                unsigned int index34 = *((unsigned int *)(&S->pBitReverseLUT[j + 2]));
                rev[0] = index12 & 0x0000FFFF;
                rev[1] = index12 >> 16;
                rev[2] = index34 & 0x0000FFFF;
                rev[3] = index34 >> 16;
            } else {
                for (m = 0; m < 4; m++) {
                    rev[m] = bit_rev_radix8(j + m, log2FFTLen);
                }
            }
            // Swap only when the partner lies further ahead, so that every
            // pair is exchanged exactly once.
            for (m = 0; m < 4; m++) {
                if (rev[m] > j + m) {
                    temp = _out_ptr[j + m];
                    _out_ptr[j + m] = _out_ptr[rev[m]];
                    _out_ptr[rev[m]] = temp;
                }
            }
        }
    }
}


/* RADIX_2_4 */

/**
 * @brief Index permutation used when reordering the output of the mixed
 *        radix-2 / radix-4 FFT.
 *
 * Removes (log2FFTLen - 1) / 2 two-bit groups from the low end of @p value
 * and appends each one, in the order it is removed, to the low end of the
 * result.  The result is then shifted left by one more bit and OR-ed with
 * whatever is left of @p value.
 *
 * @param value       index to permute
 * @param log2FFTLen  number of index bits
 * @return            permuted index
 */
int bit_rev_2_4(int value, int log2FFTLen) {
    unsigned int permuted = 0;
    int groups_left = (log2FFTLen - 1) / 2;

    while (groups_left-- > 0) {
        /* move the lowest two bits of value to the bottom of the result */
        permuted = (permuted << 2) | (value & 0x3);
        value >>= 2;
    }

    /* the remaining bit(s) of value end up in the lowest position */
    return (permuted << 1) | value;
}

/**
 * @brief Mixed radix FFT of a real input sequence: one radix-2 stage followed
 *        by radix-4 stages.
 *
 * Stage 1 runs FFTLength/2 radix-2 butterflies that read the real input and
 * write complex values to pDst.  Each half of pDst is then transformed in
 * place by fft_radix4_mixed() (the same pointer is passed as its source and
 * destination).  When S->bitReverseFlag equals 1 the output is finally
 * reordered by rotating elements along chains of positions:
 *  - with a LUT, every group of log2(FFTLength) entries of
 *    S->pBitReverseLUT is one chain; a group starting with a zero word ends
 *    the reordering;
 *  - without a LUT, the chain of position j is built by applying
 *    bit_rev_2_4() repeatedly, and it is processed only when no position in
 *    it is larger than j, so each chain is handled once.
 * Along a chain the element at index[k-1] moves to index[k] and the element
 * at the last position moves to index[0].
 *
 * @param[in]  S     FFT instance; FFTLength, pTwiddleFactors, bitReverseFlag
 *                   and pBitReverseLUT are read
 * @param[in]  pSrc  real input samples
 * @param[out] pDst  output buffer, accessed as FFTLength Complex_type_f32
 *                   elements
 */
static void plp_radix_2by4_rfft_f32s_xpulpv2(   const plp_fft_instance_f32 *S, 
                                                const float32_t *__restrict__ pSrc, 
                                                float32_t * pDst) {
    int j;
    int dist = S->FFTLength >> 1;
    int nbutterfly = S->FFTLength >> 1;
    Complex_type_f32 * _out_ptr;
    Complex_type_f32 * _tw_ptr;
    Complex_type_f32 temp;

    // STAGE 1 (radix-2 on the real input)
    _out_ptr = (Complex_type_f32 *)pDst;
    _tw_ptr = (Complex_type_f32 *)(S->pTwiddleFactors);
    for (j = 0; j < nbutterfly; j++) {
        process_butterfly_real_radix2(pSrc, &_out_ptr[j], j, dist, _tw_ptr);
        pSrc++;
    }

    // STAGES 2 -> n (radix-4), one half-length transform per half of pDst
    for (j = 0; j < S->FFTLength; j += S->FFTLength/2) {
        fft_radix4_mixed(   (float32_t *)(_out_ptr + j), (float32_t *)(_out_ptr + j), 
                            (S -> FFTLength) / 2,
                            S -> pTwiddleFactors);
    }

    if ((S -> bitReverseFlag) == 1) {
        // Needed by both reordering variants, so evaluate it once.
        const int log2FFTLen = log2(S -> FFTLength);

        if (S -> pBitReverseLUT) {
            // One spare slot: LUT entries are fetched two at a time.
            unsigned int index[log2FFTLen + 1];

            for(j = 0; j < S->FFTLength; j += log2FFTLen) {
                unsigned int val;
                // Read a sequence of permutation from the LUT.
                // NOTE(review): two neighbouring LUT entries are fetched
                // through an unsigned int pointer; this presumably relies on
                // 16-bit entries, little-endian layout and suitable
                // alignment - confirm against the LUT declaration.
                val = *((unsigned int *)(&S->pBitReverseLUT[j]));
                if(val == 0) break;
                index[0] = val & 0x0000FFFF;
                index[1] = val >> 16;
                for(int k = 2; k < log2FFTLen + 1; k += 2) {
                  val = *((unsigned int *)(&S->pBitReverseLUT[j + k]));
                  index[k] = val & 0x0000FFFF;
                  index[k + 1] = val >> 16;
                }
                // Apply permutations
                temp = _out_ptr[index[log2FFTLen - 1]];
                for(int k = log2FFTLen - 1; k > 0; k--) {
                  _out_ptr[index[k]] = _out_ptr[index[k - 1]];
                }
                _out_ptr[index[0]] = temp;
            }
        } else {
            unsigned int index[log2FFTLen];

            for(j = 0; j < S -> FFTLength; j++) {
                int skip = 0;
                index[0] = j;
                // Find a closed set of permutations; give up as soon as a
                // position beyond j shows up, that chain is handled later.
                for (int k = 0; k < log2FFTLen - 1; k++) {
                    index[k + 1] = bit_rev_2_4(index[k], log2FFTLen);
                    if (index[k + 1] > j) { 
                        skip = 1; 
                        break;
                    }
                }
                // Apply permutations
                if (!skip) {
                    temp = _out_ptr[index[log2FFTLen - 1]];
                    for (int k = log2FFTLen - 1; k > 0; k--) {
                        _out_ptr[index[k]] = _out_ptr[index[k - 1]];
                    }
                    _out_ptr[index[0]] = temp;
                }
            }
        }
    }

}

/**
 * @brief Radix-4 complex FFT stages used by the mixed radix-2/radix-4 real
 *        FFT.
 *
 * The first and the middle stages run in place on pSrc; the last stage reads
 * groups of 4 consecutive values from pSrc and writes them to pDst at the
 * same offsets.
 *
 * pSrc and pDst are deliberately NOT restrict-qualified: the caller in this
 * file passes the same buffer for both, and the last stage reads through
 * pSrc while writing through pDst.  Promising the compiler that the two
 * never alias would make that call undefined behaviour.  Top-level parameter
 * qualifiers are not part of the function type, so this definition remains
 * compatible with an earlier prototype that still carries the qualifier.
 *
 * Twiddle indices are doubled (2*j, 2*d*butt), presumably because
 * pTwiddleFactors is the table of the enclosing transform of twice this
 * length - confirm against the table generator.
 *
 * NOTE(review): the first and the last stage always both run, so the routine
 * appears to require FFTLength >= 16 - confirm that callers never pass 4.
 *
 * @param[in,out] pSrc             complex input, interleaved, FFTLength
 *                                 Complex_type_f32 elements; overwritten
 * @param[out]    pDst             complex output, may be the same buffer as
 *                                 pSrc
 * @param[in]     FFTLength        number of complex points
 * @param[in]     pTwiddleFactors  twiddle factor table
 */
static void fft_radix4_mixed(   float32_t *pSrc, 
                                float32_t *pDst,
                                uint32_t FFTLength,
                                const float32_t *pTwiddleFactors) {

    int j, step, d, index;
    int dist = FFTLength >> 2;
    int nbutterfly = FFTLength >> 2;
    int butt = 4; // number of butterflies in the same group
    Complex_type_f32 *_in_ptr;
    Complex_type_f32 *_tw_ptr;

    // FIRST STAGE
    _in_ptr  = (Complex_type_f32 *)pSrc;
    _tw_ptr = (Complex_type_f32 *)pTwiddleFactors;
    for (j = 0; j < nbutterfly; j++) {
        process_butterfly_radix4(_in_ptr, _in_ptr, 2 * j, 0, dist, _tw_ptr);
        _in_ptr++;
    } // j
    dist = dist >> 2;

    // STAGES 2 -> n-1
    while (dist > 1) {
        step = dist << 2; // number of elements covered by one group

        for (j = 0; j < butt; j++) {
            _in_ptr  = (Complex_type_f32 *)pSrc;
            for (d = 0; d < dist; d++) {
                process_butterfly_radix4(_in_ptr, _in_ptr, 2 * d * butt, j * step, dist, _tw_ptr);
                _in_ptr++;
            } // d
        }     // j
        dist = dist >> 2;
        butt = butt << 2;

    }

    // LAST STAGE: groups of 4 consecutive elements
    _in_ptr = (Complex_type_f32 *)pSrc;
    index = 0;
    for (j = 0; j < (FFTLength >> 2); j++) {
        process_butterfly_last_radix4(_in_ptr, (Complex_type_f32 *)pDst, index);
        _in_ptr += 4;
        index += 4;
    } // j
}

Updated on 2023-03-01 at 16:16:33 +0000