/github/workspace/src/TransformFunctions/kernels/plp_cfft_f32p_xpulpv2.c
Functions
Name | |
---|---|
int | bit_rev_radix2(int index, int log2FFTLen) |
int | bit_rev_radix4(int index, int log2FFTLen) |
int | bit_rev_radix8(int index, int log2FFTLen) |
Complex_type_f32 | complex_mul(Complex_type_f32 A, Complex_type_f32 B) |
void | process_butterfly_radix2(Complex_type_f32 * input, Complex_type_f32 * output, int twiddle_index, int index, int distance, Complex_type_f32 * twiddle_ptr) |
void | process_butterfly_last_radix2(Complex_type_f32 * input, Complex_type_f32 * output, int outindex) |
void | process_butterfly_radix4(Complex_type_f32 * input, Complex_type_f32 * output, int twiddle_index, int index, int distance, Complex_type_f32 * twiddle_ptr) |
void | process_butterfly_last_radix4(Complex_type_f32 * input, Complex_type_f32 * output, int outindex) |
void | process_butterfly_radix8(Complex_type_f32 * input, Complex_type_f32 * output, int twiddle_index, int index, int distance, Complex_type_f32 * twiddle_ptr) |
void | process_butterfly_last_radix8(Complex_type_f32 * input, Complex_type_f32 * output, int outindex) |
void | plp_cfft_radix2_f32p_xpulpv2(void * arg) |
void | plp_cfft_radix4_f32p_xpulpv2(void * arg) |
void | plp_cfft_radix8_f32p_xpulpv2(void * arg) |
void | plp_cfft_f32p_xpulpv2(void * arg) Floating-point FFT on complex input data for XPULPV2 extension. |
Attributes
Name | |
---|---|
HAL_CL_L1 float32_t | ROT_CONST |
Defines
Name | |
---|---|
MAX(x, y) | |
MIN(x, y) |
Functions Documentation
function bit_rev_radix2
int bit_rev_radix2(
int index,
int log2FFTLen
)
end of complexFFTKernels group
end of realFFTKernels group
function bit_rev_radix4
int bit_rev_radix4(
int index,
int log2FFTLen
)
function bit_rev_radix8
int bit_rev_radix8(
int index,
int log2FFTLen
)
function complex_mul
static inline Complex_type_f32 complex_mul(
Complex_type_f32 A,
Complex_type_f32 B
)
function process_butterfly_radix2
static inline void process_butterfly_radix2(
Complex_type_f32 * input,
Complex_type_f32 * output,
int twiddle_index,
int index,
int distance,
Complex_type_f32 * twiddle_ptr
)
function process_butterfly_last_radix2
static inline void process_butterfly_last_radix2(
Complex_type_f32 * input,
Complex_type_f32 * output,
int outindex
)
function process_butterfly_radix4
static inline void process_butterfly_radix4(
Complex_type_f32 * input,
Complex_type_f32 * output,
int twiddle_index,
int index,
int distance,
Complex_type_f32 * twiddle_ptr
)
function process_butterfly_last_radix4
static inline void process_butterfly_last_radix4(
Complex_type_f32 * input,
Complex_type_f32 * output,
int outindex
)
function process_butterfly_radix8
static inline void process_butterfly_radix8(
Complex_type_f32 * input,
Complex_type_f32 * output,
int twiddle_index,
int index,
int distance,
Complex_type_f32 * twiddle_ptr
)
function process_butterfly_last_radix8
static inline void process_butterfly_last_radix8(
Complex_type_f32 * input,
Complex_type_f32 * output,
int outindex
)
function plp_cfft_radix2_f32p_xpulpv2
static void plp_cfft_radix2_f32p_xpulpv2(
void * arg
)
function plp_cfft_radix4_f32p_xpulpv2
static void plp_cfft_radix4_f32p_xpulpv2(
void * arg
)
function plp_cfft_radix8_f32p_xpulpv2
static void plp_cfft_radix8_f32p_xpulpv2(
void * arg
)
function plp_cfft_f32p_xpulpv2
void plp_cfft_f32p_xpulpv2(
void * arg
)
Floating-point FFT on complex input data for XPULPV2 extension.
Parameters:
- arg points to an instance of the floating-point FFT structure
Return: none
Floating-point FFT on complex input data for XPULPV2 extension (parallel version).
Attributes Documentation
variable ROT_CONST
static HAL_CL_L1 float32_t ROT_CONST = 0.707106781f;
Macros Documentation
define MAX
#define MAX(
x,
y
)
(((x) > (y)) ? (x) : (y))
define MIN
#define MIN(
x,
y
)
(((x) < (y)) ? (x) : (y))
Source code
/* =====================================================================
* Project: PULP DSP Library
* Title: plp_cfft_f32p_xpulpv2.c
* Description: Floating-point FFT on complex input data for XPULPV2
*
* $Date: 10. August 2020
* $Revision: V1
*
* Target Processor: PULP cores with "F" support (wolfe, vega)
* ===================================================================== */
/*
* Copyright (C) 2020 ETH Zurich and University of Bologna. All rights reserved.
*
* Author: Giuseppe Tagliavini, University of Bologna
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plp_math.h"
static HAL_CL_L1 float32_t ROT_CONST = 0.707106781f;
/* HELPER FUNCTIONS */
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
int bit_rev_radix2(int index, int log2FFTLen);
int bit_rev_radix4(int index, int log2FFTLen);
int bit_rev_radix8(int index, int log2FFTLen);
static inline Complex_type_f32 complex_mul(Complex_type_f32 A, Complex_type_f32 B);
static inline void process_butterfly_radix2(Complex_type_f32 *input,
Complex_type_f32 *output,
int twiddle_index,
int index,
int distance,
Complex_type_f32 *twiddle_ptr);
static inline void process_butterfly_last_radix2(Complex_type_f32 *input, Complex_type_f32 *output, int outindex);
static inline void process_butterfly_radix4(Complex_type_f32 *input,
Complex_type_f32 *output,
int twiddle_index,
int index,
int distance,
Complex_type_f32 *twiddle_ptr);
static inline void process_butterfly_last_radix4(Complex_type_f32 *input, Complex_type_f32 *output, int outindex);
static inline void process_butterfly_radix8(Complex_type_f32 *input,
Complex_type_f32 *output,
int twiddle_index,
int index,
int distance,
Complex_type_f32 *twiddle_ptr);
static inline void process_butterfly_last_radix8(Complex_type_f32 *input, Complex_type_f32 *output, int outindex);
static void plp_cfft_radix2_f32p_xpulpv2(void *arg);
static void plp_cfft_radix4_f32p_xpulpv2(void *arg);
static void plp_cfft_radix8_f32p_xpulpv2(void *arg);
void plp_cfft_f32p_xpulpv2(void *arg) {
switch( (((plp_cfft_instance_f32_parallel*)arg)->S)->fftLen ) {
case 64:
case 512:
plp_cfft_radix8_f32p_xpulpv2(arg);
break;
case 16:
case 256:
case 1024:
plp_cfft_radix4_f32p_xpulpv2(arg);
break;
case 32:
case 128:
case 2048:
plp_cfft_radix2_f32p_xpulpv2(arg);
break;
}
}
void plp_cfft_radix2_f32p_xpulpv2(void *arg) {
int k, j, stage, step, d, index;
plp_cfft_instance_f32 *S = ((plp_cfft_instance_f32_parallel*)arg)->S;
const float32_t *pSrc = ((plp_cfft_instance_f32_parallel*)arg)->pSrc;
const uint32_t ifftFlag = ((plp_cfft_instance_f32_parallel*)arg)->ifftFlag;
const uint32_t bitReverseFlag = ((plp_cfft_instance_f32_parallel*)arg)->bitReverseFlag;
const uint32_t nPE = ((plp_cfft_instance_f32_parallel*)arg)->nPE;
Complex_type_f32 temp;
int dist = S->fftLen >> 1;
int nbutterfly = S->fftLen >> 1;
int butt = 2; // number of butterflies in the same group
Complex_type_f32 *_in_ptr;
Complex_type_f32 *_out_ptr;
Complex_type_f32 *_tw_ptr;
int core_id = hal_core_id();
// FIRST STAGE
stage = 1;
_in_ptr = (Complex_type_f32 *)&pSrc[2 * core_id];
_out_ptr = (Complex_type_f32 *)&pSrc[2 * core_id];
_tw_ptr = (Complex_type_f32 *)S->pTwiddle;
for (j = 0; j < nbutterfly/nPE; j++) {
process_butterfly_radix2(_in_ptr, _out_ptr, j * nPE + core_id, 0, dist, _tw_ptr);
_in_ptr += nPE;
_out_ptr += nPE;
} // j
stage = stage + 1;
dist = dist >> 1;
hal_team_barrier();
// STAGES 2 -> n-1
// As long as the number of processing elements is larger than the distance between couples
// the processing elements operate on multiple couples in parallel and are shifted across
// different butterflies
while (dist > nPE / 2) {
step = dist << 1; // The distance between elements of couples is half of the step between corresponding couples in different butterflies
for (j = 0; j < butt; j++) {
_in_ptr = (Complex_type_f32 *)&pSrc[2 * core_id]; //Takes a factor two because they are complex numbers
for (d = 0; d < dist / nPE; d++) {
process_butterfly_radix2(_in_ptr, _in_ptr, (d * nPE + core_id) * butt, j * step, dist, _tw_ptr);
_in_ptr += nPE;
} // d
} // j
stage = stage + 1;
dist = dist >> 1;
butt = butt << 1;
}
hal_team_barrier();
// The butterflies are divided between the processing elements
while (dist > 1) {
step = dist << 1;
if(butt<nPE) {
if (core_id<butt) {
_in_ptr = (Complex_type_f32 *)pSrc;
for (d = 0; d < dist; d++) {
process_butterfly_radix2(_in_ptr, _in_ptr, butt * d, core_id*step, dist, _tw_ptr);
_in_ptr++;
} // d
}
} else {
for (j = 0; j < butt/nPE; j++) {
_in_ptr = (Complex_type_f32 *)pSrc;
for (d = 0; d < dist; d++) {
process_butterfly_radix2(_in_ptr, _in_ptr, butt * d, (j * nPE + core_id) * step, dist, _tw_ptr);
_in_ptr++;
} // d
}// j
}
stage = stage + 1;
dist = dist >> 1;
butt = butt << 1;
hal_team_barrier();
}
// LAST STAGE
_in_ptr = (Complex_type_f32 *)&pSrc[4 * core_id];
index = 2 * core_id;
for (j = 0; j < S->fftLen / (2 * nPE); j++) {
process_butterfly_last_radix2(_in_ptr, (Complex_type_f32 *)pSrc, index);
_in_ptr += 2 * nPE;
index += 2 * nPE;
} // j
hal_team_barrier();
// ORDER VALUES
if (bitReverseFlag) {
int index1, index2, index3, index4;
_out_ptr = (Complex_type_f32 *)pSrc;
for (j = 4 * core_id; j < S->fftLen; j += nPE * 4) {
if (S->pBitRevTable) {
unsigned int index12 = *((unsigned int *)(&S->pBitRevTable[j]));
unsigned int index34 = *((unsigned int *)(&S->pBitRevTable[j + 2]));
index1 = index12 & 0x0000FFFF;
index2 = index12 >> 16;
index3 = index34 & 0x0000FFFF;
index4 = index34 >> 16;
} else {
int log2FFTLen = log2(S->fftLen);
index1 = bit_rev_radix2(j, log2FFTLen);
index2 = bit_rev_radix2(j + 1, log2FFTLen);
index3 = bit_rev_radix2(j + 2, log2FFTLen);
index4 = bit_rev_radix2(j + 3, log2FFTLen);
}
if (index1 > j) {
temp = _out_ptr[j];
_out_ptr[j] = _out_ptr[index1];
_out_ptr[index1] = temp;
}
if (index2 > j + 1) {
temp = _out_ptr[j + 1];
_out_ptr[j + 1] = _out_ptr[index2];
_out_ptr[index2] = temp;
}
if (index3 > j + 2) {
temp = _out_ptr[j + 2];
_out_ptr[j + 2] = _out_ptr[index3];
_out_ptr[index3] = temp;
}
if (index4 > j + 3) {
temp = _out_ptr[j + 3];
_out_ptr[j + 3] = _out_ptr[index4];
_out_ptr[index4] = temp;
}
}
hal_team_barrier();
}
}
void plp_cfft_radix4_f32p_xpulpv2(void *arg) {
int k, j, stage, step, d, index;
plp_cfft_instance_f32 *S = ((plp_cfft_instance_f32_parallel*)arg)->S;
const float32_t *pSrc = ((plp_cfft_instance_f32_parallel*)arg)->pSrc;
const uint32_t ifftFlag = ((plp_cfft_instance_f32_parallel*)arg)->ifftFlag;
const uint32_t bitReverseFlag = ((plp_cfft_instance_f32_parallel*)arg)->bitReverseFlag;
const uint32_t nPE = ((plp_cfft_instance_f32_parallel*)arg)->nPE;
Complex_type_f32 temp;
int dist = S->fftLen >> 2;
int nbutterfly = S->fftLen >> 2;
int butt = 4; // number of butterflies in the same group
Complex_type_f32 *_in_ptr;
Complex_type_f32 *_out_ptr;
Complex_type_f32 *_tw_ptr;
int core_id = hal_core_id();
// FIRST STAGE
stage = 1;
_in_ptr = (Complex_type_f32 *)&pSrc[2 * core_id];
_out_ptr = (Complex_type_f32 *)&pSrc[2 * core_id];
_tw_ptr = (Complex_type_f32 *)S->pTwiddle;
int first_steps = nbutterfly / nPE;
if (first_steps == 0) {
// Fix for case FFTLen == 16 && nPE > 4
first_steps = (core_id < 4)? 1: 0;
}
for (j = 0; j < first_steps; j++) {
process_butterfly_radix4(_in_ptr, _out_ptr, j * nPE + core_id, 0, dist, _tw_ptr);
_in_ptr += nPE;
_out_ptr += nPE;
} // j
stage = stage + 1;
dist = dist >> 2;
// STAGES 2 -> n-1
while (dist >= nPE) {
hal_team_barrier();
step = dist << 2;
for (j = 0; j < butt; j++) {
_in_ptr = (Complex_type_f32 *)&pSrc[2 * core_id];
for (d = 0; d < dist / nPE; d++) {
process_butterfly_radix4(_in_ptr, _in_ptr, (d * nPE + core_id) * butt, j * step, dist, _tw_ptr);
_in_ptr += nPE;
} // d
} // j
stage = stage + 1;
dist = dist >> 2;
butt = butt << 2;
}
while (dist > 1) {
hal_team_barrier();
step = dist << 2;
for (j = 0; j < butt / nPE; j++) {
_in_ptr = (Complex_type_f32 *)pSrc;
for (d = 0; d < dist; d++) {
process_butterfly_radix4(_in_ptr, _in_ptr, butt * d, (j * nPE + core_id) * step, dist, _tw_ptr);
_in_ptr++;
} // d
} // j
stage = stage + 1;
dist = dist >> 2;
butt = butt << 2;
}
hal_team_barrier();
// LAST STAGE
_in_ptr = (Complex_type_f32 *)&pSrc[8 * core_id];
index = 4 * core_id;
int last_steps = S->fftLen / (4 * nPE);
if (last_steps == 0) {
// Fix for case FFTLen == 16 && nPE > 4
last_steps = (core_id < 4)? 1: 0;
}
for (j = 0; j < last_steps; j++) {
process_butterfly_last_radix4(_in_ptr, (Complex_type_f32 *)pSrc, index);
_in_ptr += 4 * nPE;
index += 4 * nPE;
} // j
hal_team_barrier();
// ORDER VALUES
if (bitReverseFlag) {
int index1, index2, index3, index4;
_out_ptr = (Complex_type_f32 *)pSrc;
for (j = 4 * core_id; j < S->fftLen; j += nPE * 4) {
if (bitReverseFlag) {
unsigned int index12 = *((unsigned int *)(&S->pBitRevTable[j]));
unsigned int index34 = *((unsigned int *)(&S->pBitRevTable[j + 2]));
index1 = index12 & 0x0000FFFF;
index2 = index12 >> 16;
index3 = index34 & 0x0000FFFF;
index4 = index34 >> 16;
} else {
int log2FFTLen = log2(S->fftLen);
index1 = bit_rev_radix4(j, log2FFTLen);
index2 = bit_rev_radix4(j + 1, log2FFTLen);
index3 = bit_rev_radix4(j + 2, log2FFTLen);
index4 = bit_rev_radix4(j + 3, log2FFTLen);
}
if (index1 > j) {
temp = _out_ptr[j];
_out_ptr[j] = _out_ptr[index1];
_out_ptr[index1] = temp;
}
if (index2 > j + 1) {
temp = _out_ptr[j + 1];
_out_ptr[j + 1] = _out_ptr[index2];
_out_ptr[index2] = temp;
}
if (index3 > j + 2) {
temp = _out_ptr[j + 2];
_out_ptr[j + 2] = _out_ptr[index3];
_out_ptr[index3] = temp;
}
if (index4 > j + 3) {
temp = _out_ptr[j + 3];
_out_ptr[j + 3] = _out_ptr[index4];
_out_ptr[index4] = temp;
}
}
hal_team_barrier();
}
}
void plp_cfft_radix8_f32p_xpulpv2(void *arg) {
int k, j, stage, step, d, index;
plp_cfft_instance_f32 *S = ((plp_cfft_instance_f32_parallel*)arg)->S;
const float32_t *pSrc = ((plp_cfft_instance_f32_parallel*)arg)->pSrc;
const uint32_t ifftFlag = ((plp_cfft_instance_f32_parallel*)arg)->ifftFlag;
const uint32_t bitReverseFlag = ((plp_cfft_instance_f32_parallel*)arg)->bitReverseFlag;
const uint32_t nPE = ((plp_cfft_instance_f32_parallel*)arg)->nPE;
Complex_type_f32 temp;
int dist = S->fftLen >> 3;
int nbutterfly = S->fftLen >> 3;
int butt = 8; // number of butterflies in the same group
Complex_type_f32 *_in_ptr;
Complex_type_f32 *_out_ptr;
Complex_type_f32 *_tw_ptr;
int core_id = hal_core_id();
// FIRST STAGE
stage = 1;
_in_ptr = (Complex_type_f32 *)&pSrc[2 * core_id];
_out_ptr = (Complex_type_f32 *)&pSrc[2 * core_id];
_tw_ptr = (Complex_type_f32 *)S->pTwiddle;
for (j = 0; j < nbutterfly / nPE; j++) {
process_butterfly_radix8(_in_ptr, _out_ptr, j * nPE + core_id, 0, dist, _tw_ptr);
_in_ptr += nPE;
_out_ptr += nPE;
} // j
stage = stage + 1;
dist = dist >> 3;
// STAGES 2 -> n-1
while (dist >= nPE) {
hal_team_barrier();
step = dist << 3;
for (j = 0; j < butt; j++) {
_in_ptr = (Complex_type_f32 *)&pSrc[2 * core_id];
for (d = 0; d < dist / nPE; d++) {
process_butterfly_radix8(_in_ptr, _in_ptr, (d * nPE + core_id) * butt, j * step, dist,
_tw_ptr);
_in_ptr += nPE;
} // d
} // j
stage = stage + 1;
dist = dist >> 3;
butt = butt << 3;
}
while (dist > 1) {
hal_team_barrier();
step = dist << 3;
for (j = 0; j < butt / nPE; j++) {
_in_ptr = _in_ptr = (Complex_type_f32 *)pSrc;
;
for (d = 0; d < dist; d++) {
process_butterfly_radix8(_in_ptr, _in_ptr, butt * d, (j * nPE + core_id) * step, dist,
_tw_ptr);
_in_ptr++;
} // d
} // j
stage = stage + 1;
dist = dist >> 3;
butt = butt << 3;
}
hal_team_barrier();
// LAST STAGE
_in_ptr = (Complex_type_f32 *)&pSrc[16 * core_id];
index = 8 * core_id;
for (j = 0; j < S->fftLen / (8 * nPE); j++) {
process_butterfly_last_radix8(_in_ptr, (Complex_type_f32 *)pSrc, index);
_in_ptr += 8 * nPE;
index += 8 * nPE;
} // j
hal_team_barrier();
// ORDER VALUES
if (bitReverseFlag) {
int index1, index2, index3, index4;
_out_ptr = (Complex_type_f32 *)pSrc;
for (j = 4 * core_id; j < S->fftLen; j += nPE * 4) {
if (S->pBitRevTable) {
unsigned int index12 = *((unsigned int *)(&S->pBitRevTable[j]));
unsigned int index34 = *((unsigned int *)(&S->pBitRevTable[j + 2]));
index1 = index12 & 0x0000FFFF;
index2 = index12 >> 16;
index3 = index34 & 0x0000FFFF;
index4 = index34 >> 16;
} else {
int log2FFTLen = log2(S->fftLen);
index1 = bit_rev_radix8(j, log2FFTLen);
index2 = bit_rev_radix8(j + 1, log2FFTLen);
index3 = bit_rev_radix8(j + 2, log2FFTLen);
index4 = bit_rev_radix8(j + 3, log2FFTLen);
}
if (index1 > j) {
temp = _out_ptr[j];
_out_ptr[j] = _out_ptr[index1];
_out_ptr[index1] = temp;
}
if (index2 > j + 1) {
temp = _out_ptr[j + 1];
_out_ptr[j + 1] = _out_ptr[index2];
_out_ptr[index2] = temp;
}
if (index3 > j + 2) {
temp = _out_ptr[j + 2];
_out_ptr[j + 2] = _out_ptr[index3];
_out_ptr[index3] = temp;
}
if (index4 > j + 3) {
temp = _out_ptr[j + 3];
_out_ptr[j + 3] = _out_ptr[index4];
_out_ptr[index4] = temp;
}
}
hal_team_barrier();
}
}
int bit_rev_radix2(int index, int log2FFTLen) {
int i;
unsigned int revNum = 0;
for (i = 0; i < log2FFTLen; i++) {
unsigned int temp = (index & (1 << i));
if (temp != 0)
revNum |= (1 << ((log2FFTLen - 1) - i));
}
return revNum;
}
int bit_rev_radix4(int index, int log2FFTLen) //digit reverse 2 bit blocks
{
int i;
unsigned int revNum = 0;
for (i = 0; i < log2FFTLen/2; i++)
{
revNum <<= 2;
revNum |= (index & 0x3);
index >>= 2;
}
return revNum;
}
int bit_rev_radix8(int index, int log2FFTLen) //digit reverse 3 bit blocks
{
int i;
unsigned int revNum = 0;
for (i = 0; i < log2FFTLen/3; i++)
{
revNum <<= 3;
revNum |= (index & 0x7);
index >>= 3;
}
return revNum;
}
static inline Complex_type_f32 complex_mul(Complex_type_f32 A, Complex_type_f32 B) {
Complex_type_f32 result;
result.re = A.re * B.re - A.im * B.im;
result.im = A.re * B.im + A.im * B.re;
return result;
}
static inline void process_butterfly_radix2(Complex_type_f32 *input,
Complex_type_f32 *output,
int twiddle_index,
int index,
int distance,
Complex_type_f32 *twiddle_ptr) {
Complex_type_f32 r0, r1;
float32_t d0 = input[index].re;
float32_t d1 = input[index + distance].re;
float32_t e0 = input[index].im;
float32_t e1 = input[index + distance].im;
// Re(c1*c2) = c1.re*c2.re - c1.im*c2.im
r0.re = d0 + d1;
r1.re = d0 - d1;
// Im(c1*c2) = c1.re*c2.im + c1.im*c2.re
r0.im = e0 + e1;
r1.im = e0 - e1;
Complex_type_f32 tw0 = twiddle_ptr[twiddle_index];
output[index] = r0;
output[index + distance] = complex_mul(tw0, r1);
}
static inline void
process_butterfly_last_radix2(Complex_type_f32 *input, Complex_type_f32 *output, int outindex) {
int index = 0;
Complex_type_f32 r0, r1;
float32_t d0 = input[index].re;
float32_t d1 = input[index + 1].re;
float32_t e0 = input[index].im;
float32_t e1 = input[index + 1].im;
// Re(c1*c2) = c1.re*c2.re - c1.im*c2.im
r0.re = d0 + d1;
r1.re = d0 - d1;
// Im(c1*c2) = c1.re*c2.im + c1.im*c2.re
r0.im = e0 + e1;
r1.im = e0 - e1;
/* In the Last step, twiddle factors are all 1 */
output[outindex] = r0;
output[outindex + 1] = r1;
}
static inline void process_butterfly_radix4 (Complex_type_f32* input, Complex_type_f32* output, int twiddle_index, int index, int distance, Complex_type_f32* twiddle_ptr)
{
Complex_type_f32 r0, r1, r2, r3;
float32_t d0, d1, d2, d3;
float32_t e0, e1, e2, e3;
Complex_type_f32 tw1, tw2, tw3;
d0 = input[index].re;
d1 = input[index+distance].re;
d2 = input[index+2*distance].re;
d3 = input[index+3*distance].re;
e0 = input[index].im;
e1 = input[index+distance].im;
e2 = input[index+2*distance].im;
e3 = input[index+3*distance].im;
// Basic buttefly rotation
/*
r0 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i
r1 1.0000 + 0.0000i 0.7071 - 0.7071i 0.0000 - 1.0000i -0.7071 - 0.7071i -1.0000 - 0.0000i -0.7071 + 0.7071i 0.0000 + 1.0000i 0.7071 + 0.7071i
r2 1.0000 + 0.0000i 0.0000 - 1.0000i -1.0000 - 0.0000i 0.0000 + 1.0000i 1.0000 + 0.0000i 0.0000 - 1.0000i -1.0000 - 0.0000i 0.0000 + 1.0000i
r3 1.0000 + 0.0000i -0.7071 - 0.7071i -0.0000 + 1.0000i 0.7071 - 0.7071i -1.0000 - 0.0000i 0.7071 + 0.7071i 0.0000 - 1.0000i -0.7071 + 0.7071i
*/
//Re(c1*c2) = c1.re*c2.re - c1.im*c2.im
r0.re = d0 + d1 + d2 + d3;
r1.re = d0 - d2 - ( e3 - e1 );
r2.re = d0 - d1 + d2 - d3;
r3.re = d0 - d2 - ( e1 - e3 );
//Im(c1*c2) = c1.re*c2.im + c1.im*c2.re
r0.im = e0 + e1 + e2 + e3;
output[index] = r0;
r1.im = -d1 + d3 + e0 - e2;
r2.im = e0 - e1 + e2 - e3;
r3.im = d1 - d3 + e0 - e2;
tw1 = twiddle_ptr[twiddle_index];
tw2 = twiddle_ptr[twiddle_index*2];
tw3 = twiddle_ptr[twiddle_index*3];
output[index+distance] = complex_mul(tw1, r1);
output[index+2*distance] = complex_mul(tw2, r2);
output[index+3*distance] = complex_mul(tw3, r3);
}
static inline void process_butterfly_radix8 (Complex_type_f32* input, Complex_type_f32* output, int twiddle_index, int index, int distance, Complex_type_f32* twiddle_ptr)
{
float32_t d0, d1, d2, d3, d4, d5, d6, d7;
float32_t e0, e1, e2, e3, e4, e5, e6, e7;
Complex_type_f32 r0, r1, r2, r3, r4, r5, r6, r7;
Complex_type_f32 tw1, tw2, tw3, tw4, tw5, tw6, tw7;
d0 = input[index].re;
d1 = input[index+distance].re;
d2 = input[index+2*distance].re;
d3 = input[index+3*distance].re;
d4 = input[index+4*distance].re;
d5 = input[index+5*distance].re;
d6 = input[index+6*distance].re;
d7 = input[index+7*distance].re;
e0 = input[index].im;
e1 = input[index+distance].im;
e2 = input[index+2*distance].im;
e3 = input[index+3*distance].im;
e4 = input[index+4*distance].im;
e5 = input[index+5*distance].im;
e6 = input[index+6*distance].im;
e7 = input[index+7*distance].im;
// Basic buttefly rotation
/*
r0 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i
r1 1.0000 + 0.0000i 0.7071 - 0.7071i 0.0000 - 1.0000i -0.7071 - 0.7071i -1.0000 - 0.0000i -0.7071 + 0.7071i 0.0000 + 1.0000i 0.7071 + 0.7071i
r2 1.0000 + 0.0000i 0.0000 - 1.0000i -1.0000 - 0.0000i 0.0000 + 1.0000i 1.0000 + 0.0000i 0.0000 - 1.0000i -1.0000 - 0.0000i 0.0000 + 1.0000i
r3 1.0000 + 0.0000i -0.7071 - 0.7071i -0.0000 + 1.0000i 0.7071 - 0.7071i -1.0000 - 0.0000i 0.7071 + 0.7071i 0.0000 - 1.0000i -0.7071 + 0.7071i
r4 1.0000 + 0.0000i -1.0000 - 0.0000i 1.0000 + 0.0000i -1.0000 - 0.0000i 1.0000 + 0.0000i -1.0000 - 0.0000i 1.0000 + 0.0000i -1.0000 - 0.0000i
r5 1.0000 + 0.0000i -0.7071 + 0.7071i 0.0000 - 1.0000i 0.7071 + 0.7071i -1.0000 - 0.0000i 0.7071 - 0.7071i -0.0000 + 1.0000i -0.7071 - 0.7071i
r6 1.0000 + 0.0000i 0.0000 + 1.0000i -1.0000 - 0.0000i 0.0000 - 1.0000i 1.0000 + 0.0000i 0.0000 + 1.0000i -1.0000 - 0.0000i 0.0000 - 1.0000i
r7 1.0000 + 0.0000i 0.7071 + 0.7071i -0.0000 + 1.0000i -0.7071 + 0.7071i -1.0000 - 0.0000i -0.7071 - 0.7071i 0.0000 - 1.0000i 0.7071 - 0.7071i
*/
//Re(c1*c2) = c1.re*c2.re - c1.im*c2.im
float32_t d13 = d1 + d3;
float32_t d1_3 = d1 - d3;
float32_t e13 = e1 + e3;
float32_t e57 = e5 + e7;
float32_t e1_3 = e1 - e3;
float32_t e_57 = e7 - e5;
float32_t d57 = d5 + d7;
float32_t d_57 = d7 - d5;
float32_t temp1 = ROT_CONST*(d1_3+d_57);
float32_t temp1b = ROT_CONST*(e57 - e13);
float32_t temp2 = ROT_CONST*(d57-d13);
float32_t temp2b = ROT_CONST*(e1_3 + e_57);
float32_t d04 = d0 + d4;
float32_t d0_4 = d0 - d4;
float32_t d26 = d2 + d6;
float32_t d_26 = d6 - d2;
float32_t d0246 = d04 + d26;
float32_t d1357 = d13 + d57;
float32_t e0246 = e0 + e2 + e4 + e6;
float32_t e0_4 = e0 - e4;
float32_t e0_24_6 = e0 - e2 + e4 - e6;
float32_t e1357 = e13 + e57;
float32_t e_13_57 = e_57 - e1_3;
float32_t e2_6 = e2 - e6 ;
float32_t e_26 = e6 - e2 ;
r0.re = d0246 + d1357;
//r1.re = d0 + ROT_CONST*d1 - ROT_CONST*d3 - d4 - ROT_CONST*d5 + ROT_CONST*d7;
r1.re = d0_4 + temp1;
r7.re = r1.re + (e_26 + temp1b);
r1.re = r1.re - (e_26 + temp1b);
r2.re = d04 - d26;
r6.re = r2.re + e_13_57;
r2.re = r2.re - e_13_57;
//r3.re = d0 - ROT_CONST*d1 + ROT_CONST*d3 - d4 + ROT_CONST*d5 - ROT_CONST*d7;
r3.re = d0_4 - temp1;
r5.re = r3.re + (e2_6 + temp1b);
r3.re = r3.re - (e2_6 + temp1b);
r4.re = d0246 - d1357;
//Im(c1*c2) = c1.re*c2.im + c1.im*c2.re
r0.im = e0246 + e1357;
//r1.im = -ROT_CONST*d1 - d2 - ROT_CONST*d3 + ROT_CONST*d5 + d6 + ROT_CONST*d7;
r1.im = d_26 + temp2;
r7.im = -r1.im + ( e0_4 + temp2b);
r1.im = r1.im + ( e0_4 + temp2b);
r2.im = d_57 - d1_3;
r6.im = -r2.im + e0_24_6;
r2.im = r2.im + e0_24_6;
//r3.im = -ROT_CONST*d1 + d2 - ROT_CONST*d3 + ROT_CONST*d5 - d6 + ROT_CONST*d7;
r3.im = temp2 - d_26;
r5.im = -r3.im + (e0_4 - temp2b);
r3.im = r3.im + (e0_4 - temp2b);
r4.im = e0246 - e1357;
// TWIDDLES
tw1 = twiddle_ptr[twiddle_index*1];
tw2 = twiddle_ptr[twiddle_index*2];
tw3 = twiddle_ptr[twiddle_index*3];
tw4 = twiddle_ptr[twiddle_index*4];
tw5 = twiddle_ptr[twiddle_index*5];
tw6 = twiddle_ptr[twiddle_index*6];
tw7 = twiddle_ptr[twiddle_index*7];
output[index] = r0;
output[index+distance] = complex_mul(tw1, r1);
output[index+2*distance] = complex_mul(tw2, r2);
output[index+3*distance] = complex_mul(tw3, r3);
output[index+4*distance] = complex_mul(tw4, r4);
output[index+5*distance] = complex_mul(tw5, r5);
output[index+6*distance] = complex_mul(tw6, r6);
output[index+7*distance] = complex_mul(tw7, r7);
}
static inline void process_butterfly_last_radix4 (Complex_type_f32* input, Complex_type_f32* output, int outindex )
{
int index = 0;
Complex_type_f32 r0, r1, r2, r3;
float32_t d0 = input[index].re;
float32_t d1 = input[index+1].re;
float32_t d2 = input[index+2].re;
float32_t d3 = input[index+3].re;
float32_t e0 = input[index].im;
float32_t e1 = input[index+1].im;
float32_t e2 = input[index+2].im;
float32_t e3 = input[index+3].im;
/* twiddles are all 1s*/
// Basic buttefly rotation
/*
r0 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i
r1 1.0000 + 0.0000i 0.7071 - 0.7071i 0.0000 - 1.0000i -0.7071 - 0.7071i -1.0000 - 0.0000i -0.7071 + 0.7071i 0.0000 + 1.0000i 0.7071 + 0.7071i
r2 1.0000 + 0.0000i 0.0000 - 1.0000i -1.0000 - 0.0000i 0.0000 + 1.0000i 1.0000 + 0.0000i 0.0000 - 1.0000i -1.0000 - 0.0000i 0.0000 + 1.0000i
r3 1.0000 + 0.0000i -0.7071 - 0.7071i -0.0000 + 1.0000i 0.7071 - 0.7071i -1.0000 - 0.0000i 0.7071 + 0.7071i 0.0000 - 1.0000i -0.7071 + 0.7071i
*/
// Re(c1*c2) = c1.re*c2.re - c1.im*c2.im
r0.re = d0 + d1 + d2 + d3;
r1.re = d0 - d2 - ( e3 - e1 );
r2.re = d0 - d1 + d2 - d3;
r3.re = d0 - d2 - ( e1 - e3 );
// Im(c1*c2) = c1.re*c2.im + c1.im*c2.re
r0.im = e0 + e1 + e2 + e3;
r1.im = -d1 + d3 + e0 - e2;
r2.im = e0 - e1 + e2 - e3;
r3.im = d1 - d3 + e0 - e2;
output[outindex ] = r0;
output[outindex+1] = r1;
output[outindex+2] = r2;
output[outindex+3] = r3;
}
static inline void process_butterfly_last_radix8 (Complex_type_f32* input, Complex_type_f32* output, int outindex )
{
int index = 0;
float32_t d0, d1, d2, d3, d4, d5, d6, d7;
float32_t e0, e1, e2, e3, e4, e5, e6, e7;
Complex_type_f32 r0, r1, r2, r3, r4, r7, r6, r5 ;
d0 = input[index].re;
d1 = input[index+1].re;
d2 = input[index+2].re;
d3 = input[index+3].re;
d4 = input[index+4].re;
d5 = input[index+5].re;
d6 = input[index+6].re;
d7 = input[index+7].re;
e0 = input[index].im;
e1 = input[index+1].im;
e2 = input[index+2].im;
e3 = input[index+3].im;
e4 = input[index+4].im;
e5 = input[index+5].im;
e6 = input[index+6].im;
e7 = input[index+7].im;
/* twiddles are all 1s */
// Basic buttefly rotation
/*
r0 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i 1.0000 + 0.0000i
r1 1.0000 + 0.0000i 0.7071 - 0.7071i 0.0000 - 1.0000i -0.7071 - 0.7071i -1.0000 - 0.0000i -0.7071 + 0.7071i 0.0000 + 1.0000i 0.7071 + 0.7071i
r2 1.0000 + 0.0000i 0.0000 - 1.0000i -1.0000 - 0.0000i 0.0000 + 1.0000i 1.0000 + 0.0000i 0.0000 - 1.0000i -1.0000 - 0.0000i 0.0000 + 1.0000i
r3 1.0000 + 0.0000i -0.7071 - 0.7071i -0.0000 + 1.0000i 0.7071 - 0.7071i -1.0000 - 0.0000i 0.7071 + 0.7071i 0.0000 - 1.0000i -0.7071 + 0.7071i
r4 1.0000 + 0.0000i -1.0000 - 0.0000i 1.0000 + 0.0000i -1.0000 - 0.0000i 1.0000 + 0.0000i -1.0000 - 0.0000i 1.0000 + 0.0000i -1.0000 - 0.0000i
r5 1.0000 + 0.0000i -0.7071 + 0.7071i 0.0000 - 1.0000i 0.7071 + 0.7071i -1.0000 - 0.0000i 0.7071 - 0.7071i -0.0000 + 1.0000i -0.7071 - 0.7071i
r6 1.0000 + 0.0000i 0.0000 + 1.0000i -1.0000 - 0.0000i 0.0000 - 1.0000i 1.0000 + 0.0000i 0.0000 + 1.0000i -1.0000 - 0.0000i 0.0000 - 1.0000i
r7 1.0000 + 0.0000i 0.7071 + 0.7071i -0.0000 + 1.0000i -0.7071 + 0.7071i -1.0000 - 0.0000i -0.7071 - 0.7071i 0.0000 - 1.0000i 0.7071 - 0.7071i
*/
// Re(c1*c2) = c1.re*c2.re - c1.im*c2.im
float32_t d13 = d1 + d3;
float32_t d1_3 = d1 - d3;
float32_t e13 = e1 + e3;
float32_t e57 = e5 + e7;
float32_t e1_3 = e1 - e3;
float32_t e_57 = e7 - e5;
float32_t d57 = d5 + d7;
float32_t d_57 = d7 - d5;
float32_t temp1 = ROT_CONST*(d1_3+d_57);
float32_t temp1b = ROT_CONST*(e57 - e13);
float32_t temp2 = ROT_CONST*(d57-d13);
float32_t temp2b = ROT_CONST*(e1_3 + e_57);
float32_t d04 = d0 + d4;
float32_t d0_4 = d0 - d4;
float32_t d26 = d2 + d6;
float32_t d_26 = d6 - d2;
float32_t d0246 = d04 + d26;
float32_t d1357 = d13 + d57;
float32_t e0246 = e0 + e2 + e4 + e6;
float32_t e0_4 = e0 - e4;
float32_t e0_24_6 = e0 - e2 + e4 - e6;
float32_t e1357 = e13 + e57;
float32_t e_13_57 = e_57 - e1_3;
float32_t e2_6 = e2 - e6 ;
float32_t e_26 = e6 - e2 ;
r0.re = d0246 + d1357;
// r1.re = d0 + ROT_CONST*d1 - ROT_CONST*d3 - d4 - ROT_CONST*d5 + ROT_CONST*d7;
r1.re = d0_4 + temp1;
r7.re = r1.re + (e_26 + temp1b);
r1.re = r1.re - (e_26 + temp1b);
r2.re = d04 - d26;
r6.re = r2.re + e_13_57;
r2.re = r2.re - e_13_57;
// r3.re = d0 - ROT_CONST*d1 + ROT_CONST*d3 - d4 + ROT_CONST*d5 - ROT_CONST*d7;
r3.re = d0_4 - temp1;
r5.re = r3.re + (e2_6 + temp1b);
r3.re = r3.re - (e2_6 + temp1b);
r4.re = d0246 - d1357;
// Im(c1*c2) = c1.re*c2.im + c1.im*c2.re
r0.im = e0246 + e1357;
// r1.im = -ROT_CONST*d1 - d2 - ROT_CONST*d3 + ROT_CONST*d5 + d6 + ROT_CONST*d7;
r1.im = d_26 + temp2;
r7.im = -r1.im + ( e0_4 + temp2b);
r1.im = r1.im + ( e0_4 + temp2b);
r2.im = d_57 - d1_3;
r6.im = -r2.im + e0_24_6;
r2.im = r2.im + e0_24_6;
// r3.im = -ROT_CONST*d1 + d2 - ROT_CONST*d3 + ROT_CONST*d5 - d6 + ROT_CONST*d7;
r3.im = temp2 - d_26;
r5.im = -r3.im + (e0_4 - temp2b);
r3.im = r3.im + (e0_4 - temp2b);
r4.im = e0246 - e1357;
/* In the Last step, twiddle factors are all 1 */
output[outindex ] = r0;
output[outindex+1 ] = r1;
output[outindex+2*1] = r2;
output[outindex+3*1] = r3;
output[outindex+4*1] = r4;
output[outindex+5*1] = r5;
output[outindex+6*1] = r6;
output[outindex+7*1] = r7;
}
Updated on 2023-03-01 at 16:16:33 +0000