/github/workspace/src/TransformFunctions/kernels/plp_dwt_q8s_xpulpv2.c
Functions
Name | |
---|---|
void | plp_dwt_q8s_xpulpv2(const int8_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q8 wavelet, plp_dwt_extension_mode mode, int8_t restrict pDstA, int8_t *restrict pDstD) Q7 fixed-point DWT for XPULPV2 extension. |
void | plp_dwt_haar_q8s_xpulpv2(const int8_t restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int8_t restrict pDstA, int8_t *restrict pDstD) q7 fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension. |
Defines
Name | |
---|---|
HAAR_COEF | |
MAC_SHIFT | |
SHUFFLEMASK | |
MAC(Acc, A, B) | |
MSU(Acc, A, B) |
Functions Documentation
function plp_dwt_q8s_xpulpv2
void plp_dwt_q8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q8 wavelet,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD
)
Q7 fixed-point DWT for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (q7)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to ouput buffer with Approximate coefficients
- pDstD points to ouput buffer with Detailed coefficients
Return: none
8bit Fixed-point DWT for XPULPV2 extension.
function plp_dwt_haar_q8s_xpulpv2
void plp_dwt_haar_q8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD
)
q7 fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (q7)
- length length of input buffer
- mode boundary extension mode
- pDstA points to ouput buffer with Approximate coefficients
- pDstD points to ouput buffer with Detailed coefficients
Return: none
8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
Macros Documentation
define HAAR_COEF
#define HAAR_COEF (0x5b)
define MAC_SHIFT
#define MAC_SHIFT 7U;
define SHUFFLEMASK
#define SHUFFLEMASK (v4s) { 3, 2, 1, 0 }
define MAC
#define MAC(
Acc,
A,
B
)
Acc += ((int16_t)A * (int16_t)B);
define MSU
#define MSU(
Acc,
A,
B
)
Acc -= ((int16_t)A * (int16_t)B);
Source code
/* ----------------------------------------------------------------------
* Project: PULP DSP Library
* Title: plp_dwt_q8s_xpulpv2.c
* Description: Q8 Fixed-point Discret Wavelet Transform for XPULPV2
*
* $Date: 10. Juli 2021
* $Revision: V1
*
* Target Processor: PULP cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2021 ETH Zurich and University of Bologna. All rights reserved.
*
* Author: Jakub Mandula, ETH Zurich
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plp_math.h"
#include "plp_const_structs.h"
/* HELPER FUNCTIONS */
#define HAAR_COEF (0x5b)
#define MAC_SHIFT 7U;
#define SHUFFLEMASK (v4s) { 3, 2, 1, 0 }
// #define MAC(Acc, A, B) Acc = __MACS(Acc, A, B); (Slower?)
#define MAC(Acc, A, B) Acc += ((int16_t)A * (int16_t)B);
#define MSU(Acc, A, B) Acc -= ((int16_t)A * (int16_t)B);
#include "plp_dwt_signal_ext.h"
void plp_dwt_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q8 wavelet,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD) {
int8_t *pCurrentA = pDstA;
int8_t *pCurrentD = pDstD;
static uint32_t step = 2;
int32_t offset;
/***
* The filter convolution is done in 4 steps handling cases where
* 1. Filter is hanging over the left side of the signal
* 2. Filter is same size, or totally enclosed in signal
* 3. Filter is larger than the enclosed signal and hangs over both edges
* 4. Filter hangs over the right side of the signal
*
* Each of the cases, where signal hangs over the boundary of the signal, values are computed
* on demand based on the edge extension mode.
*/
/* Step 1.
* Handle Left overhanging
*
* X() = x x[A B C D E F]
* H() = [d c b a]
* ^ ^
* | First compute the filter part overlapping with the signal
* Then extend the signal (x x) by computing the values based on the extension mode
*/
for(offset = step-1; offset < wavelet.length - 1 && offset < length; offset += step){
int16_t sum_lo = 0;
int16_t sum_hi = 0;
uint32_t filt_j = 0;
// Compute Filter overlapping with signal
for(; filt_j <= offset; filt_j++){
MAC(sum_lo, wavelet.dec_lo[filt_j], pSrc[offset - filt_j]);
MAC(sum_hi, wavelet.dec_hi[filt_j], pSrc[offset - filt_j]);
}
// Compute Left edge extension
switch(mode){
case PLP_DWT_MODE_CONSTANT:
CONSTANT_EDGE_LEFT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_SYMMETRIC:
SYMMETRIC_EDGE_LEFT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_REFLECT:
REFLECT_EDGE_LEFT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_ANTISYMMETRIC:
ANTISYMMETRIC_EDGE_LEFT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_ANTIREFLECT:
ANTIREFLECT_EDGE_LEFT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset, int32_t);
break;
case PLP_DWT_MODE_PERIODIC:
case PLP_DWT_MODE_ZERO:
default:
break;
}
*pCurrentA++ = sum_lo >> MAC_SHIFT;
*pCurrentD++ = sum_hi >> MAC_SHIFT;
}
/* Step 2.
* Compute center (length >= wavelet.length)
*
* X() = [A B C D E F]
* h() = [d c b a]
* ^
* Compute a full convolution of the filter with the signal
*/
for(;offset < length; offset += step){
int32_t sum_lo = 0;
int32_t sum_hi = 0;
/* We can process 4 elements at a time
*
* X = ... x3[x4 x5 x6 x7]x8 ...
* Y1 = ... y4[y3 y2 y1 y0]
*
* Acc += DOTP4([x7 x6 x5 x4], [y0 y1 y2 y3])
**/
uint32_t blkCnt = wavelet.length >> 2U;
const int8_t *pYlo = wavelet.dec_lo; // Start of wavelet lo
const int8_t *pYhi = wavelet.dec_hi; // Start of wavelet hi
const int8_t *pX = pSrc + offset;
while (blkCnt > 0U){
v4s v_x = *((v4s *)(pX - 3)); // { x[0], x[1], x[2], x[3]}
v4s v_ylo = *((v4s *)pYlo); // {lo[0], lo[1], lo[2], lo[3]}
v4s v_yhi = *((v4s *)pYhi); // {hi[0], hi[1], hi[2], hi[3]}
// Input order must be flipped for convolution
v4s v_sx = __builtin_shuffle(v_x, v_x, SHUFFLEMASK); // {x[3], x[2], x[1], [0]}
sum_lo = __SUMDOTP4(v_sx, v_ylo, sum_lo);
sum_hi = __SUMDOTP4(v_sx, v_yhi, sum_hi);
pYlo += 4;
pYhi += 4;
pX -= 4;
blkCnt--;
}
/* Wavelet length is not a multiple of 4. However it will always be multiple of 2.
* Thus just handel this case extra
*
* X = ... [x2 x3]x5 x6 x7 x8 ...
* Y1 = [y5 y4]y3 y2 y1 y0
*
* Acc += DOTP4([x3 x2 0 0], [y4 y5 0 0])
**/
if(wavelet.length % 4 > 0){
v4s v_ylo = *((v4s *)pYlo); // {lo[0], lo[1], XX , XX }
v4s v_yhi = *((v4s *)pYhi); // {hi[0], hi[1], XX , XX }
v4s v_x = *((v4s *)(pX - 3)); // { XX , XX , x[0], x[1]}
// Input order must be flipped for convolution and we must mask out the invalid bytes {XX XX, x[0], x[1]}
v_x = __AND4(v_x, ((v4s){ 0, 0, 0xff, 0xff })); // { 0 , 0 , x[0], x[1]}
v4s v_sx = __builtin_shuffle(v_x, v_x, SHUFFLEMASK); // {x[1], x[0], 0 , 0 }
sum_lo = __SUMDOTP4(v_sx, v_ylo, sum_lo);
sum_hi = __SUMDOTP4(v_sx, v_yhi, sum_hi);
}
*pCurrentA++ = sum_lo >> MAC_SHIFT;
*pCurrentD++ = sum_hi >> MAC_SHIFT;
}
/* Step 3.
* Compute center (length < wavelet.length)
*
* X() = y y[A B C]x x x
* h() = [h g f e d c b a]
* ^ ^ ^
* | | Compute Right extension (x x x) based on extension mode
* | Compute a full convolution of the filter overlapping with the signal
* Compute Left extension (y y) based on extension mode
*/
for(;offset < wavelet.length - 1; offset += step){
int16_t sum_lo = 0;
int16_t sum_hi = 0;
uint32_t filt_j = 0;
// Filter Right extension
switch(mode){
case PLP_DWT_MODE_CONSTANT:
CONSTANT_EDGE_RIGHT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_SYMMETRIC:
SYMMETRIC_EDGE_RIGHT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_REFLECT:
REFLECT_EDGE_RIGHT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_ANTISYMMETRIC:
ANTISYMMETRIC_EDGE_RIGHT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_ANTIREFLECT:
ANTIREFLECT_EDGE_RIGHT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset, int32_t);
break;
case PLP_DWT_MODE_PERIODIC:
case PLP_DWT_MODE_ZERO:
default:
filt_j = offset - length + 1;
break;
}
// Filter Center overlapp
for(; filt_j <= offset; filt_j++){
MAC(sum_lo, wavelet.dec_lo[filt_j], pSrc[offset - filt_j]);
MAC(sum_hi, wavelet.dec_hi[filt_j], pSrc[offset - filt_j]);
}
// Filter Left extension
switch(mode){
case PLP_DWT_MODE_CONSTANT:
CONSTANT_EDGE_LEFT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_SYMMETRIC:
SYMMETRIC_EDGE_LEFT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_REFLECT:
REFLECT_EDGE_LEFT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_ANTISYMMETRIC:
ANTISYMMETRIC_EDGE_LEFT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_ANTIREFLECT:
ANTIREFLECT_EDGE_LEFT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset, int32_t);
break;
case PLP_DWT_MODE_PERIODIC:
case PLP_DWT_MODE_ZERO:
default:
break;
}
*pCurrentA++ = sum_lo >> MAC_SHIFT;
*pCurrentD++ = sum_hi >> MAC_SHIFT;
}
/* Step 4.
* Handle Right overhanging
*
* X() = [A B C D E F]x x
* H() = [d c b a]
* ^ ^
* | First extend the signal (x x) by computing the values based on the extension mode
* Then compute the filter part overlapping with the signal
*/
for(; offset < length + wavelet.length - 1; offset += step){
int16_t sum_lo = 0;
int16_t sum_hi = 0;
uint32_t filt_j = 0;
// Compute Left edge extension
switch(mode){
case PLP_DWT_MODE_CONSTANT:
CONSTANT_EDGE_RIGHT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_SYMMETRIC:
SYMMETRIC_EDGE_RIGHT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_REFLECT:
REFLECT_EDGE_RIGHT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_ANTISYMMETRIC:
ANTISYMMETRIC_EDGE_RIGHT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset);
break;
case PLP_DWT_MODE_ANTIREFLECT:
ANTIREFLECT_EDGE_RIGHT(sum_lo, sum_hi, pSrc, length, wavelet, filt_j, offset, int32_t);
break;
case PLP_DWT_MODE_PERIODIC:
case PLP_DWT_MODE_ZERO:
default:
filt_j = offset - length + 1;
break;
}
// Filter overlapping with signal
for(; filt_j < wavelet.length; filt_j++){
MAC(sum_lo, wavelet.dec_lo[filt_j], pSrc[offset - filt_j]);
MAC(sum_hi, wavelet.dec_hi[filt_j], pSrc[offset - filt_j]);
}
*pCurrentA++ = sum_lo >> MAC_SHIFT;
*pCurrentD++ = sum_hi >> MAC_SHIFT;
}
}
void plp_dwt_haar_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD) {
int8_t *pCurrentA = pDstA;
int8_t *pCurrentD = pDstD;
// Left and Right version of Vectored filter coefficients
static v4s v_ylo_l = (v4s){HAAR_COEF, HAAR_COEF, 0, 0};
static v4s v_ylo_r = (v4s){0, 0, HAAR_COEF, HAAR_COEF};
static v4s v_yhi_l = (v4s){HAAR_COEF, -HAAR_COEF, 0, 0};
static v4s v_yhi_r = (v4s){0, 0, HAAR_COEF, -HAAR_COEF};
v4s v_x1, v_x2;
/***
* The filter convolution is done in 2 steps handling cases where
* 1. Filter is same size, or totally enclosed in signal center
* 2. Filter hangs over the right side of the signal
*
* In of the cases, where signal hangs over the boundary of the signal, values are computed
* on demand based on the edge extension mode.
*/
/* Step 1.
* Compute center (length >= wavelet.length)
*
* X() = [A B C D E F]
* h() = [b a]
* ^
* Compute a full convolution of the filter with the signal
*/
/* We read 8 numbers at a time performing 4 convolutions (if signal is longer than or equal to 8)*/
uint32_t blkCnt = length >> 3U;
const int8_t *pS = pSrc;
int8_t destA1, destA2, destA3, destA4, destD1, destD2, destD3, destD4;
while(blkCnt > 0){
// Get next 4 numbers
v_x1 = *((v4s *)(pS)); // { x[0], x[1], x[2], x[3]}
pS += 4;
v_x2 = *((v4s *)(pS)); // { x[0], x[1], x[2], x[3]}
pS += 4;
// Perfrom dot product of left pair
destA1 = __DOTP4(v_x1, v_ylo_l) >> MAC_SHIFT;
destD1 = __DOTP4(v_x1, v_yhi_l) >> MAC_SHIFT;
// Perfrom dot product of right pair
destA2 = __DOTP4(v_x1, v_ylo_r) >> MAC_SHIFT;
destD2 = __DOTP4(v_x1, v_yhi_r) >> MAC_SHIFT;
// Perfrom dot product of left pair
destA3 = __DOTP4(v_x2, v_ylo_l) >> MAC_SHIFT;
destD3 = __DOTP4(v_x2, v_yhi_l) >> MAC_SHIFT;
// Perfrom dot product of right pair
destA4 = __DOTP4(v_x2, v_ylo_r) >> MAC_SHIFT;
destD4 = __DOTP4(v_x2, v_yhi_r) >> MAC_SHIFT;
*((v4s *)pCurrentA) = __PACK4(destA1, destA2, destA3, destA4);
pCurrentA += 4;
*((v4s *)pCurrentD) = __PACK4(destD1, destD2, destD3, destD4);
pCurrentD += 4;
blkCnt--;
}
// Handle any elements left to process
switch(length % 8U){
case 1:
// We have no more pairs left but there is a odd element left, so continue to Step 2.
break;
case 4:
case 5:
case 6:
case 7:
// We need to process 4
v_x1 = *((v4s *)(pS)); // { x[0], x[1], x[2], x[3]}
//
pS += 4;
*pCurrentA++ = __DOTP4(v_x1, v_ylo_l) >> MAC_SHIFT;
*pCurrentD++ = __DOTP4(v_x1, v_yhi_l) >> MAC_SHIFT;
*pCurrentA++ = __DOTP4(v_x1, v_ylo_r) >> MAC_SHIFT;
*pCurrentD++ = __DOTP4(v_x1, v_yhi_r) >> MAC_SHIFT;
if(length % 8U < 6){
// If we had 6 or 7 remaining, still need 2 to process (continue to 2/3)
break;
}
case 2:
// We Still have one pair left, but after that we are done (skip Step 2.).
case 3:
// We compute the full overlap pair here (In case there is a trailing element left, it will be handled in Step 2.)
v_x1 = *((v4s *)(pS)); // { x[0], x[1], x[2], x[3]}
*pCurrentA++ = __DOTP4(v_x1, v_ylo_l) >> MAC_SHIFT;
*pCurrentD++ = __DOTP4(v_x1, v_yhi_l) >> MAC_SHIFT;
// *pCurrentA++ = __MULSN(HAAR_COEF, (pSrc[offset - 1] + pSrc[offset]), MAC_SHIFT);
// *pCurrentD++ = __MULSN(HAAR_COEF, (pSrc[offset - 1] - pSrc[offset]), MAC_SHIFT);
break;
case 0:
default:
// We are actually done, the signal fits perfectly (skip Step 2.)
return;
break;
}
/* Step 2.
* Handle Right overhanging (only for odd signal lengths)
*
* X() = [A B C D E F]x
* H() = [b a]
* ^ ^
* | Extend the signal (x) by computing the values based on the extension mode
* Then compute the filter part overlapping with the signal
*/
if(length % 2U){
int16_t sum_lo = 0;
int16_t sum_hi = 0;
uint32_t filt_j = 0;
// Compute Left edge extension
switch(mode){
case PLP_DWT_MODE_CONSTANT:
case PLP_DWT_MODE_SYMMETRIC:
sum_lo = 2 * HAAR_COEF * pSrc[length - 1]; // dec_lo[0] * src[N-1] + dec_lo[1] * src[N-1]
sum_hi = 0; // dec_hi[0] * src[N-1] + dec_hi[1] * src[N-1] == -dec_hi[1] * src[N-1] + dec_hi[1] * src[N-1]
break;
case PLP_DWT_MODE_REFLECT:
sum_lo = HAAR_COEF * (pSrc[length - 1] + pSrc[length - 2]);
sum_hi = HAAR_COEF * (pSrc[length - 1] - pSrc[length - 2]);
break;
case PLP_DWT_MODE_ANTISYMMETRIC:
sum_lo = HAAR_COEF * (pSrc[length - 1] - pSrc[length - 1]);
sum_hi = HAAR_COEF * (pSrc[length - 1] + pSrc[length - 1]);
break;
case PLP_DWT_MODE_ANTIREFLECT:
sum_lo = HAAR_COEF * (3*pSrc[length - 1] - pSrc[length - 2]);
sum_hi = HAAR_COEF * ( -pSrc[length - 1] + pSrc[length - 2]);
break;
case PLP_DWT_MODE_PERIODIC:
case PLP_DWT_MODE_ZERO:
default:
sum_lo = HAAR_COEF * pSrc[length - 1];
sum_hi = HAAR_COEF * pSrc[length - 1];
break;
}
*pCurrentA = sum_lo >> MAC_SHIFT;
*pCurrentD = sum_hi >> MAC_SHIFT;
}
}
Updated on 2023-03-01 at 16:16:33 +0000