/github/workspace/include/plp_math.h

Public header file for PULP DSP Library. More...

Classes

	Name
struct	plp_dot_prod_instance_i32 Instance structure for integer parallel dot product.
struct	plp_dot_prod_instance_q32 Instance structure for fixed point parallel dot product.
struct	plp_dot_prod_instance_f32 Instance structure for float parallel dot product.
struct	plp_mult_instance_f32 Instance structure for float parallel multiplication.
struct	plp_log_instance_f32 Instance structure for float parallel log.
struct	plp_conv_instance_i32 Instance structure for basic integer convolution.
struct	plp_conv_instance_i16 Instance structure for basic integer convolution.
struct	plp_conv_instance_i8 Instance structure for basic integer convolution.
struct	plp_conv_tree_add_instance Instance structure for basic integer convolution.
struct	plp_cfft_instance_q16 Instance structure for the fixed-point CFFT/CIFFT function.
struct	plp_cfft_instance_q16_parallel Instance structure for the parallel CFFT Q16.
struct	plp_cfft_instance_q32 Instance structure for the fixed-point CFFT/CIFFT function.
struct	plp_cfft_instance_q32_parallel Instance structure for the parallel CFFT Q32.
struct	plp_cfft_instance_f32 Instance structure for the floating-point CFFT/CIFFT function.
struct	plp_cfft_instance_f32_parallel Instance structure for floating-point FFT (parallel version)
struct	plp_fft_instance_f32 Instance structure for floating-point FFT.
struct	plp_fft_fast_instance_f32 Instance structure for floating-point FFT.
struct	plp_fft_fast_instance_f32_parallel Instance structure for floating-point FFT.
struct	plp_fft_instance_f32_parallel Instance structure for floating-point FFT (parallel version)
struct	plp_triangular_filter_f32 structure containing non-zero values of triangular filterbanks
struct	Complex_type_f32 Helper type to represent complex values with float32 components.
struct	plp_mat_mult_instance_i8 Instance structure for integer parallel matrix multiplication.
struct	plp_mat_mult_instance_i16 Instance structure for integer parallel matrix multiplication.
struct	plp_mat_mult_instance_i32 Instance structure for integer parallel matrix multiplication.
struct	plp_mat_mult_instance_f32 Instance structure for floating-point parallel matrix multiplication.
struct	plp_mat_mult_instance_q8 Instance structure for 8-bit fix-point parallel matrix multiplication.
struct	plp_mat_mult_instance_q16 Instance structure for 16-bit fix-point parallel matrix multiplication.
struct	plp_mat_mult_instance_q32 Instance structure for 32-bit fix-point parallel matrix multiplication.
struct	plp_mat_mult_cmplx_instance_i8 Instance structure for integer parallel complex matrix matrix multiplication.
struct	plp_mat_mult_cmplx_instance_i16 Instance structure for integer parallel complex matrix matrix multiplication.
struct	plp_mat_mult_cmplx_instance_i32 Instance structure for integer parallel complex matrix matrix multiplication.
struct	plp_mat_mult_cmplx_instance_f32 Instance structure for floating-point parallel complex matrix matrix multiplication.
struct	plp_mat_mult_cmplx_instance_q8 Instance structure for 8-bit fix-point parallel complex matrix matrix multiplication.
struct	plp_mat_mult_cmplx_instance_q16 Instance structure for 16-bit fix-point parallel complex matrix matrix multiplication.
struct	plp_mat_mult_cmplx_instance_q32 Instance structure for 32-bit fix-point parallel complex matrix matrix multiplication.
struct	plp_mat_add_instance_i8 Instance structure for integer parallel matrix addition.
struct	plp_mat_add_instance_i16 Instance structure for integer parallel matrix addition.
struct	plp_mat_add_instance_i32 Instance structure for integer parallel matrix addition.
struct	plp_mat_add_instance_f32 Instance structure for floating-point parallel matrix addition.
struct	plp_mat_sub_instance_i8 Instance structure for integer parallel matrix subtraction.
struct	plp_mat_sub_instance_i16 Instance structure for integer parallel matrix subtraction.
struct	plp_mat_sub_instance_i32 Instance structure for integer parallel matrix subtraction.
struct	plp_mat_sub_instance_f32 Instance structure for floating-point parallel matrix subtraction.
struct	plp_mat_scale_instance_i8 Instance structure for integer parallel matrix scale.
struct	plp_mat_scale_instance_i16 Instance structure for integer parallel matrix scale.
struct	plp_mat_scale_instance_i32 Instance structure for integer parallel matrix scale.
struct	plp_mat_scale_instance_f32 Instance structure for floating-point parallel matrix scale.
struct	plp_mat_trans_instance_i8 Instance structure for integer parallel matrix transpose.
struct	plp_mat_trans_instance_i16 Instance structure for integer parallel matrix transpose.
struct	plp_mat_trans_instance_i32 Instance structure for integer parallel matrix transpose.
struct	plp_mat_fill_I_instance_i8 Instance structure for integer parallel identity matrix creation.
struct	plp_mat_fill_I_instance_i16 Instance structure for integer parallel identity matrix creation.
struct	plp_mat_fill_I_instance_i32 Instance structure for integer parallel identity matrix creation.
struct	plp_mat_fill_I_instance_f32 Instance structure for floating-point parallel identity matrix creation.
struct	plp_mat_fill_I_instance_q8 Instance structure for fix-point parallel identity matrix creation.
struct	plp_mat_fill_I_instance_q16 Instance structure for fix-point parallel identity matrix creation.
struct	plp_mat_fill_I_instance_q32 Instance structure for fix-point parallel identity matrix creation.
struct	plp_mat_inv_instance_f32 Instance structure for floating-point parallel matrix inversion.
struct	plp_mat_mult_stride_instance_i8 Instance structure for strided integer parallel matrix multiplication.
struct	plp_mat_mult_stride_instance_i16 Instance structure for strided integer parallel matrix multiplication.
struct	plp_mat_mult_stride_instance_i32 Instance structure for strided integer parallel matrix multiplication.
struct	plp_mat_mult_stride_instance_f32 Instance structure for strided floating-point parallel matrix multiplication.
struct	plp_mat_mult_stride_instance_q8 Instance structure for strided 8-bit fix-point parallel matrix multiplication.
struct	plp_mat_mult_stride_instance_q16 Instance structure for strided 16-bit fix-point parallel matrix multiplication.
struct	plp_mat_mult_stride_instance_q32 Instance structure for strided 32-bit fix-point parallel matrix multiplication.
struct	plp_mat_mult_cmplx_stride_instance_i8 Instance structure for integer parallel complex strided matrix matrix multiplication.
struct	plp_mat_mult_cmplx_stride_instance_i16 Instance structure for integer parallel complex strided matrix matrix multiplication.
struct	plp_mat_mult_cmplx_stride_instance_i32 Instance structure for integer parallel complex strided matrix matrix multiplication.
struct	plp_mat_mult_cmplx_stride_instance_f32 Instance structure for floating-point parallel complex strided matrix matrix multiplication.
struct	plp_mat_mult_cmplx_stride_instance_q8 Instance structure for 8-bit fix-point parallel complex strided matrix matrix multiplication.
struct	plp_mat_mult_cmplx_stride_instance_q16 Instance structure for 16-bit fix-point parallel complex strided matrix matrix multiplication.
struct	plp_mat_mult_cmplx_stride_instance_q32 Instance structure for 32-bit fix-point parallel complex strided matrix matrix multiplication.
struct	plp_mat_add_stride_instance_i8 Instance structure for strided integer parallel matrix addition.
struct	plp_mat_add_stride_instance_i16 Instance structure for strided integer parallel matrix addition.
struct	plp_mat_add_stride_instance_i32 Instance structure for strided integer parallel matrix addition.
struct	plp_mat_add_stride_instance_f32 Instance structure for strided floating-point parallel matrix addition.
struct	plp_mat_sub_stride_instance_i8 Instance structure for strided integer parallel matrix subtraction.
struct	plp_mat_sub_stride_instance_i16 Instance structure for strided integer parallel matrix subtraction.
struct	plp_mat_sub_stride_instance_i32 Instance structure for strided integer parallel matrix subtraction.
struct	plp_mat_sub_stride_instance_f32 Instance structure for strided floating-point parallel matrix subtraction.
struct	plp_mat_scale_stride_instance_i8 Instance structure for strided integer parallel matrix scale.
struct	plp_mat_scale_stride_instance_i16 Instance structure for strided integer parallel matrix scale.
struct	plp_mat_scale_stride_instance_i32 Instance structure for strided integer parallel matrix scale.
struct	plp_mat_scale_stride_instance_f32 Instance structure for strided floating-point parallel matrix scale.
struct	plp_mat_fill_I_stride_instance_i8 Instance structure for integer parallel strided identity matrix creation.
struct	plp_mat_fill_I_stride_instance_i16 Instance structure for integer parallel strided identity matrix creation.
struct	plp_mat_fill_I_stride_instance_i32 Instance structure for integer parallel strided identity matrix creation.
struct	plp_mat_fill_I_stride_instance_f32 Instance structure for floating-point parallel strided identity matrix creation.
struct	plp_mat_fill_I_stride_instance_q8 Instance structure for 8-bit fix-point parallel strided identity matrix creation.
struct	plp_mat_fill_I_stride_instance_q16 Instance structure for 16-bit fix-point parallel strided identity matrix creation.
struct	plp_mat_fill_I_stride_instance_q32 Instance structure for 32-bit fix-point parallel strided identity matrix creation.
struct	plp_mat_fill_stride_instance_i8 Instance structure for filling an integer matrix in parallel.
struct	plp_mat_fill_stride_instance_i16 Instance structure for filling an integer matrix in parallel.
struct	plp_mat_fill_stride_instance_i32 Instance structure for filling an integer matrix in parallel.
struct	plp_mat_fill_stride_instance_f32 Instance structure for filling a floating-point matrix in parallel.
struct	plp_mat_copy_stride_instance_i8 Instance structure for integer parallel strided matrix copy.
struct	plp_mat_copy_stride_instance_i16 Instance structure for integer parallel strided matrix copy.
struct	plp_mat_copy_stride_instance_i32 Instance structure for integer parallel strided matrix copy.
struct	plp_mat_copy_stride_instance_f32 Instance structure for floating-point parallel strided matrix copy.
struct	plp_euclidean_distance_instance_f32 Instance structure for float parallel Euclidean distance.
struct	plp_euclidean_distance_instance_q32 Instance structure for fixed point parallel Euclidean distance.
struct	plp_cosine_distance_instance_f32 Instance structure for float parallel cosine distance.
struct	plp_power_instance_q32 Instance structure for fixed point parallel power.
struct	plp_power_instance_f32 Instance structure for float parallel power.
struct	plp_dwt_wavelet_f32
struct	plp_dwt_wavelet_q32
struct	plp_dwt_wavelet_q16
struct	plp_dwt_wavelet_q8
struct	plp_dwt_instance_f32 Instance structure for float parallel dwt.
struct	plp_dwt_instance_q32 Instance structure for Q32 parallel dwt.
struct	plp_dwt_instance_q16 Instance structure for Q16 parallel dwt.
struct	plp_dwt_instance_q8 Instance structure for Q8 parallel dwt.

Types

	Name
enum	plp_dwt_wavelet_type { PLP_DWT_WAVELET_OTHER, PLP_DWT_WAVELET_HAAR, PLP_DWT_WAVELET_DB1, PLP_DWT_WAVELET_DB2, PLP_DWT_WAVELET_DB3, PLP_DWT_WAVELET_DB4, PLP_DWT_WAVELET_DB5, PLP_DWT_WAVELET_DB6, PLP_DWT_WAVELET_DB7, PLP_DWT_WAVELET_DB8, PLP_DWT_WAVELET_DB9, PLP_DWT_WAVELET_DB10, PLP_DWT_WAVELET_DB11, PLP_DWT_WAVELET_DB12, PLP_DWT_WAVELET_DB13, PLP_DWT_WAVELET_DB14, PLP_DWT_WAVELET_DB15, PLP_DWT_WAVELET_DB16, PLP_DWT_WAVELET_DB17, PLP_DWT_WAVELET_DB18, PLP_DWT_WAVELET_DB19, PLP_DWT_WAVELET_DB20, PLP_DWT_WAVELET_SYM2, PLP_DWT_WAVELET_SYM3, PLP_DWT_WAVELET_SYM4, PLP_DWT_WAVELET_SYM5, PLP_DWT_WAVELET_SYM6, PLP_DWT_WAVELET_SYM7, PLP_DWT_WAVELET_SYM8, PLP_DWT_WAVELET_SYM9, PLP_DWT_WAVELET_SYM10, PLP_DWT_WAVELET_SYM11, PLP_DWT_WAVELET_SYM12, PLP_DWT_WAVELET_SYM13, PLP_DWT_WAVELET_SYM14, PLP_DWT_WAVELET_SYM15, PLP_DWT_WAVELET_SYM16, PLP_DWT_WAVELET_SYM17, PLP_DWT_WAVELET_SYM18, PLP_DWT_WAVELET_SYM19, PLP_DWT_WAVELET_SYM20, PLP_DWT_WAVELET_COIF1, PLP_DWT_WAVELET_COIF2, PLP_DWT_WAVELET_COIF3, PLP_DWT_WAVELET_COIF4, PLP_DWT_WAVELET_COIF5, PLP_DWT_WAVELET_COIF6, PLP_DWT_WAVELET_COIF7, PLP_DWT_WAVELET_COIF8, PLP_DWT_WAVELET_COIF9, PLP_DWT_WAVELET_COIF10, PLP_DWT_WAVELET_COIF11, PLP_DWT_WAVELET_COIF12, PLP_DWT_WAVELET_COIF13, PLP_DWT_WAVELET_COIF14, PLP_DWT_WAVELET_COIF15, PLP_DWT_WAVELET_COIF16, PLP_DWT_WAVELET_COIF17}
enum	plp_dwt_extension_mode { PLP_DWT_MODE_ZERO, PLP_DWT_MODE_CONSTANT, PLP_DWT_MODE_SYMMETRIC, PLP_DWT_MODE_REFLECT, PLP_DWT_MODE_PERIODIC, PLP_DWT_MODE_ANTISYMMETRIC, PLP_DWT_MODE_ANTIREFLECT}
typedef float	float32_t

Functions

	Name
uint32_t	plp_dwt_max_level(uint32_t sig_len, uint32_t wavelet_len) Computes maximum available decomposition level for a signal length and wavelet length.
uint32_t	plp_dwt_dec_len(uint32_t sig_len, uint32_t wavelet_len, uint32_t level) Calculates decomposition output length given a level.
void	plp_dot_prod_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t nPE, int32_t *restrict pRes) Glue code for parallel dot product of 32-bit integer vectors.
void	plp_dot_prod_q32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, uint32_t nPE, int32_t *restrict pRes) Glue code for parallel dot product of 32-bit fixed point vectors.
void	plp_dot_prod_f32_parallel(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, uint32_t nPE, float32_t *restrict pRes) Glue code for parallel dot product of 32-bit float vectors.
void	plp_dot_prod_i32p_xpulpv2(void * S) Parallel dot product with interleaved access of 32-bit integer vectors kernel for XPULPV2 extension.
void	plp_dot_prod_q32p_xpulpv2(void * S) Parallel dot product with interleaved access of 32-bit fixed point vectors kernel for XPULPV2 extension.
void	plp_dot_prod_f32p_xpulpv2(void * S) Parallel dot product with interleaved access of 32-bit float vectors kernel for XPULPV2 extension.
void	plp_dot_prod_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Glue code for dot product of 32-bit integer vectors.
void	plp_dot_prod_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Scalar dot product of 32-bit integer vectors kernel for RV32IM extension.
void	plp_dot_prod_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Scalar dot product of 32-bit integer vectors kernel for XPULPV2 extension.
void	plp_dot_prod_q32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Glue code for dot product of 32-bit fixed point vectors.
void	plp_dot_prod_q32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Scalar dot product of 32-bit fixed point vectors kernel for RV32IM extension.
void	plp_dot_prod_q32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Scalar dot product of 32-bit fixed point vectors kernel for XPULPV2 extension.
void	plp_dot_prod_f32(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) Glue code for dot product of 32-bit float vectors.
void	plp_dot_prod_f32s_xpulpv2(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) Scalar dot product of 32-bit float vectors kernel for XPULPV2 extension.
void	plp_dot_prod_f32s_rv32im(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) Scalar dot product of 32-bit float vectors kernel for RV32IM extension.
void	plp_dot_prod_i16(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t blockSize, int32_t *restrict pRes) Glue code for dot product of 16-bit integer vectors.
void	plp_dot_prod_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Vectorized dot product of 16-bit integer vectors kernel for RV32IM extension.
void	plp_dot_prod_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Vectorized dot product of 16-bit integer vectors kernel singlecore for XPULPV2 extension.
void	plp_dot_prod_q16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Glue code for dot product of 16-bit fixed point vectors.
void	plp_dot_prod_q16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Scalar dot product of 16-bit fixed point vectors kernel for RV32IM extension.
void	plp_dot_prod_q16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Vectorized dot product of 16-bit fixed point vectors singlecore kernel for XPULPV2 extension.
void	plp_dot_prod_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Glue code for dot product of 8-bit integer vectors.
void	plp_dot_prod_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Vectorized dot product of 8-bit integer vectors kernel for RV32IM extension.
void	plp_dot_prod_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Vectorized dot product of 8-bit integer vectors singlecore kernel for XPULPV2 extension.
void	plp_dot_prod_q8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Glue code for dot product of 8-bit fixed point vectors.
void	plp_dot_prod_q8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Scalar dot product of 8-bit fixed point vectors kernel for RV32IM extension.
void	plp_dot_prod_q8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Scalar dot product of 8-bit fixed point vectors singlecore kernel for XPULPV2 extension.
void	plp_abs_i32(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) Glue code for absolute value of 32-bit integer vectors.
void	plp_abs_i32s_rv32im(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) Element-by-element absolute value of 32-bit integer vectors kernel for RV32IM extension.
void	plp_abs_i32s_xpulpv2(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) Element-by-element absolute value of 32-bit integer vectors kernel for XPULPV2 extension.
void	plp_abs_i16(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) Glue code for absolute value of 16-bit integer vectors.
void	plp_abs_i16s_rv32im(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) Element-by-element absolute value of 16-bit integer vectors kernel for RV32IM extension.
void	plp_abs_i16s_xpulpv2(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) Element-by-element absolute value of 16-bit integer vectors kernel for XPULPV2 extension.
void	plp_abs_i8(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) Glue code for absolute value of 8-bit integer vectors.
void	plp_abs_i8s_rv32im(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) Element-by-element absolute value of 8-bit integer vectors kernel for RV32IM extension.
void	plp_abs_i8s_xpulpv2(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) Element-by-element absolute value of 8-bit integer vectors kernel for XPULPV2 extension.
void	plp_add_i32(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element addition of 32-bit integer vectors.
void	plp_add_i32s_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 32-bit integer vectors kernel for RV32IM extension.
void	plp_add_i32s_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 32-bit integer vectors kernel for XPULPV2 extension.
void	plp_add_i16(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element addition of 16-bit integer vectors.
void	plp_add_i16s_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 16-bit integer vectors kernel for RV32IM extension.
void	plp_add_i16s_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 16-bit integer vectors kernel for XPULPV2 extension.
void	plp_add_i8(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element addition of 8-bit integer vectors.
void	plp_add_i8s_rv32im(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 8-bit integer vectors kernel for RV32IM extension.
void	plp_add_i8s_xpulpv2(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 8-bit integer vectors kernel for XPULPV2 extension.
void	plp_mult_i32(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element multiplication of 32-bit integer vectors.
void	plp_mult_i32s_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 32-bit integer vectors kernel for RV32IM extension.
void	plp_mult_i32s_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 32-bit integer vectors kernel for XPULPV2 extension.
void	plp_mult_i16(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element multiplication of 16-bit integer vectors.
void	plp_mult_i16s_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 16-bit integer vectors kernel for RV32IM extension.
void	plp_mult_i16s_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 16-bit integer vectors kernel for XPULPV2 extension.
void	plp_mult_i8(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element multiplication of 8-bit integer vectors.
void	plp_mult_i8s_rv32im(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 8-bit integer vectors kernel for RV32IM extension.
void	plp_mult_i8s_xpulpv2(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 8-bit integer vectors kernel for XPULPV2 extension.
void	plp_mult_f32(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize) Glue code for element-by-element multiplication of 32-bit float vectors.
void	plp_mult_f32s_xpulpv2(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 32-bit float vectors kernel for XPULPV2 extension.
void	plp_mult_f32_parallel(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, uint32_t nPE, float32_t *restrict pDst) Glue code for parallel multiplication of 32-bit float vectors.
void	plp_mult_f32p_xpulpv2(void * S) Parallel multiplication with interleaved access of 32-bit float vectors kernel for XPULPV2 extension.
void	plp_log_f32_parallel(const float32_t *restrict pSrc, uint32_t blockSize, uint32_t nPE, float32_t *restrict pDst) Glue code for parallel log of 32-bit float vectors.
void	plp_log_f32p_xpulpv2(void * S) Parallel log with interleaved access of 32-bit float vectors kernel for XPULPV2 extension.
void	plp_negate_i32(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) Glue code of negate the elements of a vector for 32-bit integers.
void	plp_negate_i32s_rv32im(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) negate the elements of a vector for 32-bit integers on RV32IM
void	plp_negate_i32s_xpulpv2(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) negate the elements of a vector for 32-bit integers on XpulpV2
void	plp_negate_i16(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) Glue code of negate the elements of a vector for 16-bit integers.
void	plp_negate_i16s_rv32im(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) negate the elements of a vector for 16-bit integers on RV32IM
void	plp_negate_i16s_xpulpv2(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) negate the elements of a vector for 16-bit integers on XpulpV2
void	plp_negate_i8(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) Glue code of negate the elements of a vector for 8-bit integers.
void	plp_negate_i8s_rv32im(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) negate the elements of a vector for 8-bit integers on RV32IM
void	plp_negate_i8s_xpulpv2(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) negate the elements of a vector for 8-bit integers on XpulpV2
void	plp_negate_f32(const float32_t * pSrc, float32_t * pDst, uint32_t blockSize) Glue code of negate the elements of a vector for 32-bit floats.
void	plp_negate_f32s_xpulpv2(const float32_t * pSrc, float32_t * pDst, uint32_t blockSize) negate the elements of a vector for 32-bit floats on XpulpV2
void	plp_offset_i32(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize) Glue code of add a constant offset to a vector for 32-bit integers.
void	plp_offset_i32s_rv32im(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize) add a constant offset to a vector for 32-bit integers on RV32IM
void	plp_offset_i32s_xpulpv2(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize) add a constant offset to a vector for 32-bit integers on XpulpV2
void	plp_offset_i16(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize) Glue code of add a constant offset to a vector for 16-bit integers.
void	plp_offset_i16s_rv32im(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize) add a constant offset to a vector for 16-bit integers on RV32IM
void	plp_offset_i16s_xpulpv2(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize) add a constant offset to a vector for 16-bit integers on XpulpV2
void	plp_offset_i8(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize) Glue code of add a constant offset to a vector for 8-bit integers.
void	plp_offset_i8s_rv32im(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize) add a constant offset to a vector for 8-bit integers on RV32IM
void	plp_offset_i8s_xpulpv2(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize) add a constant offset to a vector for 8-bit integers on XpulpV2
void	plp_offset_f32(const float32_t * pSrc, float32_t offset, float32_t * pDst, uint32_t blockSize) Glue code of add a constant offset to a vector for 32-bit floats.
void	plp_offset_f32s_xpulpv2(const float32_t * pSrc, float32_t offset, float32_t * pDst, uint32_t blockSize) add a constant offset to a vector for 32-bit floats on XpulpV2
void	plp_sub_i32(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code of vector subtraction for 32-bit integers.
void	plp_sub_i32s_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 32-bit integers on RV32IM
void	plp_sub_i32s_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 32-bit integers on XpulpV2
void	plp_sub_i16(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code of vector subtraction for 16-bit integers.
void	plp_sub_i16s_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 16-bit integers on RV32IM
void	plp_sub_i16s_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 16-bit integers on XpulpV2
void	plp_sub_i8(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code of vector subtraction for 8-bit integers.
void	plp_sub_i8s_rv32im(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 8-bit integers on RV32IM
void	plp_sub_i8s_xpulpv2(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 8-bit integers on XpulpV2
void	plp_sub_f32(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize) Glue code of vector subtraction for 32-bit floats.
void	plp_sub_f32s_xpulpv2(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize) vector subtraction for 32-bit floats on XpulpV2
void	plp_scale_i32(const int32_t *restrict pSrc, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst, uint32_t blockSize) Glue code of multiply a vector by a scalar for 32-bit integers.
void	plp_scale_i32s_rv32im(const int32_t *restrict pSrc, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 32-bit integers on RV32IM
void	plp_scale_i32s_xpulpv2(const int32_t *restrict pSrc, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 32-bit integers on XpulpV2
void	plp_scale_i16(const int16_t *restrict pSrc, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst, uint32_t blockSize) Glue code of multiply a vector by a scalar for 16-bit integers.
void	plp_scale_i16s_rv32im(const int16_t *restrict pSrc, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 16-bit integers on RV32IM
void	plp_scale_i16s_xpulpv2(const int16_t *restrict pSrc, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 16-bit integers on XpulpV2
void	plp_scale_i8(const int8_t *restrict pSrc, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst, uint32_t blockSize) Glue code of multiply a vector by a scalar for 8-bit integers.
void	plp_scale_i8s_rv32im(const int8_t *restrict pSrc, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 8-bit integers on RV32IM
void	plp_scale_i8s_xpulpv2(const int8_t *restrict pSrc, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 8-bit integers on XpulpV2
void	plp_scale_f32(const float32_t *restrict pSrc, float32_t scaleFactor, float32_t *restrict pDst, uint32_t blockSize) Glue code of multiply a vector by a scalar for 32-bit floats.
void	plp_scale_f32s_xpulpv2(const float32_t *restrict pSrc, float32_t scaleFactor, float32_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 32-bit floats on XpulpV2
void	plp_fill_i32(int32_t value, int32_t *restrict pDst, uint32_t blockSize) Glue code for filling a constant value into a 32-bit integer vector.
void	plp_fill_i32s_rv32im(int32_t value, int32_t *restrict pDst, uint32_t blockSize) Fills a constant value into a 32-bit integer vector for RV32IM extension.
void	plp_fill_i32s_xpulpv2(int32_t value, int32_t *restrict pDst, uint32_t blockSize) Fills a constant value into a 32-bit integer vector for XPULPV2 extension.
void	plp_copy_i32(int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t blockSize) Glue code for copying the elements of a 32-bit integer vector.
void	plp_copy_i32s_rv32im(int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t blockSize) Copies the elements of a 32-bit integer vector for RV32IM extension.
void	plp_copy_i32s_xpulpv2(int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t blockSize) Copies the elements of a 32-bit integer vector for XPULPV2 extension.
void	plp_copy_f32(float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t blockSize) Glue code for copying the elements of a 32-bit float vector.
void	plp_copy_f32s_xpulpv2(float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t blockSize) Copies the elements of a 32-bit float vector for XPULPV2 extension.
void	plp_copy_f32s_rv32im(float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t blockSize) Copies the elements of a 32-bit float vector for RV32IM extension.
void	plp_mean_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for mean value of a 32-bit float vector.
void	plp_mean_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for mean value of a 32-bit float vector.
void	plp_mean_i32(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for mean value of a 32-bit integer vector.
void	plp_mean_i32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Mean value of a 32-bit integer vector for RV32IM extension.
void	plp_mean_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Mean value of a 32-bit integer vector for XPULPV2 extension.
void	plp_mean_i16(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Glue code for mean value of a 16-bit integer vector.
void	plp_mean_i16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Mean value of a 16-bit integer vector for RV32IM extension.
void	plp_mean_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Mean value of a 16-bit integer vector for XPULPV2 extension.
void	plp_mean_i8(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Glue code for mean value of an 8-bit integer vector.
void	plp_mean_i8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Mean value of an 8-bit integer vector for RV32IM extension.
void	plp_mean_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Mean value of an 8-bit integer vector for XPULPV2 extension.
void	plp_max_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for max value of a 32-bit float vector.
void	plp_max_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for max value of a 32-bit float vector.
void	plp_max_i32(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for max value of a 32-bit integer vector.
void	plp_max_i32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Max value of a 32-bit integer vector for RV32IM extension.
void	plp_max_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Max value of a 32-bit integer vector for XPULPV2 extension.
void	plp_max_i16(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Glue code for max value of a 16-bit integer vector.
void	plp_max_i16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Max value of a 16-bit integer vector for RV32IM extension.
void	plp_max_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Max value of a 16-bit integer vector for XPULPV2 extension.
void	plp_max_i8(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Glue code for max value of an 8-bit integer vector.
void	plp_max_i8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Max value of an 8-bit integer vector for RV32IM extension.
void	plp_max_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Max value of an 8-bit integer vector for XPULPV2 extension.
void	plp_min_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for min value of a 32-bit float vector.
void	plp_min_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for min value of a 32-bit float vector.
void	plp_min_i32(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for min value of a 32-bit integer vector.
void	plp_min_i32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Min value of a 32-bit integer vector for RV32IM extension.
void	plp_min_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Min value of a 32-bit integer vector for XPULPV2 extension.
void	plp_min_i16(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Glue code for min value of a 16-bit integer vector.
void	plp_min_i16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Min value of a 16-bit integer vector for RV32IM extension.
void	plp_min_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Min value of a 16-bit integer vector for XPULPV2 extension.
void	plp_min_i8(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Glue code for min value of an 8-bit integer vector.
void	plp_min_i8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Min value of an 8-bit integer vector for RV32IM extension.
void	plp_min_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Min value of an 8-bit integer vector for XPULPV2 extension.
void	plp_power_f32_parallel(const float32_t *restrict pSrc, uint32_t blockSize, uint32_t nPE, float32_t *restrict pRes) Glue code for parallel power of 32-bit floating point vectors.
void	plp_power_f32p_xpulpv2(void * S) Parallel sum of squares of a 32-bit float vector for XPULPV2 extension.
void	plp_power_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for Sum of squares of a 32-bit float vector.
void	plp_power_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for Sum of squares of a 32-bit float vector.
void	plp_power_f32s_rv32im(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Sum of squares of a 32-bit float vector for RV32IM.
void	plp_power_i32(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for Sum of squares of a 32-bit integer vector.
void	plp_power_i32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of a 32-bit integer vector for RV32IM extension.
void	plp_power_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of a 32-bit integer vector for XPULPV2 extension.
void	plp_power_i16(const int16_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for Sum of squares of a 16-bit integer vector.
void	plp_power_i16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of a 16-bit integer vector for RV32IM extension.
void	plp_power_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of a 16-bit integer vector for XPULPV2 extension.
void	plp_power_i8(const int8_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for Sum of squares of an 8-bit integer vector.
void	plp_power_i8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of an 8-bit integer vector for RV32IM extension.
void	plp_power_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of an 8-bit integer vector for XPULPV2 extension.
void	plp_power_q32_parallel(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, uint32_t nPE, int32_t *restrict pRes) Glue code for parallel power of 32-bit fixed point vectors.
void	plp_power_q32p_xpulpv2(void * S) Parallel sum of squares of a 32-bit fixed-point vector for XPULPV2 extension.
void	plp_power_q32(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Sum of squares of a 32-bit fixed point vector.
void	plp_power_q32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of a 32-bit fixed point vector for RV32IM extension.
void	plp_power_q32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of a 32-bit fixed point vector for XPULPV2 extension.
void	plp_power_q16(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Sum of squares of a 16-bit fixed point vector.
void	plp_power_q16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of a 16-bit fixed point vector for RV32IM extension.
void	plp_power_q16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of a 16-bit fixed point vector for XPULPV2 extension.
void	plp_power_q8(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Sum of squares of an 8-bit fixed point vector.
void	plp_power_q8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of an 8-bit fixed point vector for RV32IM extension.
void	plp_power_q8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of an 8-bit fixed point vector for XPULPV2 extension.
void	plp_var_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for Statistical variance of a 32-bit float vector.
void	plp_var_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for Statistical variance of a 32-bit float vector.
void	plp_var_q32(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Statistical variance of a 32-bit fixed point vector.
void	plp_var_q32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Statistical variance of a 32-bit fixed point vector for RV32IM extension.
void	plp_var_q32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Statistical variance of a 32-bit fixed point vector for XPULPV2 extension.
void	plp_var_q16(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Glue code for Statistical variance of a 16-bit fixed point vector.
void	plp_var_q16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Statistical variance of a 16-bit fixed point vector for RV32IM extension.
void	plp_var_q16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Statistical variance of a 16-bit fixed point vector for XPULPV2 extension.
void	plp_var_q8(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Glue code for Statistical variance of an 8-bit fixed point vector.
void	plp_var_q8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Statistical variance of an 8-bit fixed point vector for RV32IM extension.
void	plp_var_q8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Statistical variance of an 8-bit fixed point vector for XPULPV2 extension.
void	plp_std_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for Statistical standard deviation of a 32-bit floating point vector.
void	plp_std_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for Statistical standard deviation of a 32-bit float vector.
void	plp_std_q32(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Statistical standard deviation of a 32-bit fixed point vector.
void	plp_std_q32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Statistical standard deviation of a 32-bit fixed point vector for RV32IM extension.
void	plp_std_q32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Statistical standard deviation of a 32-bit fixed point vector for XPULPV2 extension.
void	plp_std_q16(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Glue code for Statistical standard deviation of a 16-bit fixed point vector.
void	plp_std_q16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Statistical standard deviation of a 16-bit fixed point vector for RV32IM extension.
void	plp_std_q16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Statistical standard deviation of a 16-bit fixed point vector for XPULPV2 extension.
void	plp_std_q8(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Glue code for Statistical standard deviation of an 8-bit fixed point vector.
void	plp_std_q8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Statistical standard deviation of an 8-bit fixed point vector for RV32IM extension.
void	plp_std_q8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Statistical standard deviation of an 8-bit fixed point vector for XPULPV2 extension.
void	plp_rms_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for root mean square (RMS) of a 32-bit floating point vector.
void	plp_rms_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for root mean square (RMS) of a 32-bit float vector.
void	plp_rms_q32(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for root mean square (RMS) of a 32-bit fixed point vector.
void	plp_rms_q32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Root mean square (RMS) of a 32-bit fixed point vector for RV32IM extension.
void	plp_rms_q32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Root mean square (RMS) of a 32-bit fixed point vector for XPULPV2 extension.
void	plp_rms_q16(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Glue code for root mean square (RMS) of a 16-bit fixed point vector.
void	plp_rms_q16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Root mean square (RMS) of a 16-bit fixed point vector for RV32IM extension.
void	plp_rms_q16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Root mean square (RMS) of a 16-bit fixed point vector for XPULPV2 extension.
void	plp_rms_q8(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Glue code for root mean square (RMS) of an 8-bit fixed point vector.
void	plp_rms_q8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Root mean square (RMS) of an 8-bit fixed point vector for RV32IM extension.
void	plp_rms_q8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Root mean square (RMS) of an 8-bit fixed point vector for XPULPV2 extension.
void	plp_sqrt_q32(const int32_t *restrict pSrc, const uint32_t fracBits, int32_t *restrict pRes) Glue code for square root of a 32-bit fixed point number.
void	plp_sqrt_q32s_rv32im(const int32_t *restrict pSrc, const uint32_t fracBits, int32_t *restrict pRes) Square root of a 32-bit fixed point number for RV32IM extension.
void	plp_sqrt_q32s_xpulpv2(const int32_t *restrict pSrc, const uint32_t fracBits, int32_t *restrict pRes) Square root of a 32-bit fixed point number for XPULPV2 extension.
void	plp_sqrt_q16(const int16_t *restrict pSrc, const uint32_t fracBits, int16_t *restrict pRes) Glue code for square root of a 16-bit fixed point number.
void	plp_sqrt_q16s_rv32im(const int16_t *restrict pSrc, const uint32_t fracBits, int16_t *restrict pRes) Square root of a 16-bit fixed point number for RV32IM extension.
void	plp_sqrt_q16s_xpulpv2(const int16_t *restrict pSrc, const uint32_t fracBits, int16_t *restrict pRes) Square root of a 16-bit fixed point number for XPULPV2 extension.
void	plp_sqrt_f32(const float *restrict pSrc, float *restrict pRes) Glue code for square root of a 32-bit floating point number.
void	plp_sqrt_f32s_rv32im(const float *restrict pSrc, float *restrict pRes) Square root of a 32-bit floating point number for RV32IM.
void	plp_sqrt_f32s_xpulpv2(const float *restrict pSrc, float *restrict pRes) Kernel for square root of a 32-bit floating point number.
int32_t	plp_cos_q32(int32_t x) Glue code for q32 cosine function.
int32_t	plp_cos_q32s_rv32im(int32_t x) q32 cosine function for RV32IM
int32_t	plp_cos_q32s_xpulpv2(int32_t x) q32 cosine function for XPULPV2
int16_t	plp_cos_q16(int16_t x) Glue code for q16 cosine function.
int16_t	plp_cos_q16s_rv32im(int16_t x) q16 cosine function for RV32IM
int16_t	plp_cos_q16s_xpulpv2(int16_t x) q16 cosine function for XPULPV2
float32_t	plp_cos_f32(float32_t x) Glue code for f32 cosine function.
float32_t	plp_cos_f32s_xpulpv2(float32_t x) F32 cosine function for XPULPV2.
int32_t	plp_sin_q32(int32_t x) Glue code for q32 sine function.
int32_t	plp_sin_q32s_rv32im(int32_t x) q32 sine function for RV32IM
int32_t	plp_sin_q32s_xpulpv2(int32_t x) q32 sine function for XPULPV2
int16_t	plp_sin_q16(int16_t x) Glue code for q16 sine function.
int16_t	plp_sin_q16s_rv32im(int16_t x) q16 sine function for RV32IM
int16_t	plp_sin_q16s_xpulpv2(int16_t x) q16 sine function for XPULPV2
float32_t	plp_sin_f32(float32_t x) Glue code for f32 sine function.
float32_t	plp_sin_f32s_xpulpv2(float32_t x) F32 sine function for XPULPV2.
void	plp_correlate_i32(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for correlation of 32-bit integer vectors.
void	plp_correlate_i32s_rv32im(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Correlation of 32-bit integer vectors kernel for RV32IM extension.
void	plp_correlate_i32s_xpulpv2(const int32_t *restrict pSrcA, const uint32_t srcALen, const int32_t *restrict pSrcB, const uint32_t srcBLen, int32_t *restrict pRes) Correlation of 32-bit integer vectors kernel for XPULPV2 extension.
void	plp_correlate_i16(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for correlation of 16-bit integer vectors.
void	plp_correlate_i16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Correlation of 16-bit integer vectors kernel for XPULPV2 extension.
void	plp_correlate_i16s_rv32im(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Correlation of 16-bit integer vectors kernel for RV32IM extension.
void	plp_correlate_i8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for correlation of 8-bit integer vectors.
void	plp_correlate_valid_i8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for correlation (valid) of 8-bit integer vectors.
void	plp_correlate_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Correlation of 8-bit integer vectors kernel for XPULPV2 extension.
void	plp_correlate_i8s_rv32im(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Correlation of 8-bit integer vectors kernel for RV32IM extension.
void	plp_correlate_q32(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Glue code for correlation of 32-bit fixed point vectors.
void	plp_correlate_q32s_rv32im(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Correlation of 32-bit fixed point vectors kernel for RV32IM extension.
void	plp_correlate_q32s_xpulpv2(const int32_t *restrict pSrcA, const uint32_t srcALen, const int32_t *restrict pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t *restrict pRes) Correlation of 32-bit fixed point vectors kernel for XPULPV2 extension.
void	plp_correlate_q16(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Glue code for correlation of 16-bit fixed point vectors.
void	plp_correlate_q16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Correlation of 16-bit fixed point vectors kernel for XPULPV2 extension.
void	plp_correlate_q16s_rv32im(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Correlation of 16-bit fixed point vectors kernel for RV32IM extension.
void	plp_correlate_q8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Glue code for correlation of 8-bit fixed point vectors.
void	plp_correlate_valid_q8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Glue code for correlation (valid) of 8-bit fixed point vectors.
void	plp_correlate_q8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Correlation of 8-bit fixed point vectors kernel for XPULPV2 extension.
void	plp_correlate_q8s_rv32im(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Correlation of 8-bit fixed point vectors kernel for RV32IM extension.
void	plp_conv_i32(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution of 32-bit integer vectors.
void	plp_conv_valid_i32(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution (valid) of 32-bit integer vectors.
void	plp_conv_i32s_rv32im(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 32-bit integer vectors kernel for RV32IM extension.
void	plp_conv_i32s_xpulpv2(const int32_t *restrict pSrcA, const uint32_t srcALen, const int32_t *restrict pSrcB, const uint32_t srcBLen, int32_t *restrict pRes) Convolution of 32-bit integer vectors kernel for XPULPV2 extension.
void	plp_conv_valid_i32s_xpulpv2(const int32_t *restrict pSrcA, const uint32_t srcALen, const int32_t *restrict pSrcB, const uint32_t srcBLen, int32_t *restrict pRes) Convolution (valid) of 32-bit integer vectors kernel for XPULPV2 extension.
void	plp_conv_i16(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution of 16-bit integer vectors.
void	plp_conv_valid_i16(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution (valid) of 16-bit integer vectors.
void	plp_conv_valid_rep_i16(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution (valid with replication) of 16-bit integer vectors.
void	plp_conv_i16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 16-bit integer vectors kernel for XPULPV2 extension.
void	plp_conv_valid_i16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution (valid) of 16-bit integer vectors kernel for XPULPV2 extension.
void	plp_conv_valid_rep_i16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const uint32_t srcAMem, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution (valid with data replication) of 16-bit integer vectors kernel for XPULPV2 extension.
void	plp_conv_i16s_rv32im(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 16-bit integer vectors kernel for RV32IM extension.
void	plp_conv_i8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution of 8-bit integer vectors.
void	plp_conv_valid_i8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution (valid) of 8-bit integer vectors.
void	plp_conv_valid_rep_i8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution (valid with data replication) of 8-bit integer vectors.
void	plp_conv_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 8-bit integer vectors kernel for XPULPV2 extension.
void	plp_conv_valid_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution (valid) of 8-bit integer vectors kernel for XPULPV2 extension.
void	plp_conv_valid_rep_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const uint32_t srcAMem, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution (valid with data replication) of 8-bit integer vectors kernel for XPULPV2 extension.
void	plp_conv_i8s_rv32im(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 8-bit integer vectors kernel for RV32IM extension.
void	plp_conv_i32_parallel(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, const uint8_t nPE, int32_t * pRes) Glue code for parallel convolution of 32-bit integer vectors.
void	plp_conv_i32p_xpulpv2(void * task_args) Setup code for parallel convolution of 32-bit integer vectors.
void	plp_conv_i16_parallel(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, const uint8_t nPE, int32_t * pRes) Glue code for parallel convolution of 16-bit integer vectors.
void	plp_conv_i16p_xpulpv2(void * task_args) Setup code for parallel convolution of 16-bit integer vectors.
void	plp_conv_i8_parallel(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, const uint8_t nPE, int32_t * pRes) Glue code for parallel convolution of 8-bit integer vectors.
void	plp_conv_i8p_xpulpv2(void * task_args) Setup code for parallel convolution of 8-bit integer vectors.
void	plp_conv_parallel_OLA(uint32_t nPE, uint32_t srcALen, uint32_t srcBLen, int32_t * resultsBuffer) Helper function for parallelized overlap-adding of partial convolution results.
void	plp_conv_parallel_OLA_kernel(void * task_args) Helper function for parallelized overlap-adding of partial convolution results.
void	plp_mat_mult_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix-matrix multiplication of 32-bit integer matrices.
void	plp_mat_mult_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 32-bit integer matrices for RV32IM extension.
void	plp_mat_mult_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix-matrix multiplication of 16-bit integer matrices.
void	plp_mat_mult_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 16-bit integer matrices for RV32IM extension.
void	plp_mat_mult_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix-matrix multiplication of 8-bit integer matrices.
void	plp_mat_mult_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 8-bit integer matrices for RV32IM extension.
void	plp_mat_mult_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix-matrix multiplication of 32-bit integer matrices.
void	plp_mat_mult_i32p_xpulpv2(void * args) Parallel matrix-matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix-matrix multiplication of 16-bit integer matrices.
void	plp_mat_mult_i16p_xpulpv2(void * args) Parallel matrix multiplication of 16-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_mult_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix-matrix multiplication of 8-bit integer matrices.
void	plp_mat_mult_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Glue code for matrix-matrix multiplication of 32-bit floating-point matrices.
void	plp_mat_mult_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Matrix-matrix multiplication of 32-bit floating-point matrices for XPULPV2 extension.
void	plp_mat_mult_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, float *restrict pDstC) Glue code for parallel matrix-matrix multiplication of 32-bit floating-point matrices.
void	plp_mat_mult_f32p_xpulpv2(void * args) Parallel matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_i8p_xpulpv2(void * args) Parallel matrix multiplication of 8-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_mult_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Glue code for matrix matrix multiplication of a 32-bit fix-point matrices.
void	plp_mat_mult_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix matrix multiplication of a 32-bit fix-point matrices.
void	plp_mat_mult_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Matrix matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Matrix matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_q32p_xpulpv2(void * args) Parallel matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Glue code for matrix matrix multiplication of a 16-bit fix-point matrices.
void	plp_mat_mult_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code for parallel matrix matrix multiplication of a 16-bit fix-point matrices.
void	plp_mat_mult_q16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Matrix matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_q16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Matrix matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_q16p_xpulpv2(void * args) Parallel matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Glue code for matrix matrix multiplication of a 8-bit fix-point matrices.
void	plp_mat_mult_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code for parallel matrix matrix multiplication of a 8-bit fix-point matrices.
void	plp_mat_mult_q8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Matrix matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_q8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Matrix matrix multiplication of a 8-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_q8p_xpulpv2(void * args) Parallel matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_cmplx_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 32-bit integers.
void	plp_mat_mult_cmplx_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 32-bit integers on RV32IM.
void	plp_mat_mult_cmplx_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 32-bit integers on XpulpV2.
void	plp_mat_mult_cmplx_i32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 32-bit integers.
void	plp_mat_mult_cmplx_i32p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 32-bit integers on XpulpV2
void	plp_mat_mult_cmplx_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 16-bit integers.
void	plp_mat_mult_cmplx_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 16-bit integers on RV32IM.
void	plp_mat_mult_cmplx_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 16-bit integers on XpulpV2.
void	plp_mat_mult_cmplx_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 16-bit integers.
void	plp_mat_mult_cmplx_i16p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 16-bit integers on XpulpV2
void	plp_mat_mult_cmplx_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 8-bit integers.
void	plp_mat_mult_cmplx_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 8-bit integers on RV32IM.
void	plp_mat_mult_cmplx_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 8-bit integers on XpulpV2.
void	plp_mat_mult_cmplx_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 8-bit integers.
void	plp_mat_mult_cmplx_i8p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 8-bit integers on XpulpV2
void	plp_mat_mult_cmplx_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Glue code of matrix matrix multiplication for complex 32-bit floats.
void	plp_mat_mult_cmplx_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Matrix matrix multiplication for complex 32-bit floats on XpulpV2.
void	plp_mat_mult_cmplx_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, float *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 32-bit floats.
void	plp_mat_mult_cmplx_f32p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 32-bit floats on XpulpV2
void	plp_mat_mult_cmplx_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 32-bit fix-point.
void	plp_mat_mult_cmplx_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Matrix matrix multiplication for complex 32-bit fix-point on RV32IM.
void	plp_mat_mult_cmplx_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Matrix matrix multiplication for complex 32-bit fix-point on XpulpV2.
void	plp_mat_mult_cmplx_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 32-bit fix-point.
void	plp_mat_mult_cmplx_q32p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 32-bit fix-point on XpulpV2
void	plp_mat_mult_cmplx_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 16-bit fix-point.
void	plp_mat_mult_cmplx_q16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Matrix matrix multiplication for complex 16-bit fix-point on RV32IM.
void	plp_mat_mult_cmplx_q16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Matrix matrix multiplication for complex 16-bit fix-point on XpulpV2.
void	plp_mat_mult_cmplx_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 16-bit fix-point.
void	plp_mat_mult_cmplx_q16p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 16-bit fix-point on XpulpV2
void	plp_mat_mult_cmplx_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 8-bit fix-point.
void	plp_mat_mult_cmplx_q8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Matrix matrix multiplication for complex 8-bit fix-point on RV32IM.
void	plp_mat_mult_cmplx_q8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Matrix matrix multiplication for complex 8-bit fix-point on XpulpV2.
void	plp_mat_mult_cmplx_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 8-bit fix-point.
void	plp_mat_mult_cmplx_q8p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 8-bit fix-point on XpulpV2
void	plp_mat_mult_trans_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 32-bit integer matrices.
void	plp_mat_mult_trans_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 32-bit integer matrices for RV32IM extension.
void	plp_mat_mult_trans_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 16-bit integer matrices.
void	plp_mat_mult_trans_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 16-bit integer matrices for RV32IM extension.
void	plp_mat_mult_trans_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 8-bit integer matrices.
void	plp_mat_mult_trans_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 8-bit integer matrices for RV32IM extension.
void	plp_mat_mult_trans_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of 32-bit integer matrices.
void	plp_mat_mult_trans_i32p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 16-bit integer matrices.
void	plp_mat_mult_trans_i16p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 8-bit integer matrices.
void	plp_mat_mult_trans_i8p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 32-bit fix-point matrices.
void	plp_mat_mult_trans_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 32-bit fix-point matrices.
void	plp_mat_mult_trans_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) matrix transposed matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_trans_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) matrix transposed matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_trans_q32p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_trans_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 16-bit fix-point matrices.
void	plp_mat_mult_trans_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 16-bit fix-point matrices.
void	plp_mat_mult_trans_q16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) matrix transposed matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_trans_q16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) matrix transposed matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_trans_q16p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_trans_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 8-bit fix-point matrices.
void	plp_mat_mult_trans_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 8-bit fix-point matrices.
void	plp_mat_mult_trans_q8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) matrix transposed matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_trans_q8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) matrix transposed matrix multiplication of a 8-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_trans_q8p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_trans_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 32-bit floating-point matrices.
void	plp_mat_mult_trans_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) matrix transposed matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.
void	plp_mat_mult_trans_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, float *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 32-bit floating-point matrices.
void	plp_mat_mult_trans_f32p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_trans_cmplx_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 32-bit integers.
void	plp_mat_mult_trans_cmplx_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 32-bit integers on RV32IM
void	plp_mat_mult_trans_cmplx_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_i32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 32-bit integers.
void	plp_mat_mult_trans_cmplx_i32p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 16-bit integers.
void	plp_mat_mult_trans_cmplx_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 16-bit integers on RV32IM
void	plp_mat_mult_trans_cmplx_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 16-bit integers.
void	plp_mat_mult_trans_cmplx_i16p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 8-bit integers.
void	plp_mat_mult_trans_cmplx_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 8-bit integers on RV32IM
void	plp_mat_mult_trans_cmplx_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 8-bit integers.
void	plp_mat_mult_trans_cmplx_i8p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 32-bit floats.
void	plp_mat_mult_trans_cmplx_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2
void	plp_mat_mult_trans_cmplx_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, float *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 32-bit floats.
void	plp_mat_mult_trans_cmplx_f32p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2
void	plp_mat_mult_trans_cmplx_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 32-bit fix-point.
void	plp_mat_mult_trans_cmplx_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM
void	plp_mat_mult_trans_cmplx_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 32-bit fix-point.
void	plp_mat_mult_trans_cmplx_q32p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 16-bit fix-point.
void	plp_mat_mult_trans_cmplx_q16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM
void	plp_mat_mult_trans_cmplx_q16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 16-bit fix-point.
void	plp_mat_mult_trans_cmplx_q16p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 8-bit fix-point.
void	plp_mat_mult_trans_cmplx_q8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM
void	plp_mat_mult_trans_cmplx_q8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 8-bit fix-point.
void	plp_mat_mult_trans_cmplx_q8p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2
void	plp_cmplx_mag_f32(const float32_t * pSrc, float32_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in float32.
void	plp_cmplx_mag_f32s_xpulpv2(const float32_t * pSrc, float32_t * pRes, uint32_t numSamples) complex magnitude for float32 on XPULPV2
void	plp_cmplx_mag_q32(const int32_t * pSrc, const uint32_t fracBits, int32_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in 32-bit fixed-point.
void	plp_cmplx_mag_q32s_rv32im(const int32_t * pSrc, const uint32_t fracBits, int32_t * pRes, uint32_t numSamples) complex magnitude for q32 on RV32IM
void	plp_cmplx_mag_q32s_xpulpv2(const int32_t * pSrc, const uint32_t fracBits, int32_t * pRes, uint32_t numSamples) complex magnitude for q32 on XPULPV2
void	plp_cmplx_mag_q8(const int8_t * pSrc, const uint32_t fracBits, int8_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in 8-bit fixed-point.
void	plp_cmplx_mag_q8s_rv32im(const int8_t * pSrc, const uint32_t fracBits, int8_t * pRes, uint32_t numSamples) complex magnitude for q8 on RV32IM
void	plp_cmplx_mag_q8s_xpulpv2(const int8_t * pSrc, const uint32_t fracBits, int8_t * pRes, uint32_t numSamples) complex magnitude for q8 on XPULPV2
void	plp_cmplx_mag_i16(const int16_t * pSrc, int16_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in 16-bit integer.
void	plp_cmplx_mag_i16s_rv32im(const int16_t * pSrc, int16_t * pRes, uint32_t numSamples) complex magnitude for i16 on RV32IM
void	plp_cmplx_mag_i16s_xpulpv2(const int16_t * pSrc, int16_t * pRes, uint32_t numSamples) complex magnitude for i16 on XPULPV2
void	plp_cmplx_mag_i32(const int32_t * pSrc, int32_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in 32-bit integer.
void	plp_cmplx_mag_i32s_rv32im(const int32_t * pSrc, int32_t * pRes, uint32_t numSamples) complex magnitude for i32 on RV32IM
void	plp_cmplx_mag_i32s_xpulpv2(const int32_t * pSrc, int32_t * pRes, uint32_t numSamples) complex magnitude for i32 on XPULPV2
void	plp_cmplx_mag_i8(const int8_t * pSrc, int8_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in 8-bit integer.
void	plp_cmplx_mag_i8s_rv32im(const int8_t * pSrc, int8_t * pRes, uint32_t numSamples) complex magnitude for i8 on RV32IM
void	plp_cmplx_mag_i8s_xpulpv2(const int8_t * pSrc, int8_t * pRes, uint32_t numSamples) complex magnitude for i8 on XPULPV2
void	plp_cmplx_mag_q16(const int16_t * pSrc, const uint32_t fracBits, int16_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in 16-bit quantized integer.
void	plp_cmplx_mag_q16s_rv32im(const int16_t * pSrc, const uint32_t fracBits, int16_t * pRes, uint32_t numSamples) complex magnitude for q16 on RV32IM
void	plp_cmplx_mag_q16s_xpulpv2(const int16_t * pSrc, const uint32_t fracBits, int16_t * pRes, uint32_t numSamples) complex magnitude for q16 on XPULPV2
void	plp_bitreversal_16s_rv32im(uint16_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab) In-place 16 bit reversal function for RV32IM.
void	plp_bitreversal_16s_xpulpv2(uint16_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab) In-place 16 bit reversal function for XPULPV2.
void	plp_bitreversal_16p_xpulpv2(uint16_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab, uint32_t nPE) Parallel in-place 16 bit reversal function for XPULPV2.
void	plp_cfft_q16(const plp_cfft_instance_q16 * S, int16_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t deciPoint) Glue code for quantized 16 bit complex fast fourier transform.
void	plp_cfft_q16_parallel(const plp_cfft_instance_q16 * S, int16_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t deciPoint, uint32_t nPE) Glue code for parallel quantized 16 bit complex fast Fourier transform.
void	plp_cfft_q16s_rv32im(const plp_cfft_instance_q16 * S, int16_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t deciPoint) Quantized 16 bit complex fast fourier transform for RV32IM.
void	plp_cfft_q16s_xpulpv2(const plp_cfft_instance_q16 * S, int16_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t deciPoint) Quantized 16 bit complex fast fourier transform for XPULPV2.
void	plp_cfft_q16p_xpulpv2(void * args) Parallel quantized 16 bit complex fast fourier transform for XPULPV2.
void	plp_bitreversal_32s_rv32im(uint32_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab) In-place 32 bit reversal function for RV32IM.
void	plp_bitreversal_32s_xpulpv2(uint32_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab) In-place 32 bit reversal function for XPULPV2.
void	plp_bitreversal_32p_xpulpv2(uint32_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab, uint32_t nPE) Parallel in-place 32 bit reversal function for XPULPV2.
void	plp_cfft_q32(const plp_cfft_instance_q32 * S, int32_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t fracBits) Glue code for quantized 32-bit complex fast fourier transform.
void	plp_cfft_q32_parallel(const plp_cfft_instance_q32 * S, int32_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t fracBits, uint32_t nPE) Glue code for parallel quantized 32-bit complex fast Fourier transform.
void	plp_cfft_q32s_rv32im(const plp_cfft_instance_q32 * S, int32_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t fracBits) Quantized 32-bit complex fast fourier transform for RV32IM.
void	plp_cfft_q32s_xpulpv2(const plp_cfft_instance_q32 * S, int32_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t fracBits) Quantized 32-bit complex fast fourier transform for XPULPV2.
void	plp_cfft_q32p_xpulpv2(void * args) Parallel quantized 32 bit complex fast fourier transform for XPULPV2.
void	plp_rfft_f32(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, float32_t *restrict pDst) Floating-point FFT on real input data.
void	plp_rfft_f32_parallel(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, const uint32_t nPE, float32_t *restrict pDst) Floating-point FFT on real input data (parallel version).
void	plp_rfft_f32s_xpulpv2(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, float32_t *restrict pDst) Floating-point FFT on real input data for XPULPV2 extension.
void	plp_rfft_f32p_xpulpv2(void * arg) Floating-point FFT on real input data for XPULPV2 extension (parallel version).
void	plp_rfftfast_f32(const plp_fft_fast_instance_f32 * S, const float32_t *restrict pSrc, float32_t *restrict pDst) Floating-point FFT on real input data.
void	plp_rfftfast_f32_parallel(const plp_fft_fast_instance_f32 * S, float32_t *restrict pSrc, float32_t *restrict pDst, const uint32_t nPE) Floating-point parallel FFT on real input data.
void	plp_rfftfast_f32s_xpulpv2(const plp_fft_fast_instance_f32 * S, float32_t * pSrc, float32_t * pDst) Floating-point FFT on real input data for XPULPV2 extension.
void	plp_rfftfast_f32p_xpulpv2(void * arg) Floating-point parallel FFT on real input data for XPULPV2 extension.
void	plp_cfft_f32(const plp_cfft_instance_f32 * S, float32_t * pSrc, uint8_t ifftFlag, uint8_t bitReverseFlag) Floating-point FFT on complex input data.
void	plp_cfft_f32_parallel(const plp_cfft_instance_f32 * S, const float32_t * pSrc, uint8_t ifftFlag, uint8_t bitReverseFlag, const uint32_t nPE) Floating-point FFT on complex input data (parallel version).
void	plp_cfft_f32s_xpulpv2(const plp_cfft_instance_f32 * S, const float32_t * pSrc, uint8_t ifftFlag, uint8_t bitReverseFlag) Floating-point FFT on complex input data for XPULPV2 extension.
void	plp_cfft_f32p_xpulpv2(void * arg) Floating-point FFT on complex input data for XPULPV2 extension (parallel version).
void	plp_dct2_f32(const plp_fft_instance_f32 * S, const Complex_type_f32 * pShift, const uint8_t orthoNorm, const float32_t *restrict pSrc, float32_t *restrict pBuf, float32_t *restrict pDst) Floating-point DCT on real input data. Implementation of John Makhoul's "A Fast Cosine Transform in One and Two Dimensions" 1980 IEEE paper.
void	plp_dct2_f32_parallel(const plp_fft_instance_f32 * S, const Complex_type_f32 * pShift, const uint8_t orthoNorm, const float32_t *restrict pSrc, const uint32_t nPE, float32_t *restrict pBuf, float32_t *restrict pDst) Floating-point DCT on real input data. Implementation of John Makhoul's "A Fast Cosine Transform in One and Two Dimensions" 1980 IEEE paper.
void	plp_mfcc_f32(const plp_fft_instance_f32 * SFFT, const plp_fft_instance_f32 * SDCT, const Complex_type_f32 * pShift, const plp_triangular_filter_f32 * filterBank, const float32_t * window, const uint8_t * orthoNorm, const float32_t *restrict pSrc, float32_t *restrict pDst) MFCC on real input data.
void	plp_mfcc_f32_parallel(const plp_fft_instance_f32 * SFFT, const plp_fft_instance_f32 * SDCT, const Complex_type_f32 * pShift, const plp_triangular_filter_f32 * filterBank, const float32_t * window, const uint8_t * orthoNorm, const float32_t *restrict pSrc, const uint32_t nPE, float32_t *restrict pDst) MFCC on real input data.
void	plp_dwt_f32(const float32_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_f32 wavelet, plp_dwt_extension_mode mode, float32_t *restrict pDstA, float32_t *restrict pDstD) Glue code for floating-point DWT on real input data.
void	plp_dwt_q32(const int32_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q32 wavelet, plp_dwt_extension_mode mode, int32_t restrict pDstA, int32_t *restrict pDstD) 32bit Fixed-point DWT for XPULPV2 extension.
void	plp_dwt_q16(const int16_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q16 wavelet, plp_dwt_extension_mode mode, int16_t restrict pDstA, int16_t *restrict pDstD) 16bit Fixed-point DWT for XPULPV2 extension.
void	plp_dwt_q8(const int8_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q8 wavelet, plp_dwt_extension_mode mode, int8_t restrict pDstA, int8_t *restrict pDstD) 8bit Fixed-point DWT for XPULPV2 extension.
void	plp_dwt_dec_f32(const float32_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_f32 wavelet, plp_dwt_extension_mode mode, uint32_t level, float32_t restrict pTmp, float32_t *restrict pDst) Floating-point n-level DWT for XPULPV2 extension.
void	plp_dwt_dec_f32_parallel(const float32_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_f32 wavelet, plp_dwt_extension_mode mode, uint32_t level, uint32_t nPE, float32_t restrict pTemp, float32_t *restrict pDst) Floating-point parallel n-level DWT for XPULPV2 extension.
void	plp_dwt_f32s_xpulpv2(const float32_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_f32 wavelet, plp_dwt_extension_mode mode, float32_t restrict pDstA, float32_t *restrict pDstD) Floating-point DWT on real input data for XPULPV2 extension.
void	plp_dwt_haar_f32s_xpulpv2(const float32_t restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, float32_t restrict pDstA, float32_t *restrict pDstD) Floating-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
void	plp_dwt_q32s_xpulpv2(const int32_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q32 wavelet, plp_dwt_extension_mode mode, int32_t restrict pDstA, int32_t *restrict pDstD) 32bit Fixed-point DWT for XPULPV2 extension.
void	plp_dwt_haar_q32s_xpulpv2(const int32_t restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int32_t restrict pDstA, int32_t *restrict pDstD) 32bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
void	plp_dwt_q16s_xpulpv2(const int16_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q16 wavelet, plp_dwt_extension_mode mode, int16_t restrict pDstA, int16_t *restrict pDstD) 16bit Fixed-point DWT for XPULPV2 extension.
void	plp_dwt_haar_q16s_xpulpv2(const int16_t restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int16_t restrict pDstA, int16_t *restrict pDstD) 16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
void	plp_dwt_q8s_xpulpv2(const int8_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q8 wavelet, plp_dwt_extension_mode mode, int8_t restrict pDstA, int8_t *restrict pDstD) 8bit Fixed-point DWT for XPULPV2 extension.
void	plp_dwt_haar_q8s_xpulpv2(const int8_t restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int8_t restrict pDstA, int8_t *restrict pDstD) 8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
void	plp_dwt_f32_parallel(const float32_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_f32 wavelet, plp_dwt_extension_mode mode, uint32_t nPE, float32_t restrict pDstA, float32_t *restrict pDstD) Parallel Floating-point DWT on real input data for XPULPV2 extension.
void	plp_dwt_q8_parallel(const int8_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q8 wavelet, plp_dwt_extension_mode mode, uint32_t nPE, int8_t restrict pDstA, int8_t *restrict pDstD) 8bit Parallel Fixed-point DWT on real input data for XPULPV2 extension.
void	plp_dwt_q16_parallel(const int16_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q16 wavelet, plp_dwt_extension_mode mode, uint32_t nPE, int16_t restrict pDstA, int16_t *restrict pDstD) 16bit Parallel Fixed-point DWT on real input data for XPULPV2 extension.
void	plp_dwt_q32_parallel(const int32_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q32 wavelet, plp_dwt_extension_mode mode, uint32_t nPE, int32_t restrict pDstA, int32_t *restrict pDstD) 32bit Parallel Fixed-point DWT on real input data for XPULPV2 extension.
void	plp_dwt_f32p_xpulpv2(void * args) Floating-point DWT on real input data for XPULPV2 extension.
void	plp_dwt_haar_f32p_xpulpv2(void * args) Floating-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
void	plp_dwt_q8p_xpulpv2(void * args) Q7 fixed-point DWT for XPULPV2 extension.
void	plp_dwt_haar_q8p_xpulpv2(void * args) q7 fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension.
void	plp_dwt_q16p_xpulpv2(void * args) Q15 fixed-point DWT for XPULPV2 extension.
void	plp_dwt_haar_q16p_xpulpv2(void * args) q15 fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension.
void	plp_dwt_q32p_xpulpv2(void * arg) Q31 fixed-point DWT on real input data for XPULPV2 extension.
void	plp_dwt_haar_q32p_xpulpv2(void * args) Q31 Fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension.
void	plp_dwt_q32s_rv32im(const int32_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q32 wavelet, plp_dwt_extension_mode mode, int32_t restrict pDstA, int32_t *restrict pDstD) 32bit Fixed-point DWT.
void	plp_dwt_haar_q32s_rv32im(const int32_t restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int32_t restrict pDstA, int32_t *restrict pDstD) 32bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.
void	plp_dwt_q16s_rv32im(const int16_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q16 wavelet, plp_dwt_extension_mode mode, int16_t restrict pDstA, int16_t *restrict pDstD) 16bit Fixed-point DWT.
void	plp_dwt_haar_q16s_rv32im(const int16_t restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int16_t restrict pDstA, int16_t *restrict pDstD) 16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.
void	plp_dwt_q8s_rv32im(const int8_t restrict pSrc, uint32_t length, const plp_dwt_wavelet_q8 wavelet, plp_dwt_extension_mode mode, int8_t restrict pDstA, int8_t *restrict pDstD) 8bit Fixed-point DWT.
void	plp_dwt_haar_q8s_rv32im(const int8_t restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int8_t restrict pDstA, int8_t *restrict pDstD) 8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.
void	plp_mat_add_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) Glue code for matrix addition of 32-bit integer matrices.
void	plp_mat_add_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) matrix addition of a 32-bit integer matrices for RV32IM extension.
void	plp_mat_add_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) matrix addition of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_add_i32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel matrix addition of a 32-bit integer matrices.
void	plp_mat_add_i32p_xpulpv2(void * args) Parallel matrix addition of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_add_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) Glue code for matrix addition of a 16-bit integer matrices.
void	plp_mat_add_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) matrix addition of a 16-bit integer matrices for RV32IM extension.
void	plp_mat_add_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) matrix addition of a 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_add_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel matrix addition of a 16-bit integer matrices.
void	plp_mat_add_i16p_xpulpv2(void * args) Parallel matrix addition of 16-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_add_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) Glue code for matrix addition of a 8-bit integer matrices.
void	plp_mat_add_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) matrix addition of a 8-bit integer matrices for RV32IM extension.
void	plp_mat_add_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) matrix addition of a 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_add_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel matrix addition of a 8-bit integer matrices.
void	plp_mat_add_i8p_xpulpv2(void * args) Parallel matrix addition of 8-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_add_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, float *restrict pDst) Glue code for matrix addition of a 32-bit floating-point matrices.
void	plp_mat_add_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, float *restrict pDst) matrix addition of a 32-bit floating-point matrices for XPULPV2 extension.
void	plp_mat_add_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix addition of a 32-bit floating-point matrices.
void	plp_mat_add_f32p_xpulpv2(void * args) Parallel matrix addition of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_sub_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) Glue code for matrix subtraction of a 32-bit integer matrices.
void	plp_mat_sub_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) matrix subtraction of a 32-bit integer matrices for RV32IM extension.
void	plp_mat_sub_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) matrix subtraction of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_sub_i32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel matrix subtraction of a 32-bit integer matrices.
void	plp_mat_sub_i32p_xpulpv2(void * args) Parallel matrix subtraction of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_sub_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) Glue code for matrix subtraction of a 16-bit integer matrices.
void	plp_mat_sub_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) matrix subtraction of a 16-bit integer matrices for RV32IM extension.
void	plp_mat_sub_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) matrix subtraction of a 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_sub_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel matrix subtraction of a 16-bit integer matrices.
void	plp_mat_sub_i16p_xpulpv2(void * args) Parallel matrix subtraction of 16-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_sub_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) Glue code for matrix subtraction of a 8-bit integer matrices.
void	plp_mat_sub_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) matrix subtraction of a 8-bit integer matrices for RV32IM extension.
void	plp_mat_sub_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) matrix subtraction of a 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_sub_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel matrix subtraction of a 8-bit integer matrices.
void	plp_mat_sub_i8p_xpulpv2(void * args) Parallel matrix subtraction of 8-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_sub_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, float *restrict pDst) Glue code for matrix subtraction of a 32-bit floating-point matrices.
void	plp_mat_sub_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, float *restrict pDst) matrix subtraction of a 32-bit floating-point matrices for XPULPV2 extension.
void	plp_mat_sub_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix subtraction of a 32-bit floating-point matrices.
void	plp_mat_sub_f32p_xpulpv2(void * args) Parallel matrix subtraction of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_scale_i32(const int32_t restrict pSrc, uint32_t M, uint32_t N, int32_t scaleFactor, int32_t shift, int32_t restrict pDst) Glue code for matrix scale of a 32-bit integer matrices.
void	plp_mat_scale_i32s_rv32im(const int32_t restrict pSrc, uint32_t M, uint32_t N, int32_t scaleFactor, int32_t shift, int32_t restrict pDst) matrix scale of a 32-bit integer matrices for RV32IM extension.
void	plp_mat_scale_i32s_xpulpv2(const int32_t restrict pSrc, uint32_t M, uint32_t N, int32_t scaleFactor, int32_t shift, int32_t restrict pDst) matrix scale of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_scale_i32_parallel(const int32_t restrict pSrc, uint32_t M, uint32_t N, int32_t scaleFactor, int32_t shift, uint32_t nPE, int32_t restrict pDst) Glue code for parallel matrix scale of a 32-bit integer matrices.
void	plp_mat_scale_i32p_xpulpv2(void * args) Parallel matrix scale of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_scale_i16(const int16_t restrict pSrc, uint32_t M, uint32_t N, int16_t scaleFactor, int32_t shift, int16_t restrict pDst) Glue code for matrix scale of a 16-bit integer matrices.
void	plp_mat_scale_i16s_rv32im(const int16_t restrict pSrc, uint32_t M, uint32_t N, int16_t scaleFactor, int32_t shift, int16_t restrict pDst) matrix scale of a 16-bit integer matrices for RV32IM extension.
void	plp_mat_scale_i16s_xpulpv2(const int16_t restrict pSrc, uint32_t M, uint32_t N, int16_t scaleFactor, int32_t shift, int16_t restrict pDst) matrix scale of a 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_scale_i16_parallel(const int16_t restrict pSrc, uint32_t M, uint32_t N, int16_t scaleFactor, int32_t shift, uint32_t nPE, int16_t restrict pDst) Glue code for parallel matrix scale of a 16-bit integer matrices.
void	plp_mat_scale_i16p_xpulpv2(void * args) Parallel matrix scale of 16-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_scale_i8(const int8_t restrict pSrc, uint32_t M, uint32_t N, int8_t scaleFactor, int32_t shift, int8_t restrict pDst) Glue code for matrix scale of a 8-bit integer matrices.
void	plp_mat_scale_i8s_rv32im(const int8_t restrict pSrc, uint32_t M, uint32_t N, int8_t scaleFactor, int32_t shift, int8_t restrict pDst) matrix scale of a 8-bit integer matrices for RV32IM extension.
void	plp_mat_scale_i8s_xpulpv2(const int8_t restrict pSrc, uint32_t M, uint32_t N, int8_t scaleFactor, int32_t shift, int8_t restrict pDst) matrix scale of a 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_scale_i8_parallel(const int8_t restrict pSrc, uint32_t M, uint32_t N, int8_t scaleFactor, int32_t shift, uint32_t nPE, int8_t restrict pDst) Glue code for parallel matrix scale of a 8-bit integer matrices.
void	plp_mat_scale_i8p_xpulpv2(void * args) Parallel matrix scale of 8-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_scale_f32(const float restrict pSrc, uint32_t M, uint32_t N, float scaleFactor, float restrict pDst) Glue code for matrix scale of a 32-bit floating-point matrices.
void	plp_mat_scale_f32s_xpulpv2(const float restrict pSrc, uint32_t M, uint32_t N, float scaleFactor, float restrict pDst) matrix scale of a 32-bit floating-point matrices for XPULPV2 extension.
void	plp_mat_scale_f32_parallel(const float restrict pSrc, uint32_t M, uint32_t N, float scaleFactor, uint32_t nPE, float restrict pDst) Glue code for parallel matrix scale of a 32-bit floating-point matrices.
void	plp_mat_scale_f32p_xpulpv2(void * args) Parallel matrix scale of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_trans_i32(const int32_t restrict pSrc, uint32_t M, uint32_t N, int32_t restrict pDst) Glue code for matrix transpose of a 32-bit integer matrices.
void	plp_mat_trans_i32s_rv32im(const int32_t restrict pSrc, uint32_t M, uint32_t N, int32_t restrict pDst) matrix transpose of a 32-bit integer matrices for RV32IM extension.
void	plp_mat_trans_i32s_xpulpv2(const int32_t restrict pSrc, uint32_t M, uint32_t N, int32_t restrict pDst) matrix transpose of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_trans_i32_parallel(const int32_t restrict pSrc, uint32_t M, uint32_t N, uint32_t nPE, int32_t restrict pDst) Glue code for parallel matrix transpose of a 32-bit integer matrices.
void	plp_mat_trans_i32p_xpulpv2(void * args) Parallel matrix transpose of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_trans_i16(const int16_t restrict pSrc, uint32_t M, uint32_t N, int16_t restrict pDst) Glue code for matrix transpose of a 16-bit integer matrices.
void	plp_mat_trans_i16s_rv32im(const int16_t restrict pSrc, uint32_t M, uint32_t N, int16_t restrict pDst) matrix transpose of a 16-bit integer matrices for RV32IM extension.
void	plp_mat_trans_i16s_xpulpv2(const int16_t restrict pSrc, uint32_t M, uint32_t N, int16_t restrict pDst) matrix transpose of a 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_trans_i16_parallel(const int16_t restrict pSrc, uint32_t M, uint32_t N, uint32_t nPE, int16_t restrict pDst) Glue code for parallel matrix transpose of a 16-bit integer matrices.
void	plp_mat_trans_i16p_xpulpv2(void * args) Parallel matrix transpose of 16-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_trans_i8(const int8_t restrict pSrc, uint32_t M, uint32_t N, int8_t restrict pDst) Glue code for matrix transpose of a 8-bit integer matrices.
void	plp_mat_trans_i8s_rv32im(const int8_t restrict pSrc, uint32_t M, uint32_t N, int8_t restrict pDst) matrix transpose of a 8-bit integer matrices for RV32IM extension.
void	plp_mat_trans_i8s_xpulpv2(const int8_t restrict pSrc, uint32_t M, uint32_t N, int8_t restrict pDst) matrix transpose of a 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_trans_i8_parallel(const int8_t restrict pSrc, uint32_t M, uint32_t N, uint32_t nPE, int8_t restrict pDst) Glue code for parallel matrix transpose of a 8-bit integer matrices.
void	plp_mat_trans_i8p_xpulpv2(void * args) Parallel matrix transpose of 8-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_trans_f32(const float *restrict pSrc, uint32_t M, uint32_t N, float *restrict pDst) Glue code for matrix transpose of 32-bit floating-point matrices.
void	plp_mat_trans_f32_parallel(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix transpose of 32-bit floating-point matrices.
int	plp_mat_inv_f32(float *restrict pSrc, float *restrict pDst, uint32_t N) Glue code for matrix inverse of a 32-bit floating-point matrix.
int	plp_mat_inv_f32s_xpulpv2(float *restrict pSrc, float *restrict pDst, uint32_t N) Matrix inverse of a 32-bit floating-point matrix for XPULPV2 extension.
int	plp_mat_inv_f32_parallel(float *restrict pSrc, float *restrict pDst, uint32_t N, uint32_t nPE) Glue code for parallel matrix inverse of a 32-bit floating-point matrix.
int	plp_mat_inv_f32p_xpulpv2(void * args) Parallel matrix inverse of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_fill_I_i32(uint32_t N, int32_t *restrict pDst) Glue code for creating a 32-bit integer identity matrix.
void	plp_mat_fill_I_i32s_rv32im(uint32_t N, int32_t *restrict pDst) Create a 32-bit integer identity matrix on RV32IM.
void	plp_mat_fill_I_i32s_xpulpv2(uint32_t N, int32_t *restrict pDst) Create a 32-bit integer identity matrix on XpulpV2.
void	plp_mat_fill_I_i32_parallel(uint32_t N, uint32_t nPE, int32_t *restrict pDst) Glue code for creating a 32-bit integer identity matrix in parallel.
void	plp_mat_fill_I_i32p_xpulpv2(void * args) Create a 32-bit integer identity matrix in parallel on XpulpV2.
void	plp_mat_fill_I_i16(uint32_t N, int16_t *restrict pDst) Glue code for creating a 16-bit integer identity matrix.
void	plp_mat_fill_I_i16s_rv32im(uint32_t N, int16_t *restrict pDst) Create a 16-bit integer identity matrix on RV32IM.
void	plp_mat_fill_I_i16s_xpulpv2(uint32_t N, int16_t *restrict pDst) Create a 16-bit integer identity matrix on XpulpV2.
void	plp_mat_fill_I_i16_parallel(uint32_t N, uint32_t nPE, int16_t *restrict pDst) Glue code for creating a 16-bit integer identity matrix in parallel.
void	plp_mat_fill_I_i16p_xpulpv2(void * args) Create a 16-bit integer identity matrix in parallel on XpulpV2.
void	plp_mat_fill_I_i8(uint32_t N, int8_t *restrict pDst) Glue code for creating a 8-bit integer identity matrix.
void	plp_mat_fill_I_i8s_rv32im(uint32_t N, int8_t *restrict pDst) Create a 8-bit integer identity matrix on RV32IM.
void	plp_mat_fill_I_i8s_xpulpv2(uint32_t N, int8_t *restrict pDst) Create a 8-bit integer identity matrix on XpulpV2.
void	plp_mat_fill_I_i8_parallel(uint32_t N, uint32_t nPE, int8_t *restrict pDst) Glue code for creating a 8-bit integer identity matrix in parallel.
void	plp_mat_fill_I_i8p_xpulpv2(void * args) Create a 8-bit integer identity matrix in parallel on XpulpV2.
void	plp_mat_fill_I_f32(uint32_t N, float *restrict pDst) Glue code for creating a 32-bit float identity matrix.
void	plp_mat_fill_I_f32s_xpulpv2(uint32_t N, float *restrict pDst) Create a 32-bit float identity matrix on XpulpV2.
void	plp_mat_fill_I_f32_parallel(uint32_t N, uint32_t nPE, float *restrict pDst) Glue code for creating a 32-bit float identity matrix in parallel.
void	plp_mat_fill_I_f32p_xpulpv2(void * args) Create a 32-bit float identity matrix in parallel on XpulpV2.
void	plp_mat_fill_I_q32(uint32_t N, int32_t fracBits, int32_t *restrict pDst) Glue code for creating a 32-bit fix-point identity matrix.
void	plp_mat_fill_I_q32s_rv32im(uint32_t N, int32_t fracBits, int32_t *restrict pDst) Create a 32-bit fix-point identity matrix on RV32IM.
void	plp_mat_fill_I_q32s_xpulpv2(uint32_t N, int32_t fracBits, int32_t *restrict pDst) Create a 32-bit fix-point identity matrix on XpulpV2.
void	plp_mat_fill_I_q32_parallel(uint32_t N, int32_t fracBits, uint32_t nPE, int32_t *restrict pDst) Glue code for creating a 32-bit fix-point identity matrix in parallel.
void	plp_mat_fill_I_q32p_xpulpv2(void * args) Create a 32-bit fix-point identity matrix in parallel on XpulpV2.
void	plp_mat_fill_I_q16(uint32_t N, int32_t fracBits, int16_t *restrict pDst) Glue code for creating a 16-bit fix-point identity matrix.
void	plp_mat_fill_I_q16s_rv32im(uint32_t N, int32_t fracBits, int16_t *restrict pDst) Create a 16-bit fix-point identity matrix on RV32IM.
void	plp_mat_fill_I_q16s_xpulpv2(uint32_t N, int32_t fracBits, int16_t *restrict pDst) Create a 16-bit fix-point identity matrix on XpulpV2.
void	plp_mat_fill_I_q16_parallel(uint32_t N, int32_t fracBits, uint32_t nPE, int16_t *restrict pDst) Glue code for creating a 16-bit fix-point identity matrix in parallel.
void	plp_mat_fill_I_q16p_xpulpv2(void * args) Create a 16-bit fix-point identity matrix in parallel on XpulpV2.
void	plp_mat_fill_I_q8(uint32_t N, int32_t fracBits, int8_t *restrict pDst) Glue code for creating a 8-bit fix-point identity matrix.
void	plp_mat_fill_I_q8s_rv32im(uint32_t N, int32_t fracBits, int8_t *restrict pDst) Create a 8-bit fix-point identity matrix on RV32IM.
void	plp_mat_fill_I_q8s_xpulpv2(uint32_t N, int32_t fracBits, int8_t *restrict pDst) Create a 8-bit fix-point identity matrix on XpulpV2.
void	plp_mat_fill_I_q8_parallel(uint32_t N, int32_t fracBits, uint32_t nPE, int8_t *restrict pDst) Glue code for creating a 8-bit fix-point identity matrix in parallel.
void	plp_mat_fill_I_q8p_xpulpv2(void * args) Create a 8-bit fix-point identity matrix in parallel on XpulpV2.
void	plp_mat_mult_stride_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix matrix multiplication of a 32-bit integer matrices.
void	plp_mat_mult_stride_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 32-bit integer matrices for RV32IM extension.
void	plp_mat_mult_stride_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_stride_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix matrix multiplication of a 16-bit integer matrices.
void	plp_mat_mult_stride_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 16-bit integer matrices for RV32IM extension.
void	plp_mat_mult_stride_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_stride_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix matrix multiplication of a 8-bit integer matrices.
void	plp_mat_mult_stride_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 8-bit integer matrices for RV32IM extension.
void	plp_mat_mult_stride_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_stride_i32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of a 32-bit integer matrices.
void	plp_mat_mult_stride_i32p_xpulpv2(void * args) Parallel strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_stride_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of a 16-bit integer matrices.
void	plp_mat_mult_stride_i16p_xpulpv2(void * args) Parallel matrix multiplication of 16-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_mult_stride_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of a 8-bit integer matrices.
void	plp_mat_mult_stride_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) Glue code for strided matrix matrix multiplication of a 32-bit floating-point matrices.
void	plp_mat_mult_stride_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) strided matrix matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.
void	plp_mat_mult_stride_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, float *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of a 32-bit floating-point matrices.
void	plp_mat_mult_stride_f32p_xpulpv2(void * args) Parallel matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_stride_i8p_xpulpv2(void * args) Parallel matrix multiplication of 8-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_mult_stride_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Glue code for strided matrix matrix multiplication of a 32-bit fix-point matrices.
void	plp_mat_mult_stride_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of a 32-bit fix-point matrices.
void	plp_mat_mult_stride_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_stride_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_stride_q32p_xpulpv2(void * args) Parallel matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_stride_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Glue code for strided matrix matrix multiplication of a 16-bit fix-point matrices.
void	plp_mat_mult_stride_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of a 16-bit fix-point matrices.
void	plp_mat_mult_stride_q16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_stride_q16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_stride_q16p_xpulpv2(void * args) Parallel matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_stride_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Glue code for strided matrix matrix multiplication of a 8-bit fix-point matrices.
void	plp_mat_mult_stride_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of a 8-bit fix-point matrices.
void	plp_mat_mult_stride_q8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_stride_q8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix matrix multiplication of a 8-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_stride_q8p_xpulpv2(void * args) Parallel matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_trans_stride_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of a 32-bit integer matrices.
void	plp_mat_mult_trans_stride_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of a 32-bit integer matrices for RV32IM extension.
void	plp_mat_mult_trans_stride_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_stride_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of a 16-bit integer matrices.
void	plp_mat_mult_trans_stride_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of a 16-bit integer matrices for RV32IM extension.
void	plp_mat_mult_trans_stride_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_stride_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of a 8-bit integer matrices.
void	plp_mat_mult_trans_stride_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of a 8-bit integer matrices for RV32IM extension.
void	plp_mat_mult_trans_stride_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_stride_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of 32-bit integer matrices.
void	plp_mat_mult_trans_stride_i32p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_stride_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of a 16-bit integer matrices.
void	plp_mat_mult_trans_stride_i16p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_stride_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of a 8-bit integer matrices.
void	plp_mat_mult_trans_stride_i8p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_mult_trans_stride_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of a 32-bit fix-point matrices.
void	plp_mat_mult_trans_stride_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of a 32-bit fix-point matrices.
void	plp_mat_mult_trans_stride_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_trans_stride_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_trans_stride_q32p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_trans_stride_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of a 16-bit fix-point matrices.
void	plp_mat_mult_trans_stride_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of a 16-bit fix-point matrices.
void	plp_mat_mult_trans_stride_q16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix transposed matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_trans_stride_q16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix transposed matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_trans_stride_q16p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_trans_stride_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of a 8-bit fix-point matrices.
void	plp_mat_mult_trans_stride_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of a 8-bit fix-point matrices.
void	plp_mat_mult_trans_stride_q8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix transposed matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.
void	plp_mat_mult_trans_stride_q8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix transposed matrix multiplication of a 8-bit fix-point matrices for XPULPV2 extension.
void	plp_mat_mult_trans_stride_q8p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_trans_stride_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of a 32-bit floating-point matrices.
void	plp_mat_mult_trans_stride_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) strided matrix transposed matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.
void	plp_mat_mult_trans_stride_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, float *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of a 32-bit floating-point matrices.
void	plp_mat_mult_trans_stride_f32p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_mult_cmplx_stride_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 32-bit integers.
void	plp_mat_mult_cmplx_stride_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 32-bit integers on RV32IM.
void	plp_mat_mult_cmplx_stride_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 32-bit integers on XpulpV2.
void	plp_mat_mult_cmplx_stride_i32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 32-bit integers.
void	plp_mat_mult_cmplx_stride_i32p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 32-bit integers on XpulpV2
void	plp_mat_mult_cmplx_stride_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 16-bit integers.
void	plp_mat_mult_cmplx_stride_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 16-bit integers on RV32IM.
void	plp_mat_mult_cmplx_stride_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 16-bit integers on XpulpV2.
void	plp_mat_mult_cmplx_stride_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 16-bit integers.
void	plp_mat_mult_cmplx_stride_i16p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 16-bit integers on XpulpV2
void	plp_mat_mult_cmplx_stride_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 8-bit integers.
void	plp_mat_mult_cmplx_stride_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 8-bit integers on RV32IM.
void	plp_mat_mult_cmplx_stride_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 8-bit integers on XpulpV2.
void	plp_mat_mult_cmplx_stride_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 8-bit integers.
void	plp_mat_mult_cmplx_stride_i8p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 8-bit integers on XpulpV2
void	plp_mat_mult_cmplx_stride_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 32-bit floats.
void	plp_mat_mult_cmplx_stride_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) Strided matrix matrix multiplication for complex 32-bit floats on XpulpV2.
void	plp_mat_mult_cmplx_stride_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, float *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 32-bit floats.
void	plp_mat_mult_cmplx_stride_f32p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 32-bit floats on XpulpV2
void	plp_mat_mult_cmplx_stride_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 32-bit fix-point.
void	plp_mat_mult_cmplx_stride_q32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 32-bit fix-point on RV32IM.
void	plp_mat_mult_cmplx_stride_q32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 32-bit fix-point on XpulpV2.
void	plp_mat_mult_cmplx_stride_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 32-bit fix-point.
void	plp_mat_mult_cmplx_stride_q32p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 32-bit fix-point on XpulpV2
void	plp_mat_mult_cmplx_stride_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 16-bit fix-point.
void	plp_mat_mult_cmplx_stride_q16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Strided matrix matrix multiplication for complex 16-bit fix-point on RV32IM.
void	plp_mat_mult_cmplx_stride_q16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Strided matrix matrix multiplication for complex 16-bit fix-point on XpulpV2.
void	plp_mat_mult_cmplx_stride_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 16-bit fix-point.
void	plp_mat_mult_cmplx_stride_q16p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 16-bit fix-point on XpulpV2
void	plp_mat_mult_cmplx_stride_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 8-bit fix-point.
void	plp_mat_mult_cmplx_stride_q8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Strided matrix matrix multiplication for complex 8-bit fix-point on RV32IM.
void	plp_mat_mult_cmplx_stride_q8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Strided matrix matrix multiplication for complex 8-bit fix-point on XpulpV2.
void	plp_mat_mult_cmplx_stride_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 8-bit fix-point.
void	plp_mat_mult_cmplx_stride_q8p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 8-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 32-bit integers.
void	plp_mat_mult_trans_cmplx_stride_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 32-bit integers on RV32IM
void	plp_mat_mult_trans_cmplx_stride_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_i32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit integers.
void	plp_mat_mult_trans_cmplx_stride_i32p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 16-bit integers.
void	plp_mat_mult_trans_cmplx_stride_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 16-bit integers on RV32IM
void	plp_mat_mult_trans_cmplx_stride_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 16-bit integers.
void	plp_mat_mult_trans_cmplx_stride_i16p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 8-bit integers.
void	plp_mat_mult_trans_cmplx_stride_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 8-bit integers on RV32IM
void	plp_mat_mult_trans_cmplx_stride_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 8-bit integers.
void	plp_mat_mult_trans_cmplx_stride_i8p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 32-bit floats.
void	plp_mat_mult_trans_cmplx_stride_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) strided matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, float *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit floats.
void	plp_mat_mult_trans_cmplx_stride_f32p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 32-bit fix-point.
void	plp_mat_mult_trans_cmplx_stride_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM
void	plp_mat_mult_trans_cmplx_stride_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit fix-point.
void	plp_mat_mult_trans_cmplx_stride_q32p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_q16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 16-bit fix-point.
void	plp_mat_mult_trans_cmplx_stride_q16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM
void	plp_mat_mult_trans_cmplx_stride_q16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_q16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 16-bit fix-point.
void	plp_mat_mult_trans_cmplx_stride_q16p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_q8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 8-bit fix-point.
void	plp_mat_mult_trans_cmplx_stride_q8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM
void	plp_mat_mult_trans_cmplx_stride_q8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2
void	plp_mat_mult_trans_cmplx_stride_q8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 8-bit fix-point.
void	plp_mat_mult_trans_cmplx_stride_q8p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2
void	plp_mat_add_stride_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) Glue code for matrix addition of 32-bit integer matrices.
void	plp_mat_add_stride_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) matrix addition of 32-bit integer matrices for RV32IM extension.
void	plp_mat_add_stride_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) matrix addition of 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_add_stride_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel matrix addition of 32-bit integer matrices.
void	plp_mat_add_stride_i32p_xpulpv2(void * args) Parallel matrix addition of 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_add_stride_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) Glue code for matrix addition of 16-bit integer matrices.
void	plp_mat_add_stride_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) matrix addition of 16-bit integer matrices for RV32IM extension.
void	plp_mat_add_stride_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) matrix addition of 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_add_stride_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel matrix addition of 16-bit integer matrices.
void	plp_mat_add_stride_i16p_xpulpv2(void * args) Parallel matrix addition of 16-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_add_stride_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) Glue code for matrix addition of 8-bit integer matrices.
void	plp_mat_add_stride_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) matrix addition of 8-bit integer matrices for RV32IM extension.
void	plp_mat_add_stride_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) matrix addition of 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_add_stride_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel matrix addition of 8-bit integer matrices.
void	plp_mat_add_stride_i8p_xpulpv2(void * args) Parallel matrix addition of 8-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_add_stride_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, float *restrict pDst) Glue code for matrix addition of 32-bit floating-point matrices.
void	plp_mat_add_stride_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, float *restrict pDst) matrix addition of 32-bit floating-point matrices for XPULPV2 extension.
void	plp_mat_add_stride_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix addition of 32-bit floating-point matrices.
void	plp_mat_add_stride_f32p_xpulpv2(void * args) Parallel matrix addition of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_sub_stride_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) Glue code for matrix subtraction of 32-bit integer matrices.
void	plp_mat_sub_stride_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) matrix subtraction of 32-bit integer matrices for RV32IM extension.
void	plp_mat_sub_stride_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) matrix subtraction of 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_sub_stride_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel matrix subtraction of 32-bit integer matrices.
void	plp_mat_sub_stride_i32p_xpulpv2(void * args) Parallel matrix subtraction of 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_sub_stride_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) Glue code for matrix subtraction of 16-bit integer matrices.
void	plp_mat_sub_stride_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) matrix subtraction of 16-bit integer matrices for RV32IM extension.
void	plp_mat_sub_stride_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) matrix subtraction of 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_sub_stride_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel matrix subtraction of 16-bit integer matrices.
void	plp_mat_sub_stride_i16p_xpulpv2(void * args) Parallel matrix subtraction of 16-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_sub_stride_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) Glue code for matrix subtraction of 8-bit integer matrices.
void	plp_mat_sub_stride_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) matrix subtraction of 8-bit integer matrices for RV32IM extension.
void	plp_mat_sub_stride_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) matrix subtraction of 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_sub_stride_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel matrix subtraction of 8-bit integer matrices.
void	plp_mat_sub_stride_i8p_xpulpv2(void * args) Parallel matrix subtraction of 8-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_sub_stride_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, float *restrict pDst) Glue code for matrix subtraction of 32-bit floating-point matrices.
void	plp_mat_sub_stride_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, float *restrict pDst) matrix subtraction of 32-bit floating-point matrices for XPULPV2 extension.
void	plp_mat_sub_stride_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix subtraction of 32-bit floating-point matrices.
void	plp_mat_sub_stride_f32p_xpulpv2(void * args) Parallel matrix subtraction of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_scale_stride_i32(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst) Glue code for strided matrix scale of 32-bit integer matrices.
void	plp_mat_scale_stride_i32s_rv32im(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst) strided matrix scale of 32-bit integer matrices for RV32IM extension.
void	plp_mat_scale_stride_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst) strided matrix scale of 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_scale_stride_i32_parallel(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t scaleFactor, int32_t shift, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel strided matrix scale of 32-bit integer matrices.
void	plp_mat_scale_stride_i32p_xpulpv2(void * args) Parallel strided matrix scale of 32-bit integer matrices for XPULPV2 extension.
void	plp_mat_scale_stride_i16(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst) Glue code for strided matrix scale of 16-bit integer matrices.
void	plp_mat_scale_stride_i16s_rv32im(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst) strided matrix scale of 16-bit integer matrices for RV32IM extension.
void	plp_mat_scale_stride_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst) strided matrix scale of 16-bit integer matrices for XPULPV2 extension.
void	plp_mat_scale_stride_i16_parallel(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t scaleFactor, int32_t shift, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel strided matrix scale of 16-bit integer matrices.
void	plp_mat_scale_stride_i16p_xpulpv2(void * args) Parallel strided matrix scale of 16-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_scale_stride_i8(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst) Glue code for strided matrix scale of 8-bit integer matrices.
void	plp_mat_scale_stride_i8s_rv32im(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst) strided matrix scale of 8-bit integer matrices for RV32IM extension.
void	plp_mat_scale_stride_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst) strided matrix scale of 8-bit integer matrices for XPULPV2 extension.
void	plp_mat_scale_stride_i8_parallel(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t scaleFactor, int32_t shift, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel strided matrix scale of 8-bit integer matrices.
void	plp_mat_scale_stride_i8p_xpulpv2(void * args) Parallel strided matrix scale of 8-bit integer matrices kernel for XPULPV2 extension.
void	plp_mat_scale_stride_f32(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, float scaleFactor, float *restrict pDst) Glue code for strided matrix scale of 32-bit floating-point matrices.
void	plp_mat_scale_stride_f32s_xpulpv2(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, float scaleFactor, float *restrict pDst) strided matrix scale of 32-bit floating-point matrices for XPULPV2 extension.
void	plp_mat_scale_stride_f32_parallel(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, float scaleFactor, uint32_t nPE, float *restrict pDst) Glue code for parallel strided matrix scale of 32-bit floating-point matrices.
void	plp_mat_scale_stride_f32p_xpulpv2(void * args) Parallel strided matrix scale of 32-bit floating-point matrices kernel for XPULPV2 extension.
void	plp_mat_fill_I_stride_i32(uint32_t N, uint32_t stride, int32_t *restrict pDst) Glue code for creating a strided 32-bit integers identity matrix.
void	plp_mat_fill_I_stride_i32s_rv32im(uint32_t N, uint32_t stride, int32_t *restrict pDst) Create a strided 32-bit integers identity matrix on RV32IM.
void	plp_mat_fill_I_stride_i32s_xpulpv2(uint32_t N, uint32_t stride, int32_t *restrict pDst) Create a strided 32-bit integers identity matrix on XpulpV2.
void	plp_mat_fill_I_stride_i32_parallel(uint32_t N, uint32_t stride, uint32_t nPE, int32_t *restrict pDst) Glue code for creating a strided 32-bit integers identity matrix in parallel.
void	plp_mat_fill_I_stride_i32p_xpulpv2(void * args) Create a strided 32-bit integers identity matrix on XpulpV2 in parallel.
void	plp_mat_fill_I_stride_i16(uint32_t N, uint32_t stride, int16_t *restrict pDst) Glue code for creating a strided 16-bit integers identity matrix.
void	plp_mat_fill_I_stride_i16s_rv32im(uint32_t N, uint32_t stride, int16_t *restrict pDst) Create a strided 16-bit integers identity matrix on RV32IM.
void	plp_mat_fill_I_stride_i16s_xpulpv2(uint32_t N, uint32_t stride, int16_t *restrict pDst) Create a strided 16-bit integers identity matrix on XpulpV2.
void	plp_mat_fill_I_stride_i16_parallel(uint32_t N, uint32_t stride, uint32_t nPE, int16_t *restrict pDst) Glue code for creating a strided 16-bit integers identity matrix in parallel.
void	plp_mat_fill_I_stride_i16p_xpulpv2(void * args) Create a strided 16-bit integers identity matrix on XpulpV2 in parallel.
void	plp_mat_fill_I_stride_i8(uint32_t N, uint32_t stride, int8_t *restrict pDst) Glue code for creating a strided 8-bit integers identity matrix.
void	plp_mat_fill_I_stride_i8s_rv32im(uint32_t N, uint32_t stride, int8_t *restrict pDst) Create a strided 8-bit integers identity matrix on RV32IM.
void	plp_mat_fill_I_stride_i8s_xpulpv2(uint32_t N, uint32_t stride, int8_t *restrict pDst) Create a strided 8-bit integers identity matrix on XpulpV2.
void	plp_mat_fill_I_stride_i8_parallel(uint32_t N, uint32_t stride, uint32_t nPE, int8_t *restrict pDst) Glue code for creating a strided 8-bit integers identity matrix in parallel.
void	plp_mat_fill_I_stride_i8p_xpulpv2(void * args) Create a strided 8-bit integers identity matrix on XpulpV2 in parallel.
void	plp_mat_fill_I_stride_f32(uint32_t N, uint32_t stride, float *restrict pDst) Glue code for creating a strided 32-bit floats identity matrix.
void	plp_mat_fill_I_stride_f32s_xpulpv2(uint32_t N, uint32_t stride, float *restrict pDst) Create a strided 32-bit floats identity matrix on XpulpV2.
void	plp_mat_fill_I_stride_f32_parallel(uint32_t N, uint32_t stride, uint32_t nPE, float *restrict pDst) Glue code for creating a strided 32-bit floats identity matrix in parallel.
void	plp_mat_fill_I_stride_f32p_xpulpv2(void * args) Create a strided 32-bit floats identity matrix on XpulpV2 in parallel.
void	plp_mat_fill_I_stride_q32(uint32_t N, uint32_t stride, int32_t fracBits, int32_t *restrict pDst) Glue code for creating a strided 32-bit fix-point identity matrix.
void	plp_mat_fill_I_stride_q32s_rv32im(uint32_t N, uint32_t stride, int32_t fracBits, int32_t *restrict pDst) Create a strided 32-bit fix-point identity matrix on RV32IM.
void	plp_mat_fill_I_stride_q32s_xpulpv2(uint32_t N, uint32_t stride, int32_t fracBits, int32_t *restrict pDst) Create a strided 32-bit fix-point identity matrix on XpulpV2.
void	plp_mat_fill_I_stride_q32_parallel(uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int32_t *restrict pDst) Glue code for creating a strided 32-bit fix-point identity matrix in parallel.
void	plp_mat_fill_I_stride_q32p_xpulpv2(void * args) Create a strided 32-bit fix-point identity matrix on XpulpV2 in parallel.
void	plp_mat_fill_I_stride_q16(uint32_t N, uint32_t stride, int32_t fracBits, int16_t *restrict pDst) Glue code for creating a strided 16-bit fix-point identity matrix.
void	plp_mat_fill_I_stride_q16s_rv32im(uint32_t N, uint32_t stride, int32_t fracBits, int16_t *restrict pDst) Create a strided 16-bit fix-point identity matrix on RV32IM.
void	plp_mat_fill_I_stride_q16s_xpulpv2(uint32_t N, uint32_t stride, int32_t fracBits, int16_t *restrict pDst) Create a strided 16-bit fix-point identity matrix on XpulpV2.
void	plp_mat_fill_I_stride_q16_parallel(uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int16_t *restrict pDst) Glue code for creating a strided 16-bit fix-point identity matrix in parallel.
void	plp_mat_fill_I_stride_q16p_xpulpv2(void * args) Create a strided 16-bit fix-point identity matrix on XpulpV2 in parallel.
void	plp_mat_fill_I_stride_q8(uint32_t N, uint32_t stride, int32_t fracBits, int8_t *restrict pDst) Glue code for creating a strided 8-bit fix-point identity matrix.
void	plp_mat_fill_I_stride_q8s_rv32im(uint32_t N, uint32_t stride, int32_t fracBits, int8_t *restrict pDst) Create a strided 8-bit fix-point identity matrix on RV32IM.
void	plp_mat_fill_I_stride_q8s_xpulpv2(uint32_t N, uint32_t stride, int32_t fracBits, int8_t *restrict pDst) Create a strided 8-bit fix-point identity matrix on XpulpV2.
void	plp_mat_fill_I_stride_q8_parallel(uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int8_t *restrict pDst) Glue code for creating a strided 8-bit fix-point identity matrix in parallel.
void	plp_mat_fill_I_stride_q8p_xpulpv2(void * args) Create a strided 8-bit fix-point identity matrix on XpulpV2 in parallel.
void	plp_mat_fill_stride_i32(uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *restrict pDst) Glue code for filling an MxN strided 32-bit integers matrix.
void	plp_mat_fill_stride_i32s_rv32im(uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *restrict pDst) Fill an MxN strided 32-bit integers matrix on RV32IM.
void	plp_mat_fill_stride_i32s_xpulpv2(uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *restrict pDst) Fill an MxN strided 32-bit integers matrix on XpulpV2.
void	plp_mat_fill_stride_i32_parallel(uint32_t M, uint32_t N, uint32_t stride, int32_t value, uint32_t nPE, int32_t *restrict pDst) Glue code for filling an MxN strided 32-bit integers matrix in parallel.
void	plp_mat_fill_stride_i32p_xpulpv2(void * args) Fill an MxN strided 32-bit integers matrix on XpulpV2 in parallel.
void	plp_mat_fill_stride_i16(uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *restrict pDst) Glue code for filling an MxN strided 16-bit integers matrix.
void	plp_mat_fill_stride_i16s_rv32im(uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *restrict pDst) Fill an MxN strided 16-bit integers matrix on RV32IM.
void	plp_mat_fill_stride_i16s_xpulpv2(uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *restrict pDst) Fill an MxN strided 16-bit integers matrix on XpulpV2.
void	plp_mat_fill_stride_i16_parallel(uint32_t M, uint32_t N, uint32_t stride, int16_t value, uint32_t nPE, int16_t *restrict pDst) Glue code for filling an MxN strided 16-bit integers matrix in parallel.
void	plp_mat_fill_stride_i16p_xpulpv2(void * args) Fill an MxN strided 16-bit integers matrix on XpulpV2 in parallel.
void	plp_mat_fill_stride_i8(uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *restrict pDst) Glue code for filling an MxN strided 8-bit integers matrix.
void	plp_mat_fill_stride_i8s_rv32im(uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *restrict pDst) Fill an MxN strided 8-bit integers matrix on RV32IM.
void	plp_mat_fill_stride_i8s_xpulpv2(uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *restrict pDst) Fill an MxN strided 8-bit integers matrix on XpulpV2.
void	plp_mat_fill_stride_i8_parallel(uint32_t M, uint32_t N, uint32_t stride, int8_t value, uint32_t nPE, int8_t *restrict pDst) Glue code for filling an MxN strided 8-bit integers matrix in parallel.
void	plp_mat_fill_stride_i8p_xpulpv2(void * args) Fill an MxN strided 8-bit integers matrix on XpulpV2 in parallel.
void	plp_mat_fill_stride_f32(uint32_t M, uint32_t N, uint32_t stride, float value, float *restrict pDst) Glue code for filling an MxN strided 32-bit floats matrix.
void	plp_mat_fill_stride_f32s_xpulpv2(uint32_t M, uint32_t N, uint32_t stride, float value, float *restrict pDst) Fill an MxN strided 32-bit floats matrix on XpulpV2.
void	plp_mat_fill_stride_f32_parallel(uint32_t M, uint32_t N, uint32_t stride, float value, uint32_t nPE, float *restrict pDst) Glue code for filling an MxN strided 32-bit floats matrix in parallel.
void	plp_mat_fill_stride_f32p_xpulpv2(void * args) Fill an MxN strided 32-bit floats matrix on XpulpV2 in parallel.
void	plp_mat_copy_stride_i32(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t *restrict pDst) Glue code to copy an MxN strided 32-bit integers matrix.
void	plp_mat_copy_stride_i32s_rv32im(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t *restrict pDst) Copy an MxN strided 32-bit integers matrix on RV32IM.
void	plp_mat_copy_stride_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t *restrict pDst) Copy an MxN strided 32-bit integers matrix on XpulpV2.
void	plp_mat_copy_stride_i32_parallel(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, uint32_t nPE, int32_t *restrict pDst) Glue code to copy an MxN strided 32-bit integers matrix in parallel.
void	plp_mat_copy_stride_i32p_xpulpv2(void * args) Copy an MxN strided 32-bit integers matrix on XpulpV2 in parallel.
void	plp_mat_copy_stride_i16(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t *restrict pDst) Glue code to copy an MxN strided 16-bit integers matrix.
void	plp_mat_copy_stride_i16s_rv32im(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t *restrict pDst) Copy an MxN strided 16-bit integers matrix on RV32IM.
void	plp_mat_copy_stride_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t *restrict pDst) Copy an MxN strided 16-bit integers matrix on XpulpV2.
void	plp_mat_copy_stride_i16_parallel(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, uint32_t nPE, int16_t *restrict pDst) Glue code to copy an MxN strided 16-bit integers matrix in parallel.
void	plp_mat_copy_stride_i16p_xpulpv2(void * args) Copy an MxN strided 16-bit integers matrix on XpulpV2 in parallel.
void	plp_mat_copy_stride_i8(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t *restrict pDst) Glue code to copy an MxN strided 8-bit integers matrix.
void	plp_mat_copy_stride_i8s_rv32im(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t *restrict pDst) Copy an MxN strided 8-bit integers matrix on RV32IM.
void	plp_mat_copy_stride_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t *restrict pDst) Copy an MxN strided 8-bit integers matrix on XpulpV2.
void	plp_mat_copy_stride_i8_parallel(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, uint32_t nPE, int8_t *restrict pDst) Glue code to copy an MxN strided 8-bit integers matrix in parallel.
void	plp_mat_copy_stride_i8p_xpulpv2(void * args) Copy an MxN strided 8-bit integers matrix on XpulpV2 in parallel.
void	plp_mat_copy_stride_f32(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, float *restrict pDst) Glue code to copy an MxN strided 32-bit floats matrix.
void	plp_mat_copy_stride_f32s_xpulpv2(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, float *restrict pDst) Copy an MxN strided 32-bit floats matrix on XpulpV2.
void	plp_mat_copy_stride_f32_parallel(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, uint32_t nPE, float *restrict pDst) Glue code to copy an MxN strided 32-bit floats matrix in parallel.
void	plp_mat_copy_stride_f32p_xpulpv2(void * args) Copy an MxN strided 32-bit floats matrix on XpulpV2 in parallel.
void	plp_cmplx_conj_f32(const float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t numSamples) Glue code for complex conjugate of 32-bit float vectors.
void	plp_cmplx_conj_f32_xpulpv2(const float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t numSamples) Floating-point complex conjugate.
void	plp_cmplx_conj_i32(const int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t numSamples) Glue code for complex conjugate of 32-bit integer vectors.
void	plp_cmplx_conj_i32_xpulpv2(const int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t numSamples) 32-bit integer complex conjugate.
void	plp_cmplx_conj_i32_rv32im(const int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t numSamples) 32-bit integer complex conjugate.
void	plp_cmplx_conj_i16(const int16_t *restrict pSrc, int16_t *restrict pDst, uint32_t numSamples) Glue code for complex conjugate of 16-bit integer vectors.
void	plp_cmplx_conj_i16_xpulpv2(const int16_t *restrict pSrc, int16_t *restrict pDst, uint32_t numSamples) 16-bit integer complex conjugate.
void	plp_cmplx_conj_i16_rv32im(const int16_t *restrict pSrc, int16_t *restrict pDst, uint32_t numSamples) 16-bit integer complex conjugate.
void	plp_cmplx_conj_i8(const int8_t *restrict pSrc, int8_t *restrict pDst, uint32_t numSamples) Glue code for complex conjugate of 8-bit integer vectors.
void	plp_cmplx_conj_i8_xpulpv2(const int8_t *restrict pSrc, int8_t *restrict pDst, uint32_t numSamples) 8-bit integer complex conjugate.
void	plp_cmplx_conj_i8_rv32im(const int8_t *restrict pSrc, int8_t *restrict pDst, uint32_t numSamples) 8-bit integer complex conjugate.
void	plp_cmplx_dot_prod_f32(const float32_t * pSrcA, const float32_t * pSrcB, uint32_t numSamples, float32_t * realResult, float32_t * imagResult) Glue code for complex dot product of 32-bit float vectors.
void	plp_cmplx_dot_prod_f32_xpulpv2(const float32_t * pSrcA, const float32_t * pSrcB, uint32_t numSamples, float32_t * realResult, float32_t * imagResult) Floating-point complex dot product.
void	plp_cmplx_dot_prod_i32(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, int32_t * realResult, int32_t * imagResult) Glue code for complex dot product of 32-bit integer vectors.
void	plp_cmplx_dot_prod_i32_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, int32_t * realResult, int32_t * imagResult) 32-bit integer complex dot product.
void	plp_cmplx_dot_prod_i32_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, int32_t * realResult, int32_t * imagResult) 32-bit integer complex dot product.
void	plp_cmplx_dot_prod_i16(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, int16_t * realResult, int16_t * imagResult) Glue code for complex dot product of 16-bit integer vectors.
void	plp_cmplx_dot_prod_i16_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, int16_t * realResult, int16_t * imagResult) 16-bit integer complex dot product.
void	plp_cmplx_dot_prod_i16_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, int16_t * realResult, int16_t * imagResult) 16-bit integer complex dot product.
void	plp_cmplx_dot_prod_i8(const int8_t * pSrcA, const int8_t * pSrcB, uint32_t numSamples, int8_t * realResult, int8_t * imagResult) Glue code for complex dot product of 8-bit integer vectors.
void	plp_cmplx_dot_prod_i8_xpulpv2(const int8_t * pSrcA, const int8_t * pSrcB, uint32_t numSamples, int8_t * realResult, int8_t * imagResult) 8-bit integer complex dot product.
void	plp_cmplx_dot_prod_i8_rv32im(const int8_t * pSrcA, const int8_t * pSrcB, uint32_t numSamples, int8_t * realResult, int8_t * imagResult) 8-bit integer complex dot product.
void	plp_cmplx_dot_prod_q32(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int32_t * realResult, int32_t * imagResult) Glue code for complex dot product of 32-bit fixed-point vectors.
void	plp_cmplx_dot_prod_q32_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int32_t * realResult, int32_t * imagResult) 32-bit fixed-point complex dot product.
void	plp_cmplx_dot_prod_q32_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int32_t * realResult, int32_t * imagResult) 32-bit fixed-point complex dot product.
void	plp_cmplx_dot_prod_q16(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int16_t * realResult, int16_t * imagResult) Glue code for complex dot product of 16-bit fixed-point vectors.
void	plp_cmplx_dot_prod_q16_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int16_t * realResult, int16_t * imagResult) 16-bit fixed-point complex dot product.
void	plp_cmplx_dot_prod_q16_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int16_t * realResult, int16_t * imagResult) 16-bit fixed-point complex dot product.
void	plp_cmplx_mult_real_f32(const float32_t *__restrict__ pSrcCmplx, const float32_t *__restrict__ pSrcReal, float32_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex multiplied with real of 32-bit float vectors.
void	plp_cmplx_mult_real_f32_xpulpv2(const float32_t *__restrict__ pSrcCmplx, const float32_t *__restrict__ pSrcReal, float32_t *__restrict__ pDst, uint32_t numSamples) Floating-point complex multiplied with real.
void	plp_cmplx_mult_real_i32(const int32_t *__restrict__ pSrcCmplx, const int32_t *__restrict__ pSrcReal, int32_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex multiplied with real of 32-bit integer vectors.
void	plp_cmplx_mult_real_i32_xpulpv2(const int32_t *__restrict__ pSrcCmplx, const int32_t *__restrict__ pSrcReal, int32_t *__restrict__ pDst, uint32_t numSamples) 32-bit integer complex multiplied with real.
void	plp_cmplx_mult_real_i32_rv32im(const int32_t *__restrict__ pSrcCmplx, const int32_t *__restrict__ pSrcReal, int32_t *__restrict__ pDst, uint32_t numSamples) 32-bit integer complex multiplied with real.
void	plp_cmplx_mult_real_i16(const int16_t *__restrict__ pSrcCmplx, const int16_t *__restrict__ pSrcReal, int16_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex multiplied with real of 16-bit integer vectors.
void	plp_cmplx_mult_real_i16_xpulpv2(const int16_t *__restrict__ pSrcCmplx, const int16_t *__restrict__ pSrcReal, int16_t *__restrict__ pDst, uint32_t numSamples) 16-bit integer complex multiplied with real.
void	plp_cmplx_mult_real_i16_rv32im(const int16_t *__restrict__ pSrcCmplx, const int16_t *__restrict__ pSrcReal, int16_t *__restrict__ pDst, uint32_t numSamples) 16-bit integer complex multiplied with real.
void	plp_cmplx_mult_real_i8(const int8_t *__restrict__ pSrcCmplx, const int8_t *__restrict__ pSrcReal, int8_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex multiplied with real of 8-bit integer vectors.
void	plp_cmplx_mult_real_i8_xpulpv2(const int8_t *__restrict__ pSrcCmplx, const int8_t *__restrict__ pSrcReal, int8_t *__restrict__ pDst, uint32_t numSamples) 8-bit integer complex multiplied with real.
void	plp_cmplx_mult_real_i8_rv32im(const int8_t *__restrict__ pSrcCmplx, const int8_t *__restrict__ pSrcReal, int8_t *__restrict__ pDst, uint32_t numSamples) 8-bit integer complex multiplied with real.
void	plp_cmplx_mult_real_q32(const int32_t *__restrict__ pSrcCmplx, const int32_t *__restrict__ pSrcReal, int32_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied with real of 32-bit fixed-point vectors.
void	plp_cmplx_mult_real_q32_xpulpv2(const int32_t *__restrict__ pSrcCmplx, const int32_t *__restrict__ pSrcReal, int32_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex multiplied with real.
void	plp_cmplx_mult_real_q32_rv32im(const int32_t *__restrict__ pSrcCmplx, const int32_t *__restrict__ pSrcReal, int32_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex multiplied with real.
void	plp_cmplx_mult_real_q16(const int16_t *__restrict__ pSrcCmplx, const int16_t *__restrict__ pSrcReal, int16_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied with real of 16-bit fixed-point vectors.
void	plp_cmplx_mult_real_q16_xpulpv2(const int16_t *__restrict__ pSrcCmplx, const int16_t *__restrict__ pSrcReal, int16_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex multiplied with real.
void	plp_cmplx_mult_real_q16_rv32im(const int16_t *__restrict__ pSrcCmplx, const int16_t *__restrict__ pSrcReal, int16_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex multiplied with real.
void	plp_cmplx_mult_real_q8(const int8_t *__restrict__ pSrcCmplx, const int8_t *__restrict__ pSrcReal, int8_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied with real of 8-bit fixed-point vectors.
void	plp_cmplx_mult_real_q8_xpulpv2(const int8_t *__restrict__ pSrcCmplx, const int8_t *__restrict__ pSrcReal, int8_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex multiplied with real.
void	plp_cmplx_mult_real_q8_rv32im(const int8_t *__restrict__ pSrcCmplx, const int8_t *__restrict__ pSrcReal, int8_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex multiplied with real.
void	plp_cmplx_mag_squared_f32(const float32_t *__restrict__ pSrc, float32_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex squared magnitude of 32-bit float vectors.
void	plp_cmplx_mag_squared_f32_xpulpv2(const float32_t *__restrict__ pSrc, float32_t *__restrict__ pDst, uint32_t numSamples) Floating-point complex squared magnitude.
void	plp_cmplx_mag_squared_i16(const int16_t *__restrict__ pSrc, int16_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex squared magnitude of 16-bit integer vectors.
void	plp_cmplx_mag_squared_i16_rv32im(const int16_t *__restrict__ pSrc, int16_t *__restrict__ pDst, uint32_t numSamples) 16-bit integer complex squared magnitude.
void	plp_cmplx_mag_squared_i16_xpulpv2(const int16_t *__restrict__ pSrc, int16_t *__restrict__ pDst, uint32_t numSamples) 16-bit integer complex squared magnitude.
void	plp_cmplx_mag_squared_i32(const int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex squared magnitude of 32-bit integer vectors.
void	plp_cmplx_mag_squared_i32_rv32im(const int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t numSamples) 32-bit integer complex squared magnitude.
void	plp_cmplx_mag_squared_i32_xpulpv2(const int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t numSamples) 32-bit integer complex squared magnitude.
void	plp_cmplx_mag_squared_i8_xpulpv2(const int8_t *__restrict__ pSrc, int8_t *__restrict__ pDst, uint32_t numSamples) 8-bit integer complex squared magnitude.
void	plp_cmplx_mag_squared_i8(const int8_t *__restrict__ pSrc, int8_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex squared magnitude of 8-bit integer vectors.
void	plp_cmplx_mag_squared_i8_rv32im(const int8_t *__restrict__ pSrc, int8_t *__restrict__ pDst, uint32_t numSamples) 8-bit integer complex squared magnitude.
void	plp_cmplx_mag_squared_q32(const int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex squared magnitude of 32-bit fixed-point vectors.
void	plp_cmplx_mag_squared_q32_rv32im(const int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex squared magnitude.
void	plp_cmplx_mag_squared_q32_xpulpv2(const int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex squared magnitude.
void	plp_cmplx_mag_squared_q16(const int16_t *__restrict__ pSrc, int16_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex squared magnitude of 16-bit fixed-point vectors.
void	plp_cmplx_mag_squared_q16_rv32im(const int16_t *__restrict__ pSrc, int16_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex squared magnitude.
void	plp_cmplx_mag_squared_q16_xpulpv2(const int16_t *__restrict__ pSrc, int16_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex squared magnitude.
void	plp_cmplx_mag_squared_q8(const int8_t *__restrict__ pSrc, int8_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex squared magnitude of 8-bit fixed-point vectors.
void	plp_cmplx_mag_squared_q8_rv32im(const int8_t *__restrict__ pSrc, int8_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex squared magnitude.
void	plp_cmplx_mag_squared_q8_xpulpv2(const int8_t *__restrict__ pSrc, int8_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex squared magnitude.
void	plp_cmplx_mult_cmplx_f32(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, float32_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex multiplied by complex of 32-bit float vectors.
void	plp_cmplx_mult_cmplx_f32_xpulpv2(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, float32_t *__restrict__ pDst, uint32_t numSamples) Floating-point complex multiplied by complex.
void	plp_cmplx_mult_cmplx_i32(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, int32_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex multiplied by complex of 32-bit integer vectors.
void	plp_cmplx_mult_cmplx_i32_xpulpv2(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, int32_t *__restrict__ pDst, uint32_t numSamples) 32-bit integer complex multiplied by complex.
void	plp_cmplx_mult_cmplx_i32_rv32im(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, int32_t *__restrict__ pDst, uint32_t numSamples) 32-bit integer complex multiplied by complex.
void	plp_cmplx_mult_cmplx_i16(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, int16_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex multiplied by complex of 16-bit integer vectors.
void	plp_cmplx_mult_cmplx_i16_xpulpv2(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, int16_t *__restrict__ pDst, uint32_t numSamples) 16-bit integer complex multiplied by complex.
void	plp_cmplx_mult_cmplx_i16_rv32im(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, int16_t *__restrict__ pDst, uint32_t numSamples) 16-bit integer complex multiplied by complex.
void	plp_cmplx_mult_cmplx_i8(const int8_t *__restrict__ pSrcA, const int8_t *__restrict__ pSrcB, int8_t *__restrict__ pDst, uint32_t numSamples) Glue code for complex multiplied by complex of 8-bit integer vectors.
void	plp_cmplx_mult_cmplx_i8_xpulpv2(const int8_t *__restrict__ pSrcA, const int8_t *__restrict__ pSrcB, int8_t *__restrict__ pDst, uint32_t numSamples) 8-bit integer complex multiplied by complex.
void	plp_cmplx_mult_cmplx_i8_rv32im(const int8_t *__restrict__ pSrcA, const int8_t *__restrict__ pSrcB, int8_t *__restrict__ pDst, uint32_t numSamples) 8-bit integer complex multiplied by complex.
void	plp_cmplx_mult_cmplx_q32(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, int32_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied by complex of 32-bit fixed-point vectors.
void	plp_cmplx_mult_cmplx_q32_xpulpv2(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, int32_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex multiplied by complex.
void	plp_cmplx_mult_cmplx_q32_rv32im(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, int32_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex multiplied by complex.
void	plp_cmplx_mult_cmplx_q16(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, int16_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied by complex of 16-bit fixed-point vectors.
void	plp_cmplx_mult_cmplx_q16_xpulpv2(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, int16_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex multiplied by complex.
void	plp_cmplx_mult_cmplx_q16_rv32im(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, int16_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex multiplied by complex.
void	plp_cmplx_mult_cmplx_q8(const int8_t *__restrict__ pSrcA, const int8_t *__restrict__ pSrcB, int8_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied by complex of 8-bit fixed-point vectors.
void	plp_cmplx_mult_cmplx_q8_xpulpv2(const int8_t *__restrict__ pSrcA, const int8_t *__restrict__ pSrcB, int8_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex multiplied by complex.
void	plp_cmplx_mult_cmplx_q8_rv32im(const int8_t *__restrict__ pSrcA, const int8_t *__restrict__ pSrcB, int8_t *__restrict__ pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex multiplied by complex.
void	plp_euclidean_distance_q32_parallel(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, uint32_t nPE, uint32_t *__restrict__ pRes) Glue code for parallel Euclidean distance of 32-bit fixed point vectors.
void	plp_euclidean_distance_f32_parallel(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t nPE, float32_t *__restrict__ pRes) Glue code for parallel Euclidean distance between 32-bit float vectors.
void	plp_euclidean_distance_q32p_xpulpv2(void * S) Parallel Euclidean distance with interleaved access of 32-bit fixed point vectors kernel for XPULPV2 extension.
void	plp_euclidean_distance_f32p_xpulpv2(void * S) 32-bit floating-point parallel Euclidean distance between two vectors
void	plp_euclidean_distance_q32(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *__restrict__ pRes) Glue code for Euclidean distance of 32-bit fixed point vectors.
void	plp_euclidean_distance_q32s_xpulpv2(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *__restrict__ pRes) Euclidean distance of 32-bit fixed point vectors kernel for XPULPV2 extension.
void	plp_euclidean_distance_q32s_rv32im(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *__restrict__ pRes) Euclidean distance of 32-bit fixed point vectors.
void	plp_euclidean_distance_q16(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, uint16_t blockSize, uint16_t fracBits, int32_t *__restrict__ pRes) Glue code for Euclidean distance of 16-bit fixed point vectors.
void	plp_euclidean_distance_q16s_xpulpv2(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *__restrict__ pRes) Euclidean distance of 16-bit fixed point vectors kernel for XPULPV2.
void	plp_euclidean_distance_q16s_rv32im(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *__restrict__ pRes) Euclidean distance of 16-bit fixed point vectors kernel for RV32IM extension.
void	plp_euclidean_distance_f32(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, uint32_t blockSize, float32_t *__restrict__ pRes) Glue code for Euclidean distance between 32-bit float vectors.
void	plp_euclidean_distance_f32s_xpulpv2(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, uint32_t blockSize, float32_t *__restrict__ pRes) 32-bit floating point Euclidean distance between two vectors
void	plp_euclidean_distance_f32s_rv32im(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, uint32_t blockSize, float32_t *__restrict__ pRes) 32-bit floating point Euclidean distance between two vectors
void	plp_cosine_distance_q32_parallel(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, uint32_t nPE, int32_t *__restrict__ pRes) Glue code for parallel cosine distance between 32-bit fixed-precision vectors.
void	plp_cosine_distance_f32_parallel(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t nPE, float32_t *__restrict__ pRes) Glue code for parallel cosine distance between 32-bit float vectors.
void	plp_cosine_distance_f32p_xpulpv2(void * S) 32-bit floating-point parallel cosine distance between two vectors (computes power in parallel)
void	plp_cosine_distance_f32(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, uint32_t blockSize, float32_t *__restrict__ pRes) Glue code for cosine distance between 32-bit float vectors.
void	plp_cosine_distance_f32s_rv32im(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, uint32_t blockSize, float32_t *__restrict__ pRes) 32-bit floating point cosine distance between two vectors
void	plp_cosine_distance_f32s_xpulpv2(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, uint32_t blockSize, float32_t *__restrict__ pRes) 32-bit floating point cosine distance between two vectors
void	plp_cosine_distance_q32(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *__restrict__ pRes) Glue code for cosine distance of 32-bit fixed point vectors.
void	plp_cosine_distance_q32s_rv32im(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *__restrict__ pRes) Cosine distance of 32-bit fixed point vectors.
void	plp_cosine_distance_q32s_xpulpv2(const int32_t *__restrict__ pSrcA, const int32_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *__restrict__ pRes) Cosine distance of 32-bit fixed point vectors kernel for XPULPV2 extension.
void	plp_cosine_distance_q16(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, uint16_t blockSize, uint16_t fracBits, int32_t *__restrict__ pRes) Glue code for cosine distance of 16-bit fixed point vectors.
void	plp_cosine_distance_q16s_rv32im(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *__restrict__ pRes) Cosine distance of 16-bit fixed point vectors kernel for RV32IM extension.
void	plp_cosine_distance_q16s_xpulpv2(const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *__restrict__ pRes) Cosine distance of 16-bit fixed point vectors kernel for XPULPV2.

Defines

	Name
	PLP_MATH_IBEX
	PLP_MATH_LOOPUNROLL
	PLP_DWT_DEC_LEN(SIG_LEN, WAVELET, LEVEL)
	PLP_DWT_DEC_TEMP_LEN(SRC_LEN, WAVELET_LEN)
	PLP_DWT_OUTPUT_LENGTH(SIG_LEN, WAVELET_LEN)
	FAST_MATH_TABLE_SIZE Glue code for square root of a 32-bit floating point number.
	FAST_MATH_Q32_SHIFT
	FAST_MATH_Q16_SHIFT
	CONTROLLER_Q32_SHIFT
	TABLE_SPACING_Q32
	TABLE_SPACING_Q16

Detailed Description

Public header file for PULP DSP Library.

Version: V0

Date: 16. May 2019

Types Documentation

enum plp_dwt_wavelet_type

Enumerator	Value	Description
PLP_DWT_WAVELET_OTHER
PLP_DWT_WAVELET_HAAR
PLP_DWT_WAVELET_DB1
PLP_DWT_WAVELET_DB2
PLP_DWT_WAVELET_DB3
PLP_DWT_WAVELET_DB4
PLP_DWT_WAVELET_DB5
PLP_DWT_WAVELET_DB6
PLP_DWT_WAVELET_DB7
PLP_DWT_WAVELET_DB8
PLP_DWT_WAVELET_DB9
PLP_DWT_WAVELET_DB10
PLP_DWT_WAVELET_DB11
PLP_DWT_WAVELET_DB12
PLP_DWT_WAVELET_DB13
PLP_DWT_WAVELET_DB14
PLP_DWT_WAVELET_DB15
PLP_DWT_WAVELET_DB16
PLP_DWT_WAVELET_DB17
PLP_DWT_WAVELET_DB18
PLP_DWT_WAVELET_DB19
PLP_DWT_WAVELET_DB20
PLP_DWT_WAVELET_SYM2
PLP_DWT_WAVELET_SYM3
PLP_DWT_WAVELET_SYM4
PLP_DWT_WAVELET_SYM5
PLP_DWT_WAVELET_SYM6
PLP_DWT_WAVELET_SYM7
PLP_DWT_WAVELET_SYM8
PLP_DWT_WAVELET_SYM9
PLP_DWT_WAVELET_SYM10
PLP_DWT_WAVELET_SYM11
PLP_DWT_WAVELET_SYM12
PLP_DWT_WAVELET_SYM13
PLP_DWT_WAVELET_SYM14
PLP_DWT_WAVELET_SYM15
PLP_DWT_WAVELET_SYM16
PLP_DWT_WAVELET_SYM17
PLP_DWT_WAVELET_SYM18
PLP_DWT_WAVELET_SYM19
PLP_DWT_WAVELET_SYM20
PLP_DWT_WAVELET_COIF1
PLP_DWT_WAVELET_COIF2
PLP_DWT_WAVELET_COIF3
PLP_DWT_WAVELET_COIF4
PLP_DWT_WAVELET_COIF5
PLP_DWT_WAVELET_COIF6
PLP_DWT_WAVELET_COIF7
PLP_DWT_WAVELET_COIF8
PLP_DWT_WAVELET_COIF9
PLP_DWT_WAVELET_COIF10
PLP_DWT_WAVELET_COIF11
PLP_DWT_WAVELET_COIF12
PLP_DWT_WAVELET_COIF13
PLP_DWT_WAVELET_COIF14
PLP_DWT_WAVELET_COIF15
PLP_DWT_WAVELET_COIF16
PLP_DWT_WAVELET_COIF17

enum plp_dwt_extension_mode

Enumerator	Value	Description
PLP_DWT_MODE_ZERO
PLP_DWT_MODE_CONSTANT
PLP_DWT_MODE_SYMMETRIC
PLP_DWT_MODE_REFLECT
PLP_DWT_MODE_PERIODIC
PLP_DWT_MODE_ANTISYMMETRIC
PLP_DWT_MODE_ANTIREFLECT

typedef float32_t

typedef float float32_t;

Functions Documentation

function plp_dwt_max_level

uint32_t plp_dwt_max_level(
    uint32_t sig_len,
    uint32_t wavelet_len
)

Computes maximum available decomposition level for a signal length and wavelet length.

Parameters:

sig_len length of input signal
wavelet_len wavelet length

Return: Maximal decomposition level

function plp_dwt_dec_len

uint32_t plp_dwt_dec_len(
    uint32_t sig_len,
    uint32_t wavelet_len,
    uint32_t level
)

Calculates decomposition output length given a level.

Parameters:

sig_len length of input signal
wavelet_len wavelet length
level decomposition level (0 for maximal decomposition)

Return: Length of decomposition output buffer

function plp_dot_prod_i32_parallel

void plp_dot_prod_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t nPE,
    int32_t *__restrict__ pRes
)

Glue code for parallel dot product of 32-bit integer vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
nPE number of parallel processing units
pRes output result returned here

Return: none

function plp_dot_prod_q32_parallel

void plp_dot_prod_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    uint32_t nPE,
    int32_t *__restrict__ pRes
)

Glue code for parallel dot product of 32-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
deciPoint decimal point for right shift
nPE number of parallel processing units
pRes output result returned here

Return: none

function plp_dot_prod_f32_parallel

void plp_dot_prod_f32_parallel(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t nPE,
    float32_t *__restrict__ pRes
)

Glue code for parallel dot product of 32-bit float vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
nPE number of parallel processing units
pRes output result returned here

Return: none

function plp_dot_prod_i32p_xpulpv2

void plp_dot_prod_i32p_xpulpv2(
    void * S
)

Parallel dot product with interleaved access of 32-bit integer vectors kernel for XPULPV2 extension.

Parameters:

S points to the instance structure for integer parallel dot product

Return: none

function plp_dot_prod_q32p_xpulpv2

void plp_dot_prod_q32p_xpulpv2(
    void * S
)

Parallel dot product with interleaved access of 32-bit fixed point vectors kernel for XPULPV2 extension.

Parameters:

S points to the instance structure for fixed point parallel dot product

Return: none

function plp_dot_prod_f32p_xpulpv2

void plp_dot_prod_f32p_xpulpv2(
    void * S
)

Parallel dot product with interleaved access of 32-bit float vectors kernel for XPULPV2 extension.

Parameters:

S points to the instance structure for float parallel dot product

Return: none

function plp_dot_prod_i32

void plp_dot_prod_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Glue code for dot product of 32-bit integer vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
pRes output result returned here

Return: none

function plp_dot_prod_i32s_rv32im

void plp_dot_prod_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Scalar dot product of 32-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
pRes output result returned here

Return: none

function plp_dot_prod_i32s_xpulpv2

void plp_dot_prod_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Scalar dot product of 32-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
pRes output result returned here

Return: none

function plp_dot_prod_q32

void plp_dot_prod_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    int32_t *__restrict__ pRes
)

Glue code for dot product of 32-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here

Return: none

function plp_dot_prod_q32s_rv32im

void plp_dot_prod_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    int32_t *__restrict__ pRes
)

Scalar dot product of 32-bit fixed point vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here

Return: none

function plp_dot_prod_q32s_xpulpv2

void plp_dot_prod_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    int32_t *__restrict__ pRes
)

Scalar dot product of 32-bit fixed point vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here

Return: none

function plp_dot_prod_f32

void plp_dot_prod_f32(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes
)

Glue code for dot product of 32-bit float vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
pRes output result returned here

Return: none

function plp_dot_prod_f32s_xpulpv2

void plp_dot_prod_f32s_xpulpv2(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes
)

Scalar dot product of 32-bit float vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
pRes output result returned here

Return: none

function plp_dot_prod_f32s_rv32im

void plp_dot_prod_f32s_rv32im(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes
)

Scalar dot product of 32-bit float vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
pRes output result returned here

Return: none

function plp_dot_prod_i16

void plp_dot_prod_i16(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Glue code for dot product of 16-bit integer vectors.

Parameters:

pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
pRes output result returned here [32 bit]

Par: Exploiting SIMD instructions

When the ISA supports it, the 16-bit values are packed two by two into 32-bit vectors, and the two dot products are then performed simultaneously on the 32-bit vectors with a 32-bit accumulator.

function plp_dot_prod_i16s_rv32im

void plp_dot_prod_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Vectorized dot product of 16-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
pRes output result returned here [32 bit]

Return: none

Par: Exploiting SIMD instructions

When the ISA supports it, the 16-bit values are packed two by two into 32-bit vectors, and the two dot products are then performed simultaneously on the 32-bit vectors with a 32-bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).

function plp_dot_prod_i16s_xpulpv2

void plp_dot_prod_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Vectorized dot product of 16-bit integer vectors kernel singlecore for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
pRes output result returned here [32 bit]
pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with a 32 bit accumulator.

Vectorized dot product of 16-bit integer vectors kernel singlecore for XPULPV2 extension.

function plp_dot_prod_q16

void plp_dot_prod_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    int32_t *__restrict__ pRes
)

Glue code for dot product of 16-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]
pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

When the ISA supports it, the 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with a 32 bit accumulator.

function plp_dot_prod_q16s_rv32im

void plp_dot_prod_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    int32_t *__restrict__ pRes
)

Scalar dot product of 16-bit fixed point vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]
pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

When the ISA supports it, the 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with a 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).

function plp_dot_prod_q16s_xpulpv2

void plp_dot_prod_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    int32_t *__restrict__ pRes
)

Vectorized dot product of 16-bit fixed point vectors singlecore kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]
pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with a 32 bit accumulator.

function plp_dot_prod_i8

void plp_dot_prod_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Glue code for dot product of 8-bit integer vectors.

Parameters:

pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
pRes output result returned here [32 bit]
pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

When the ISA supports it, the 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed simultaneously on 32 bit vectors, with a 32 bit accumulator.

function plp_dot_prod_i8s_rv32im

void plp_dot_prod_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Scalar dot product of 8-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
pRes output result returned here [32 bit]
pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

When the ISA supports it, the 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed simultaneously on 32 bit vectors, with a 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).

Scalar dot product of 8-bit integer vectors kernel for RV32IM extension.

function plp_dot_prod_i8s_xpulpv2

void plp_dot_prod_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Vectorized dot product of 8-bit integer vectors singlecore kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
pRes output result returned here [32 bit]
pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with a 32 bit accumulator.

function plp_dot_prod_q8

void plp_dot_prod_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    int32_t *__restrict__ pRes
)

Glue code for dot product of 8-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]
pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

When the ISA supports it, the 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed simultaneously on 32 bit vectors, with a 32 bit accumulator.

function plp_dot_prod_q8s_rv32im

void plp_dot_prod_q8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    int32_t *__restrict__ pRes
)

Scalar dot product of 8-bit fixed point vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]
pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

When the ISA supports it, the 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed simultaneously on 32 bit vectors, with a 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).

function plp_dot_prod_q8s_xpulpv2

void plp_dot_prod_q8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    int32_t *__restrict__ pRes
)

Vectorized dot product of 8-bit fixed point vectors singlecore kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]
pSrcA points to the first input vector [8 bit]
pSrcB points to the second input vector [8 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with a 32 bit accumulator.

function plp_abs_i32

void plp_abs_i32(
    const int32_t * pSrc,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for absolute value of 32-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_abs_i32s_rv32im

void plp_abs_i32s_rv32im(
    const int32_t * pSrc,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element absolute value of 32-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_abs_i32s_xpulpv2

void plp_abs_i32s_xpulpv2(
    const int32_t * pSrc,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element absolute value of 32-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_abs_i16

void plp_abs_i16(
    const int16_t * pSrc,
    int16_t * pDst,
    uint32_t blockSize
)

Glue code for absolute value of 16-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_abs_i16s_rv32im

void plp_abs_i16s_rv32im(
    const int16_t * pSrc,
    int16_t * pDst,
    uint32_t blockSize
)

Element-by-element absolute value of 16-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_abs_i16s_xpulpv2

void plp_abs_i16s_xpulpv2(
    const int16_t * pSrc,
    int16_t * pDst,
    uint32_t blockSize
)

Element-by-element absolute value of 16-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_abs_i8

void plp_abs_i8(
    const int8_t * pSrc,
    int8_t * pDst,
    uint32_t blockSize
)

Glue code for absolute value of 8-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_abs_i8s_rv32im

void plp_abs_i8s_rv32im(
    const int8_t * pSrc,
    int8_t * pDst,
    uint32_t blockSize
)

Element-by-element absolute value of 8-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_abs_i8s_xpulpv2

void plp_abs_i8s_xpulpv2(
    const int8_t * pSrc,
    int8_t * pDst,
    uint32_t blockSize
)

Element-by-element absolute value of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_add_i32

void plp_add_i32(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for element-by-element addition of 32-bit integer vectors.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_add_i32s_rv32im

void plp_add_i32s_rv32im(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element addition of 32-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_add_i32s_xpulpv2

void plp_add_i32s_xpulpv2(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element addition of 32-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_add_i16

void plp_add_i16(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for element-by-element addition of 16-bit integer vectors.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_add_i16s_rv32im

void plp_add_i16s_rv32im(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element addition of 16-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_add_i16s_xpulpv2

void plp_add_i16s_xpulpv2(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element addition of 16-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_add_i8

void plp_add_i8(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for element-by-element addition of 8-bit integer vectors.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_add_i8s_rv32im

void plp_add_i8s_rv32im(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element addition of 8-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_add_i8s_xpulpv2

void plp_add_i8s_xpulpv2(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element addition of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_i32

void plp_mult_i32(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for element-by-element multiplication of 32-bit integer vectors.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_i32s_rv32im

void plp_mult_i32s_rv32im(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element multiplication of 32-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_i32s_xpulpv2

void plp_mult_i32s_xpulpv2(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element multiplication of 32-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_i16

void plp_mult_i16(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for element-by-element multiplication of 16-bit integer vectors.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_i16s_rv32im

void plp_mult_i16s_rv32im(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element multiplication of 16-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_i16s_xpulpv2

void plp_mult_i16s_xpulpv2(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element multiplication of 16-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_i8

void plp_mult_i8(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for element-by-element multiplication of 8-bit integer vectors.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_i8s_rv32im

void plp_mult_i8s_rv32im(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element multiplication of 8-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_i8s_xpulpv2

void plp_mult_i8s_xpulpv2(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Element-by-element multiplication of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_f32

void plp_mult_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    float32_t * pDst,
    uint32_t blockSize
)

Glue code for element-by-element multiplication of 32-bit float vectors.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mult_f32s_xpulpv2

void plp_mult_f32s_xpulpv2(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    float32_t * pDst,
    uint32_t blockSize
)

Element-by-element multiplication of 32-bit float vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

Element-by-element multiplication of 32-bit float vectors kernel for XPULPV2 extension.

function plp_mult_f32_parallel

void plp_mult_f32_parallel(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t nPE,
    float32_t *__restrict__ pDst
)

Glue code for parallel element-by-element multiplication of 32-bit float vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
nPE number of parallel processing units
pDst points to output vector

Return: none

function plp_mult_f32p_xpulpv2

void plp_mult_f32p_xpulpv2(
    void * S
)

Parallel multiplication with interleaved access of 32-bit float vectors kernel for XPULPV2 extension.

Parameters:

S points to the instance structure for float parallel multiplication

Return: none

function plp_log_f32_parallel

void plp_log_f32_parallel(
    const float32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t nPE,
    float32_t *__restrict__ pDst
)

Glue code for parallel log of 32-bit float vectors.

Parameters:

pSrc points to the input vector
blockSize number of samples in each vector
nPE number of parallel processing units
pDst points to output vector

Return: none

function plp_log_f32p_xpulpv2

void plp_log_f32p_xpulpv2(
    void * S
)

Parallel log with interleaved access of 32-bit float vectors kernel for XPULPV2 extension.

Parameters:

S points to the instance structure for float parallel log

Return: none

function plp_negate_i32

void plp_negate_i32(
    const int32_t * pSrc,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for negating the elements of a vector of 32-bit integers.

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

function plp_negate_i32s_rv32im

void plp_negate_i32s_rv32im(
    const int32_t * pSrc,
    int32_t * pDst,
    uint32_t blockSize
)

Negate the elements of a vector of 32-bit integers on RV32IM.

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

function plp_negate_i32s_xpulpv2

void plp_negate_i32s_xpulpv2(
    const int32_t * pSrc,
    int32_t * pDst,
    uint32_t blockSize
)

Negate the elements of a vector of 32-bit integers on XPULPV2.

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

function plp_negate_i16

void plp_negate_i16(
    const int16_t * pSrc,
    int16_t * pDst,
    uint32_t blockSize
)

Glue code for negating the elements of a vector of 16-bit integers.

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

function plp_negate_i16s_rv32im

void plp_negate_i16s_rv32im(
    const int16_t * pSrc,
    int16_t * pDst,
    uint32_t blockSize
)

Negate the elements of a vector of 16-bit integers on RV32IM.

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

function plp_negate_i16s_xpulpv2

void plp_negate_i16s_xpulpv2(
    const int16_t * pSrc,
    int16_t * pDst,
    uint32_t blockSize
)

Negate the elements of a vector of 16-bit integers on XPULPV2.

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two negations are performed simultaneously on 32 bit vectors.

function plp_negate_i8

void plp_negate_i8(
    const int8_t * pSrc,
    int8_t * pDst,
    uint32_t blockSize
)

Glue code for negating the elements of a vector of 8-bit integers.

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

function plp_negate_i8s_rv32im

void plp_negate_i8s_rv32im(
    const int8_t * pSrc,
    int8_t * pDst,
    uint32_t blockSize
)

negate the elements of a vector for 8-bit integers on RV32IM

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

function plp_negate_i8s_xpulpv2

void plp_negate_i8s_xpulpv2(
    const int8_t * pSrc,
    int8_t * pDst,
    uint32_t blockSize
)

negate the elements of a vector for 8-bit integers on XpulpV2

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four negations are performed simultaneously on the 32 bit vectors.

function plp_negate_f32

void plp_negate_f32(
    const float32_t * pSrc,
    float32_t * pDst,
    uint32_t blockSize
)

Glue code for negating the elements of a vector for 32-bit floats.

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

function plp_negate_f32s_xpulpv2

void plp_negate_f32s_xpulpv2(
    const float32_t * pSrc,
    float32_t * pDst,
    uint32_t blockSize
)

negate the elements of a vector for 32-bit floats on XpulpV2

Parameters:

pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.
pSrc points to input vector.
pDst points to output vector.
blockSize number of samples in each vector.

Return:

none
none

function plp_offset_i32

void plp_offset_i32(
    const int32_t * pSrc,
    int32_t offset,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for adding a constant offset to a vector for 32-bit integers.

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_offset_i32s_rv32im

void plp_offset_i32s_rv32im(
    const int32_t * pSrc,
    int32_t offset,
    int32_t * pDst,
    uint32_t blockSize
)

add a constant offset to a vector for 32-bit integers on RV32IM

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_offset_i32s_xpulpv2

void plp_offset_i32s_xpulpv2(
    const int32_t * pSrc,
    int32_t offset,
    int32_t * pDst,
    uint32_t blockSize
)

add a constant offset to a vector for 32-bit integers on XpulpV2

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_offset_i16

void plp_offset_i16(
    const int16_t * pSrc,
    int16_t offset,
    int16_t * pDst,
    uint32_t blockSize
)

Glue code for adding a constant offset to a vector for 16-bit integers.

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_offset_i16s_rv32im

void plp_offset_i16s_rv32im(
    const int16_t * pSrc,
    int16_t offset,
    int16_t * pDst,
    uint32_t blockSize
)

add a constant offset to a vector for 16-bit integers on RV32IM

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_offset_i16s_xpulpv2

void plp_offset_i16s_xpulpv2(
    const int16_t * pSrc,
    int16_t offset,
    int16_t * pDst,
    uint32_t blockSize
)

add a constant offset to a vector for 16-bit integers on XpulpV2

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two additions of the offset are performed simultaneously on the 32 bit vectors.

function plp_offset_i8

void plp_offset_i8(
    const int8_t * pSrc,
    int8_t offset,
    int8_t * pDst,
    uint32_t blockSize
)

Glue code for adding a constant offset to a vector for 8-bit integers.

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_offset_i8s_rv32im

void plp_offset_i8s_rv32im(
    const int8_t * pSrc,
    int8_t offset,
    int8_t * pDst,
    uint32_t blockSize
)

add a constant offset to a vector for 8-bit integers on RV32IM

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_offset_i8s_xpulpv2

void plp_offset_i8s_xpulpv2(
    const int8_t * pSrc,
    int8_t offset,
    int8_t * pDst,
    uint32_t blockSize
)

add a constant offset to a vector for 8-bit integers on XpulpV2

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four additions of the offset are performed simultaneously on the 32 bit vectors.

function plp_offset_f32

void plp_offset_f32(
    const float32_t * pSrc,
    float32_t offset,
    float32_t * pDst,
    uint32_t blockSize
)

Glue code for adding a constant offset to a vector for 32-bit floats.

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_offset_f32s_xpulpv2

void plp_offset_f32s_xpulpv2(
    const float32_t * pSrc,
    float32_t offset,
    float32_t * pDst,
    uint32_t blockSize
)

add a constant offset to a vector for 32-bit floats on XpulpV2

Parameters:

pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
offset is the offset to be added
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_sub_i32

void plp_sub_i32(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for vector subtraction for 32-bit integers.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_sub_i32s_rv32im

void plp_sub_i32s_rv32im(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

vector subtraction for 32-bit integers on RV32IM

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_sub_i32s_xpulpv2

void plp_sub_i32s_xpulpv2(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

vector subtraction for 32-bit integers on XpulpV2

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_sub_i16

void plp_sub_i16(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for vector subtraction for 16-bit integers.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_sub_i16s_rv32im

void plp_sub_i16s_rv32im(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

vector subtraction for 16-bit integers on RV32IM

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_sub_i16s_xpulpv2

void plp_sub_i16s_xpulpv2(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

vector subtraction for 16-bit integers on XpulpV2

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two subtractions are performed simultaneously on the 32 bit vectors.

function plp_sub_i8

void plp_sub_i8(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

Glue code for vector subtraction for 8-bit integers.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_sub_i8s_rv32im

void plp_sub_i8s_rv32im(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

vector subtraction for 8-bit integers on RV32IM

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_sub_i8s_xpulpv2

void plp_sub_i8s_xpulpv2(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    int32_t * pDst,
    uint32_t blockSize
)

vector subtraction for 8-bit integers on XpulpV2

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four subtractions are performed simultaneously on the 32 bit vectors.

function plp_sub_f32

void plp_sub_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    float32_t * pDst,
    uint32_t blockSize
)

Glue code for vector subtraction for 32-bit floats.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_sub_f32s_xpulpv2

void plp_sub_f32s_xpulpv2(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    float32_t * pDst,
    uint32_t blockSize
)

vector subtraction for 32-bit floats on XpulpV2

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector
pSrcA points to the first input vector
pSrcB points to the second input vector
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_scale_i32

void plp_scale_i32(
    const int32_t *__restrict__ pSrc,
    int32_t scaleFactor,
    int32_t shift,
    int32_t *__restrict__ pDst,
    uint32_t blockSize
)

Glue code for multiplying a vector by a scalar for 32-bit integers.

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_scale_i32s_rv32im

void plp_scale_i32s_rv32im(
    const int32_t *__restrict__ pSrc,
    int32_t scaleFactor,
    int32_t shift,
    int32_t *__restrict__ pDst,
    uint32_t blockSize
)

multiply a vector by a scalar for 32-bit integers on RV32IM

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_scale_i32s_xpulpv2

void plp_scale_i32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    int32_t scaleFactor,
    int32_t shift,
    int32_t *__restrict__ pDst,
    uint32_t blockSize
)

multiply a vector by a scalar for 32-bit integers on XpulpV2

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_scale_i16

void plp_scale_i16(
    const int16_t *__restrict__ pSrc,
    int16_t scaleFactor,
    int32_t shift,
    int16_t *__restrict__ pDst,
    uint32_t blockSize
)

Glue code for multiplying a vector by a scalar for 16-bit integers.

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_scale_i16s_rv32im

void plp_scale_i16s_rv32im(
    const int16_t *__restrict__ pSrc,
    int16_t scaleFactor,
    int32_t shift,
    int16_t *__restrict__ pDst,
    uint32_t blockSize
)

multiply a vector by a scalar for 16-bit integers on RV32IM

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_scale_i16s_xpulpv2

void plp_scale_i16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    int16_t scaleFactor,
    int32_t shift,
    int16_t *__restrict__ pDst,
    uint32_t blockSize
)

multiply a vector by a scalar for 16-bit integers on XpulpV2

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two elements are scaled simultaneously using SIMD operations on the 32 bit vectors.

function plp_scale_i8

void plp_scale_i8(
    const int8_t *__restrict__ pSrc,
    int8_t scaleFactor,
    int32_t shift,
    int8_t *__restrict__ pDst,
    uint32_t blockSize
)

Glue code for multiplying a vector by a scalar for 8-bit integers.

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_scale_i8s_rv32im

void plp_scale_i8s_rv32im(
    const int8_t *__restrict__ pSrc,
    int8_t scaleFactor,
    int32_t shift,
    int8_t *__restrict__ pDst,
    uint32_t blockSize
)

multiply a vector by a scalar for 8-bit integers on RV32IM

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_scale_i8s_xpulpv2

void plp_scale_i8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    int8_t scaleFactor,
    int32_t shift,
    int8_t *__restrict__ pDst,
    uint32_t blockSize
)

multiply a vector by a scalar for 8-bit integers on XpulpV2

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements before shifting
shift number of bits to shift the result by
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four elements are scaled simultaneously using SIMD operations on the 32 bit vectors.

function plp_scale_f32

void plp_scale_f32(
    const float32_t *__restrict__ pSrc,
    float32_t scaleFactor,
    float32_t *__restrict__ pDst,
    uint32_t blockSize
)

Glue code for multiplying a vector by a scalar for 32-bit floats.

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_scale_f32s_xpulpv2

void plp_scale_f32s_xpulpv2(
    const float32_t *__restrict__ pSrc,
    float32_t scaleFactor,
    float32_t *__restrict__ pDst,
    uint32_t blockSize
)

multiply a vector by a scalar for 32-bit floats on XpulpV2

Parameters:

pSrc points to the input vector
scaleFactor Factor to multiply all elements
pDst points to the output vector
blockSize number of samples in each vector
pSrc points to the input vector
scaleFactor Factor to multiply all elements
pDst points to the output vector
blockSize number of samples in each vector

Return:

none
none

function plp_fill_i32

void plp_fill_i32(
    int32_t value,
    int32_t *__restrict__ pDst,
    uint32_t blockSize
)

Glue code for filling a constant value into a 32-bit integer vector.

Parameters:

value input value to be filled
pDst points to output vector
blockSize number of samples in each vector
value input value to be filled
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_fill_i32s_rv32im

void plp_fill_i32s_rv32im(
    int32_t value,
    int32_t *__restrict__ pDst,
    uint32_t blockSize
)

Fills a constant value into a 32-bit integer vector for RV32IM extension.

Parameters:

value input value to be filled
pDst points to output vector
blockSize number of samples in each vector
value input value to be filled
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_fill_i32s_xpulpv2

void plp_fill_i32s_xpulpv2(
    int32_t value,
    int32_t *__restrict__ pDst,
    uint32_t blockSize
)

Fills a constant value into a 32-bit integer vector for XPULPV2 extension.

Parameters:

value input value to be filled
pDst points to output vector
blockSize number of samples in each vector
value input value to be filled
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_copy_i32

void plp_copy_i32(
    int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t blockSize
)

Glue code for copying the elements of a 32-bit integer vector.

Parameters:

pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector
pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_copy_i32s_rv32im

void plp_copy_i32s_rv32im(
    int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t blockSize
)

Copies the elements of a 32-bit integer vector for RV32IM extension.

Parameters:

pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector
pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_copy_i32s_xpulpv2

void plp_copy_i32s_xpulpv2(
    int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t blockSize
)

Copies the elements of a 32-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector
pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_copy_f32

void plp_copy_f32(
    float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    uint32_t blockSize
)

Glue code for copying the elements of a 32-bit float vector.

Parameters:

pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector
pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

Glue code for copying the elements of a 32-bit float vector.

function plp_copy_f32s_xpulpv2

void plp_copy_f32s_xpulpv2(
    float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    uint32_t blockSize
)

Copies the elements of a 32-bit float vector for XPULPV2 extension.

Parameters:

pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector
pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_copy_f32s_rv32im

void plp_copy_f32s_rv32im(
    float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    uint32_t blockSize
)

Copies the elements of a 32-bit float vector for RV32IM extension.

Parameters:

pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector
pSrc points to input vector
pDst points to output vector
blockSize number of samples in each vector

Return:

none
none

function plp_mean_f32

void plp_mean_f32(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Glue code for mean value of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

function plp_mean_f32s_xpulpv2

void plp_mean_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Mean value of a 32-bit float vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

Mean value of a 32-bit float vector for XPULPV2 extension.

function plp_mean_i32

void plp_mean_i32(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Glue code for mean value of a 32-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

function plp_mean_i32s_rv32im

void plp_mean_i32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Mean value of a 32-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

function plp_mean_i32s_xpulpv2

void plp_mean_i32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Mean value of a 32-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

function plp_mean_i16

void plp_mean_i16(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int16_t *__restrict__ pRes
)

Glue code for mean value of a 16-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

function plp_mean_i16s_rv32im

void plp_mean_i16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int16_t *__restrict__ pRes
)

Mean value of a 16-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

function plp_mean_i16s_xpulpv2

void plp_mean_i16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int16_t *__restrict__ pRes
)

Mean value of a 16-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

function plp_mean_i8

void plp_mean_i8(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int8_t *__restrict__ pRes
)

Glue code for mean value of a 8-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

function plp_mean_i8s_rv32im

void plp_mean_i8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int8_t *__restrict__ pRes
)

Mean value of a 8-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

function plp_mean_i8s_xpulpv2

void plp_mean_i8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int8_t *__restrict__ pRes
)

Mean value of a 8-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes mean value returned here

Return:

none
none

function plp_max_f32

void plp_max_f32(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Glue code for max value of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

function plp_max_f32s_xpulpv2

void plp_max_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Kernel for max value of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

Kernel for max value of a 32-bit float vector.

function plp_max_i32

void plp_max_i32(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Glue code for max value of a 32-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

function plp_max_i32s_rv32im

void plp_max_i32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Max value of a 32-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

function plp_max_i32s_xpulpv2

void plp_max_i32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Max value of a 32-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

function plp_max_i16

void plp_max_i16(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int16_t *__restrict__ pRes
)

Glue code for max value of a 16-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

function plp_max_i16s_rv32im

void plp_max_i16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int16_t *__restrict__ pRes
)

Max value of a 16-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

function plp_max_i16s_xpulpv2

void plp_max_i16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int16_t *__restrict__ pRes
)

Max value of a 16-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

function plp_max_i8

void plp_max_i8(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int8_t *__restrict__ pRes
)

Glue code for max value of a 8-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

function plp_max_i8s_rv32im

void plp_max_i8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int8_t *__restrict__ pRes
)

Max value of a 8-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

function plp_max_i8s_xpulpv2

void plp_max_i8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int8_t *__restrict__ pRes
)

Max value of a 8-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes max value returned here

Return:

none
none

function plp_min_f32

void plp_min_f32(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Glue code for min value of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

function plp_min_f32s_xpulpv2

void plp_min_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Kernel for min value of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

Kernel for min value of a 32-bit float vector.

function plp_min_i32

void plp_min_i32(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Glue code for min value of a 32-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

function plp_min_i32s_rv32im

void plp_min_i32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Min value of a 32-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

function plp_min_i32s_xpulpv2

void plp_min_i32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Min value of a 32-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

function plp_min_i16

void plp_min_i16(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int16_t *__restrict__ pRes
)

Glue code for min value of a 16-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

function plp_min_i16s_rv32im

void plp_min_i16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int16_t *__restrict__ pRes
)

Min value of a 16-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

function plp_min_i16s_xpulpv2

void plp_min_i16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int16_t *__restrict__ pRes
)

Min value of a 16-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

function plp_min_i8

void plp_min_i8(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int8_t *__restrict__ pRes
)

Glue code for min value of a 8-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

function plp_min_i8s_rv32im

void plp_min_i8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int8_t *__restrict__ pRes
)

Min value of a 8-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

function plp_min_i8s_xpulpv2

void plp_min_i8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int8_t *__restrict__ pRes
)

Min value of a 8-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes min value returned here

Return:

none
none

function plp_power_f32_parallel

void plp_power_f32_parallel(
    const float32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t nPE,
    float32_t *__restrict__ pRes
)

Glue code for parallel power of 32-bit floating point vectors.

Parameters:

pSrc points to the input vector
blockSize number of samples in each vector
nPE number of parallel processing units
pRes output result returned here

Return: none

function plp_power_f32p_xpulpv2

void plp_power_f32p_xpulpv2(
    void * S
)

Parallel sum of squares of a 32-bit float vector for XPULPV2 extension.

Parameters:

S points to the instance structure for floating-point parallel power

Return: none

function plp_power_f32

void plp_power_f32(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Glue code for Sum of squares of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

Glue code for Sum of squares of a 32-bit float vector.

function plp_power_f32s_xpulpv2

void plp_power_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Kernel for Sum of squares of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

Kernel for Sum of squares of a 32-bit float vector.

function plp_power_f32s_rv32im

void plp_power_f32s_rv32im(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Sum of squares of a 32-bit float vector for RV32IM.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return: none

function plp_power_i32

void plp_power_i32(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Glue code for Sum of squares of a 32-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

Glue code for Sum of squares of a 32-bit integer vector.

function plp_power_i32s_rv32im

void plp_power_i32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Sum of squares of a 32-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

function plp_power_i32s_xpulpv2

void plp_power_i32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Sum of squares of a 32-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

function plp_power_i16

void plp_power_i16(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Glue code for Sum of squares of a 16-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

Glue code for Sum of squares of a 16-bit integer vector.

function plp_power_i16s_rv32im

void plp_power_i16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Sum of squares of a 16-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

function plp_power_i16s_xpulpv2

void plp_power_i16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Sum of squares of a 16-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

function plp_power_i8

void plp_power_i8(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Glue code for Sum of squares of a 8-bit integer vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

Glue code for Sum of squares of a 8-bit integer vector.

function plp_power_i8s_rv32im

void plp_power_i8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Sum of squares of a 8-bit integer vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

function plp_power_i8s_xpulpv2

void plp_power_i8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    int32_t *__restrict__ pRes
)

Sum of squares of a 8-bit integer vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Sum of squares value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes sum of squares returned here

Return:

none
none

function plp_power_q32_parallel

void plp_power_q32_parallel(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    uint32_t nPE,
    int32_t *__restrict__ pRes
)

Glue code for parallel power of 32-bit fixed point vectors.

Parameters:

pSrc points to the input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
nPE number of parallel processing units
pRes output result returned here
pSrc points to the input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
nPE number of parallel processing units
pRes output result returned here

Return:

none
none

Glue code for parallel power of 32-bit fixed point vectors.

function plp_power_q32p_xpulpv2

void plp_power_q32p_xpulpv2(
    void * S
)

Parallel sum of squares of a 32-bit fixed-point vector for XPULPV2 extension.

Parameters:

S points to the instance structure for fixed-point parallel power
S points to the instance structure for fixed-point parallel power

Return:

none
none

Parallel sum of squares of a 32-bit fixed-point vector for XPULPV2 extension.

function plp_power_q32

void plp_power_q32(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for Sum of squares of a 32-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes sum of squares returned here

Return:

none
none

Glue code for Sum of squares of a 32-bit fixed point vector.

function plp_power_q32s_rv32im

void plp_power_q32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Sum of squares of a 32-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Sum of squares value returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes sum of squares returned here

Return:

none
none

function plp_power_q32s_xpulpv2

void plp_power_q32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Sum of squares of a 32-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes sum of squares returned here

Return:

none
none

function plp_power_q16

void plp_power_q16(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for Sum of squares of a 16-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes sum of squares returned here

Return:

none
none

Glue code for Sum of squares of a 16-bit fixed point vector.

function plp_power_q16s_rv32im

void plp_power_q16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Sum of squares of a 16-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes sum of squares returned here

Return:

none
none

function plp_power_q16s_xpulpv2

void plp_power_q16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Sum of squares of a 16-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes sum of squares returned here

Return:

none
none

function plp_power_q8

void plp_power_q8(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for Sum of squares of a 8-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes sum of squares returned here

Return:

none
none

Glue code for Sum of squares of a 8-bit fixed point vector.

function plp_power_q8s_rv32im

void plp_power_q8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Sum of squares of a 8-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Sum of squares returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes sum of squares returned here

Return:

none
none

function plp_power_q8s_xpulpv2

void plp_power_q8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Sum of squares of a 8-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Sum of squares value returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes sum of squares returned here

Return:

none
none

function plp_var_f32

void plp_var_f32(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Glue code for Statistical variance of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Statistical variance returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes variance value returned here

Return:

none
none

Glue code for Statistical variance of a 32-bit float vector.

function plp_var_f32s_xpulpv2

void plp_var_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Kernel for Statistical variance of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Statistical variance returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes variance value returned here

Return:

none
none

Kernel for Statistical variance of a 32-bit float vector.

function plp_var_q32

void plp_var_q32(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for Statistical variance of a 32-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical variance returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes variance value returned here

Return:

none
none

Glue code for Statistical variance of a 32-bit fixed point vector.

function plp_var_q32s_rv32im

void plp_var_q32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Statistical variance of a 32-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical variance value returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Variance returned here

Return:

none
none

Statistical variance of a 32-bit fixed point vector for RV32IM extension.

function plp_var_q32s_xpulpv2

void plp_var_q32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Statistical variance of a 32-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical variance returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes variance returned here

Return:

none
none

Statistical variance of a 32-bit fixed point vector for XPULPV2 extension.

function plp_var_q16

void plp_var_q16(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int16_t *__restrict__ pRes
)

Glue code for Statistical variance of a 16-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical variance returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes variance value returned here

Return:

none
none

Glue code for Statistical variance of a 16-bit fixed point vector.

function plp_var_q16s_rv32im

void plp_var_q16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int16_t *__restrict__ pRes
)

Statistical variance of a 16-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical variance returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes variance returned here

Return:

none
none

Statistical variance of a 16-bit fixed point vector for RV32IM extension.

function plp_var_q16s_xpulpv2

void plp_var_q16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int16_t *__restrict__ pRes
)

Statistical variance of a 16-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical variance returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes variance returned here

Return:

none
none

Statistical variance of a 16-bit fixed point vector for XPULPV2 extension.

function plp_var_q8

void plp_var_q8(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int8_t *__restrict__ pRes
)

Glue code for Statistical variance of an 8-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical variance returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes variance value returned here

Return:

none
none

Glue code for Statistical variance of an 8-bit fixed point vector.

function plp_var_q8s_rv32im

void plp_var_q8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int8_t *__restrict__ pRes
)

Statistical variance of an 8-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical variance returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes variance returned here

Return:

none
none

Statistical variance of an 8-bit fixed point vector for RV32IM extension.

function plp_var_q8s_xpulpv2

void plp_var_q8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int8_t *__restrict__ pRes
)

Statistical variance of an 8-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical variance value returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes variance returned here

Return:

none
none

Statistical variance of an 8-bit fixed point vector for XPULPV2 extension.

function plp_std_f32

void plp_std_f32(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Glue code for Statistical standard deviation of a 32-bit floating point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Statistical standard deviation returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes standard deviation returned here

Return:

none
none

Glue code for Statistical standard deviation of a 32-bit floating point vector.

function plp_std_f32s_xpulpv2

void plp_std_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Kernel for Statistical standard deviation of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Statistical standard deviation returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes standard deviation returned here

Return:

none
none

Kernel for Statistical standard deviation of a 32-bit float vector.

function plp_std_q32

void plp_std_q32(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for Statistical standard deviation of a 32-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical standard deviation returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes standard deviation returned here

Return:

none
none

Glue code for Statistical standard deviation of a 32-bit fixed point vector.

function plp_std_q32s_rv32im

void plp_std_q32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Statistical standard deviation of a 32-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical standard deviation value returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes standard deviation returned here

Return:

none
none

Statistical standard deviation of a 32-bit fixed point vector for RV32IM extension.

function plp_std_q32s_xpulpv2

void plp_std_q32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Statistical standard deviation of a 32-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical standard deviation returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes standard deviation returned here

Return:

none
none

Statistical standard deviation of a 32-bit fixed point vector for XPULPV2 extension.

function plp_std_q16

void plp_std_q16(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int16_t *__restrict__ pRes
)

Glue code for Statistical standard deviation of a 16-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical standard deviation returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes standard deviation returned here

Return:

none
none

Glue code for Statistical standard deviation of a 16-bit fixed point vector.

function plp_std_q16s_rv32im

void plp_std_q16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int16_t *__restrict__ pRes
)

Statistical standard deviation of a 16-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes Statistical standard deviation returned here
pSrc points to the input vector
blockSize number of samples in input vector
fracBits number of fixed point fractional bits
pRes standard deviation returned here

Return:

none
none

Statistical standard deviation of a 16-bit fixed point vector for RV32IM extension.

function plp_std_q16s_xpulpv2

void plp_std_q16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int16_t *__restrict__ pRes
)

Statistical standard deviation of a 16-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Statistical standard deviation returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes standard deviation returned here

Return:

none
none

Statistical standard deviation of a 16-bit fixed point vector for XPULPV2 extension.

function plp_std_q8

void plp_std_q8(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int8_t *__restrict__ pRes
)

Glue code for statistical standard deviation of an 8-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pResult Statistical standard deviation returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes standard deviation returned here

Return:

none
none

Glue code for statistical standard deviation of an 8-bit fixed point vector.

function plp_std_q8s_rv32im

void plp_std_q8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int8_t *__restrict__ pRes
)

Statistical standard deviation of an 8-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pResult Statistical standard deviation returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes standard deviation returned here

Return:

none
none

Statistical standard deviation of an 8-bit fixed point vector for RV32IM extension.

function plp_std_q8s_xpulpv2

void plp_std_q8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int8_t *__restrict__ pRes
)

Statistical standard deviation of an 8-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes Statistical standard deviation value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes standard deviation returned here

Return:

none
none

Statistical standard deviation of an 8-bit fixed point vector for XPULPV2 extension.

function plp_rms_f32

void plp_rms_f32(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Glue code for RMS value of a 32-bit floating point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pResult RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

Glue code for RMS value of a 32-bit floating point vector.

function plp_rms_f32s_xpulpv2

void plp_rms_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    uint32_t blockSize,
    float *__restrict__ pRes
)

Kernel for RMS value of a 32-bit float vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pResult RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

Kernel for RMS value of a 32-bit float vector.

function plp_rms_q32

void plp_rms_q32(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for RMS value of a 32-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pResult RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

Glue code for RMS value of a 32-bit fixed point vector.

function plp_rms_q32s_rv32im

void plp_rms_q32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

RMS value of a 32-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pResult RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

RMS value of a 32-bit fixed point vector for RV32IM extension.

function plp_rms_q32s_xpulpv2

void plp_rms_q32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

RMS value of a 32-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

RMS value of a 32-bit fixed point vector for XPULPV2 extension.

function plp_rms_q16

void plp_rms_q16(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int16_t *__restrict__ pRes
)

Glue code for RMS value of a 16-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pResult RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

Glue code for RMS value of a 16-bit fixed point vector.

function plp_rms_q16s_rv32im

void plp_rms_q16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int16_t *__restrict__ pRes
)

RMS value of a 16-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pResult RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

RMS value of a 16-bit fixed point vector for RV32IM extension.

function plp_rms_q16s_xpulpv2

void plp_rms_q16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int16_t *__restrict__ pRes
)

RMS value of a 16-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

RMS value of a 16-bit fixed point vector for XPULPV2 extension.

function plp_rms_q8

void plp_rms_q8(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int8_t *__restrict__ pRes
)

Glue code for RMS value of an 8-bit fixed point vector.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pResult RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

Glue code for RMS value of an 8-bit fixed point vector.

function plp_rms_q8s_rv32im

void plp_rms_q8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int8_t *__restrict__ pRes
)

RMS value of an 8-bit fixed point vector for RV32IM extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pResult RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

RMS value of an 8-bit fixed point vector for RV32IM extension.

function plp_rms_q8s_xpulpv2

void plp_rms_q8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t blockSize,
    uint32_t fracBits,
    int8_t *__restrict__ pRes
)

RMS value of an 8-bit fixed point vector for XPULPV2 extension.

Parameters:

pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here
pSrc points to the input vector
blockSize number of samples in input vector
pRes RMS value returned here

Return:

none
none

RMS value of an 8-bit fixed point vector for XPULPV2 extension.

function plp_sqrt_q32

void plp_sqrt_q32(
    const int32_t *__restrict__ pSrc,
    const uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for square root of a 32-bit fixed point number.

Parameters:

in 32-Bit input integer
out Square root of the input
pSrc points to the input vector
pRes Square root returned here

Return:

none
none

function plp_sqrt_q32s_rv32im

void plp_sqrt_q32s_rv32im(
    const int32_t *__restrict__ pSrc,
    const uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Square root of a 32-bit fixed point number for RV32IM extension.

Parameters:

in 32-Bit input integer
out Square root of the input
pSrc points to the input vector
pRes Square root returned here

Return:

none
none

Square root of a 32-bit fixed point number for RV32IM extension.

function plp_sqrt_q32s_xpulpv2

void plp_sqrt_q32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    const uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Square root of a 32-bit fixed point number for XPULPV2 extension.

Parameters:

in 32-Bit input integer
out Square root of the input
pSrc points to the input vector
pRes Square root returned here

Return:

none
none

function plp_sqrt_q16

void plp_sqrt_q16(
    const int16_t *__restrict__ pSrc,
    const uint32_t fracBits,
    int16_t *__restrict__ pRes
)

Glue code for square root of a 16-bit fixed point number.

Parameters:

in 16-Bit input integer
out Square root of the input
pSrc points to the input vector
pRes Square root returned here

Return:

none
none

function plp_sqrt_q16s_rv32im

void plp_sqrt_q16s_rv32im(
    const int16_t *__restrict__ pSrc,
    const uint32_t fracBits,
    int16_t *__restrict__ pRes
)

Square root of a 16-bit fixed point number for RV32IM extension.

Parameters:

in 16-Bit input integer
out Square root of the input
pSrc points to the input vector
pRes Square root returned here

Return:

none
none

Square root of a 16-bit fixed point number for RV32IM extension.

function plp_sqrt_q16s_xpulpv2

void plp_sqrt_q16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    const uint32_t fracBits,
    int16_t *__restrict__ pRes
)

Square root of a 16-bit fixed point number for XPULPV2 extension.

Parameters:

in 16-Bit input integer
out Square root of the input
pSrc points to the input vector
pRes Square root returned here

Return:

none
none

function plp_sqrt_f32

void plp_sqrt_f32(
    const float *__restrict__ pSrc,
    float *__restrict__ pRes
)

Glue code for square root of a 32-bit floating point number.

Parameters:

pSrc points to the input vector
pRes Square root returned here
pSrc points to the input vector
pRes Square root returned here

Return:

none
none

function plp_sqrt_f32s_rv32im

void plp_sqrt_f32s_rv32im(
    const float *__restrict__ pSrc,
    float *__restrict__ pRes
)

Square root of a 32-bit floating point number for RV32IM.

Parameters:

pSrc points to the input vector
pRes Square root returned here

Return: none

function plp_sqrt_f32s_xpulpv2

void plp_sqrt_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    float *__restrict__ pRes
)

Kernel for square root of a 32-bit floating point number.

Parameters:

pSrc points to the input vector
pRes Square root returned here
pSrc points to the input vector
pRes Square root returned here

Return:

none
none

Kernel for square root of a 32-bit floating point number.

function plp_cos_q32

int32_t plp_cos_q32(
    int32_t x
)

Glue code for q32 cosine function.

Parameters:

x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

cos(x)
cos(x)

function plp_cos_q32s_rv32im

int32_t plp_cos_q32s_rv32im(
    int32_t x
)

q32 cosine function for RV32IM

Parameters:

x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

cos(x)
cos(x)

function plp_cos_q32s_xpulpv2

int32_t plp_cos_q32s_xpulpv2(
    int32_t x
)

q32 cosine function for XPULPV2

Parameters:

x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

cos(x)
cos(x)

function plp_cos_q16

int16_t plp_cos_q16(
    int16_t x
)

Glue code for q16 cosine function.

Parameters:

x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

cos(x)
cos(x)

function plp_cos_q16s_rv32im

int16_t plp_cos_q16s_rv32im(
    int16_t x
)

q16 cosine function for RV32IM

Parameters:

x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

cos(x)
cos(x)

function plp_cos_q16s_xpulpv2

int16_t plp_cos_q16s_xpulpv2(
    int16_t x
)

q16 cosine function for XPULPV2

Parameters:

x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

cos(x)
cos(x)

function plp_cos_f32

float32_t plp_cos_f32(
    float32_t x
)

Glue code for f32 cosine function.

Parameters:

x input value in radians
x input value in radians

Return:

cos(x)
cos(x)

function plp_cos_f32s_xpulpv2

float32_t plp_cos_f32s_xpulpv2(
    float32_t x
)

F32 cosine function for XPULPV2.

Parameters:

x input value in radians
x input value in radians

Return:

cos(x)
cos(x)

F32 cosine function for XPULPV2.

function plp_sin_q32

int32_t plp_sin_q32(
    int32_t x
)

Glue code for q32 sine function.

Parameters:

x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

sin(x)
sin(x)

function plp_sin_q32s_rv32im

int32_t plp_sin_q32s_rv32im(
    int32_t x
)

q32 sine function for RV32IM

Parameters:

x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

sin(x)
sin(x)

function plp_sin_q32s_xpulpv2

int32_t plp_sin_q32s_xpulpv2(
    int32_t x
)

q32 sine function for XPULPV2

Parameters:

x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

sin(x)
sin(x)

function plp_sin_q16

int16_t plp_sin_q16(
    int16_t x
)

Glue code for q16 sine function.

Parameters:

x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

sin(x)
sin(x)

function plp_sin_q16s_rv32im

int16_t plp_sin_q16s_rv32im(
    int16_t x
)

q16 sine function for RV32IM

Parameters:

x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

sin(x)
sin(x)

function plp_sin_q16s_xpulpv2

int16_t plp_sin_q16s_xpulpv2(
    int16_t x
)

q16 sine function for XPULPV2

Parameters:

x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)

Return:

sin(x)
sin(x)

function plp_sin_f32

float32_t plp_sin_f32(
    float32_t x
)

Glue code for f32 sine function.

Parameters:

x input value in radians
x input value in radians

Return:

sin(x)
sin(x)

function plp_sin_f32s_xpulpv2

float32_t plp_sin_f32s_xpulpv2(
    float32_t x
)

F32 sine function for XPULPV2.

Parameters:

x input value in radians
x input value in radians

Return:

sin(x)
sin(x)

function plp_correlate_i32

void plp_correlate_i32(
    const int32_t * pSrcA,
    const uint32_t srcALen,
    const int32_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for correlation of 32-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

function plp_correlate_i32s_rv32im

void plp_correlate_i32s_rv32im(
    const int32_t * pSrcA,
    const uint32_t srcALen,
    const int32_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Correlation of 32-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_correlate_i32s_xpulpv2

void plp_correlate_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const uint32_t srcALen,
    const int32_t *__restrict__ pSrcB,
    const uint32_t srcBLen,
    int32_t *__restrict__ pRes
)

Correlation of 32-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

function plp_correlate_i16

void plp_correlate_i16(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for correlation of 16-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes result returned here

Return: none

function plp_correlate_i16s_xpulpv2

void plp_correlate_i16s_xpulpv2(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Correlation of 16-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_correlate_i16s_rv32im

void plp_correlate_i16s_rv32im(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Correlation of 16-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_correlate_i8

void plp_correlate_i8(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for correlation of 8-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

function plp_correlate_valid_i8

void plp_correlate_valid_i8(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for correlation (valid) of 8-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

function plp_correlate_i8s_xpulpv2

void plp_correlate_i8s_xpulpv2(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Correlation of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_correlate_i8s_rv32im

void plp_correlate_i8s_rv32im(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Correlation of 8-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_correlate_q32

void plp_correlate_q32(
    const int32_t * pSrcA,
    const uint32_t srcALen,
    const int32_t * pSrcB,
    const uint32_t srcBLen,
    const uint32_t fracBits,
    int32_t * pRes
)

Glue code for correlation of 32-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

function plp_correlate_q32s_rv32im

void plp_correlate_q32s_rv32im(
    const int32_t * pSrcA,
    const uint32_t srcALen,
    const int32_t * pSrcB,
    const uint32_t srcBLen,
    const uint32_t fracBits,
    int32_t * pRes
)

Correlation of 32-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

Correlation of 32-bit integer vectors kernel for RV32IM extension.

function plp_correlate_q32s_xpulpv2

void plp_correlate_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const uint32_t srcALen,
    const int32_t *__restrict__ pSrcB,
    const uint32_t srcBLen,
    const uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Correlation of 32-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

function plp_correlate_q16

void plp_correlate_q16(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    const uint32_t fracBits,
    int32_t * pRes
)

Glue code for correlation of 16-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes result returned here

Return: none

function plp_correlate_q16s_xpulpv2

void plp_correlate_q16s_xpulpv2(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    const uint32_t fracBits,
    int32_t * pRes
)

Correlation of 16-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

Correlation of 16-bit integer vectors kernel for XPULPV2 extension.

function plp_correlate_q16s_rv32im

void plp_correlate_q16s_rv32im(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    const uint32_t fracBits,
    int32_t * pRes
)

Correlation of 16-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

Correlation of 16-bit integer vectors kernel for RV32IM extension.

function plp_correlate_q8

void plp_correlate_q8(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    const uint32_t fracBits,
    int32_t * pRes
)

Glue code for correlation of 8-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

function plp_correlate_valid_q8

void plp_correlate_valid_q8(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    const uint32_t fracBits,
    int32_t * pRes
)

Glue code for correlation (valid) of 8-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

function plp_correlate_q8s_xpulpv2

void plp_correlate_q8s_xpulpv2(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    const uint32_t fracBits,
    int32_t * pRes
)

Correlation of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

Correlation of 8-bit integer vectors kernel for XPULPV2 extension.

function plp_correlate_q8s_rv32im

void plp_correlate_q8s_rv32im(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    const uint32_t fracBits,
    int32_t * pRes
)

Correlation of 8-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

Correlation of 8-bit integer vectors kernel for RV32IM extension.

function plp_conv_i32

void plp_conv_i32(
    const int32_t * pSrcA,
    const uint32_t srcALen,
    const int32_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for convolution of 32-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes result returned here

Return: none

function plp_conv_valid_i32

void plp_conv_valid_i32(
    const int32_t * pSrcA,
    const uint32_t srcALen,
    const int32_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for convolution (valid) of 32-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here, of size |srcALen - srcBLen| + 1

Return:

none
none

Glue code for convolution (valid) of 32-bit integer vectors.

function plp_conv_i32s_rv32im

void plp_conv_i32s_rv32im(
    const int32_t * pSrcA,
    const uint32_t srcALen,
    const int32_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution of 32-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_conv_i32s_xpulpv2

void plp_conv_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const uint32_t srcALen,
    const int32_t *__restrict__ pSrcB,
    const uint32_t srcBLen,
    int32_t *__restrict__ pRes
)

Convolution of 32-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

function plp_conv_valid_i32s_xpulpv2

void plp_conv_valid_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const uint32_t srcALen,
    const int32_t *__restrict__ pSrcB,
    const uint32_t srcBLen,
    int32_t *__restrict__ pRes
)

Convolution (valid) of 32-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return: none

function plp_conv_i16

void plp_conv_i16(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for convolution of 16-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_conv_valid_i16

void plp_conv_valid_i16(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for convolution (valid) of 16-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here, of size |srcALen - srcBLen| + 1

Return:

none
none

Glue code for convolution (valid) of 16-bit integer vectors.

function plp_conv_valid_rep_i16

void plp_conv_valid_rep_i16(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for convolution (valid with replication) of 16-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector, must be on L2
srcALen Length of the first input vector
pSrcB points to the second input vector, must be on L2
srcBLen Length of the second input vector
pRes output result returned here, of size |srcALen - srcBLen| + 1, preferably in L1

Return:

none
none

Glue code for convolution (valid with replication) of 16-bit integer vectors.

function plp_conv_i16s_xpulpv2

void plp_conv_i16s_xpulpv2(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution of 16-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_conv_valid_i16s_xpulpv2

void plp_conv_valid_i16s_xpulpv2(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution (valid) of 16-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

Convolution (valid) of 16-bit integer vectors kernel for XPULPV2 extension.

function plp_conv_valid_rep_i16s_xpulpv2

void plp_conv_valid_rep_i16s_xpulpv2(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const uint32_t srcAMem,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution (valid with data replication) of 16-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector of the replicated data
srcALen Number of elements in (unreplicated) vector a
srcAMem Number of elements between each replication
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector of the replicated data
srcALen Number of elements in (unreplicated) vector a
srcAMem Number of elements between each replication
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

Convolution (valid with data replication) of 16-bit integer vectors kernel for XPULPV2 extension.

function plp_conv_i16s_rv32im

void plp_conv_i16s_rv32im(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution of 16-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_conv_i8

void plp_conv_i8(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for convolution of 8-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_conv_valid_i8

void plp_conv_valid_i8(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for convolution (valid) of 8-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here, of size |srcALen - srcBLen| + 1

Return:

none
none

Glue code for convolution (valid) of 8-bit integer vectors.

function plp_conv_valid_rep_i8

void plp_conv_valid_rep_i8(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Glue code for convolution (valid with data replication) of 8-bit integer vectors.

Parameters:

pSrcA points to the first input vector (in L2)
srcALen Length of the first input vector
pSrcB points to the second input vector (in L2)
srcBLen Length of the second input vector
pRes output result returned here (preferably in L1)
pSrcA points to the first input vector, must be on L2
srcALen Length of the first input vector
pSrcB points to the second input vector, must be on L2
srcBLen Length of the second input vector
pRes output result returned here, of size |srcALen - srcBLen| + 1, preferably in L1

Return:

none
none

Glue code for convolution (valid with data replication) of 8-bit integer vectors.

function plp_conv_i8s_xpulpv2

void plp_conv_i8s_xpulpv2(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_conv_valid_i8s_xpulpv2

void plp_conv_valid_i8s_xpulpv2(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution (valid) of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

Convolution (valid) of 8-bit integer vectors kernel for XPULPV2 extension.

function plp_conv_valid_rep_i8s_xpulpv2

void plp_conv_valid_rep_i8s_xpulpv2(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const uint32_t srcAMem,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution (valid with data replication) of 8-bit integer vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector of the replicated data
srcALen Number of elements in (unreplicated) vector a
srcAMem Number of elements between each replication
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector of the replicated data
srcALen Number of elements in (unreplicated) vector a
srcAMem Number of elements between each replication
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

Convolution (valid with data replication) of 8-bit integer vectors kernel for XPULPV2 extension.

function plp_conv_i8s_rv32im

void plp_conv_i8s_rv32im(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    int32_t * pRes
)

Convolution of 8-bit integer vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
pRes output result returned here

Return:

none
none

function plp_conv_i32_parallel

void plp_conv_i32_parallel(
    const int32_t * pSrcA,
    const uint32_t srcALen,
    const int32_t * pSrcB,
    const uint32_t srcBLen,
    const uint8_t nPE,
    int32_t * pRes
)

Glue code for parallel convolution of 32-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
nPE Number of cores to compute on
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
nPE Number of cores to compute on
pRes output result returned here

Return:

none
none

function plp_conv_i32p_xpulpv2

void plp_conv_i32p_xpulpv2(
    void * task_args
)

Setup code for parallel convolution of 32-bit integer vectors.

Parameters:

task_args pointer to plp_conv_instance_i32 struct initialized by plp_conv_i32_parallel
task_args pointer to plp_conv_instance_i32 struct initialized by plp_conv_i32_parallel

Return:

none
none

Setup code for parallel convolution of 32-bit integer vectors.

function plp_conv_i16_parallel

void plp_conv_i16_parallel(
    const int16_t * pSrcA,
    const uint32_t srcALen,
    const int16_t * pSrcB,
    const uint32_t srcBLen,
    const uint8_t nPE,
    int32_t * pRes
)

Glue code for parallel convolution of 16-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
nPE Number of cores to compute on
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
nPE Number of cores to compute on
pRes output result returned here

Return:

none
none

function plp_conv_i16p_xpulpv2

void plp_conv_i16p_xpulpv2(
    void * task_args
)

Setup code for parallel convolution of 16-bit integer vectors.

Parameters:

task_args pointer to plp_conv_instance_i16 struct initialized by plp_conv_i16_parallel
task_args pointer to plp_conv_instance_i16 struct initialized by plp_conv_i16_parallel

Return:

none
none

Setup code for parallel convolution of 16-bit integer vectors.

function plp_conv_i8_parallel

void plp_conv_i8_parallel(
    const int8_t * pSrcA,
    const uint32_t srcALen,
    const int8_t * pSrcB,
    const uint32_t srcBLen,
    const uint8_t nPE,
    int32_t * pRes
)

Glue code for parallel convolution of 8-bit integer vectors.

Parameters:

pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
nPE Number of cores to compute on
pRes output result returned here
pSrcA points to the first input vector
srcALen Length of the first input vector
pSrcB points to the second input vector
srcBLen Length of the second input vector
nPE Number of cores to compute on
pRes output result returned here

Return:

none
none

function plp_conv_i8p_xpulpv2

void plp_conv_i8p_xpulpv2(
    void * task_args
)

Setup code for parallel convolution of 8-bit integer vectors.

Parameters:

task_args pointer to plp_conv_instance_i8 struct initialized by plp_conv_i8_parallel
task_args pointer to plp_conv_instance_i8 struct initialized by plp_conv_i8_parallel

Return:

none
none

Setup code for parallel convolution of 8-bit integer vectors.

function plp_conv_parallel_OLA

void plp_conv_parallel_OLA(
    uint32_t nPE,
    uint32_t srcALen,
    uint32_t srcBLen,
    int32_t * resultsBuffer
)

Helper function for parallelized overlap-adding of partial convolution results.

Parameters:

nPE Number of processing cores
srcALen Length of the first original input vector
srcBLen Length of the second original input vector
resultsBuffer resultsBuffer array from plp_conv_i[XX]_parallel
nPE Number of processing cores
srcALen Length of the first original input vector
srcBLen Length of the second original input vector
resultsBuffer resultsBuffer array from plp_conv_i[XX]_parallel

Return:

none
none

function plp_conv_parallel_OLA_kernel

void plp_conv_parallel_OLA_kernel(
    void * task_args
)

Helper function for parallelized overlap-adding of partial convolution results.

Parameters:

task_args Holds the plp_conv_tree_add_instance that describes the vector parameters
task_args Holds the plp_conv_tree_add_instance that describes the vector parameters

Return:

none
none

function plp_mat_mult_i32

void plp_mat_mult_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code for matrix matrix multiplication of a 32-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Glue code for matrix matrix multiplication of a 32-bit integer matrices.

function plp_mat_mult_i32s_rv32im

void plp_mat_mult_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication of a 32-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Matrix matrix multiplication of a 32-bit integer matrices for RV32IM extension.

function plp_mat_mult_i32s_xpulpv2

void plp_mat_mult_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_i16

void plp_mat_mult_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code for matrix matrix multiplication of a 16-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Glue code for matrix matrix multiplication of a 16-bit integer matrices.

function plp_mat_mult_i16s_rv32im

void plp_mat_mult_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication of a 16-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Matrix matrix multiplication of a 16-bit integer matrices for RV32IM extension.

function plp_mat_mult_i16s_xpulpv2

void plp_mat_mult_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Matrix matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_i8

void plp_mat_mult_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code for matrix matrix multiplication of a 8-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Glue code for matrix matrix multiplication of a 8-bit integer matrices.

function plp_mat_mult_i8s_rv32im

void plp_mat_mult_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication of a 8-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Matrix matrix multiplication of a 8-bit integer matrices for RV32IM extension.

function plp_mat_mult_i8s_xpulpv2

void plp_mat_mult_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here

Return: none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_i32_parallel

void plp_mat_mult_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel matrix matrix multiplication of a 32-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel matrix matrix multiplication of a 32-bit integer matrices.

function plp_mat_mult_i32p_xpulpv2

void plp_mat_mult_i32p_xpulpv2(
    void * args
)

Parallel matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_i32 struct initialized by plp_mat_mult_i32_parallel
args pointer to plp_mat_mult_instance_i32 struct initialized by plp_mat_mult_i32_parallel

Return:

none
none

Parallel matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_i16_parallel

void plp_mat_mult_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel matrix matrix multiplication of a 16-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel matrix matrix multiplication of a 16-bit integer matrices.

function plp_mat_mult_i16p_xpulpv2

void plp_mat_mult_i16p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 16-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_i16 struct initialized by plp_mat_mult_i16_parallel
args pointer to plp_mat_mult_instance_i16 struct initialized by plp_mat_mult_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator. * Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_i8_parallel

void plp_mat_mult_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel matrix matrix multiplication of a 8-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel matrix matrix multiplication of a 8-bit integer matrices.

function plp_mat_mult_f32

void plp_mat_mult_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    float *__restrict__ pDstC
)

Glue code for matrix matrix multiplication of a 32-bit floating-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Glue code for matrix matrix multiplication of a 32-bit floating-point matrices.

function plp_mat_mult_f32s_xpulpv2

void plp_mat_mult_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    float *__restrict__ pDstC
)

Matrix matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Matrix matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_mult_f32_parallel

void plp_mat_mult_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    float *__restrict__ pDstC
)

Glue code for parallel matrix matrix multiplication of a 32-bit floating-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel matrix matrix multiplication of a 32-bit floating-point matrices.

function plp_mat_mult_f32p_xpulpv2

void plp_mat_mult_f32p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_f32 struct initialized by plp_mat_mult_f32_parallel
args pointer to plp_mat_mult_instance_f32 struct initialized by plp_mat_mult_f32_parallel

Return:

none
none

function plp_mat_mult_i8p_xpulpv2

void plp_mat_mult_i8p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 8-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_i8 struct initialized by plp_mat_mult_i8_parallel

Return: none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_q32

void plp_mat_mult_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Glue code for matrix matrix multiplication of 32-bit fixed-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift). * Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Glue code for matrix matrix multiplication of 32-bit fixed-point matrices.

function plp_mat_mult_q32_parallel

void plp_mat_mult_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel matrix matrix multiplication of 32-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Glue code for parallel matrix matrix multiplication of 32-bit fix-point matrices.

function plp_mat_mult_q32s_rv32im

void plp_mat_mult_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication of 32-bit fix-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Matrix matrix multiplication of 32-bit fix-point matrices for RV32IM extension.

function plp_mat_mult_q32s_xpulpv2

void plp_mat_mult_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication of 32-bit fix-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Matrix matrix multiplication of 32-bit fix-point matrices for XPULPV2 extension.

function plp_mat_mult_q32p_xpulpv2

void plp_mat_mult_q32p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_q32 struct initialized by plp_mat_mult_q32_parallel
args pointer to plp_mat_mult_instance_q32 struct initialized by plp_mat_mult_q32_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.

function plp_mat_mult_q16

void plp_mat_mult_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Glue code for matrix matrix multiplication of 16-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Glue code for matrix matrix multiplication of 16-bit fix-point matrices.

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_q16_parallel

void plp_mat_mult_q16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int16_t *__restrict__ pDstC
)

Glue code for parallel matrix matrix multiplication of 16-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Glue code for parallel matrix matrix multiplication of 16-bit fix-point matrices.

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_q16s_rv32im

void plp_mat_mult_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Matrix matrix multiplication of 16-bit fix-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Matrix matrix multiplication of 16-bit fix-point matrices for RV32IM extension.

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_q16s_xpulpv2

void plp_mat_mult_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Matrix matrix multiplication of 16-bit fix-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Matrix matrix multiplication of 16-bit fix-point matrices for XPULPV2 extension.

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_q16p_xpulpv2

void plp_mat_mult_q16p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_q16 struct initialized by plp_mat_mult_q16_parallel
args pointer to plp_mat_mult_instance_q16 struct initialized by plp_mat_mult_q16_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_q8

void plp_mat_mult_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Glue code for matrix matrix multiplication of 8-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Glue code for matrix matrix multiplication of 8-bit fix-point matrices.

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_q8_parallel

void plp_mat_mult_q8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int8_t *__restrict__ pDstC
)

Glue code for parallel matrix matrix multiplication of 8-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Glue code for parallel matrix matrix multiplication of 8-bit fix-point matrices.

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_q8s_rv32im

void plp_mat_mult_q8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Matrix matrix multiplication of 8-bit fix-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Matrix matrix multiplication of 8-bit fix-point matrices for RV32IM extension.

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_q8s_xpulpv2

void plp_mat_mult_q8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Matrix matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Matrix matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension.

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_q8p_xpulpv2

void plp_mat_mult_q8p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_q8 struct initialized by plp_mat_mult_q8_parallel
args pointer to plp_mat_mult_instance_q8 struct initialized by plp_mat_mult_q8_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_i32

void plp_mat_mult_cmplx_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code of matrix matrix multiplication for complex 32-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_i32s_rv32im

void plp_mat_mult_cmplx_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 32-bit integers on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_i32s_xpulpv2

void plp_mat_mult_cmplx_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 32-bit integers on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_i32_parallel

void plp_mat_mult_cmplx_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel matrix matrix multiplication for complex 32-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_i32p_xpulpv2

void plp_mat_mult_cmplx_i32p_xpulpv2(
    void * args
)

Parallel matrix matrix multiplication for complex 32-bit integers on XpulpV2.

Parameters:

args pointer to plp_mat_mult_cmplx_instance_i32 struct initialized by plp_mat_mult_cmplx_i32_parallel
args pointer to plp_mat_mult_cmplx_instance_i32 struct initialized by plp_mat_mult_cmplx_i32_parallel

Return:

none
none

function plp_mat_mult_cmplx_i16

void plp_mat_mult_cmplx_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code of matrix matrix multiplication for complex 16-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_i16s_rv32im

void plp_mat_mult_cmplx_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 16-bit integers on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_i16s_xpulpv2

void plp_mat_mult_cmplx_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 16-bit integers on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_i16_parallel

void plp_mat_mult_cmplx_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel matrix matrix multiplication for complex 16-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_i16p_xpulpv2

void plp_mat_mult_cmplx_i16p_xpulpv2(
    void * args
)

Parallel matrix matrix multiplication for complex 16-bit integers on XpulpV2.

Parameters:

args pointer to plp_mat_mult_cmplx_instance_i16 struct initialized by plp_mat_mult_cmplx_i16_parallel
args pointer to plp_mat_mult_cmplx_instance_i16 struct initialized by plp_mat_mult_cmplx_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_i8

void plp_mat_mult_cmplx_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code of matrix matrix multiplication for complex 8-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none

function plp_mat_mult_cmplx_i8s_rv32im

void plp_mat_mult_cmplx_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 8-bit integers on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none

function plp_mat_mult_cmplx_i8s_xpulpv2

void plp_mat_mult_cmplx_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 8-bit integers on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_i8_parallel

void plp_mat_mult_cmplx_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel matrix matrix multiplication for complex 8-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none

function plp_mat_mult_cmplx_i8p_xpulpv2

void plp_mat_mult_cmplx_i8p_xpulpv2(
    void * args
)

Parallel matrix matrix multiplication for complex 8-bit integers on XpulpV2.

Parameters:

args Pointer to plp_mat_mult_cmplx_instance_i8 struct initialized by plp_mat_mult_cmplx_i8_parallel

Return:

none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_f32

void plp_mat_mult_cmplx_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    float *__restrict__ pDstC
)

Glue code of matrix matrix multiplication for complex 32-bit floats.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none

function plp_mat_mult_cmplx_f32s_xpulpv2

void plp_mat_mult_cmplx_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    float *__restrict__ pDstC
)

Matrix matrix multiplication for complex 32-bit floats on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
pDstC Points to the output matrix of shape MxO

Return:

none

function plp_mat_mult_cmplx_f32_parallel

void plp_mat_mult_cmplx_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    float *__restrict__ pDstC
)

Glue code of parallel matrix matrix multiplication for complex 32-bit floats.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none

function plp_mat_mult_cmplx_f32p_xpulpv2

void plp_mat_mult_cmplx_f32p_xpulpv2(
    void * args
)

Parallel matrix matrix multiplication for complex 32-bit floats on XpulpV2.

Parameters:

args Pointer to plp_mat_mult_cmplx_instance_f32 struct initialized by plp_mat_mult_cmplx_f32_parallel

Return:

none

function plp_mat_mult_cmplx_q32

void plp_mat_mult_cmplx_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Glue code of matrix matrix multiplication for complex 32-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q32s_rv32im

void plp_mat_mult_cmplx_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 32-bit fix-point on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q32s_xpulpv2

void plp_mat_mult_cmplx_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 32-bit fix-point on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q32_parallel

void plp_mat_mult_cmplx_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel matrix matrix multiplication for complex 32-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q32p_xpulpv2

void plp_mat_mult_cmplx_q32p_xpulpv2(
    void * args
)

Parallel matrix matrix multiplication for complex 32-bit fix-point on XpulpV2.

Parameters:

args Pointer to plp_mat_mult_cmplx_instance_q32 struct initialized by plp_mat_mult_cmplx_q32_parallel

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q16

void plp_mat_mult_cmplx_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Glue code of matrix matrix multiplication for complex 16-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q16s_rv32im

void plp_mat_mult_cmplx_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 16-bit fix-point on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q16s_xpulpv2

void plp_mat_mult_cmplx_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 16-bit fix-point on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_q16_parallel

void plp_mat_mult_cmplx_q16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int16_t *__restrict__ pDstC
)

Glue code of parallel matrix matrix multiplication for complex 16-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q16p_xpulpv2

void plp_mat_mult_cmplx_q16p_xpulpv2(
    void * args
)

Parallel matrix matrix multiplication for complex 16-bit fix-point on XpulpV2.

Parameters:

args Pointer to plp_mat_mult_cmplx_instance_q16 struct initialized by plp_mat_mult_cmplx_q16_parallel

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_q8

void plp_mat_mult_cmplx_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Glue code of matrix matrix multiplication for complex 8-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q8s_rv32im

void plp_mat_mult_cmplx_q8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 8-bit fix-point on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none

Par: Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q8s_xpulpv2

void plp_mat_mult_cmplx_q8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Matrix matrix multiplication for complex 8-bit fix-point on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q8_parallel

void plp_mat_mult_cmplx_q8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int8_t *__restrict__ pDstC
)

Glue code of parallel matrix matrix multiplication for complex 8-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_q8p_xpulpv2

void plp_mat_mult_cmplx_q8p_xpulpv2(
    void * args
)

parallel matrix matrix multiplication for complex 8-bit fix-point on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_instance_q8 struct initialized by plp_mat_mult_cmplx_q8_parallel
args pointer to plp_mat_mult_cmplx_instance_q8 struct initialized by plp_mat_mult_cmplx_q8_parallel

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_trans_i32

void plp_mat_mult_trans_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code for matrix transposed matrix multiplication of a 32-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Glue code for matrix transposed matrix multiplication of a 32-bit integer matrices.

function plp_mat_mult_trans_i32s_rv32im

void plp_mat_mult_trans_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix transposed matrix multiplication of a 32-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Matrix transposed matrix multiplication of a 32-bit integer matrices for RV32IM extension.

function plp_mat_mult_trans_i32s_xpulpv2

void plp_mat_mult_trans_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix transposed matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Matrix transposed matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_trans_i16

void plp_mat_mult_trans_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code for matrix transposed matrix multiplication of a 16-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Glue code for matrix transposed matrix multiplication of a 16-bit integer matrices.

function plp_mat_mult_trans_i16s_rv32im

void plp_mat_mult_trans_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix transposed matrix multiplication of a 16-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Matrix transposed matrix multiplication of a 16-bit integer matrices for RV32IM extension.

function plp_mat_mult_trans_i16s_xpulpv2

void plp_mat_mult_trans_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_trans_i8

void plp_mat_mult_trans_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code for matrix transposed matrix multiplication of a 8-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Glue code for matrix transposed matrix multiplication of a 8-bit integer matrices.

function plp_mat_mult_trans_i8s_rv32im

void plp_mat_mult_trans_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix transposed matrix multiplication of a 8-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Matrix transposed matrix multiplication of a 8-bit integer matrices for RV32IM extension.

function plp_mat_mult_trans_i8s_xpulpv2

void plp_mat_mult_trans_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_trans_i32_parallel

void plp_mat_mult_trans_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel matrix transposed matrix multiplication of a 32-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel matrix transposed matrix multiplication of a 32-bit integer matrices.

function plp_mat_mult_trans_i32p_xpulpv2

void plp_mat_mult_trans_i32p_xpulpv2(
    void * args
)

Parallel matrix transposed matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_i32 struct initialized by plp_mat_mult_trans_i32_parallel
args pointer to plp_mat_mult_instance_i32 struct initialized by plp_mat_mult_trans_i32_parallel

Return:

none
none

function plp_mat_mult_trans_i16_parallel

void plp_mat_mult_trans_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel matrix transposed matrix multiplication of a 16-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel matrix transposed matrix multiplication of a 16-bit integer matrices.

function plp_mat_mult_trans_i16p_xpulpv2

void plp_mat_mult_trans_i16p_xpulpv2(
    void * args
)

Parallel matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_i16 struct initialized by plp_mat_mult_trans_i16_parallel
args pointer to plp_mat_mult_instance_i16 struct initialized by plp_mat_mult_trans_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator. * Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_trans_i8_parallel

void plp_mat_mult_trans_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel matrix transposed matrix multiplication of a 8-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel matrix transposed matrix multiplication of a 8-bit integer matrices.

function plp_mat_mult_trans_i8p_xpulpv2

void plp_mat_mult_trans_i8p_xpulpv2(
    void * args
)

Parallel matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_i8 struct initialized by plp_mat_mult_trans_i8_parallel
args pointer to plp_mat_mult_instance_i8 struct initialized by plp_mat_mult_trans_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_trans_q32

void plp_mat_mult_trans_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Glue code for matrix transposed matrix multiplication of a 32-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift). * Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Glue code for matrix transposed matrix multiplication of a 32-bit fix-point matrices.

function plp_mat_mult_trans_q32_parallel

void plp_mat_mult_trans_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel matrix transposed matrix multiplication of a 32-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift). * Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Glue code for parallel matrix transposed matrix multiplication of a 32-bit fix-point matrices.

function plp_mat_mult_trans_q32s_rv32im

void plp_mat_mult_trans_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

matrix transposed matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift). * Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

matrix transposed matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.

function plp_mat_mult_trans_q32s_xpulpv2

void plp_mat_mult_trans_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

matrix transposed matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift). * Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

matrix transposed matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.

function plp_mat_mult_trans_q32p_xpulpv2

void plp_mat_mult_trans_q32p_xpulpv2(
    void * args
)

Parallel matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_q32 struct initialized by plp_mat_mult_trans_q32_parallel
args pointer to plp_mat_mult_instance_q32 struct initialized by plp_mat_mult_trans_q32_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.

function plp_mat_mult_trans_q16

void plp_mat_mult_trans_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Glue code for matrix transposed matrix multiplication of a 16-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift). * Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Glue code for matrix transposed matrix multiplication of a 16-bit fix-point matrices.

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_q16_parallel

void plp_mat_mult_trans_q16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int16_t *__restrict__ pDstC
)

Glue code for parallel matrix transposed matrix multiplication of a 16-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift). * Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Glue code for parallel matrix transposed matrix multiplication of a 16-bit fix-point matrices.

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_q16s_rv32im

void plp_mat_mult_trans_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Matrix transposed matrix multiplication of 16-bit fixed-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of the first matrix
N Width of the first and height of the second matrix
O Width of the second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Matrix transposed matrix multiplication of 16-bit fixed-point matrices for RV32IM extension.

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_q16s_xpulpv2

void plp_mat_mult_trans_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Matrix transposed matrix multiplication of 16-bit fixed-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of the first matrix
N Width of the first and height of the second matrix
O Width of the second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Matrix transposed matrix multiplication of 16-bit fixed-point matrices for XPULPV2 extension.

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_q16p_xpulpv2

void plp_mat_mult_trans_q16p_xpulpv2(
    void * args
)

Parallel matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_q16 struct initialized by plp_mat_mult_trans_q16_parallel
args pointer to plp_mat_mult_instance_q16 struct initialized by plp_mat_mult_trans_q16_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_q8

void plp_mat_mult_trans_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Glue code for matrix transposed matrix multiplication of 8-bit fixed-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of the first matrix
N Width of the first and height of the second matrix
O Width of the second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Glue code for matrix transposed matrix multiplication of 8-bit fixed-point matrices.

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_q8_parallel

void plp_mat_mult_trans_q8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int8_t *__restrict__ pDstC
)

Glue code for parallel matrix transposed matrix multiplication of 8-bit fixed-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of the first matrix
N Width of the first and height of the second matrix
O Width of the second matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Glue code for parallel matrix transposed matrix multiplication of 8-bit fixed-point matrices.

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_q8s_rv32im

void plp_mat_mult_trans_q8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Matrix transposed matrix multiplication of 8-bit fixed-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of the first matrix
N Width of the first and height of the second matrix
O Width of the second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Matrix transposed matrix multiplication of 8-bit fixed-point matrices for RV32IM extension.

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_q8s_xpulpv2

void plp_mat_mult_trans_q8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Matrix transposed matrix multiplication of 8-bit fixed-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of the first matrix
N Width of the first and height of the second matrix
O Width of the second matrix
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Matrix transposed matrix multiplication of 8-bit fixed-point matrices for XPULPV2 extension.

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_q8p_xpulpv2

void plp_mat_mult_trans_q8p_xpulpv2(
    void * args
)

Parallel matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_q8 struct initialized by plp_mat_mult_trans_q8_parallel
args pointer to plp_mat_mult_instance_q8 struct initialized by plp_mat_mult_trans_q8_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_f32

void plp_mat_mult_trans_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    float *__restrict__ pDstC
)

Glue code for matrix transposed matrix multiplication of 32-bit floating-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of the first matrix
N Width of the first and height of the second matrix
O Width of the second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Glue code for matrix transposed matrix multiplication of 32-bit floating-point matrices.

function plp_mat_mult_trans_f32s_xpulpv2

void plp_mat_mult_trans_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    float *__restrict__ pDstC
)

Matrix transposed matrix multiplication of 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of the first matrix
N Width of the first and height of the second matrix
O Width of the second matrix
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
pDstC points to the output matrix

Return:

none
none

Matrix transposed matrix multiplication of 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_mult_trans_f32_parallel

void plp_mat_mult_trans_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    float *__restrict__ pDstC
)

Glue code for parallel matrix transposed matrix multiplication of 32-bit floating-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of the first matrix
N Width of the first and height of the second matrix
O Width of the second matrix
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix, stored transposed in memory
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel matrix transposed matrix multiplication of 32-bit floating-point matrices.

function plp_mat_mult_trans_f32p_xpulpv2

void plp_mat_mult_trans_f32p_xpulpv2(
    void * args
)

Parallel matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_instance_f32 struct initialized by plp_mat_mult_trans_f32_parallel
args pointer to plp_mat_mult_instance_f32 struct initialized by plp_mat_mult_trans_f32_parallel

Return:

none
none

Parallel matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.

function plp_mat_mult_trans_cmplx_i32

void plp_mat_mult_trans_cmplx_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code of matrix transpose matrix multiplication for complex 32-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_i32s_rv32im

void plp_mat_mult_trans_cmplx_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 32-bit integers on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

matrix transpose matrix multiplication for complex 32-bit integers on RV32IM

function plp_mat_mult_trans_cmplx_i32s_xpulpv2

void plp_mat_mult_trans_cmplx_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2

function plp_mat_mult_trans_cmplx_i32_parallel

void plp_mat_mult_trans_cmplx_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel matrix transpose matrix multiplication for complex 32-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_i32p_xpulpv2

void plp_mat_mult_trans_cmplx_i32p_xpulpv2(
    void * args
)

parallel matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_instance_i32 struct initialized by plp_mat_mult_trans_cmplx_i32_parallel
args pointer to plp_mat_mult_cmplx_instance_i32 struct initialized by plp_mat_mult_trans_cmplx_i32_parallel

Return:

none
none

function plp_mat_mult_trans_cmplx_i16

void plp_mat_mult_trans_cmplx_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code of matrix transpose matrix multiplication for complex 16-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_i16s_rv32im

void plp_mat_mult_trans_cmplx_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 16-bit integers on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

matrix transpose matrix multiplication for complex 16-bit integers on RV32IM

function plp_mat_mult_trans_cmplx_i16s_xpulpv2

void plp_mat_mult_trans_cmplx_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par: Exploiting SIMD instructions

The 16-bit values are packed in pairs into 32-bit vectors, and the two dot products are then performed on these 32-bit vectors with a 32-bit accumulator.

matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2

function plp_mat_mult_trans_cmplx_i16_parallel

void plp_mat_mult_trans_cmplx_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel matrix transpose matrix multiplication for complex 16-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_i16p_xpulpv2

void plp_mat_mult_trans_cmplx_i16p_xpulpv2(
    void * args
)

parallel matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_instance_i16 struct initialized by plp_mat_mult_trans_cmplx_i16_parallel
args pointer to plp_mat_mult_cmplx_instance_i16 struct initialized by plp_mat_mult_trans_cmplx_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16-bit values are packed in pairs into 32-bit vectors, and the two dot products are then performed on these 32-bit vectors with a 32-bit accumulator.

Exploiting SIMD instructions

The 16-bit values are packed in pairs into 32-bit vectors, and the two dot products are then performed on these 32-bit vectors with a 32-bit accumulator.

function plp_mat_mult_trans_cmplx_i8

void plp_mat_mult_trans_cmplx_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

Glue code of matrix transpose matrix multiplication for complex 8-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_i8s_rv32im

void plp_mat_mult_trans_cmplx_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 8-bit integers on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

matrix transpose matrix multiplication for complex 8-bit integers on RV32IM

function plp_mat_mult_trans_cmplx_i8s_xpulpv2

void plp_mat_mult_trans_cmplx_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    int32_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par: Exploiting SIMD instructions

The 8-bit values are packed in groups of four into 32-bit vectors, and the four dot products are then performed on these 32-bit vectors with a 32-bit accumulator.

matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2

function plp_mat_mult_trans_cmplx_i8_parallel

void plp_mat_mult_trans_cmplx_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel matrix transpose matrix multiplication for complex 8-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_i8p_xpulpv2

void plp_mat_mult_trans_cmplx_i8p_xpulpv2(
    void * args
)

parallel matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_instance_i8 struct initialized by plp_mat_mult_trans_cmplx_i8_parallel
args pointer to plp_mat_mult_cmplx_instance_i8 struct initialized by plp_mat_mult_trans_cmplx_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8-bit values are packed in groups of four into 32-bit vectors, and the four dot products are then performed on these 32-bit vectors with a 32-bit accumulator.

Exploiting SIMD instructions

The 8-bit values are packed in groups of four into 32-bit vectors, and the four dot products are then performed on these 32-bit vectors with a 32-bit accumulator.

function plp_mat_mult_trans_cmplx_f32

void plp_mat_mult_trans_cmplx_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    float *__restrict__ pDstC
)

Glue code of matrix transpose matrix multiplication for complex 32-bit floats.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_f32s_xpulpv2

void plp_mat_mult_trans_cmplx_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    float *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
pDstC Points to the output matrix of shape MxO

Return:

none
none

matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2

function plp_mat_mult_trans_cmplx_f32_parallel

void plp_mat_mult_trans_cmplx_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t nPE,
    float *__restrict__ pDstC
)

Glue code of parallel matrix transpose matrix multiplication for complex 32-bit floats.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_f32p_xpulpv2

void plp_mat_mult_trans_cmplx_f32p_xpulpv2(
    void * args
)

parallel matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_instance_f32 struct initialized by plp_mat_mult_trans_cmplx_f32_parallel

Return:

none
none

function plp_mat_mult_trans_cmplx_q32

void plp_mat_mult_trans_cmplx_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Glue code of matrix transpose matrix multiplication for complex 32-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_q32s_rv32im

void plp_mat_mult_trans_cmplx_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM

function plp_mat_mult_trans_cmplx_q32s_xpulpv2

void plp_mat_mult_trans_cmplx_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2

function plp_mat_mult_trans_cmplx_q32_parallel

void plp_mat_mult_trans_cmplx_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel matrix transpose matrix multiplication for complex 32-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_q32p_xpulpv2

void plp_mat_mult_trans_cmplx_q32p_xpulpv2(
    void * args
)

parallel matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_instance_q32 struct initialized by plp_mat_mult_trans_cmplx_q32_parallel

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_q16

void plp_mat_mult_trans_cmplx_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Glue code of matrix transpose matrix multiplication for complex 16-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_q16s_rv32im

void plp_mat_mult_trans_cmplx_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM

function plp_mat_mult_trans_cmplx_q16s_xpulpv2

void plp_mat_mult_trans_cmplx_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with a 32 bit accumulator.

matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2

function plp_mat_mult_trans_cmplx_q16_parallel

void plp_mat_mult_trans_cmplx_q16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int16_t *__restrict__ pDstC
)

Glue code of parallel matrix transpose matrix multiplication for complex 16-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_q16p_xpulpv2

void plp_mat_mult_trans_cmplx_q16p_xpulpv2(
    void * args
)

parallel matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_instance_q16 struct initialized by plp_mat_mult_trans_cmplx_q16_parallel

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with a 32 bit accumulator.

function plp_mat_mult_trans_cmplx_q8

void plp_mat_mult_trans_cmplx_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Glue code of matrix transpose matrix multiplication for complex 8-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_q8s_rv32im

void plp_mat_mult_trans_cmplx_q8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM

function plp_mat_mult_trans_cmplx_q8s_xpulpv2

void plp_mat_mult_trans_cmplx_q8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with a 32 bit accumulator.

matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2

function plp_mat_mult_trans_cmplx_q8_parallel

void plp_mat_mult_trans_cmplx_q8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t shift,
    uint32_t nPE,
    int8_t *__restrict__ pDstC
)

Glue code of parallel matrix transpose matrix multiplication for complex 8-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_q8p_xpulpv2

void plp_mat_mult_trans_cmplx_q8p_xpulpv2(
    void * args
)

parallel matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_instance_q8 struct initialized by plp_mat_mult_trans_cmplx_q8_parallel
args pointer to plp_mat_mult_cmplx_instance_q8 struct initialized by plp_mat_mult_trans_cmplx_q8_parallel

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_cmplx_mag_f32

void plp_cmplx_mag_f32(
    const float32_t * pSrc,
    float32_t * pRes,
    uint32_t numSamples
)

Glue code for complex magnitude calculation in float32.

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_f32s_xpulpv2

void plp_cmplx_mag_f32s_xpulpv2(
    const float32_t * pSrc,
    float32_t * pRes,
    uint32_t numSamples
)

complex magnitude for float32 on XPULPV2

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_q32

void plp_cmplx_mag_q32(
    const int32_t * pSrc,
    const uint32_t fracBits,
    int32_t * pRes,
    uint32_t numSamples
)

Glue code for complex magnitude calculation for 32 bit fixpoint.

Parameters:

pSrc pointer to source
fracBits fractional bits -> Q(32-fracBits).fracBits
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
fracBits fractional bits -> Q(32-fracBits).fracBits
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_q32s_rv32im

void plp_cmplx_mag_q32s_rv32im(
    const int32_t * pSrc,
    const uint32_t fracBits,
    int32_t * pRes,
    uint32_t numSamples
)

complex magnitude for q32 on RV32IM

Parameters:

pSrc pointer to source
fracBits fractional bits -> Q(32-fracBits).fracBits
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
fracBits fractional bits -> Q(32-fracBits).fracBits
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_q32s_xpulpv2

void plp_cmplx_mag_q32s_xpulpv2(
    const int32_t * pSrc,
    const uint32_t fracBits,
    int32_t * pRes,
    uint32_t numSamples
)

complex magnitude for q32 on XPULPV2

Parameters:

pSrc pointer to source
fracBits fractional bits -> Q(32-fracBits).fracBits
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
fracBits fractional bits -> Q(32-fracBits).fracBits
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_q8

void plp_cmplx_mag_q8(
    const int8_t * pSrc,
    const uint32_t fracBits,
    int8_t * pRes,
    uint32_t numSamples
)

Glue code for complex magnitude calculation for 8 bit fixpoint.

Parameters:

pSrc pointer to source
fracBits fractional bits -> Q(8-fracBits).fracBits
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
fracBits fractional bits -> Q(8-fracBits).fracBits
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_q8s_rv32im

void plp_cmplx_mag_q8s_rv32im(
    const int8_t * pSrc,
    const uint32_t fracBits,
    int8_t * pRes,
    uint32_t numSamples
)

complex magnitude for q8 on RV32IM

Parameters:

pSrc pointer to source
fracBits fractional bits -> Q(8-fracBits).fracBits
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
fracBits fractional bits -> Q(8-fracBits).fracBits
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_q8s_xpulpv2

void plp_cmplx_mag_q8s_xpulpv2(
    const int8_t * pSrc,
    const uint32_t fracBits,
    int8_t * pRes,
    uint32_t numSamples
)

complex magnitude for q8 on XPULPV2

Parameters:

pSrc pointer to source
fracBits fractional bits -> Q(8-fracBits).fracBits
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
fracBits fractional bits -> Q(8-fracBits).fracBits
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_i16

void plp_cmplx_mag_i16(
    const int16_t * pSrc,
    int16_t * pRes,
    uint32_t numSamples
)

Glue code for complex magnitude calculation in 16-bit integer.

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_i16s_rv32im

void plp_cmplx_mag_i16s_rv32im(
    const int16_t * pSrc,
    int16_t * pRes,
    uint32_t numSamples
)

complex magnitude for i16 on RV32IM

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_i16s_xpulpv2

void plp_cmplx_mag_i16s_xpulpv2(
    const int16_t * pSrc,
    int16_t * pRes,
    uint32_t numSamples
)

complex magnitude for i16 on XPULPV2

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_i32

void plp_cmplx_mag_i32(
    const int32_t * pSrc,
    int32_t * pRes,
    uint32_t numSamples
)

Glue code for complex magnitude calculation in 32-bit integer.

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_i32s_rv32im

void plp_cmplx_mag_i32s_rv32im(
    const int32_t * pSrc,
    int32_t * pRes,
    uint32_t numSamples
)

complex magnitude for i32 on RV32IM

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_i32s_xpulpv2

void plp_cmplx_mag_i32s_xpulpv2(
    const int32_t * pSrc,
    int32_t * pRes,
    uint32_t numSamples
)

complex magnitude for i32 on XPULPV2

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_i8

void plp_cmplx_mag_i8(
    const int8_t * pSrc,
    int8_t * pRes,
    uint32_t numSamples
)

Glue code for complex magnitude calculation in 8-bit integer.

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_i8s_rv32im

void plp_cmplx_mag_i8s_rv32im(
    const int8_t * pSrc,
    int8_t * pRes,
    uint32_t numSamples
)

complex magnitude for i8 on RV32IM

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_i8s_xpulpv2

void plp_cmplx_mag_i8s_xpulpv2(
    const int8_t * pSrc,
    int8_t * pRes,
    uint32_t numSamples
)

complex magnitude for i8 on XPULPV2

Parameters:

pSrc pointer to source
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_q16

void plp_cmplx_mag_q16(
    const int16_t * pSrc,
    const uint32_t fracBits,
    int16_t * pRes,
    uint32_t numSamples
)

Glue code for complex magnitude calculation in 16-bit quantized integer.

Parameters:

pSrc pointer to source
fracBits fractional bits -> Q(16-fracBits).fracBits
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
fracBits fractional bits -> Q(16-fracBits).fracBits
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_q16s_rv32im

void plp_cmplx_mag_q16s_rv32im(
    const int16_t * pSrc,
    const uint32_t fracBits,
    int16_t * pRes,
    uint32_t numSamples
)

complex magnitude for q16 on RV32IM

Parameters:

pSrc pointer to source
fracBits fractional bits -> Q(16-fracBits).fracBits
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
fracBits fractional bits -> Q(16-fracBits).fracBits
pRes pointer to result
numSamples The number of samples

function plp_cmplx_mag_q16s_xpulpv2

void plp_cmplx_mag_q16s_xpulpv2(
    const int16_t * pSrc,
    const uint32_t fracBits,
    int16_t * pRes,
    uint32_t numSamples
)

complex magnitude for q16 on XPULPV2

Parameters:

pSrc pointer to source
fracBits fractional bits -> Q(16-fracBits).fracBits
pRes pointer to result
numSamples The number of samples
pSrc pointer to source
fracBits fractional bits -> Q(16-fracBits).fracBits
pRes pointer to result
numSamples The number of samples

function plp_bitreversal_16s_rv32im

void plp_bitreversal_16s_rv32im(
    uint16_t * pSrc,
    const uint16_t bitRevLen,
    const uint16_t * pBitRevTab
)

In-place 16 bit reversal function for RV32IM.

Parameters:

pSrc points to in-place buffer of unknown 16-bit data type
bitRevLen bit reversal table length
pBitRevTab points to bit reversal table
pSrc points to in-place buffer of unknown 16-bit data type
bitRevLen bit reversal table length
pBitRevTab points to bit reversal table

Return:

none
none

In-place 16 bit reversal function for RV32IM.

function plp_bitreversal_16s_xpulpv2

void plp_bitreversal_16s_xpulpv2(
    uint16_t * pSrc,
    const uint16_t bitRevLen,
    const uint16_t * pBitRevTab
)

In-place 16 bit reversal function for XPULPV2.

Parameters:

pSrc points to in-place buffer of unknown 16-bit data type
bitRevLen bit reversal table length
pBitRevTab points to bit reversal table
pSrc points to in-place buffer of unknown 16-bit data type
bitRevLen bit reversal table length
pBitRevTab points to bit reversal table

Return:

none
none

In-place 16 bit reversal function for XPULPV2.

function plp_bitreversal_16p_xpulpv2

void plp_bitreversal_16p_xpulpv2(
    uint16_t * pSrc,
    const uint16_t bitRevLen,
    const uint16_t * pBitRevTab,
    uint32_t nPE
)

In-place 16 bit reversal function.

Parameters:

pSrc points to in-place buffer of unknown 16-bit data type
bitRevLen bit reversal table length
pBitRevTab points to bit reversal table
nPE number of cores

Return: none

function plp_cfft_q16

void plp_cfft_q16(
    const plp_cfft_instance_q16 * S,
    int16_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag,
    uint32_t deciPoint
)

Glue code for quantized 16 bit complex fast fourier transform.

Parameters:

S points to an instance of the 16bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
deciPoint decimal point for right shift
S points to an instance of the 16bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
deciPoint decimal point for right shift

Fixed-point format (input -> output) depends on the FFT length: len=16: Q1.15 -> Q5.11; len=32: Q1.15 -> Q6.10; len=64: Q1.15 -> Q7.9; len=128: Q1.15 -> Q8.8; len=256: Q1.15 -> Q9.7; len=512: Q1.15 -> Q10.6; len=1024: Q1.15 -> Q11.5; len=2048: Q1.15 -> Q12.4; len=4096: Q1.15 -> Q13.3.

function plp_cfft_q16_parallel

void plp_cfft_q16_parallel(
    const plp_cfft_instance_q16 * S,
    int16_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag,
    uint32_t deciPoint,
    uint32_t nPE
)

Glue code for quantized 16 bit complex fast fourier transform.

Parameters:

S points to an instance of the 16bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
deciPoint decimal point for right shift
nPE Number of cores to use
S points to an instance of the 16bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
deciPoint decimal point for right shift
nPE Number of cores to use

Fixed-point format (input -> output) depends on the FFT length: len=16: Q1.15 -> Q5.11; len=32: Q1.15 -> Q6.10; len=64: Q1.15 -> Q7.9; len=128: Q1.15 -> Q8.8; len=256: Q1.15 -> Q9.7; len=512: Q1.15 -> Q10.6; len=1024: Q1.15 -> Q11.5; len=2048: Q1.15 -> Q12.4; len=4096: Q1.15 -> Q13.3.

function plp_cfft_q16s_rv32im

void plp_cfft_q16s_rv32im(
    const plp_cfft_instance_q16 * S,
    int16_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag,
    uint32_t deciPoint
)

Quantized 16 bit complex fast fourier transform for RV32IM.

Parameters:

S points to an instance of the 16bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
deciPoint decimal point for right shift

function plp_cfft_q16s_xpulpv2

void plp_cfft_q16s_xpulpv2(
    const plp_cfft_instance_q16 * S,
    int16_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag,
    uint32_t deciPoint
)

Quantized 16 bit complex fast fourier transform for XPULPV2.

Parameters:

S points to an instance of the 16bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
deciPoint decimal point for right shift

function plp_cfft_q16p_xpulpv2

void plp_cfft_q16p_xpulpv2(
    void * args
)

Parallel quantized 16 bit complex fast fourier transform for XPULPV2.

Parameters:

args points to the plp_cfft_instance_q16_parallel

function plp_bitreversal_32s_rv32im

void plp_bitreversal_32s_rv32im(
    uint32_t * pSrc,
    const uint16_t bitRevLen,
    const uint16_t * pBitRevTab
)

In-place 32 bit reversal function for RV32IM.

Parameters:

pSrc points to in-place buffer of unknown 32-bit data type
bitRevLen bit reversal table length
pBitRevTab points to bit reversal table
pSrc points to in-place buffer of unknown 32-bit data type
bitRevLen bit reversal table length
pBitRevTab points to bit reversal table

Return:

none
none

In-place 32 bit reversal function for RV32IM.

function plp_bitreversal_32s_xpulpv2

void plp_bitreversal_32s_xpulpv2(
    uint32_t * pSrc,
    const uint16_t bitRevLen,
    const uint16_t * pBitRevTab
)

In-place 32 bit reversal function for XPULPV2.

Parameters:

pSrc points to in-place buffer of unknown 32-bit data type
bitRevLen bit reversal table length
pBitRevTab points to bit reversal table

Return: none

function plp_bitreversal_32p_xpulpv2

void plp_bitreversal_32p_xpulpv2(
    uint32_t * pSrc,
    const uint16_t bitRevLen,
    const uint16_t * pBitRevTab,
    uint32_t nPE
)

In-place 32 bit reversal function for XPULPV2.

Parameters:

pSrc points to in-place buffer of unknown 32-bit data type
bitRevLen bit reversal table length
pBitRevTab points to bit reversal table
nPE number of cores

Return: none

function plp_cfft_q32

void plp_cfft_q32(
    const plp_cfft_instance_q32 * S,
    int32_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag,
    uint32_t fracBits
)

Glue code for quantized 32-bit complex fast fourier transform.

Parameters:

S points to an instance of the 32bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
S points to an instance of the 32bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)

Fixed-point format (input -> output) depends on the FFT length: len=16: Q1.31 -> Q5.27; len=32: Q1.31 -> Q6.26; len=64: Q1.31 -> Q7.25; len=128: Q1.31 -> Q8.24; len=256: Q1.31 -> Q9.23; len=512: Q1.31 -> Q10.22; len=1024: Q1.31 -> Q11.21; len=2048: Q1.31 -> Q12.20; len=4096: Q1.31 -> Q13.19.

function plp_cfft_q32_parallel

void plp_cfft_q32_parallel(
    const plp_cfft_instance_q32 * S,
    int32_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag,
    uint32_t fracBits,
    uint32_t nPE
)

Quantized 32-bit complex fast fourier transform for XPULPV2.

Parameters:

S points to an instance of the 32bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
nPE Number of cores to use
S points to an instance of the 32bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
nPE Number of cores to use

function plp_cfft_q32s_rv32im

void plp_cfft_q32s_rv32im(
    const plp_cfft_instance_q32 * S,
    int32_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag,
    uint32_t fracBits
)

Quantized 32-bit complex fast fourier transform for RV32IM.

Parameters:

S points to an instance of the 32bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
S points to an instance of the 32bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)

function plp_cfft_q32s_xpulpv2

void plp_cfft_q32s_xpulpv2(
    const plp_cfft_instance_q32 * S,
    int32_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag,
    uint32_t fracBits
)

Quantized 32-bit complex fast fourier transform for XPULPV2.

Parameters:

S points to an instance of the 32bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
S points to an instance of the 32bit quantized CFFT structure
p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)

function plp_cfft_q32p_xpulpv2

void plp_cfft_q32p_xpulpv2(
    void * args
)

Parallel quantized 32 bit complex fast fourier transform for XPULPV2.

Parameters:

args points to the plp_cfft_instance_q32_parallel

function plp_rfft_f32

void plp_rfft_f32(
    const plp_fft_instance_f32 * S,
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst
)

Floating-point FFT on real input data.

Parameters:

S points to an instance of the floating-point FFT structure
pSrc points to the input buffer (real data)
pDst points to the output buffer (complex data)

Return: none

function plp_rfft_f32_parallel

void plp_rfft_f32_parallel(
    const plp_fft_instance_f32 * S,
    const float32_t *__restrict__ pSrc,
    const uint32_t nPE,
    float32_t *__restrict__ pDst
)

Floating-point FFT on real input data (parallel version).

Parameters:

S points to an instance of the floating-point FFT structure
pSrc points to the input buffer (real data)
nPE number of parallel processing units
pDst points to the output buffer (complex data)

Return: none

function plp_rfft_f32s_xpulpv2

void plp_rfft_f32s_xpulpv2(
    const plp_fft_instance_f32 * S,
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst
)

Floating-point FFT on real input data for XPULPV2 extension.

Parameters:

S points to an instance of the floating-point FFT structure
pSrc points to the input buffer (real data)
pDst points to the output buffer (complex data)
S points to an instance of the floating-point FFT structure
pSrc points to the input buffer (real data)
pDst points to the output buffer (complex data)

Return:

none
none

function plp_rfft_f32p_xpulpv2

void plp_rfft_f32p_xpulpv2(
    void * arg
)

Floating-point FFT on real input data for XPULPV2 extension (parallel version).

Parameters:

arg points to an instance of the floating-point FFT structure
arg points to an instance of the floating-point FFT structure

Return:

none
none

Floating-point FFT on real input data for XPULPV2 extension (parallel version).

function plp_rfftfast_f32

void plp_rfftfast_f32(
    const plp_fft_fast_instance_f32 * S,
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst
)

Floating-point FFT on real input data.

Parameters:

S points to an instance of the floating-point FFT structure
pSrc points to the input buffer (real data)
pDst points to the output buffer (complex data)

Return: none

function plp_rfftfast_f32_parallel

void plp_rfftfast_f32_parallel(
    const plp_fft_fast_instance_f32 * S,
    float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    const uint32_t nPE
)

Floating-point parallel FFT on real input data.

Parameters:

S points to an instance of the floating-point FFT structure
pSrc points to the input buffer (real data)
pDst points to the output buffer (complex data)
S points to an instance of the floating-point FFT structure
pSrc points to the input buffer (real data)
pDst points to the output buffer (complex data)
nPE number of parallel processing units

Return:

none
none

Floating-point parallel FFT on real input data.

function plp_rfftfast_f32s_xpulpv2

void plp_rfftfast_f32s_xpulpv2(
    const plp_fft_fast_instance_f32 * S,
    float32_t * pSrc,
    float32_t * pDst
)

Floating-point FFT on real input data for XPULPV2 extension.

Parameters:

S points to an instance of the floating-point FFT structure
pSrc points to the input buffer (real data)
pDst points to the output buffer (complex data)

Return: none

function plp_rfftfast_f32p_xpulpv2

void plp_rfftfast_f32p_xpulpv2(
    void * arg
)

Floating-point parallel FFT on real input data for XPULPV2 extension.

Parameters:

arg points to an instance of the floating-point FFT structure

Return: none

function plp_cfft_f32

void plp_cfft_f32(
    const plp_cfft_instance_f32 * S,
    float32_t * pSrc,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag
)

Floating-point FFT on complex input data.

Parameters:

S points to an instance of the floating-point FFT structure
pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.

Return: none

function plp_cfft_f32_parallel

void plp_cfft_f32_parallel(
    const plp_cfft_instance_f32 * S,
    const float32_t * pSrc,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag,
    const uint32_t nPE
)

Floating-point FFT on complex input data (parallel version).

Parameters:

S points to an instance of the floating-point FFT structure
pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
nPE number of parallel processing units

Return: none

function plp_cfft_f32s_xpulpv2

void plp_cfft_f32s_xpulpv2(
    const plp_cfft_instance_f32 * S,
    const float32_t * pSrc,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag
)

Floating-point FFT on complex input data for XPULPV2 extension.

Parameters:

S points to an instance of the floating-point FFT structure
pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.

Return: none

function plp_cfft_f32p_xpulpv2

void plp_cfft_f32p_xpulpv2(
    void * arg
)

Floating-point FFT on complex input data for XPULPV2 extension (parallel version).

Parameters:

arg points to an instance of the floating-point FFT structure
arg points to an instance of the floating-point FFT structure

Return:

none
none

Floating-point FFT on complex input data for XPULPV2 extension (parallel version).

function plp_dct2_f32

void plp_dct2_f32(
    const plp_fft_instance_f32 * S,
    const Complex_type_f32 * pShift,
    const uint8_t orthoNorm,
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pBuf,
    float32_t *__restrict__ pDst
)

Floating-point DCT on real input data. Implementation of John Makhoul's "A Fast Cosine Transform in One and Two Dimensions" 1980 IEEE paper.

Parameters:

S points to an instance of the floating-point FFT structure with FFTLength = DCTLength
pShift points to twiddle coefficient table of 4*FFTLength, of which only the first quarter is necessary.
pSrc points to the input buffer (real data) of size FFTLength
pBuf points to buffer of size 2*FFTLength, used for computation.
pDst points to output buffer (real data) of size FFTLength, may be the same as pSrc.
S points to an instance of the floating-point FFT structure with FFTLength = DCTLength
pShift points to twiddle coefficient table of 4*FFTLength, of which only the first quadrant of the complex unit circle is used. For example, if S contains twiddleCoef_rfft_32, pShift can be set to twiddleCoef_rfft_128.
orthoNorm whether to use DCT orthonormalisation or not
pSrc points to the input buffer (real data) of size FFTLength.
pBuf points to buffer of size 2*FFTLength, used for computation.
pDst points to output buffer (real data) of size FFTLength, may be the same as pSrc.

Return:

none
none

function plp_dct2_f32_parallel

void plp_dct2_f32_parallel(
    const plp_fft_instance_f32 * S,
    const Complex_type_f32 * pShift,
    const uint8_t orthoNorm,
    const float32_t *__restrict__ pSrc,
    const uint32_t nPE,
    float32_t *__restrict__ pBuf,
    float32_t *__restrict__ pDst
)

Floating-point DCT on real input data. Implementation of John Makhoul's "A Fast Cosine Transform in One and Two Dimensions" 1980 IEEE paper.

Parameters:

S points to an instance of the floating-point FFT structure with FFTLength = DCTLength
pShift points to twiddle coefficient table of 4*FFTLength, of which only the first quarter is necessary.
pSrc points to the input buffer (real data) of size FFTLength
nPE number of parallel processing units
pBuf points to buffer of size 2*FFTLength, used for computation.
pDst points to output buffer (real data) of size FFTLength, may be the same as pSrc.
S points to an instance of the floating-point FFT structure with FFTLength = DCTLength
pShift points to twiddle coefficient table of 4*FFTLength, of which only the first quadrant of the complex unit circle is used. For example, if S contains twiddleCoef_rfft_32, pShift can be set to twiddleCoef_rfft_128.
orthoNorm whether to use DCT orthonormalisation or not
pSrc points to the input buffer (real data) of size FFTLength.
pBuf points to buffer of size 2*FFTLength, used for computation.
pDst points to output buffer (real data) of size FFTLength, may be the same as pSrc.

Return:

none
none

Floating-point DCT on real input data. Implementation of John Makhoul's "A Fast Cosine Transform in One and Two Dimensions" 1980 IEEE paper.

function plp_mfcc_f32

void plp_mfcc_f32(
    const plp_fft_instance_f32 * SFFT,
    const plp_fft_instance_f32 * SDCT,
    const Complex_type_f32 * pShift,
    const plp_triangular_filter_f32 * filterBank,
    const float32_t * window,
    const uint8_t * orthoNorm,
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst
)

MFCC on real input data.

Parameters:

SFFT points to an instance of the floating-point FFT structure for the initial FFT (with FFTLength = n_fft). bitReverseFlag should be on.
SDCT points to an instance of the floating-point FFT structure for the DCT (with FFTLength = n_mels). bitReverseFlag should be on.
pShift points to twiddle coefficient table with FFTLength = 4*n_mels. Only first quarter necessary.
filterBank points to plp_triangular_filter_f32 instance with nFilters = n_mels.
window vector to use for windowing
orthoNorm whether to use dct orthonormalisation or not
pSrc points to the input buffer (real data, size n_fft)
pDst points to the output buffer of length at least 3*n_fft. pSrc and pDst must not overlap, the calculation can not be done in place. MFCCs are returned in the first n_mels spots.

Return: none

function plp_mfcc_f32_parallel

void plp_mfcc_f32_parallel(
    const plp_fft_instance_f32 * SFFT,
    const plp_fft_instance_f32 * SDCT,
    const Complex_type_f32 * pShift,
    const plp_triangular_filter_f32 * filterBank,
    const float32_t * window,
    const uint8_t * orthoNorm,
    const float32_t *__restrict__ pSrc,
    const uint32_t nPE,
    float32_t *__restrict__ pDst
)

MFCC on real input data.

Parameters:

SFFT points to an instance of the floating-point FFT structure for the initial FFT (with FFTLength = n_fft). bitReverseFlag should be on.
SDCT points to an instance of the floating-point FFT structure for the DCT (with FFTLength = n_mels). bitReverseFlag should be on.
pShift points to twiddle coefficient table with FFTLength = 4*n_mels. Only first quarter necessary.
filterBank points to plp_triangular_filter_f32 instance with nFilters = n_mels.
window vector to use for windowing
orthoNorm whether to use dct orthonormalisation or not
pSrc points to the input buffer (real data, size n_fft)
nPE number of parallel processing units
pDst points to the output buffer of length at least 3*n_fft. pSrc and pDst must not overlap, the calculation can not be done in place. MFCCs are returned in the first n_mels spots.

Return: none

function plp_dwt_f32

void plp_dwt_f32(
    const float32_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_f32 wavelet,
    plp_dwt_extension_mode mode,
    float32_t *__restrict__ pDstA,
    float32_t *__restrict__ pDstD
)

Floating-point DWT on real input data for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none
none

Floating-point DWT on real input data for XPULPV2 extension.

function plp_dwt_q32

void plp_dwt_q32(
    const int32_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q32 wavelet,
    plp_dwt_extension_mode mode,
    int32_t *__restrict__ pDstA,
    int32_t *__restrict__ pDstD
)

32bit Fixed-point DWT for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (q32)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

32bit Fixed-point DWT for XPULPV2 extension.

function plp_dwt_q16

void plp_dwt_q16(
    const int16_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q16 wavelet,
    plp_dwt_extension_mode mode,
    int16_t *__restrict__ pDstA,
    int16_t *__restrict__ pDstD
)

16bit Fixed-point DWT for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (q16)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

16bit Fixed-point DWT for XPULPV2 extension.

function plp_dwt_q8

void plp_dwt_q8(
    const int8_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q8 wavelet,
    plp_dwt_extension_mode mode,
    int8_t *__restrict__ pDstA,
    int8_t *__restrict__ pDstD
)

8bit Fixed-point DWT for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (q8)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

8bit Fixed-point DWT for XPULPV2 extension.

function plp_dwt_dec_f32

void plp_dwt_dec_f32(
    const float32_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_f32 wavelet,
    plp_dwt_extension_mode mode,
    uint32_t level,
    float32_t *__restrict__ pTmp,
    float32_t *__restrict__ pDst
)

Floating-point n-level DWT for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
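level Levels of Wavelet decomposition
pTmp points to a temporary working buffer (presumably scratch space for intermediate results; required size is not documented here)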
pDst points to output buffer with detail coefficients and final approximation coefficients

Return: none

function plp_dwt_dec_f32_parallel

void plp_dwt_dec_f32_parallel(
    const float32_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_f32 wavelet,
    plp_dwt_extension_mode mode,
    uint32_t level,
    uint32_t nPE,
    float32_t *__restrict__ pTemp,
    float32_t *__restrict__ pDst
)

Floating-point parallel n-level DWT for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
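level Levels of Wavelet decomposition
nPE Number of cores to use
pTemp points to a temporary working buffer (presumably scratch space for intermediate results; required size is not documented here)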
pDst points to output buffer with detail coefficients and final approximation coefficients

Return: none

function plp_dwt_f32s_xpulpv2

void plp_dwt_f32s_xpulpv2(
    const float32_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_f32 wavelet,
    plp_dwt_extension_mode mode,
    float32_t *__restrict__ pDstA,
    float32_t *__restrict__ pDstD
)

Floating-point DWT on real input data for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return: none

function plp_dwt_haar_f32s_xpulpv2

void plp_dwt_haar_f32s_xpulpv2(
    const float32_t *__restrict__ pSrc,
    uint32_t length,
    plp_dwt_extension_mode mode,
    float32_t *__restrict__ pDstA,
    float32_t *__restrict__ pDstD
)

Floating-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return: none

function plp_dwt_q32s_xpulpv2

void plp_dwt_q32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q32 wavelet,
    plp_dwt_extension_mode mode,
    int32_t *__restrict__ pDstA,
    int32_t *__restrict__ pDstD
)

32bit Fixed-point DWT for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

32bit Fixed-point DWT for XPULPV2 extension.

function plp_dwt_haar_q32s_xpulpv2

void plp_dwt_haar_q32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t length,
    plp_dwt_extension_mode mode,
    int32_t *__restrict__ pDstA,
    int32_t *__restrict__ pDstD
)

32bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return: none

function plp_dwt_q16s_xpulpv2

void plp_dwt_q16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q16 wavelet,
    plp_dwt_extension_mode mode,
    int16_t *__restrict__ pDstA,
    int16_t *__restrict__ pDstD
)

16bit Fixed-point DWT for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (q15)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

16bit Fixed-point DWT for XPULPV2 extension.

function plp_dwt_haar_q16s_xpulpv2

void plp_dwt_haar_q16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t length,
    plp_dwt_extension_mode mode,
    int16_t *__restrict__ pDstA,
    int16_t *__restrict__ pDstD
)

16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (q15)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.

function plp_dwt_q8s_xpulpv2

void plp_dwt_q8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q8 wavelet,
    plp_dwt_extension_mode mode,
    int8_t *__restrict__ pDstA,
    int8_t *__restrict__ pDstD
)

8bit Fixed-point DWT for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (q7)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

8bit Fixed-point DWT for XPULPV2 extension.

function plp_dwt_haar_q8s_xpulpv2

void plp_dwt_haar_q8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t length,
    plp_dwt_extension_mode mode,
    int8_t *__restrict__ pDstA,
    int8_t *__restrict__ pDstD
)

8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (q7)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.

function plp_dwt_f32_parallel

void plp_dwt_f32_parallel(
    const float32_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_f32 wavelet,
    plp_dwt_extension_mode mode,
    uint32_t nPE,
    float32_t *__restrict__ pDstA,
    float32_t *__restrict__ pDstD
)

Parallel Floating-point DWT on real input data for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
nPE Number of cores to use
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return: none

function plp_dwt_q8_parallel

void plp_dwt_q8_parallel(
    const int8_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q8 wavelet,
    plp_dwt_extension_mode mode,
    uint32_t nPE,
    int8_t *__restrict__ pDstA,
    int8_t *__restrict__ pDstD
)

8bit Parallel Fixed-point DWT on real input data for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (q8)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
nPE Number of cores to use
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return: none

function plp_dwt_q16_parallel

void plp_dwt_q16_parallel(
    const int16_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q16 wavelet,
    plp_dwt_extension_mode mode,
    uint32_t nPE,
    int16_t *__restrict__ pDstA,
    int16_t *__restrict__ pDstD
)

16bit Parallel Fixed-point DWT on real input data for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (q16)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
nPE Number of cores to use
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return: none

function plp_dwt_q32_parallel

void plp_dwt_q32_parallel(
    const int32_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q32 wavelet,
    plp_dwt_extension_mode mode,
    uint32_t nPE,
    int32_t *__restrict__ pDstA,
    int32_t *__restrict__ pDstD
)

32bit Parallel Fixed-point DWT on real input data for XPULPV2 extension.

Parameters:

pSrc points to the input buffer (q32)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
nPE Number of cores to use
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return: none

function plp_dwt_f32p_xpulpv2

void plp_dwt_f32p_xpulpv2(
    void * args
)

Floating-point DWT on real input data for XPULPV2 extension.

Parameters:

args points to the plp_dwt_instance_f32

Return: none

function plp_dwt_haar_f32p_xpulpv2

void plp_dwt_haar_f32p_xpulpv2(
    void * args
)

Floating-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.

Parameters:

args points to the plp_dwt_instance_f32

Return: none

function plp_dwt_q8p_xpulpv2

void plp_dwt_q8p_xpulpv2(
    void * args
)

Q7 fixed-point DWT for XPULPV2 extension.

Parameters:

args points to the plp_dwt_instance_q8

Return: none

function plp_dwt_haar_q8p_xpulpv2

void plp_dwt_haar_q8p_xpulpv2(
    void * args
)

Q7 fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension.

Parameters:

args points to the plp_dwt_instance_q8

Return: none

function plp_dwt_q16p_xpulpv2

void plp_dwt_q16p_xpulpv2(
    void * args
)

Q15 fixed-point DWT for XPULPV2 extension.

Parameters:

args points to the plp_dwt_instance_q16

Return: none

function plp_dwt_haar_q16p_xpulpv2

void plp_dwt_haar_q16p_xpulpv2(
    void * args
)

Q15 fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension.

Parameters:

args points to the plp_dwt_instance_q16

Return: none

function plp_dwt_q32p_xpulpv2

void plp_dwt_q32p_xpulpv2(
    void * arg
)

Q31 fixed-point DWT on real input data for XPULPV2 extension.

Parameters:

arg points to the plp_dwt_instance_q32

Return: none

function plp_dwt_haar_q32p_xpulpv2

void plp_dwt_haar_q32p_xpulpv2(
    void * args
)

Q31 Fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension.

Parameters:

args points to the plp_dwt_instance_q32

Return: none

function plp_dwt_q32s_rv32im

void plp_dwt_q32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q32 wavelet,
    plp_dwt_extension_mode mode,
    int32_t *__restrict__ pDstA,
    int32_t *__restrict__ pDstD
)

32bit Fixed-point DWT.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

32bit Fixed-point DWT.

function plp_dwt_haar_q32s_rv32im

void plp_dwt_haar_q32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t length,
    plp_dwt_extension_mode mode,
    int32_t *__restrict__ pDstA,
    int32_t *__restrict__ pDstD
)

32bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return: none

function plp_dwt_q16s_rv32im

void plp_dwt_q16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q16 wavelet,
    plp_dwt_extension_mode mode,
    int16_t *__restrict__ pDstA,
    int16_t *__restrict__ pDstD
)

16bit Fixed-point DWT.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

16bit Fixed-point DWT.

function plp_dwt_haar_q16s_rv32im

void plp_dwt_haar_q16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t length,
    plp_dwt_extension_mode mode,
    int16_t *__restrict__ pDstA,
    int16_t *__restrict__ pDstD
)

16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (real data)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.

function plp_dwt_q8s_rv32im

void plp_dwt_q8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t length,
    const plp_dwt_wavelet_q8 wavelet,
    plp_dwt_extension_mode mode,
    int8_t *__restrict__ pDstA,
    int8_t *__restrict__ pDstD
)

8bit Fixed-point DWT.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (real data)
length length of input buffer
wavelet wavelet structure for calculating DWT
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

8bit Fixed-point DWT.

function plp_dwt_haar_q8s_rv32im

void plp_dwt_haar_q8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t length,
    plp_dwt_extension_mode mode,
    int8_t *__restrict__ pDstA,
    int8_t *__restrict__ pDstD
)

8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.

Parameters:

pSrc points to the input buffer (real data)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients
pSrc points to the input buffer (real data)
length length of input buffer
mode boundary extension mode
pDstA points to output buffer with approximation coefficients
pDstD points to output buffer with detail coefficients

Return:

none
none

8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.

function plp_mat_add_i32

void plp_mat_add_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int32_t *__restrict__ pDst
)

Glue code for matrix addition of 32-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return: none

function plp_mat_add_i32s_rv32im

void plp_mat_add_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int32_t *__restrict__ pDst
)

Matrix addition of 32-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Matrix addition of 32-bit integer matrices for RV32IM extension.

function plp_mat_add_i32s_xpulpv2

void plp_mat_add_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int32_t *__restrict__ pDst
)

Matrix addition of 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Matrix addition of 32-bit integer matrices for XPULPV2 extension.

function plp_mat_add_i32_parallel

void plp_mat_add_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for parallel matrix addition of 32-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix addition of 32-bit integer matrices.

function plp_mat_add_i32p_xpulpv2

void plp_mat_add_i32p_xpulpv2(
    void * args
)

Parallel matrix addition of 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_add_instance_i32 struct initialized by plp_mat_add_i32_parallel
args pointer to plp_mat_add_instance_i32 struct initialized by plp_mat_add_i32_parallel

Return:

none
none

Parallel matrix addition of 32-bit integer matrices for XPULPV2 extension.

function plp_mat_add_i16

void plp_mat_add_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int16_t *__restrict__ pDst
)

Glue code for matrix addition of 16-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Glue code for matrix addition of 16-bit integer matrices.

function plp_mat_add_i16s_rv32im

void plp_mat_add_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int16_t *__restrict__ pDst
)

Matrix addition of 16-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Matrix addition of 16-bit integer matrices for RV32IM extension.

function plp_mat_add_i16s_xpulpv2

void plp_mat_add_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int16_t *__restrict__ pDst
)

Matrix addition of 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors, so that two additions are performed at once on each 32 bit vector.

Matrix addition of 16-bit integer matrices for XPULPV2 extension.

function plp_mat_add_i16_parallel

void plp_mat_add_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for parallel matrix addition of 16-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix addition of 16-bit integer matrices.

function plp_mat_add_i16p_xpulpv2

void plp_mat_add_i16p_xpulpv2(
    void * args
)

Parallel matrix addition of 16-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_add_instance_i16 struct initialized by plp_mat_add_i16_parallel
args pointer to plp_mat_add_instance_i16 struct initialized by plp_mat_add_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors, so that two additions are performed at once on each 32 bit vector.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors, so that two additions are performed at once on each 32 bit vector.

function plp_mat_add_i8

void plp_mat_add_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int8_t *__restrict__ pDst
)

Glue code for matrix addition of 8-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Glue code for matrix addition of 8-bit integer matrices.

function plp_mat_add_i8s_rv32im

void plp_mat_add_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int8_t *__restrict__ pDst
)

Matrix addition of 8-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Matrix addition of 8-bit integer matrices for RV32IM extension.

function plp_mat_add_i8s_xpulpv2

void plp_mat_add_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int8_t *__restrict__ pDst
)

Matrix addition of 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors, so that four additions are performed at once on each 32 bit vector.

Matrix addition of 8-bit integer matrices for XPULPV2 extension.

function plp_mat_add_i8_parallel

void plp_mat_add_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for parallel matrix addition of 8-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix addition of 8-bit integer matrices.

function plp_mat_add_i8p_xpulpv2

void plp_mat_add_i8p_xpulpv2(
    void * args
)

Parallel matrix addition of 8-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_add_instance_i8 struct initialized by plp_mat_add_i8_parallel
args pointer to plp_mat_add_instance_i8 struct initialized by plp_mat_add_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors, so that four additions are performed at once on each 32 bit vector.

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors, so that four additions are performed at once on each 32 bit vector.

function plp_mat_add_f32

void plp_mat_add_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    float *__restrict__ pDst
)

Glue code for matrix addition of 32-bit floating-point matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Glue code for matrix addition of 32-bit floating-point matrices.

function plp_mat_add_f32s_xpulpv2

void plp_mat_add_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    float *__restrict__ pDst
)

matrix addition of a 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

matrix addition of a 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_add_f32_parallel

void plp_mat_add_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code for parallel matrix addition of a 32-bit floating-point matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix addition of a 32-bit floating-point matrices.

function plp_mat_add_f32p_xpulpv2

void plp_mat_add_f32p_xpulpv2(
    void * args
)

Parallel matrix addition of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_add_instance_f32 struct initialized by plp_mat_add_f32_parallel
args pointer to plp_mat_add_instance_f32 struct initialized by plp_mat_add_f32_parallel

Return:

none
none

function plp_mat_sub_i32

void plp_mat_sub_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int32_t *__restrict__ pDst
)

Glue code for matrix subtraction of a 32-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Glue code for matrix subtraction of a 32-bit integer matrices.

function plp_mat_sub_i32s_rv32im

void plp_mat_sub_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int32_t *__restrict__ pDst
)

matrix subtraction of a 32-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

matrix subtraction of a 32-bit integer matrices for RV32IM extension.

function plp_mat_sub_i32s_xpulpv2

void plp_mat_sub_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int32_t *__restrict__ pDst
)

matrix subtraction of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

matrix subtraction of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_sub_i32_parallel

void plp_mat_sub_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for parallel matrix subtraction of a 32-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix subtraction of a 32-bit integer matrices.

function plp_mat_sub_i32p_xpulpv2

void plp_mat_sub_i32p_xpulpv2(
    void * args
)

Parallel matrix subtraction of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_sub_instance_i32 struct initialized by plp_mat_sub_i32_parallel
args pointer to plp_mat_sub_instance_i32 struct initialized by plp_mat_sub_i32_parallel

Return:

none
none

Parallel matrix subtraction of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_sub_i16

void plp_mat_sub_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int16_t *__restrict__ pDst
)

Glue code for matrix subtraction of a 16-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Glue code for matrix subtraction of a 16-bit integer matrices.

function plp_mat_sub_i16s_rv32im

void plp_mat_sub_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int16_t *__restrict__ pDst
)

matrix subtraction of a 16-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

matrix subtraction of a 16-bit integer matrices for RV32IM extension.

function plp_mat_sub_i16s_xpulpv2

void plp_mat_sub_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int16_t *__restrict__ pDst
)

matrix subtraction of a 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

matrix subtraction of a 16-bit integer matrices for XPULPV2 extension.

function plp_mat_sub_i16_parallel

void plp_mat_sub_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for parallel matrix subtraction of a 16-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix subtraction of a 16-bit integer matrices.

function plp_mat_sub_i16p_xpulpv2

void plp_mat_sub_i16p_xpulpv2(
    void * args
)

Parallel matrix subtraction of 16-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_sub_instance_i16 struct initialized by plp_mat_sub_i16_parallel
args pointer to plp_mat_sub_instance_i16 struct initialized by plp_mat_sub_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_sub_i8

void plp_mat_sub_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int8_t *__restrict__ pDst
)

Glue code for matrix subtraction of a 8-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Glue code for matrix subtraction of a 8-bit integer matrices.

function plp_mat_sub_i8s_rv32im

void plp_mat_sub_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int8_t *__restrict__ pDst
)

matrix subtraction of a 8-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

matrix subtraction of a 8-bit integer matrices for RV32IM extension.

function plp_mat_sub_i8s_xpulpv2

void plp_mat_sub_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    int8_t *__restrict__ pDst
)

matrix subtraction of a 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

matrix subtraction of a 8-bit integer matrices for XPULPV2 extension.

function plp_mat_sub_i8_parallel

void plp_mat_sub_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for parallel matrix subtraction of a 8-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix subtraction of a 8-bit integer matrices.

function plp_mat_sub_i8p_xpulpv2

void plp_mat_sub_i8p_xpulpv2(
    void * args
)

Parallel matrix subtraction of 8-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_sub_instance_i8 struct initialized by plp_mat_sub_i8_parallel
args pointer to plp_mat_sub_instance_i8 struct initialized by plp_mat_sub_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_sub_f32

void plp_mat_sub_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    float *__restrict__ pDst
)

Glue code for matrix subtraction of a 32-bit floating-point matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

Glue code for matrix subtraction of a 32-bit floating-point matrices.

function plp_mat_sub_f32s_xpulpv2

void plp_mat_sub_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    float *__restrict__ pDst
)

matrix subtraction of a 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
pDst Points to the output matrix

Return:

none
none

matrix subtraction of a 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_sub_f32_parallel

void plp_mat_sub_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code for parallel matrix subtraction of a 32-bit floating-point matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix subtraction of a 32-bit floating-point matrices.

function plp_mat_sub_f32p_xpulpv2

void plp_mat_sub_f32p_xpulpv2(
    void * args
)

Parallel matrix subtraction of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_sub_instance_f32 struct initialized by plp_mat_sub_f32_parallel
args pointer to plp_mat_sub_instance_f32 struct initialized by plp_mat_sub_f32_parallel

Return:

none
none

function plp_mat_scale_i32

void plp_mat_scale_i32(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int32_t scaleFactor,
    int32_t shift,
    int32_t *__restrict__ pDst
)

Glue code for matrix scale of a 32-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Glue code for matrix scale of a 32-bit integer matrices.

function plp_mat_scale_i32s_rv32im

void plp_mat_scale_i32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int32_t scaleFactor,
    int32_t shift,
    int32_t *__restrict__ pDst
)

matrix scale of a 32-bit integer matrices for RV32IM extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

matrix scale of a 32-bit integer matrices for RV32IM extension.

function plp_mat_scale_i32s_xpulpv2

void plp_mat_scale_i32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int32_t scaleFactor,
    int32_t shift,
    int32_t *__restrict__ pDst
)

matrix scale of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

matrix scale of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_scale_i32_parallel

void plp_mat_scale_i32_parallel(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int32_t scaleFactor,
    int32_t shift,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for parallel matrix scale of a 32-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
nPE Number of cores to use for computation
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
nPE Number of cores to use for computation
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix scale of a 32-bit integer matrices.

function plp_mat_scale_i32p_xpulpv2

void plp_mat_scale_i32p_xpulpv2(
    void * args
)

Parallel matrix scale of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_scale_instance_i32 struct initialized by plp_mat_scale_i32_parallel
args pointer to plp_mat_scale_instance_i32 struct initialized by plp_mat_scale_i32_parallel

Return:

none
none

Parallel matrix scale of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_scale_i16

void plp_mat_scale_i16(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int16_t scaleFactor,
    int32_t shift,
    int16_t *__restrict__ pDst
)

Glue code for matrix scale of a 16-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Glue code for matrix scale of a 16-bit integer matrices.

function plp_mat_scale_i16s_rv32im

void plp_mat_scale_i16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int16_t scaleFactor,
    int32_t shift,
    int16_t *__restrict__ pDst
)

matrix scale of a 16-bit integer matrices for RV32IM extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

matrix scale of a 16-bit integer matrices for RV32IM extension.

function plp_mat_scale_i16s_xpulpv2

void plp_mat_scale_i16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int16_t scaleFactor,
    int32_t shift,
    int16_t *__restrict__ pDst
)

matrix scale of a 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

matrix scale of a 16-bit integer matrices for XPULPV2 extension.

function plp_mat_scale_i16_parallel

void plp_mat_scale_i16_parallel(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int16_t scaleFactor,
    int32_t shift,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for parallel matrix scale of a 16-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
nPE Number of cores to use for computation
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix scale of a 16-bit integer matrices.

function plp_mat_scale_i16p_xpulpv2

void plp_mat_scale_i16p_xpulpv2(
    void * args
)

Parallel matrix scale of 16-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_scale_instance_i16 struct initialized by plp_mat_scale_i16_parallel
args pointer to plp_mat_scale_instance_i16 struct initialized by plp_mat_scale_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_scale_i8

void plp_mat_scale_i8(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int8_t scaleFactor,
    int32_t shift,
    int8_t *__restrict__ pDst
)

Glue code for matrix scale of a 8-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Glue code for matrix scale of a 8-bit integer matrices.

function plp_mat_scale_i8s_rv32im

void plp_mat_scale_i8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int8_t scaleFactor,
    int32_t shift,
    int8_t *__restrict__ pDst
)

matrix scale of a 8-bit integer matrices for RV32IM extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

matrix scale of a 8-bit integer matrices for RV32IM extension.

function plp_mat_scale_i8s_xpulpv2

void plp_mat_scale_i8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int8_t scaleFactor,
    int32_t shift,
    int8_t *__restrict__ pDst
)

matrix scale of a 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

matrix scale of a 8-bit integer matrices for XPULPV2 extension.

function plp_mat_scale_i8_parallel

void plp_mat_scale_i8_parallel(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int8_t scaleFactor,
    int32_t shift,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for parallel matrix scale of a 8-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
nPE Number of cores to use for computation
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix scale of a 8-bit integer matrices.

function plp_mat_scale_i8p_xpulpv2

void plp_mat_scale_i8p_xpulpv2(
    void * args
)

Parallel matrix scale of 8-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_scale_instance_i8 struct initialized by plp_mat_scale_i8_parallel
args pointer to plp_mat_scale_instance_i8 struct initialized by plp_mat_scale_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_scale_f32

void plp_mat_scale_f32(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    float scaleFactor,
    float *__restrict__ pDst
)

Glue code for matrix scale of a 32-bit floating-point matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements
pDst Points to the output matrix

Return:

none
none

Glue code for matrix scale of a 32-bit floating-point matrices.

function plp_mat_scale_f32s_xpulpv2

void plp_mat_scale_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    float scaleFactor,
    float *__restrict__ pDst
)

matrix scale of a 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements
pDst Points to the output matrix

Return:

none
none

matrix scale of a 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_scale_f32_parallel

void plp_mat_scale_f32_parallel(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    float scaleFactor,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code for parallel matrix scale of a 32-bit floating-point matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements
nPE Number of cores to use for computation
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
scaleFactor Factor to multiply all elements
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix scale of a 32-bit floating-point matrices.

function plp_mat_scale_f32p_xpulpv2

void plp_mat_scale_f32p_xpulpv2(
    void * args
)

Parallel matrix scale of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_scale_instance_f32 struct initialized by plp_mat_scale_f32_parallel
args pointer to plp_mat_scale_instance_f32 struct initialized by plp_mat_scale_f32_parallel

Return:

none
none

function plp_mat_trans_i32

void plp_mat_trans_i32(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int32_t *__restrict__ pDst
)

Glue code for matrix transpose of a 32-bit integer matrices.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM

Return:

none
none

Glue code for matrix transpose of a 32-bit integer matrices.

function plp_mat_trans_i32s_rv32im

void plp_mat_trans_i32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int32_t *__restrict__ pDst
)

matrix transpose of a 32-bit integer matrices for RV32IM extension.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM

Return:

none
none

matrix transpose of a 32-bit integer matrices for RV32IM extension.

function plp_mat_trans_i32s_xpulpv2

void plp_mat_trans_i32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int32_t *__restrict__ pDst
)

matrix transpose of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM

Return:

none
none

matrix transpose of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_trans_i32_parallel

void plp_mat_trans_i32_parallel(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for parallel matrix transpose of a 32-bit integer matrices.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxM

Return: none

Glue code for parallel matrix transpose of a 32-bit integer matrices.

function plp_mat_trans_i32p_xpulpv2

void plp_mat_trans_i32p_xpulpv2(
    void * args
)

Parallel matrix transpose of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_trans_instance_i32 struct initialized by plp_mat_trans_i32_parallel
args pointer to plp_mat_trans_instance_i32 struct initialized by plp_mat_trans_i32_parallel

Return:

none
none

Parallel matrix transpose of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_trans_i16

void plp_mat_trans_i16(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int16_t *__restrict__ pDst
)

Glue code for matrix transpose of a 16-bit integer matrices.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM

Return:

none
none

Glue code for matrix transpose of a 16-bit integer matrices.

function plp_mat_trans_i16s_rv32im

void plp_mat_trans_i16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int16_t *__restrict__ pDst
)

matrix transpose of a 16-bit integer matrices for RV32IM extension.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM

Return:

none
none

matrix transpose of a 16-bit integer matrices for RV32IM extension.

function plp_mat_trans_i16s_xpulpv2

void plp_mat_trans_i16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int16_t *__restrict__ pDst
)

matrix transpose of a 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

matrix transpose of a 16-bit integer matrices for XPULPV2 extension.

function plp_mat_trans_i16_parallel

void plp_mat_trans_i16_parallel(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for parallel matrix transpose of a 16-bit integer matrices.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxM

Return:

none
none

Glue code for parallel matrix transpose of a 16-bit integer matrices.

function plp_mat_trans_i16p_xpulpv2

void plp_mat_trans_i16p_xpulpv2(
    void * args
)

Parallel matrix transpose of 16-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_trans_instance_i16 struct initialized by plp_mat_trans_i16_parallel
args pointer to plp_mat_trans_instance_i16 struct initialized by plp_mat_trans_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_trans_i8

void plp_mat_trans_i8(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int8_t *__restrict__ pDst
)

Glue code for matrix transpose of 8-bit integer matrices.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM

Return:

none
none

Glue code for matrix transpose of 8-bit integer matrices.

function plp_mat_trans_i8s_rv32im

void plp_mat_trans_i8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int8_t *__restrict__ pDst
)

Matrix transpose of 8-bit integer matrices for RV32IM extension.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM

Return:

none
none

Matrix transpose of 8-bit integer matrices for RV32IM extension.

function plp_mat_trans_i8s_xpulpv2

void plp_mat_trans_i8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    int8_t *__restrict__ pDst
)

Matrix transpose of 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Matrix transpose of 8-bit integer matrices for XPULPV2 extension.

function plp_mat_trans_i8_parallel

void plp_mat_trans_i8_parallel(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for parallel matrix transpose of 8-bit integer matrices.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxM

Return:

none
none

Glue code for parallel matrix transpose of 8-bit integer matrices.

function plp_mat_trans_i8p_xpulpv2

void plp_mat_trans_i8p_xpulpv2(
    void * args
)

Parallel matrix transpose of 8-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_trans_instance_i8 struct initialized by plp_mat_trans_i8_parallel
args pointer to plp_mat_trans_instance_i8 struct initialized by plp_mat_trans_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_trans_f32

void plp_mat_trans_f32(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    float *__restrict__ pDst
)

Glue code for matrix transpose of 32-bit floating-point matrices.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
pDst Points to the output matrix of shape NxM

Return:

none
none

Par:

This function will use plp_mat_trans_i32s_xpulpv2 for its computation.
This function will use plp_mat_trans_i32s_xpulpv2 for its computation.

Glue code for matrix transpose of 32-bit floating-point matrices.

function plp_mat_trans_f32_parallel

void plp_mat_trans_f32_parallel(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code for parallel matrix transpose of 32-bit floating-point matrices.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxM
pSrc Points to the input matrix of shape MxN
M Height of the input matrix and width of the output matrix
N Width of the input matrix and height of the output matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxM

Return:

none
none

Par:

This function will use plp_mat_trans_i32p_xpulpv2 for its computation.
This function will use plp_mat_trans_i32p_xpulpv2 for its computation.

Glue code for parallel matrix transpose of 32-bit floating-point matrices.

function plp_mat_inv_f32

int plp_mat_inv_f32(
    float *__restrict__ pSrc,
    float *__restrict__ pDst,
    uint32_t N
)

Glue code for matrix inverse of a 32-bit floating-point matrices.

Parameters:

pSrc Points to the input matrix. pSrc is modified by this function
N Width and height of both matrices
pDst Points to the output matrix
pSrc Points to the input matrix. pSrc is modified by this function
N Width and height of both matrices
pDst Points to the output matrix

Return:

0: Success, 1: Matrix is singular, 2: operation not supported

Par: This function will use plp_mat_inv_f32s_xpulpv2 for its computation.

Glue code for matrix inverse of a 32-bit floating-point matrices.

function plp_mat_inv_f32s_xpulpv2

int plp_mat_inv_f32s_xpulpv2(
    float *__restrict__ pSrc,
    float *__restrict__ pDst,
    uint32_t N
)

matrix inverse of a 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix. pSrc is modified by this function
N Width and height of both matrices
pDst Points to the output matrix
pSrc Points to the input matrix. pSrc is modified by this function
N Width and height of both matrices
pDst Points to the output matrix

Return:

0: Success, 1: Matrix is singular
0: Success, 1: Matrix is singular

matrix inverse of a 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_inv_f32_parallel

int plp_mat_inv_f32_parallel(
    float *__restrict__ pSrc,
    float *__restrict__ pDst,
    uint32_t N,
    uint32_t nPE
)

Glue code for parallel matrix inverse of a 32-bit floating-point matrices.

Parameters:

pSrc Points to the input matrix. pSrc is modified by this function
pDst Points to the output matrix
N Width and height of both matrices
nPE Number of cores to use for computation
pSrc Points to the input matrix, pSrc is modified by this function
N Width and height of both matrices
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

0: Success, 1: Matrix is singular, 2: operation not supported
0: Success, 1: Matrix is singular, 2: operation not supported

Glue code for parallel matrix inverse of a 32-bit floating-point matrices.

Warning: This function is not yet implemented in parallel, and it will call the single-core implementation!

function plp_mat_inv_f32p_xpulpv2

int plp_mat_inv_f32p_xpulpv2(
    void * args
)

Parallel matrix inverse of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_inv_instance_f32 struct initialized by plp_mat_inv_f32_parallel
args pointer to plp_mat_inv_instance_f32 struct initialized by plp_mat_inv_f32_parallel

Return:

0: Success, 1: Matrix is singular
0: Success, 1: Matrix is singular

Parallel matrix inverse of 32-bit floating-point matrices kernel for XPULPV2 extension.

Warning: Not yet implemented

function plp_mat_fill_I_i32

void plp_mat_fill_I_i32(
    uint32_t N,
    int32_t *__restrict__ pDst
)

Glue code for creating a 32-bit integer identity matrix.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_i32s_rv32im

void plp_mat_fill_I_i32s_rv32im(
    uint32_t N,
    int32_t *__restrict__ pDst
)

Create a 32-bit integer identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_i32s_xpulpv2

void plp_mat_fill_I_i32s_xpulpv2(
    uint32_t N,
    int32_t *__restrict__ pDst
)

Create a 32-bit integer identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_i32_parallel

void plp_mat_fill_I_i32_parallel(
    uint32_t N,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for creating a 32-bit integer identity matrix in parallel.

Parameters:

N Width and height of the matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_i32p_xpulpv2

void plp_mat_fill_I_i32p_xpulpv2(
    void * args
)

Create a 32-bit integer identity matrix in parallel on XpulpV2.

Parameters:

args pointer to plp_mat_fill_I_instance_i32 struct initialized by plp_mat_fill_I_i32_parallel
args pointer to plp_mat_fill_I_instance_i32 struct initialized by plp_mat_fill_I_i32_parallel

Return:

none
none

function plp_mat_fill_I_i16

void plp_mat_fill_I_i16(
    uint32_t N,
    int16_t *__restrict__ pDst
)

Glue code for creating a 16-bit integer identity matrix.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_i16s_rv32im

void plp_mat_fill_I_i16s_rv32im(
    uint32_t N,
    int16_t *__restrict__ pDst
)

Create a 16-bit integer identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_i16s_xpulpv2

void plp_mat_fill_I_i16s_xpulpv2(
    uint32_t N,
    int16_t *__restrict__ pDst
)

Create a 16-bit integer identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_i16_parallel

void plp_mat_fill_I_i16_parallel(
    uint32_t N,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for creating a 16-bit integer identity matrix in parallel.

Parameters:

N Width and height of the matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_i16p_xpulpv2

void plp_mat_fill_I_i16p_xpulpv2(
    void * args
)

Create a 16-bit integer identity matrix in parallel on XpulpV2.

Parameters:

args pointer to plp_mat_fill_I_instance_i16 struct initialized by plp_mat_fill_I_i16_parallel
args pointer to plp_mat_fill_I_instance_i16 struct initialized by plp_mat_fill_I_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_i8

void plp_mat_fill_I_i8(
    uint32_t N,
    int8_t *__restrict__ pDst
)

Glue code for creating an 8-bit integer identity matrix.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_i8s_rv32im

void plp_mat_fill_I_i8s_rv32im(
    uint32_t N,
    int8_t *__restrict__ pDst
)

Create an 8-bit integer identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_i8s_xpulpv2

void plp_mat_fill_I_i8s_xpulpv2(
    uint32_t N,
    int8_t *__restrict__ pDst
)

Create an 8-bit integer identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_i8_parallel

void plp_mat_fill_I_i8_parallel(
    uint32_t N,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for creating an 8-bit integer identity matrix in parallel.

Parameters:

N Width and height of the matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_i8p_xpulpv2

void plp_mat_fill_I_i8p_xpulpv2(
    void * args
)

Create an 8-bit integer identity matrix in parallel on XpulpV2.

Parameters:

args pointer to plp_mat_fill_I_instance_i8 struct initialized by plp_mat_fill_I_i8_parallel
args pointer to plp_mat_fill_I_instance_i8 struct initialized by plp_mat_fill_I_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_f32

void plp_mat_fill_I_f32(
    uint32_t N,
    float *__restrict__ pDst
)

Glue code for creating a 32-bit float identity matrix.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_f32s_xpulpv2

void plp_mat_fill_I_f32s_xpulpv2(
    uint32_t N,
    float *__restrict__ pDst
)

Create a 32-bit float identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_f32_parallel

void plp_mat_fill_I_f32_parallel(
    uint32_t N,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code for creating a 32-bit float identity matrix in parallel.

Parameters:

N Width and height of the matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_f32p_xpulpv2

void plp_mat_fill_I_f32p_xpulpv2(
    void * args
)

Create a 32-bit float identity matrix in parallel on XpulpV2.

Parameters:

args pointer to plp_mat_fill_I_instance_f32 struct initialized by plp_mat_fill_I_f32_parallel
args pointer to plp_mat_fill_I_instance_f32 struct initialized by plp_mat_fill_I_f32_parallel

Return:

none
none

function plp_mat_fill_I_q32

void plp_mat_fill_I_q32(
    uint32_t N,
    int32_t fracBits,
    int32_t *__restrict__ pDst
)

Glue code for creating a 32-bit fixed-point identity matrix.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_q32s_rv32im

void plp_mat_fill_I_q32s_rv32im(
    uint32_t N,
    int32_t fracBits,
    int32_t *__restrict__ pDst
)

Create a 32-bit fixed-point identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_q32s_xpulpv2

void plp_mat_fill_I_q32s_xpulpv2(
    uint32_t N,
    int32_t fracBits,
    int32_t *__restrict__ pDst
)

Create a 32-bit fixed-point identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_q32_parallel

void plp_mat_fill_I_q32_parallel(
    uint32_t N,
    int32_t fracBits,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for creating a 32-bit fixed-point identity matrix in parallel.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_q32p_xpulpv2

void plp_mat_fill_I_q32p_xpulpv2(
    void * args
)

Create a 32-bit fixed-point identity matrix in parallel on XpulpV2.

Parameters:

args pointer to plp_mat_fill_I_instance_q32 struct initialized by plp_mat_fill_I_q32_parallel
args pointer to plp_mat_fill_I_instance_q32 struct initialized by plp_mat_fill_I_q32_parallel

Return:

none
none

function plp_mat_fill_I_q16

void plp_mat_fill_I_q16(
    uint32_t N,
    int32_t fracBits,
    int16_t *__restrict__ pDst
)

Glue code for creating a 16-bit fixed-point identity matrix.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_q16s_rv32im

void plp_mat_fill_I_q16s_rv32im(
    uint32_t N,
    int32_t fracBits,
    int16_t *__restrict__ pDst
)

Create a 16-bit fixed-point identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_q16s_xpulpv2

void plp_mat_fill_I_q16s_xpulpv2(
    uint32_t N,
    int32_t fracBits,
    int16_t *__restrict__ pDst
)

Create a 16-bit fixed-point identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_q16_parallel

void plp_mat_fill_I_q16_parallel(
    uint32_t N,
    int32_t fracBits,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for creating a 16-bit fixed-point identity matrix in parallel.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_q16p_xpulpv2

void plp_mat_fill_I_q16p_xpulpv2(
    void * args
)

Create a 16-bit fixed-point identity matrix in parallel on XpulpV2.

Parameters:

args pointer to plp_mat_fill_I_instance_q16 struct initialized by plp_mat_fill_I_q16_parallel
args pointer to plp_mat_fill_I_instance_q16 struct initialized by plp_mat_fill_I_q16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_q8

void plp_mat_fill_I_q8(
    uint32_t N,
    int32_t fracBits,
    int8_t *__restrict__ pDst
)

Glue code for creating an 8-bit fixed-point identity matrix.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_q8s_rv32im

void plp_mat_fill_I_q8s_rv32im(
    uint32_t N,
    int32_t fracBits,
    int8_t *__restrict__ pDst
)

Create an 8-bit fixed-point identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_q8s_xpulpv2

void plp_mat_fill_I_q8s_xpulpv2(
    uint32_t N,
    int32_t fracBits,
    int8_t *__restrict__ pDst
)

Create an 8-bit fixed-point identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
pDst Points to the output matrix of shape NxN

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_q8_parallel

void plp_mat_fill_I_q8_parallel(
    uint32_t N,
    int32_t fracBits,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for creating an 8-bit fixed-point identity matrix in parallel.

Parameters:

N Width and height of the matrix
fracBits decimal point for the appropriate scale
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN
N Width and height of the matrix
fracBits Decimal point for the appropriate scale
nPE Number of cores to use for computation
pDst Points to the output matrix of shape NxN

Return:

none
none

function plp_mat_fill_I_q8p_xpulpv2

void plp_mat_fill_I_q8p_xpulpv2(
    void * args
)

Create an 8-bit fixed-point identity matrix in parallel on XpulpV2.

Parameters:

args pointer to plp_mat_fill_I_instance_q8 struct initialized by plp_mat_fill_I_q8_parallel
args pointer to plp_mat_fill_I_instance_q8 struct initialized by plp_mat_fill_I_q8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_stride_i32

void plp_mat_mult_stride_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code for strided matrix matrix multiplication of a 32-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Glue code for strided matrix matrix multiplication of a 32-bit integer matrices.

function plp_mat_mult_stride_i32s_rv32im

void plp_mat_mult_stride_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 32-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

strided matrix matrix multiplication of a 32-bit integer matrices for RV32IM extension.

function plp_mat_mult_stride_i32s_xpulpv2

void plp_mat_mult_stride_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_stride_i16

void plp_mat_mult_stride_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code for strided matrix matrix multiplication of a 16-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Glue code for strided matrix matrix multiplication of a 16-bit integer matrices.

function plp_mat_mult_stride_i16s_rv32im

void plp_mat_mult_stride_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 16-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

strided matrix matrix multiplication of a 16-bit integer matrices for RV32IM extension.

function plp_mat_mult_stride_i16s_xpulpv2

void plp_mat_mult_stride_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

strided matrix matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_stride_i8

void plp_mat_mult_stride_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code for strided matrix matrix multiplication of a 8-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Glue code for strided matrix matrix multiplication of a 8-bit integer matrices.

function plp_mat_mult_stride_i8s_rv32im

void plp_mat_mult_stride_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 8-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

strided matrix matrix multiplication of a 8-bit integer matrices for RV32IM extension.

function plp_mat_mult_stride_i8s_xpulpv2

void plp_mat_mult_stride_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

strided matrix matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_stride_i32_parallel

void plp_mat_mult_stride_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel strided matrix matrix multiplication of a 32-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel strided matrix matrix multiplication of a 32-bit integer matrices.

function plp_mat_mult_stride_i32p_xpulpv2

void plp_mat_mult_stride_i32p_xpulpv2(
    void * args
)

Parallel strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_i32 struct initialized by plp_mat_mult_stride_i32_parallel
args pointer to plp_mat_mult_stride_instance_i32 struct initialized by plp_mat_mult_stride_i32_parallel

Return:

none
none

Parallel strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_stride_i16_parallel

void plp_mat_mult_stride_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel strided matrix matrix multiplication of a 16-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel strided matrix matrix multiplication of a 16-bit integer matrices.

function plp_mat_mult_stride_i16p_xpulpv2

void plp_mat_mult_stride_i16p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 16-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_i16 struct initialized by plp_mat_mult_stride_i16_parallel
args pointer to plp_mat_mult_stride_instance_i16 struct initialized by plp_mat_mult_stride_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Parallel matrix multiplication of 16-bit integer matrices kernel for XPULPV2 extension.

function plp_mat_mult_stride_i8_parallel

void plp_mat_mult_stride_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel strided matrix matrix multiplication of a 8-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel strided matrix matrix multiplication of a 8-bit integer matrices.

function plp_mat_mult_stride_f32

void plp_mat_mult_stride_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    float *__restrict__ pDstC
)

Glue code for strided matrix matrix multiplication of a 32-bit floating-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Glue code for strided matrix matrix multiplication of a 32-bit floating-point matrices.

function plp_mat_mult_stride_f32s_xpulpv2

void plp_mat_mult_stride_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    float *__restrict__ pDstC
)

strided matrix matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

strided matrix matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_mult_stride_f32_parallel

void plp_mat_mult_stride_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    float *__restrict__ pDstC
)

Glue code for parallel strided matrix matrix multiplication of a 32-bit floating-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel strided matrix matrix multiplication of a 32-bit floating-point matrices.

function plp_mat_mult_stride_f32p_xpulpv2

void plp_mat_mult_stride_f32p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_f32 struct initialized by plp_mat_mult_stride_f32_parallel
args pointer to plp_mat_mult_stride_instance_f32 struct initialized by plp_mat_mult_stride_f32_parallel

Return:

none
none

Parallel matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.

function plp_mat_mult_stride_i8p_xpulpv2

void plp_mat_mult_stride_i8p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 8-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_i8 struct initialized by plp_mat_mult_stride_i8_parallel
args pointer to plp_mat_mult_stride_instance_i8 struct initialized by plp_mat_mult_stride_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Parallel matrix multiplication of 8-bit integer matrices kernel for XPULPV2 extension.

function plp_mat_mult_stride_q32

void plp_mat_mult_stride_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Glue code for strided matrix matrix multiplication of a 32-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Glue code for strided matrix matrix multiplication of a 32-bit fix-point matrices.

function plp_mat_mult_stride_q32_parallel

void plp_mat_mult_stride_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel strided matrix matrix multiplication of a 32-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Glue code for parallel strided matrix matrix multiplication of a 32-bit fix-point matrices.

function plp_mat_mult_stride_q32s_rv32im

void plp_mat_mult_stride_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

strided matrix matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.

function plp_mat_mult_stride_q32s_xpulpv2

void plp_mat_mult_stride_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

strided matrix matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.

function plp_mat_mult_stride_q32p_xpulpv2

void plp_mat_mult_stride_q32p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_q32 struct initialized by plp_mat_mult_stride_q32_parallel
args pointer to plp_mat_mult_stride_instance_q32 struct initialized by plp_mat_mult_stride_q32_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.

function plp_mat_mult_stride_q16

void plp_mat_mult_stride_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Glue code for strided matrix matrix multiplication of a 16-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Glue code for strided matrix matrix multiplication of a 16-bit fix-point matrices.

The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_stride_q16_parallel

void plp_mat_mult_stride_q16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int16_t *__restrict__ pDstC
)

Glue code for parallel strided matrix matrix multiplication of a 16-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Glue code for parallel strided matrix matrix multiplication of 16-bit fix-point matrices.

The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_stride_q16s_rv32im

void plp_mat_mult_stride_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

strided matrix matrix multiplication of 16-bit fix-point matrices for RV32IM extension.

The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_stride_q16s_xpulpv2

void plp_mat_mult_stride_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

strided matrix matrix multiplication of 16-bit fix-point matrices for XPULPV2 extension.

The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_stride_q16p_xpulpv2

void plp_mat_mult_stride_q16p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_q16 struct initialized by plp_mat_mult_stride_q16_parallel
args pointer to plp_mat_mult_stride_instance_q16 struct initialized by plp_mat_mult_stride_q16_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.

The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_stride_q8

void plp_mat_mult_stride_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Glue code for strided matrix matrix multiplication of a 8-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Glue code for strided matrix matrix multiplication of 8-bit fix-point matrices.

The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_stride_q8_parallel

void plp_mat_mult_stride_q8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int8_t *__restrict__ pDstC
)

Glue code for parallel strided matrix matrix multiplication of a 8-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Glue code for parallel strided matrix matrix multiplication of 8-bit fix-point matrices.

The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_stride_q8s_rv32im

void plp_mat_mult_stride_q8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

strided matrix matrix multiplication of 8-bit fix-point matrices for RV32IM extension.

The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_stride_q8s_xpulpv2

void plp_mat_mult_stride_q8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

strided matrix matrix multiplication of a 8-bit fix-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

strided matrix matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension.

The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_stride_q8p_xpulpv2

void plp_mat_mult_stride_q8p_xpulpv2(
    void * args
)

Parallel matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_q8 struct initialized by plp_mat_mult_stride_q8_parallel
args pointer to plp_mat_mult_stride_instance_q8 struct initialized by plp_mat_mult_stride_q8_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.

The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_i32

void plp_mat_mult_trans_stride_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code for strided matrix transposed matrix multiplication of a 32-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Glue code for strided matrix transposed matrix multiplication of a 32-bit integer matrices.

function plp_mat_mult_trans_stride_i32s_rv32im

void plp_mat_mult_trans_stride_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 32-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

strided matrix transposed matrix multiplication of a 32-bit integer matrices for RV32IM extension.

function plp_mat_mult_trans_stride_i32s_xpulpv2

void plp_mat_mult_trans_stride_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

strided matrix transposed matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_trans_stride_i16

void plp_mat_mult_trans_stride_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code for strided matrix transposed matrix multiplication of a 16-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Glue code for strided matrix transposed matrix multiplication of a 16-bit integer matrices.

function plp_mat_mult_trans_stride_i16s_rv32im

void plp_mat_mult_trans_stride_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 16-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

strided matrix transposed matrix multiplication of a 16-bit integer matrices for RV32IM extension.

function plp_mat_mult_trans_stride_i16s_xpulpv2

void plp_mat_mult_trans_stride_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

strided matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_trans_stride_i8

void plp_mat_mult_trans_stride_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code for strided matrix transposed matrix multiplication of a 8-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Glue code for strided matrix transposed matrix multiplication of a 8-bit integer matrices.

function plp_mat_mult_trans_stride_i8s_rv32im

void plp_mat_mult_trans_stride_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 8-bit integer matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

strided matrix transposed matrix multiplication of a 8-bit integer matrices for RV32IM extension.

function plp_mat_mult_trans_stride_i8s_xpulpv2

void plp_mat_mult_trans_stride_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

strided matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_trans_stride_i32_parallel

void plp_mat_mult_trans_stride_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel strided matrix transposed matrix multiplication of 32-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel strided matrix transposed matrix multiplication of 32-bit integer matrices.

function plp_mat_mult_trans_stride_i32p_xpulpv2

void plp_mat_mult_trans_stride_i32p_xpulpv2(
    void * args
)

Parallel strided matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_i32 struct initialized by plp_mat_mult_trans_stride_i32_parallel
args pointer to plp_mat_mult_stride_instance_i32 struct initialized by plp_mat_mult_trans_stride_i32_parallel

Return:

none
none

Parallel strided matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_trans_stride_i16_parallel

void plp_mat_mult_trans_stride_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel strided matrix transposed matrix multiplication of a 16-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel strided matrix transposed matrix multiplication of a 16-bit integer matrices.

function plp_mat_mult_trans_stride_i16p_xpulpv2

void plp_mat_mult_trans_stride_i16p_xpulpv2(
    void * args
)

Parallel strided matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_i16 struct initialized by plp_mat_mult_trans_stride_i16_parallel
args pointer to plp_mat_mult_stride_instance_i16 struct initialized by plp_mat_mult_trans_stride_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Parallel strided matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_trans_stride_i8_parallel

void plp_mat_mult_trans_stride_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel strided matrix transposed matrix multiplication of a 8-bit integer matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel strided matrix transposed matrix multiplication of a 8-bit integer matrices.

function plp_mat_mult_trans_stride_i8p_xpulpv2

void plp_mat_mult_trans_stride_i8p_xpulpv2(
    void * args
)

Parallel strided matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_i8 struct initialized by plp_mat_mult_trans_stride_i8_parallel
args pointer to plp_mat_mult_stride_instance_i8 struct initialized by plp_mat_mult_trans_stride_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Parallel strided matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.

function plp_mat_mult_trans_stride_q32

void plp_mat_mult_trans_stride_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Glue code for strided matrix transposed matrix multiplication of a 32-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Glue code for strided matrix transposed matrix multiplication of a 32-bit fix-point matrices.

function plp_mat_mult_trans_stride_q32_parallel

void plp_mat_mult_trans_stride_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code for parallel strided matrix transposed matrix multiplication of a 32-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Glue code for parallel strided matrix transposed matrix multiplication of a 32-bit fix-point matrices.

function plp_mat_mult_trans_stride_q32s_rv32im

void plp_mat_mult_trans_stride_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

strided matrix transposed matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.

function plp_mat_mult_trans_stride_q32s_xpulpv2

void plp_mat_mult_trans_stride_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

strided matrix transposed matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.

function plp_mat_mult_trans_stride_q32p_xpulpv2

void plp_mat_mult_trans_stride_q32p_xpulpv2(
    void * args
)

Parallel strided matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_q32 struct initialized by plp_mat_mult_trans_stride_q32_parallel
args pointer to plp_mat_mult_stride_instance_q32 struct initialized by plp_mat_mult_trans_stride_q32_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel strided matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.

function plp_mat_mult_trans_stride_q16

void plp_mat_mult_trans_stride_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Glue code for strided matrix transposed matrix multiplication of a 16-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Glue code for strided matrix transposed matrix multiplication of a 16-bit fix-point matrices.

The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_q16_parallel

void plp_mat_mult_trans_stride_q16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int16_t *__restrict__ pDstC
)

Glue code for parallel strided matrix transposed matrix multiplication of a 16-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

Glue code for parallel strided matrix transposed matrix multiplication of a 16-bit fix-point matrices.

The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_q16s_rv32im

void plp_mat_mult_trans_stride_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

strided matrix transposed matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.

The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_q16s_xpulpv2

void plp_mat_mult_trans_stride_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

strided matrix transposed matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.

The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_q16p_xpulpv2

void plp_mat_mult_trans_stride_q16p_xpulpv2(
    void * args
)

Parallel strided matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_q16 struct initialized by plp_mat_mult_trans_stride_q16_parallel
args pointer to plp_mat_mult_stride_instance_q16 struct initialized by plp_mat_mult_trans_stride_q16_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel strided matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.

The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_q8

void plp_mat_mult_trans_stride_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Glue code for strided matrix transposed matrix multiplication of a 8-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Glue code for strided matrix transposed matrix multiplication of a 8-bit fix-point matrices.

The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_q8_parallel

void plp_mat_mult_trans_stride_q8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int8_t *__restrict__ pDstC
)

Glue code for parallel strided matrix transposed matrix multiplication of a 8-bit fix-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Glue code for parallel strided matrix transposed matrix multiplication of a 8-bit fix-point matrices.

The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_q8s_rv32im

void plp_mat_mult_trans_stride_q8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

strided matrix transposed matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

strided matrix transposed matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.

The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_q8s_xpulpv2

void plp_mat_mult_trans_stride_q8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Strided matrix transposed matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
shift Amount to shift the result of each multiplication.
pDstC points to the output matrix

Return:

none
none

Par:

Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift). * Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

Strided matrix transposed matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension.

The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_q8p_xpulpv2

void plp_mat_mult_trans_stride_q8p_xpulpv2(
    void * args
)

Parallel strided matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_q8 struct initialized by plp_mat_mult_trans_stride_q8_parallel
args pointer to plp_mat_mult_stride_instance_q8 struct initialized by plp_mat_mult_trans_stride_q8_parallel

Return:

none
none

Par: Fix-Point and Shifting

The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).

Parallel strided matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.

The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_stride_f32

void plp_mat_mult_trans_stride_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    float *__restrict__ pDstC
)

Glue code for strided matrix transposed matrix multiplication of 32-bit floating-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Glue code for strided matrix transposed matrix multiplication of 32-bit floating-point matrices.

function plp_mat_mult_trans_stride_f32s_xpulpv2

void plp_mat_mult_trans_stride_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    float *__restrict__ pDstC
)

Strided matrix transposed matrix multiplication of 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
pDstC points to the output matrix

Return:

none
none

Strided matrix transposed matrix multiplication of 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_mult_trans_stride_f32_parallel

void plp_mat_mult_trans_stride_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    float *__restrict__ pDstC
)

Glue code for parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices.

Parameters:

pSrcA points to the first input matrix
pSrcB points to the second input matrix
M Height of first matrix
N Width of first and height of second matrix
O Width of second matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC Output is written here
pSrcA points to the first input matrix
pSrcB points to the second input matrix
M height of the first input matrix
N width of the first input matrix and height of the second
O width of the second input matrix
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideC Stride of output matrix (elements between each row)
nPE Number of cores to use
pDstC points to the output matrix

Return:

none
none

Glue code for parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices.

function plp_mat_mult_trans_stride_f32p_xpulpv2

void plp_mat_mult_trans_stride_f32p_xpulpv2(
    void * args
)

Parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_mult_stride_instance_f32 struct initialized by plp_mat_mult_trans_stride_f32_parallel
args pointer to plp_mat_mult_stride_instance_f32 struct initialized by plp_mat_mult_trans_stride_f32_parallel

Return:

none
none

Parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.

function plp_mat_mult_cmplx_stride_i32

void plp_mat_mult_cmplx_stride_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code of strided matrix matrix multiplication for complex 32-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_i32s_rv32im

void plp_mat_mult_cmplx_stride_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 32-bit integers on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_i32s_xpulpv2

void plp_mat_mult_cmplx_stride_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 32-bit integers on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_i32_parallel

void plp_mat_mult_cmplx_stride_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel strided matrix matrix multiplication for complex 32-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_i32p_xpulpv2

void plp_mat_mult_cmplx_stride_i32p_xpulpv2(
    void * args
)

parallel strided matrix matrix multiplication for complex 32-bit integers on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_i32 struct initialized by plp_mat_mult_cmplx_stride_i32_parallel
args pointer to plp_mat_mult_cmplx_stride_instance_i32 struct initialized by plp_mat_mult_cmplx_stride_i32_parallel

Return:

none
none

function plp_mat_mult_cmplx_stride_i16

void plp_mat_mult_cmplx_stride_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code of strided matrix matrix multiplication for complex 16-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_i16s_rv32im

void plp_mat_mult_cmplx_stride_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 16-bit integers on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_i16s_xpulpv2

void plp_mat_mult_cmplx_stride_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 16-bit integers on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_stride_i16_parallel

void plp_mat_mult_cmplx_stride_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel strided matrix matrix multiplication for complex 16-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_i16p_xpulpv2

void plp_mat_mult_cmplx_stride_i16p_xpulpv2(
    void * args
)

parallel strided matrix matrix multiplication for complex 16-bit integers on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_i16 struct initialized by plp_mat_mult_cmplx_stride_i16_parallel
args pointer to plp_mat_mult_cmplx_stride_instance_i16 struct initialized by plp_mat_mult_cmplx_stride_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator. * Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_stride_i8

void plp_mat_mult_cmplx_stride_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code of strided matrix matrix multiplication for complex 8-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_i8s_rv32im

void plp_mat_mult_cmplx_stride_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 8-bit integers on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_i8s_xpulpv2

void plp_mat_mult_cmplx_stride_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 8-bit integers on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_stride_i8_parallel

void plp_mat_mult_cmplx_stride_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel strided matrix matrix multiplication for complex 8-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_i8p_xpulpv2

void plp_mat_mult_cmplx_stride_i8p_xpulpv2(
    void * args
)

parallel strided matrix matrix multiplication for complex 8-bit integers on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_i8 struct initialized by plp_mat_mult_cmplx_stride_i8_parallel
args pointer to plp_mat_mult_cmplx_stride_instance_i8 struct initialized by plp_mat_mult_cmplx_stride_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_stride_f32

void plp_mat_mult_cmplx_stride_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    float *__restrict__ pDstC
)

Glue code of strided matrix matrix multiplication for complex 32-bit floats.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_f32s_xpulpv2

void plp_mat_mult_cmplx_stride_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    float *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 32-bit floats on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_f32_parallel

void plp_mat_mult_cmplx_stride_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    float *__restrict__ pDstC
)

Glue code of parallel strided matrix matrix multiplication for complex 32-bit floats.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_cmplx_stride_f32p_xpulpv2

void plp_mat_mult_cmplx_stride_f32p_xpulpv2(
    void * args
)

parallel strided matrix matrix multiplication for complex 32-bit floats on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_f32 struct initialized by plp_mat_mult_cmplx_stride_f32_parallel
args pointer to plp_mat_mult_cmplx_stride_instance_f32 struct initialized by plp_mat_mult_cmplx_stride_f32_parallel

Return:

none
none

function plp_mat_mult_cmplx_stride_q32

void plp_mat_mult_cmplx_stride_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Glue code of strided matrix matrix multiplication for complex 32-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q32s_rv32im

void plp_mat_mult_cmplx_stride_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 32-bit fix-point on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q32s_xpulpv2

void plp_mat_mult_cmplx_stride_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 32-bit fix-point on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q32_parallel

void plp_mat_mult_cmplx_stride_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel strided matrix matrix multiplication for complex 32-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q32p_xpulpv2

void plp_mat_mult_cmplx_stride_q32p_xpulpv2(
    void * args
)

parallel strided matrix matrix multiplication for complex 32-bit fix-point on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_q32 struct initialized by plp_mat_mult_cmplx_stride_q32_parallel
args pointer to plp_mat_mult_cmplx_stride_instance_q32 struct initialized by plp_mat_mult_cmplx_stride_q32_parallel

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q16

void plp_mat_mult_cmplx_stride_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Glue code of strided matrix matrix multiplication for complex 16-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q16s_rv32im

void plp_mat_mult_cmplx_stride_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 16-bit fix-point on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q16s_xpulpv2

void plp_mat_mult_cmplx_stride_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 16-bit fix-point on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q16_parallel

void plp_mat_mult_cmplx_stride_q16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int16_t *__restrict__ pDstC
)

Glue code of parallel strided matrix matrix multiplication for complex 16-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q16p_xpulpv2

void plp_mat_mult_cmplx_stride_q16p_xpulpv2(
    void * args
)

parallel strided matrix matrix multiplication for complex 16-bit fix-point on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_q16 struct initialized by plp_mat_mult_cmplx_stride_q16_parallel
args pointer to plp_mat_mult_cmplx_stride_instance_q16 struct initialized by plp_mat_mult_cmplx_stride_q16_parallel

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_cmplx_stride_q8

void plp_mat_mult_cmplx_stride_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Glue code of strided matrix matrix multiplication for complex 8-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q8s_rv32im

void plp_mat_mult_cmplx_stride_q8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 8-bit fix-point on RV32IM.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q8s_xpulpv2

void plp_mat_mult_cmplx_stride_q8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Strided matrix matrix multiplication for complex 8-bit fix-point on XpulpV2.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q8_parallel

void plp_mat_mult_cmplx_stride_q8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int8_t *__restrict__ pDstC
)

Glue code of parallel strided matrix matrix multiplication for complex 8-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape NxO
M Height of matrix SrcA and DstC
N Width of matrix SrcA and height of matrix SrcB
O Width of matrix SrcB and DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_cmplx_stride_q8p_xpulpv2

void plp_mat_mult_cmplx_stride_q8p_xpulpv2(
    void * args
)

parallel strided matrix matrix multiplication for complex 8-bit fix-point on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_q8 struct initialized by plp_mat_mult_cmplx_stride_q8_parallel
args pointer to plp_mat_mat_mult_cmplx_instance_q8 struct initialized by plp_mat_mult_cmplx_stride_q8_parallel

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_trans_cmplx_stride_i32

void plp_mat_mult_trans_cmplx_stride_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code of strided matrix transpose matrix multiplication for complex 32-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_stride_i32s_rv32im

void plp_mat_mult_trans_cmplx_stride_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 32-bit integers on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

strided matrix transpose matrix multiplication for complex 32-bit integers on RV32IM

function plp_mat_mult_trans_cmplx_stride_i32s_xpulpv2

void plp_mat_mult_trans_cmplx_stride_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

strided matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2

function plp_mat_mult_trans_cmplx_stride_i32_parallel

void plp_mat_mult_trans_cmplx_stride_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_stride_i32p_xpulpv2

void plp_mat_mult_trans_cmplx_stride_i32p_xpulpv2(
    void * args
)

parallel strided matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_i32 struct initialized by plp_mat_mult_trans_cmplx_stride_i32_parallel
args pointer to plp_mat_mat_mult_trans_cmplx_instance_i32 struct initialized by plp_mat_mult_trans_cmplx_stride_i32_parallel

Return:

none
none

function plp_mat_mult_trans_cmplx_stride_i16

void plp_mat_mult_trans_cmplx_stride_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code of strided matrix transpose matrix multiplication for complex 16-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_stride_i16s_rv32im

void plp_mat_mult_trans_cmplx_stride_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 16-bit integers on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

strided matrix transpose matrix multiplication for complex 16-bit integers on RV32IM

function plp_mat_mult_trans_cmplx_stride_i16s_xpulpv2

void plp_mat_mult_trans_cmplx_stride_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

strided matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2

function plp_mat_mult_trans_cmplx_stride_i16_parallel

void plp_mat_mult_trans_cmplx_stride_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel strided matrix transpose matrix multiplication for complex 16-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_stride_i16p_xpulpv2

void plp_mat_mult_trans_cmplx_stride_i16p_xpulpv2(
    void * args
)

parallel strided matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_i16 struct initialized by plp_mat_mult_trans_cmplx_stride_i16_parallel
args pointer to plp_mat_mat_mult_trans_cmplx_instance_i16 struct initialized by plp_mat_mult_trans_cmplx_stride_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator. * Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_trans_cmplx_stride_i8

void plp_mat_mult_trans_cmplx_stride_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

Glue code of strided matrix transpose matrix multiplication for complex 8-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_stride_i8s_rv32im

void plp_mat_mult_trans_cmplx_stride_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 8-bit integers on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

strided matrix transpose matrix multiplication for complex 8-bit integers on RV32IM

function plp_mat_mult_trans_cmplx_stride_i8s_xpulpv2

void plp_mat_mult_trans_cmplx_stride_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    int32_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

strided matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2

function plp_mat_mult_trans_cmplx_stride_i8_parallel

void plp_mat_mult_trans_cmplx_stride_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel strided matrix transpose matrix multiplication for complex 8-bit integers.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_stride_i8p_xpulpv2

void plp_mat_mult_trans_cmplx_stride_i8p_xpulpv2(
    void * args
)

parallel strided matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_i8 struct initialized by plp_mat_mult_trans_cmplx_stride_i8_parallel
args pointer to plp_mat_mat_mult_trans_cmplx_instance_i8 struct initialized by plp_mat_mult_trans_cmplx_stride_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_trans_cmplx_stride_f32

void plp_mat_mult_trans_cmplx_stride_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    float *__restrict__ pDstC
)

Glue code of strided matrix transpose matrix multiplication for complex 32-bit floats.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_stride_f32s_xpulpv2

void plp_mat_mult_trans_cmplx_stride_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    float *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
pDstC Points to the output matrix of shape MxO

Return:

none
none

strided matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2

function plp_mat_mult_trans_cmplx_stride_f32_parallel

void plp_mat_mult_trans_cmplx_stride_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t nPE,
    float *__restrict__ pDstC
)

Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit floats.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

function plp_mat_mult_trans_cmplx_stride_f32p_xpulpv2

void plp_mat_mult_trans_cmplx_stride_f32p_xpulpv2(
    void * args
)

parallel strided matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_f32 struct initialized by plp_mat_mult_trans_cmplx_stride_f32_parallel
args pointer to plp_mat_mat_mult_trans_cmplx_instance_f32 struct initialized by plp_mat_mult_trans_cmplx_stride_f32_parallel

Return:

none
none

function plp_mat_mult_trans_cmplx_stride_q32

void plp_mat_mult_trans_cmplx_stride_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

Glue code of strided matrix transpose matrix multiplication for complex 32-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_stride_q32s_rv32im

void plp_mat_mult_trans_cmplx_stride_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

strided matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM

function plp_mat_mult_trans_cmplx_stride_q32s_xpulpv2

void plp_mat_mult_trans_cmplx_stride_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int32_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

strided matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2

function plp_mat_mult_trans_cmplx_stride_q32_parallel

void plp_mat_mult_trans_cmplx_stride_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int32_t *__restrict__ pDstC
)

Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_stride_q32p_xpulpv2

void plp_mat_mult_trans_cmplx_stride_q32p_xpulpv2(
    void * args
)

parallel strided matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_q32 struct initialized by plp_mat_mult_trans_cmplx_stride_q32_parallel
args pointer to plp_mat_mat_mult_trans_cmplx_instance_q32 struct initialized by plp_mat_mult_trans_cmplx_stride_q32_parallel

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_stride_q16

void plp_mat_mult_trans_cmplx_stride_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

Glue code of strided matrix transpose matrix multiplication for complex 16-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_stride_q16s_rv32im

void plp_mat_mult_trans_cmplx_stride_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

strided matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM

function plp_mat_mult_trans_cmplx_stride_q16s_xpulpv2

void plp_mat_mult_trans_cmplx_stride_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int16_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Exploiting SIMD instructions

The 16-bit values are packed two each into 32-bit vectors, and then the two dot products are performed on 32-bit vectors with a 32-bit accumulator.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

strided matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2

function plp_mat_mult_trans_cmplx_stride_q16_parallel

void plp_mat_mult_trans_cmplx_stride_q16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int16_t *__restrict__ pDstC
)

Glue code of parallel strided matrix transpose matrix multiplication for complex 16-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_stride_q16p_xpulpv2

void plp_mat_mult_trans_cmplx_stride_q16p_xpulpv2(
    void * args
)

parallel strided matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_q16 struct initialized by plp_mat_mult_trans_cmplx_stride_q16_parallel
args pointer to plp_mat_mat_mult_trans_cmplx_instance_q16 struct initialized by plp_mat_mult_trans_cmplx_stride_q16_parallel

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Exploiting SIMD instructions

The 16-bit values are packed two each into 32-bit vectors, and then the two dot products are performed on 32-bit vectors with a 32-bit accumulator.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_mult_trans_cmplx_stride_q8

void plp_mat_mult_trans_cmplx_stride_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

Glue code of strided matrix transpose matrix multiplication for complex 8-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_stride_q8s_rv32im

void plp_mat_mult_trans_cmplx_stride_q8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

strided matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM

function plp_mat_mult_trans_cmplx_stride_q8s_xpulpv2

void plp_mat_mult_trans_cmplx_stride_q8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    int8_t *__restrict__ pDstC
)

strided matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Exploiting SIMD instructions

The 8-bit values are packed four each into 32-bit vectors, and then the four dot products are performed on 32-bit vectors with a 32-bit accumulator.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

strided matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2

function plp_mat_mult_trans_cmplx_stride_q8_parallel

void plp_mat_mult_trans_cmplx_stride_q8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t O,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideC,
    uint32_t shift,
    uint32_t nPE,
    int8_t *__restrict__ pDstC
)

Glue code of parallel strided matrix transpose matrix multiplication for complex 8-bit fix-point.

Parameters:

pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO
pSrcA Points to the first input matrix of shape MxN
pSrcB Points to the second input matrix of shape OxN
M Height of matrix SrcA and DstC
N Width of matrix SrcA and SrcB
O Height of matrix SrcB and width of matrix DstC
strideA Stride of input matrix A (elements between each row)
strideB Stride of input matrix B (elements between each row)
strideC Stride of output matrix C (Elements between each row)
shift Amount to shift the result of each multiplication to the right
nPE Number of cores to use for computation
pDstC Points to the output matrix of shape MxO

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

Fix-Point

Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.

function plp_mat_mult_trans_cmplx_stride_q8p_xpulpv2

void plp_mat_mult_trans_cmplx_stride_q8p_xpulpv2(
    void * args
)

parallel strided matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2

Parameters:

args pointer to plp_mat_mult_cmplx_stride_instance_q8 struct initialized by plp_mat_mult_trans_cmplx_stride_q8_parallel
args pointer to plp_mat_mat_mult_trans_cmplx_instance_q8 struct initialized by plp_mat_mult_trans_cmplx_stride_q8_parallel

Return:

none
none

Par:

Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point

Fix-Point and Shifting The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_add_stride_i32

void plp_mat_add_stride_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int32_t *__restrict__ pDst
)

Glue code for matrix addition of 32-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Glue code for matrix addition of 32-bit integer matrices.

function plp_mat_add_stride_i32s_rv32im

void plp_mat_add_stride_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int32_t *__restrict__ pDst
)

Matrix addition of 32-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Matrix addition of 32-bit integer matrices for RV32IM extension.

function plp_mat_add_stride_i32s_xpulpv2

void plp_mat_add_stride_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int32_t *__restrict__ pDst
)

Matrix addition of 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Matrix addition of 32-bit integer matrices for XPULPV2 extension.

function plp_mat_add_stride_i32_parallel

void plp_mat_add_stride_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for parallel matrix addition of 32-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix addition of 32-bit integer matrices.

function plp_mat_add_stride_i32p_xpulpv2

void plp_mat_add_stride_i32p_xpulpv2(
    void * args
)

Parallel matrix addition of 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_add_stride_instance_i32 struct initialized by plp_mat_add_stride_i32_parallel
args pointer to plp_mat_add_stride_instance_i32 struct initialized by plp_mat_add_stride_i32_parallel

Return:

none
none

Parallel matrix addition of 32-bit integer matrices for XPULPV2 extension.

function plp_mat_add_stride_i16

void plp_mat_add_stride_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int16_t *__restrict__ pDst
)

Glue code for matrix addition of 16-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Glue code for matrix addition of 16-bit integer matrices.

function plp_mat_add_stride_i16s_rv32im

void plp_mat_add_stride_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int16_t *__restrict__ pDst
)

Matrix addition of 16-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Matrix addition of 16-bit integer matrices for RV32IM extension.

function plp_mat_add_stride_i16s_xpulpv2

void plp_mat_add_stride_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int16_t *__restrict__ pDst
)

Matrix addition of 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Matrix addition of 16-bit integer matrices for XPULPV2 extension.

function plp_mat_add_stride_i16_parallel

void plp_mat_add_stride_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for parallel matrix addition of 16-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix addition of 16-bit integer matrices.

function plp_mat_add_stride_i16p_xpulpv2

void plp_mat_add_stride_i16p_xpulpv2(
    void * args
)

Parallel matrix addition of 16-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_add_stride_instance_i16 struct initialized by plp_mat_add_stride_i16_parallel
args pointer to plp_mat_add_stride_instance_i16 struct initialized by plp_mat_add_stride_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator. * Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Parallel matrix addition of 16-bit integer matrices kernel for XPULPV2 extension.

function plp_mat_add_stride_i8

void plp_mat_add_stride_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int8_t *__restrict__ pDst
)

Glue code for matrix addition of 8-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Glue code for matrix addition of 8-bit integer matrices.

function plp_mat_add_stride_i8s_rv32im

void plp_mat_add_stride_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int8_t *__restrict__ pDst
)

Matrix addition of 8-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Matrix addition of 8-bit integer matrices for RV32IM extension.

function plp_mat_add_stride_i8s_xpulpv2

void plp_mat_add_stride_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int8_t *__restrict__ pDst
)

Matrix addition of 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Matrix addition of 8-bit integer matrices for XPULPV2 extension.

function plp_mat_add_stride_i8_parallel

void plp_mat_add_stride_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for parallel matrix addition of 8-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix addition of 8-bit integer matrices.

function plp_mat_add_stride_i8p_xpulpv2

void plp_mat_add_stride_i8p_xpulpv2(
    void * args
)

Parallel matrix addition of 8-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_add_stride_instance_i8 struct initialized by plp_mat_add_stride_i8_parallel
args pointer to plp_mat_add_stride_instance_i8 struct initialized by plp_mat_add_stride_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Parallel matrix addition of 8-bit integer matrices kernel for XPULPV2 extension.

function plp_mat_add_stride_f32

void plp_mat_add_stride_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    float *__restrict__ pDst
)

Glue code for matrix addition of 32-bit floating-point matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Glue code for matrix addition of 32-bit floating-point matrices.

function plp_mat_add_stride_f32s_xpulpv2

void plp_mat_add_stride_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    float *__restrict__ pDst
)

Matrix addition of 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Matrix addition of 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_add_stride_f32_parallel

void plp_mat_add_stride_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code for parallel matrix addition of 32-bit floating-point matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix addition of 32-bit floating-point matrices.

function plp_mat_add_stride_f32p_xpulpv2

void plp_mat_add_stride_f32p_xpulpv2(
    void * args
)

Parallel matrix addition of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_add_stride_instance_f32 struct initialized by plp_mat_add_stride_f32_parallel
args pointer to plp_mat_add_stride_instance_f32 struct initialized by plp_mat_add_stride_f32_parallel

Return:

none
none

Parallel matrix addition of 32-bit floating-point matrices kernel for XPULPV2 extension.

function plp_mat_sub_stride_i32

void plp_mat_sub_stride_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int32_t *__restrict__ pDst
)

Glue code for matrix subtraction of 32-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Glue code for matrix subtraction of 32-bit integer matrices.

function plp_mat_sub_stride_i32s_rv32im

void plp_mat_sub_stride_i32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int32_t *__restrict__ pDst
)

Matrix subtraction of 32-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Matrix subtraction of 32-bit integer matrices for RV32IM extension.

function plp_mat_sub_stride_i32s_xpulpv2

void plp_mat_sub_stride_i32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int32_t *__restrict__ pDst
)

Matrix subtraction of 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Matrix subtraction of 32-bit integer matrices for XPULPV2 extension.

function plp_mat_sub_stride_i32_parallel

void plp_mat_sub_stride_i32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for parallel matrix subtraction of 32-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix subtraction of 32-bit integer matrices.

function plp_mat_sub_stride_i32p_xpulpv2

void plp_mat_sub_stride_i32p_xpulpv2(
    void * args
)

Parallel matrix subtraction of 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_sub_stride_instance_i32 struct initialized by plp_mat_sub_stride_i32_parallel
args pointer to plp_mat_sub_stride_instance_i32 struct initialized by plp_mat_sub_stride_i32_parallel

Return:

none
none

Parallel matrix subtraction of 32-bit integer matrices for XPULPV2 extension.

function plp_mat_sub_stride_i16

void plp_mat_sub_stride_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int16_t *__restrict__ pDst
)

Glue code for matrix subtraction of 16-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Glue code for matrix subtraction of 16-bit integer matrices.

function plp_mat_sub_stride_i16s_rv32im

void plp_mat_sub_stride_i16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int16_t *__restrict__ pDst
)

Matrix subtraction of 16-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Matrix subtraction of 16-bit integer matrices for RV32IM extension.

function plp_mat_sub_stride_i16s_xpulpv2

void plp_mat_sub_stride_i16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int16_t *__restrict__ pDst
)

Matrix subtraction of 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Matrix subtraction of 16-bit integer matrices for XPULPV2 extension.

function plp_mat_sub_stride_i16_parallel

void plp_mat_sub_stride_i16_parallel(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for parallel matrix subtraction of 16-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix subtraction of 16-bit integer matrices.

function plp_mat_sub_stride_i16p_xpulpv2

void plp_mat_sub_stride_i16p_xpulpv2(
    void * args
)

Parallel matrix subtraction of 16-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_sub_stride_instance_i16 struct initialized by plp_mat_sub_stride_i16_parallel
args pointer to plp_mat_sub_stride_instance_i16 struct initialized by plp_mat_sub_stride_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

Parallel matrix subtraction of 16-bit integer matrices kernel for XPULPV2 extension.

function plp_mat_sub_stride_i8

void plp_mat_sub_stride_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int8_t *__restrict__ pDst
)

Glue code for matrix subtraction of a 8-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Glue code for matrix subtraction of a 8-bit integer matrices.

function plp_mat_sub_stride_i8s_rv32im

void plp_mat_sub_stride_i8s_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int8_t *__restrict__ pDst
)

matrix subtraction of a 8-bit integer matrices for RV32IM extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

matrix subtraction of a 8-bit integer matrices for RV32IM extension.

function plp_mat_sub_stride_i8s_xpulpv2

void plp_mat_sub_stride_i8s_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    int8_t *__restrict__ pDst
)

matrix subtraction of a 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

matrix subtraction of a 8-bit integer matrices for XPULPV2 extension.

function plp_mat_sub_stride_i8_parallel

void plp_mat_sub_stride_i8_parallel(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for parallel matrix subtraction of a 8-bit integer matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix subtraction of a 8-bit integer matrices.

function plp_mat_sub_stride_i8p_xpulpv2

void plp_mat_sub_stride_i8p_xpulpv2(
    void * args
)

Parallel matrix subtraction of 8-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_sub_stride_instance_i8 struct initialized by plp_mat_sub_stride_i8_parallel
args pointer to plp_mat_sub_stride_instance_i8 struct initialized by plp_mat_sub_stride_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

Parallel matrix subtraction of 8-bit integer matrices kernel for XPULPV2 extension.

function plp_mat_sub_stride_f32

void plp_mat_sub_stride_f32(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    float *__restrict__ pDst
)

Glue code for matrix subtraction of a 32-bit floating-point matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Glue code for matrix subtraction of a 32-bit floating-point matrices.

function plp_mat_sub_stride_f32s_xpulpv2

void plp_mat_sub_stride_f32s_xpulpv2(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    float *__restrict__ pDst
)

matrix subtraction of a 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

matrix subtraction of a 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_sub_stride_f32_parallel

void plp_mat_sub_stride_f32_parallel(
    const float *__restrict__ pSrcA,
    const float *__restrict__ pSrcB,
    uint32_t M,
    uint32_t N,
    uint32_t strideA,
    uint32_t strideB,
    uint32_t strideY,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code for parallel matrix subtraction of a 32-bit floating-point matrices.

Parameters:

pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of the matrices
N Width of the matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use
pDst Points to the output matrix
pSrcA Points to the first input matrix
pSrcB Points to the second input matrix
M Height of all matrices
N Width of all matrices
strideA Stride of matrix A (elements between each row)
strideB Stride of matrix B (elements between each row)
strideY Stride of output matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel matrix subtraction of a 32-bit floating-point matrices.

function plp_mat_sub_stride_f32p_xpulpv2

void plp_mat_sub_stride_f32p_xpulpv2(
    void * args
)

Parallel matrix subtraction of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_sub_stride_instance_f32 struct initialized by plp_mat_sub_stride_f32_parallel
args pointer to plp_mat_sub_stride_instance_f32 struct initialized by plp_mat_sub_stride_f32_parallel

Return:

none
none

Parallel matrix subtraction of 32-bit floating-point matrices kernel for XPULPV2 extension.

function plp_mat_scale_stride_i32

void plp_mat_scale_stride_i32(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int32_t scaleFactor,
    int32_t shift,
    int32_t *__restrict__ pDst
)

Glue code for strided matrix scale of a 32-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Glue code for strided matrix scale of a 32-bit integer matrices.

function plp_mat_scale_stride_i32s_rv32im

void plp_mat_scale_stride_i32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int32_t scaleFactor,
    int32_t shift,
    int32_t *__restrict__ pDst
)

strided matrix scale of a 32-bit integer matrices for RV32IM extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

strided matrix scale of a 32-bit integer matrices for RV32IM extension.

function plp_mat_scale_stride_i32s_xpulpv2

void plp_mat_scale_stride_i32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int32_t scaleFactor,
    int32_t shift,
    int32_t *__restrict__ pDst
)

strided matrix scale of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

strided matrix scale of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_scale_stride_i32_parallel

void plp_mat_scale_stride_i32_parallel(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int32_t scaleFactor,
    int32_t shift,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for parallel strided matrix scale of a 32-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
nPE Number of cores to use for computation
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
nPE Number of cores to use for computation
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Glue code for parallel strided matrix scale of a 32-bit integer matrices.

function plp_mat_scale_stride_i32p_xpulpv2

void plp_mat_scale_stride_i32p_xpulpv2(
    void * args
)

Parallel strided matrix scale of a 32-bit integer matrices for XPULPV2 extension.

Parameters:

args pointer to plp_mat_scale_stride_instance_i32 struct initialized by plp_mat_scale_stride_i32_parallel
args pointer to plp_mat_scale_stride_instance_i32 struct initialized by plp_mat_scale_stride_i32_parallel

Return:

none
none

Parallel strided matrix scale of a 32-bit integer matrices for XPULPV2 extension.

function plp_mat_scale_stride_i16

void plp_mat_scale_stride_i16(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int16_t scaleFactor,
    int32_t shift,
    int16_t *__restrict__ pDst
)

Glue code for strided matrix scale of a 16-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Glue code for strided matrix scale of a 16-bit integer matrices.

function plp_mat_scale_stride_i16s_rv32im

void plp_mat_scale_stride_i16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int16_t scaleFactor,
    int32_t shift,
    int16_t *__restrict__ pDst
)

strided matrix scale of a 16-bit integer matrices for RV32IM extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

strided matrix scale of a 16-bit integer matrices for RV32IM extension.

function plp_mat_scale_stride_i16s_xpulpv2

void plp_mat_scale_stride_i16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int16_t scaleFactor,
    int32_t shift,
    int16_t *__restrict__ pDst
)

strided matrix scale of a 16-bit integer matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

strided matrix scale of a 16-bit integer matrices for XPULPV2 extension.

function plp_mat_scale_stride_i16_parallel

void plp_mat_scale_stride_i16_parallel(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int16_t scaleFactor,
    int32_t shift,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for parallel strided matrix scale of a 16-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
nPE Number of cores to use for computation
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel strided matrix scale of a 16-bit integer matrices.

function plp_mat_scale_stride_i16p_xpulpv2

void plp_mat_scale_stride_i16p_xpulpv2(
    void * args
)

Parallel strided matrix scale of 16-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_scale_stride_instance_i16 struct initialized by plp_mat_scale_stride_i16_parallel
args pointer to plp_mat_scale_stride_instance_i16 struct initialized by plp_mat_scale_stride_i16_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_scale_stride_i8

void plp_mat_scale_stride_i8(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int8_t scaleFactor,
    int32_t shift,
    int8_t *__restrict__ pDst
)

Glue code for strided matrix scale of a 8-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Glue code for strided matrix scale of a 8-bit integer matrices.

function plp_mat_scale_stride_i8s_rv32im

void plp_mat_scale_stride_i8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int8_t scaleFactor,
    int32_t shift,
    int8_t *__restrict__ pDst
)

strided matrix scale of a 8-bit integer matrices for RV32IM extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

strided matrix scale of a 8-bit integer matrices for RV32IM extension.

function plp_mat_scale_stride_i8s_xpulpv2

void plp_mat_scale_stride_i8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int8_t scaleFactor,
    int32_t shift,
    int8_t *__restrict__ pDst
)

strided matrix scale of a 8-bit integer matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
pDst Points to the output matrix

Return:

none
none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

strided matrix scale of a 8-bit integer matrices for XPULPV2 extension.

function plp_mat_scale_stride_i8_parallel

void plp_mat_scale_stride_i8_parallel(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int8_t scaleFactor,
    int32_t shift,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for parallel strided matrix scale of a 8-bit integer matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
nPE Number of cores to use for computation
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements before shifting
shift Amount to shift each element
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel strided matrix scale of a 8-bit integer matrices.

function plp_mat_scale_stride_i8p_xpulpv2

void plp_mat_scale_stride_i8p_xpulpv2(
    void * args
)

Parallel strided matrix scale of 8-bit integer matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_scale_stride_instance_i8 struct initialized by plp_mat_scale_stride_i8_parallel
args pointer to plp_mat_scale_stride_instance_i8 struct initialized by plp_mat_scale_stride_i8_parallel

Return:

none
none

Par:

Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_scale_stride_f32

void plp_mat_scale_stride_f32(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    float scaleFactor,
    float *__restrict__ pDst
)

Glue code for strided matrix scale of a 32-bit floating-point matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements
pDst Points to the output matrix

Return:

none
none

Glue code for strided matrix scale of a 32-bit floating-point matrices.

function plp_mat_scale_stride_f32s_xpulpv2

void plp_mat_scale_stride_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    float scaleFactor,
    float *__restrict__ pDst
)

strided matrix scale of a 32-bit floating-point matrices for XPULPV2 extension.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements
pDst Points to the output matrix

Return:

none
none

strided matrix scale of a 32-bit floating-point matrices for XPULPV2 extension.

function plp_mat_scale_stride_f32_parallel

void plp_mat_scale_stride_f32_parallel(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    float scaleFactor,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code for parallel strided matrix scale of a 32-bit floating-point matrices.

Parameters:

pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride for input matrix (elements between each row)
strideDst Stride for output matrix (elements between each row)
scaleFactor Factor to multiply all elements
nPE Number of cores to use for computation
pDst Points to the output matrix
pSrc Points to the input matrix
M Height of both matrices
N Width of both matrices
strideSrc Stride of input matrix (elements between each row)
strideDst Stride of output matrix (elements between each row)
scaleFactor Factor to multiply all elements
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none
none

Glue code for parallel strided matrix scale of a 32-bit floating-point matrices.

function plp_mat_scale_stride_f32p_xpulpv2

void plp_mat_scale_stride_f32p_xpulpv2(
    void * args
)

Parallel strided matrix scale of 32-bit floating-point matrices kernel for XPULPV2 extension.

Parameters:

args pointer to plp_mat_scale_stride_instance_f32 struct initialized by plp_mat_scale_stride_f32_parallel
args pointer to plp_mat_scale_stride_instance_f32 struct initialized by plp_mat_scale_stride_f32_parallel

Return:

none
none

function plp_mat_fill_I_stride_i32

void plp_mat_fill_I_stride_i32(
    uint32_t N,
    uint32_t stride,
    int32_t *__restrict__ pDst
)

Glue code for creating a strided 32-bit integers identity matrix.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix
N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

function plp_mat_fill_I_stride_i32s_rv32im

void plp_mat_fill_I_stride_i32s_rv32im(
    uint32_t N,
    uint32_t stride,
    int32_t *__restrict__ pDst
)

Create a strided 32-bit integers identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix
N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

Create a strided 32-bit integers identity matrix on RV32IM.

function plp_mat_fill_I_stride_i32s_xpulpv2

void plp_mat_fill_I_stride_i32s_xpulpv2(
    uint32_t N,
    uint32_t stride,
    int32_t *__restrict__ pDst
)

Create a strided 32-bit integers identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix
N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none
none

function plp_mat_fill_I_stride_i32_parallel

void plp_mat_fill_I_stride_i32_parallel(
    uint32_t N,
    uint32_t stride,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for creating a strided 32-bit integer identity matrix in parallel.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none

function plp_mat_fill_I_stride_i32p_xpulpv2

void plp_mat_fill_I_stride_i32p_xpulpv2(
    void * args
)

Create a strided 32-bit integer identity matrix on XpulpV2 in parallel.

Parameters:

args Pointer to the plp_mat_fill_I_stride_instance_i32 struct initialized by plp_mat_fill_I_stride_i32_parallel

Return:

none

function plp_mat_fill_I_stride_i16

void plp_mat_fill_I_stride_i16(
    uint32_t N,
    uint32_t stride,
    int16_t *__restrict__ pDst
)

Glue code for creating a strided 16-bit integer identity matrix.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none

function plp_mat_fill_I_stride_i16s_rv32im

void plp_mat_fill_I_stride_i16s_rv32im(
    uint32_t N,
    uint32_t stride,
    int16_t *__restrict__ pDst
)

Create a strided 16-bit integer identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none

function plp_mat_fill_I_stride_i16s_xpulpv2

void plp_mat_fill_I_stride_i16s_xpulpv2(
    uint32_t N,
    uint32_t stride,
    int16_t *__restrict__ pDst
)

Create a strided 16-bit integer identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_stride_i16_parallel

void plp_mat_fill_I_stride_i16_parallel(
    uint32_t N,
    uint32_t stride,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for creating a strided 16-bit integer identity matrix in parallel.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none

function plp_mat_fill_I_stride_i16p_xpulpv2

void plp_mat_fill_I_stride_i16p_xpulpv2(
    void * args
)

Create a strided 16-bit integer identity matrix on XpulpV2 in parallel.

Parameters:

args Pointer to the plp_mat_fill_I_stride_instance_i16 struct initialized by plp_mat_fill_I_stride_i16_parallel

Return:

none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_stride_i8

void plp_mat_fill_I_stride_i8(
    uint32_t N,
    uint32_t stride,
    int8_t *__restrict__ pDst
)

Glue code for creating a strided 8-bit integer identity matrix.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none

function plp_mat_fill_I_stride_i8s_rv32im

void plp_mat_fill_I_stride_i8s_rv32im(
    uint32_t N,
    uint32_t stride,
    int8_t *__restrict__ pDst
)

Create a strided 8-bit integer identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none

function plp_mat_fill_I_stride_i8s_xpulpv2

void plp_mat_fill_I_stride_i8s_xpulpv2(
    uint32_t N,
    uint32_t stride,
    int8_t *__restrict__ pDst
)

Create a strided 8-bit integer identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_stride_i8_parallel

void plp_mat_fill_I_stride_i8_parallel(
    uint32_t N,
    uint32_t stride,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for creating a strided 8-bit integer identity matrix in parallel.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none

function plp_mat_fill_I_stride_i8p_xpulpv2

void plp_mat_fill_I_stride_i8p_xpulpv2(
    void * args
)

Create a strided 8-bit integer identity matrix on XpulpV2 in parallel.

Parameters:

args Pointer to the plp_mat_fill_I_stride_instance_i8 struct initialized by plp_mat_fill_I_stride_i8_parallel

Return:

none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_stride_f32

void plp_mat_fill_I_stride_f32(
    uint32_t N,
    uint32_t stride,
    float *__restrict__ pDst
)

Glue code for creating a strided 32-bit float identity matrix.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none

function plp_mat_fill_I_stride_f32s_xpulpv2

void plp_mat_fill_I_stride_f32s_xpulpv2(
    uint32_t N,
    uint32_t stride,
    float *__restrict__ pDst
)

Create a strided 32-bit float identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
pDst Points to the output matrix

Return:

none

function plp_mat_fill_I_stride_f32_parallel

void plp_mat_fill_I_stride_f32_parallel(
    uint32_t N,
    uint32_t stride,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code for creating a strided 32-bit float identity matrix in parallel.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none

function plp_mat_fill_I_stride_f32p_xpulpv2

void plp_mat_fill_I_stride_f32p_xpulpv2(
    void * args
)

Create a strided 32-bit float identity matrix on XpulpV2 in parallel.

Parameters:

args Pointer to the plp_mat_fill_I_stride_instance_f32 struct initialized by plp_mat_fill_I_stride_f32_parallel

Return:

none

function plp_mat_fill_I_stride_q32

void plp_mat_fill_I_stride_q32(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    int32_t *__restrict__ pDst
)

Glue code for creating a strided 32-bit fixed-point identity matrix.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q32s_rv32im

void plp_mat_fill_I_stride_q32s_rv32im(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    int32_t *__restrict__ pDst
)

Create a strided 32-bit fixed-point identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q32s_xpulpv2

void plp_mat_fill_I_stride_q32s_xpulpv2(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    int32_t *__restrict__ pDst
)

Create a strided 32-bit fixed-point identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q32_parallel

void plp_mat_fill_I_stride_q32_parallel(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for creating a strided 32-bit fixed-point identity matrix in parallel.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q32p_xpulpv2

void plp_mat_fill_I_stride_q32p_xpulpv2(
    void * args
)

Create a strided 32-bit fixed-point identity matrix on XpulpV2 in parallel.

Parameters:

args Pointer to the plp_mat_fill_I_stride_instance_q32 struct initialized by plp_mat_fill_I_stride_q32_parallel

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q16

void plp_mat_fill_I_stride_q16(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    int16_t *__restrict__ pDst
)

Glue code for creating a strided 16-bit fixed-point identity matrix.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q16s_rv32im

void plp_mat_fill_I_stride_q16s_rv32im(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    int16_t *__restrict__ pDst
)

Create a strided 16-bit fixed-point identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q16s_xpulpv2

void plp_mat_fill_I_stride_q16s_xpulpv2(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    int16_t *__restrict__ pDst
)

Create a strided 16-bit fixed-point identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_stride_q16_parallel

void plp_mat_fill_I_stride_q16_parallel(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for creating a strided 16-bit fixed-point identity matrix in parallel.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q16p_xpulpv2

void plp_mat_fill_I_stride_q16p_xpulpv2(
    void * args
)

Create a strided 16-bit fixed-point identity matrix on XpulpV2 in parallel.

Parameters:

args Pointer to the plp_mat_fill_I_stride_instance_q16 struct initialized by plp_mat_fill_I_stride_q16_parallel

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_stride_q8

void plp_mat_fill_I_stride_q8(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    int8_t *__restrict__ pDst
)

Glue code for creating a strided 8-bit fixed-point identity matrix.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q8s_rv32im

void plp_mat_fill_I_stride_q8s_rv32im(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    int8_t *__restrict__ pDst
)

Create a strided 8-bit fixed-point identity matrix on RV32IM.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q8s_xpulpv2

void plp_mat_fill_I_stride_q8s_xpulpv2(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    int8_t *__restrict__ pDst
)

Create a strided 8-bit fixed-point identity matrix on XpulpV2.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_I_stride_q8_parallel

void plp_mat_fill_I_stride_q8_parallel(
    uint32_t N,
    uint32_t stride,
    int32_t fracBits,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for creating a strided 8-bit fixed-point identity matrix in parallel.

Parameters:

N Width and height of the matrix
stride Stride of the matrix (elements between each row)
fracBits Number of fractional bits (position of the binary point)
nPE Number of cores to use for computation
pDst Points to the output matrix

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

function plp_mat_fill_I_stride_q8p_xpulpv2

void plp_mat_fill_I_stride_q8p_xpulpv2(
    void * args
)

Create a strided 8-bit fixed-point identity matrix on XpulpV2 in parallel.

Parameters:

args Pointer to the plp_mat_fill_I_stride_instance_q8 struct initialized by plp_mat_fill_I_stride_q8_parallel

Return:

none

Par: Fixed-Point

The diagonal elements will be filled with the value: 1 << fracBits.

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_stride_i32

void plp_mat_fill_stride_i32(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int32_t value,
    int32_t *__restrict__ pDst
)

Glue code for filling an MxN strided 32-bit integer matrix.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_i32s_rv32im

void plp_mat_fill_stride_i32s_rv32im(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int32_t value,
    int32_t *__restrict__ pDst
)

Fill an MxN strided 32-bit integer matrix on RV32IM.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_i32s_xpulpv2

void plp_mat_fill_stride_i32s_xpulpv2(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int32_t value,
    int32_t *__restrict__ pDst
)

Fill an MxN strided 32-bit integer matrix on XpulpV2.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_i32_parallel

void plp_mat_fill_stride_i32_parallel(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int32_t value,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code for filling an MxN strided 32-bit integer matrix in parallel.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
nPE Number of cores to use for processing
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_i32p_xpulpv2

void plp_mat_fill_stride_i32p_xpulpv2(
    void * args
)

Fill an MxN strided 32-bit integer matrix on XpulpV2 in parallel.

Parameters:

args Pointer to the plp_mat_fill_stride_instance_i32 struct initialized by plp_mat_fill_stride_i32_parallel

Return:

none

function plp_mat_fill_stride_i16

void plp_mat_fill_stride_i16(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int16_t value,
    int16_t *__restrict__ pDst
)

Glue code for filling an MxN strided 16-bit integer matrix.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_i16s_rv32im

void plp_mat_fill_stride_i16s_rv32im(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int16_t value,
    int16_t *__restrict__ pDst
)

Fill an MxN strided 16-bit integer matrix on RV32IM.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_i16s_xpulpv2

void plp_mat_fill_stride_i16s_xpulpv2(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int16_t value,
    int16_t *__restrict__ pDst
)

Fill an MxN strided 16-bit integer matrix on XpulpV2.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_stride_i16_parallel

void plp_mat_fill_stride_i16_parallel(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int16_t value,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code for filling an MxN strided 16-bit integer matrix in parallel.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
nPE Number of cores to use for processing
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_i16p_xpulpv2

void plp_mat_fill_stride_i16p_xpulpv2(
    void * args
)

Fill an MxN strided 16-bit integer matrix on XpulpV2 in parallel.

Parameters:

args Pointer to the plp_mat_fill_stride_instance_i16 struct initialized by plp_mat_fill_stride_i16_parallel

Return:

none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_stride_i8

void plp_mat_fill_stride_i8(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int8_t value,
    int8_t *__restrict__ pDst
)

Glue code for filling an MxN strided 8-bit integer matrix.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_i8s_rv32im

void plp_mat_fill_stride_i8s_rv32im(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int8_t value,
    int8_t *__restrict__ pDst
)

Fill an MxN strided 8-bit integer matrix on RV32IM.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_i8s_xpulpv2

void plp_mat_fill_stride_i8s_xpulpv2(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int8_t value,
    int8_t *__restrict__ pDst
)

Fill an MxN strided 8-bit integer matrix on XpulpV2.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_stride_i8_parallel

void plp_mat_fill_stride_i8_parallel(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    int8_t value,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code for filling an MxN strided 8-bit integer matrix in parallel.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
nPE Number of cores to use for processing
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_i8p_xpulpv2

void plp_mat_fill_stride_i8p_xpulpv2(
    void * args
)

Fill an MxN strided 8-bit integer matrix on XpulpV2 in parallel.

Parameters:

args Pointer to the plp_mat_fill_stride_instance_i8 struct initialized by plp_mat_fill_stride_i8_parallel

Return:

none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_fill_stride_f32

void plp_mat_fill_stride_f32(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    float value,
    float *__restrict__ pDst
)

Glue code for filling an MxN strided 32-bit float matrix.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_f32s_xpulpv2

void plp_mat_fill_stride_f32s_xpulpv2(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    float value,
    float *__restrict__ pDst
)

Fill an MxN strided 32-bit floats matrix on XpulpV2.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_f32_parallel

void plp_mat_fill_stride_f32_parallel(
    uint32_t M,
    uint32_t N,
    uint32_t stride,
    float value,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code for filling an MxN strided 32-bit floats matrix in parallel.

Parameters:

M Height of the matrix
N Width of the matrix
stride Stride of the matrix (elements between each row)
value Value to fill the matrix with
nPE Number of cores to use for processing
pDst Points to the output matrix

Return:

none

function plp_mat_fill_stride_f32p_xpulpv2

void plp_mat_fill_stride_f32p_xpulpv2(
    void * args
)

Fill an MxN strided 32-bit floats matrix on XpulpV2 in parallel.

Parameters:

args pointer to plp_mat_fill_stride_instance_f32 struct initialized by plp_mat_fill_stride_f32_parallel

Return:

none

function plp_mat_copy_stride_i32

void plp_mat_copy_stride_i32(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int32_t *__restrict__ pDst
)

Glue code to copy an MxN strided 32-bit integers matrix.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_i32s_rv32im

void plp_mat_copy_stride_i32s_rv32im(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int32_t *__restrict__ pDst
)

Copy an MxN strided 32-bit integers matrix on RV32IM.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_i32s_xpulpv2

void plp_mat_copy_stride_i32s_xpulpv2(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int32_t *__restrict__ pDst
)

Copy an MxN strided 32-bit integers matrix on XpulpV2.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_i32_parallel

void plp_mat_copy_stride_i32_parallel(
    const int32_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    uint32_t nPE,
    int32_t *__restrict__ pDst
)

Glue code to copy an MxN strided 32-bit integers matrix in parallel.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
nPE Number of cores to use for processing
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_i32p_xpulpv2

void plp_mat_copy_stride_i32p_xpulpv2(
    void * args
)

Copy an MxN strided 32-bit integers matrix on XpulpV2 in parallel.

Parameters:

args pointer to plp_mat_copy_stride_instance_i32 struct initialized by plp_mat_copy_stride_i32_parallel

Return:

none

function plp_mat_copy_stride_i16

void plp_mat_copy_stride_i16(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int16_t *__restrict__ pDst
)

Glue code to copy an MxN strided 16-bit integers matrix.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_i16s_rv32im

void plp_mat_copy_stride_i16s_rv32im(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int16_t *__restrict__ pDst
)

Copy an MxN strided 16-bit integers matrix on RV32IM.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_i16s_xpulpv2

void plp_mat_copy_stride_i16s_xpulpv2(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int16_t *__restrict__ pDst
)

Copy an MxN strided 16-bit integers matrix on XpulpV2.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_copy_stride_i16_parallel

void plp_mat_copy_stride_i16_parallel(
    const int16_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    uint32_t nPE,
    int16_t *__restrict__ pDst
)

Glue code to copy an MxN strided 16-bit integers matrix in parallel.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
nPE Number of cores to use for processing
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_i16p_xpulpv2

void plp_mat_copy_stride_i16p_xpulpv2(
    void * args
)

Copy an MxN strided 16-bit integers matrix on XpulpV2 in parallel.

Parameters:

args pointer to plp_mat_copy_stride_instance_i16 struct initialized by plp_mat_copy_stride_i16_parallel

Return:

none

Par: Exploiting SIMD instructions

The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_copy_stride_i8

void plp_mat_copy_stride_i8(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int8_t *__restrict__ pDst
)

Glue code to copy an MxN strided 8-bit integers matrix.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_i8s_rv32im

void plp_mat_copy_stride_i8s_rv32im(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int8_t *__restrict__ pDst
)

Copy an MxN strided 8-bit integers matrix on RV32IM.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_i8s_xpulpv2

void plp_mat_copy_stride_i8s_xpulpv2(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    int8_t *__restrict__ pDst
)

Copy an MxN strided 8-bit integers matrix on XpulpV2.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_copy_stride_i8_parallel

void plp_mat_copy_stride_i8_parallel(
    const int8_t *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    uint32_t nPE,
    int8_t *__restrict__ pDst
)

Glue code to copy an MxN strided 8-bit integers matrix in parallel.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
nPE Number of cores to use for processing
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_i8p_xpulpv2

void plp_mat_copy_stride_i8p_xpulpv2(
    void * args
)

Copy an MxN strided 8-bit integers matrix on XpulpV2 in parallel.

Parameters:

args pointer to plp_mat_copy_stride_instance_i8 struct initialized by plp_mat_copy_stride_i8_parallel

Return:

none

Par: Exploiting SIMD instructions

The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.

function plp_mat_copy_stride_f32

void plp_mat_copy_stride_f32(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    float *__restrict__ pDst
)

Glue code to copy an MxN strided 32-bit floats matrix.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_f32s_xpulpv2

void plp_mat_copy_stride_f32s_xpulpv2(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    float *__restrict__ pDst
)

Copy an MxN strided 32-bit floats matrix on XpulpV2.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_f32_parallel

void plp_mat_copy_stride_f32_parallel(
    const float *__restrict__ pSrc,
    uint32_t M,
    uint32_t N,
    uint32_t strideSrc,
    uint32_t strideDst,
    uint32_t nPE,
    float *__restrict__ pDst
)

Glue code to copy an MxN strided 32-bit floats matrix in parallel.

Parameters:

pSrc Points to the input matrix of shape MxN
M Height of both matrices
N Width of both matrices
strideSrc Stride of the input matrix (elements between each row)
strideDst Stride of the output matrix (elements between each row)
nPE Number of cores to use for processing
pDst Points to the output matrix of shape MxN

Return:

none

function plp_mat_copy_stride_f32p_xpulpv2

void plp_mat_copy_stride_f32p_xpulpv2(
    void * args
)

Copy an MxN strided 32-bit floats matrix on XpulpV2 in parallel.

Parameters:

args pointer to plp_mat_copy_stride_instance_f32 struct initialized by plp_mat_copy_stride_f32_parallel

Return:

none

function plp_cmplx_conj_f32

void plp_cmplx_conj_f32(
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex conjugate of 32-bit float vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_conj_f32_xpulpv2

void plp_cmplx_conj_f32_xpulpv2(
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    uint32_t numSamples
)

Floating-point complex conjugate.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_conj_i32

void plp_cmplx_conj_i32(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex conjugate of 32-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_conj_i32_xpulpv2

void plp_cmplx_conj_i32_xpulpv2(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

32-bit integer complex conjugate.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_conj_i32_rv32im

void plp_cmplx_conj_i32_rv32im(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

32-bit integer complex conjugate.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_conj_i16

void plp_cmplx_conj_i16(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex conjugate of 16-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_conj_i16_xpulpv2

void plp_cmplx_conj_i16_xpulpv2(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

16-bit integer complex conjugate.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_conj_i16_rv32im

void plp_cmplx_conj_i16_rv32im(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

16-bit integer complex conjugate.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_conj_i8

void plp_cmplx_conj_i8(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex conjugate of 8-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_conj_i8_xpulpv2

void plp_cmplx_conj_i8_xpulpv2(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

8-bit integer complex conjugate.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_conj_i8_rv32im

void plp_cmplx_conj_i8_rv32im(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

8-bit integer complex conjugate.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector

Return: none

function plp_cmplx_dot_prod_f32

void plp_cmplx_dot_prod_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t numSamples,
    float32_t * realResult,
    float32_t * imagResult
)

Glue code for complex dot product of 32-bit float vectors.

Parameters:

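pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here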

Return: none

function plp_cmplx_dot_prod_f32_xpulpv2

void plp_cmplx_dot_prod_f32_xpulpv2(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t numSamples,
    float32_t * realResult,
    float32_t * imagResult
)

Floating-point complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_dot_prod_i32

void plp_cmplx_dot_prod_i32(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    uint32_t numSamples,
    int32_t * realResult,
    int32_t * imagResult
)

Glue code for complex dot product of 32-bit integer vectors.

Parameters:

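pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here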

Return: none

function plp_cmplx_dot_prod_i32_xpulpv2

void plp_cmplx_dot_prod_i32_xpulpv2(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    uint32_t numSamples,
    int32_t * realResult,
    int32_t * imagResult
)

32-bit integer complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_dot_prod_i32_rv32im

void plp_cmplx_dot_prod_i32_rv32im(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    uint32_t numSamples,
    int32_t * realResult,
    int32_t * imagResult
)

32-bit integer complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_dot_prod_i16

void plp_cmplx_dot_prod_i16(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    uint32_t numSamples,
    int16_t * realResult,
    int16_t * imagResult
)

Glue code for complex dot product of 16-bit integer vectors.

Parameters:

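pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here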

Return: none

function plp_cmplx_dot_prod_i16_xpulpv2

void plp_cmplx_dot_prod_i16_xpulpv2(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    uint32_t numSamples,
    int16_t * realResult,
    int16_t * imagResult
)

16-bit integer complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_dot_prod_i16_rv32im

void plp_cmplx_dot_prod_i16_rv32im(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    uint32_t numSamples,
    int16_t * realResult,
    int16_t * imagResult
)

16-bit integer complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_dot_prod_i8

void plp_cmplx_dot_prod_i8(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    uint32_t numSamples,
    int8_t * realResult,
    int8_t * imagResult
)

Glue code for complex dot product of 8-bit integer vectors.

Parameters:

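pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here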

Return: none

function plp_cmplx_dot_prod_i8_xpulpv2

void plp_cmplx_dot_prod_i8_xpulpv2(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    uint32_t numSamples,
    int8_t * realResult,
    int8_t * imagResult
)

8-bit integer complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_dot_prod_i8_rv32im

void plp_cmplx_dot_prod_i8_rv32im(
    const int8_t * pSrcA,
    const int8_t * pSrcB,
    uint32_t numSamples,
    int8_t * realResult,
    int8_t * imagResult
)

8-bit integer complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_dot_prod_q32

void plp_cmplx_dot_prod_q32(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    uint32_t numSamples,
    uint32_t deciPoint,
    int32_t * realResult,
    int32_t * imagResult
)

Glue code for complex dot product of 32-bit fixed-point vectors.

Parameters:

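pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
deciPoint decimal point for right shift
realResult real part of the result returned here
imagResult imaginary part of the result returned here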

Return: none

function plp_cmplx_dot_prod_q32_xpulpv2

void plp_cmplx_dot_prod_q32_xpulpv2(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    uint32_t numSamples,
    uint32_t deciPoint,
    int32_t * realResult,
    int32_t * imagResult
)

32-bit fixed-point complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
deciPoint decimal point for right shift
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_dot_prod_q32_rv32im

void plp_cmplx_dot_prod_q32_rv32im(
    const int32_t * pSrcA,
    const int32_t * pSrcB,
    uint32_t numSamples,
    uint32_t deciPoint,
    int32_t * realResult,
    int32_t * imagResult
)

32-bit fixed-point complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
deciPoint decimal point for right shift
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_dot_prod_q16

void plp_cmplx_dot_prod_q16(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    uint32_t numSamples,
    uint32_t deciPoint,
    int16_t * realResult,
    int16_t * imagResult
)

Glue code for complex dot product of 16-bit fixed-point vectors.

Parameters:

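pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
deciPoint decimal point for right shift
realResult real part of the result returned here
imagResult imaginary part of the result returned here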

Return: none

function plp_cmplx_dot_prod_q16_xpulpv2

void plp_cmplx_dot_prod_q16_xpulpv2(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    uint32_t numSamples,
    uint32_t deciPoint,
    int16_t * realResult,
    int16_t * imagResult
)

16-bit fixed-point complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
deciPoint decimal point for right shift
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_dot_prod_q16_rv32im

void plp_cmplx_dot_prod_q16_rv32im(
    const int16_t * pSrcA,
    const int16_t * pSrcB,
    uint32_t numSamples,
    uint32_t deciPoint,
    int16_t * realResult,
    int16_t * imagResult
)

16-bit fixed-point complex dot product.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
numSamples number of samples in each vector
deciPoint decimal point for right shift
realResult real part of the result returned here
imagResult imaginary part of the result returned here

Return:

none

function plp_cmplx_mult_real_f32

void plp_cmplx_mult_real_f32(
    const float32_t *__restrict__ pSrcCmplx,
    const float32_t *__restrict__ pSrcReal,
    float32_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex multiplied with real of 32-bit float vectors.

Parameters:

pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none

function plp_cmplx_mult_real_f32_xpulpv2

void plp_cmplx_mult_real_f32_xpulpv2(
    const float32_t *__restrict__ pSrcCmplx,
    const float32_t *__restrict__ pSrcReal,
    float32_t *__restrict__ pDst,
    uint32_t numSamples
)

Floating-point complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

Floating-point complex multiplied with real.

function plp_cmplx_mult_real_i32

void plp_cmplx_mult_real_i32(
    const int32_t *__restrict__ pSrcCmplx,
    const int32_t *__restrict__ pSrcReal,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex multiplied with real of 32-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied with real of 32-bit integer vectors.

function plp_cmplx_mult_real_i32_xpulpv2

void plp_cmplx_mult_real_i32_xpulpv2(
    const int32_t *__restrict__ pSrcCmplx,
    const int32_t *__restrict__ pSrcReal,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

32-bit integer complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

32-bit integer complex multiplied with real.

function plp_cmplx_mult_real_i32_rv32im

void plp_cmplx_mult_real_i32_rv32im(
    const int32_t *__restrict__ pSrcCmplx,
    const int32_t *__restrict__ pSrcReal,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

32-bit integer complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

32-bit integer complex multiplied with real.

function plp_cmplx_mult_real_i16

void plp_cmplx_mult_real_i16(
    const int16_t *__restrict__ pSrcCmplx,
    const int16_t *__restrict__ pSrcReal,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex multiplied with real of 16-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied with real of 16-bit integer vectors.

function plp_cmplx_mult_real_i16_xpulpv2

void plp_cmplx_mult_real_i16_xpulpv2(
    const int16_t *__restrict__ pSrcCmplx,
    const int16_t *__restrict__ pSrcReal,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

16-bit integer complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

16-bit integer complex multiplied with real.

function plp_cmplx_mult_real_i16_rv32im

void plp_cmplx_mult_real_i16_rv32im(
    const int16_t *__restrict__ pSrcCmplx,
    const int16_t *__restrict__ pSrcReal,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

16-bit integer complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

16-bit integer complex multiplied with real.

function plp_cmplx_mult_real_i8

void plp_cmplx_mult_real_i8(
    const int8_t *__restrict__ pSrcCmplx,
    const int8_t *__restrict__ pSrcReal,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex multiplied with real of 8-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied with real of 8-bit integer vectors.

function plp_cmplx_mult_real_i8_xpulpv2

void plp_cmplx_mult_real_i8_xpulpv2(
    const int8_t *__restrict__ pSrcCmplx,
    const int8_t *__restrict__ pSrcReal,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

8-bit integer complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

8-bit integer complex multiplied with real.

function plp_cmplx_mult_real_i8_rv32im

void plp_cmplx_mult_real_i8_rv32im(
    const int8_t *__restrict__ pSrcCmplx,
    const int8_t *__restrict__ pSrcReal,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

8-bit integer complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

8-bit integer complex multiplied with real.

function plp_cmplx_mult_real_q32

void plp_cmplx_mult_real_q32(
    const int32_t *__restrict__ pSrcCmplx,
    const int32_t *__restrict__ pSrcReal,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

Glue code for complex multiplied with real of 32-bit fixed-point vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied with real of 32-bit fixed-point vectors.

function plp_cmplx_mult_real_q32_xpulpv2

void plp_cmplx_mult_real_q32_xpulpv2(
    const int32_t *__restrict__ pSrcCmplx,
    const int32_t *__restrict__ pSrcReal,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

32-bit fixed-point complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

32-bit fixed-point complex multiplied with real.

function plp_cmplx_mult_real_q32_rv32im

void plp_cmplx_mult_real_q32_rv32im(
    const int32_t *__restrict__ pSrcCmplx,
    const int32_t *__restrict__ pSrcReal,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

32-bit fixed-point complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

32-bit fixed-point complex multiplied with real.

function plp_cmplx_mult_real_q16

void plp_cmplx_mult_real_q16(
    const int16_t *__restrict__ pSrcCmplx,
    const int16_t *__restrict__ pSrcReal,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

Glue code for complex multiplied with real of 16-bit fixed-point vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied with real of 16-bit fixed-point vectors.

function plp_cmplx_mult_real_q16_xpulpv2

void plp_cmplx_mult_real_q16_xpulpv2(
    const int16_t *__restrict__ pSrcCmplx,
    const int16_t *__restrict__ pSrcReal,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

16-bit fixed-point complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

16-bit fixed-point complex multiplied with real.

function plp_cmplx_mult_real_q16_rv32im

void plp_cmplx_mult_real_q16_rv32im(
    const int16_t *__restrict__ pSrcCmplx,
    const int16_t *__restrict__ pSrcReal,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

16-bit fixed-point complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

16-bit fixed-point complex multiplied with real.

function plp_cmplx_mult_real_q8

void plp_cmplx_mult_real_q8(
    const int8_t *__restrict__ pSrcCmplx,
    const int8_t *__restrict__ pSrcReal,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

Glue code for complex multiplied with real of 8-bit fixed-point vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied with real of 8-bit fixed-point vectors.

function plp_cmplx_mult_real_q8_xpulpv2

void plp_cmplx_mult_real_q8_xpulpv2(
    const int8_t *__restrict__ pSrcCmplx,
    const int8_t *__restrict__ pSrcReal,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

8-bit fixed-point complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

8-bit fixed-point complex multiplied with real.

function plp_cmplx_mult_real_q8_rv32im

void plp_cmplx_mult_real_q8_rv32im(
    const int8_t *__restrict__ pSrcCmplx,
    const int8_t *__restrict__ pSrcReal,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

8-bit fixed-point complex multiplied with real.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcCmplx points to complex input vector
pSrcReal points to real input vector
pDst points to complex output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

8-bit fixed-point complex multiplied with real.

function plp_cmplx_mag_squared_f32

void plp_cmplx_mag_squared_f32(
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex squared magnitude of 32-bit float vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex squared magnitude of 32-bit float vectors.

function plp_cmplx_mag_squared_f32_xpulpv2

void plp_cmplx_mag_squared_f32_xpulpv2(
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    uint32_t numSamples
)

Floating-point complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

Floating-point complex squared magnitude.

function plp_cmplx_mag_squared_i16

void plp_cmplx_mag_squared_i16(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex squared magnitude of 16-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex squared magnitude of 16-bit integer vectors.

function plp_cmplx_mag_squared_i16_rv32im

void plp_cmplx_mag_squared_i16_rv32im(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

16-bit integer complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

16-bit integer complex squared magnitude.

function plp_cmplx_mag_squared_i16_xpulpv2

void plp_cmplx_mag_squared_i16_xpulpv2(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

16-bit integer complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

16-bit integer complex squared magnitude.

function plp_cmplx_mag_squared_i32

void plp_cmplx_mag_squared_i32(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex squared magnitude of 32-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex squared magnitude of 32-bit integer vectors.

function plp_cmplx_mag_squared_i32_rv32im

void plp_cmplx_mag_squared_i32_rv32im(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

32-bit integer complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

32-bit integer complex squared magnitude.

function plp_cmplx_mag_squared_i32_xpulpv2

void plp_cmplx_mag_squared_i32_xpulpv2(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

32-bit integer complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

32-bit integer complex squared magnitude.

function plp_cmplx_mag_squared_i8_xpulpv2

void plp_cmplx_mag_squared_i8_xpulpv2(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

8-bit integer complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

8-bit integer complex squared magnitude.

function plp_cmplx_mag_squared_i8

void plp_cmplx_mag_squared_i8(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex squared magnitude of 8-bit integer vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex squared magnitude of 8-bit integer vectors.

function plp_cmplx_mag_squared_i8_rv32im

void plp_cmplx_mag_squared_i8_rv32im(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

8-bit integer complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

8-bit integer complex squared magnitude.

function plp_cmplx_mag_squared_q32

void plp_cmplx_mag_squared_q32(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

Glue code for complex squared magnitude of 32-bit fixed-point vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

Glue code for complex squared magnitude of 32-bit fixed-point vectors.

function plp_cmplx_mag_squared_q32_rv32im

void plp_cmplx_mag_squared_q32_rv32im(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

32-bit fixed-point complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

32-bit fixed-point complex squared magnitude.

function plp_cmplx_mag_squared_q32_xpulpv2

void plp_cmplx_mag_squared_q32_xpulpv2(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

32-bit fixed-point complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

32-bit fixed-point complex squared magnitude.

function plp_cmplx_mag_squared_q16

void plp_cmplx_mag_squared_q16(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

Glue code for complex squared magnitude of 16-bit fixed-point vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

Glue code for complex squared magnitude of 16-bit fixed-point vectors.

function plp_cmplx_mag_squared_q16_rv32im

void plp_cmplx_mag_squared_q16_rv32im(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

16-bit fixed-point complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

16-bit fixed-point complex squared magnitude.

function plp_cmplx_mag_squared_q16_xpulpv2

void plp_cmplx_mag_squared_q16_xpulpv2(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

16-bit fixed-point complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

16-bit fixed-point complex squared magnitude.

function plp_cmplx_mag_squared_q8

void plp_cmplx_mag_squared_q8(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

Glue code for complex squared magnitude of 8-bit fixed-point vectors.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

Glue code for complex squared magnitude of 8-bit fixed-point vectors.

function plp_cmplx_mag_squared_q8_rv32im

void plp_cmplx_mag_squared_q8_rv32im(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

8-bit fixed-point complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

8-bit fixed-point complex squared magnitude.

function plp_cmplx_mag_squared_q8_xpulpv2

void plp_cmplx_mag_squared_q8_xpulpv2(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

8-bit fixed-point complex squared magnitude.

Parameters:

pSrc points to the input vector
pDst points to the output vector
numSamples number of samples in each vector
pSrc points to input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

8-bit fixed-point complex squared magnitude.

function plp_cmplx_mult_cmplx_f32

void plp_cmplx_mult_cmplx_f32(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    float32_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex multiplied by complex of 32-bit float vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied by complex of 32-bit float vectors.

function plp_cmplx_mult_cmplx_f32_xpulpv2

void plp_cmplx_mult_cmplx_f32_xpulpv2(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    float32_t *__restrict__ pDst,
    uint32_t numSamples
)

Floating-point complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first complex input vector
pSrcB points to second complex input vector
pDst points to complex output vector
numSamples number of samples in each vector

Return:

none
none

Floating-point complex multiplied by complex.

function plp_cmplx_mult_cmplx_i32

void plp_cmplx_mult_cmplx_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex multiplied by complex of 32-bit integer vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied by complex of 32-bit integer vectors.

function plp_cmplx_mult_cmplx_i32_xpulpv2

void plp_cmplx_mult_cmplx_i32_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

32-bit integer complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

32-bit integer complex multiplied by complex.

function plp_cmplx_mult_cmplx_i32_rv32im

void plp_cmplx_mult_cmplx_i32_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t numSamples
)

32-bit integer complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

32-bit integer complex multiplied by complex.

function plp_cmplx_mult_cmplx_i16

void plp_cmplx_mult_cmplx_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex multiplied by complex of 16-bit integer vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied by complex of 16-bit integer vectors.

function plp_cmplx_mult_cmplx_i16_xpulpv2

void plp_cmplx_mult_cmplx_i16_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

16-bit integer complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

16-bit integer complex multiplied by complex.

function plp_cmplx_mult_cmplx_i16_rv32im

void plp_cmplx_mult_cmplx_i16_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t numSamples
)

16-bit integer complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

16-bit integer complex multiplied by complex.

function plp_cmplx_mult_cmplx_i8

void plp_cmplx_mult_cmplx_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

Glue code for complex multiplied by complex of 8-bit integer vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied by complex of 8-bit integer vectors.

function plp_cmplx_mult_cmplx_i8_xpulpv2

void plp_cmplx_mult_cmplx_i8_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

8-bit integer complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

8-bit integer complex multiplied by complex.

function plp_cmplx_mult_cmplx_i8_rv32im

void plp_cmplx_mult_cmplx_i8_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t numSamples
)

8-bit integer complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
numSamples number of samples in each vector

Return:

none
none

8-bit integer complex multiplied by complex.

function plp_cmplx_mult_cmplx_q32

void plp_cmplx_mult_cmplx_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

Glue code for complex multiplied by complex of 32-bit fixed-point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied by complex of 32-bit fixed-point vectors.

function plp_cmplx_mult_cmplx_q32_xpulpv2

void plp_cmplx_mult_cmplx_q32_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

32-bit fixed-point complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

32-bit fixed-point complex multiplied by complex.

function plp_cmplx_mult_cmplx_q32_rv32im

void plp_cmplx_mult_cmplx_q32_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

32-bit fixed-point complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

32-bit fixed-point complex multiplied by complex.

function plp_cmplx_mult_cmplx_q16

void plp_cmplx_mult_cmplx_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

Glue code for complex multiplied by complex of 16-bit fixed-point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied by complex of 16-bit fixed-point vectors.

function plp_cmplx_mult_cmplx_q16_xpulpv2

void plp_cmplx_mult_cmplx_q16_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

16-bit fixed-point complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

16-bit fixed-point complex multiplied by complex.

function plp_cmplx_mult_cmplx_q16_rv32im

void plp_cmplx_mult_cmplx_q16_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

16-bit fixed-point complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

16-bit fixed-point complex multiplied by complex.

function plp_cmplx_mult_cmplx_q8

void plp_cmplx_mult_cmplx_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

Glue code for complex multiplied by complex of 8-bit fixed-point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

Glue code for complex multiplied by complex of 8-bit fixed-point vectors.

function plp_cmplx_mult_cmplx_q8_xpulpv2

void plp_cmplx_mult_cmplx_q8_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

8-bit fixed-point complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

8-bit fixed-point complex multiplied by complex.

function plp_cmplx_mult_cmplx_q8_rv32im

void plp_cmplx_mult_cmplx_q8_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples
)

8-bit fixed-point complex multiplied by complex.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second vector
pDst points to the output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector
pSrcA points to first input vector
pSrcB points to second input vector
pDst points to output vector
deciPoint decimal point for right shift
numSamples number of samples in each vector

Return:

none
none

8-bit fixed-point complex multiplied by complex.

function plp_euclidean_distance_q32_parallel

void plp_euclidean_distance_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    uint32_t nPE,
    uint32_t *__restrict__ pRes
)

Glue code for parallel Euclidean distance of 32-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
nPE number of parallel processing units
pRes output result returned here

Return: none

function plp_euclidean_distance_f32_parallel

void plp_euclidean_distance_f32_parallel(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t nPE,
    float32_t *__restrict__ pRes
)

Glue code for parallel Euclidean distance between 32-bit float vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
nPE number of parallel processing units
pRes output result returned here

Return: none

function plp_euclidean_distance_q32p_xpulpv2

void plp_euclidean_distance_q32p_xpulpv2(
    void * S
)

Parallel Euclidean distance with interleaved access of 32-bit fixed point vectors; kernel for XPULPV2 extension.

Parameters:

S points to the instance structure for integer parallel Euclidean distance
S points to the instance structure for integer parallel Euclidean distance

Return:

none
none

function plp_euclidean_distance_f32p_xpulpv2

void plp_euclidean_distance_f32p_xpulpv2(
    void * S
)

32-bit floating-point parallel Euclidean distance between two vectors

Parameters:

S points to the instance structure for float euclidean distance

Return: none

function plp_euclidean_distance_q32

void plp_euclidean_distance_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for euclidean distance of 32-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
pRes output result returned here

Return: none

function plp_euclidean_distance_q32s_xpulpv2

void plp_euclidean_distance_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Euclidean distance of 32-bit fixed point vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
pRes output result returned here

Return: none

function plp_euclidean_distance_q32s_rv32im

void plp_euclidean_distance_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Euclidean distance of 32-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
pRes output result returned here

Return: none

function plp_euclidean_distance_q16

void plp_euclidean_distance_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint16_t blockSize,
    uint16_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for euclidean distance of 16-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
pRes output result returned here

Return: none

function plp_euclidean_distance_q16s_xpulpv2

void plp_euclidean_distance_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t deciPoint,
    int32_t *__restrict__ pRes
)

Euclidean distance of 16-bit fixed point vectors kernel for XPULPV2.

Parameters:

pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
deciPoint decimal point for right shift
pRes output result returned here [32 bit]

Return: none

Par: Exploiting SIMD instructions

The 16 bit values are packed two by two into 32 bit vectors and then the sums and products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.

function plp_euclidean_distance_q16s_rv32im

void plp_euclidean_distance_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Euclidean distance of 16-bit fixed point vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
fracBits decimal point for right shift
pRes output result returned here [32 bit]

Return: none

Par: Exploiting SIMD instructions

When the ISA supports, the 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).

function plp_euclidean_distance_f32

void plp_euclidean_distance_f32(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes
)

Glue code for Euclidean distance between 32-bit float vectors.

Parameters:

pSrcA First vector
pSrcB Second vector
blockSize vector length
pRes output result returned here

Return: none

function plp_euclidean_distance_f32s_xpulpv2

void plp_euclidean_distance_f32s_xpulpv2(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes
)

32-bit floating point Euclidean distance between two vectors

Parameters:

pSrcA First vector
pSrcB Second vector
blockSize vector length
pRes output result returned here

Return: none

function plp_euclidean_distance_f32s_rv32im

void plp_euclidean_distance_f32s_rv32im(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes
)

32-bit floating point Euclidean distance between two vectors

Parameters:

pSrcA First vector
pSrcB Second vector
blockSize vector length
pRes output result returned here

Return: none

function plp_cosine_distance_q32_parallel

void plp_cosine_distance_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    uint32_t nPE,
    int32_t *__restrict__ pRes
)

Glue code for parallel cosine distance between 32-bit fixed-precision vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
nPE number of parallel processing units
pRes output result returned here
pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
nPE number of parallel processing units
pRes output result returned here

Return:

none
none

Glue code for parallel cosine distance between 32-bit fixed-precision vectors.

function plp_cosine_distance_f32_parallel

void plp_cosine_distance_f32_parallel(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t nPE,
    float32_t *__restrict__ pRes
)

Glue code for parallel cosine distance between 32-bit float vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
nPE number of parallel processing units
pRes output result returned here

Return: none

function plp_cosine_distance_f32p_xpulpv2

void plp_cosine_distance_f32p_xpulpv2(
    void * S
)

32-bit floating-point parallel cosine distance between two vectors (computes power in parallel)

Parameters:

S points to the instance structure for float cosine distance

Return: none

function plp_cosine_distance_f32

void plp_cosine_distance_f32(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes
)

Glue code for cosine distance between 32-bit float vectors.

Parameters:

pSrcA First vector
pSrcB Second vector
blockSize vector length
pSrcA First vector
pSrcB Second vector
blockSize vector length

Return:

none
none

Glue code for cosine distance between 32-bit float vectors.

function plp_cosine_distance_f32s_rv32im

void plp_cosine_distance_f32s_rv32im(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes
)

32-bit floating point cosine distance between two vectors

Parameters:

pSrcA First vector
pSrcB Second vector
blockSize vector length
pRes output result returned here

Return: none

function plp_cosine_distance_f32s_xpulpv2

void plp_cosine_distance_f32s_xpulpv2(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes
)

32-bit floating point cosine distance between two vectors

Parameters:

pSrcA First vector
pSrcB Second vector
blockSize vector length
pRes output result returned here

Return: none

function plp_cosine_distance_q32

void plp_cosine_distance_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for cosine distance of 32-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
pRes output result returned here

Return: none

function plp_cosine_distance_q32s_rv32im

void plp_cosine_distance_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

cosine distance of 32-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
pRes output result returned here
pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed-point fractional bits
pRes output result returned here

Return:

none
none

function plp_cosine_distance_q32s_xpulpv2

void plp_cosine_distance_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

cosine distance of 32-bit fixed point vectors kernel for XPULPV2 extension.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
pRes output result returned here
pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
pRes output result returned here

Return:

none
none

cosine distance of 32-bit fixed point vectors kernel for XPULPV2 extension.

function plp_cosine_distance_q16

void plp_cosine_distance_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint16_t blockSize,
    uint16_t fracBits,
    int32_t *__restrict__ pRes
)

Glue code for cosine distance of 16-bit fixed point vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
fracBits number of fixed point fractional bits
pRes output result returned here

Return: none

function plp_cosine_distance_q16s_rv32im

void plp_cosine_distance_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

cosine distance of 16-bit fixed point vectors kernel for RV32IM extension.

Parameters:

pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
fracBits decimal point for right shift
pRes output result returned here [32 bit]

Return: none

Par: Exploiting SIMD instructions

When the ISA supports, the 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).

function plp_cosine_distance_q16s_xpulpv2

void plp_cosine_distance_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes
)

cosine distance of 16-bit fixed point vectors kernel for XPULPV2.

Parameters:

pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
fracBits decimal point for right shift
pRes output result returned here [32 bit]
pSrcA points to the first input vector [16 bit]
pSrcB points to the second input vector [16 bit]
blockSize number of samples in each vector
fracBits decimal point for right shift
pRes output result returned here [32 bit]

Return:

none
none

Par:

Exploiting SIMD instructions

The 16 bit values are packed two by two into 32 bit vectors and then the sums and products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.

Exploiting SIMD instructions

The 16 bit values are packed two by two into 32 bit vectors and then the sums and products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.

cosine distance of 16-bit fixed point vectors kernel for XPULPV2.

Macros Documentation

define PLP_MATH_IBEX

#define PLP_MATH_IBEX

define PLP_MATH_LOOPUNROLL

#define PLP_MATH_LOOPUNROLL

define PLP_DWT_DEC_LEN

#define PLP_DWT_DEC_LEN(
    SIG_LEN,
    WAVELET,
    LEVEL
)
plp_dwt_dec_len(SIG_LEN, WAVELET.length, LEVEL)

define PLP_DWT_DEC_TEMP_LEN

#define PLP_DWT_DEC_TEMP_LEN(
    SRC_LEN,
    WAVELET_LEN
)
(((SRC_LEN+WAVELET_LEN-1)/2 + ((SRC_LEN+WAVELET_LEN-1)/2 + WAVELET_LEN-1))/2)

define PLP_DWT_OUTPUT_LENGTH

#define PLP_DWT_OUTPUT_LENGTH(
    SIG_LEN,
    WAVELET_LEN
)
((SIG_LEN + WAVELET_LEN - 1) >> 1)

define FAST_MATH_TABLE_SIZE

#define FAST_MATH_TABLE_SIZE 512

Glue code for square root of a 32-bit floating point number.

Parameters:

pSrc points to the input vector
pRes Square root returned here

Return: none

Macros required for SINE and COSINE Fast math approximations

define FAST_MATH_Q32_SHIFT

#define FAST_MATH_Q32_SHIFT (32 - 10)

define FAST_MATH_Q16_SHIFT

#define FAST_MATH_Q16_SHIFT (16 - 10)

define CONTROLLER_Q32_SHIFT

#define CONTROLLER_Q32_SHIFT (32 - 9)

define TABLE_SPACING_Q32

#define TABLE_SPACING_Q32 0x400000

define TABLE_SPACING_Q16

#define TABLE_SPACING_Q16 0x80

Source code


/*
 * Copyright (C) 2019 ETH Zurich and University of Bologna. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __PLP_MATH_H__
#define __PLP_MATH_H__

#include "math.h"
#include "rtos_hal.h"

typedef float float32_t;

#define PLP_MATH_IBEX // previously called zero-riscy
//#define PLP_MATH_RISCY
#define PLP_MATH_LOOPUNROLL

/**
 * @brief Instance structure for integer parallel dot product.
 *
 * Groups the operands, per-core work size and result buffer of a parallel
 * 32-bit integer dot product into one object.
 */
typedef struct {
    int32_t *pSrcA;     // pointer to the first input vector
    int32_t *pSrcB;     // pointer to the second input vector
    uint32_t blkSizePE; // number of samples in each vector (name suggests a per-PE share — TODO confirm)
    uint32_t nPE;       // number of processing units
    int32_t *resBuffer; // pointer to the result vector
} plp_dot_prod_instance_i32;

/**
 * @brief Instance structure for fixed point parallel dot product.
 *
 * Same layout as the integer variant with an additional decimal point used
 * for the fixed-point right shift.
 */
typedef struct {
    int32_t *pSrcA;     // pointer to the first input vector
    int32_t *pSrcB;     // pointer to the second input vector
    uint32_t blkSizePE; // number of samples in each vector (name suggests a per-PE share — TODO confirm)
    uint32_t deciPoint; // decimal point for right shift
    uint32_t nPE;       // number of processing units
    int32_t *resBuffer; // pointer to the result vector
} plp_dot_prod_instance_q32;

/**
 * @brief Instance structure for float parallel dot product.
 *
 * Both inputs are read-only; only the result buffer is writable.
 */
typedef struct {
    const float32_t *pSrcA; // pointer to the first input vector
    const float32_t *pSrcB; // pointer to the second input vector
    uint32_t blkSizePE;     // number of samples in each vector (name suggests a per-PE share — TODO confirm)
    uint32_t nPE;           // number of processing units
    float32_t *resBuffer;   // pointer to the result vector
} plp_dot_prod_instance_f32;

/**
 * @brief Instance structure for float parallel multiplication.
 *
 * Holds two read-only input vectors and the destination vector.
 */
typedef struct {
    const float32_t *pSrcA; // pointer to the first input vector
    const float32_t *pSrcB; // pointer to the second input vector
    uint32_t blkSizePE;     // number of samples in each vector (name suggests a per-PE share — TODO confirm)
    uint32_t nPE;           // number of processing units
    float32_t *pDst;        // pointer to the result vector
} plp_mult_instance_f32;

/**
 * @brief Instance structure for float parallel log.
 *
 * Single read-only input vector and one destination vector.
 */
typedef struct {
    const float32_t *pSrc; // pointer to the input vector
    uint32_t blkSizePE;    // number of samples in the vector (name suggests a per-PE share — TODO confirm)
    uint32_t nPE;          // number of processing units
    float32_t *pDst;       // pointer to the result vector
} plp_log_instance_f32;

/**
 * @brief Instance structure for basic integer convolution (32-bit inputs).
 *
 * Each input vector carries its own length field; the result is 32-bit.
 */
typedef struct {
    const int32_t *pSrcA; // pointer to the first input vector
    uint32_t srcALen;     // length of the first vector (per field name)
    const int32_t *pSrcB; // pointer to the second input vector
    uint32_t srcBLen;     // length of the second vector (per field name)
    uint8_t nPE;          // number of processing units
    int32_t *pRes;        // pointer to the result vector
} plp_conv_instance_i32;

/**
 * @brief Instance structure for basic integer convolution (16-bit inputs).
 *
 * Inputs are 16-bit while the result vector is 32-bit.
 */
typedef struct {
    const int16_t *pSrcA; // pointer to the first input vector
    uint32_t srcALen;     // length of the first vector (per field name)
    const int16_t *pSrcB; // pointer to the second input vector
    uint32_t srcBLen;     // length of the second vector (per field name)
    uint8_t nPE;          // number of processing units
    int32_t *pRes;        // pointer to the result vector
} plp_conv_instance_i16;

/**
 * @brief Instance structure for basic integer convolution (8-bit inputs).
 *
 * Inputs are 8-bit while the result vector is 32-bit.
 */
typedef struct {
    const int8_t *pSrcA; // pointer to the first input vector
    uint32_t srcALen;    // length of the first vector (per field name)
    const int8_t *pSrcB; // pointer to the second input vector
    uint32_t srcBLen;    // length of the second vector (per field name)
    uint8_t nPE;         // number of processing units
    int32_t *pRes;       // pointer to the result vector
} plp_conv_instance_i8;

/**
 * @brief Instance structure used by the convolution code for a tree-style add.
 *
 * NOTE(review): the field semantics are not documented in this header; the
 * descriptions below are derived from the field names only and must be
 * confirmed against the kernel that consumes this structure.
 */
typedef struct {
    uint32_t addOffset;       // presumably offset between the vectors being added — TODO confirm
    uint32_t addLengthfirst;  // presumably length of the first addend — TODO confirm
    uint32_t addLengthsecond; // presumably length of the second addend — TODO confirm
    uint32_t numVectors;      // presumably number of partial-result vectors — TODO confirm
    uint32_t blockOffset;     // presumably offset between result blocks — TODO confirm
    int32_t *pRes;            // pointer to the 32-bit result data
    uint8_t coresPerVector;   // presumably cores assigned to each vector — TODO confirm
} plp_conv_tree_add_instance;

/**
 * @brief Instance structure for the fixed-point CFFT/CIFFT function (Q16).
 *
 * Describes one FFT configuration: its length plus the twiddle-factor and
 * bit-reversal tables that belong to it.
 */
typedef struct {
    uint16_t fftLen;             /**< length of the FFT. */
    const int16_t *pTwiddle;     /**< points to the Twiddle factor table. */
    const int16_t *pBitRevTable; /**< points to the bit reversal table. */
    uint16_t bitRevLength;       /**< bit reversal table length. */
} plp_cfft_instance_q16;

/**
 * @brief Instance structure for the parallel CFFT Q16.
 *
 * Wraps a plp_cfft_instance_q16 together with the data pointer and the
 * per-call options of a parallel run.
 */
typedef struct {
    plp_cfft_instance_q16 *S; // FFT configuration (length and tables)
    int16_t *p1;              // 16-bit data buffer — presumably the complex in/out samples; TODO confirm
    uint8_t ifftFlag;         // presumably selects forward vs. inverse transform — TODO confirm
    uint8_t bitReverseFlag;   // presumably enables output bit reversal — TODO confirm
    uint32_t deciPoint;       // decimal point of the fixed-point format
    uint32_t nPE;             // number of processing units
} plp_cfft_instance_q16_parallel;

/**
 * @brief Instance structure for the fixed-point CFFT/CIFFT function (Q32).
 *
 * Twiddle factors are 32-bit, while the bit-reversal table stays 16-bit.
 */
typedef struct {
    uint16_t fftLen;             /**< length of the FFT. */
    const int32_t *pTwiddle;     /**< points to the Twiddle factor table. */
    const int16_t *pBitRevTable; /**< points to the bit reversal table. */
    uint16_t bitRevLength;       /**< bit reversal table length. */
} plp_cfft_instance_q32;

/**
 * @brief Instance structure for the parallel CFFT Q32.
 *
 * NOTE(review): the published brief for this type reads "parallel CFFT Q16",
 * but the members reference plp_cfft_instance_q32 and int32_t data, so it
 * looks like a copy-paste slip in the original documentation.
 */
typedef struct {
    plp_cfft_instance_q32 *S; // FFT configuration (length and tables)
    int32_t *p1;              // 32-bit data buffer — presumably the complex in/out samples; TODO confirm
    uint8_t ifftFlag;         // presumably selects forward vs. inverse transform — TODO confirm
    uint8_t bitReverseFlag;   // presumably enables output bit reversal — TODO confirm
    uint32_t fracBits;        // number of fixed point fractional bits
    uint32_t nPE;             // number of processing units
} plp_cfft_instance_q32_parallel;


/**
 * @brief Instance structure for the floating-point CFFT/CIFFT function.
 *
 * Unlike the fixed-point variants, fftLen is 32-bit and the bit-reversal
 * table is unsigned.
 */
typedef struct {
    uint32_t fftLen;              // length of the FFT
    const float32_t *pTwiddle;    // points to the twiddle factor table
    const uint16_t *pBitRevTable; // points to the bit reversal table
    uint16_t bitRevLength;        // bit reversal table length
} plp_cfft_instance_f32;

/**
 * @brief Instance structure for floating-point FFT (parallel version).
 *
 * nPE is const-qualified, so it must be set when the instance is initialized.
 */
typedef struct {
    plp_cfft_instance_f32 *S; // FFT configuration (length and tables)
    const float32_t *pSrc;    // pointer to the float32 data (declared read-only here)
    uint8_t ifftFlag;         // presumably selects forward vs. inverse transform — TODO confirm
    uint8_t bitReverseFlag;   // presumably enables output bit reversal — TODO confirm
    const uint32_t nPE;       // number of processing units
} plp_cfft_instance_f32_parallel;

/**
 * @brief Instance structure for floating-point FFT.
 *
 * Carries the transform length, the bit-reversal option and the two lookup
 * tables used by the transform.
 */
typedef struct {
    uint32_t FFTLength;               // length of the FFT
    uint8_t bitReverseFlag;           // presumably enables output bit reversal — TODO confirm
    const float32_t *pTwiddleFactors; // points to the twiddle factor table
    const uint16_t *pBitReverseLUT;   // points to the bit reversal lookup table
} plp_fft_instance_f32;

/**
 * @brief Instance structure for floating-point FFT (fast/RFFT variant, per field names).
 *
 * Builds on an inner complex FFT instance and adds an RFFT length and
 * twiddle table.
 */
typedef struct {
    plp_cfft_instance_f32 *Sint;          // inner complex FFT instance
    uint32_t FFTLengthRFFT;               // length of the real FFT (per field name)
    const float32_t *pTwiddleFactorsRFFT; // twiddle factor table for the real FFT stage (per field name)
} plp_fft_fast_instance_f32;

/**
 * @brief Instance structure for floating-point FFT (parallel wrapper around
 *        plp_fft_fast_instance_f32).
 *
 * pSrc and pDst are both restrict-qualified, so they must not alias.
 * nPE is const-qualified, so it must be set when the instance is initialized.
 */
typedef struct {
    plp_fft_fast_instance_f32* S; // underlying fast FFT instance
    float32_t *__restrict__ pSrc; // source buffer (not const — may be modified; TODO confirm)
    float32_t *__restrict__ pDst; // destination buffer
    const uint32_t nPE;           // number of processing units
} plp_fft_fast_instance_f32_parallel;


/**
 * @brief Instance structure for floating-point FFT (parallel version).
 *
 * nPE is const-qualified, so it must be set when the instance is initialized.
 */
typedef struct {
    plp_fft_instance_f32 *S; // underlying FFT instance
    const float32_t *pSrc;   // read-only source buffer
    const uint32_t nPE;      // number of processing units
    float32_t *pDst;         // destination buffer
} plp_fft_instance_f32_parallel;

/**
 * @brief Structure containing non-zero values of triangular filterbanks.
 *
 * NOTE(review): the per-field descriptions are derived from the field names;
 * confirm the exact layout of V against the code that generates the filters.
 */
typedef struct {
    const float32_t *V;           // non-zero filter values
    const uint16_t *firstValue;   // presumably index of the first non-zero value of each filter — TODO confirm
    const uint16_t *filterLength; // presumably number of non-zero values of each filter — TODO confirm
    const uint8_t nFilters;       // number of filters (per field name)
} plp_triangular_filter_f32;

/**
 * @brief Helper type to represent complex values with float32 components.
 */
typedef struct {
    float32_t re; // real part
    float32_t im; // imaginary part
} Complex_type_f32;

/**
 * @brief Instance structure for integer parallel matrix multiplication
 *        (8-bit inputs, 32-bit output).
 *
 * The three pointers are restrict-qualified, so the matrices must not alias.
 */
typedef struct {
    const int8_t *__restrict__ pSrcA; // first input matrix
    const int8_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                       // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                       // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                       // matrix dimension — presumably cols of B; TODO confirm
    uint32_t nPE;                     // number of processing units
    int32_t *__restrict__ pDstC;      // output matrix
} plp_mat_mult_instance_i8;

/**
 * @brief Instance structure for integer parallel matrix multiplication
 *        (16-bit inputs, 32-bit output).
 *
 * The three pointers are restrict-qualified, so the matrices must not alias.
 */
typedef struct {
    const int16_t *__restrict__ pSrcA; // first input matrix
    const int16_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                        // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                        // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                        // matrix dimension — presumably cols of B; TODO confirm
    uint32_t nPE;                      // number of processing units
    int32_t *__restrict__ pDstC;       // output matrix
} plp_mat_mult_instance_i16;

/**
 * @brief Instance structure for integer parallel matrix multiplication
 *        (32-bit inputs, 32-bit output).
 *
 * The three pointers are restrict-qualified, so the matrices must not alias.
 */
typedef struct {
    const int32_t *__restrict__ pSrcA; // first input matrix
    const int32_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                        // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                        // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                        // matrix dimension — presumably cols of B; TODO confirm
    uint32_t nPE;                      // number of processing units
    int32_t *__restrict__ pDstC;       // output matrix
} plp_mat_mult_instance_i32;

/**
 * @brief Instance structure for float parallel matrix multiplication.
 *
 * The three pointers are restrict-qualified, so the matrices must not alias.
 */
typedef struct {
    const float *__restrict__ pSrcA; // first input matrix
    const float *__restrict__ pSrcB; // second input matrix
    uint32_t M;                      // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                      // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                      // matrix dimension — presumably cols of B; TODO confirm
    uint32_t nPE;                    // number of processing units
    float *__restrict__ pDstC;       // output matrix
} plp_mat_mult_instance_f32;

/**
 * @brief Instance structure for 8-bit fixed-point parallel matrix
 *        multiplication (per type name).
 *
 * Inputs and output share the same 8-bit element type; the extra shift field
 * distinguishes this from the integer variant.
 */
typedef struct {
    const int8_t *__restrict__ pSrcA; // first input matrix
    const int8_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                       // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                       // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                       // matrix dimension — presumably cols of B; TODO confirm
    uint32_t shift;                   // presumably right shift applied to each result — TODO confirm
    uint32_t nPE;                     // number of processing units
    int8_t *__restrict__ pDstC;       // output matrix
} plp_mat_mult_instance_q8;

/**
 * @brief Instance structure for 16-bit fixed-point parallel matrix
 *        multiplication (per type name).
 *
 * Inputs and output share the same 16-bit element type; the extra shift field
 * distinguishes this from the integer variant.
 */
typedef struct {
    const int16_t *__restrict__ pSrcA; // first input matrix
    const int16_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                        // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                        // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                        // matrix dimension — presumably cols of B; TODO confirm
    uint32_t shift;                    // presumably right shift applied to each result — TODO confirm
    uint32_t nPE;                      // number of processing units
    int16_t *__restrict__ pDstC;       // output matrix
} plp_mat_mult_instance_q16;

/**
 * @brief Instance structure for 32-bit fixed-point parallel matrix
 *        multiplication (per type name).
 *
 * Inputs and output share the same 32-bit element type; the extra shift field
 * distinguishes this from the integer variant.
 */
typedef struct {
    const int32_t *__restrict__ pSrcA; // first input matrix
    const int32_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                        // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                        // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                        // matrix dimension — presumably cols of B; TODO confirm
    uint32_t shift;                    // presumably right shift applied to each result — TODO confirm
    uint32_t nPE;                      // number of processing units
    int32_t *__restrict__ pDstC;       // output matrix
} plp_mat_mult_instance_q32;

/**
 * @brief Instance structure for parallel complex matrix multiplication,
 *        8-bit integer inputs and 32-bit output (per type name).
 *
 * NOTE(review): the storage layout of the complex elements is not visible
 * here — confirm against the kernel.
 */
typedef struct {
    const int8_t *__restrict__ pSrcA; // first input matrix
    const int8_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                       // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                       // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                       // matrix dimension — presumably cols of B; TODO confirm
    uint32_t nPE;                     // number of processing units
    int32_t *__restrict__ pDstC;      // output matrix
} plp_mat_mult_cmplx_instance_i8;

/**
 * @brief Instance structure for parallel complex matrix multiplication,
 *        16-bit integer inputs and 32-bit output (per type name).
 *
 * NOTE(review): the storage layout of the complex elements is not visible
 * here — confirm against the kernel.
 */
typedef struct {
    const int16_t *__restrict__ pSrcA; // first input matrix
    const int16_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                        // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                        // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                        // matrix dimension — presumably cols of B; TODO confirm
    uint32_t nPE;                      // number of processing units
    int32_t *__restrict__ pDstC;       // output matrix
} plp_mat_mult_cmplx_instance_i16;

/**
 * @brief Instance structure for parallel complex matrix multiplication,
 *        32-bit integer inputs and output (per type name).
 *
 * NOTE(review): the storage layout of the complex elements is not visible
 * here — confirm against the kernel.
 */
typedef struct {
    const int32_t *__restrict__ pSrcA; // first input matrix
    const int32_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                        // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                        // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                        // matrix dimension — presumably cols of B; TODO confirm
    uint32_t nPE;                      // number of processing units
    int32_t *__restrict__ pDstC;       // output matrix
} plp_mat_mult_cmplx_instance_i32;

/**
 * @brief Instance structure for parallel complex matrix multiplication,
 *        float inputs and output (per type name).
 *
 * NOTE(review): the storage layout of the complex elements is not visible
 * here — confirm against the kernel.
 */
typedef struct {
    const float *__restrict__ pSrcA; // first input matrix
    const float *__restrict__ pSrcB; // second input matrix
    uint32_t M;                      // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                      // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                      // matrix dimension — presumably cols of B; TODO confirm
    uint32_t nPE;                    // number of processing units
    float *__restrict__ pDstC;       // output matrix
} plp_mat_mult_cmplx_instance_f32;

/**
 * @brief Instance structure for parallel complex matrix multiplication,
 *        8-bit fixed-point inputs and output (per type name).
 *
 * NOTE(review): the storage layout of the complex elements is not visible
 * here — confirm against the kernel.
 */
typedef struct {
    const int8_t *__restrict__ pSrcA; // first input matrix
    const int8_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                       // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                       // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                       // matrix dimension — presumably cols of B; TODO confirm
    uint32_t shift;                   // presumably right shift applied to each result — TODO confirm
    uint32_t nPE;                     // number of processing units
    int8_t *__restrict__ pDstC;       // output matrix
} plp_mat_mult_cmplx_instance_q8;

/**
 * @brief Instance structure for parallel complex matrix multiplication,
 *        16-bit fixed-point inputs and output (per type name).
 *
 * NOTE(review): the storage layout of the complex elements is not visible
 * here — confirm against the kernel.
 */
typedef struct {
    const int16_t *__restrict__ pSrcA; // first input matrix
    const int16_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                        // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                        // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                        // matrix dimension — presumably cols of B; TODO confirm
    uint32_t shift;                    // presumably right shift applied to each result — TODO confirm
    uint32_t nPE;                      // number of processing units
    int16_t *__restrict__ pDstC;       // output matrix
} plp_mat_mult_cmplx_instance_q16;

/**
 * @brief Instance structure for parallel complex matrix multiplication,
 *        32-bit fixed-point inputs and output (per type name).
 *
 * NOTE(review): the storage layout of the complex elements is not visible
 * here — confirm against the kernel.
 */
typedef struct {
    const int32_t *__restrict__ pSrcA; // first input matrix
    const int32_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                        // matrix dimension — presumably rows of A; TODO confirm
    uint32_t N;                        // matrix dimension — presumably cols of A / rows of B; TODO confirm
    uint32_t O;                        // matrix dimension — presumably cols of B; TODO confirm
    uint32_t shift;                    // presumably right shift applied to each result — TODO confirm
    uint32_t nPE;                      // number of processing units
    int32_t *__restrict__ pDstC;       // output matrix
} plp_mat_mult_cmplx_instance_q32;

/**
 * @brief Instance structure for parallel matrix addition of 8-bit integer
 *        matrices (per type name).
 *
 * The three pointers are restrict-qualified, so the matrices must not alias.
 */
typedef struct {
    const int8_t *__restrict__ pSrcA; // first input matrix
    const int8_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                       // matrix dimension — presumably number of rows; TODO confirm
    uint32_t N;                       // matrix dimension — presumably number of columns; TODO confirm
    uint32_t nPE;                     // number of processing units
    int8_t *__restrict__ pDst;        // output matrix
} plp_mat_add_instance_i8;

/**
 * @brief Instance structure for parallel matrix addition of 16-bit integer
 *        matrices (per type name).
 *
 * The three pointers are restrict-qualified, so the matrices must not alias.
 */
typedef struct {
    const int16_t *__restrict__ pSrcA; // first input matrix
    const int16_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                        // matrix dimension — presumably number of rows; TODO confirm
    uint32_t N;                        // matrix dimension — presumably number of columns; TODO confirm
    uint32_t nPE;                      // number of processing units
    int16_t *__restrict__ pDst;        // output matrix
} plp_mat_add_instance_i16;

/**
 * @brief Instance structure for parallel matrix addition of 32-bit integer
 *        matrices (per type name).
 *
 * The three pointers are restrict-qualified, so the matrices must not alias.
 */
typedef struct {
    const int32_t *__restrict__ pSrcA; // first input matrix
    const int32_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                        // matrix dimension — presumably number of rows; TODO confirm
    uint32_t N;                        // matrix dimension — presumably number of columns; TODO confirm
    uint32_t nPE;                      // number of processing units
    int32_t *__restrict__ pDst;        // output matrix
} plp_mat_add_instance_i32;

/**
 * @brief Instance structure for parallel matrix addition of float matrices
 *        (per type name).
 *
 * The three pointers are restrict-qualified, so the matrices must not alias.
 */
typedef struct {
    const float *__restrict__ pSrcA; // first input matrix
    const float *__restrict__ pSrcB; // second input matrix
    uint32_t M;                      // matrix dimension — presumably number of rows; TODO confirm
    uint32_t N;                      // matrix dimension — presumably number of columns; TODO confirm
    uint32_t nPE;                    // number of processing units
    float *__restrict__ pDst;        // output matrix
} plp_mat_add_instance_f32;

/**
 * @brief Instance structure for parallel matrix subtraction of 8-bit integer
 *        matrices (per type name).
 *
 * NOTE(review): operand order (A - B vs. B - A) is not visible here — confirm
 * against the kernel.
 */
typedef struct {
    const int8_t *__restrict__ pSrcA; // first input matrix
    const int8_t *__restrict__ pSrcB; // second input matrix
    uint32_t M;                       // matrix dimension — presumably number of rows; TODO confirm
    uint32_t N;                       // matrix dimension — presumably number of columns; TODO confirm
    uint32_t nPE;                     // number of processing units
    int8_t *__restrict__ pDst;        // output matrix
} plp_mat_sub_instance_i8;

typedef struct {
    const int16_t *__restrict__ pSrcA;
    const int16_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_sub_instance_i16;

typedef struct {
    const int32_t *__restrict__ pSrcA;
    const int32_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_sub_instance_i32;

typedef struct {
    const float *__restrict__ pSrcA;
    const float *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t nPE;
    float *__restrict__ pDst;
} plp_mat_sub_instance_f32;

/*
 * Instance structures for the matrix scaling kernels.
 *
 * NOTE(review): field roles are inferred from the names; the kernels are not
 * visible here -- confirm before use.
 *   pSrc        : read-only input matrix (non-aliasing via __restrict__)
 *   M, N        : matrix dimensions (presumably rows and columns)
 *   scaleFactor : scalar factor, same type as the matrix elements
 *   shift       : integer variants only (signed); presumably a shift applied
 *                 after the multiplication -- direction/sign meaning TODO confirm
 *   nPE         : presumably the number of processing elements (cores)
 *   pDst        : output matrix, same element type as the input
 */
/** @brief Matrix scaling instance, int8_t elements. */
typedef struct {
    const int8_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    int8_t scaleFactor;
    int32_t shift;
    uint32_t nPE;
    int8_t *__restrict__ pDst;
} plp_mat_scale_instance_i8;

/** @brief Matrix scaling instance, int16_t elements. */
typedef struct {
    const int16_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    int16_t scaleFactor;
    int32_t shift;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_scale_instance_i16;

/** @brief Matrix scaling instance, int32_t elements. */
typedef struct {
    const int32_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    int32_t scaleFactor;
    int32_t shift;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_scale_instance_i32;

/** @brief Matrix scaling instance, float elements (no shift field). */
typedef struct {
    const float *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    float scaleFactor;
    uint32_t nPE;
    float *__restrict__ pDst;
} plp_mat_scale_instance_f32;

/*
 * Instance structures for the matrix transpose kernels.
 *
 * NOTE(review): field roles are inferred from the names; the kernels are not
 * visible here -- confirm before use.
 *   pSrc : read-only input matrix (non-aliasing via __restrict__)
 *   M, N : dimensions of the input matrix (presumably rows and columns)
 *   nPE  : presumably the number of processing elements (cores)
 *   pDst : output matrix, same element type as the input
 */
/** @brief Matrix transpose instance, int8_t elements. */
typedef struct {
    const int8_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t nPE;
    int8_t *__restrict__ pDst;
} plp_mat_trans_instance_i8;

/** @brief Matrix transpose instance, int16_t elements. */
typedef struct {
    const int16_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_trans_instance_i16;

/** @brief Matrix transpose instance, int32_t elements. */
typedef struct {
    const int32_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_trans_instance_i32;

/*
 * Instance structures for the "fill_I" kernels (presumably: write an identity
 * matrix into pDst -- the kernels are not visible here, confirm).
 *
 *   N        : matrix dimension (only one size field, so presumably square NxN)
 *   fracBits : fixed-point (q*) variants only; presumably the number of
 *              fractional bits used to represent the value one
 *   nPE      : presumably the number of processing elements (cores)
 *   pDst     : output matrix
 *
 * NOTE(review): fracBits is int32_t in the q8 variant but uint32_t in the
 * q16 and q32 variants -- verify which signedness is intended.
 */
/** @brief fill_I instance, int8_t elements. */
typedef struct {
    uint32_t N;
    uint32_t nPE;
    int8_t *__restrict__ pDst;
} plp_mat_fill_I_instance_i8;

/** @brief fill_I instance, int16_t elements. */
typedef struct {
    uint32_t N;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_fill_I_instance_i16;

/** @brief fill_I instance, int32_t elements. */
typedef struct {
    uint32_t N;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_fill_I_instance_i32;

/** @brief fill_I instance, float elements. */
typedef struct {
    uint32_t N;
    uint32_t nPE;
    float *__restrict__ pDst;
} plp_mat_fill_I_instance_f32;

/** @brief fill_I instance, 8-bit fixed-point elements (signed fracBits). */
typedef struct {
    uint32_t N;
    int32_t fracBits;
    uint32_t nPE;
    int8_t *__restrict__ pDst;
} plp_mat_fill_I_instance_q8;

/** @brief fill_I instance, 16-bit fixed-point elements (unsigned fracBits). */
typedef struct {
    uint32_t N;
    uint32_t fracBits;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_fill_I_instance_q16;

/** @brief fill_I instance, 32-bit fixed-point elements (unsigned fracBits). */
typedef struct {
    uint32_t N;
    uint32_t fracBits;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_fill_I_instance_q32;

/**
 * @brief Instance structure for the float matrix inversion kernel.
 *
 * NOTE(review): field roles are inferred from the names; the kernel is not
 * visible here -- confirm before use.
 */
typedef struct {
    float *__restrict__ pSrc;      // input matrix; not const, so the kernel may modify it -- TODO confirm
    float *__restrict__ pDst;      // output matrix (presumably the inverse)
    uint32_t *__restrict__ flag;   // presumably a status/synchronisation flag written by the kernel -- TODO confirm
    uint32_t N;                    // matrix dimension (single size field, so presumably square NxN)
    uint32_t nPE;                  // presumably the number of processing elements (cores)
} plp_mat_inv_instance_f32;

/*
 * Instance structures for the strided matrix multiplication kernels.
 *
 * NOTE(review): field roles are inferred from the names; the kernels are not
 * visible here -- confirm before use.
 *   pSrcA, pSrcB              : read-only input matrices (non-aliasing via __restrict__)
 *   M, N, O                   : matrix dimensions (presumably A is MxN and B is NxO)
 *   strideA, strideB, strideC : presumably the distance, in elements, between
 *                               consecutive rows of A, B and the output
 *   shift                     : fixed-point (q*) variants only; presumably a result shift
 *   nPE                       : presumably the number of processing elements (cores)
 *   pDstC                     : output matrix; int32_t for every integer (i*)
 *                               variant, input element type for f32 and q*
 */
/** @brief Strided matrix multiplication instance, int8_t inputs, int32_t output. */
typedef struct {
    const int8_t *__restrict__ pSrcA;
    const int8_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t nPE;
    int32_t *__restrict__ pDstC;
} plp_mat_mult_stride_instance_i8;

/** @brief Strided matrix multiplication instance, int16_t inputs, int32_t output. */
typedef struct {
    const int16_t *__restrict__ pSrcA;
    const int16_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t nPE;
    int32_t *__restrict__ pDstC;
} plp_mat_mult_stride_instance_i16;

/** @brief Strided matrix multiplication instance, int32_t inputs and output. */
typedef struct {
    const int32_t *__restrict__ pSrcA;
    const int32_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t nPE;
    int32_t *__restrict__ pDstC;
} plp_mat_mult_stride_instance_i32;

/** @brief Strided matrix multiplication instance, float inputs and output. */
typedef struct {
    const float *__restrict__ pSrcA;
    const float *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t nPE;
    float *__restrict__ pDstC;
} plp_mat_mult_stride_instance_f32;

/** @brief Strided matrix multiplication instance, 8-bit fixed-point elements. */
typedef struct {
    const int8_t *__restrict__ pSrcA;
    const int8_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t shift;
    uint32_t nPE;
    int8_t *__restrict__ pDstC;
} plp_mat_mult_stride_instance_q8;

/** @brief Strided matrix multiplication instance, 16-bit fixed-point elements. */
typedef struct {
    const int16_t *__restrict__ pSrcA;
    const int16_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t shift;
    uint32_t nPE;
    int16_t *__restrict__ pDstC;
} plp_mat_mult_stride_instance_q16;

/** @brief Strided matrix multiplication instance, 32-bit fixed-point elements. */
typedef struct {
    const int32_t *__restrict__ pSrcA;
    const int32_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t shift;
    uint32_t nPE;
    int32_t *__restrict__ pDstC;
} plp_mat_mult_stride_instance_q32;

/*
 * Instance structures for the strided complex matrix multiplication kernels.
 *
 * NOTE(review): field roles are inferred from the names; the kernels are not
 * visible here -- confirm before use (including how complex values are laid
 * out in the buffers and whether strides count complex or scalar elements).
 *   pSrcA, pSrcB              : read-only input matrices (non-aliasing via __restrict__)
 *   M, N, O                   : matrix dimensions (presumably A is MxN and B is NxO)
 *   strideA, strideB, strideC : presumably the row strides of A, B and the output
 *   shift                     : fixed-point (q*) variants only; presumably a result shift
 *   nPE                       : presumably the number of processing elements (cores)
 *   pDstC                     : output matrix; int32_t for every integer (i*)
 *                               variant, input element type for f32 and q*
 */
/** @brief Strided complex matrix multiplication instance, int8_t inputs, int32_t output. */
typedef struct {
    const int8_t *__restrict__ pSrcA;
    const int8_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t nPE;
    int32_t *__restrict__ pDstC;
} plp_mat_mult_cmplx_stride_instance_i8;

/** @brief Strided complex matrix multiplication instance, int16_t inputs, int32_t output. */
typedef struct {
    const int16_t *__restrict__ pSrcA;
    const int16_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t nPE;
    int32_t *__restrict__ pDstC;
} plp_mat_mult_cmplx_stride_instance_i16;

/** @brief Strided complex matrix multiplication instance, int32_t inputs and output. */
typedef struct {
    const int32_t *__restrict__ pSrcA;
    const int32_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t nPE;
    int32_t *__restrict__ pDstC;
} plp_mat_mult_cmplx_stride_instance_i32;

/** @brief Strided complex matrix multiplication instance, float inputs and output. */
typedef struct {
    const float *__restrict__ pSrcA;
    const float *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t nPE;
    float *__restrict__ pDstC;
} plp_mat_mult_cmplx_stride_instance_f32;

/** @brief Strided complex matrix multiplication instance, 8-bit fixed-point elements. */
typedef struct {
    const int8_t *__restrict__ pSrcA;
    const int8_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t shift;
    uint32_t nPE;
    int8_t *__restrict__ pDstC;
} plp_mat_mult_cmplx_stride_instance_q8;

/** @brief Strided complex matrix multiplication instance, 16-bit fixed-point elements. */
typedef struct {
    const int16_t *__restrict__ pSrcA;
    const int16_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t shift;
    uint32_t nPE;
    int16_t *__restrict__ pDstC;
} plp_mat_mult_cmplx_stride_instance_q16;

/** @brief Strided complex matrix multiplication instance, 32-bit fixed-point elements. */
typedef struct {
    const int32_t *__restrict__ pSrcA;
    const int32_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t O;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideC;
    uint32_t shift;
    uint32_t nPE;
    int32_t *__restrict__ pDstC;
} plp_mat_mult_cmplx_stride_instance_q32;

/*
 * Instance structures for the strided matrix addition kernels.
 *
 * NOTE(review): field roles are inferred from the names; the kernels are not
 * visible here -- confirm before use.
 *   pSrcA, pSrcB              : read-only input matrices (non-aliasing via __restrict__)
 *   M, N                      : matrix dimensions (presumably rows and columns)
 *   strideA, strideB, strideY : presumably the row strides, in elements, of
 *                               A, B and the output (strideY pairs with pDst)
 *   nPE                       : presumably the number of processing elements (cores)
 *   pDst                      : output matrix, same element type as the inputs
 */
/** @brief Strided matrix addition instance, int8_t elements. */
typedef struct {
    const int8_t *__restrict__ pSrcA;
    const int8_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideY;
    uint32_t nPE;
    int8_t *__restrict__ pDst;
} plp_mat_add_stride_instance_i8;

/** @brief Strided matrix addition instance, int16_t elements. */
typedef struct {
    const int16_t *__restrict__ pSrcA;
    const int16_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideY;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_add_stride_instance_i16;

/** @brief Strided matrix addition instance, int32_t elements. */
typedef struct {
    const int32_t *__restrict__ pSrcA;
    const int32_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideY;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_add_stride_instance_i32;

/** @brief Strided matrix addition instance, float elements. */
typedef struct {
    const float *__restrict__ pSrcA;
    const float *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideY;
    uint32_t nPE;
    float *__restrict__ pDst;
} plp_mat_add_stride_instance_f32;

/*
 * Instance structures for the strided matrix subtraction kernels.
 *
 * NOTE(review): field roles are inferred from the names; the kernels are not
 * visible here -- confirm before use (in particular the operand order,
 * presumably A - B).
 *   pSrcA, pSrcB              : read-only input matrices (non-aliasing via __restrict__)
 *   M, N                      : matrix dimensions (presumably rows and columns)
 *   strideA, strideB, strideY : presumably the row strides, in elements, of
 *                               A, B and the output (strideY pairs with pDst)
 *   nPE                       : presumably the number of processing elements (cores)
 *   pDst                      : output matrix, same element type as the inputs
 */
/** @brief Strided matrix subtraction instance, int8_t elements. */
typedef struct {
    const int8_t *__restrict__ pSrcA;
    const int8_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideY;
    uint32_t nPE;
    int8_t *__restrict__ pDst;
} plp_mat_sub_stride_instance_i8;

/** @brief Strided matrix subtraction instance, int16_t elements. */
typedef struct {
    const int16_t *__restrict__ pSrcA;
    const int16_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideY;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_sub_stride_instance_i16;

/** @brief Strided matrix subtraction instance, int32_t elements. */
typedef struct {
    const int32_t *__restrict__ pSrcA;
    const int32_t *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideY;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_sub_stride_instance_i32;

/** @brief Strided matrix subtraction instance, float elements. */
typedef struct {
    const float *__restrict__ pSrcA;
    const float *__restrict__ pSrcB;
    uint32_t M;
    uint32_t N;
    uint32_t strideA;
    uint32_t strideB;
    uint32_t strideY;
    uint32_t nPE;
    float *__restrict__ pDst;
} plp_mat_sub_stride_instance_f32;

/*
 * Instance structures for the strided matrix scaling kernels.
 *
 * NOTE(review): field roles are inferred from the names; the kernels are not
 * visible here -- confirm before use.
 *   pSrc                 : read-only input matrix (non-aliasing via __restrict__)
 *   M, N                 : matrix dimensions (presumably rows and columns)
 *   strideSrc, strideDst : presumably the row strides, in elements, of the
 *                          input and output matrices
 *   scaleFactor          : scalar factor, same type as the matrix elements
 *   shift                : integer variants only (signed); presumably a shift
 *                          applied after the multiplication -- TODO confirm
 *   nPE                  : presumably the number of processing elements (cores)
 *   pDst                 : output matrix, same element type as the input
 */
/** @brief Strided matrix scaling instance, int8_t elements. */
typedef struct {
    const int8_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t strideSrc;
    uint32_t strideDst;
    int8_t scaleFactor;
    int32_t shift;
    uint32_t nPE;
    int8_t *__restrict__ pDst;
} plp_mat_scale_stride_instance_i8;

/** @brief Strided matrix scaling instance, int16_t elements. */
typedef struct {
    const int16_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t strideSrc;
    uint32_t strideDst;
    int16_t scaleFactor;
    int32_t shift;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_scale_stride_instance_i16;

/** @brief Strided matrix scaling instance, int32_t elements. */
typedef struct {
    const int32_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t strideSrc;
    uint32_t strideDst;
    int32_t scaleFactor;
    int32_t shift;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_scale_stride_instance_i32;

/** @brief Strided matrix scaling instance, float elements (no shift field). */
typedef struct {
    const float *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t strideSrc;
    uint32_t strideDst;
    float scaleFactor;
    uint32_t nPE;
    float *__restrict__ pDst;
} plp_mat_scale_stride_instance_f32;

/*
 * Instance structures for the strided "fill_I" kernels (presumably: write an
 * identity matrix into pDst -- the kernels are not visible here, confirm).
 *
 *   N        : matrix dimension (only one size field, so presumably square NxN)
 *   stride   : presumably the row stride, in elements, of the output matrix
 *   nPE      : presumably the number of processing elements (cores)
 *   fracBits : fixed-point (q*) variants only (signed in all three); presumably
 *              the number of fractional bits used to represent the value one
 *   pDst     : output matrix
 */
/** @brief Strided fill_I instance, int8_t elements. */
typedef struct {
    uint32_t N;
    uint32_t stride;
    uint32_t nPE;
    int8_t *__restrict__ pDst;
} plp_mat_fill_I_stride_instance_i8;

/** @brief Strided fill_I instance, int16_t elements. */
typedef struct {
    uint32_t N;
    uint32_t stride;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_fill_I_stride_instance_i16;

/** @brief Strided fill_I instance, int32_t elements. */
typedef struct {
    uint32_t N;
    uint32_t stride;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_fill_I_stride_instance_i32;

/** @brief Strided fill_I instance, float elements. */
typedef struct {
    uint32_t N;
    uint32_t stride;
    uint32_t nPE;
    float *__restrict__ pDst;
} plp_mat_fill_I_stride_instance_f32;

/** @brief Strided fill_I instance, 8-bit fixed-point elements. */
typedef struct {
    uint32_t N;
    uint32_t stride;
    uint32_t nPE;
    int32_t fracBits;
    int8_t *__restrict__ pDst;
} plp_mat_fill_I_stride_instance_q8;

/** @brief Strided fill_I instance, 16-bit fixed-point elements. */
typedef struct {
    uint32_t N;
    uint32_t stride;
    uint32_t nPE;
    int32_t fracBits;
    int16_t *__restrict__ pDst;
} plp_mat_fill_I_stride_instance_q16;

/** @brief Strided fill_I instance, 32-bit fixed-point elements. */
typedef struct {
    uint32_t N;
    uint32_t stride;
    uint32_t nPE;
    int32_t fracBits;
    int32_t *__restrict__ pDst;
} plp_mat_fill_I_stride_instance_q32;

/*
 * Instance structures for the strided matrix fill kernels.
 *
 * NOTE(review): field roles are inferred from the names; the kernels are not
 * visible here -- confirm before use.
 *   M, N   : matrix dimensions (presumably rows and columns)
 *   stride : presumably the row stride, in elements, of the output matrix
 *   value  : the fill value, same type as the matrix elements
 *   nPE    : presumably the number of processing elements (cores)
 *   pDst   : output matrix
 */
/** @brief Strided matrix fill instance, int8_t elements. */
typedef struct {
    uint32_t M;
    uint32_t N;
    uint32_t stride;
    int8_t value;
    uint32_t nPE;
    int8_t *__restrict__ pDst;
} plp_mat_fill_stride_instance_i8;

/** @brief Strided matrix fill instance, int16_t elements. */
typedef struct {
    uint32_t M;
    uint32_t N;
    uint32_t stride;
    int16_t value;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_fill_stride_instance_i16;

/** @brief Strided matrix fill instance, int32_t elements. */
typedef struct {
    uint32_t M;
    uint32_t N;
    uint32_t stride;
    int32_t value;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_fill_stride_instance_i32;

/** @brief Strided matrix fill instance, float elements. */
typedef struct {
    uint32_t M;
    uint32_t N;
    uint32_t stride;
    float value;
    uint32_t nPE;
    float *__restrict__ pDst;
} plp_mat_fill_stride_instance_f32;

/*
 * Instance structures for the strided matrix copy kernels.
 *
 * NOTE(review): field roles are inferred from the names; the kernels are not
 * visible here -- confirm before use.
 *   pSrc                 : read-only input matrix (non-aliasing via __restrict__)
 *   M, N                 : matrix dimensions (presumably rows and columns)
 *   strideSrc, strideDst : presumably the row strides, in elements, of the
 *                          input and output matrices
 *   nPE                  : presumably the number of processing elements (cores)
 *   pDst                 : output matrix, same element type as the input
 */
/** @brief Strided matrix copy instance, int8_t elements. */
typedef struct {
    const int8_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t strideSrc;
    uint32_t strideDst;
    uint32_t nPE;
    int8_t *__restrict__ pDst;
} plp_mat_copy_stride_instance_i8;

/** @brief Strided matrix copy instance, int16_t elements. */
typedef struct {
    const int16_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t strideSrc;
    uint32_t strideDst;
    uint32_t nPE;
    int16_t *__restrict__ pDst;
} plp_mat_copy_stride_instance_i16;

/** @brief Strided matrix copy instance, int32_t elements. */
typedef struct {
    const int32_t *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t strideSrc;
    uint32_t strideDst;
    uint32_t nPE;
    int32_t *__restrict__ pDst;
} plp_mat_copy_stride_instance_i32;

/** @brief Strided matrix copy instance, float elements. */
typedef struct {
    const float *__restrict__ pSrc;
    uint32_t M;
    uint32_t N;
    uint32_t strideSrc;
    uint32_t strideDst;
    uint32_t nPE;
    float *__restrict__ pDst;
} plp_mat_copy_stride_instance_f32;

/**
 * @brief Instance structure for the float32 Euclidean distance kernel.
 *
 * NOTE(review): the "PE" suffix of blkSizePE suggests a per-processing-unit
 * block size -- the kernel is not visible here, confirm.
 */
typedef struct {
    const float32_t *pSrcA; // pointer to the first vector
    const float32_t *pSrcB; // pointer to the second vector
    uint32_t blkSizePE;     // number of samples in each vector
    uint32_t nPE;           // number of processing units
    float32_t *resBuffer;   // pointer to result vector
} plp_euclidean_distance_instance_f32;

/**
 * @brief Instance structure for the 32-bit fixed-point Euclidean distance kernel.
 *
 * Same layout as the float32 variant plus the fixed-point format descriptor
 * fracBits.
 */
typedef struct {
    const int32_t *pSrcA;   // pointer to the first vector
    const int32_t *pSrcB;   // pointer to the second vector
    uint32_t blkSizePE;     // number of samples in each vector
    uint32_t nPE;           // number of processing units
    uint32_t fracBits;      // number of fixed point fractional bits
    int32_t *resBuffer;     // pointer to result vector
} plp_euclidean_distance_instance_q32;

/**
 * @brief Instance structure for the float32 cosine distance kernel.
 *
 * Carries three separate result buffers. NOTE(review): judging by the names
 * they presumably hold the partial results for vector A, vector B and their
 * dot product -- the kernel is not visible here, confirm.
 */
typedef struct {
    const float32_t *pSrcA; // pointer to the first vector
    const float32_t *pSrcB; // pointer to the second vector
    uint32_t blkSizePE;     // number of samples in each vector
    uint32_t nPE;           // number of processing units
    float32_t *resBuffer_A;   // pointer to result vector for the A term (presumably A's energy/norm -- confirm)
    float32_t *resBuffer_B;   // pointer to result vector for the B term (presumably B's energy/norm -- confirm)
    float32_t *resBuffer_dot;   // pointer to result vector for the dot-product term (presumably A.B -- confirm)
} plp_cosine_distance_instance_f32;

/**
 * @brief Instance structure for the 32-bit fixed-point power kernel.
 *
 * NOTE(review): pSrc is not const-qualified here, unlike the float32 variant;
 * whether the kernel writes to the input is not visible -- confirm.
 */
typedef struct {
    int32_t *pSrc;     // pointer to the input vector
    uint32_t blkSizePE; // number of samples in each vector
    uint32_t fracBits; // fracBits for right shift
    uint32_t nPE;       // number of processing units
    int32_t *resBuffer; // pointer to result vector
} plp_power_instance_q32;

/** @brief Instance structure for the float32 power kernel. */
typedef struct {
    const float32_t *pSrc; // pointer to the input vector
    uint32_t blkSizePE;     // number of samples in each vector
    uint32_t nPE;           // number of processing units
    float32_t *resBuffer;   // pointer to result vector
} plp_power_instance_f32;



/**
 * @brief Identifier of the wavelet stored in a plp_dwt_wavelet_* descriptor.
 *
 * No enumerator has an explicit value, so they are numbered consecutively
 * from PLP_DWT_WAVELET_OTHER == 0 in declaration order; inserting or
 * reordering entries changes the numeric values.
 *
 * NOTE(review): the DB / SYM / COIF prefixes presumably denote the
 * Daubechies, Symlet and Coiflet families of the given order -- confirm
 * against the coefficient tables.
 */
typedef enum {
    // presumably a catch-all for wavelets without a dedicated identifier -- confirm
    PLP_DWT_WAVELET_OTHER,
    PLP_DWT_WAVELET_HAAR,

    // DB1 .. DB20
    PLP_DWT_WAVELET_DB1,
    PLP_DWT_WAVELET_DB2,
    PLP_DWT_WAVELET_DB3,
    PLP_DWT_WAVELET_DB4,
    PLP_DWT_WAVELET_DB5,
    PLP_DWT_WAVELET_DB6,
    PLP_DWT_WAVELET_DB7,
    PLP_DWT_WAVELET_DB8,
    PLP_DWT_WAVELET_DB9,
    PLP_DWT_WAVELET_DB10,
    PLP_DWT_WAVELET_DB11,
    PLP_DWT_WAVELET_DB12,
    PLP_DWT_WAVELET_DB13,
    PLP_DWT_WAVELET_DB14,
    PLP_DWT_WAVELET_DB15,
    PLP_DWT_WAVELET_DB16,
    PLP_DWT_WAVELET_DB17,
    PLP_DWT_WAVELET_DB18,
    PLP_DWT_WAVELET_DB19,
    PLP_DWT_WAVELET_DB20,

    // SYM2 .. SYM20 (there is no SYM1 entry)
    PLP_DWT_WAVELET_SYM2,
    PLP_DWT_WAVELET_SYM3,
    PLP_DWT_WAVELET_SYM4,
    PLP_DWT_WAVELET_SYM5,
    PLP_DWT_WAVELET_SYM6,
    PLP_DWT_WAVELET_SYM7,
    PLP_DWT_WAVELET_SYM8,
    PLP_DWT_WAVELET_SYM9,
    PLP_DWT_WAVELET_SYM10,
    PLP_DWT_WAVELET_SYM11,
    PLP_DWT_WAVELET_SYM12,
    PLP_DWT_WAVELET_SYM13,
    PLP_DWT_WAVELET_SYM14,
    PLP_DWT_WAVELET_SYM15,
    PLP_DWT_WAVELET_SYM16,
    PLP_DWT_WAVELET_SYM17,
    PLP_DWT_WAVELET_SYM18,
    PLP_DWT_WAVELET_SYM19,
    PLP_DWT_WAVELET_SYM20,

    // COIF1 .. COIF17
    PLP_DWT_WAVELET_COIF1,
    PLP_DWT_WAVELET_COIF2,
    PLP_DWT_WAVELET_COIF3,
    PLP_DWT_WAVELET_COIF4,
    PLP_DWT_WAVELET_COIF5,
    PLP_DWT_WAVELET_COIF6,
    PLP_DWT_WAVELET_COIF7,
    PLP_DWT_WAVELET_COIF8,
    PLP_DWT_WAVELET_COIF9,
    PLP_DWT_WAVELET_COIF10,
    PLP_DWT_WAVELET_COIF11,
    PLP_DWT_WAVELET_COIF12,
    PLP_DWT_WAVELET_COIF13,
    PLP_DWT_WAVELET_COIF14,
    PLP_DWT_WAVELET_COIF15,
    PLP_DWT_WAVELET_COIF16,
    PLP_DWT_WAVELET_COIF17
} plp_dwt_wavelet_type;


/**
 * @brief Wavelet descriptor with float32 filter coefficients.
 *
 * Holds the wavelet identifier plus pointers to four filters (decomposition
 * and reconstruction, low- and high-pass). NOTE(review): `length` is
 * presumably the number of coefficients in each filter -- confirm.
 */
typedef struct {

   uint32_t length;
   plp_dwt_wavelet_type type;

   float32_t *dec_lo; /* decomposition lowpass */
   float32_t *dec_hi; /* decomposition highpass */

   float32_t *rec_lo; /* reconstruction lowpass */
   float32_t *rec_hi; /* reconstruction highpass */

} plp_dwt_wavelet_f32;

/**
 * @brief Wavelet descriptor with 32-bit fixed-point filter coefficients.
 *
 * Unlike the float32 descriptor, only the decomposition filters are present.
 * NOTE(review): `length` is presumably the number of coefficients in each
 * filter -- confirm.
 */
typedef struct {

   uint32_t length;
   plp_dwt_wavelet_type type;

   int32_t *dec_lo; /* decomposition lowpass */
   int32_t *dec_hi; /* decomposition highpass */
} plp_dwt_wavelet_q32;

/**
 * @brief Wavelet descriptor with 16-bit fixed-point filter coefficients
 *        (decomposition filters only).
 */
typedef struct {

   uint32_t length;
   plp_dwt_wavelet_type type;

   int16_t *dec_lo; /* decomposition lowpass */
   int16_t *dec_hi; /* decomposition highpass */
} plp_dwt_wavelet_q16;

/**
 * @brief Wavelet descriptor with 8-bit fixed-point filter coefficients
 *        (decomposition filters only).
 */
typedef struct {

   uint32_t length;
   plp_dwt_wavelet_type type;

   int8_t *dec_lo; /* decomposition lowpass */
   int8_t *dec_hi; /* decomposition highpass */
} plp_dwt_wavelet_q8;



/**
 * @brief Boundary extension mode selected in the plp_dwt_instance_* structs.
 *
 * No enumerator has an explicit value, so they are numbered consecutively
 * from PLP_DWT_MODE_ZERO == 0 in declaration order.
 *
 * NOTE(review): the exact padding rule behind each name is implemented in the
 * DWT kernels, which are not visible here -- confirm there before relying on
 * a particular interpretation.
 */
typedef enum {
    PLP_DWT_MODE_ZERO,
    PLP_DWT_MODE_CONSTANT,
    PLP_DWT_MODE_SYMMETRIC,
    PLP_DWT_MODE_REFLECT,
    PLP_DWT_MODE_PERIODIC,
    PLP_DWT_MODE_ANTISYMMETRIC,
    PLP_DWT_MODE_ANTIREFLECT
} plp_dwt_extension_mode;


/**
 * @brief Instance structure for the float32 DWT kernel.
 *
 * The wavelet descriptor is embedded by value, not referenced by pointer.
 */
typedef struct {
    const float32_t *pSrc;             // points to the input buffer
    uint32_t length;                   // length of input buffer
    plp_dwt_wavelet_f32 wavelet; // wavelet structure for calculating DWT
    plp_dwt_extension_mode mode;       // boundary extension mode
    uint32_t nPE;                      // number of processing units

    float32_t *pDstA;                  // output buffer with approximation coefficients
    float32_t *pDstD;                  // output buffer with detail coefficients
} plp_dwt_instance_f32;

/**
 * @brief Instance structure for the 32-bit fixed-point DWT kernel.
 *
 * The wavelet descriptor is embedded by value, not referenced by pointer.
 */
typedef struct {
    const int32_t *pSrc;  // points to the input buffer
    uint32_t length;        // length of input buffer
    plp_dwt_wavelet_q32 wavelet; // wavelet structure for calculating DWT
    plp_dwt_extension_mode mode; // boundary extension mode
    uint32_t nPE;           // number of processing units

    int32_t *pDstA;   // output buffer with approximation coefficients
    int32_t *pDstD;   // output buffer with detail coefficients
} plp_dwt_instance_q32;


/**
 * @brief Instance structure for the 16-bit fixed-point DWT kernel.
 *
 * The wavelet descriptor is embedded by value, not referenced by pointer.
 */
typedef struct {
    const int16_t *pSrc;  // points to the input buffer
    uint32_t length;        // length of input buffer
    plp_dwt_wavelet_q16 wavelet; // wavelet structure for calculating DWT
    plp_dwt_extension_mode mode; // boundary extension mode
    uint32_t nPE;           // number of processing units

    int16_t *pDstA;   // output buffer with approximation coefficients
    int16_t *pDstD;   // output buffer with detail coefficients
} plp_dwt_instance_q16;

/**
 * @brief Instance structure for the 8-bit fixed-point DWT kernel.
 *
 * The wavelet descriptor is embedded by value, not referenced by pointer.
 */
typedef struct {
    const int8_t *pSrc;  // points to the input buffer
    uint32_t length;        // length of input buffer
    plp_dwt_wavelet_q8 wavelet; // wavelet structure for calculating DWT
    plp_dwt_extension_mode mode; // boundary extension mode
    uint32_t nPE;           // number of processing units

    int8_t *pDstA;   // output buffer with approximation coefficients
    int8_t *pDstD;   // output buffer with detail coefficients
} plp_dwt_instance_q8;


/*
 * DWT buffer-length helper macros.
 *
 * Every macro argument is parenthesised in the expansion so that expression
 * arguments (e.g. `1 << k`, `a ? b : c`, `*pWavelet`) keep their intended
 * grouping. Arguments may still be evaluated more than once, so do not pass
 * expressions with side effects.
 */

/**
 * @brief Forwards to plp_dwt_dec_len() using the `length` member of a wavelet
 *        descriptor.
 * @param SIG_LEN  signal length passed through as first argument
 * @param WAVELET  wavelet descriptor object (any expression of struct type
 *                 with a `length` member)
 * @param LEVEL    level passed through as third argument
 */
#define PLP_DWT_DEC_LEN(SIG_LEN, WAVELET, LEVEL)                                     \
    plp_dwt_dec_len((SIG_LEN), (WAVELET).length, (LEVEL))

/**
 * @brief Temporary buffer length for a decomposition, computed as
 *        (a + (a + WAVELET_LEN - 1)) / 2 with a = (SRC_LEN + WAVELET_LEN - 1) / 2.
 * @param SRC_LEN      length of the source signal
 * @param WAVELET_LEN  length of the wavelet filter
 */
#define PLP_DWT_DEC_TEMP_LEN(SRC_LEN, WAVELET_LEN)                                   \
    ((((SRC_LEN) + (WAVELET_LEN) - 1) / 2 +                                          \
      (((SRC_LEN) + (WAVELET_LEN) - 1) / 2 + (WAVELET_LEN) - 1)) / 2)

/**
 * @brief Output length of one DWT level: (SIG_LEN + WAVELET_LEN - 1) / 2,
 *        rounded down (implemented with a right shift by one).
 * @param SIG_LEN      length of the input signal
 * @param WAVELET_LEN  length of the wavelet filter
 */
#define PLP_DWT_OUTPUT_LENGTH(SIG_LEN, WAVELET_LEN) (((SIG_LEN) + (WAVELET_LEN) - 1) >> 1)

/**
 * @brief DWT level helper.
 *
 * NOTE(review): by name this presumably returns the maximum useful
 * decomposition level for the given signal and wavelet lengths -- the
 * definition is not visible here, confirm.
 *
 * @param sig_len      length of the signal
 * @param wavelet_len  length of the wavelet filter
 * @return a level count (exact rule defined by the implementation)
 */
uint32_t plp_dwt_max_level(uint32_t sig_len, uint32_t wavelet_len);

/**
 * @brief DWT length helper; also the target of the PLP_DWT_DEC_LEN macro.
 *
 * NOTE(review): by name this presumably returns the total output length of a
 * `level`-deep decomposition -- the definition is not visible here, confirm.
 *
 * @param sig_len      length of the signal
 * @param wavelet_len  length of the wavelet filter
 * @param level        decomposition level
 * @return a length in elements (exact rule defined by the implementation)
 */
uint32_t plp_dwt_dec_len(uint32_t sig_len, uint32_t wavelet_len, uint32_t level);

/*
 * Parallel dot product entry points.
 *
 * NOTE(review): parameter roles are inferred from the names; the definitions
 * are not visible here -- confirm before use.
 *   pSrcA, pSrcB : read-only input vectors (non-aliasing via __restrict__)
 *   blockSize    : presumably the number of elements in each vector
 *   deciPoint    : q32 variant only; presumably the fixed-point position used
 *                  to shift the products
 *   nPE          : presumably the number of processing elements (cores) to use
 *   pRes         : output location for the result
 */
/** @brief Parallel dot product of two int32_t vectors. */
void plp_dot_prod_i32_parallel(const int32_t *__restrict__ pSrcA,
                               const int32_t *__restrict__ pSrcB,
                               uint32_t blockSize,
                               uint32_t nPE,
                               int32_t *__restrict__ pRes);

/** @brief Parallel dot product of two 32-bit fixed-point vectors. */
void plp_dot_prod_q32_parallel(const int32_t *__restrict__ pSrcA,
                               const int32_t *__restrict__ pSrcB,
                               uint32_t blockSize,
                               uint32_t deciPoint,
                               uint32_t nPE,
                               int32_t *__restrict__ pRes);

/** @brief Parallel dot product of two float32 vectors. */
void plp_dot_prod_f32_parallel(const float32_t *__restrict__ pSrcA,
                               const float32_t *__restrict__ pSrcB,
                               uint32_t blockSize,
                               uint32_t nPE,
                               float32_t *__restrict__ pRes);

/*
 * Per-core kernels taking an untyped argument pointer.
 * NOTE(review): S presumably points to the matching plp_dot_prod_instance_*
 * structure (i32 / q32 / f32) -- the definitions are not visible here, confirm.
 */
void plp_dot_prod_i32p_xpulpv2(void *S);

void plp_dot_prod_q32p_xpulpv2(void *S);

void plp_dot_prod_f32p_xpulpv2(void *S);

/*
 * Dot product of 32-bit vectors: int32_t, 32-bit fixed-point and float32.
 *
 * NOTE(review): the definitions are not visible here. Naming suggests the
 * unsuffixed function is the generic entry point and the *s_rv32im /
 * *s_xpulpv2 functions are target-specific kernels with the same contract --
 * confirm. Parameter roles are inferred from the names:
 *   pSrcA, pSrcB : read-only input vectors (non-aliasing via __restrict__)
 *   blockSize    : presumably the number of elements in each vector
 *   deciPoint    : q32 variants only; presumably the fixed-point position
 *                  used to shift the products
 *   pRes         : output location for the result
 */
/* int32_t inputs, int32_t result. */
void plp_dot_prod_i32(const int32_t *__restrict__ pSrcA,
                      const int32_t *__restrict__ pSrcB,
                      uint32_t blockSize,
                      int32_t *__restrict__ pRes);

void plp_dot_prod_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                              const int32_t *__restrict__ pSrcB,
                              uint32_t blockSize,
                              int32_t *__restrict__ pRes);

void plp_dot_prod_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                               const int32_t *__restrict__ pSrcB,
                               uint32_t blockSize,
                               int32_t *__restrict__ pRes);

/* 32-bit fixed-point inputs, int32_t result; takes the extra deciPoint argument. */
void plp_dot_prod_q32(const int32_t *__restrict__ pSrcA,
                      const int32_t *__restrict__ pSrcB,
                      uint32_t blockSize,
                      uint32_t deciPoint,
                      int32_t *__restrict__ pRes);

void plp_dot_prod_q32s_rv32im(const int32_t *__restrict__ pSrcA,
                              const int32_t *__restrict__ pSrcB,
                              uint32_t blockSize,
                              uint32_t deciPoint,
                              int32_t *__restrict__ pRes);

void plp_dot_prod_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                               const int32_t *__restrict__ pSrcB,
                               uint32_t blockSize,
                               uint32_t deciPoint,
                               int32_t *__restrict__ pRes);

/* float32 inputs, float32 result. */
void plp_dot_prod_f32(const float32_t *__restrict__ pSrcA,
                      const float32_t *__restrict__ pSrcB,
                      uint32_t blockSize,
                      float32_t *__restrict__ pRes);

void plp_dot_prod_f32s_xpulpv2(const float32_t *__restrict__ pSrcA,
                               const float32_t *__restrict__ pSrcB,
                               uint32_t blockSize,
                               float32_t *__restrict__ pRes);

void plp_dot_prod_f32s_rv32im(const float32_t *__restrict__ pSrcA,
                               const float32_t *__restrict__ pSrcB,
                               uint32_t blockSize,
                               float32_t *__restrict__ pRes);


/*
 * Dot product of 16-bit vectors: int16_t and 16-bit fixed-point.
 * The result is written as int32_t in every variant.
 *
 * NOTE(review): the definitions are not visible here. Naming suggests the
 * unsuffixed function is the generic entry point and the *s_rv32im /
 * *s_xpulpv2 functions are target-specific kernels with the same contract --
 * confirm. Parameter roles are inferred from the names:
 *   pSrcA, pSrcB : read-only input vectors
 *   blockSize    : presumably the number of elements in each vector
 *   deciPoint    : q16 variants only; presumably the fixed-point position
 *                  used to shift the products
 *   pRes         : output location for the result
 */
/*
 * NOTE(review): unlike every other dot-product prototype in this group, the
 * source pointers here are not __restrict__-qualified -- verify whether that
 * is intentional.
 */
void plp_dot_prod_i16(const int16_t *pSrcA,
                      const int16_t *pSrcB,
                      uint32_t blockSize,
                      int32_t *__restrict__ pRes);

void plp_dot_prod_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                              const int16_t *__restrict__ pSrcB,
                              uint32_t blockSize,
                              int32_t *__restrict__ pRes);

void plp_dot_prod_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                               const int16_t *__restrict__ pSrcB,
                               uint32_t blockSize,
                               int32_t *__restrict__ pRes);

/* 16-bit fixed-point inputs; takes the extra deciPoint argument. */
void plp_dot_prod_q16(const int16_t *__restrict__ pSrcA,
                      const int16_t *__restrict__ pSrcB,
                      uint32_t blockSize,
                      uint32_t deciPoint,
                      int32_t *__restrict__ pRes);

void plp_dot_prod_q16s_rv32im(const int16_t *__restrict__ pSrcA,
                              const int16_t *__restrict__ pSrcB,
                              uint32_t blockSize,
                              uint32_t deciPoint,
                              int32_t *__restrict__ pRes);

void plp_dot_prod_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                               const int16_t *__restrict__ pSrcB,
                               uint32_t blockSize,
                               uint32_t deciPoint,
                               int32_t *__restrict__ pRes);

/*
 * Dot product of 8-bit vectors: int8_t and 8-bit fixed-point.
 * The result is written as int32_t in every variant.
 *
 * NOTE(review): the definitions are not visible here. Naming suggests the
 * unsuffixed function is the generic entry point and the *s_rv32im /
 * *s_xpulpv2 functions are target-specific kernels with the same contract --
 * confirm. Parameter roles are inferred from the names:
 *   pSrcA, pSrcB : read-only input vectors (non-aliasing via __restrict__)
 *   blockSize    : presumably the number of elements in each vector
 *   deciPoint    : q8 variants only; presumably the fixed-point position
 *                  used to shift the products
 *   pRes         : output location for the result
 */
/* int8_t inputs. */
void plp_dot_prod_i8(const int8_t *__restrict__ pSrcA,
                     const int8_t *__restrict__ pSrcB,
                     uint32_t blockSize,
                     int32_t *__restrict__ pRes);

void plp_dot_prod_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                             const int8_t *__restrict__ pSrcB,
                             uint32_t blockSize,
                             int32_t *__restrict__ pRes);

void plp_dot_prod_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                              const int8_t *__restrict__ pSrcB,
                              uint32_t blockSize,
                              int32_t *__restrict__ pRes);

/* 8-bit fixed-point inputs; takes the extra deciPoint argument. */
void plp_dot_prod_q8(const int8_t *__restrict__ pSrcA,
                     const int8_t *__restrict__ pSrcB,
                     uint32_t blockSize,
                     uint32_t deciPoint,
                     int32_t *__restrict__ pRes);

void plp_dot_prod_q8s_rv32im(const int8_t *__restrict__ pSrcA,
                             const int8_t *__restrict__ pSrcB,
                             uint32_t blockSize,
                             uint32_t deciPoint,
                             int32_t *__restrict__ pRes);

void plp_dot_prod_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                              const int8_t *__restrict__ pSrcB,
                              uint32_t blockSize,
                              uint32_t deciPoint,
                              int32_t *__restrict__ pRes);

/*
 * Element-wise absolute value for int32_t, int16_t and int8_t vectors.
 * Input and output share the same element type.
 *
 * NOTE(review): the definitions are not visible here. Naming suggests the
 * unsuffixed function is the generic entry point and the *s_rv32im /
 * *s_xpulpv2 functions are target-specific kernels with the same contract;
 * overflow handling for the most negative value is also not visible --
 * confirm. Parameter roles are inferred from the names:
 *   pSrc      : read-only input vector
 *   pDst      : output vector (not __restrict__-qualified)
 *   blockSize : presumably the number of elements to process
 */
/* int32_t elements. */
void plp_abs_i32(const int32_t * pSrc,
                 int32_t * pDst,
                 uint32_t blockSize);

void plp_abs_i32s_rv32im(const int32_t * pSrc,
                         int32_t * pDst,
                         uint32_t blockSize);

void plp_abs_i32s_xpulpv2(const int32_t * pSrc,
                          int32_t * pDst,
                          uint32_t blockSize);

/* int16_t elements. */
void plp_abs_i16(const int16_t * pSrc,
                 int16_t * pDst,
                 uint32_t blockSize);

void plp_abs_i16s_rv32im(const int16_t * pSrc,
                         int16_t * pDst,
                         uint32_t blockSize);

void plp_abs_i16s_xpulpv2(const int16_t * pSrc,
                          int16_t * pDst,
                          uint32_t blockSize);

/* int8_t elements. */
void plp_abs_i8(const int8_t * pSrc,
                 int8_t * pDst,
                 uint32_t blockSize);

void plp_abs_i8s_rv32im(const int8_t * pSrc,
                         int8_t * pDst,
                         uint32_t blockSize);

void plp_abs_i8s_xpulpv2(const int8_t * pSrc,
                          int8_t * pDst,
                          uint32_t blockSize);

/*
 * Element-wise addition of two integer vectors.
 * The destination is int32_t for every input width (i32, i16 and i8).
 *
 * NOTE(review): the definitions are not visible here. Naming suggests the
 * unsuffixed function is the generic entry point and the *s_rv32im /
 * *s_xpulpv2 functions are target-specific kernels with the same contract --
 * confirm. Parameter roles are inferred from the names:
 *   pSrcA, pSrcB : read-only input vectors
 *   pDst         : output vector (not __restrict__-qualified)
 *   blockSize    : presumably the number of elements to process
 */
/* int32_t inputs. */
void plp_add_i32(const int32_t * pSrcA,
                 const int32_t * pSrcB,
                 int32_t * pDst,
                 uint32_t blockSize);

void plp_add_i32s_rv32im(const int32_t * pSrcA,
                         const int32_t * pSrcB,
                         int32_t * pDst,
                         uint32_t blockSize);

void plp_add_i32s_xpulpv2(const int32_t * pSrcA,
                          const int32_t * pSrcB,
                          int32_t * pDst,
                          uint32_t blockSize);

/* int16_t inputs, int32_t output. */
void plp_add_i16(const int16_t * pSrcA,
                 const int16_t * pSrcB,
                 int32_t * pDst,
                 uint32_t blockSize);

void plp_add_i16s_rv32im(const int16_t * pSrcA,
                         const int16_t * pSrcB,
                         int32_t * pDst,
                         uint32_t blockSize);

void plp_add_i16s_xpulpv2(const int16_t * pSrcA,
                          const int16_t * pSrcB,
                          int32_t * pDst,
                          uint32_t blockSize);

/* int8_t inputs, int32_t output. */
void plp_add_i8(const int8_t * pSrcA,
                 const int8_t * pSrcB,
                 int32_t * pDst,
                 uint32_t blockSize);

void plp_add_i8s_rv32im(const int8_t * pSrcA,
                         const int8_t * pSrcB,
                         int32_t * pDst,
                         uint32_t blockSize);

void plp_add_i8s_xpulpv2(const int8_t * pSrcA,
                          const int8_t * pSrcB,
                          int32_t * pDst,
                          uint32_t blockSize);

/*
 * Element-wise multiplication of two vectors.
 * The destination is int32_t for every integer input width (i32, i16, i8)
 * and float32_t for the f32 variants.
 *
 * NOTE(review): the definitions are not visible here. Naming suggests the
 * unsuffixed function is the generic entry point and the *s_rv32im /
 * *s_xpulpv2 functions are target-specific kernels with the same contract --
 * confirm. Parameter roles are inferred from the names:
 *   pSrcA, pSrcB : read-only input vectors
 *   pDst         : output vector (not __restrict__-qualified)
 *   blockSize    : presumably the number of elements to process
 */
/* int32_t inputs. */
void plp_mult_i32(const int32_t * pSrcA,
                 const int32_t * pSrcB,
                 int32_t * pDst,
                 uint32_t blockSize);

void plp_mult_i32s_rv32im(const int32_t * pSrcA,
                         const int32_t * pSrcB,
                         int32_t * pDst,
                         uint32_t blockSize);

void plp_mult_i32s_xpulpv2(const int32_t * pSrcA,
                          const int32_t * pSrcB,
                          int32_t * pDst,
                          uint32_t blockSize);

/* int16_t inputs, int32_t output. */
void plp_mult_i16(const int16_t * pSrcA,
                 const int16_t * pSrcB,
                 int32_t * pDst,
                 uint32_t blockSize);

void plp_mult_i16s_rv32im(const int16_t * pSrcA,
                         const int16_t * pSrcB,
                         int32_t * pDst,
                         uint32_t blockSize);

void plp_mult_i16s_xpulpv2(const int16_t * pSrcA,
                          const int16_t * pSrcB,
                          int32_t * pDst,
                          uint32_t blockSize);

/* int8_t inputs, int32_t output. */
void plp_mult_i8(const int8_t * pSrcA,
                 const int8_t * pSrcB,
                 int32_t * pDst,
                 uint32_t blockSize);

void plp_mult_i8s_rv32im(const int8_t * pSrcA,
                         const int8_t * pSrcB,
                         int32_t * pDst,
                         uint32_t blockSize);

void plp_mult_i8s_xpulpv2(const int8_t * pSrcA,
                          const int8_t * pSrcB,
                          int32_t * pDst,
                          uint32_t blockSize);

/* float32 inputs and output; only an xpulpv2 kernel is declared here. */
void plp_mult_f32(const float32_t * pSrcA,
                 const float32_t * pSrcB,
                 float32_t * pDst,
                 uint32_t blockSize);

void plp_mult_f32s_xpulpv2(const float32_t * pSrcA,
                          const float32_t * pSrcB,
                          float32_t * pDst,
                          uint32_t blockSize);

/**
 * @brief Parallel element-wise multiplication of two float32 vectors.
 *
 * NOTE(review): parameter roles are inferred from the names; the definition
 * is not visible here -- confirm.
 *
 * @param pSrcA      read-only first input vector
 * @param pSrcB      read-only second input vector
 * @param blockSize  presumably the number of elements in each vector
 * @param nPE        presumably the number of processing elements (cores) to use
 * @param pDst       output vector
 */
void plp_mult_f32_parallel(const float32_t *__restrict__ pSrcA,
                               const float32_t *__restrict__ pSrcB,
                               uint32_t blockSize,
                               uint32_t nPE,
                               float32_t *__restrict__ pDst);
/* Per-core kernel; S presumably points to a plp_mult_instance_f32 -- TODO confirm. */
void plp_mult_f32p_xpulpv2(void *S);

/**
 * @brief Parallel element-wise logarithm of a float32 vector.
 *
 * NOTE(review): parameter roles are inferred from the names, and the
 * logarithm base is not visible here -- confirm against the definition.
 *
 * @param pSrc       read-only input vector
 * @param blockSize  presumably the number of elements in the vector
 * @param nPE        presumably the number of processing elements (cores) to use
 * @param pDst       output vector
 */
void plp_log_f32_parallel(const float32_t *__restrict__ pSrc,
                               uint32_t blockSize,
                               uint32_t nPE,
                               float32_t *__restrict__ pDst);

/* Per-core kernel; S presumably points to a plp_log_instance_f32 -- TODO confirm. */
void plp_log_f32p_xpulpv2(void *S);

/*
 * Element-wise negation for int32_t, int16_t, int8_t and float32 vectors.
 * Input and output share the same element type.
 *
 * NOTE(review): the definitions are not visible here. Naming suggests the
 * unsuffixed function is the generic entry point and the *s_rv32im /
 * *s_xpulpv2 functions are target-specific kernels with the same contract --
 * confirm. Parameter roles are inferred from the names:
 *   pSrc      : read-only input vector
 *   pDst      : output vector (not __restrict__-qualified)
 *   blockSize : presumably the number of elements to process
 */
/* int32_t elements. */
void plp_negate_i32(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize);

void plp_negate_i32s_rv32im(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize);

void plp_negate_i32s_xpulpv2(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize);

/* int16_t elements. */
void plp_negate_i16(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize);

void plp_negate_i16s_rv32im(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize);

void plp_negate_i16s_xpulpv2(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize);

/* int8_t elements. */
void plp_negate_i8(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize);

void plp_negate_i8s_rv32im(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize);

void plp_negate_i8s_xpulpv2(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize);

/* float32 elements; only an xpulpv2 kernel is declared here. */
void plp_negate_f32(const float32_t * pSrc, float32_t * pDst, uint32_t blockSize);

void plp_negate_f32s_xpulpv2(const float32_t * pSrc, float32_t * pDst, uint32_t blockSize);

/*
 * plp_offset_* family -- apply a scalar offset, for i32, i16, i8 and f32 data.
 *
 * Common signature: (pSrc, offset, pDst, blockSize).
 *   pSrc      : read-only input buffer
 *   offset    : scalar passed by value, same type as the elements
 *   pDst      : output buffer, same element type as the input
 *   blockSize : uint32_t size argument
 *
 * NOTE(review): presumably pDst[i] = pSrc[i] + offset; overflow handling
 * (wrap vs. saturate) is not visible here -- confirm.
 */
void plp_offset_i32(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize);

void plp_offset_i32s_rv32im(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize);

void plp_offset_i32s_xpulpv2(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize);

void plp_offset_i16(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize);

void plp_offset_i16s_rv32im(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize);

void plp_offset_i16s_xpulpv2(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize);

void plp_offset_i8(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize);

void plp_offset_i8s_rv32im(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize);

void plp_offset_i8s_xpulpv2(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize);

void plp_offset_f32(const float32_t * pSrc, float32_t offset, float32_t * pDst, uint32_t blockSize);

void plp_offset_f32s_xpulpv2(const float32_t * pSrc, float32_t offset, float32_t * pDst, uint32_t blockSize);

/*
 * plp_sub_* family -- subtraction of two buffers, for i32, i16, i8 and f32.
 *
 * Common signature: (pSrcA, pSrcB, pDst, blockSize) with read-only inputs.
 * The output buffer is int32_t for EVERY integer variant (including i16 and
 * i8, whose inputs are narrower) and float32_t for the f32 variants.
 *
 * NOTE(review): presumably pDst[i] = pSrcA[i] - pSrcB[i] -- confirm operand
 * order against the implementation.
 */
void plp_sub_i32(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize);

void plp_sub_i32s_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize);

void plp_sub_i32s_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize);

void plp_sub_i16(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize);

void plp_sub_i16s_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize);

void plp_sub_i16s_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize);

void plp_sub_i8(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize);

void plp_sub_i8s_rv32im(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize);

void plp_sub_i8s_xpulpv2(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize);

void plp_sub_f32(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize);

void plp_sub_f32s_xpulpv2(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize);

/*
 * plp_scale_* family -- scale a buffer by a scalar, for i32, i16, i8 and f32.
 *
 * Integer variants: (pSrc, scaleFactor, shift, pDst, blockSize)
 *   scaleFactor : scalar of the element type
 *   shift       : int32_t (always 32-bit, regardless of element width)
 *   pDst        : output buffer of the same element type as pSrc
 * Float variants:   (pSrc, scaleFactor, pDst, blockSize) -- no shift argument.
 *
 * pSrc and pDst are restrict-qualified, so callers must not pass
 * overlapping buffers.
 *
 * NOTE(review): presumably pDst[i] = (pSrc[i] * scaleFactor) >> shift for the
 * integer variants; shift direction and valid range are not visible here --
 * confirm.
 */
void plp_scale_i32(const int32_t *__restrict__ pSrc, int32_t scaleFactor, int32_t shift, int32_t *__restrict__ pDst, uint32_t blockSize);

void plp_scale_i32s_rv32im(const int32_t *__restrict__ pSrc, int32_t scaleFactor, int32_t shift, int32_t *__restrict__ pDst, uint32_t blockSize);

void plp_scale_i32s_xpulpv2(const int32_t *__restrict__ pSrc, int32_t scaleFactor, int32_t shift, int32_t *__restrict__ pDst, uint32_t blockSize);

void plp_scale_i16(const int16_t *__restrict__ pSrc, int16_t scaleFactor, int32_t shift, int16_t *__restrict__ pDst, uint32_t blockSize);

void plp_scale_i16s_rv32im(const int16_t *__restrict__ pSrc, int16_t scaleFactor, int32_t shift, int16_t *__restrict__ pDst, uint32_t blockSize);

void plp_scale_i16s_xpulpv2(const int16_t *__restrict__ pSrc, int16_t scaleFactor, int32_t shift, int16_t *__restrict__ pDst, uint32_t blockSize);

void plp_scale_i8(const int8_t *__restrict__ pSrc, int8_t scaleFactor, int32_t shift, int8_t *__restrict__ pDst, uint32_t blockSize);

void plp_scale_i8s_rv32im(const int8_t *__restrict__ pSrc, int8_t scaleFactor, int32_t shift, int8_t *__restrict__ pDst, uint32_t blockSize);

void plp_scale_i8s_xpulpv2(const int8_t *__restrict__ pSrc, int8_t scaleFactor, int32_t shift, int8_t *__restrict__ pDst, uint32_t blockSize);

void plp_scale_f32(const float32_t *__restrict__ pSrc, float32_t scaleFactor, float32_t *__restrict__ pDst, uint32_t blockSize);

void plp_scale_f32s_xpulpv2(const float32_t *__restrict__ pSrc, float32_t scaleFactor, float32_t *__restrict__ pDst, uint32_t blockSize);

/*
 * plp_fill_i32 family -- fill an int32_t buffer with a value.
 *
 *   value     : int32_t scalar passed by value
 *   pDst      : int32_t output buffer
 *   blockSize : uint32_t size argument
 *
 * NOTE(review): presumably writes `value` to blockSize consecutive elements
 * of pDst -- confirm.
 */
void plp_fill_i32(int32_t value, int32_t *__restrict__ pDst, uint32_t blockSize);

void plp_fill_i32s_rv32im(int32_t value, int32_t *__restrict__ pDst, uint32_t blockSize);

void plp_fill_i32s_xpulpv2(int32_t value, int32_t *__restrict__ pDst, uint32_t blockSize);

/*
 * plp_copy_* family -- buffer copy for int32_t and float32_t data.
 *
 * Common signature: (pSrc, pDst, blockSize). Both pointers are
 * restrict-qualified, so the buffers must not overlap.
 *
 * NOTE(review): pSrc is declared WITHOUT const in every variant, unlike the
 * source pointers of most other functions in this header. If the
 * implementations never write through pSrc, const-qualify it here and in the
 * definitions together.
 */
void plp_copy_i32(int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t blockSize);

void plp_copy_i32s_rv32im(int32_t *__restrict__ pSrc,
                          int32_t *__restrict__ pDst,
                          uint32_t blockSize);

void plp_copy_i32s_xpulpv2(int32_t *__restrict__ pSrc,
                           int32_t *__restrict__ pDst,
                           uint32_t blockSize);

void plp_copy_f32(float32_t *__restrict__ pSrc, float32_t *__restrict__ pDst, uint32_t blockSize);

void plp_copy_f32s_xpulpv2(float32_t *__restrict__ pSrc,
                           float32_t *__restrict__ pDst,
                           uint32_t blockSize);

/* Unlike most f32 functions in this header, the f32 copy also declares an rv32im kernel. */
void plp_copy_f32s_rv32im(float32_t *__restrict__ pSrc,
                           float32_t *__restrict__ pDst,
                           uint32_t blockSize);

/*
 * plp_mean_* family -- mean of a buffer, for f32, i32, i16 and i8 data.
 *
 * Common signature: (pSrc, blockSize, pRes).
 *   pSrc      : read-only input buffer
 *   blockSize : uint32_t size argument
 *   pRes      : output pointer of the SAME type as the input elements
 *
 * The f32 variants are spelled with plain `float` rather than float32_t.
 *
 * NOTE(review): pRes presumably receives a single scalar; rounding of the
 * integer mean and behaviour for blockSize == 0 are not visible here --
 * confirm.
 */
void plp_mean_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);

void plp_mean_f32s_xpulpv2(const float *__restrict__ pSrc,
                           uint32_t blockSize,
                           float *__restrict__ pRes);

void plp_mean_i32(const int32_t *__restrict__ pSrc, uint32_t blockSize, int32_t *__restrict__ pRes);

void plp_mean_i32s_rv32im(const int32_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          int32_t *__restrict__ pRes);

void plp_mean_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
                           uint32_t blockSize,
                           int32_t *__restrict__ pRes);

void plp_mean_i16(const int16_t *__restrict__ pSrc, uint32_t blockSize, int16_t *__restrict__ pRes);

void plp_mean_i16s_rv32im(const int16_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          int16_t *__restrict__ pRes);

void plp_mean_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
                           uint32_t blockSize,
                           int16_t *__restrict__ pRes);

void plp_mean_i8(const int8_t *__restrict__ pSrc, uint32_t blockSize, int8_t *__restrict__ pRes);

void plp_mean_i8s_rv32im(const int8_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         int8_t *__restrict__ pRes);

void plp_mean_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          int8_t *__restrict__ pRes);

/*
 * plp_max_* family -- maximum of a buffer, for f32, i32, i16 and i8 data.
 *
 * Common signature: (pSrc, blockSize, pRes); pRes has the same type as the
 * input elements. Only the value is returned -- no index output is declared.
 *
 * NOTE(review): pRes presumably receives a single scalar; behaviour for
 * blockSize == 0 is not visible here -- confirm.
 */
void plp_max_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);

void plp_max_f32s_xpulpv2(const float *__restrict__ pSrc,
                          uint32_t blockSize,
                          float *__restrict__ pRes);

void plp_max_i32(const int32_t *__restrict__ pSrc, uint32_t blockSize, int32_t *__restrict__ pRes);

void plp_max_i32s_rv32im(const int32_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         int32_t *__restrict__ pRes);

void plp_max_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          int32_t *__restrict__ pRes);

void plp_max_i16(const int16_t *__restrict__ pSrc, uint32_t blockSize, int16_t *__restrict__ pRes);

void plp_max_i16s_rv32im(const int16_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         int16_t *__restrict__ pRes);

void plp_max_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          int16_t *__restrict__ pRes);

void plp_max_i8(const int8_t *__restrict__ pSrc, uint32_t blockSize, int8_t *__restrict__ pRes);

void plp_max_i8s_rv32im(const int8_t *__restrict__ pSrc,
                        uint32_t blockSize,
                        int8_t *__restrict__ pRes);

void plp_max_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         int8_t *__restrict__ pRes);

/*
 * plp_min_* family -- minimum of a buffer, for f32, i32, i16 and i8 data.
 *
 * Common signature: (pSrc, blockSize, pRes); pRes has the same type as the
 * input elements. Only the value is returned -- no index output is declared.
 *
 * NOTE(review): pRes presumably receives a single scalar; behaviour for
 * blockSize == 0 is not visible here -- confirm.
 */
void plp_min_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);

void plp_min_f32s_xpulpv2(const float *__restrict__ pSrc,
                          uint32_t blockSize,
                          float *__restrict__ pRes);

void plp_min_i32(const int32_t *__restrict__ pSrc, uint32_t blockSize, int32_t *__restrict__ pRes);

void plp_min_i32s_rv32im(const int32_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         int32_t *__restrict__ pRes);

void plp_min_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          int32_t *__restrict__ pRes);

void plp_min_i16(const int16_t *__restrict__ pSrc, uint32_t blockSize, int16_t *__restrict__ pRes);

void plp_min_i16s_rv32im(const int16_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         int16_t *__restrict__ pRes);

void plp_min_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          int16_t *__restrict__ pRes);

void plp_min_i8(const int8_t *__restrict__ pSrc, uint32_t blockSize, int8_t *__restrict__ pRes);

void plp_min_i8s_rv32im(const int8_t *__restrict__ pSrc,
                        uint32_t blockSize,
                        int8_t *__restrict__ pRes);

void plp_min_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         int8_t *__restrict__ pRes);

/*
 * plp_power_f32 family -- "power" of a float buffer, parallel and single-core.
 *
 * NOTE(review): "power" presumably means the sum of squares of the input
 * elements, written as one scalar to pRes -- confirm.
 */

/*
 * Parallel entry point: (pSrc, blockSize, nPE, pRes).
 * NOTE(review): nPE is presumably the number of cores to use -- confirm.
 */
void plp_power_f32_parallel(const float32_t *__restrict__ pSrc,
                            uint32_t blockSize,
                            uint32_t nPE,
                            float32_t *__restrict__ pRes);

/* Parallel kernel; S is an untyped argument pointer (layout not visible here). */
void plp_power_f32p_xpulpv2(void* S);

/* Single-core declarations: (pSrc, blockSize, pRes), spelled with plain `float`. */
void plp_power_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);

void plp_power_f32s_xpulpv2(const float *__restrict__ pSrc,
                           uint32_t blockSize,
                           float *__restrict__ pRes);

void plp_power_f32s_rv32im(const float *__restrict__ pSrc,
                            uint32_t blockSize,
                            float *__restrict__ pRes);

/*
 * plp_power_i32 / _i16 / _i8 families -- integer "power" of a buffer.
 *
 * Common signature: (pSrc, blockSize, pRes).
 * The result pointer is int32_t for ALL input widths, including i16 and i8.
 *
 * NOTE(review): presumably the sum of squares accumulated in 32 bits; overflow
 * behaviour for large inputs or block sizes is not visible here -- confirm.
 */
void plp_power_i32(const int32_t *__restrict__ pSrc,
                   uint32_t blockSize,
                   int32_t *__restrict__ pRes);

void plp_power_i32s_rv32im(const int32_t *__restrict__ pSrc,
                           uint32_t blockSize,
                           int32_t *__restrict__ pRes);

void plp_power_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
                            uint32_t blockSize,
                            int32_t *__restrict__ pRes);

void plp_power_i16(const int16_t *__restrict__ pSrc,
                   uint32_t blockSize,
                   int32_t *__restrict__ pRes);

void plp_power_i16s_rv32im(const int16_t *__restrict__ pSrc,
                           uint32_t blockSize,
                           int32_t *__restrict__ pRes);

void plp_power_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
                            uint32_t blockSize,
                            int32_t *__restrict__ pRes);

void plp_power_i8(const int8_t *__restrict__ pSrc, uint32_t blockSize, int32_t *__restrict__ pRes);

void plp_power_i8s_rv32im(const int8_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          int32_t *__restrict__ pRes);

void plp_power_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
                           uint32_t blockSize,
                           int32_t *__restrict__ pRes);

/*
 * plp_power_q32 / _q16 / _q8 families -- fixed-point "power" of a buffer.
 *
 * Single-core signature: (pSrc, blockSize, fracBits, pRes).
 * The result pointer is int32_t for ALL input widths, including q16 and q8.
 *
 * NOTE(review): fracBits is presumably the number of fractional bits of the
 * fixed-point format; "power" presumably means sum of squares -- confirm.
 */

/*
 * Parallel q32 entry point: same arguments plus nPE before pRes.
 * NOTE(review): nPE is presumably the number of cores to use -- confirm.
 * Only q32 has a parallel declaration in this section.
 */
void plp_power_q32_parallel(const int32_t *__restrict__ pSrc,
                            uint32_t blockSize,
                            uint32_t fracBits,
                            uint32_t nPE,
                            int32_t *__restrict__ pRes);

/* Parallel kernel; S is an untyped argument pointer (layout not visible here). */
void plp_power_q32p_xpulpv2(void *S);

void plp_power_q32(const int32_t *__restrict__ pSrc,
                   uint32_t blockSize,
                   uint32_t fracBits,
                   int32_t *__restrict__ pRes);

void plp_power_q32s_rv32im(const int32_t *__restrict__ pSrc,
                           uint32_t blockSize,
                           uint32_t fracBits,
                           int32_t *__restrict__ pRes);

void plp_power_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
                            uint32_t blockSize,
                            uint32_t fracBits,
                            int32_t *__restrict__ pRes);

void plp_power_q16(const int16_t *__restrict__ pSrc,
                   uint32_t blockSize,
                   uint32_t fracBits,
                   int32_t *__restrict__ pRes);

void plp_power_q16s_rv32im(const int16_t *__restrict__ pSrc,
                           uint32_t blockSize,
                           uint32_t fracBits,
                           int32_t *__restrict__ pRes);

void plp_power_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
                            uint32_t blockSize,
                            uint32_t fracBits,
                            int32_t *__restrict__ pRes);

void plp_power_q8(const int8_t *__restrict__ pSrc,
                  uint32_t blockSize,
                  uint32_t fracBits,
                  int32_t *__restrict__ pRes);

void plp_power_q8s_rv32im(const int8_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          uint32_t fracBits,
                          int32_t *__restrict__ pRes);

void plp_power_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
                           uint32_t blockSize,
                           uint32_t fracBits,
                           int32_t *__restrict__ pRes);

/*
 * plp_var_* family -- variance of a buffer, for f32, q32, q16 and q8 data.
 *
 * Float signature:       (pSrc, blockSize, pRes)
 * Fixed-point signature: (pSrc, blockSize, fracBits, pRes)
 *
 * pRes has the SAME width as the input elements (int16_t for q16, int8_t for
 * q8). The f32 variant has no rv32im declaration.
 *
 * NOTE(review): fracBits is presumably the number of fractional bits; whether
 * the variance is normalised by N or N-1 is not visible here -- confirm.
 */
void plp_var_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);

void plp_var_f32s_xpulpv2(const float *__restrict__ pSrc,
                         uint32_t blockSize,
                         float *__restrict__ pRes);

void plp_var_q32(const int32_t *__restrict__ pSrc,
                 uint32_t blockSize,
                 uint32_t fracBits,
                 int32_t *__restrict__ pRes);

void plp_var_q32s_rv32im(const int32_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         uint32_t fracBits,
                         int32_t *__restrict__ pRes);

void plp_var_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          uint32_t fracBits,
                          int32_t *__restrict__ pRes);

void plp_var_q16(const int16_t *__restrict__ pSrc,
                 uint32_t blockSize,
                 uint32_t fracBits,
                 int16_t *__restrict__ pRes);

void plp_var_q16s_rv32im(const int16_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         uint32_t fracBits,
                         int16_t *__restrict__ pRes);

void plp_var_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          uint32_t fracBits,
                          int16_t *__restrict__ pRes);

void plp_var_q8(const int8_t *__restrict__ pSrc,
                uint32_t blockSize,
                uint32_t fracBits,
                int8_t *__restrict__ pRes);

void plp_var_q8s_rv32im(const int8_t *__restrict__ pSrc,
                        uint32_t blockSize,
                        uint32_t fracBits,
                        int8_t *__restrict__ pRes);

void plp_var_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         uint32_t fracBits,
                         int8_t *__restrict__ pRes);

/*
 * plp_std_* family -- standard deviation of a buffer, for f32, q32, q16, q8.
 *
 * Float signature:       (pSrc, blockSize, pRes)
 * Fixed-point signature: (pSrc, blockSize, fracBits, pRes)
 *
 * pRes has the same width as the input elements. The f32 variant has no
 * rv32im declaration.
 *
 * NOTE(review): fracBits is presumably the number of fractional bits;
 * normalisation (N vs. N-1) is not visible here -- confirm.
 */
void plp_std_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);

void plp_std_f32s_xpulpv2(const float *__restrict__ pSrc,
                         uint32_t blockSize,
                         float *__restrict__ pRes);

void plp_std_q32(const int32_t *__restrict__ pSrc,
                 uint32_t blockSize,
                 uint32_t fracBits,
                 int32_t *__restrict__ pRes);

void plp_std_q32s_rv32im(const int32_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         uint32_t fracBits,
                         int32_t *__restrict__ pRes);

void plp_std_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          uint32_t fracBits,
                          int32_t *__restrict__ pRes);

void plp_std_q16(const int16_t *__restrict__ pSrc,
                 uint32_t blockSize,
                 uint32_t fracBits,
                 int16_t *__restrict__ pRes);

void plp_std_q16s_rv32im(const int16_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         uint32_t fracBits,
                         int16_t *__restrict__ pRes);

void plp_std_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          uint32_t fracBits,
                          int16_t *__restrict__ pRes);

void plp_std_q8(const int8_t *__restrict__ pSrc,
                uint32_t blockSize,
                uint32_t fracBits,
                int8_t *__restrict__ pRes);

void plp_std_q8s_rv32im(const int8_t *__restrict__ pSrc,
                        uint32_t blockSize,
                        uint32_t fracBits,
                        int8_t *__restrict__ pRes);

void plp_std_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         uint32_t fracBits,
                         int8_t *__restrict__ pRes);
/*
 * plp_rms_* family -- root mean square of a buffer, for f32, q32, q16, q8.
 *
 * Float signature:       (pSrc, blockSize, pRes)
 * Fixed-point signature: (pSrc, blockSize, fracBits, pRes)
 *
 * pRes has the same width as the input elements. The f32 variant has no
 * rv32im declaration.
 *
 * NOTE(review): fracBits is presumably the number of fractional bits of the
 * fixed-point format -- confirm.
 */
void plp_rms_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);

void plp_rms_f32s_xpulpv2(const float *__restrict__ pSrc,
                         uint32_t blockSize,
                         float *__restrict__ pRes);

void plp_rms_q32(const int32_t *__restrict__ pSrc,
                 uint32_t blockSize,
                 uint32_t fracBits,
                 int32_t *__restrict__ pRes);

void plp_rms_q32s_rv32im(const int32_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         uint32_t fracBits,
                         int32_t *__restrict__ pRes);

void plp_rms_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          uint32_t fracBits,
                          int32_t *__restrict__ pRes);

void plp_rms_q16(const int16_t *__restrict__ pSrc,
                 uint32_t blockSize,
                 uint32_t fracBits,
                 int16_t *__restrict__ pRes);

void plp_rms_q16s_rv32im(const int16_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         uint32_t fracBits,
                         int16_t *__restrict__ pRes);

void plp_rms_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
                          uint32_t blockSize,
                          uint32_t fracBits,
                          int16_t *__restrict__ pRes);

void plp_rms_q8(const int8_t *__restrict__ pSrc,
                uint32_t blockSize,
                uint32_t fracBits,
                int8_t *__restrict__ pRes);

void plp_rms_q8s_rv32im(const int8_t *__restrict__ pSrc,
                        uint32_t blockSize,
                        uint32_t fracBits,
                        int8_t *__restrict__ pRes);

void plp_rms_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
                         uint32_t blockSize,
                         uint32_t fracBits,
                         int8_t *__restrict__ pRes);

/*
 * plp_sqrt_* family -- square root, for q32, q16 and f32 data.
 *
 * These declarations take NO blockSize argument: input and output are passed
 * as pointers (pSrc read-only, pRes writable) of the same type.
 *
 * Fixed-point signature: (pSrc, fracBits, pRes)
 * Float signature:       (pSrc, pRes)
 *
 * No q8 variant is declared in this section.
 *
 * NOTE(review): presumably operates on a single value; fracBits is presumably
 * the number of fractional bits; handling of negative inputs is not visible
 * here -- confirm.
 */
void plp_sqrt_q32(const int32_t *__restrict__ pSrc,
                  const uint32_t fracBits,
                  int32_t *__restrict__ pRes);

void plp_sqrt_q32s_rv32im(const int32_t *__restrict__ pSrc,
                          const uint32_t fracBits,
                          int32_t *__restrict__ pRes);

void plp_sqrt_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
                           const uint32_t fracBits,
                           int32_t *__restrict__ pRes);

void plp_sqrt_q16(const int16_t *__restrict__ pSrc,
                  const uint32_t fracBits,
                  int16_t *__restrict__ pRes);

void plp_sqrt_q16s_rv32im(const int16_t *__restrict__ pSrc,
                          const uint32_t fracBits,
                          int16_t *__restrict__ pRes);

void plp_sqrt_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
                           const uint32_t fracBits,
                           int16_t *__restrict__ pRes);


/* Float variants; here both an rv32im and an xpulpv2 kernel are declared. */
void plp_sqrt_f32(const float *__restrict__ pSrc, 
                  float *__restrict__ pRes);

void plp_sqrt_f32s_rv32im(const float *__restrict__ pSrc, float *__restrict__ pRes);

void plp_sqrt_f32s_xpulpv2(const float *__restrict__ pSrc,
                           float *__restrict__ pRes);


/*
 * Fast-math constants.
 *
 * NOTE(review): presumably used by the table-based plp_sin_* / plp_cos_*
 * implementations (table size, index shift and table spacing) -- confirm
 * against the sources that consume them.
 */
#define FAST_MATH_TABLE_SIZE 512      /* 512 == 2^9 */
#define FAST_MATH_Q32_SHIFT (32 - 10) /* evaluates to 22 */
#define FAST_MATH_Q16_SHIFT (16 - 10) /* evaluates to 6 */
#define CONTROLLER_Q32_SHIFT (32 - 9) /* evaluates to 23 */
#define TABLE_SPACING_Q32 0x400000    /* 2^22, i.e. 1 << FAST_MATH_Q32_SHIFT */
#define TABLE_SPACING_Q16 0x80        /* 2^7 (note: not 1 << FAST_MATH_Q16_SHIFT, which is 2^6) */

/*
 * plp_cos_* family -- cosine for q32, q16 and f32 values.
 *
 * Unlike the buffer-based functions in this header, these take one value by
 * value and return the result (same type as the argument).
 * The f32 variant has no rv32im declaration.
 *
 * NOTE(review): the scaling of the fixed-point argument (how x maps to an
 * angle) and of the result is not visible from the prototypes -- confirm.
 */
int32_t plp_cos_q32(int32_t x);

int32_t plp_cos_q32s_rv32im(int32_t x);

int32_t plp_cos_q32s_xpulpv2(int32_t x);

int16_t plp_cos_q16(int16_t x);

int16_t plp_cos_q16s_rv32im(int16_t x);

int16_t plp_cos_q16s_xpulpv2(int16_t x);

float32_t plp_cos_f32(float32_t x);

float32_t plp_cos_f32s_xpulpv2(float32_t x);

/*
 * plp_sin_* family -- sine for q32, q16 and f32 values.
 *
 * Each function takes one value by value and returns the result (same type
 * as the argument). The f32 variant has no rv32im declaration.
 *
 * NOTE(review): the scaling of the fixed-point argument (how x maps to an
 * angle) and of the result is not visible from the prototypes -- confirm.
 */
int32_t plp_sin_q32(int32_t x);

int32_t plp_sin_q32s_rv32im(int32_t x);

int32_t plp_sin_q32s_xpulpv2(int32_t x);

int16_t plp_sin_q16(int16_t x);

int16_t plp_sin_q16s_rv32im(int16_t x);

int16_t plp_sin_q16s_xpulpv2(int16_t x);

float32_t plp_sin_f32(float32_t x);

float32_t plp_sin_f32s_xpulpv2(float32_t x);

/*
 * plp_correlate_i32 family -- correlation of two int32_t sequences.
 *
 *   pSrcA, srcALen : first input buffer (read-only) and its uint32_t length
 *   pSrcB, srcBLen : second input buffer (read-only) and its uint32_t length
 *   pRes           : int32_t output buffer
 *
 * Only the xpulpv2 kernel declares its pointers restrict-qualified.
 *
 * NOTE(review): the required size of pRes (presumably srcALen + srcBLen - 1)
 * and any ordering constraint between srcALen and srcBLen are not visible
 * here -- confirm.
 */
void plp_correlate_i32(const int32_t *pSrcA,
                       const uint32_t srcALen,
                       const int32_t *pSrcB,
                       const uint32_t srcBLen,
                       int32_t *pRes);

void plp_correlate_i32s_rv32im(const int32_t *pSrcA,
                               const uint32_t srcALen,
                               const int32_t *pSrcB,
                               const uint32_t srcBLen,
                               int32_t *pRes);

void plp_correlate_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                const uint32_t srcALen,
                                const int32_t *__restrict__ pSrcB,
                                const uint32_t srcBLen,
                                int32_t *__restrict__ pRes);

/*
 * plp_correlate_i16 family -- correlation of two int16_t sequences.
 *
 *   pSrcA, srcALen : first input buffer (read-only) and its uint32_t length
 *   pSrcB, srcBLen : second input buffer (read-only) and its uint32_t length
 *   pRes           : int32_t output buffer (wider than the inputs)
 *
 * NOTE(review): the required size of pRes (presumably srcALen + srcBLen - 1)
 * is not visible here -- confirm.
 */
void plp_correlate_i16(const int16_t *pSrcA,
                       const uint32_t srcALen,
                       const int16_t *pSrcB,
                       const uint32_t srcBLen,
                       int32_t *pRes);

void plp_correlate_i16s_xpulpv2(const int16_t *pSrcA,
                                const uint32_t srcALen,
                                const int16_t *pSrcB,
                                const uint32_t srcBLen,
                                int32_t *pRes);

void plp_correlate_i16s_rv32im(const int16_t *pSrcA,
                               const uint32_t srcALen,
                               const int16_t *pSrcB,
                               const uint32_t srcBLen,
                               int32_t *pRes);

/*
 * plp_correlate_i8 family -- correlation of two int8_t sequences.
 *
 *   pSrcA, srcALen : first input buffer (read-only) and its uint32_t length
 *   pSrcB, srcBLen : second input buffer (read-only) and its uint32_t length
 *   pRes           : int32_t output buffer (wider than the inputs)
 *
 * NOTE(review): the required size of pRes is not visible here -- confirm.
 */
void plp_correlate_i8(const int8_t *pSrcA,
                      const uint32_t srcALen,
                      const int8_t *pSrcB,
                      const uint32_t srcBLen,
                      int32_t *pRes);

/*
 * Same signature as plp_correlate_i8.
 * NOTE(review): "valid" presumably restricts the output to positions where
 * the sequences fully overlap -- confirm. This section declares a "valid"
 * integer correlation only for i8.
 */
void plp_correlate_valid_i8(const int8_t *pSrcA,
                            const uint32_t srcALen,
                            const int8_t *pSrcB,
                            const uint32_t srcBLen,
                            int32_t *pRes);

void plp_correlate_i8s_xpulpv2(const int8_t *pSrcA,
                               const uint32_t srcALen,
                               const int8_t *pSrcB,
                               const uint32_t srcBLen,
                               int32_t *pRes);

void plp_correlate_i8s_rv32im(const int8_t *pSrcA,
                              const uint32_t srcALen,
                              const int8_t *pSrcB,
                              const uint32_t srcBLen,
                              int32_t *pRes);


/*
 * plp_correlate_q32 family -- fixed-point correlation of two int32_t sequences.
 *
 *   pSrcA, srcALen : first input buffer (read-only) and its uint32_t length
 *   pSrcB, srcBLen : second input buffer (read-only) and its uint32_t length
 *   fracBits       : uint32_t; presumably the number of fractional bits of
 *                    the fixed-point format -- confirm
 *   pRes           : int32_t output buffer
 *
 * Only the xpulpv2 kernel declares its pointers restrict-qualified.
 *
 * NOTE(review): the required size of pRes (presumably srcALen + srcBLen - 1)
 * is not visible here -- confirm.
 */
void plp_correlate_q32(const int32_t *pSrcA, const uint32_t srcALen,
                       const int32_t *pSrcB, const uint32_t srcBLen,
                       const uint32_t fracBits,
                       int32_t *pRes);

void plp_correlate_q32s_rv32im(const int32_t *pSrcA, const uint32_t srcALen,
                               const int32_t *pSrcB, const uint32_t srcBLen,
                               const uint32_t fracBits,
                               int32_t *pRes);

void plp_correlate_q32s_xpulpv2(const int32_t *__restrict__ pSrcA, const uint32_t srcALen,
                                const int32_t *__restrict__ pSrcB, const uint32_t srcBLen,
                                const uint32_t fracBits,
                                int32_t *__restrict__ pRes);

/*
 * plp_correlate_q16 family -- fixed-point correlation of two int16_t sequences.
 *
 *   pSrcA, srcALen : first input buffer (read-only) and its uint32_t length
 *   pSrcB, srcBLen : second input buffer (read-only) and its uint32_t length
 *   fracBits       : uint32_t; presumably the number of fractional bits of
 *                    the fixed-point format -- confirm
 *   pRes           : int32_t output buffer (wider than the inputs)
 *
 * NOTE(review): the required size of pRes is not visible here -- confirm.
 */
void plp_correlate_q16(const int16_t *pSrcA, const uint32_t srcALen,
                       const int16_t *pSrcB, const uint32_t srcBLen,
                       const uint32_t fracBits,
                       int32_t *pRes);

void plp_correlate_q16s_xpulpv2(const int16_t *pSrcA, const uint32_t srcALen,
                                const int16_t *pSrcB, const uint32_t srcBLen,
                                const uint32_t fracBits,
                                int32_t *pRes);

void plp_correlate_q16s_rv32im(const int16_t *pSrcA, const uint32_t srcALen,
                               const int16_t *pSrcB, const uint32_t srcBLen,
                               const uint32_t fracBits,
                               int32_t *pRes);

/*
 * plp_correlate_q8 family -- fixed-point correlation of two int8_t sequences.
 *
 *   pSrcA, srcALen : first input buffer (read-only) and its uint32_t length
 *   pSrcB, srcBLen : second input buffer (read-only) and its uint32_t length
 *   fracBits       : uint32_t; presumably the number of fractional bits of
 *                    the fixed-point format -- confirm
 *   pRes           : int32_t output buffer (wider than the inputs)
 *
 * NOTE(review): the required size of pRes is not visible here -- confirm.
 */
void plp_correlate_q8(const int8_t *pSrcA, const uint32_t srcALen,
                      const int8_t *pSrcB, const uint32_t srcBLen,
                      const uint32_t fracBits,
                      int32_t *pRes);

/*
 * Same signature as plp_correlate_q8.
 * NOTE(review): "valid" presumably restricts the output to positions where
 * the sequences fully overlap -- confirm.
 */
void plp_correlate_valid_q8(const int8_t *pSrcA, const uint32_t srcALen,
                            const int8_t *pSrcB, const uint32_t srcBLen,
                            const uint32_t fracBits,
                            int32_t *pRes);

void plp_correlate_q8s_xpulpv2(const int8_t *pSrcA, const uint32_t srcALen,
                               const int8_t *pSrcB, const uint32_t srcBLen,
                               const uint32_t fracBits,
                               int32_t *pRes);

void plp_correlate_q8s_rv32im(const int8_t *pSrcA, const uint32_t srcALen,
                              const int8_t *pSrcB, const uint32_t srcBLen,
                              const uint32_t fracBits,
                              int32_t *pRes);


/*
 * plp_conv_i32 family -- convolution of two int32_t sequences.
 *
 *   pSrcA, srcALen : first input buffer (read-only) and its uint32_t length
 *   pSrcB, srcBLen : second input buffer (read-only) and its uint32_t length
 *   pRes           : int32_t output buffer
 *
 * Only the xpulpv2 kernels declare their pointers restrict-qualified.
 *
 * NOTE(review): the required size of pRes (presumably srcALen + srcBLen - 1
 * for the full convolution) is not visible here -- confirm.
 */
void plp_conv_i32(const int32_t *pSrcA,
                  const uint32_t srcALen,
                  const int32_t *pSrcB,
                  const uint32_t srcBLen,
                  int32_t *pRes);

/*
 * NOTE(review): "valid" presumably restricts the output to positions where
 * the sequences fully overlap -- confirm.
 */
void plp_conv_valid_i32(const int32_t *pSrcA,
                        const uint32_t srcALen,
                        const int32_t *pSrcB,
                        const uint32_t srcBLen,
                        int32_t *pRes);

void plp_conv_i32s_rv32im(const int32_t *pSrcA,
                          const uint32_t srcALen,
                          const int32_t *pSrcB,
                          const uint32_t srcBLen,
                          int32_t *pRes);

void plp_conv_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                           const uint32_t srcALen,
                           const int32_t *__restrict__ pSrcB,
                           const uint32_t srcBLen,
                           int32_t *__restrict__ pRes);

/* A "valid" kernel is declared for xpulpv2 only in this section. */
void plp_conv_valid_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                 const uint32_t srcALen,
                                 const int32_t *__restrict__ pSrcB,
                                 const uint32_t srcBLen,
                                 int32_t *__restrict__ pRes);

/*
 * plp_conv_i16 family -- convolution of two int16_t sequences.
 *
 *   pSrcA, srcALen : first input buffer (read-only) and its uint32_t length
 *   pSrcB, srcBLen : second input buffer (read-only) and its uint32_t length
 *   pRes           : int32_t output buffer (wider than the inputs)
 *
 * NOTE(review): the required size of pRes is not visible here -- confirm.
 */
void plp_conv_i16(const int16_t *pSrcA,
                  const uint32_t srcALen,
                  const int16_t *pSrcB,
                  const uint32_t srcBLen,
                  int32_t *pRes);

/*
 * NOTE(review): "valid" presumably restricts the output to positions where
 * the sequences fully overlap -- confirm.
 */
void plp_conv_valid_i16(const int16_t *pSrcA,
                        const uint32_t srcALen,
                        const int16_t *pSrcB,
                        const uint32_t srcBLen,
                        int32_t *pRes);

/*
 * NOTE(review): the meaning of "rep" is not visible from the prototype
 * (possibly a replicated/re-laid-out copy of pSrcA) -- confirm.
 */
void plp_conv_valid_rep_i16(const int16_t *pSrcA,
                            const uint32_t srcALen,
                            const int16_t *pSrcB,
                            const uint32_t srcBLen,
                            int32_t *pRes);

void plp_conv_i16s_xpulpv2(const int16_t *pSrcA,
                           const uint32_t srcALen,
                           const int16_t *pSrcB,
                           const uint32_t srcBLen,
                           int32_t *pRes);

void plp_conv_valid_i16s_xpulpv2(const int16_t *pSrcA,
                                 const uint32_t srcALen,
                                 const int16_t *pSrcB,
                                 const uint32_t srcBLen,
                                 int32_t *pRes);

/*
 * Takes one extra argument, srcAMem, that plp_conv_valid_rep_i16 does not.
 * NOTE(review): srcAMem presumably describes the memory footprint/stride of
 * the pSrcA buffer -- confirm units and meaning against the implementation.
 */
void plp_conv_valid_rep_i16s_xpulpv2(const int16_t *pSrcA,
                                     const uint32_t srcALen,
                                     const uint32_t srcAMem,
                                     const int16_t *pSrcB,
                                     const uint32_t srcBLen,
                                     int32_t *pRes);

void plp_conv_i16s_rv32im(const int16_t *pSrcA,
                          const uint32_t srcALen,
                          const int16_t *pSrcB,
                          const uint32_t srcBLen,
                          int32_t *pRes);

/*
 * plp_conv_i8 family -- convolution of two int8_t sequences.
 *
 *   pSrcA, srcALen : first input buffer (read-only) and its uint32_t length
 *   pSrcB, srcBLen : second input buffer (read-only) and its uint32_t length
 *   pRes           : int32_t output buffer (wider than the inputs)
 *
 * NOTE(review): the required size of pRes is not visible here -- confirm.
 */
void plp_conv_i8(const int8_t *pSrcA,
                 const uint32_t srcALen,
                 const int8_t *pSrcB,
                 const uint32_t srcBLen,
                 int32_t *pRes);

/*
 * NOTE(review): "valid" presumably restricts the output to positions where
 * the sequences fully overlap -- confirm.
 */
void plp_conv_valid_i8(const int8_t *pSrcA,
                       const uint32_t srcALen,
                       const int8_t *pSrcB,
                       const uint32_t srcBLen,
                       int32_t *pRes);

/*
 * NOTE(review): the meaning of "rep" is not visible from the prototype
 * (possibly a replicated/re-laid-out copy of pSrcA) -- confirm.
 */
void plp_conv_valid_rep_i8(const int8_t *pSrcA,
                           const uint32_t srcALen,
                           const int8_t *pSrcB,
                           const uint32_t srcBLen,
                           int32_t *pRes);

void plp_conv_i8s_xpulpv2(const int8_t *pSrcA,
                          const uint32_t srcALen,
                          const int8_t *pSrcB,
                          const uint32_t srcBLen,
                          int32_t *pRes);

void plp_conv_valid_i8s_xpulpv2(const int8_t *pSrcA,
                                const uint32_t srcALen,
                                const int8_t *pSrcB,
                                const uint32_t srcBLen,
                                int32_t *pRes);

/*
 * Takes one extra argument, srcAMem, that plp_conv_valid_rep_i8 does not.
 * NOTE(review): srcAMem presumably describes the memory footprint/stride of
 * the pSrcA buffer -- confirm units and meaning against the implementation.
 */
void plp_conv_valid_rep_i8s_xpulpv2(const int8_t *pSrcA,
                                    const uint32_t srcALen,
                                    const uint32_t srcAMem,
                                    const int8_t *pSrcB,
                                    const uint32_t srcBLen,
                                    int32_t *pRes);

void plp_conv_i8s_rv32im(const int8_t *pSrcA,
                         const uint32_t srcALen,
                         const int8_t *pSrcB,
                         const uint32_t srcBLen,
                         int32_t *pRes);

/*
 * Parallel integer convolution: *_parallel entry points, their *p_xpulpv2
 * kernels, and the OLA helpers. All results are written as int32_t.
 * Only prototypes are visible here, so the parameter roles below are inferred
 * from names and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB      read-only input vectors
 *   srcALen, srcBLen  presumably the number of elements in pSrcA / pSrcB
 *   nPE               presumably the number of processing elements (cores) to use
 *   pRes              writable result buffer; required length is not stated here
 *   task_args         opaque pointer; presumably a per-task instance structure
 *                     (e.g. plp_conv_instance_*) -- TODO confirm
 */
/** @brief Parallel int32_t convolution entry point. */
void plp_conv_i32_parallel(const int32_t *pSrcA,
                           const uint32_t srcALen,
                           const int32_t *pSrcB,
                           const uint32_t srcBLen,
                           const uint8_t nPE,
                           int32_t *pRes);

/** @brief Parallel int32_t convolution kernel (xpulpv2 variant) taking an opaque argument pointer. */
void plp_conv_i32p_xpulpv2(void *task_args);

/** @brief Parallel int16_t convolution entry point. */
void plp_conv_i16_parallel(const int16_t *pSrcA,
                           const uint32_t srcALen,
                           const int16_t *pSrcB,
                           const uint32_t srcBLen,
                           const uint8_t nPE,
                           int32_t *pRes);
/** @brief Parallel int16_t convolution kernel (xpulpv2 variant) taking an opaque argument pointer. */
void plp_conv_i16p_xpulpv2(void *task_args);

/** @brief Parallel int8_t convolution entry point. */
void plp_conv_i8_parallel(const int8_t *pSrcA,
                          const uint32_t srcALen,
                          const int8_t *pSrcB,
                          const uint32_t srcBLen,
                          const uint8_t nPE,
                          int32_t *pRes);
/** @brief Parallel int8_t convolution kernel (xpulpv2 variant) taking an opaque argument pointer. */
void plp_conv_i8p_xpulpv2(void *task_args);

/**
 * @brief Helper for parallel convolution operating on an int32_t results buffer.
 *
 * NOTE(review): "OLA" presumably stands for overlap-add, i.e. combining the
 * per-core partial results held in resultsBuffer -- TODO confirm.
 * NOTE(review): nPE is uint32_t here but uint8_t in the *_parallel entry
 * points above -- confirm that this difference is intentional.
 */
void plp_conv_parallel_OLA(uint32_t nPE,
                           uint32_t srcALen,
                           uint32_t srcBLen,
                           int32_t *resultsBuffer);

/** @brief Kernel counterpart of plp_conv_parallel_OLA taking an opaque argument pointer. */
void plp_conv_parallel_OLA_kernel(void *task_args);

/*
 * Integer matrix multiplication, non-parallel variants.
 * Inputs are int32_t / int16_t / int8_t; the output is int32_t in every
 * prototype of this group.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__)
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO
 *   pDstC         writable output matrix (declared __restrict__)
 * Functions without an ISA suffix are presumably the entry points that select
 * one of the suffixed kernels -- TODO confirm.
 */
/** @brief int32_t matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_i32(const int32_t *__restrict__ pSrcA,
                      const int32_t *__restrict__ pSrcB,
                      uint32_t M,
                      uint32_t N,
                      uint32_t O,
                      int32_t *__restrict__ pDstC);

/** @brief int32_t matrix multiplication; the name suffix indicates the rv32im kernel variant. */
void plp_mat_mult_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                              const int32_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t O,
                              int32_t *__restrict__ pDstC);

/** @brief int32_t matrix multiplication; the name suffix indicates the xpulpv2 kernel variant. */
void plp_mat_mult_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                               const int32_t *__restrict__ pSrcB,
                               uint32_t M,
                               uint32_t N,
                               uint32_t O,
                               int32_t *__restrict__ pDstC);

/** @brief int16_t matrix multiplication with int32_t output, un-suffixed entry point. */
void plp_mat_mult_i16(const int16_t *__restrict__ pSrcA,
                      const int16_t *__restrict__ pSrcB,
                      uint32_t M,
                      uint32_t N,
                      uint32_t O,
                      int32_t *__restrict__ pDstC);

/** @brief int16_t matrix multiplication with int32_t output; rv32im kernel variant by name. */
void plp_mat_mult_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                              const int16_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t O,
                              int32_t *__restrict__ pDstC);

/** @brief int16_t matrix multiplication with int32_t output; xpulpv2 kernel variant by name. */
void plp_mat_mult_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                               const int16_t *__restrict__ pSrcB,
                               uint32_t M,
                               uint32_t N,
                               uint32_t O,
                               int32_t *__restrict__ pDstC);

/** @brief int8_t matrix multiplication with int32_t output, un-suffixed entry point. */
void plp_mat_mult_i8(const int8_t *__restrict__ pSrcA,
                     const int8_t *__restrict__ pSrcB,
                     uint32_t M,
                     uint32_t N,
                     uint32_t O,
                     int32_t *__restrict__ pDstC);

/** @brief int8_t matrix multiplication with int32_t output; rv32im kernel variant by name. */
void plp_mat_mult_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                             const int8_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             uint32_t O,
                             int32_t *__restrict__ pDstC);

/** @brief int8_t matrix multiplication with int32_t output; xpulpv2 kernel variant by name. */
void plp_mat_mult_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                              const int8_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t O,
                              int32_t *__restrict__ pDstC);

/*
 * Integer matrix multiplication, parallel variants (int32_t output throughout).
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__)
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO
 *   nPE           presumably the number of processing elements (cores) to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably a plp_mat_mult_instance_* structure
 *                 -- TODO confirm
 */
/** @brief Parallel int32_t matrix multiplication entry point. */
void plp_mat_mult_i32_parallel(const int32_t *__restrict__ pSrcA,
                               const int32_t *__restrict__ pSrcB,
                               uint32_t M,
                               uint32_t N,
                               uint32_t O,
                               uint32_t nPE,
                               int32_t *__restrict__ pDstC);

/** @brief Parallel int32_t matrix multiplication kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_i32p_xpulpv2(void *args);

/** @brief Parallel int16_t matrix multiplication entry point (int32_t output). */
void plp_mat_mult_i16_parallel(const int16_t *__restrict__ pSrcA,
                               const int16_t *__restrict__ pSrcB,
                               uint32_t M,
                               uint32_t N,
                               uint32_t O,
                               uint32_t nPE,
                               int32_t *__restrict__ pDstC);

/** @brief Parallel int16_t matrix multiplication kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_i16p_xpulpv2(void *args);

/** @brief Parallel int8_t matrix multiplication entry point (int32_t output). */
void plp_mat_mult_i8_parallel(const int8_t *__restrict__ pSrcA,
                              const int8_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t O,
                              uint32_t nPE,
                              int32_t *__restrict__ pDstC);

/*
 * Single-precision float matrix multiplication (float inputs and output).
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__)
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief float matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_f32(const float *__restrict__ pSrcA,
                      const float *__restrict__ pSrcB,
                      uint32_t M,
                      uint32_t N,
                      uint32_t O,
                      float *__restrict__ pDstC);

/** @brief float matrix multiplication; the name suffix indicates the xpulpv2 kernel variant. */
void plp_mat_mult_f32s_xpulpv2(const float *__restrict__ pSrcA,
                               const float *__restrict__ pSrcB,
                               uint32_t M,
                               uint32_t N,
                               uint32_t O,
                               float *__restrict__ pDstC);

/** @brief Parallel float matrix multiplication entry point. */
void plp_mat_mult_f32_parallel(const float *__restrict__ pSrcA,
                               const float *__restrict__ pSrcB,
                               uint32_t M,
                               uint32_t N,
                               uint32_t O,
                               uint32_t nPE,
                               float *__restrict__ pDstC);

/** @brief Parallel float matrix multiplication kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_f32p_xpulpv2(void *args);

/**
 * @brief Parallel int8_t matrix multiplication kernel (xpulpv2 variant by name).
 *
 * @param args opaque pointer; presumably a plp_mat_mult_instance_i8 structure
 *             prepared by plp_mat_mult_i8_parallel -- TODO confirm.
 */
void plp_mat_mult_i8p_xpulpv2(void *args);

/*
 * 32-bit fixed-point (q32) matrix multiplication; int32_t inputs and output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__)
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO
 *   shift         presumably the right-shift applied to each result for
 *                 fixed-point scaling -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief q32 matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_q32(const int32_t *__restrict__ pSrcA,
                      const int32_t *__restrict__ pSrcB,
                      uint32_t M,
                      uint32_t N,
                      uint32_t O,
                      uint32_t shift,
                      int32_t *__restrict__ pDstC);

/** @brief Parallel q32 matrix multiplication entry point. */
void plp_mat_mult_q32_parallel(const int32_t *__restrict__ pSrcA,
                               const int32_t *__restrict__ pSrcB,
                               uint32_t M,
                               uint32_t N,
                               uint32_t O,
                               uint32_t shift,
                               uint32_t nPE,
                               int32_t *__restrict__ pDstC);

/** @brief q32 matrix multiplication; the name suffix indicates the rv32im kernel variant. */
void plp_mat_mult_q32s_rv32im(const int32_t *__restrict__ pSrcA,
                              const int32_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t O,
                              uint32_t shift,
                              int32_t *__restrict__ pDstC);

/** @brief q32 matrix multiplication; the name suffix indicates the xpulpv2 kernel variant. */
void plp_mat_mult_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                               const int32_t *__restrict__ pSrcB,
                               uint32_t M,
                               uint32_t N,
                               uint32_t O,
                               uint32_t shift,
                               int32_t *__restrict__ pDstC);

/** @brief Parallel q32 matrix multiplication kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_q32p_xpulpv2(void *args);

/*
 * 16-bit fixed-point (q16) matrix multiplication; int16_t inputs and output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__)
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO
 *   shift         presumably the right-shift applied to each result for
 *                 fixed-point scaling -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief q16 matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_q16(const int16_t *__restrict__ pSrcA,
                      const int16_t *__restrict__ pSrcB,
                      uint32_t M,
                      uint32_t N,
                      uint32_t O,
                      uint32_t shift,
                      int16_t *__restrict__ pDstC);

/** @brief Parallel q16 matrix multiplication entry point. */
void plp_mat_mult_q16_parallel(const int16_t *__restrict__ pSrcA,
                               const int16_t *__restrict__ pSrcB,
                               uint32_t M,
                               uint32_t N,
                               uint32_t O,
                               uint32_t shift,
                               uint32_t nPE,
                               int16_t *__restrict__ pDstC);

/** @brief q16 matrix multiplication; the name suffix indicates the rv32im kernel variant. */
void plp_mat_mult_q16s_rv32im(const int16_t *__restrict__ pSrcA,
                              const int16_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t O,
                              uint32_t shift,
                              int16_t *__restrict__ pDstC);

/** @brief q16 matrix multiplication; the name suffix indicates the xpulpv2 kernel variant. */
void plp_mat_mult_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                               const int16_t *__restrict__ pSrcB,
                               uint32_t M,
                               uint32_t N,
                               uint32_t O,
                               uint32_t shift,
                               int16_t *__restrict__ pDstC);

/** @brief Parallel q16 matrix multiplication kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_q16p_xpulpv2(void *args);

/*
 * 8-bit fixed-point (q8) matrix multiplication; int8_t inputs and output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__)
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO
 *   shift         presumably the right-shift applied to each result for
 *                 fixed-point scaling -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief q8 matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_q8(const int8_t *__restrict__ pSrcA,
                     const int8_t *__restrict__ pSrcB,
                     uint32_t M,
                     uint32_t N,
                     uint32_t O,
                     uint32_t shift,
                     int8_t *__restrict__ pDstC);

/** @brief Parallel q8 matrix multiplication entry point. */
void plp_mat_mult_q8_parallel(const int8_t *__restrict__ pSrcA,
                              const int8_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t O,
                              uint32_t shift,
                              uint32_t nPE,
                              int8_t *__restrict__ pDstC);

/** @brief q8 matrix multiplication; the name suffix indicates the rv32im kernel variant. */
void plp_mat_mult_q8s_rv32im(const int8_t *__restrict__ pSrcA,
                             const int8_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             uint32_t O,
                             uint32_t shift,
                             int8_t *__restrict__ pDstC);

/** @brief q8 matrix multiplication; the name suffix indicates the xpulpv2 kernel variant. */
void plp_mat_mult_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                              const int8_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t O,
                              uint32_t shift,
                              int8_t *__restrict__ pDstC);

/** @brief Parallel q8 matrix multiplication kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_q8p_xpulpv2(void *args);

/*
 * Complex ("cmplx") int32_t matrix multiplication; int32_t inputs and output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); presumably
 *                 complex values stored as interleaved real/imaginary parts
 *                 -- TODO confirm layout
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO,
 *                 counted in complex elements -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief Complex int32_t matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_cmplx_i32(const int32_t *__restrict__ pSrcA,
                            const int32_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            int32_t *__restrict__ pDstC);

/** @brief Complex int32_t matrix multiplication; rv32im kernel variant by name. */
void plp_mat_mult_cmplx_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                                    const int32_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    int32_t *__restrict__ pDstC);

/** @brief Complex int32_t matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_cmplx_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     int32_t *__restrict__ pDstC);

/** @brief Parallel complex int32_t matrix multiplication entry point. */
void plp_mat_mult_cmplx_i32_parallel(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t nPE,
                                     int32_t *__restrict__ pDstC);

/** @brief Parallel complex int32_t kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_cmplx_i32p_xpulpv2(void *args);

/*
 * Complex ("cmplx") int16_t matrix multiplication; int16_t inputs, int32_t output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); presumably
 *                 complex values stored as interleaved real/imaginary parts
 *                 -- TODO confirm layout
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO,
 *                 counted in complex elements -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief Complex int16_t matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_cmplx_i16(const int16_t *__restrict__ pSrcA,
                            const int16_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            int32_t *__restrict__ pDstC);

/** @brief Complex int16_t matrix multiplication; rv32im kernel variant by name. */
void plp_mat_mult_cmplx_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                                    const int16_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    int32_t *__restrict__ pDstC);

/** @brief Complex int16_t matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_cmplx_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     int32_t *__restrict__ pDstC);

/** @brief Parallel complex int16_t matrix multiplication entry point. */
void plp_mat_mult_cmplx_i16_parallel(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t nPE,
                                     int32_t *__restrict__ pDstC);

/** @brief Parallel complex int16_t kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_cmplx_i16p_xpulpv2(void *args);

/*
 * Complex ("cmplx") int8_t matrix multiplication; int8_t inputs, int32_t output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); presumably
 *                 complex values stored as interleaved real/imaginary parts
 *                 -- TODO confirm layout
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO,
 *                 counted in complex elements -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief Complex int8_t matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_cmplx_i8(const int8_t *__restrict__ pSrcA,
                           const int8_t *__restrict__ pSrcB,
                           uint32_t M,
                           uint32_t N,
                           uint32_t O,
                           int32_t *__restrict__ pDstC);

/** @brief Complex int8_t matrix multiplication; rv32im kernel variant by name. */
void plp_mat_mult_cmplx_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                                   const int8_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   int32_t *__restrict__ pDstC);

/** @brief Complex int8_t matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_cmplx_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    int32_t *__restrict__ pDstC);

/** @brief Parallel complex int8_t matrix multiplication entry point. */
void plp_mat_mult_cmplx_i8_parallel(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t nPE,
                                    int32_t *__restrict__ pDstC);

/** @brief Parallel complex int8_t kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_cmplx_i8p_xpulpv2(void *args);

/*
 * Complex ("cmplx") float matrix multiplication; float inputs and output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); presumably
 *                 complex values stored as interleaved real/imaginary parts
 *                 -- TODO confirm layout
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO,
 *                 counted in complex elements -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief Complex float matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_cmplx_f32(const float *__restrict__ pSrcA,
                            const float *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            float *__restrict__ pDstC);

/** @brief Complex float matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_cmplx_f32s_xpulpv2(const float *__restrict__ pSrcA,
                                     const float *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     float *__restrict__ pDstC);

/** @brief Parallel complex float matrix multiplication entry point. */
void plp_mat_mult_cmplx_f32_parallel(const float *__restrict__ pSrcA,
                                     const float *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t nPE,
                                     float *__restrict__ pDstC);

/** @brief Parallel complex float kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_cmplx_f32p_xpulpv2(void *args);

/*
 * Complex ("cmplx") 32-bit fixed-point (q32) matrix multiplication; int32_t
 * inputs and output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); presumably
 *                 interleaved real/imaginary parts -- TODO confirm layout
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO
 *   shift         presumably the right-shift applied to each result for
 *                 fixed-point scaling -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief Complex q32 matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_cmplx_q32(const int32_t *__restrict__ pSrcA,
                            const int32_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            uint32_t shift,
                            int32_t *__restrict__ pDstC);

/** @brief Complex q32 matrix multiplication; rv32im kernel variant by name. */
void plp_mat_mult_cmplx_q32s_rv32im(const int32_t *__restrict__ pSrcA,
                                    const int32_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t shift,
                                    int32_t *__restrict__ pDstC);

/** @brief Complex q32 matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_cmplx_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t shift,
                                     int32_t *__restrict__ pDstC);

/** @brief Parallel complex q32 matrix multiplication entry point. */
void plp_mat_mult_cmplx_q32_parallel(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t shift,
                                     uint32_t nPE,
                                     int32_t *__restrict__ pDstC);

/** @brief Parallel complex q32 kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_cmplx_q32p_xpulpv2(void *args);

/*
 * Complex ("cmplx") 16-bit fixed-point (q16) matrix multiplication; int16_t
 * inputs and output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); presumably
 *                 interleaved real/imaginary parts -- TODO confirm layout
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO
 *   shift         presumably the right-shift applied to each result for
 *                 fixed-point scaling -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief Complex q16 matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_cmplx_q16(const int16_t *__restrict__ pSrcA,
                            const int16_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            uint32_t shift,
                            int16_t *__restrict__ pDstC);

/** @brief Complex q16 matrix multiplication; rv32im kernel variant by name. */
void plp_mat_mult_cmplx_q16s_rv32im(const int16_t *__restrict__ pSrcA,
                                    const int16_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t shift,
                                    int16_t *__restrict__ pDstC);

/** @brief Complex q16 matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_cmplx_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t shift,
                                     int16_t *__restrict__ pDstC);

/** @brief Parallel complex q16 matrix multiplication entry point. */
void plp_mat_mult_cmplx_q16_parallel(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t shift,
                                     uint32_t nPE,
                                     int16_t *__restrict__ pDstC);

/** @brief Parallel complex q16 kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_cmplx_q16p_xpulpv2(void *args);

/*
 * Complex ("cmplx") 8-bit fixed-point (q8) matrix multiplication; int8_t
 * inputs and output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); presumably
 *                 interleaved real/imaginary parts -- TODO confirm layout
 *   M, N, O       dimensions; presumably A is MxN, B is NxO and C is MxO
 *   shift         presumably the right-shift applied to each result for
 *                 fixed-point scaling -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief Complex q8 matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_cmplx_q8(const int8_t *__restrict__ pSrcA,
                           const int8_t *__restrict__ pSrcB,
                           uint32_t M,
                           uint32_t N,
                           uint32_t O,
                           uint32_t shift,
                           int8_t *__restrict__ pDstC);

/** @brief Complex q8 matrix multiplication; rv32im kernel variant by name. */
void plp_mat_mult_cmplx_q8s_rv32im(const int8_t *__restrict__ pSrcA,
                                   const int8_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t shift,
                                   int8_t *__restrict__ pDstC);

/** @brief Complex q8 matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_cmplx_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t shift,
                                    int8_t *__restrict__ pDstC);

/** @brief Parallel complex q8 matrix multiplication entry point. */
void plp_mat_mult_cmplx_q8_parallel(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t shift,
                                    uint32_t nPE,
                                    int8_t *__restrict__ pDstC);

/** @brief Parallel complex q8 kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_cmplx_q8p_xpulpv2(void *args);

/*
 * Integer "trans" matrix multiplication, non-parallel variants.
 * Inputs are int32_t / int16_t / int8_t; the output is int32_t in every
 * prototype of this group.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); "trans"
 *                 presumably means pSrcB is supplied already transposed
 *                 -- TODO confirm which operand and its storage shape
 *   M, N, O       dimensions; presumably A is MxN and C is MxO -- TODO confirm
 *   pDstC         writable output matrix (declared __restrict__)
 * Functions without an ISA suffix are presumably the entry points that select
 * one of the suffixed kernels -- TODO confirm.
 */
/** @brief int32_t "trans" matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_trans_i32(const int32_t *__restrict__ pSrcA,
                            const int32_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            int32_t *__restrict__ pDstC);

/** @brief int32_t "trans" matrix multiplication; rv32im kernel variant by name. */
void plp_mat_mult_trans_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                                    const int32_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    int32_t *__restrict__ pDstC);

/** @brief int32_t "trans" matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_trans_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     int32_t *__restrict__ pDstC);

/** @brief int16_t "trans" matrix multiplication (int32_t output), un-suffixed entry point. */
void plp_mat_mult_trans_i16(const int16_t *__restrict__ pSrcA,
                            const int16_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            int32_t *__restrict__ pDstC);

/** @brief int16_t "trans" matrix multiplication (int32_t output); rv32im kernel variant by name. */
void plp_mat_mult_trans_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                                    const int16_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    int32_t *__restrict__ pDstC);

/** @brief int16_t "trans" matrix multiplication (int32_t output); xpulpv2 kernel variant by name. */
void plp_mat_mult_trans_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     int32_t *__restrict__ pDstC);

/** @brief int8_t "trans" matrix multiplication (int32_t output), un-suffixed entry point. */
void plp_mat_mult_trans_i8(const int8_t *__restrict__ pSrcA,
                           const int8_t *__restrict__ pSrcB,
                           uint32_t M,
                           uint32_t N,
                           uint32_t O,
                           int32_t *__restrict__ pDstC);

/** @brief int8_t "trans" matrix multiplication (int32_t output); rv32im kernel variant by name. */
void plp_mat_mult_trans_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                                   const int8_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   int32_t *__restrict__ pDstC);

/** @brief int8_t "trans" matrix multiplication (int32_t output); xpulpv2 kernel variant by name. */
void plp_mat_mult_trans_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    int32_t *__restrict__ pDstC);

/*
 * Integer "trans" matrix multiplication, parallel variants (int32_t output
 * throughout).
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); "trans"
 *                 presumably means pSrcB is supplied already transposed
 *                 -- TODO confirm
 *   M, N, O       dimensions; presumably A is MxN and C is MxO -- TODO confirm
 *   nPE           presumably the number of processing elements (cores) to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief Parallel int32_t "trans" matrix multiplication entry point. */
void plp_mat_mult_trans_i32_parallel(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t nPE,
                                     int32_t *__restrict__ pDstC);

/** @brief Parallel int32_t "trans" kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_trans_i32p_xpulpv2(void *args);

/** @brief Parallel int16_t "trans" matrix multiplication entry point (int32_t output). */
void plp_mat_mult_trans_i16_parallel(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t nPE,
                                     int32_t *__restrict__ pDstC);

/** @brief Parallel int16_t "trans" kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_trans_i16p_xpulpv2(void *args);

/** @brief Parallel int8_t "trans" matrix multiplication entry point (int32_t output). */
void plp_mat_mult_trans_i8_parallel(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t nPE,
                                    int32_t *__restrict__ pDstC);

/** @brief Parallel int8_t "trans" kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_trans_i8p_xpulpv2(void *args);

/*
 * 32-bit fixed-point (q32) "trans" matrix multiplication; int32_t inputs and
 * output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); "trans"
 *                 presumably means pSrcB is supplied already transposed
 *                 -- TODO confirm
 *   M, N, O       dimensions; presumably A is MxN and C is MxO -- TODO confirm
 *   shift         presumably the right-shift applied to each result for
 *                 fixed-point scaling -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief q32 "trans" matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_trans_q32(const int32_t *__restrict__ pSrcA,
                            const int32_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            uint32_t shift,
                            int32_t *__restrict__ pDstC);

/** @brief Parallel q32 "trans" matrix multiplication entry point. */
void plp_mat_mult_trans_q32_parallel(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t shift,
                                     uint32_t nPE,
                                     int32_t *__restrict__ pDstC);

/** @brief q32 "trans" matrix multiplication; rv32im kernel variant by name. */
void plp_mat_mult_trans_q32s_rv32im(const int32_t *__restrict__ pSrcA,
                                    const int32_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t shift,
                                    int32_t *__restrict__ pDstC);

/** @brief q32 "trans" matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_trans_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t shift,
                                     int32_t *__restrict__ pDstC);

/** @brief Parallel q32 "trans" kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_trans_q32p_xpulpv2(void *args);

/*
 * 16-bit fixed-point (q16) "trans" matrix multiplication; int16_t inputs and
 * output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); "trans"
 *                 presumably means pSrcB is supplied already transposed
 *                 -- TODO confirm
 *   M, N, O       dimensions; presumably A is MxN and C is MxO -- TODO confirm
 *   shift         presumably the right-shift applied to each result for
 *                 fixed-point scaling -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief q16 "trans" matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_trans_q16(const int16_t *__restrict__ pSrcA,
                            const int16_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            uint32_t shift,
                            int16_t *__restrict__ pDstC);

/** @brief Parallel q16 "trans" matrix multiplication entry point. */
void plp_mat_mult_trans_q16_parallel(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t shift,
                                     uint32_t nPE,
                                     int16_t *__restrict__ pDstC);

/** @brief q16 "trans" matrix multiplication; rv32im kernel variant by name. */
void plp_mat_mult_trans_q16s_rv32im(const int16_t *__restrict__ pSrcA,
                                    const int16_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t shift,
                                    int16_t *__restrict__ pDstC);

/** @brief q16 "trans" matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_trans_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t shift,
                                     int16_t *__restrict__ pDstC);

/** @brief Parallel q16 "trans" kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_trans_q16p_xpulpv2(void *args);

/*
 * 8-bit fixed-point (q8) "trans" matrix multiplication; int8_t inputs and
 * output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); "trans"
 *                 presumably means pSrcB is supplied already transposed
 *                 -- TODO confirm
 *   M, N, O       dimensions; presumably A is MxN and C is MxO -- TODO confirm
 *   shift         presumably the right-shift applied to each result for
 *                 fixed-point scaling -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief q8 "trans" matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_trans_q8(const int8_t *__restrict__ pSrcA,
                           const int8_t *__restrict__ pSrcB,
                           uint32_t M,
                           uint32_t N,
                           uint32_t O,
                           uint32_t shift,
                           int8_t *__restrict__ pDstC);

/** @brief Parallel q8 "trans" matrix multiplication entry point. */
void plp_mat_mult_trans_q8_parallel(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t shift,
                                    uint32_t nPE,
                                    int8_t *__restrict__ pDstC);

/** @brief q8 "trans" matrix multiplication; rv32im kernel variant by name. */
void plp_mat_mult_trans_q8s_rv32im(const int8_t *__restrict__ pSrcA,
                                   const int8_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t shift,
                                   int8_t *__restrict__ pDstC);

/** @brief q8 "trans" matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_trans_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t shift,
                                    int8_t *__restrict__ pDstC);

/** @brief Parallel q8 "trans" kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_trans_q8p_xpulpv2(void *args);

/*
 * Single-precision float "trans" matrix multiplication; float inputs and output.
 * Only prototypes are visible here, so the roles below are inferred from names
 * and types -- NOTE(review): confirm against the implementations.
 *   pSrcA, pSrcB  read-only input matrices (declared __restrict__); "trans"
 *                 presumably means pSrcB is supplied already transposed
 *                 -- TODO confirm
 *   M, N, O       dimensions; presumably A is MxN and C is MxO -- TODO confirm
 *   nPE           (parallel variant) presumably the number of cores to use
 *   pDstC         writable output matrix (declared __restrict__)
 *   args          opaque pointer; presumably an instance structure -- TODO confirm
 */
/** @brief float "trans" matrix multiplication, un-suffixed entry point. */
void plp_mat_mult_trans_f32(const float *__restrict__ pSrcA,
                            const float *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            float *__restrict__ pDstC);

/** @brief float "trans" matrix multiplication; xpulpv2 kernel variant by name. */
void plp_mat_mult_trans_f32s_xpulpv2(const float *__restrict__ pSrcA,
                                     const float *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     float *__restrict__ pDstC);

/** @brief Parallel float "trans" matrix multiplication entry point. */
void plp_mat_mult_trans_f32_parallel(const float *__restrict__ pSrcA,
                                     const float *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t nPE,
                                     float *__restrict__ pDstC);

/** @brief Parallel float "trans" kernel (xpulpv2 variant), opaque argument pointer. */
void plp_mat_mult_trans_f32p_xpulpv2(void *args);

/*
 * 32-bit integer complex matrix multiplication with a transposed operand.
 * Only declarations are visible here; semantics beyond the types are
 * assumptions to confirm against the implementation.
 *
 * pSrcA, pSrcB  read-only inputs; __restrict__ means callers must not pass
 *               overlapping buffers. NOTE(review): complex values are
 *               presumably stored as interleaved (re, im) pairs -- confirm.
 * M, N, O       matrix dimensions -- presumably counted in complex elements,
 *               with A MxN, B stored transposed (OxN), C MxO; TODO confirm.
 * pDstC         output buffer.
 */
void plp_mat_mult_trans_cmplx_i32(const int32_t *__restrict__ pSrcA,
                                  const int32_t *__restrict__ pSrcB,
                                  uint32_t M,
                                  uint32_t N,
                                  uint32_t O,
                                  int32_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_i32.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_mat_mult_trans_cmplx_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                                          const int32_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          int32_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_i32.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_mat_mult_trans_cmplx_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                           const int32_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           int32_t *__restrict__ pDstC);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_mat_mult_trans_cmplx_i32_parallel(const int32_t *__restrict__ pSrcA,
                                           const int32_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t nPE,
                                           int32_t *__restrict__ pDstC);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a pointer to a project instance struct -- confirm which one. */
void plp_mat_mult_trans_cmplx_i32p_xpulpv2(void *args);

/*
 * 16-bit integer complex matrix multiplication with a transposed operand.
 * Inputs are int16_t while the output buffer is int32_t, i.e. results are
 * stored wider than the operands. Only declarations are visible here;
 * semantics beyond the types are assumptions to confirm.
 *
 * pSrcA, pSrcB  read-only inputs; __restrict__ means callers must not pass
 *               overlapping buffers. NOTE(review): complex values are
 *               presumably stored as interleaved (re, im) pairs -- confirm.
 * M, N, O       matrix dimensions -- presumably counted in complex elements,
 *               with A MxN, B stored transposed (OxN), C MxO; TODO confirm.
 * pDstC         output buffer (int32_t).
 */
void plp_mat_mult_trans_cmplx_i16(const int16_t *__restrict__ pSrcA,
                                  const int16_t *__restrict__ pSrcB,
                                  uint32_t M,
                                  uint32_t N,
                                  uint32_t O,
                                  int32_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_i16.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_mat_mult_trans_cmplx_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                                          const int16_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          int32_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_i16.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_mat_mult_trans_cmplx_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                           const int16_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           int32_t *__restrict__ pDstC);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_mat_mult_trans_cmplx_i16_parallel(const int16_t *__restrict__ pSrcA,
                                           const int16_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t nPE,
                                           int32_t *__restrict__ pDstC);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a pointer to a project instance struct -- confirm which one. */
void plp_mat_mult_trans_cmplx_i16p_xpulpv2(void *args);

/*
 * 8-bit integer complex matrix multiplication with a transposed operand.
 * Inputs are int8_t while the output buffer is int32_t, i.e. results are
 * stored wider than the operands. Only declarations are visible here;
 * semantics beyond the types are assumptions to confirm.
 *
 * pSrcA, pSrcB  read-only inputs; __restrict__ means callers must not pass
 *               overlapping buffers. NOTE(review): complex values are
 *               presumably stored as interleaved (re, im) pairs -- confirm.
 * M, N, O       matrix dimensions -- presumably counted in complex elements,
 *               with A MxN, B stored transposed (OxN), C MxO; TODO confirm.
 * pDstC         output buffer (int32_t).
 */
void plp_mat_mult_trans_cmplx_i8(const int8_t *__restrict__ pSrcA,
                                 const int8_t *__restrict__ pSrcB,
                                 uint32_t M,
                                 uint32_t N,
                                 uint32_t O,
                                 int32_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_i8.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_mat_mult_trans_cmplx_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                                         const int8_t *__restrict__ pSrcB,
                                         uint32_t M,
                                         uint32_t N,
                                         uint32_t O,
                                         int32_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_i8.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_mat_mult_trans_cmplx_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                          const int8_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          int32_t *__restrict__ pDstC);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_mat_mult_trans_cmplx_i8_parallel(const int8_t *__restrict__ pSrcA,
                                          const int8_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          uint32_t nPE,
                                          int32_t *__restrict__ pDstC);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a pointer to a project instance struct -- confirm which one. */
void plp_mat_mult_trans_cmplx_i8p_xpulpv2(void *args);

/*
 * Single-precision float complex matrix multiplication with a transposed
 * operand. Only declarations are visible here; semantics beyond the types
 * are assumptions to confirm against the implementation.
 *
 * pSrcA, pSrcB  read-only inputs; __restrict__ means callers must not pass
 *               overlapping buffers. NOTE(review): complex values are
 *               presumably stored as interleaved (re, im) pairs -- confirm.
 * M, N, O       matrix dimensions -- presumably counted in complex elements,
 *               with A MxN, B stored transposed (OxN), C MxO; TODO confirm.
 * pDstC         output buffer.
 */
void plp_mat_mult_trans_cmplx_f32(const float *__restrict__ pSrcA,
                                  const float *__restrict__ pSrcB,
                                  uint32_t M,
                                  uint32_t N,
                                  uint32_t O,
                                  float *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_f32.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_mat_mult_trans_cmplx_f32s_xpulpv2(const float *__restrict__ pSrcA,
                                           const float *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           float *__restrict__ pDstC);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_mat_mult_trans_cmplx_f32_parallel(const float *__restrict__ pSrcA,
                                           const float *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t nPE,
                                           float *__restrict__ pDstC);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a pointer to a project instance struct -- confirm which one. */
void plp_mat_mult_trans_cmplx_f32p_xpulpv2(void *args);

/*
 * q32 (32-bit fixed-point) complex matrix multiplication with a transposed
 * operand. Only declarations are visible here; semantics beyond the types
 * are assumptions to confirm against the implementation.
 *
 * pSrcA, pSrcB  read-only inputs; __restrict__ means callers must not pass
 *               overlapping buffers. NOTE(review): complex values are
 *               presumably stored as interleaved (re, im) pairs -- confirm.
 * M, N, O       matrix dimensions -- presumably counted in complex elements,
 *               with A MxN, B stored transposed (OxN), C MxO; TODO confirm.
 * shift         presumably the right shift used to rescale results back to
 *               the fixed-point format; TODO confirm.
 * pDstC         output buffer.
 */
void plp_mat_mult_trans_cmplx_q32(const int32_t *__restrict__ pSrcA,
                                  const int32_t *__restrict__ pSrcB,
                                  uint32_t M,
                                  uint32_t N,
                                  uint32_t O,
                                  uint32_t shift,
                                  int32_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_q32.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_mat_mult_trans_cmplx_q32s_rv32im(const int32_t *__restrict__ pSrcA,
                                          const int32_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          uint32_t shift,
                                          int32_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_q32.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_mat_mult_trans_cmplx_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                           const int32_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t shift,
                                           int32_t *__restrict__ pDstC);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_mat_mult_trans_cmplx_q32_parallel(const int32_t *__restrict__ pSrcA,
                                           const int32_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t shift,
                                           uint32_t nPE,
                                           int32_t *__restrict__ pDstC);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a pointer to a project instance struct -- confirm which one. */
void plp_mat_mult_trans_cmplx_q32p_xpulpv2(void *args);

/*
 * q16 (16-bit fixed-point) complex matrix multiplication with a transposed
 * operand. Input and output buffers are both int16_t. Only declarations are
 * visible here; semantics beyond the types are assumptions to confirm.
 *
 * pSrcA, pSrcB  read-only inputs; __restrict__ means callers must not pass
 *               overlapping buffers. NOTE(review): complex values are
 *               presumably stored as interleaved (re, im) pairs -- confirm.
 * M, N, O       matrix dimensions -- presumably counted in complex elements,
 *               with A MxN, B stored transposed (OxN), C MxO; TODO confirm.
 * shift         presumably the right shift used to rescale results back to
 *               the fixed-point format; TODO confirm.
 * pDstC         output buffer.
 */
void plp_mat_mult_trans_cmplx_q16(const int16_t *__restrict__ pSrcA,
                                  const int16_t *__restrict__ pSrcB,
                                  uint32_t M,
                                  uint32_t N,
                                  uint32_t O,
                                  uint32_t shift,
                                  int16_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_q16.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_mat_mult_trans_cmplx_q16s_rv32im(const int16_t *__restrict__ pSrcA,
                                          const int16_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          uint32_t shift,
                                          int16_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_q16.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_mat_mult_trans_cmplx_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                           const int16_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t shift,
                                           int16_t *__restrict__ pDstC);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_mat_mult_trans_cmplx_q16_parallel(const int16_t *__restrict__ pSrcA,
                                           const int16_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t shift,
                                           uint32_t nPE,
                                           int16_t *__restrict__ pDstC);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a pointer to a project instance struct -- confirm which one. */
void plp_mat_mult_trans_cmplx_q16p_xpulpv2(void *args);

/*
 * q8 (8-bit fixed-point) complex matrix multiplication with a transposed
 * operand. Input and output buffers are both int8_t. Only declarations are
 * visible here; semantics beyond the types are assumptions to confirm.
 *
 * pSrcA, pSrcB  read-only inputs; __restrict__ means callers must not pass
 *               overlapping buffers. NOTE(review): complex values are
 *               presumably stored as interleaved (re, im) pairs -- confirm.
 * M, N, O       matrix dimensions -- presumably counted in complex elements,
 *               with A MxN, B stored transposed (OxN), C MxO; TODO confirm.
 * shift         presumably the right shift used to rescale results back to
 *               the fixed-point format; TODO confirm.
 * pDstC         output buffer.
 */
void plp_mat_mult_trans_cmplx_q8(const int8_t *__restrict__ pSrcA,
                                 const int8_t *__restrict__ pSrcB,
                                 uint32_t M,
                                 uint32_t N,
                                 uint32_t O,
                                 uint32_t shift,
                                 int8_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_q8.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_mat_mult_trans_cmplx_q8s_rv32im(const int8_t *__restrict__ pSrcA,
                                         const int8_t *__restrict__ pSrcB,
                                         uint32_t M,
                                         uint32_t N,
                                         uint32_t O,
                                         uint32_t shift,
                                         int8_t *__restrict__ pDstC);

/* Same parameter list as plp_mat_mult_trans_cmplx_q8.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_mat_mult_trans_cmplx_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                          const int8_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          uint32_t shift,
                                          int8_t *__restrict__ pDstC);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_mat_mult_trans_cmplx_q8_parallel(const int8_t *__restrict__ pSrcA,
                                          const int8_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          uint32_t shift,
                                          uint32_t nPE,
                                          int8_t *__restrict__ pDstC);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a pointer to a project instance struct -- confirm which one. */
void plp_mat_mult_trans_cmplx_q8p_xpulpv2(void *args);

/*
 * Complex magnitude, float32.
 *
 * pSrc        read-only input. NOTE(review): presumably interleaved
 *             (re, im) pairs -- confirm.
 * pRes        output buffer.
 * numSamples  element count. NOTE(review): presumably the number of complex
 *             samples (so pSrc would hold 2*numSamples floats) -- confirm.
 */
void plp_cmplx_mag_f32(const float32_t *pSrc,
                       float32_t *pRes,
                       uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_f32.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_cmplx_mag_f32s_xpulpv2(const float32_t *pSrc,
                                float32_t *pRes,
                                uint32_t numSamples);

/*
 * Complex magnitude, q32 (32-bit fixed-point).
 *
 * pSrc        read-only input. NOTE(review): presumably interleaved
 *             (re, im) pairs -- confirm.
 * fracBits    NOTE(review): presumably the number of fractional bits of the
 *             fixed-point format -- confirm.
 * pRes        output buffer.
 * numSamples  element count. NOTE(review): presumably the number of complex
 *             samples -- confirm.
 */
void plp_cmplx_mag_q32(const int32_t *pSrc,
                       const uint32_t fracBits,
                       int32_t *pRes,
                       uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_q32.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_cmplx_mag_q32s_rv32im(const int32_t *pSrc,
                               const uint32_t fracBits,
                               int32_t *pRes,
                               uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_q32.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_cmplx_mag_q32s_xpulpv2(const int32_t *pSrc,
                                const uint32_t fracBits,
                                int32_t *pRes,
                                uint32_t numSamples);

/*
 * Complex magnitude, q8 (8-bit fixed-point).
 *
 * pSrc        read-only input. NOTE(review): presumably interleaved
 *             (re, im) pairs -- confirm.
 * fracBits    NOTE(review): presumably the number of fractional bits of the
 *             fixed-point format -- confirm.
 * pRes        output buffer.
 * numSamples  element count. NOTE(review): presumably the number of complex
 *             samples -- confirm.
 */
void plp_cmplx_mag_q8(const int8_t *pSrc,
                       const uint32_t fracBits,
                       int8_t *pRes,
                       uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_q8.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_cmplx_mag_q8s_rv32im(const int8_t *pSrc,
                               const uint32_t fracBits,
                               int8_t *pRes,
                               uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_q8.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_cmplx_mag_q8s_xpulpv2(const int8_t *pSrc,
                                const uint32_t fracBits,
                                int8_t *pRes,
                                uint32_t numSamples);

/*
 * Complex magnitude, 16-bit integer.
 *
 * pSrc        read-only input. NOTE(review): presumably interleaved
 *             (re, im) pairs -- confirm.
 * pRes        output buffer.
 * numSamples  element count. NOTE(review): presumably the number of complex
 *             samples -- confirm.
 */
void plp_cmplx_mag_i16(const int16_t *pSrc,
                       int16_t *pRes,
                       uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_i16.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_cmplx_mag_i16s_rv32im(const int16_t *pSrc,
                               int16_t *pRes,
                               uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_i16.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_cmplx_mag_i16s_xpulpv2(const int16_t *pSrc,
                                int16_t *pRes,
                                uint32_t numSamples);

/*
 * Complex magnitude, 32-bit integer.
 *
 * pSrc        read-only input. NOTE(review): presumably interleaved
 *             (re, im) pairs -- confirm.
 * pRes        output buffer.
 * numSamples  element count. NOTE(review): presumably the number of complex
 *             samples -- confirm.
 */
void plp_cmplx_mag_i32(const int32_t *pSrc,
                       int32_t *pRes,
                       uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_i32.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_cmplx_mag_i32s_rv32im(const int32_t *pSrc,
                               int32_t *pRes,
                               uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_i32.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_cmplx_mag_i32s_xpulpv2(const int32_t *pSrc,
                                int32_t *pRes,
                                uint32_t numSamples);

/*
 * Complex magnitude, 8-bit integer.
 *
 * pSrc        read-only input. NOTE(review): presumably interleaved
 *             (re, im) pairs -- confirm.
 * pRes        output buffer.
 * numSamples  element count. NOTE(review): presumably the number of complex
 *             samples -- confirm.
 */
void plp_cmplx_mag_i8(const int8_t *pSrc,
                      int8_t *pRes,
                      uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_i8.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_cmplx_mag_i8s_rv32im(const int8_t *pSrc,
                              int8_t *pRes,
                              uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_i8.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_cmplx_mag_i8s_xpulpv2(const int8_t *pSrc,
                               int8_t *pRes,
                               uint32_t numSamples);

/*
 * Complex magnitude, q16 (16-bit fixed-point).
 *
 * pSrc        read-only input. NOTE(review): presumably interleaved
 *             (re, im) pairs -- confirm.
 * fracBits    NOTE(review): presumably the number of fractional bits of the
 *             fixed-point format -- confirm.
 * pRes        output buffer.
 * numSamples  element count. NOTE(review): presumably the number of complex
 *             samples -- confirm.
 */
void plp_cmplx_mag_q16(const int16_t *pSrc,
                       const uint32_t fracBits,
                       int16_t *pRes,
                       uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_q16.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_cmplx_mag_q16s_rv32im(const int16_t *pSrc,
                               const uint32_t fracBits,
                               int16_t *pRes,
                               uint32_t numSamples);

/* Same parameter list as plp_cmplx_mag_q16.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_cmplx_mag_q16s_xpulpv2(const int16_t *pSrc,
                                const uint32_t fracBits,
                                int16_t *pRes,
                                uint32_t numSamples);

/*
 * Bit-reversal helpers operating on 16-bit elements.
 *
 * pSrc        non-const data buffer. NOTE(review): presumably reordered in
 *             place -- confirm.
 * bitRevLen   NOTE(review): presumably the number of entries in pBitRevTab
 *             -- confirm.
 * pBitRevTab  read-only bit-reversal table.
 */
void plp_bitreversal_16s_rv32im(uint16_t *pSrc,
                                const uint16_t bitRevLen,
                                const uint16_t *pBitRevTab);

/* Same parameter list as the rv32im variant above. */
void plp_bitreversal_16s_xpulpv2(uint16_t *pSrc,
                                 const uint16_t bitRevLen,
                                 const uint16_t *pBitRevTab);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_bitreversal_16p_xpulpv2(uint16_t *pSrc,
                                 const uint16_t bitRevLen,
                                 const uint16_t *pBitRevTab,
                                 uint32_t nPE);

/*
 * Complex FFT on q16 (16-bit fixed-point) data.
 *
 * S               read-only plp_cfft_instance_q16 (instance structure for the
 *                 fixed-point CFFT/CIFFT function).
 * p1              non-const data buffer. NOTE(review): presumably transformed
 *                 in place -- confirm.
 * ifftFlag        NOTE(review): presumably selects forward vs inverse
 *                 transform -- confirm the encoding.
 * bitReverseFlag  NOTE(review): presumably enables output bit reversal --
 *                 confirm.
 * deciPoint       NOTE(review): presumably the fixed-point decimal-point
 *                 position of the data -- confirm.
 */
void plp_cfft_q16(const plp_cfft_instance_q16 *S,
                  int16_t *p1,
                  uint8_t ifftFlag,
                  uint8_t bitReverseFlag,
                  uint32_t deciPoint);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_cfft_q16_parallel(const plp_cfft_instance_q16 *S,
                           int16_t *p1,
                           uint8_t ifftFlag,
                           uint8_t bitReverseFlag,
                           uint32_t deciPoint,
                           uint32_t nPE);

/* Same parameter list as plp_cfft_q16.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_cfft_q16s_rv32im(const plp_cfft_instance_q16 *S,
                          int16_t *p1,
                          uint8_t ifftFlag,
                          uint8_t bitReverseFlag,
                          uint32_t deciPoint);

/* Same parameter list as plp_cfft_q16.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_cfft_q16s_xpulpv2(const plp_cfft_instance_q16 *S,
                           int16_t *p1,
                           uint8_t ifftFlag,
                           uint8_t bitReverseFlag,
                           uint32_t deciPoint);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a plp_cfft_instance_q16_parallel -- confirm. */
void plp_cfft_q16p_xpulpv2(void *args);

/*
 * Bit-reversal helpers operating on 32-bit elements. The table and its
 * length are still 16-bit typed (uint16_t) even though the data is uint32_t.
 * NOTE(review): pSrc is non-const, so it is presumably reordered in place,
 * and bitRevLen is presumably the number of entries in pBitRevTab -- confirm.
 */
void plp_bitreversal_32s_rv32im(uint32_t *pSrc, const uint16_t bitRevLen, const uint16_t *pBitRevTab);

/* Same parameter list as the rv32im variant above. */
void plp_bitreversal_32s_xpulpv2(uint32_t *pSrc, const uint16_t bitRevLen, const uint16_t *pBitRevTab);

/* Adds nPE. NOTE(review): presumably the number of cores/processing elements to use -- confirm. */
void plp_bitreversal_32p_xpulpv2(uint32_t *pSrc, const uint16_t bitRevLen, const uint16_t *pBitRevTab, uint32_t nPE);

/*
 * Complex FFT on q32 (32-bit fixed-point) data.
 *
 * S               read-only plp_cfft_instance_q32 (instance structure for the
 *                 fixed-point CFFT/CIFFT function).
 * p1              non-const data buffer. NOTE(review): presumably transformed
 *                 in place -- confirm.
 * ifftFlag        NOTE(review): presumably selects forward vs inverse
 *                 transform -- confirm the encoding.
 * bitReverseFlag  NOTE(review): presumably enables output bit reversal --
 *                 confirm.
 * fracBits        NOTE(review): presumably the number of fractional bits of
 *                 the fixed-point format -- confirm.
 */
void plp_cfft_q32(const plp_cfft_instance_q32 *S,
                      int32_t *p1,
                      uint8_t ifftFlag,
                      uint8_t bitReverseFlag,
                      uint32_t fracBits);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_cfft_q32_parallel( const plp_cfft_instance_q32 *S,
                            int32_t *p1,
                            uint8_t ifftFlag,
                            uint8_t bitReverseFlag,
                            uint32_t fracBits,
                            uint32_t nPE );

/* Same parameter list as plp_cfft_q32.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_cfft_q32s_rv32im(const plp_cfft_instance_q32 *S,
                      int32_t *p1,
                      uint8_t ifftFlag,
                      uint8_t bitReverseFlag,
                      uint32_t fracBits);

/* Same parameter list as plp_cfft_q32.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_cfft_q32s_xpulpv2(const plp_cfft_instance_q32 *S,
                      int32_t *p1,
                      uint8_t ifftFlag,
                      uint8_t bitReverseFlag,
                      uint32_t fracBits);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a plp_cfft_instance_q32_parallel -- confirm. */
void plp_cfft_q32p_xpulpv2(void *args);

/*
 * Real-input FFT, float32.
 *
 * S     read-only plp_fft_instance_f32 (instance structure for
 *       floating-point FFT).
 * pSrc  read-only input; must not overlap pDst (__restrict__).
 * pDst  output buffer. NOTE(review): required length and output layout are
 *       not visible from the declaration -- confirm.
 */
void plp_rfft_f32(const plp_fft_instance_f32 *S,
                  const float32_t *__restrict__ pSrc,
                  float32_t *__restrict__ pDst);

/* Adds nPE (placed before pDst).
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_rfft_f32_parallel(const plp_fft_instance_f32 *S,
                           const float32_t *__restrict__ pSrc,
                           const uint32_t nPE,
                           float32_t *__restrict__ pDst);

/* Same parameter list as plp_rfft_f32.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_rfft_f32s_xpulpv2(const plp_fft_instance_f32 *S,
                          const float32_t *__restrict__ pSrc,
                          float32_t *__restrict__ pDst);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a plp_fft_instance_f32_parallel -- confirm. */
void plp_rfft_f32p_xpulpv2(void *arg);





/*
 * "Fast" real-input FFT, float32.
 *
 * S     read-only plp_fft_fast_instance_f32 (instance structure for
 *       floating-point FFT).
 * pSrc  input buffer.
 * pDst  output buffer.
 *
 * NOTE(review): the three typed variants below disagree on pSrc. Here it is
 * `const ... __restrict__`, in plp_rfftfast_f32_parallel it is non-const
 * `__restrict__`, and in plp_rfftfast_f32s_xpulpv2 it is non-const without
 * `__restrict__`. If the kernels modify pSrc, the const on this entry point
 * is misleading; if they do not, the others should be const -- confirm
 * against the implementation.
 */
void plp_rfftfast_f32( const plp_fft_fast_instance_f32 *S,
                        const float32_t *__restrict__ pSrc,
                        float32_t *__restrict__ pDst);

/* Adds nPE as the last parameter.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_rfftfast_f32_parallel( const plp_fft_fast_instance_f32 *S,
                                 float32_t *__restrict__ pSrc,
                                 float32_t *__restrict__ pDst,
                                 const uint32_t nPE);

/* NOTE(review): presumably the single-core XpulpV2 kernel; pSrc/pDst carry
 * no __restrict__ here -- confirm. */
void plp_rfftfast_f32s_xpulpv2( const plp_fft_fast_instance_f32 *S,
                                 float32_t *pSrc,
                                 float32_t *pDst);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a plp_fft_fast_instance_f32_parallel -- confirm. */
void plp_rfftfast_f32p_xpulpv2( void *arg );

/*
 * Complex FFT, float32.
 *
 * S               read-only plp_cfft_instance_f32 (instance structure for the
 *                 floating-point CFFT/CIFFT function).
 * pSrc            data buffer; there is no separate output parameter.
 *                 NOTE(review): the transform is therefore presumably done
 *                 in place -- confirm.
 * ifftFlag        NOTE(review): presumably selects forward vs inverse
 *                 transform -- confirm the encoding.
 * bitReverseFlag  NOTE(review): presumably enables output bit reversal --
 *                 confirm.
 *
 * NOTE(review): pSrc is non-const here but `const float32_t *` in
 * plp_cfft_f32_parallel and plp_cfft_f32s_xpulpv2. With no output pointer in
 * any of them, the const qualifiers on the latter two look suspicious --
 * confirm against the implementation.
 */
void plp_cfft_f32(  const plp_cfft_instance_f32 *S,
                    float32_t *pSrc,
                    uint8_t ifftFlag,
                    uint8_t bitReverseFlag);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_cfft_f32_parallel( const plp_cfft_instance_f32 *S,
                            const float32_t *pSrc,
                            uint8_t ifftFlag,
                            uint8_t bitReverseFlag,
                            const uint32_t nPE);

/* NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_cfft_f32s_xpulpv2( const plp_cfft_instance_f32 *S,
                            const float32_t *pSrc,
                            uint8_t ifftFlag,
                            uint8_t bitReverseFlag);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a plp_cfft_instance_f32_parallel -- confirm. */
void plp_cfft_f32p_xpulpv2(void *arg);

/*
 * DCT (type II, going by the name), float32.
 *
 * S          read-only plp_fft_instance_f32 (instance structure for
 *            floating-point FFT). NOTE(review): the DCT is presumably
 *            computed via an FFT -- confirm.
 * pShift     read-only array of Complex_type_f32 (complex values with
 *            float32 components). NOTE(review): presumably precomputed
 *            phase-shift factors -- confirm.
 * orthoNorm  NOTE(review): presumably a flag selecting orthonormal scaling
 *            -- confirm.
 * pSrc       read-only input; must not overlap pBuf/pDst (__restrict__).
 * pBuf       caller-provided writable buffer. NOTE(review): presumably
 *            scratch space; required size not visible here -- confirm.
 * pDst       output buffer.
 */
void plp_dct2_f32(const plp_fft_instance_f32 *S,
                  const Complex_type_f32 *pShift,
                  const uint8_t orthoNorm,
                  const float32_t *__restrict__ pSrc,
                  float32_t *__restrict__ pBuf,
                  float32_t *__restrict__ pDst);

/*
 * Parallel variant of the float32 DCT (type II, going by the name).
 *
 * S          read-only plp_fft_instance_f32 (instance structure for
 *            floating-point FFT).
 * pShift     read-only array of Complex_type_f32 (complex values with
 *            float32 components). NOTE(review): presumably precomputed
 *            phase-shift factors -- confirm.
 * orthoNorm  NOTE(review): presumably a flag selecting orthonormal scaling
 *            -- confirm.
 * pSrc       read-only input; must not overlap pBuf/pDst (__restrict__).
 * nPE        NOTE(review): presumably the number of cores/processing
 *            elements to use -- confirm.
 * pBuf       caller-provided writable buffer. NOTE(review): presumably
 *            scratch space; required size not visible here -- confirm.
 * pDst       output buffer.
 */
void plp_dct2_f32_parallel(const plp_fft_instance_f32 *S,
                           const Complex_type_f32 *pShift,
                           const uint8_t orthoNorm,
                           const float32_t *__restrict__ pSrc,
                           const uint32_t nPE,
                           float32_t *__restrict__ pBuf,
                           float32_t *__restrict__ pDst);

/*
 * MFCC computation, float32 (going by the name: mel-frequency cepstral
 * coefficients).
 *
 * SFFT        read-only plp_fft_instance_f32. NOTE(review): presumably the
 *             FFT instance for the spectrum stage -- confirm.
 * SDCT        read-only plp_fft_instance_f32. NOTE(review): presumably the
 *             FFT instance used by the DCT stage -- confirm.
 * pShift      read-only array of Complex_type_f32 (complex values with
 *             float32 components).
 * filterBank  read-only plp_triangular_filter_f32 (non-zero values of the
 *             triangular filterbanks).
 * window      read-only float32 array. NOTE(review): presumably window
 *             coefficients applied to the input frame -- confirm.
 * orthoNorm   passed by pointer (const uint8_t *). NOTE(review): presumably
 *             an orthonormal-scaling flag -- confirm why it is a pointer.
 * pSrc        read-only input; must not overlap pDst (__restrict__).
 * pDst        output buffer.
 */
void plp_mfcc_f32(const plp_fft_instance_f32 *SFFT,
                  const plp_fft_instance_f32 *SDCT,
                  const Complex_type_f32 *pShift,
                  const plp_triangular_filter_f32 *filterBank,
                  const float32_t *window,
                  const uint8_t *orthoNorm,
                  const float32_t *__restrict__ pSrc,
                  float32_t *__restrict__ pDst);

/*
 * Parallel variant of the float32 MFCC computation (going by the name:
 * mel-frequency cepstral coefficients).
 *
 * SFFT        read-only plp_fft_instance_f32. NOTE(review): presumably the
 *             FFT instance for the spectrum stage -- confirm.
 * SDCT        read-only plp_fft_instance_f32. NOTE(review): presumably the
 *             FFT instance used by the DCT stage -- confirm.
 * pShift      read-only array of Complex_type_f32 (complex values with
 *             float32 components).
 * filterBank  read-only plp_triangular_filter_f32 (non-zero values of the
 *             triangular filterbanks).
 * window      read-only float32 array. NOTE(review): presumably window
 *             coefficients applied to the input frame -- confirm.
 * orthoNorm   passed by pointer (const uint8_t *). NOTE(review): presumably
 *             an orthonormal-scaling flag -- confirm why it is a pointer.
 * pSrc        read-only input; must not overlap pDst (__restrict__).
 * nPE         NOTE(review): presumably the number of cores/processing
 *             elements to use -- confirm.
 * pDst        output buffer.
 */
void plp_mfcc_f32_parallel(const plp_fft_instance_f32 *SFFT,
                           const plp_fft_instance_f32 *SDCT,
                           const Complex_type_f32 *pShift,
                           const plp_triangular_filter_f32 *filterBank,
                           const float32_t *window,
                           const uint8_t *orthoNorm,
                           const float32_t *__restrict__ pSrc,
                           const uint32_t nPE,
                           float32_t *__restrict__ pDst);

/*
 * Single-level DWT (discrete wavelet transform, going by the name) entry
 * points, one per data type. All four share the same parameter shape:
 *
 * pSrc     read-only input signal; must not overlap the outputs
 *          (__restrict__).
 * length   NOTE(review): presumably the number of input samples -- confirm.
 * wavelet  wavelet descriptor, passed by value (type matches the data type).
 * mode     plp_dwt_extension_mode. NOTE(review): presumably selects how the
 *          signal is extended at its borders -- confirm.
 * pDstA    output buffer. NOTE(review): presumably approximation
 *          coefficients -- confirm.
 * pDstD    output buffer. NOTE(review): presumably detail coefficients --
 *          confirm.
 */
void plp_dwt_f32(const float32_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_f32 wavelet,
                  plp_dwt_extension_mode mode,
                  float32_t *__restrict__ pDstA,
                  float32_t *__restrict__ pDstD);


/* q32 (32-bit fixed-point) variant. */
void plp_dwt_q32(const int32_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_q32 wavelet,
                  plp_dwt_extension_mode mode,
                  int32_t *__restrict__ pDstA,
                  int32_t *__restrict__ pDstD);


/* q16 (16-bit fixed-point) variant. */
void plp_dwt_q16(const int16_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_q16 wavelet,
                  plp_dwt_extension_mode mode,
                  int16_t *__restrict__ pDstA,
                  int16_t *__restrict__ pDstD);

/* q8 (8-bit fixed-point) variant. */
void plp_dwt_q8(const int8_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_q8 wavelet,
                  plp_dwt_extension_mode mode,
                  int8_t *__restrict__ pDstA,
                  int8_t *__restrict__ pDstD);


/*
 * Multi-level DWT decomposition, float32 (going by the name and the `level`
 * parameter).
 *
 * pSrc     read-only input signal; must not overlap pTmp/pDst (__restrict__).
 * length   NOTE(review): presumably the number of input samples -- confirm.
 * wavelet  wavelet descriptor, passed by value.
 * mode     plp_dwt_extension_mode. NOTE(review): presumably selects how the
 *          signal is extended at its borders -- confirm.
 * level    NOTE(review): presumably the number of decomposition levels --
 *          confirm.
 * pTmp     caller-provided writable buffer. NOTE(review): presumably scratch
 *          space; required size not visible here -- confirm.
 * pDst     output buffer. NOTE(review): coefficient layout not visible here
 *          -- confirm.
 */
void plp_dwt_dec_f32(const float32_t *__restrict__ pSrc,
                     uint32_t length,
                     const plp_dwt_wavelet_f32 wavelet,
                     plp_dwt_extension_mode mode,
                     uint32_t level,
                     float32_t *__restrict__ pTmp,
                     float32_t *__restrict__ pDst);

/* Parallel variant: adds nPE and names the scratch buffer pTemp rather than
 * pTmp (same type and position).
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_dwt_dec_f32_parallel(const float32_t *__restrict__ pSrc,
                     uint32_t length,
                     const plp_dwt_wavelet_f32 wavelet,
                     plp_dwt_extension_mode mode,
                     uint32_t level,
                     uint32_t nPE,
                     float32_t *__restrict__ pTemp,
                     float32_t *__restrict__ pDst);

/*
 * float32 DWT kernel taking an explicit wavelet descriptor.
 * NOTE(review): presumably the single-core XpulpV2 kernel; pDstA/pDstD are
 * presumably approximation/detail coefficients, length the number of input
 * samples, and mode the border-extension mode -- confirm.
 */
void plp_dwt_f32s_xpulpv2(const float32_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_f32 wavelet,
                  plp_dwt_extension_mode mode,
                  float32_t *__restrict__ pDstA,
                  float32_t *__restrict__ pDstD);

/* Same as above but without a wavelet parameter.
 * NOTE(review): presumably specialised for the Haar wavelet -- confirm. */
void plp_dwt_haar_f32s_xpulpv2(const float32_t *__restrict__ pSrc,
                         uint32_t length,
                         plp_dwt_extension_mode mode,
                         float32_t *__restrict__ pDstA,
                         float32_t *__restrict__ pDstD);




/*
 * q32 DWT kernel taking an explicit wavelet descriptor.
 * NOTE(review): presumably the single-core XpulpV2 kernel; pDstA/pDstD are
 * presumably approximation/detail coefficients, length the number of input
 * samples, and mode the border-extension mode -- confirm.
 */
void plp_dwt_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_q32 wavelet,
                  plp_dwt_extension_mode mode,
                  int32_t *__restrict__ pDstA,
                  int32_t *__restrict__ pDstD);

/* Same as above but without a wavelet parameter.
 * NOTE(review): presumably specialised for the Haar wavelet -- confirm. */
void plp_dwt_haar_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
                         uint32_t length,
                         plp_dwt_extension_mode mode,
                         int32_t *__restrict__ pDstA,
                         int32_t *__restrict__ pDstD);



/*
 * q16 DWT kernel taking an explicit wavelet descriptor.
 * NOTE(review): presumably the single-core XpulpV2 kernel; pDstA/pDstD are
 * presumably approximation/detail coefficients, length the number of input
 * samples, and mode the border-extension mode -- confirm.
 */
void plp_dwt_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_q16 wavelet,
                  plp_dwt_extension_mode mode,
                  int16_t *__restrict__ pDstA,
                  int16_t *__restrict__ pDstD);

/* Same as above but without a wavelet parameter.
 * NOTE(review): presumably specialised for the Haar wavelet -- confirm. */
void plp_dwt_haar_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
                         uint32_t length,
                         plp_dwt_extension_mode mode,
                         int16_t *__restrict__ pDstA,
                         int16_t *__restrict__ pDstD);


/*
 * q8 DWT kernel taking an explicit wavelet descriptor.
 * NOTE(review): presumably the single-core XpulpV2 kernel; pDstA/pDstD are
 * presumably approximation/detail coefficients, length the number of input
 * samples, and mode the border-extension mode -- confirm.
 */
void plp_dwt_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_q8 wavelet,
                  plp_dwt_extension_mode mode,
                  int8_t *__restrict__ pDstA,
                  int8_t *__restrict__ pDstD);

/* Same as above but without a wavelet parameter.
 * NOTE(review): presumably specialised for the Haar wavelet -- confirm. */
void plp_dwt_haar_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
                         uint32_t length,
                         plp_dwt_extension_mode mode,
                         int8_t *__restrict__ pDstA,
                         int8_t *__restrict__ pDstD);


/*
 * Parallel single-level DWT entry points, one per data type. All four share
 * the same parameter shape:
 *
 * pSrc     read-only input signal; must not overlap the outputs
 *          (__restrict__).
 * length   NOTE(review): presumably the number of input samples -- confirm.
 * wavelet  wavelet descriptor, passed by value (type matches the data type).
 * mode     plp_dwt_extension_mode. NOTE(review): presumably selects how the
 *          signal is extended at its borders -- confirm.
 * nPE      NOTE(review): presumably the number of cores/processing elements
 *          to use -- confirm.
 * pDstA    output buffer. NOTE(review): presumably approximation
 *          coefficients -- confirm.
 * pDstD    output buffer. NOTE(review): presumably detail coefficients --
 *          confirm.
 */
void plp_dwt_f32_parallel(const float32_t *__restrict__ pSrc,
                 uint32_t length,
                 const plp_dwt_wavelet_f32 wavelet,
                 plp_dwt_extension_mode mode,
                 uint32_t nPE,
                 float32_t *__restrict__ pDstA,
                 float32_t *__restrict__ pDstD);


/* q8 (8-bit fixed-point) variant. */
void plp_dwt_q8_parallel(const int8_t *__restrict__ pSrc,
                 uint32_t length,
                 const plp_dwt_wavelet_q8 wavelet,
                 plp_dwt_extension_mode mode,
                 uint32_t nPE,
                 int8_t *__restrict__ pDstA,
                 int8_t *__restrict__ pDstD);


/* q16 (16-bit fixed-point) variant. */
void plp_dwt_q16_parallel(const int16_t *__restrict__ pSrc,
                 uint32_t length,
                 const plp_dwt_wavelet_q16 wavelet,
                 plp_dwt_extension_mode mode,
                 uint32_t nPE,
                 int16_t *__restrict__ pDstA,
                 int16_t *__restrict__ pDstD);

/* q32 (32-bit fixed-point) variant. */
void plp_dwt_q32_parallel(const int32_t *__restrict__ pSrc,
                 uint32_t length,
                 const plp_dwt_wavelet_q32 wavelet,
                 plp_dwt_extension_mode mode,
                 uint32_t nPE,
                 int32_t *__restrict__ pDstA,
                 int32_t *__restrict__ pDstD);


/*
 * DWT kernels that receive all arguments through one untyped pointer.
 * NOTE(review): by the naming pattern these are presumably the per-core
 * parallel XpulpV2 kernels, with the pointer referring to a project-defined
 * argument struct per data type; the "haar" variants are presumably
 * specialised for the Haar wavelet -- confirm the expected struct for each.
 */
void plp_dwt_f32p_xpulpv2(void *args);


/* float32, Haar-specific variant (see note above). */
void plp_dwt_haar_f32p_xpulpv2(void *args);


/* q8 variant. */
void plp_dwt_q8p_xpulpv2(void *args);



/* q8, Haar-specific variant. */
void plp_dwt_haar_q8p_xpulpv2(void *args);



/* q16 variant. */
void plp_dwt_q16p_xpulpv2(void *args);


/* q16, Haar-specific variant. */
void plp_dwt_haar_q16p_xpulpv2(void *args);

/* q32 variant (parameter is named `arg` here, `args` in the siblings). */
void plp_dwt_q32p_xpulpv2(void *arg);

/* q32, Haar-specific variant. */
void plp_dwt_haar_q32p_xpulpv2(void *args);





/*
 * q32 DWT kernel taking an explicit wavelet descriptor.
 * NOTE(review): presumably the single-core RV32IM kernel; pDstA/pDstD are
 * presumably approximation/detail coefficients, length the number of input
 * samples, and mode the border-extension mode -- confirm.
 */
void plp_dwt_q32s_rv32im(const int32_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_q32 wavelet,
                  plp_dwt_extension_mode mode,
                  int32_t *__restrict__ pDstA,
                  int32_t *__restrict__ pDstD);

/* Same as above but without a wavelet parameter.
 * NOTE(review): presumably specialised for the Haar wavelet -- confirm. */
void plp_dwt_haar_q32s_rv32im(const int32_t *__restrict__ pSrc,
                         uint32_t length,
                         plp_dwt_extension_mode mode,
                         int32_t *__restrict__ pDstA,
                         int32_t *__restrict__ pDstD);



/*
 * q16 DWT kernel taking an explicit wavelet descriptor.
 * NOTE(review): presumably the single-core RV32IM kernel; pDstA/pDstD are
 * presumably approximation/detail coefficients, length the number of input
 * samples, and mode the border-extension mode -- confirm.
 */
void plp_dwt_q16s_rv32im(const int16_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_q16 wavelet,
                  plp_dwt_extension_mode mode,
                  int16_t *__restrict__ pDstA,
                  int16_t *__restrict__ pDstD);

/* Same as above but without a wavelet parameter.
 * NOTE(review): presumably specialised for the Haar wavelet -- confirm. */
void plp_dwt_haar_q16s_rv32im(const int16_t *__restrict__ pSrc,
                         uint32_t length,
                         plp_dwt_extension_mode mode,
                         int16_t *__restrict__ pDstA,
                         int16_t *__restrict__ pDstD);


/*
 * q8 DWT kernel taking an explicit wavelet descriptor.
 * NOTE(review): presumably the single-core RV32IM kernel; pDstA/pDstD are
 * presumably approximation/detail coefficients, length the number of input
 * samples, and mode the border-extension mode -- confirm.
 */
void plp_dwt_q8s_rv32im(const int8_t *__restrict__ pSrc,
                  uint32_t length,
                  const plp_dwt_wavelet_q8 wavelet,
                  plp_dwt_extension_mode mode,
                  int8_t *__restrict__ pDstA,
                  int8_t *__restrict__ pDstD);

/* Same as above but without a wavelet parameter.
 * NOTE(review): presumably specialised for the Haar wavelet -- confirm. */
void plp_dwt_haar_q8s_rv32im(const int8_t *__restrict__ pSrc,
                         uint32_t length,
                         plp_dwt_extension_mode mode,
                         int8_t *__restrict__ pDstA,
                         int8_t *__restrict__ pDstD);


/*
 * Matrix addition, 32-bit integer.
 *
 * pSrcA, pSrcB  read-only inputs; __restrict__ means callers must not pass
 *               overlapping buffers (so in-place use is not permitted by
 *               the declaration).
 * M, N          matrix dimensions. NOTE(review): presumably rows and columns
 *               of all three matrices -- confirm.
 * pDst          output buffer.
 */
void plp_mat_add_i32(const int32_t *__restrict__ pSrcA,
                     const int32_t *__restrict__ pSrcB,
                     uint32_t M,
                     uint32_t N,
                     int32_t *__restrict__ pDst);

/* Same parameter list as plp_mat_add_i32.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_mat_add_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                             const int32_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             int32_t *__restrict__ pDst);

/* Same parameter list as plp_mat_add_i32.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_mat_add_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                              const int32_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              int32_t *__restrict__ pDst);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_mat_add_i32_parallel(const int32_t *__restrict__ pSrcA,
                              const int32_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t nPE,
                              int32_t *__restrict__ pDst);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a pointer to a project instance struct -- confirm which one. */
void plp_mat_add_i32p_xpulpv2(void *args);

/*
 * Matrix addition, 16-bit integer.
 *
 * pSrcA, pSrcB  read-only inputs; __restrict__ means callers must not pass
 *               overlapping buffers (so in-place use is not permitted by
 *               the declaration).
 * M, N          matrix dimensions. NOTE(review): presumably rows and columns
 *               of all three matrices -- confirm.
 * pDst          output buffer.
 */
void plp_mat_add_i16(const int16_t *__restrict__ pSrcA,
                     const int16_t *__restrict__ pSrcB,
                     uint32_t M,
                     uint32_t N,
                     int16_t *__restrict__ pDst);

/* Same parameter list as plp_mat_add_i16.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_mat_add_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                             const int16_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             int16_t *__restrict__ pDst);

/* Same parameter list as plp_mat_add_i16.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_mat_add_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                              const int16_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              int16_t *__restrict__ pDst);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_mat_add_i16_parallel(const int16_t *__restrict__ pSrcA,
                              const int16_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t nPE,
                              int16_t *__restrict__ pDst);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a pointer to a project instance struct -- confirm which one. */
void plp_mat_add_i16p_xpulpv2(void *args);

/*
 * Matrix addition, 8-bit integer.
 *
 * pSrcA, pSrcB  read-only inputs; __restrict__ means callers must not pass
 *               overlapping buffers (so in-place use is not permitted by
 *               the declaration).
 * M, N          matrix dimensions. NOTE(review): presumably rows and columns
 *               of all three matrices -- confirm.
 * pDst          output buffer.
 */
void plp_mat_add_i8(const int8_t *__restrict__ pSrcA,
                    const int8_t *__restrict__ pSrcB,
                    uint32_t M,
                    uint32_t N,
                    int8_t *__restrict__ pDst);

/* Same parameter list as plp_mat_add_i8.
 * NOTE(review): presumably the single-core RV32IM kernel -- confirm. */
void plp_mat_add_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                            const int8_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            int8_t *__restrict__ pDst);

/* Same parameter list as plp_mat_add_i8.
 * NOTE(review): presumably the single-core XpulpV2 kernel -- confirm. */
void plp_mat_add_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                             const int8_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             int8_t *__restrict__ pDst);

/* Adds nPE to the parameter list.
 * NOTE(review): nPE is presumably the number of cores/processing elements to use -- confirm. */
void plp_mat_add_i8_parallel(const int8_t *__restrict__ pSrcA,
                             const int8_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             uint32_t nPE,
                             int8_t *__restrict__ pDst);

/* Receives all arguments through one untyped pointer.
 * NOTE(review): presumably a pointer to a project instance struct -- confirm which one. */
void plp_mat_add_i8p_xpulpv2(void *args);

/*
 * plp_mat_add_f32 family: matrix addition on float data, going by the name.
 * No "s_rv32im" variant is declared for this family in this part of the header.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N          matrix dimensions — presumably rows and columns, TODO confirm
 *   nPE           parallel entry point only — presumably the number of cores to use
 *   pDst          destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_add_f32(const float *__restrict__ pSrcA,
                     const float *__restrict__ pSrcB,
                     uint32_t M,
                     uint32_t N,
                     float *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_add_f32s_xpulpv2(const float *__restrict__ pSrcA,
                              const float *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              float *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_add_f32_parallel(const float *__restrict__ pSrcA,
                              const float *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t nPE,
                              float *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_add_f32p_xpulpv2(void *args);

/*
 * plp_mat_sub_i32 family: matrix subtraction on int32_t data, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__); presumably the result is A - B,
 *                 TODO confirm operand order
 *   M, N          matrix dimensions — presumably rows and columns, TODO confirm
 *   nPE           parallel entry point only — presumably the number of cores to use
 *   pDst          destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_sub_i32(const int32_t *__restrict__ pSrcA,
                     const int32_t *__restrict__ pSrcB,
                     uint32_t M,
                     uint32_t N,
                     int32_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_sub_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                             const int32_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             int32_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_sub_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                              const int32_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              int32_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_sub_i32_parallel(const int32_t *__restrict__ pSrcA,
                              const int32_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t nPE,
                              int32_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_sub_i32p_xpulpv2(void *args);

/*
 * plp_mat_sub_i16 family: matrix subtraction on int16_t data, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__); presumably the result is A - B,
 *                 TODO confirm operand order
 *   M, N          matrix dimensions — presumably rows and columns, TODO confirm
 *   nPE           parallel entry point only — presumably the number of cores to use
 *   pDst          destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_sub_i16(const int16_t *__restrict__ pSrcA,
                     const int16_t *__restrict__ pSrcB,
                     uint32_t M,
                     uint32_t N,
                     int16_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_sub_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                             const int16_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             int16_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_sub_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                              const int16_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              int16_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_sub_i16_parallel(const int16_t *__restrict__ pSrcA,
                              const int16_t *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t nPE,
                              int16_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_sub_i16p_xpulpv2(void *args);

/*
 * plp_mat_sub_i8 family: matrix subtraction on int8_t data, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__); presumably the result is A - B,
 *                 TODO confirm operand order
 *   M, N          matrix dimensions — presumably rows and columns, TODO confirm
 *   nPE           parallel entry point only — presumably the number of cores to use
 *   pDst          destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_sub_i8(const int8_t *__restrict__ pSrcA,
                    const int8_t *__restrict__ pSrcB,
                    uint32_t M,
                    uint32_t N,
                    int8_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_sub_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                            const int8_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            int8_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_sub_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                             const int8_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             int8_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_sub_i8_parallel(const int8_t *__restrict__ pSrcA,
                             const int8_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             uint32_t nPE,
                             int8_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_sub_i8p_xpulpv2(void *args);

/*
 * plp_mat_sub_f32 family: matrix subtraction on float data, going by the name.
 * No "s_rv32im" variant is declared for this family in this part of the header.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__); presumably the result is A - B,
 *                 TODO confirm operand order
 *   M, N          matrix dimensions — presumably rows and columns, TODO confirm
 *   nPE           parallel entry point only — presumably the number of cores to use
 *   pDst          destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_sub_f32(const float *__restrict__ pSrcA,
                     const float *__restrict__ pSrcB,
                     uint32_t M,
                     uint32_t N,
                     float *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_sub_f32s_xpulpv2(const float *__restrict__ pSrcA,
                              const float *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              float *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_sub_f32_parallel(const float *__restrict__ pSrcA,
                              const float *__restrict__ pSrcB,
                              uint32_t M,
                              uint32_t N,
                              uint32_t nPE,
                              float *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_sub_f32p_xpulpv2(void *args);

/*
 * plp_mat_scale_i32 family: scaling of an int32_t matrix, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrc         input matrix (const, __restrict__)
 *   M, N         matrix dimensions — presumably rows and columns, TODO confirm
 *   scaleFactor  scaling factor, int32_t like the matrix elements
 *   shift        signed shift amount — presumably applied to each scaled element;
 *                TODO confirm direction and rounding
 *   nPE          parallel entry point only — presumably the number of cores to use
 *   pDst         destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_scale_i32(const int32_t *__restrict__ pSrc,
                       uint32_t M,
                       uint32_t N,
                       int32_t scaleFactor,
                       int32_t shift,
                       int32_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_scale_i32s_rv32im(const int32_t *__restrict__ pSrc,
                               uint32_t M,
                               uint32_t N,
                               int32_t scaleFactor,
                               int32_t shift,
                               int32_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_scale_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
                                uint32_t M,
                                uint32_t N,
                                int32_t scaleFactor,
                                int32_t shift,
                                int32_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_scale_i32_parallel(const int32_t *__restrict__ pSrc,
                                uint32_t M,
                                uint32_t N,
                                int32_t scaleFactor,
                                int32_t shift,
                                uint32_t nPE,
                                int32_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_scale_i32p_xpulpv2(void *args);

/*
 * plp_mat_scale_i16 family: scaling of an int16_t matrix, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrc         input matrix (const, __restrict__)
 *   M, N         matrix dimensions — presumably rows and columns, TODO confirm
 *   scaleFactor  scaling factor, int16_t like the matrix elements
 *   shift        signed shift amount (int32_t) — presumably applied to each scaled
 *                element; TODO confirm direction and rounding
 *   nPE          parallel entry point only — presumably the number of cores to use
 *   pDst         destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_scale_i16(const int16_t *__restrict__ pSrc,
                       uint32_t M,
                       uint32_t N,
                       int16_t scaleFactor,
                       int32_t shift,
                       int16_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_scale_i16s_rv32im(const int16_t *__restrict__ pSrc,
                               uint32_t M,
                               uint32_t N,
                               int16_t scaleFactor,
                               int32_t shift,
                               int16_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_scale_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
                                uint32_t M,
                                uint32_t N,
                                int16_t scaleFactor,
                                int32_t shift,
                                int16_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_scale_i16_parallel(const int16_t *__restrict__ pSrc,
                                uint32_t M,
                                uint32_t N,
                                int16_t scaleFactor,
                                int32_t shift,
                                uint32_t nPE,
                                int16_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_scale_i16p_xpulpv2(void *args);

/*
 * plp_mat_scale_i8 family: scaling of an int8_t matrix, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrc         input matrix (const, __restrict__)
 *   M, N         matrix dimensions — presumably rows and columns, TODO confirm
 *   scaleFactor  scaling factor, int8_t like the matrix elements
 *   shift        signed shift amount (int32_t) — presumably applied to each scaled
 *                element; TODO confirm direction and rounding
 *   nPE          parallel entry point only — presumably the number of cores to use
 *   pDst         destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_scale_i8(const int8_t *__restrict__ pSrc,
                      uint32_t M,
                      uint32_t N,
                      int8_t scaleFactor,
                      int32_t shift,
                      int8_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_scale_i8s_rv32im(const int8_t *__restrict__ pSrc,
                              uint32_t M,
                              uint32_t N,
                              int8_t scaleFactor,
                              int32_t shift,
                              int8_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_scale_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
                               uint32_t M,
                               uint32_t N,
                               int8_t scaleFactor,
                               int32_t shift,
                               int8_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_scale_i8_parallel(const int8_t *__restrict__ pSrc,
                               uint32_t M,
                               uint32_t N,
                               int8_t scaleFactor,
                               int32_t shift,
                               uint32_t nPE,
                               int8_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_scale_i8p_xpulpv2(void *args);

/*
 * plp_mat_scale_f32 family: scaling of a float matrix, going by the name.
 * Unlike the integer scale prototypes, these take no `shift` argument, and no
 * "s_rv32im" variant is declared in this part of the header.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrc         input matrix (const, __restrict__)
 *   M, N         matrix dimensions — presumably rows and columns, TODO confirm
 *   scaleFactor  scaling factor (float)
 *   nPE          parallel entry point only — presumably the number of cores to use
 *   pDst         destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_scale_f32(const float *__restrict__ pSrc,
                       uint32_t M,
                       uint32_t N,
                       float scaleFactor,
                       float *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_scale_f32s_xpulpv2(const float *__restrict__ pSrc,
                                uint32_t M,
                                uint32_t N,
                                float scaleFactor,
                                float *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_scale_f32_parallel(const float *__restrict__ pSrc,
                                uint32_t M,
                                uint32_t N,
                                float scaleFactor,
                                uint32_t nPE,
                                float *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_scale_f32p_xpulpv2(void *args);

/*
 * plp_mat_trans_i32 family: transposition of an int32_t matrix, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrc  input matrix (const, __restrict__)
 *   M, N  dimensions — presumably pSrc is M x N and pDst is N x M, TODO confirm
 *   nPE   parallel entry point only — presumably the number of cores to use
 *   pDst  destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_trans_i32(const int32_t *__restrict__ pSrc,
                       uint32_t M,
                       uint32_t N,
                       int32_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_trans_i32s_rv32im(const int32_t *__restrict__ pSrc,
                               uint32_t M,
                               uint32_t N,
                               int32_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_trans_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
                                uint32_t M,
                                uint32_t N,
                                int32_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_trans_i32_parallel(const int32_t *__restrict__ pSrc,
                                uint32_t M,
                                uint32_t N,
                                uint32_t nPE,
                                int32_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_trans_i32p_xpulpv2(void *args);

/*
 * plp_mat_trans_i16 family: transposition of an int16_t matrix, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrc  input matrix (const, __restrict__)
 *   M, N  dimensions — presumably pSrc is M x N and pDst is N x M, TODO confirm
 *   nPE   parallel entry point only — presumably the number of cores to use
 *   pDst  destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_trans_i16(const int16_t *__restrict__ pSrc,
                       uint32_t M,
                       uint32_t N,
                       int16_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_trans_i16s_rv32im(const int16_t *__restrict__ pSrc,
                               uint32_t M,
                               uint32_t N,
                               int16_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_trans_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
                                uint32_t M,
                                uint32_t N,
                                int16_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_trans_i16_parallel(const int16_t *__restrict__ pSrc,
                                uint32_t M,
                                uint32_t N,
                                uint32_t nPE,
                                int16_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_trans_i16p_xpulpv2(void *args);

/*
 * plp_mat_trans_i8 family: transposition of an int8_t matrix, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrc  input matrix (const, __restrict__)
 *   M, N  dimensions — presumably pSrc is M x N and pDst is N x M, TODO confirm
 *   nPE   parallel entry point only — presumably the number of cores to use
 *   pDst  destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_trans_i8(const int8_t *__restrict__ pSrc,
                      uint32_t M,
                      uint32_t N,
                      int8_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_trans_i8s_rv32im(const int8_t *__restrict__ pSrc,
                              uint32_t M,
                              uint32_t N,
                              int8_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_trans_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
                               uint32_t M,
                               uint32_t N,
                               int8_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_trans_i8_parallel(const int8_t *__restrict__ pSrc,
                               uint32_t M,
                               uint32_t N,
                               uint32_t nPE,
                               int8_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_trans_i8p_xpulpv2(void *args);

/*
 * plp_mat_trans_f32 family: transposition of a float matrix, going by the name.
 * Only these two entry points are declared here; unlike the integer transpose
 * families, no "s_"/"p_" kernel prototypes follow for f32.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrc  input matrix (const, __restrict__)
 *   M, N  dimensions — presumably pSrc is M x N and pDst is N x M, TODO confirm
 *   nPE   parallel entry point only — presumably the number of cores to use
 *   pDst  destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — which kernel it ends up running is not visible here. */
void plp_mat_trans_f32(const float *__restrict__ pSrc,
                       uint32_t M,
                       uint32_t N,
                       float *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_trans_f32_parallel(
    const float *__restrict__ pSrc, uint32_t M, uint32_t N, uint32_t nPE, float *__restrict__ pDst);

/*
 * plp_mat_inv_f32 family: inversion of a float matrix, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrc    input matrix; unlike most neighbouring prototypes it is NOT const, so the
 *           implementation may modify it — confirm before reusing pSrc after the call
 *   pDst    destination matrix (non-const, __restrict__)
 *   N       the only dimension argument — presumably the matrix is N x N
 *   nPE     parallel entry point only — presumably the number of cores to use
 *   return  int; its meaning is not visible here (presumably a status code, e.g. to
 *           report a non-invertible input) — TODO confirm
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
int plp_mat_inv_f32(float *__restrict__ pSrc, float *__restrict__ pDst, uint32_t N);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
int plp_mat_inv_f32s_xpulpv2(float *__restrict__ pSrc, float *__restrict__ pDst, uint32_t N);

/* Parallel entry point: same arguments plus nPE (which comes last here). */
int plp_mat_inv_f32_parallel( float *__restrict__ pSrc,
                              float *__restrict__ pDst,
                              uint32_t N,
                              uint32_t nPE);

/* "p_xpulpv2" variant: takes one untyped pointer and, unlike most p_ kernels nearby, returns int. */
int plp_mat_inv_f32p_xpulpv2(void *args);

/*
 * plp_mat_fill_I_i32 family: presumably writes an identity matrix of int32_t values
 * (reading "fill_I" as "fill with I") — TODO confirm.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   N     the only dimension argument — presumably the matrix is N x N
 *   nPE   parallel entry point only — presumably the number of cores to use
 *   pDst  destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_fill_I_i32(uint32_t N, int32_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_fill_I_i32s_rv32im(uint32_t N, int32_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_fill_I_i32s_xpulpv2(uint32_t N, int32_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_fill_I_i32_parallel(uint32_t N, uint32_t nPE, int32_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_fill_I_i32p_xpulpv2(void *args);

/*
 * plp_mat_fill_I_i16 family: presumably writes an identity matrix of int16_t values
 * (reading "fill_I" as "fill with I") — TODO confirm.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   N     the only dimension argument — presumably the matrix is N x N
 *   nPE   parallel entry point only — presumably the number of cores to use
 *   pDst  destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_fill_I_i16(uint32_t N, int16_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_fill_I_i16s_rv32im(uint32_t N, int16_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_fill_I_i16s_xpulpv2(uint32_t N, int16_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_fill_I_i16_parallel(uint32_t N, uint32_t nPE, int16_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_fill_I_i16p_xpulpv2(void *args);

/*
 * plp_mat_fill_I_i8 family: presumably writes an identity matrix of int8_t values
 * (reading "fill_I" as "fill with I") — TODO confirm.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   N     the only dimension argument — presumably the matrix is N x N
 *   nPE   parallel entry point only — presumably the number of cores to use
 *   pDst  destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_fill_I_i8(uint32_t N, int8_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_fill_I_i8s_rv32im(uint32_t N, int8_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_fill_I_i8s_xpulpv2(uint32_t N, int8_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_fill_I_i8_parallel(uint32_t N, uint32_t nPE, int8_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_fill_I_i8p_xpulpv2(void *args);

/*
 * plp_mat_fill_I_f32 family: presumably writes an identity matrix of float values
 * (reading "fill_I" as "fill with I") — TODO confirm.
 * No "s_rv32im" variant is declared for this family in this part of the header.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   N     the only dimension argument — presumably the matrix is N x N
 *   nPE   parallel entry point only — presumably the number of cores to use
 *   pDst  destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_fill_I_f32(uint32_t N, float *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_fill_I_f32s_xpulpv2(uint32_t N, float *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_fill_I_f32_parallel(uint32_t N, uint32_t nPE, float *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_fill_I_f32p_xpulpv2(void *args);

/*
 * plp_mat_fill_I_q32 family: presumably writes a fixed-point identity matrix stored
 * in int32_t (reading "fill_I" as "fill with I", "q32" as 32-bit fixed point) — TODO confirm.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   N         the only dimension argument — presumably the matrix is N x N
 *   fracBits  presumably the number of fractional bits of the fixed-point format, which
 *             would decide how the value 1 is encoded — TODO confirm
 *   nPE       parallel entry point only — presumably the number of cores to use
 *   pDst      destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_fill_I_q32(uint32_t N, int32_t fracBits, int32_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_fill_I_q32s_rv32im(uint32_t N, int32_t fracBits, int32_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_fill_I_q32s_xpulpv2(uint32_t N, int32_t fracBits, int32_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_fill_I_q32_parallel(uint32_t N,
                                 int32_t fracBits,
                                 uint32_t nPE,
                                 int32_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_fill_I_q32p_xpulpv2(void *args);

/*
 * plp_mat_fill_I_q16 family: presumably writes a fixed-point identity matrix stored
 * in int16_t (reading "fill_I" as "fill with I", "q16" as 16-bit fixed point) — TODO confirm.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   N         the only dimension argument — presumably the matrix is N x N
 *   fracBits  presumably the number of fractional bits of the fixed-point format, which
 *             would decide how the value 1 is encoded — TODO confirm
 *   nPE       parallel entry point only — presumably the number of cores to use
 *   pDst      destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_fill_I_q16(uint32_t N, int32_t fracBits, int16_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_fill_I_q16s_rv32im(uint32_t N, int32_t fracBits, int16_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_fill_I_q16s_xpulpv2(uint32_t N, int32_t fracBits, int16_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_fill_I_q16_parallel(uint32_t N,
                                 int32_t fracBits,
                                 uint32_t nPE,
                                 int16_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_fill_I_q16p_xpulpv2(void *args);

/*
 * plp_mat_fill_I_q8 family: presumably writes a fixed-point identity matrix stored
 * in int8_t (reading "fill_I" as "fill with I", "q8" as 8-bit fixed point) — TODO confirm.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   N         the only dimension argument — presumably the matrix is N x N
 *   fracBits  presumably the number of fractional bits of the fixed-point format, which
 *             would decide how the value 1 is encoded — TODO confirm
 *   nPE       parallel entry point only — presumably the number of cores to use
 *   pDst      destination matrix (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_fill_I_q8(uint32_t N, int32_t fracBits, int8_t *__restrict__ pDst);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_fill_I_q8s_rv32im(uint32_t N, int32_t fracBits, int8_t *__restrict__ pDst);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_fill_I_q8s_xpulpv2(uint32_t N, int32_t fracBits, int8_t *__restrict__ pDst);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_fill_I_q8_parallel(uint32_t N,
                                int32_t fracBits,
                                uint32_t nPE,
                                int8_t *__restrict__ pDst);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_fill_I_q8p_xpulpv2(void *args);

/*
 * plp_mat_mult_stride_i32, single-core entry points: strided matrix multiplication on
 * int32_t data, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N, O       dimensions — presumably A is M x N, B is N x O and C is M x O; TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   pDstC         destination matrix, int32_t (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_mult_stride_i32(const int32_t *__restrict__ pSrcA,
                             const int32_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             uint32_t O,
                             uint32_t strideA,
                             uint32_t strideB,
                             uint32_t strideC,
                             int32_t *__restrict__ pDstC);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_mult_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideC,
                                     int32_t *__restrict__ pDstC);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_mult_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                      const int32_t *__restrict__ pSrcB,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t O,
                                      uint32_t strideA,
                                      uint32_t strideB,
                                      uint32_t strideC,
                                      int32_t *__restrict__ pDstC);

/*
 * plp_mat_mult_stride_i16, single-core entry points: strided matrix multiplication on
 * int16_t inputs, going by the name. The destination is int32_t, wider than the inputs.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N, O       dimensions — presumably A is M x N, B is N x O and C is M x O; TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   pDstC         destination matrix, int32_t (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_mult_stride_i16(const int16_t *__restrict__ pSrcA,
                             const int16_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             uint32_t O,
                             uint32_t strideA,
                             uint32_t strideB,
                             uint32_t strideC,
                             int32_t *__restrict__ pDstC);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_mult_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideC,
                                     int32_t *__restrict__ pDstC);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_mult_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                      const int16_t *__restrict__ pSrcB,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t O,
                                      uint32_t strideA,
                                      uint32_t strideB,
                                      uint32_t strideC,
                                      int32_t *__restrict__ pDstC);

/*
 * plp_mat_mult_stride_i8, single-core entry points: strided matrix multiplication on
 * int8_t inputs, going by the name. The destination is int32_t, wider than the inputs.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N, O       dimensions — presumably A is M x N, B is N x O and C is M x O; TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   pDstC         destination matrix, int32_t (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_mult_stride_i8(const int8_t *__restrict__ pSrcA,
                            const int8_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            uint32_t strideA,
                            uint32_t strideB,
                            uint32_t strideC,
                            int32_t *__restrict__ pDstC);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_mult_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t strideA,
                                    uint32_t strideB,
                                    uint32_t strideC,
                                    int32_t *__restrict__ pDstC);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_mult_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                     const int8_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideC,
                                     int32_t *__restrict__ pDstC);

/*
 * Parallel entry points for strided int32_t matrix multiplication (going by the name).
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N, O       dimensions — presumably A is M x N, B is N x O and C is M x O; TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   nPE           presumably the number of cores to use; TODO confirm
 *   pDstC         destination matrix, int32_t (non-const, __restrict__)
 */
void plp_mat_mult_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
                                      const int32_t *__restrict__ pSrcB,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t O,
                                      uint32_t strideA,
                                      uint32_t strideB,
                                      uint32_t strideC,
                                      uint32_t nPE,
                                      int32_t *__restrict__ pDstC);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_mult_stride_i32p_xpulpv2(void *args);

/*
 * Parallel entry points for strided int16_t matrix multiplication (going by the name).
 * The destination is int32_t, wider than the inputs.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N, O       dimensions — presumably A is M x N, B is N x O and C is M x O; TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   nPE           presumably the number of cores to use; TODO confirm
 *   pDstC         destination matrix, int32_t (non-const, __restrict__)
 */
void plp_mat_mult_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
                                      const int16_t *__restrict__ pSrcB,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t O,
                                      uint32_t strideA,
                                      uint32_t strideB,
                                      uint32_t strideC,
                                      uint32_t nPE,
                                      int32_t *__restrict__ pDstC);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_mult_stride_i16p_xpulpv2(void *args);

/*
 * Parallel entry point for strided int8_t matrix multiplication (going by the name).
 * The destination is int32_t, wider than the inputs.
 *
 * NOTE(review): declaration only — the roles below are read off the signature and
 * names; confirm against the implementation.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N, O       dimensions — presumably A is M x N, B is N x O and C is M x O; TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   nPE           presumably the number of cores to use; TODO confirm
 *   pDstC         destination matrix, int32_t (non-const, __restrict__)
 */
void plp_mat_mult_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
                                     const int8_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideC,
                                     uint32_t nPE,
                                     int32_t *__restrict__ pDstC);

/*
 * plp_mat_mult_stride_f32 family: strided matrix multiplication on float data, going by
 * the name. No "s_rv32im" variant is declared for this family in this part of the header.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N, O       dimensions — presumably A is M x N, B is N x O and C is M x O; TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   nPE           parallel entry point only — presumably the number of cores to use
 *   pDstC         destination matrix, float (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_mult_stride_f32(const float *__restrict__ pSrcA,
                             const float *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             uint32_t O,
                             uint32_t strideA,
                             uint32_t strideB,
                             uint32_t strideC,
                             float *__restrict__ pDstC);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_mult_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
                                      const float *__restrict__ pSrcB,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t O,
                                      uint32_t strideA,
                                      uint32_t strideB,
                                      uint32_t strideC,
                                      float *__restrict__ pDstC);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_mult_stride_f32_parallel(const float *__restrict__ pSrcA,
                                      const float *__restrict__ pSrcB,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t O,
                                      uint32_t strideA,
                                      uint32_t strideB,
                                      uint32_t strideC,
                                      uint32_t nPE,
                                      float *__restrict__ pDstC);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_mult_stride_f32p_xpulpv2(void *args);

/*
 * "p_xpulpv2" variant of the int8_t strided matrix multiplication, going by the name.
 * Takes one untyped pointer; the struct it expects is not shown here.
 * NOTE(review): this prototype sits after the f32 group rather than directly after
 * plp_mat_mult_stride_i8_parallel, unlike the i32/i16 counterparts.
 */
void plp_mat_mult_stride_i8p_xpulpv2(void *args);

/*
 * plp_mat_mult_stride_q32 family: strided matrix multiplication on 32-bit fixed-point
 * data stored in int32_t, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N, O       dimensions — presumably A is M x N, B is N x O and C is M x O; TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   shift         unsigned shift amount — presumably a right shift that rescales the
 *                 accumulated products; TODO confirm
 *   nPE           parallel entry point only — presumably the number of cores to use
 *   pDstC         destination matrix, int32_t like the inputs (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_mult_stride_q32(const int32_t *__restrict__ pSrcA,
                             const int32_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             uint32_t O,
                             uint32_t strideA,
                             uint32_t strideB,
                             uint32_t strideC,
                             uint32_t shift,
                             int32_t *__restrict__ pDstC);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_mult_stride_q32_parallel(const int32_t *__restrict__ pSrcA,
                                      const int32_t *__restrict__ pSrcB,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t O,
                                      uint32_t strideA,
                                      uint32_t strideB,
                                      uint32_t strideC,
                                      uint32_t shift,
                                      uint32_t nPE,
                                      int32_t *__restrict__ pDstC);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_mult_stride_q32s_rv32im(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideC,
                                     uint32_t shift,
                                     int32_t *__restrict__ pDstC);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_mult_stride_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                      const int32_t *__restrict__ pSrcB,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t O,
                                      uint32_t strideA,
                                      uint32_t strideB,
                                      uint32_t strideC,
                                      uint32_t shift,
                                      int32_t *__restrict__ pDstC);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_mult_stride_q32p_xpulpv2(void *args);

/*
 * plp_mat_mult_stride_q16 family: strided matrix multiplication on 16-bit fixed-point
 * data stored in int16_t, going by the name. Unlike the i16 prototypes, the destination
 * here is int16_t, the same width as the inputs.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N, O       dimensions — presumably A is M x N, B is N x O and C is M x O; TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   shift         unsigned shift amount — presumably a right shift that rescales the
 *                 accumulated products; TODO confirm
 *   nPE           parallel entry point only — presumably the number of cores to use
 *   pDstC         destination matrix, int16_t (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_mult_stride_q16(const int16_t *__restrict__ pSrcA,
                             const int16_t *__restrict__ pSrcB,
                             uint32_t M,
                             uint32_t N,
                             uint32_t O,
                             uint32_t strideA,
                             uint32_t strideB,
                             uint32_t strideC,
                             uint32_t shift,
                             int16_t *__restrict__ pDstC);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_mult_stride_q16_parallel(const int16_t *__restrict__ pSrcA,
                                      const int16_t *__restrict__ pSrcB,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t O,
                                      uint32_t strideA,
                                      uint32_t strideB,
                                      uint32_t strideC,
                                      uint32_t shift,
                                      uint32_t nPE,
                                      int16_t *__restrict__ pDstC);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_mult_stride_q16s_rv32im(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideC,
                                     uint32_t shift,
                                     int16_t *__restrict__ pDstC);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_mult_stride_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                      const int16_t *__restrict__ pSrcB,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t O,
                                      uint32_t strideA,
                                      uint32_t strideB,
                                      uint32_t strideC,
                                      uint32_t shift,
                                      int16_t *__restrict__ pDstC);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_mult_stride_q16p_xpulpv2(void *args);

/*
 * plp_mat_mult_stride_q8 family: strided matrix multiplication on 8-bit fixed-point
 * data stored in int8_t, going by the name. Unlike the i8 prototypes, the destination
 * here is int8_t, the same width as the inputs.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__)
 *   M, N, O       dimensions — presumably A is M x N, B is N x O and C is M x O; TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   shift         unsigned shift amount — presumably a right shift that rescales the
 *                 accumulated products; TODO confirm
 *   nPE           parallel entry point only — presumably the number of cores to use
 *   pDstC         destination matrix, int8_t (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_mult_stride_q8(const int8_t *__restrict__ pSrcA,
                            const int8_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t O,
                            uint32_t strideA,
                            uint32_t strideB,
                            uint32_t strideC,
                            uint32_t shift,
                            int8_t *__restrict__ pDstC);

/* Parallel entry point: same arguments plus nPE. */
void plp_mat_mult_stride_q8_parallel(const int8_t *__restrict__ pSrcA,
                                     const int8_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideC,
                                     uint32_t shift,
                                     uint32_t nPE,
                                     int8_t *__restrict__ pDstC);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_mult_stride_q8s_rv32im(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t O,
                                    uint32_t strideA,
                                    uint32_t strideB,
                                    uint32_t strideC,
                                    uint32_t shift,
                                    int8_t *__restrict__ pDstC);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_mult_stride_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                     const int8_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t O,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideC,
                                     uint32_t shift,
                                     int8_t *__restrict__ pDstC);

/* "p_xpulpv2" variant: takes one untyped pointer; the struct it expects is not shown here. */
void plp_mat_mult_stride_q8p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_stride_i32, single-core entry points: strided int32_t matrix
 * multiplication with a transposed operand, going by the name.
 *
 * NOTE(review): declarations only — the roles below are read off the signatures and
 * names; confirm against the implementations.
 *   pSrcA, pSrcB  input matrices (const, __restrict__); presumably pSrcB is supplied
 *                 already transposed — TODO confirm which operand and its layout
 *   M, N, O       dimensions — presumably the product is M x O with inner dimension N;
 *                 TODO confirm
 *   strideA/B/C   presumably the distance between consecutive rows of A, B and C;
 *                 TODO confirm the unit (elements vs bytes)
 *   pDstC         destination matrix, int32_t (non-const, __restrict__)
 */

/* Un-suffixed entry point — presumably selects a target-specific kernel; TODO confirm. */
void plp_mat_mult_trans_stride_i32(const int32_t *__restrict__ pSrcA,
                                   const int32_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideC,
                                   int32_t *__restrict__ pDstC);

/* "s_rv32im" variant — presumably single-core code for plain RV32IM; TODO confirm. */
void plp_mat_mult_trans_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                                           const int32_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           int32_t *__restrict__ pDstC);

/* "s_xpulpv2" variant — presumably single-core code using XpulpV2; TODO confirm. */
void plp_mat_mult_trans_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                            const int32_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            int32_t *__restrict__ pDstC);

/*
 * plp_mat_mult_trans_stride_i16 family (int16_t inputs, int32_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int16_t pointers; pDstC is
 *     the only writable pointer and its element type is int32_t, i.e. wider
 *     than the inputs.
 *   - M, N, O and strideA/strideB/strideC are uint32_t.
 *   - All three functions share the same parameter list and return nothing.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "trans" presumably means the second operand is supplied transposed;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C;
 *   - the unsuffixed function is presumably the entry point that dispatches
 *     to the ISA-specific kernels declared after it.
 */
void plp_mat_mult_trans_stride_i16(const int16_t *__restrict__ pSrcA,
                                   const int16_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideC,
                                   int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the RV32IM kernel. */
void plp_mat_mult_trans_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                                           const int16_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_trans_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                            const int16_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            int32_t *__restrict__ pDstC);

/*
 * plp_mat_mult_trans_stride_i8 family (int8_t inputs, int32_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int8_t pointers; pDstC is
 *     the only writable pointer and its element type is int32_t, i.e. wider
 *     than the inputs.
 *   - M, N, O and strideA/strideB/strideC are uint32_t.
 *   - All three functions share the same parameter list and return nothing.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "trans" presumably means the second operand is supplied transposed;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C;
 *   - the unsuffixed function is presumably the entry point that dispatches
 *     to the ISA-specific kernels declared after it.
 */
void plp_mat_mult_trans_stride_i8(const int8_t *__restrict__ pSrcA,
                                  const int8_t *__restrict__ pSrcB,
                                  uint32_t M,
                                  uint32_t N,
                                  uint32_t O,
                                  uint32_t strideA,
                                  uint32_t strideB,
                                  uint32_t strideC,
                                  int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the RV32IM kernel. */
void plp_mat_mult_trans_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                                          const int8_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          uint32_t strideA,
                                          uint32_t strideB,
                                          uint32_t strideC,
                                          int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_trans_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                           const int8_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           int32_t *__restrict__ pDstC);

/*
 * Parallel declarations for the int32_t "trans_stride" matrix multiplication.
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int32_t pointers; pDstC is
 *     the only writable pointer (int32_t).
 *   - M, N, O, the three strides and nPE are uint32_t; nPE sits directly
 *     before pDstC.
 *   - Neither function returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - nPE is presumably the number of processing elements (cores) to use;
 *   - "trans" presumably means the second operand is supplied transposed;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C.
 */
void plp_mat_mult_trans_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
                                            const int32_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t nPE,
                                            int32_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to a struct that bundles the arguments above -
 * confirm against the caller. */
void plp_mat_mult_trans_stride_i32p_xpulpv2(void *args);

/*
 * Parallel declarations for the int16_t "trans_stride" matrix multiplication.
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int16_t pointers; pDstC is
 *     the only writable pointer and its element type is int32_t.
 *   - M, N, O, the three strides and nPE are uint32_t; nPE sits directly
 *     before pDstC.
 *   - Neither function returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - nPE is presumably the number of processing elements (cores) to use;
 *   - "trans" presumably means the second operand is supplied transposed;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C.
 */
void plp_mat_mult_trans_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
                                            const int16_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t nPE,
                                            int32_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to a struct that bundles the arguments above -
 * confirm against the caller. */
void plp_mat_mult_trans_stride_i16p_xpulpv2(void *args);

/*
 * Parallel declarations for the int8_t "trans_stride" matrix multiplication.
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int8_t pointers; pDstC is
 *     the only writable pointer and its element type is int32_t.
 *   - M, N, O, the three strides and nPE are uint32_t; nPE sits directly
 *     before pDstC.
 *   - Neither function returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - nPE is presumably the number of processing elements (cores) to use;
 *   - "trans" presumably means the second operand is supplied transposed;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C.
 */
void plp_mat_mult_trans_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
                                           const int8_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           uint32_t nPE,
                                           int32_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to a struct that bundles the arguments above -
 * confirm against the caller. */
void plp_mat_mult_trans_stride_i8p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_stride_q32 family (int32_t inputs, int32_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int32_t pointers; pDstC is
 *     the only writable pointer and has the same element type as the inputs.
 *   - M, N, O, the three strides and `shift` are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE between `shift`
 *     and pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "q32" presumably denotes 32-bit fixed-point data and `shift` the
 *     scaling shift applied to results;
 *   - "trans" presumably means the second operand is supplied transposed;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C;
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_trans_stride_q32(const int32_t *__restrict__ pSrcA,
                                   const int32_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideC,
                                   uint32_t shift,
                                   int32_t *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_trans_stride_q32_parallel(const int32_t *__restrict__ pSrcA,
                                            const int32_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t shift,
                                            uint32_t nPE,
                                            int32_t *__restrict__ pDstC);

/* Same parameter list as the first prototype; per its suffix, the RV32IM kernel. */
void plp_mat_mult_trans_stride_q32s_rv32im(const int32_t *__restrict__ pSrcA,
                                           const int32_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           uint32_t shift,
                                           int32_t *__restrict__ pDstC);

/* Same parameter list as the first prototype; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_trans_stride_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                            const int32_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t shift,
                                            int32_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_trans_stride_q32p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_stride_q16 family (int16_t inputs, int16_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int16_t pointers; pDstC is
 *     the only writable pointer and has the same element type as the inputs.
 *   - M, N, O, the three strides and `shift` are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE between `shift`
 *     and pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "q16" presumably denotes 16-bit fixed-point data and `shift` the
 *     scaling shift applied to results;
 *   - "trans" presumably means the second operand is supplied transposed;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C;
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_trans_stride_q16(const int16_t *__restrict__ pSrcA,
                                   const int16_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideC,
                                   uint32_t shift,
                                   int16_t *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_trans_stride_q16_parallel(const int16_t *__restrict__ pSrcA,
                                            const int16_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t shift,
                                            uint32_t nPE,
                                            int16_t *__restrict__ pDstC);

/* Same parameter list as the first prototype; per its suffix, the RV32IM kernel. */
void plp_mat_mult_trans_stride_q16s_rv32im(const int16_t *__restrict__ pSrcA,
                                           const int16_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           uint32_t shift,
                                           int16_t *__restrict__ pDstC);

/* Same parameter list as the first prototype; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_trans_stride_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                            const int16_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t shift,
                                            int16_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_trans_stride_q16p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_stride_q8 family (int8_t inputs, int8_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int8_t pointers; pDstC is
 *     the only writable pointer and has the same element type as the inputs.
 *   - M, N, O, the three strides and `shift` are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE between `shift`
 *     and pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "q8" presumably denotes 8-bit fixed-point data and `shift` the scaling
 *     shift applied to results;
 *   - "trans" presumably means the second operand is supplied transposed;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C;
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_trans_stride_q8(const int8_t *__restrict__ pSrcA,
                                  const int8_t *__restrict__ pSrcB,
                                  uint32_t M,
                                  uint32_t N,
                                  uint32_t O,
                                  uint32_t strideA,
                                  uint32_t strideB,
                                  uint32_t strideC,
                                  uint32_t shift,
                                  int8_t *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_trans_stride_q8_parallel(const int8_t *__restrict__ pSrcA,
                                           const int8_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           uint32_t shift,
                                           uint32_t nPE,
                                           int8_t *__restrict__ pDstC);

/* Same parameter list as the first prototype; per its suffix, the RV32IM kernel. */
void plp_mat_mult_trans_stride_q8s_rv32im(const int8_t *__restrict__ pSrcA,
                                          const int8_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          uint32_t strideA,
                                          uint32_t strideB,
                                          uint32_t strideC,
                                          uint32_t shift,
                                          int8_t *__restrict__ pDstC);

/* Same parameter list as the first prototype; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_trans_stride_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                           const int8_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           uint32_t shift,
                                           int8_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_trans_stride_q8p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_stride_f32 family (float inputs, float output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified float pointers; pDstC is
 *     the only writable pointer (float).
 *   - M, N, O and strideA/strideB/strideC are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE directly before
 *     pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - Only an XPULPV2 single-core kernel is declared in this run; there is no
 *     `_rv32im` counterpart here.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "trans" presumably means the second operand is supplied transposed;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C;
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_trans_stride_f32(const float *__restrict__ pSrcA,
                                   const float *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideC,
                                   float *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_trans_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
                                            const float *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            float *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_trans_stride_f32_parallel(const float *__restrict__ pSrcA,
                                            const float *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t nPE,
                                            float *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_trans_stride_f32p_xpulpv2(void *args);

/*
 * plp_mat_mult_cmplx_stride_i32 family (int32_t inputs, int32_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int32_t pointers; pDstC is
 *     the only writable pointer (int32_t).
 *   - M, N, O and strideA/strideB/strideC are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE directly before
 *     pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "cmplx" presumably denotes complex-valued matrices; the element layout
 *     (e.g. how real and imaginary parts are stored) is not visible here;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C (units unverified);
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_cmplx_stride_i32(const int32_t *__restrict__ pSrcA,
                                   const int32_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideC,
                                   int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the RV32IM kernel. */
void plp_mat_mult_cmplx_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                                           const int32_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_cmplx_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                            const int32_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            int32_t *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_cmplx_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
                                            const int32_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t nPE,
                                            int32_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_cmplx_stride_i32p_xpulpv2(void *args);

/*
 * plp_mat_mult_cmplx_stride_i16 family (int16_t inputs, int32_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int16_t pointers; pDstC is
 *     the only writable pointer and its element type is int32_t, i.e. wider
 *     than the inputs.
 *   - M, N, O and strideA/strideB/strideC are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE directly before
 *     pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "cmplx" presumably denotes complex-valued matrices; the element layout
 *     (e.g. how real and imaginary parts are stored) is not visible here;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C (units unverified);
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_cmplx_stride_i16(const int16_t *__restrict__ pSrcA,
                                   const int16_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideC,
                                   int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the RV32IM kernel. */
void plp_mat_mult_cmplx_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                                           const int16_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_cmplx_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                            const int16_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            int32_t *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_cmplx_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
                                            const int16_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t nPE,
                                            int32_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_cmplx_stride_i16p_xpulpv2(void *args);

/*
 * plp_mat_mult_cmplx_stride_i8 family (int8_t inputs, int32_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int8_t pointers; pDstC is
 *     the only writable pointer and its element type is int32_t, i.e. wider
 *     than the inputs.
 *   - M, N, O and strideA/strideB/strideC are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE directly before
 *     pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "cmplx" presumably denotes complex-valued matrices; the element layout
 *     (e.g. how real and imaginary parts are stored) is not visible here;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C (units unverified);
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_cmplx_stride_i8(const int8_t *__restrict__ pSrcA,
                                  const int8_t *__restrict__ pSrcB,
                                  uint32_t M,
                                  uint32_t N,
                                  uint32_t O,
                                  uint32_t strideA,
                                  uint32_t strideB,
                                  uint32_t strideC,
                                  int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the RV32IM kernel. */
void plp_mat_mult_cmplx_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                                          const int8_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          uint32_t strideA,
                                          uint32_t strideB,
                                          uint32_t strideC,
                                          int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_cmplx_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                           const int8_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           int32_t *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_cmplx_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
                                           const int8_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           uint32_t nPE,
                                           int32_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_cmplx_stride_i8p_xpulpv2(void *args);

/*
 * plp_mat_mult_cmplx_stride_f32 family (float inputs, float output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified float pointers; pDstC is
 *     the only writable pointer (float).
 *   - M, N, O and strideA/strideB/strideC are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE directly before
 *     pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - Only an XPULPV2 single-core kernel is declared in this run; there is no
 *     `_rv32im` counterpart here.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "cmplx" presumably denotes complex-valued matrices; the element layout
 *     (e.g. how real and imaginary parts are stored) is not visible here;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C (units unverified);
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_cmplx_stride_f32(const float *__restrict__ pSrcA,
                                   const float *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideC,
                                   float *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_cmplx_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
                                            const float *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            float *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_cmplx_stride_f32_parallel(const float *__restrict__ pSrcA,
                                            const float *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t nPE,
                                            float *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_cmplx_stride_f32p_xpulpv2(void *args);

/*
 * plp_mat_mult_cmplx_stride_q32 family (int32_t inputs, int32_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int32_t pointers; pDstC is
 *     the only writable pointer and has the same element type as the inputs.
 *   - M, N, O, the three strides and `shift` are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE between `shift`
 *     and pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "q32" presumably denotes 32-bit fixed-point data and `shift` the
 *     scaling shift applied to results;
 *   - "cmplx" presumably denotes complex-valued matrices; the element layout
 *     (e.g. how real and imaginary parts are stored) is not visible here;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C (units unverified);
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_cmplx_stride_q32(const int32_t *__restrict__ pSrcA,
                                   const int32_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideC,
                                   uint32_t shift,
                                   int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the RV32IM kernel. */
void plp_mat_mult_cmplx_stride_q32s_rv32im(const int32_t *__restrict__ pSrcA,
                                           const int32_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           uint32_t shift,
                                           int32_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_cmplx_stride_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                            const int32_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t shift,
                                            int32_t *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_cmplx_stride_q32_parallel(const int32_t *__restrict__ pSrcA,
                                            const int32_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t shift,
                                            uint32_t nPE,
                                            int32_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_cmplx_stride_q32p_xpulpv2(void *args);

/*
 * plp_mat_mult_cmplx_stride_q16 family (int16_t inputs, int16_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int16_t pointers; pDstC is
 *     the only writable pointer and has the same element type as the inputs.
 *   - M, N, O, the three strides and `shift` are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE between `shift`
 *     and pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "q16" presumably denotes 16-bit fixed-point data and `shift` the
 *     scaling shift applied to results;
 *   - "cmplx" presumably denotes complex-valued matrices; the element layout
 *     (e.g. how real and imaginary parts are stored) is not visible here;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C (units unverified);
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_cmplx_stride_q16(const int16_t *__restrict__ pSrcA,
                                   const int16_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t O,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideC,
                                   uint32_t shift,
                                   int16_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the RV32IM kernel. */
void plp_mat_mult_cmplx_stride_q16s_rv32im(const int16_t *__restrict__ pSrcA,
                                           const int16_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           uint32_t shift,
                                           int16_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_cmplx_stride_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                            const int16_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t shift,
                                            int16_t *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_cmplx_stride_q16_parallel(const int16_t *__restrict__ pSrcA,
                                            const int16_t *__restrict__ pSrcB,
                                            uint32_t M,
                                            uint32_t N,
                                            uint32_t O,
                                            uint32_t strideA,
                                            uint32_t strideB,
                                            uint32_t strideC,
                                            uint32_t shift,
                                            uint32_t nPE,
                                            int16_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_cmplx_stride_q16p_xpulpv2(void *args);

/*
 * plp_mat_mult_cmplx_stride_q8 family (int8_t inputs, int8_t output).
 *
 * Established by the prototypes:
 *   - pSrcA / pSrcB are const, restrict-qualified int8_t pointers; pDstC is
 *     the only writable pointer and has the same element type as the inputs.
 *   - M, N, O, the three strides and `shift` are uint32_t.
 *   - The `_parallel` variant additionally takes uint32_t nPE between `shift`
 *     and pDstC; the `p_xpulpv2` function takes a single void pointer.
 *   - None of the functions returns a value.
 *
 * NOTE(review): the points below are inferred from the names only - confirm
 * against the implementation:
 *   - "q8" presumably denotes 8-bit fixed-point data and `shift` the scaling
 *     shift applied to results;
 *   - "cmplx" presumably denotes complex-valued matrices; the element layout
 *     (e.g. how real and imaginary parts are stored) is not visible here;
 *   - M, N, O are presumably the matrix dimensions, and strideA/B/C the
 *     distance between consecutive rows of A, B and C (units unverified);
 *   - nPE is presumably the number of processing elements (cores) to use.
 */
void plp_mat_mult_cmplx_stride_q8(const int8_t *__restrict__ pSrcA,
                                  const int8_t *__restrict__ pSrcB,
                                  uint32_t M,
                                  uint32_t N,
                                  uint32_t O,
                                  uint32_t strideA,
                                  uint32_t strideB,
                                  uint32_t strideC,
                                  uint32_t shift,
                                  int8_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the RV32IM kernel. */
void plp_mat_mult_cmplx_stride_q8s_rv32im(const int8_t *__restrict__ pSrcA,
                                          const int8_t *__restrict__ pSrcB,
                                          uint32_t M,
                                          uint32_t N,
                                          uint32_t O,
                                          uint32_t strideA,
                                          uint32_t strideB,
                                          uint32_t strideC,
                                          uint32_t shift,
                                          int8_t *__restrict__ pDstC);

/* Same parameter list as above; per its suffix, the XPULPV2 kernel. */
void plp_mat_mult_cmplx_stride_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                           const int8_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           uint32_t shift,
                                           int8_t *__restrict__ pDstC);

/* Parallel variant: same parameters plus nPE. */
void plp_mat_mult_cmplx_stride_q8_parallel(const int8_t *__restrict__ pSrcA,
                                           const int8_t *__restrict__ pSrcB,
                                           uint32_t M,
                                           uint32_t N,
                                           uint32_t O,
                                           uint32_t strideA,
                                           uint32_t strideB,
                                           uint32_t strideC,
                                           uint32_t shift,
                                           uint32_t nPE,
                                           int8_t *__restrict__ pDstC);

/* Takes one untyped pointer. NOTE(review): presumably the per-core XPULPV2
 * worker, receiving a pointer to an argument struct - confirm against the
 * caller. */
void plp_mat_mult_cmplx_stride_q8p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_cmplx_stride_i32 family. Declarations only; no
 * definitions are visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int32_t inputs, __restrict__-qualified.
 *   M, N, O        uint32_t sizes - presumably the matrix dimensions;
 *                  TODO confirm which operand each one describes.
 *   strideA/B/C    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and C; TODO confirm units.
 *   pDstC          int32_t output, __restrict__-qualified.
 *
 * NOTE(review): "trans" suggests the second operand is supplied transposed
 * and "cmplx" suggests complex-valued data; neither the layout nor the
 * orientation can be read from the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_mult_trans_cmplx_stride_i32(const int32_t *__restrict__ pSrcA,
                                         const int32_t *__restrict__ pSrcB,
                                         uint32_t M,
                                         uint32_t N,
                                         uint32_t O,
                                         uint32_t strideA,
                                         uint32_t strideB,
                                         uint32_t strideC,
                                         int32_t *__restrict__ pDstC);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                                                 const int32_t *__restrict__ pSrcB,
                                                 uint32_t M,
                                                 uint32_t N,
                                                 uint32_t O,
                                                 uint32_t strideA,
                                                 uint32_t strideB,
                                                 uint32_t strideC,
                                                 int32_t *__restrict__ pDstC);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                                  const int32_t *__restrict__ pSrcB,
                                                  uint32_t M,
                                                  uint32_t N,
                                                  uint32_t O,
                                                  uint32_t strideA,
                                                  uint32_t strideB,
                                                  uint32_t strideC,
                                                  int32_t *__restrict__ pDstC);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDstC. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_mult_trans_cmplx_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
                                                  const int32_t *__restrict__ pSrcB,
                                                  uint32_t M,
                                                  uint32_t N,
                                                  uint32_t O,
                                                  uint32_t strideA,
                                                  uint32_t strideB,
                                                  uint32_t strideC,
                                                  uint32_t nPE,
                                                  int32_t *__restrict__ pDstC);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_mult_trans_cmplx_stride_i32p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_cmplx_stride_i16 family. Declarations only; no
 * definitions are visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int16_t inputs, __restrict__-qualified.
 *   M, N, O        uint32_t sizes - presumably the matrix dimensions;
 *                  TODO confirm which operand each one describes.
 *   strideA/B/C    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and C; TODO confirm units.
 *   pDstC          int32_t output, __restrict__-qualified. Note that the
 *                  output element type is wider than the int16_t inputs.
 *
 * NOTE(review): "trans" suggests the second operand is supplied transposed
 * and "cmplx" suggests complex-valued data; neither the layout nor the
 * orientation can be read from the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_mult_trans_cmplx_stride_i16(const int16_t *__restrict__ pSrcA,
                                         const int16_t *__restrict__ pSrcB,
                                         uint32_t M,
                                         uint32_t N,
                                         uint32_t O,
                                         uint32_t strideA,
                                         uint32_t strideB,
                                         uint32_t strideC,
                                         int32_t *__restrict__ pDstC);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                                                 const int16_t *__restrict__ pSrcB,
                                                 uint32_t M,
                                                 uint32_t N,
                                                 uint32_t O,
                                                 uint32_t strideA,
                                                 uint32_t strideB,
                                                 uint32_t strideC,
                                                 int32_t *__restrict__ pDstC);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                                  const int16_t *__restrict__ pSrcB,
                                                  uint32_t M,
                                                  uint32_t N,
                                                  uint32_t O,
                                                  uint32_t strideA,
                                                  uint32_t strideB,
                                                  uint32_t strideC,
                                                  int32_t *__restrict__ pDstC);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDstC. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_mult_trans_cmplx_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
                                                  const int16_t *__restrict__ pSrcB,
                                                  uint32_t M,
                                                  uint32_t N,
                                                  uint32_t O,
                                                  uint32_t strideA,
                                                  uint32_t strideB,
                                                  uint32_t strideC,
                                                  uint32_t nPE,
                                                  int32_t *__restrict__ pDstC);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_mult_trans_cmplx_stride_i16p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_cmplx_stride_i8 family. Declarations only; no
 * definitions are visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int8_t inputs, __restrict__-qualified.
 *   M, N, O        uint32_t sizes - presumably the matrix dimensions;
 *                  TODO confirm which operand each one describes.
 *   strideA/B/C    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and C; TODO confirm units.
 *   pDstC          int32_t output, __restrict__-qualified. Note that the
 *                  output element type is wider than the int8_t inputs.
 *
 * NOTE(review): "trans" suggests the second operand is supplied transposed
 * and "cmplx" suggests complex-valued data; neither the layout nor the
 * orientation can be read from the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_mult_trans_cmplx_stride_i8(const int8_t *__restrict__ pSrcA,
                                        const int8_t *__restrict__ pSrcB,
                                        uint32_t M,
                                        uint32_t N,
                                        uint32_t O,
                                        uint32_t strideA,
                                        uint32_t strideB,
                                        uint32_t strideC,
                                        int32_t *__restrict__ pDstC);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                                                const int8_t *__restrict__ pSrcB,
                                                uint32_t M,
                                                uint32_t N,
                                                uint32_t O,
                                                uint32_t strideA,
                                                uint32_t strideB,
                                                uint32_t strideC,
                                                int32_t *__restrict__ pDstC);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                                 const int8_t *__restrict__ pSrcB,
                                                 uint32_t M,
                                                 uint32_t N,
                                                 uint32_t O,
                                                 uint32_t strideA,
                                                 uint32_t strideB,
                                                 uint32_t strideC,
                                                 int32_t *__restrict__ pDstC);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDstC. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_mult_trans_cmplx_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
                                                 const int8_t *__restrict__ pSrcB,
                                                 uint32_t M,
                                                 uint32_t N,
                                                 uint32_t O,
                                                 uint32_t strideA,
                                                 uint32_t strideB,
                                                 uint32_t strideC,
                                                 uint32_t nPE,
                                                 int32_t *__restrict__ pDstC);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_mult_trans_cmplx_stride_i8p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_cmplx_stride_f32 family. Declarations only; no
 * definitions are visible alongside them. Unlike the integer families, this
 * run of declarations contains no "s_rv32im" prototype.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const float inputs, __restrict__-qualified.
 *   M, N, O        uint32_t sizes - presumably the matrix dimensions;
 *                  TODO confirm which operand each one describes.
 *   strideA/B/C    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and C; TODO confirm units.
 *   pDstC          float output, __restrict__-qualified.
 *
 * NOTE(review): "trans" suggests the second operand is supplied transposed
 * and "cmplx" suggests complex-valued data; neither the layout nor the
 * orientation can be read from the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that
 * forwards to the kernel below - confirm. */
void plp_mat_mult_trans_cmplx_stride_f32(const float *__restrict__ pSrcA,
                                         const float *__restrict__ pSrcB,
                                         uint32_t M,
                                         uint32_t N,
                                         uint32_t O,
                                         uint32_t strideA,
                                         uint32_t strideB,
                                         uint32_t strideC,
                                         float *__restrict__ pDstC);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
                                                  const float *__restrict__ pSrcB,
                                                  uint32_t M,
                                                  uint32_t N,
                                                  uint32_t O,
                                                  uint32_t strideA,
                                                  uint32_t strideB,
                                                  uint32_t strideC,
                                                  float *__restrict__ pDstC);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDstC. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_mult_trans_cmplx_stride_f32_parallel(const float *__restrict__ pSrcA,
                                                  const float *__restrict__ pSrcB,
                                                  uint32_t M,
                                                  uint32_t N,
                                                  uint32_t O,
                                                  uint32_t strideA,
                                                  uint32_t strideB,
                                                  uint32_t strideC,
                                                  uint32_t nPE,
                                                  float *__restrict__ pDstC);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_mult_trans_cmplx_stride_f32p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_cmplx_stride_q32 family. Declarations only; no
 * definitions are visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int32_t inputs, __restrict__-qualified.
 *   M, N, O        uint32_t sizes - presumably the matrix dimensions;
 *                  TODO confirm which operand each one describes.
 *   strideA/B/C    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and C; TODO confirm units.
 *   shift          uint32_t - presumably a fixed-point scaling shift applied
 *                  to the results; TODO confirm direction and rounding.
 *   pDstC          int32_t output, __restrict__-qualified.
 *
 * NOTE(review): "trans" suggests the second operand is supplied transposed
 * and "cmplx" suggests complex-valued data; neither the layout nor the
 * orientation can be read from the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_mult_trans_cmplx_stride_q32(const int32_t *__restrict__ pSrcA,
                                         const int32_t *__restrict__ pSrcB,
                                         uint32_t M,
                                         uint32_t N,
                                         uint32_t O,
                                         uint32_t strideA,
                                         uint32_t strideB,
                                         uint32_t strideC,
                                         uint32_t shift,
                                         int32_t *__restrict__ pDstC);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_q32s_rv32im(const int32_t *__restrict__ pSrcA,
                                                 const int32_t *__restrict__ pSrcB,
                                                 uint32_t M,
                                                 uint32_t N,
                                                 uint32_t O,
                                                 uint32_t strideA,
                                                 uint32_t strideB,
                                                 uint32_t strideC,
                                                 uint32_t shift,
                                                 int32_t *__restrict__ pDstC);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                                  const int32_t *__restrict__ pSrcB,
                                                  uint32_t M,
                                                  uint32_t N,
                                                  uint32_t O,
                                                  uint32_t strideA,
                                                  uint32_t strideB,
                                                  uint32_t strideC,
                                                  uint32_t shift,
                                                  int32_t *__restrict__ pDstC);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * between shift and pDstC. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_mult_trans_cmplx_stride_q32_parallel(const int32_t *__restrict__ pSrcA,
                                                  const int32_t *__restrict__ pSrcB,
                                                  uint32_t M,
                                                  uint32_t N,
                                                  uint32_t O,
                                                  uint32_t strideA,
                                                  uint32_t strideB,
                                                  uint32_t strideC,
                                                  uint32_t shift,
                                                  uint32_t nPE,
                                                  int32_t *__restrict__ pDstC);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_mult_trans_cmplx_stride_q32p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_cmplx_stride_q16 family. Declarations only; no
 * definitions are visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int16_t inputs, __restrict__-qualified.
 *   M, N, O        uint32_t sizes - presumably the matrix dimensions;
 *                  TODO confirm which operand each one describes.
 *   strideA/B/C    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and C; TODO confirm units.
 *   shift          uint32_t - presumably a fixed-point scaling shift applied
 *                  to the results; TODO confirm direction and rounding.
 *   pDstC          int16_t output (same width as the inputs),
 *                  __restrict__-qualified.
 *
 * NOTE(review): "trans" suggests the second operand is supplied transposed
 * and "cmplx" suggests complex-valued data; neither the layout nor the
 * orientation can be read from the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_mult_trans_cmplx_stride_q16(const int16_t *__restrict__ pSrcA,
                                         const int16_t *__restrict__ pSrcB,
                                         uint32_t M,
                                         uint32_t N,
                                         uint32_t O,
                                         uint32_t strideA,
                                         uint32_t strideB,
                                         uint32_t strideC,
                                         uint32_t shift,
                                         int16_t *__restrict__ pDstC);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_q16s_rv32im(const int16_t *__restrict__ pSrcA,
                                                 const int16_t *__restrict__ pSrcB,
                                                 uint32_t M,
                                                 uint32_t N,
                                                 uint32_t O,
                                                 uint32_t strideA,
                                                 uint32_t strideB,
                                                 uint32_t strideC,
                                                 uint32_t shift,
                                                 int16_t *__restrict__ pDstC);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                                  const int16_t *__restrict__ pSrcB,
                                                  uint32_t M,
                                                  uint32_t N,
                                                  uint32_t O,
                                                  uint32_t strideA,
                                                  uint32_t strideB,
                                                  uint32_t strideC,
                                                  uint32_t shift,
                                                  int16_t *__restrict__ pDstC);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * between shift and pDstC. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_mult_trans_cmplx_stride_q16_parallel(const int16_t *__restrict__ pSrcA,
                                                  const int16_t *__restrict__ pSrcB,
                                                  uint32_t M,
                                                  uint32_t N,
                                                  uint32_t O,
                                                  uint32_t strideA,
                                                  uint32_t strideB,
                                                  uint32_t strideC,
                                                  uint32_t shift,
                                                  uint32_t nPE,
                                                  int16_t *__restrict__ pDstC);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_mult_trans_cmplx_stride_q16p_xpulpv2(void *args);

/*
 * plp_mat_mult_trans_cmplx_stride_q8 family. Declarations only; no
 * definitions are visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int8_t inputs, __restrict__-qualified.
 *   M, N, O        uint32_t sizes - presumably the matrix dimensions;
 *                  TODO confirm which operand each one describes.
 *   strideA/B/C    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and C; TODO confirm units.
 *   shift          uint32_t - presumably a fixed-point scaling shift applied
 *                  to the results; TODO confirm direction and rounding.
 *   pDstC          int8_t output (same width as the inputs),
 *                  __restrict__-qualified.
 *
 * NOTE(review): "trans" suggests the second operand is supplied transposed
 * and "cmplx" suggests complex-valued data; neither the layout nor the
 * orientation can be read from the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_mult_trans_cmplx_stride_q8(const int8_t *__restrict__ pSrcA,
                                        const int8_t *__restrict__ pSrcB,
                                        uint32_t M,
                                        uint32_t N,
                                        uint32_t O,
                                        uint32_t strideA,
                                        uint32_t strideB,
                                        uint32_t strideC,
                                        uint32_t shift,
                                        int8_t *__restrict__ pDstC);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_q8s_rv32im(const int8_t *__restrict__ pSrcA,
                                                const int8_t *__restrict__ pSrcB,
                                                uint32_t M,
                                                uint32_t N,
                                                uint32_t O,
                                                uint32_t strideA,
                                                uint32_t strideB,
                                                uint32_t strideC,
                                                uint32_t shift,
                                                int8_t *__restrict__ pDstC);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_mult_trans_cmplx_stride_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                                 const int8_t *__restrict__ pSrcB,
                                                 uint32_t M,
                                                 uint32_t N,
                                                 uint32_t O,
                                                 uint32_t strideA,
                                                 uint32_t strideB,
                                                 uint32_t strideC,
                                                 uint32_t shift,
                                                 int8_t *__restrict__ pDstC);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * between shift and pDstC. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_mult_trans_cmplx_stride_q8_parallel(const int8_t *__restrict__ pSrcA,
                                                 const int8_t *__restrict__ pSrcB,
                                                 uint32_t M,
                                                 uint32_t N,
                                                 uint32_t O,
                                                 uint32_t strideA,
                                                 uint32_t strideB,
                                                 uint32_t strideC,
                                                 uint32_t shift,
                                                 uint32_t nPE,
                                                 int8_t *__restrict__ pDstC);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_mult_trans_cmplx_stride_q8p_xpulpv2(void *args);

/*
 * plp_mat_add_stride_i32 family. Declarations only; no definitions are
 * visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int32_t inputs, __restrict__-qualified.
 *   M, N           uint32_t sizes - presumably the row and column counts
 *                  common to both inputs and the output; TODO confirm.
 *   strideA/B/Y    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and the destination (the
 *                  destination stride is named strideY although the pointer
 *                  is pDst); TODO confirm units.
 *   pDst           int32_t output, __restrict__-qualified.
 *
 * NOTE(review): the name suggests an element-wise matrix addition; overflow
 * behaviour cannot be read from the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_add_stride_i32(const int32_t *__restrict__ pSrcA,
                            const int32_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t strideA,
                            uint32_t strideB,
                            uint32_t strideY,
                            int32_t *__restrict__ pDst);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_add_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                                    const int32_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t strideA,
                                    uint32_t strideB,
                                    uint32_t strideY,
                                    int32_t *__restrict__ pDst);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_add_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     int32_t *__restrict__ pDst);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDst. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_add_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     uint32_t nPE,
                                     int32_t *__restrict__ pDst);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_add_stride_i32p_xpulpv2(void *args);

/*
 * plp_mat_add_stride_i16 family. Declarations only; no definitions are
 * visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int16_t inputs, __restrict__-qualified.
 *   M, N           uint32_t sizes - presumably the row and column counts
 *                  common to both inputs and the output; TODO confirm.
 *   strideA/B/Y    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and the destination (the
 *                  destination stride is named strideY although the pointer
 *                  is pDst); TODO confirm units.
 *   pDst           int16_t output (same width as the inputs),
 *                  __restrict__-qualified.
 *
 * NOTE(review): the name suggests an element-wise matrix addition; overflow
 * behaviour cannot be read from the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_add_stride_i16(const int16_t *__restrict__ pSrcA,
                            const int16_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t strideA,
                            uint32_t strideB,
                            uint32_t strideY,
                            int16_t *__restrict__ pDst);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_add_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                                    const int16_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t strideA,
                                    uint32_t strideB,
                                    uint32_t strideY,
                                    int16_t *__restrict__ pDst);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_add_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     int16_t *__restrict__ pDst);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDst. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_add_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     uint32_t nPE,
                                     int16_t *__restrict__ pDst);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_add_stride_i16p_xpulpv2(void *args);

/*
 * plp_mat_add_stride_i8 family. Declarations only; no definitions are
 * visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int8_t inputs, __restrict__-qualified.
 *   M, N           uint32_t sizes - presumably the row and column counts
 *                  common to both inputs and the output; TODO confirm.
 *   strideA/B/Y    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and the destination (the
 *                  destination stride is named strideY although the pointer
 *                  is pDst); TODO confirm units.
 *   pDst           int8_t output (same width as the inputs),
 *                  __restrict__-qualified.
 *
 * NOTE(review): the name suggests an element-wise matrix addition; overflow
 * behaviour cannot be read from the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_add_stride_i8(const int8_t *__restrict__ pSrcA,
                           const int8_t *__restrict__ pSrcB,
                           uint32_t M,
                           uint32_t N,
                           uint32_t strideA,
                           uint32_t strideB,
                           uint32_t strideY,
                           int8_t *__restrict__ pDst);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_add_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                                   const int8_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideY,
                                   int8_t *__restrict__ pDst);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_add_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t strideA,
                                    uint32_t strideB,
                                    uint32_t strideY,
                                    int8_t *__restrict__ pDst);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDst. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_add_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t strideA,
                                    uint32_t strideB,
                                    uint32_t strideY,
                                    uint32_t nPE,
                                    int8_t *__restrict__ pDst);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_add_stride_i8p_xpulpv2(void *args);

/*
 * plp_mat_add_stride_f32 family. Declarations only; no definitions are
 * visible alongside them. Unlike the integer families, this run of
 * declarations contains no "s_rv32im" prototype.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const float inputs, __restrict__-qualified.
 *   M, N           uint32_t sizes - presumably the row and column counts
 *                  common to both inputs and the output; TODO confirm.
 *   strideA/B/Y    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and the destination (the
 *                  destination stride is named strideY although the pointer
 *                  is pDst); TODO confirm units.
 *   pDst           float output, __restrict__-qualified.
 *
 * NOTE(review): the name suggests an element-wise matrix addition - confirm
 * against the implementation.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that
 * forwards to the kernel below - confirm. */
void plp_mat_add_stride_f32(const float *__restrict__ pSrcA,
                            const float *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t strideA,
                            uint32_t strideB,
                            uint32_t strideY,
                            float *__restrict__ pDst);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_add_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
                                     const float *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     float *__restrict__ pDst);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDst. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_add_stride_f32_parallel(const float *__restrict__ pSrcA,
                                     const float *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     uint32_t nPE,
                                     float *__restrict__ pDst);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_add_stride_f32p_xpulpv2(void *args);

/*
 * plp_mat_sub_stride_i32 family. Declarations only; no definitions are
 * visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int32_t inputs, __restrict__-qualified.
 *   M, N           uint32_t sizes - presumably the row and column counts
 *                  common to both inputs and the output; TODO confirm.
 *   strideA/B/Y    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and the destination (the
 *                  destination stride is named strideY although the pointer
 *                  is pDst); TODO confirm units.
 *   pDst           int32_t output, __restrict__-qualified.
 *
 * NOTE(review): the name suggests an element-wise matrix subtraction,
 * presumably A - B; operand order and overflow behaviour cannot be read from
 * the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_sub_stride_i32(const int32_t *__restrict__ pSrcA,
                            const int32_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t strideA,
                            uint32_t strideB,
                            uint32_t strideY,
                            int32_t *__restrict__ pDst);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_sub_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
                                    const int32_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t strideA,
                                    uint32_t strideB,
                                    uint32_t strideY,
                                    int32_t *__restrict__ pDst);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_sub_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     int32_t *__restrict__ pDst);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDst. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_sub_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
                                     const int32_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     uint32_t nPE,
                                     int32_t *__restrict__ pDst);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_sub_stride_i32p_xpulpv2(void *args);

/*
 * plp_mat_sub_stride_i16 family. Declarations only; no definitions are
 * visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int16_t inputs, __restrict__-qualified.
 *   M, N           uint32_t sizes - presumably the row and column counts
 *                  common to both inputs and the output; TODO confirm.
 *   strideA/B/Y    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and the destination (the
 *                  destination stride is named strideY although the pointer
 *                  is pDst); TODO confirm units.
 *   pDst           int16_t output (same width as the inputs),
 *                  __restrict__-qualified.
 *
 * NOTE(review): the name suggests an element-wise matrix subtraction,
 * presumably A - B; operand order and overflow behaviour cannot be read from
 * the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_sub_stride_i16(const int16_t *__restrict__ pSrcA,
                            const int16_t *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t strideA,
                            uint32_t strideB,
                            uint32_t strideY,
                            int16_t *__restrict__ pDst);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_sub_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
                                    const int16_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t strideA,
                                    uint32_t strideB,
                                    uint32_t strideY,
                                    int16_t *__restrict__ pDst);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_sub_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     int16_t *__restrict__ pDst);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDst. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_sub_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
                                     const int16_t *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     uint32_t nPE,
                                     int16_t *__restrict__ pDst);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_sub_stride_i16p_xpulpv2(void *args);

/*
 * plp_mat_sub_stride_i8 family. Declarations only; no definitions are
 * visible alongside them.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const int8_t inputs, __restrict__-qualified.
 *   M, N           uint32_t sizes - presumably the row and column counts
 *                  common to both inputs and the output; TODO confirm.
 *   strideA/B/Y    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and the destination (the
 *                  destination stride is named strideY although the pointer
 *                  is pDst); TODO confirm units.
 *   pDst           int8_t output (same width as the inputs),
 *                  __restrict__-qualified.
 *
 * NOTE(review): the name suggests an element-wise matrix subtraction,
 * presumably A - B; operand order and overflow behaviour cannot be read from
 * the declarations - confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that selects
 * one of the kernels below - confirm. */
void plp_mat_sub_stride_i8(const int8_t *__restrict__ pSrcA,
                           const int8_t *__restrict__ pSrcB,
                           uint32_t M,
                           uint32_t N,
                           uint32_t strideA,
                           uint32_t strideB,
                           uint32_t strideY,
                           int8_t *__restrict__ pDst);

/* "s_rv32im" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core RV32IM kernel - confirm. */
void plp_mat_sub_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
                                   const int8_t *__restrict__ pSrcB,
                                   uint32_t M,
                                   uint32_t N,
                                   uint32_t strideA,
                                   uint32_t strideB,
                                   uint32_t strideY,
                                   int8_t *__restrict__ pDst);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_sub_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t strideA,
                                    uint32_t strideB,
                                    uint32_t strideY,
                                    int8_t *__restrict__ pDst);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDst. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_sub_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
                                    const int8_t *__restrict__ pSrcB,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t strideA,
                                    uint32_t strideB,
                                    uint32_t strideY,
                                    uint32_t nPE,
                                    int8_t *__restrict__ pDst);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_sub_stride_i8p_xpulpv2(void *args);

/*
 * plp_mat_sub_stride_f32 family. Declarations only; no definitions are
 * visible alongside them. Unlike the integer families, this run of
 * declarations contains no "s_rv32im" prototype.
 *
 * The explicit-argument prototypes share:
 *   pSrcA, pSrcB   const float inputs, __restrict__-qualified.
 *   M, N           uint32_t sizes - presumably the row and column counts
 *                  common to both inputs and the output; TODO confirm.
 *   strideA/B/Y    uint32_t strides - presumably the distance between
 *                  consecutive rows of A, B and the destination (the
 *                  destination stride is named strideY although the pointer
 *                  is pDst); TODO confirm units.
 *   pDst           float output, __restrict__-qualified.
 *
 * NOTE(review): the name suggests an element-wise matrix subtraction,
 * presumably A - B; operand order cannot be read from the declarations -
 * confirm.
 */

/* Unsuffixed prototype. NOTE(review): presumably the entry point that
 * forwards to the kernel below - confirm. */
void plp_mat_sub_stride_f32(const float *__restrict__ pSrcA,
                            const float *__restrict__ pSrcB,
                            uint32_t M,
                            uint32_t N,
                            uint32_t strideA,
                            uint32_t strideB,
                            uint32_t strideY,
                            float *__restrict__ pDst);

/* "s_xpulpv2" prototype: same argument list as the unsuffixed one.
 * NOTE(review): suffix suggests a single-core XPULPV2 kernel - confirm. */
void plp_mat_sub_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
                                     const float *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     float *__restrict__ pDst);

/* "_parallel" prototype: the unsuffixed argument list with nPE inserted
 * before pDst. NOTE(review): nPE is presumably the number of
 * cores/processing elements to use - confirm. */
void plp_mat_sub_stride_f32_parallel(const float *__restrict__ pSrcA,
                                     const float *__restrict__ pSrcB,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideA,
                                     uint32_t strideB,
                                     uint32_t strideY,
                                     uint32_t nPE,
                                     float *__restrict__ pDst);

/* "p_xpulpv2" prototype: takes a single untyped pointer. NOTE(review): args
 * presumably points to an instance structure carrying the arguments of the
 * parallel call - confirm which struct type is expected. */
void plp_mat_sub_stride_f32p_xpulpv2(void *args);

/**
 * @brief Strided scaling of an int32 matrix, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * NOTE(review): the parameter meanings below are inferred from the names only;
 * the implementation is not visible from this declaration.
 *
 * @param pSrc        input matrix (read-only)
 * @param M           presumably the number of rows -- TODO confirm
 * @param N           presumably the number of columns -- TODO confirm
 * @param strideSrc   stride used for pSrc (presumably elements between row starts)
 * @param strideDst   stride used for pDst
 * @param scaleFactor factor applied to the elements
 * @param shift       presumably a bit shift applied to each scaled value; direction
 *                    not visible here -- TODO confirm
 * @param pDst        output matrix
 */
void plp_mat_scale_stride_i32(const int32_t *__restrict__ pSrc,
                              uint32_t M,
                              uint32_t N,
                              uint32_t strideSrc,
                              uint32_t strideDst,
                              int32_t scaleFactor,
                              int32_t shift,
                              int32_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_scale_stride_i32().
 */
void plp_mat_scale_stride_i32s_rv32im(const int32_t *__restrict__ pSrc,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t strideSrc,
                                      uint32_t strideDst,
                                      int32_t scaleFactor,
                                      int32_t shift,
                                      int32_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_scale_stride_i32().
 */
void plp_mat_scale_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
                                       uint32_t M,
                                       uint32_t N,
                                       uint32_t strideSrc,
                                       uint32_t strideDst,
                                       int32_t scaleFactor,
                                       int32_t shift,
                                       int32_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_scale_stride_i32() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_scale_stride_i32_parallel(const int32_t *__restrict__ pSrc,
                                       uint32_t M,
                                       uint32_t N,
                                       uint32_t strideSrc,
                                       uint32_t strideDst,
                                       int32_t scaleFactor,
                                       int32_t shift,
                                       uint32_t nPE,
                                       int32_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_scale_stride_i32p_xpulpv2(void *args);

/**
 * @brief Strided scaling of an int16 matrix, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * Note that @p scaleFactor is int16_t while @p shift is int32_t.
 * NOTE(review): the parameter meanings below are inferred from the names only;
 * the implementation is not visible from this declaration.
 *
 * @param pSrc        input matrix (read-only)
 * @param M           presumably the number of rows -- TODO confirm
 * @param N           presumably the number of columns -- TODO confirm
 * @param strideSrc   stride used for pSrc (presumably elements between row starts)
 * @param strideDst   stride used for pDst
 * @param scaleFactor factor applied to the elements
 * @param shift       presumably a bit shift applied to each scaled value; direction
 *                    not visible here -- TODO confirm
 * @param pDst        output matrix
 */
void plp_mat_scale_stride_i16(const int16_t *__restrict__ pSrc,
                              uint32_t M,
                              uint32_t N,
                              uint32_t strideSrc,
                              uint32_t strideDst,
                              int16_t scaleFactor,
                              int32_t shift,
                              int16_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_scale_stride_i16().
 */
void plp_mat_scale_stride_i16s_rv32im(const int16_t *__restrict__ pSrc,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t strideSrc,
                                      uint32_t strideDst,
                                      int16_t scaleFactor,
                                      int32_t shift,
                                      int16_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_scale_stride_i16().
 */
void plp_mat_scale_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
                                       uint32_t M,
                                       uint32_t N,
                                       uint32_t strideSrc,
                                       uint32_t strideDst,
                                       int16_t scaleFactor,
                                       int32_t shift,
                                       int16_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_scale_stride_i16() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_scale_stride_i16_parallel(const int16_t *__restrict__ pSrc,
                                       uint32_t M,
                                       uint32_t N,
                                       uint32_t strideSrc,
                                       uint32_t strideDst,
                                       int16_t scaleFactor,
                                       int32_t shift,
                                       uint32_t nPE,
                                       int16_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_scale_stride_i16p_xpulpv2(void *args);

/**
 * @brief Strided scaling of an int8 matrix, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * Note that @p scaleFactor is int8_t while @p shift is int32_t.
 * NOTE(review): the parameter meanings below are inferred from the names only;
 * the implementation is not visible from this declaration.
 *
 * @param pSrc        input matrix (read-only)
 * @param M           presumably the number of rows -- TODO confirm
 * @param N           presumably the number of columns -- TODO confirm
 * @param strideSrc   stride used for pSrc (presumably elements between row starts)
 * @param strideDst   stride used for pDst
 * @param scaleFactor factor applied to the elements
 * @param shift       presumably a bit shift applied to each scaled value; direction
 *                    not visible here -- TODO confirm
 * @param pDst        output matrix
 */
void plp_mat_scale_stride_i8(const int8_t *__restrict__ pSrc,
                             uint32_t M,
                             uint32_t N,
                             uint32_t strideSrc,
                             uint32_t strideDst,
                             int8_t scaleFactor,
                             int32_t shift,
                             int8_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_scale_stride_i8().
 */
void plp_mat_scale_stride_i8s_rv32im(const int8_t *__restrict__ pSrc,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideSrc,
                                     uint32_t strideDst,
                                     int8_t scaleFactor,
                                     int32_t shift,
                                     int8_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_scale_stride_i8().
 */
void plp_mat_scale_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t strideSrc,
                                      uint32_t strideDst,
                                      int8_t scaleFactor,
                                      int32_t shift,
                                      int8_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_scale_stride_i8() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_scale_stride_i8_parallel(const int8_t *__restrict__ pSrc,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t strideSrc,
                                      uint32_t strideDst,
                                      int8_t scaleFactor,
                                      int32_t shift,
                                      uint32_t nPE,
                                      int8_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_scale_stride_i8p_xpulpv2(void *args);

/**
 * @brief Strided scaling of a float matrix, un-suffixed entry point.
 *
 * Unlike the integer scale declarations, there is no shift parameter here.
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * NOTE(review): the parameter meanings below are inferred from the names only;
 * the implementation is not visible from this declaration.
 *
 * @param pSrc        input matrix (read-only)
 * @param M           presumably the number of rows -- TODO confirm
 * @param N           presumably the number of columns -- TODO confirm
 * @param strideSrc   stride used for pSrc (presumably elements between row starts)
 * @param strideDst   stride used for pDst
 * @param scaleFactor factor applied to the elements
 * @param pDst        output matrix
 */
void plp_mat_scale_stride_f32(const float *__restrict__ pSrc,
                              uint32_t M,
                              uint32_t N,
                              uint32_t strideSrc,
                              uint32_t strideDst,
                              float scaleFactor,
                              float *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_scale_stride_f32().
 */
void plp_mat_scale_stride_f32s_xpulpv2(const float *__restrict__ pSrc,
                                       uint32_t M,
                                       uint32_t N,
                                       uint32_t strideSrc,
                                       uint32_t strideDst,
                                       float scaleFactor,
                                       float *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_scale_stride_f32() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_scale_stride_f32_parallel(const float *__restrict__ pSrc,
                                       uint32_t M,
                                       uint32_t N,
                                       uint32_t strideSrc,
                                       uint32_t strideDst,
                                       float scaleFactor,
                                       uint32_t nPE,
                                       float *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_scale_stride_f32p_xpulpv2(void *args);

/**
 * @brief Strided "fill I" for an int32 matrix, un-suffixed entry point; presumably writes
 *        an N x N identity matrix into pDst -- TODO confirm against the implementation.
 *
 * @param N      presumably the matrix dimension (rows == columns) -- TODO confirm
 * @param stride stride used for pDst (presumably elements between row starts)
 * @param pDst   output matrix
 */
void plp_mat_fill_I_stride_i32(uint32_t N, uint32_t stride, int32_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_i32().
 */
void plp_mat_fill_I_stride_i32s_rv32im(uint32_t N, uint32_t stride, int32_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_i32().
 */
void plp_mat_fill_I_stride_i32s_xpulpv2(uint32_t N, uint32_t stride, int32_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_I_stride_i32() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_I_stride_i32_parallel(uint32_t N,
                                        uint32_t stride,
                                        uint32_t nPE,
                                        int32_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_I_stride_i32p_xpulpv2(void *args);

/**
 * @brief Strided "fill I" for an int16 matrix, un-suffixed entry point; presumably writes
 *        an N x N identity matrix into pDst -- TODO confirm against the implementation.
 *
 * @param N      presumably the matrix dimension (rows == columns) -- TODO confirm
 * @param stride stride used for pDst (presumably elements between row starts)
 * @param pDst   output matrix
 */
void plp_mat_fill_I_stride_i16(uint32_t N, uint32_t stride, int16_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_i16().
 */
void plp_mat_fill_I_stride_i16s_rv32im(uint32_t N, uint32_t stride, int16_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_i16().
 */
void plp_mat_fill_I_stride_i16s_xpulpv2(uint32_t N, uint32_t stride, int16_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_I_stride_i16() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_I_stride_i16_parallel(uint32_t N,
                                        uint32_t stride,
                                        uint32_t nPE,
                                        int16_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_I_stride_i16p_xpulpv2(void *args);

/**
 * @brief Strided "fill I" for an int8 matrix, un-suffixed entry point; presumably writes
 *        an N x N identity matrix into pDst -- TODO confirm against the implementation.
 *
 * @param N      presumably the matrix dimension (rows == columns) -- TODO confirm
 * @param stride stride used for pDst (presumably elements between row starts)
 * @param pDst   output matrix
 */
void plp_mat_fill_I_stride_i8(uint32_t N, uint32_t stride, int8_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_i8().
 */
void plp_mat_fill_I_stride_i8s_rv32im(uint32_t N, uint32_t stride, int8_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_i8().
 */
void plp_mat_fill_I_stride_i8s_xpulpv2(uint32_t N, uint32_t stride, int8_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_I_stride_i8() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_I_stride_i8_parallel(uint32_t N,
                                       uint32_t stride,
                                       uint32_t nPE,
                                       int8_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_I_stride_i8p_xpulpv2(void *args);

/**
 * @brief Strided "fill I" for a float matrix, un-suffixed entry point; presumably writes
 *        an N x N identity matrix into pDst -- TODO confirm against the implementation.
 *
 * @param N      presumably the matrix dimension (rows == columns) -- TODO confirm
 * @param stride stride used for pDst (presumably elements between row starts)
 * @param pDst   output matrix
 */
void plp_mat_fill_I_stride_f32(uint32_t N, uint32_t stride, float *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_f32().
 */
void plp_mat_fill_I_stride_f32s_xpulpv2(uint32_t N, uint32_t stride, float *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_I_stride_f32() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_I_stride_f32_parallel(uint32_t N,
                                        uint32_t stride,
                                        uint32_t nPE,
                                        float *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_I_stride_f32p_xpulpv2(void *args);

/**
 * @brief Strided "fill I" for a 32-bit fixed-point matrix, un-suffixed entry point;
 *        presumably writes an N x N identity matrix into pDst -- TODO confirm.
 *
 * @param N        presumably the matrix dimension (rows == columns) -- TODO confirm
 * @param stride   stride used for pDst (presumably elements between row starts)
 * @param fracBits presumably the number of fractional bits of the fixed-point format,
 *                 i.e. how the value one is encoded -- TODO confirm
 * @param pDst     output matrix
 */
void plp_mat_fill_I_stride_q32(uint32_t N,
                               uint32_t stride,
                               int32_t fracBits,
                               int32_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_q32().
 */
void plp_mat_fill_I_stride_q32s_rv32im(uint32_t N,
                                       uint32_t stride,
                                       int32_t fracBits,
                                       int32_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_q32().
 */
void plp_mat_fill_I_stride_q32s_xpulpv2(uint32_t N,
                                        uint32_t stride,
                                        int32_t fracBits,
                                        int32_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_I_stride_q32() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_I_stride_q32_parallel(
    uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int32_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_I_stride_q32p_xpulpv2(void *args);

/**
 * @brief Strided "fill I" for a 16-bit fixed-point matrix, un-suffixed entry point;
 *        presumably writes an N x N identity matrix into pDst -- TODO confirm.
 *
 * @param N        presumably the matrix dimension (rows == columns) -- TODO confirm
 * @param stride   stride used for pDst (presumably elements between row starts)
 * @param fracBits presumably the number of fractional bits of the fixed-point format,
 *                 i.e. how the value one is encoded -- TODO confirm
 * @param pDst     output matrix
 */
void plp_mat_fill_I_stride_q16(uint32_t N,
                               uint32_t stride,
                               int32_t fracBits,
                               int16_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_q16().
 */
void plp_mat_fill_I_stride_q16s_rv32im(uint32_t N,
                                       uint32_t stride,
                                       int32_t fracBits,
                                       int16_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_q16().
 */
void plp_mat_fill_I_stride_q16s_xpulpv2(uint32_t N,
                                        uint32_t stride,
                                        int32_t fracBits,
                                        int16_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_I_stride_q16() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_I_stride_q16_parallel(
    uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int16_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_I_stride_q16p_xpulpv2(void *args);

/**
 * @brief Strided "fill I" for an 8-bit fixed-point matrix, un-suffixed entry point;
 *        presumably writes an N x N identity matrix into pDst -- TODO confirm.
 *
 * @param N        presumably the matrix dimension (rows == columns) -- TODO confirm
 * @param stride   stride used for pDst (presumably elements between row starts)
 * @param fracBits presumably the number of fractional bits of the fixed-point format,
 *                 i.e. how the value one is encoded -- TODO confirm
 * @param pDst     output matrix
 */
void plp_mat_fill_I_stride_q8(uint32_t N,
                              uint32_t stride,
                              int32_t fracBits,
                              int8_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_q8().
 */
void plp_mat_fill_I_stride_q8s_rv32im(uint32_t N,
                                      uint32_t stride,
                                      int32_t fracBits,
                                      int8_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_I_stride_q8().
 */
void plp_mat_fill_I_stride_q8s_xpulpv2(uint32_t N,
                                       uint32_t stride,
                                       int32_t fracBits,
                                       int8_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_I_stride_q8() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_I_stride_q8_parallel(
    uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int8_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_I_stride_q8p_xpulpv2(void *args);

/**
 * @brief Strided fill of an int32 matrix, un-suffixed entry point; presumably sets every
 *        element of the M x N matrix to @p value -- TODO confirm.
 *
 * @param M      presumably the number of rows -- TODO confirm
 * @param N      presumably the number of columns -- TODO confirm
 * @param stride stride used for pDst (presumably elements between row starts)
 * @param value  value to store
 * @param pDst   output matrix
 */
void plp_mat_fill_stride_i32(
    uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_stride_i32().
 */
void plp_mat_fill_stride_i32s_rv32im(
    uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_stride_i32().
 */
void plp_mat_fill_stride_i32s_xpulpv2(
    uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_stride_i32() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_stride_i32_parallel(uint32_t M,
                                      uint32_t N,
                                      uint32_t stride,
                                      int32_t value,
                                      uint32_t nPE,
                                      int32_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_stride_i32p_xpulpv2(void *args);

/**
 * @brief Strided fill of an int16 matrix, un-suffixed entry point; presumably sets every
 *        element of the M x N matrix to @p value -- TODO confirm.
 *
 * @param M      presumably the number of rows -- TODO confirm
 * @param N      presumably the number of columns -- TODO confirm
 * @param stride stride used for pDst (presumably elements between row starts)
 * @param value  value to store
 * @param pDst   output matrix
 */
void plp_mat_fill_stride_i16(
    uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_stride_i16().
 */
void plp_mat_fill_stride_i16s_rv32im(
    uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_stride_i16().
 */
void plp_mat_fill_stride_i16s_xpulpv2(
    uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_stride_i16() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_stride_i16_parallel(uint32_t M,
                                      uint32_t N,
                                      uint32_t stride,
                                      int16_t value,
                                      uint32_t nPE,
                                      int16_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_stride_i16p_xpulpv2(void *args);

/**
 * @brief Strided fill of an int8 matrix, un-suffixed entry point; presumably sets every
 *        element of the M x N matrix to @p value -- TODO confirm.
 *
 * @param M      presumably the number of rows -- TODO confirm
 * @param N      presumably the number of columns -- TODO confirm
 * @param stride stride used for pDst (presumably elements between row starts)
 * @param value  value to store
 * @param pDst   output matrix
 */
void plp_mat_fill_stride_i8(
    uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_stride_i8().
 */
void plp_mat_fill_stride_i8s_rv32im(
    uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_stride_i8().
 */
void plp_mat_fill_stride_i8s_xpulpv2(
    uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_stride_i8() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_stride_i8_parallel(
    uint32_t M, uint32_t N, uint32_t stride, int8_t value, uint32_t nPE, int8_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_stride_i8p_xpulpv2(void *args);

/**
 * @brief Strided fill of a float matrix, un-suffixed entry point; presumably sets every
 *        element of the M x N matrix to @p value -- TODO confirm.
 *
 * @param M      presumably the number of rows -- TODO confirm
 * @param N      presumably the number of columns -- TODO confirm
 * @param stride stride used for pDst (presumably elements between row starts)
 * @param value  value to store
 * @param pDst   output matrix
 */
void plp_mat_fill_stride_f32(
    uint32_t M, uint32_t N, uint32_t stride, float value, float *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_fill_stride_f32().
 */
void plp_mat_fill_stride_f32s_xpulpv2(
    uint32_t M, uint32_t N, uint32_t stride, float value, float *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_fill_stride_f32() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_fill_stride_f32_parallel(
    uint32_t M, uint32_t N, uint32_t stride, float value, uint32_t nPE, float *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_fill_stride_f32p_xpulpv2(void *args);

/**
 * @brief Strided copy of an int32 matrix, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * NOTE(review): the parameter meanings below are inferred from the names only;
 * the implementation is not visible from this declaration.
 *
 * @param pSrc      input matrix (read-only)
 * @param M         presumably the number of rows -- TODO confirm
 * @param N         presumably the number of columns -- TODO confirm
 * @param strideSrc stride used for pSrc (presumably elements between row starts)
 * @param strideDst stride used for pDst
 * @param pDst      output matrix
 */
void plp_mat_copy_stride_i32(const int32_t *__restrict__ pSrc,
                             uint32_t M,
                             uint32_t N,
                             uint32_t strideSrc,
                             uint32_t strideDst,
                             int32_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_copy_stride_i32().
 */
void plp_mat_copy_stride_i32s_rv32im(const int32_t *__restrict__ pSrc,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideSrc,
                                     uint32_t strideDst,
                                     int32_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_copy_stride_i32().
 */
void plp_mat_copy_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t strideSrc,
                                      uint32_t strideDst,
                                      int32_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_copy_stride_i32() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_copy_stride_i32_parallel(const int32_t *__restrict__ pSrc,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t strideSrc,
                                      uint32_t strideDst,
                                      uint32_t nPE,
                                      int32_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_copy_stride_i32p_xpulpv2(void *args);

/**
 * @brief Strided copy of an int16 matrix, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * NOTE(review): the parameter meanings below are inferred from the names only;
 * the implementation is not visible from this declaration.
 *
 * @param pSrc      input matrix (read-only)
 * @param M         presumably the number of rows -- TODO confirm
 * @param N         presumably the number of columns -- TODO confirm
 * @param strideSrc stride used for pSrc (presumably elements between row starts)
 * @param strideDst stride used for pDst
 * @param pDst      output matrix
 */
void plp_mat_copy_stride_i16(const int16_t *__restrict__ pSrc,
                             uint32_t M,
                             uint32_t N,
                             uint32_t strideSrc,
                             uint32_t strideDst,
                             int16_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_copy_stride_i16().
 */
void plp_mat_copy_stride_i16s_rv32im(const int16_t *__restrict__ pSrc,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideSrc,
                                     uint32_t strideDst,
                                     int16_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_copy_stride_i16().
 */
void plp_mat_copy_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t strideSrc,
                                      uint32_t strideDst,
                                      int16_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_copy_stride_i16() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_copy_stride_i16_parallel(const int16_t *__restrict__ pSrc,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t strideSrc,
                                      uint32_t strideDst,
                                      uint32_t nPE,
                                      int16_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_copy_stride_i16p_xpulpv2(void *args);

/**
 * @brief Strided copy of an int8 matrix, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * NOTE(review): the parameter meanings below are inferred from the names only;
 * the implementation is not visible from this declaration.
 *
 * @param pSrc      input matrix (read-only)
 * @param M         presumably the number of rows -- TODO confirm
 * @param N         presumably the number of columns -- TODO confirm
 * @param strideSrc stride used for pSrc (presumably elements between row starts)
 * @param strideDst stride used for pDst
 * @param pDst      output matrix
 */
void plp_mat_copy_stride_i8(const int8_t *__restrict__ pSrc,
                            uint32_t M,
                            uint32_t N,
                            uint32_t strideSrc,
                            uint32_t strideDst,
                            int8_t *__restrict__ pDst);

/**
 * @brief Single-core RV32IM kernel variant (per the `s_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_mat_copy_stride_i8().
 */
void plp_mat_copy_stride_i8s_rv32im(const int8_t *__restrict__ pSrc,
                                    uint32_t M,
                                    uint32_t N,
                                    uint32_t strideSrc,
                                    uint32_t strideDst,
                                    int8_t *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_copy_stride_i8().
 */
void plp_mat_copy_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideSrc,
                                     uint32_t strideDst,
                                     int8_t *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_copy_stride_i8() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_copy_stride_i8_parallel(const int8_t *__restrict__ pSrc,
                                     uint32_t M,
                                     uint32_t N,
                                     uint32_t strideSrc,
                                     uint32_t strideDst,
                                     uint32_t nPE,
                                     int8_t *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_copy_stride_i8p_xpulpv2(void *args);

/**
 * @brief Strided copy of a float matrix, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * NOTE(review): the parameter meanings below are inferred from the names only;
 * the implementation is not visible from this declaration.
 *
 * @param pSrc      input matrix (read-only)
 * @param M         presumably the number of rows -- TODO confirm
 * @param N         presumably the number of columns -- TODO confirm
 * @param strideSrc stride used for pSrc (presumably elements between row starts)
 * @param strideDst stride used for pDst
 * @param pDst      output matrix
 */
void plp_mat_copy_stride_f32(const float *__restrict__ pSrc,
                             uint32_t M,
                             uint32_t N,
                             uint32_t strideSrc,
                             uint32_t strideDst,
                             float *__restrict__ pDst);

/**
 * @brief Single-core XpulpV2 kernel variant (per the `s_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_mat_copy_stride_f32().
 */
void plp_mat_copy_stride_f32s_xpulpv2(const float *__restrict__ pSrc,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t strideSrc,
                                      uint32_t strideDst,
                                      float *__restrict__ pDst);

/**
 * @brief Parallel variant: same parameters as plp_mat_copy_stride_f32() plus @p nPE.
 *
 * @param nPE presumably the number of processing elements (cores) to use -- TODO confirm
 */
void plp_mat_copy_stride_f32_parallel(const float *__restrict__ pSrc,
                                      uint32_t M,
                                      uint32_t N,
                                      uint32_t strideSrc,
                                      uint32_t strideDst,
                                      uint32_t nPE,
                                      float *__restrict__ pDst);

/**
 * @brief Presumably the per-core XpulpV2 worker of the parallel variant (per the
 *        `p_xpulpv2` suffix -- TODO confirm).
 *
 * @param args opaque pointer; presumably an instance structure bundling the
 *             operands -- verify against the implementation
 */
void plp_mat_copy_stride_f32p_xpulpv2(void *args);

/**
 * @brief Complex conjugate of a float32 vector, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * NOTE(review): the data layout is not visible from this declaration; presumably
 * interleaved real/imaginary pairs -- TODO confirm.
 *
 * @param pSrc       input vector (read-only)
 * @param pDst       output vector
 * @param numSamples presumably the number of complex samples (not scalars) -- TODO confirm
 */
void plp_cmplx_conj_f32(const float32_t *__restrict__ pSrc,
                        float32_t *__restrict__ pDst,
                        uint32_t numSamples);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_conj_f32().
 */
void plp_cmplx_conj_f32_xpulpv2(const float32_t *__restrict__ pSrc,
                                float32_t *__restrict__ pDst,
                                uint32_t numSamples);

/**
 * @brief Complex conjugate of an int32 vector, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * NOTE(review): the data layout is not visible from this declaration; presumably
 * interleaved real/imaginary pairs -- TODO confirm.
 *
 * @param pSrc       input vector (read-only)
 * @param pDst       output vector
 * @param numSamples presumably the number of complex samples (not scalars) -- TODO confirm
 */
void plp_cmplx_conj_i32(const int32_t *__restrict__ pSrc,
                        int32_t *__restrict__ pDst,
                        uint32_t numSamples);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_conj_i32().
 */
void plp_cmplx_conj_i32_xpulpv2(const int32_t *__restrict__ pSrc,
                                int32_t *__restrict__ pDst,
                                uint32_t numSamples);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_conj_i32().
 */
void plp_cmplx_conj_i32_rv32im(const int32_t *__restrict__ pSrc,
                               int32_t *__restrict__ pDst,
                               uint32_t numSamples);

/**
 * @brief Complex conjugate of an int16 vector, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * NOTE(review): the data layout is not visible from this declaration; presumably
 * interleaved real/imaginary pairs -- TODO confirm.
 *
 * @param pSrc       input vector (read-only)
 * @param pDst       output vector
 * @param numSamples presumably the number of complex samples (not scalars) -- TODO confirm
 */
void plp_cmplx_conj_i16(const int16_t *__restrict__ pSrc,
                        int16_t *__restrict__ pDst,
                        uint32_t numSamples);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_conj_i16().
 */
void plp_cmplx_conj_i16_xpulpv2(const int16_t *__restrict__ pSrc,
                                int16_t *__restrict__ pDst,
                                uint32_t numSamples);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_conj_i16().
 */
void plp_cmplx_conj_i16_rv32im(const int16_t *__restrict__ pSrc,
                               int16_t *__restrict__ pDst,
                               uint32_t numSamples);

/**
 * @brief Complex conjugate of an int8 vector, un-suffixed entry point.
 *
 * Both pointers are __restrict__-qualified, so source and destination must not overlap.
 * NOTE(review): the data layout is not visible from this declaration; presumably
 * interleaved real/imaginary pairs -- TODO confirm.
 *
 * @param pSrc       input vector (read-only)
 * @param pDst       output vector
 * @param numSamples presumably the number of complex samples (not scalars) -- TODO confirm
 */
void plp_cmplx_conj_i8(const int8_t *__restrict__ pSrc,
                       int8_t *__restrict__ pDst,
                       uint32_t numSamples);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_conj_i8().
 */
void plp_cmplx_conj_i8_xpulpv2(const int8_t *__restrict__ pSrc,
                               int8_t *__restrict__ pDst,
                               uint32_t numSamples);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_conj_i8().
 */
void plp_cmplx_conj_i8_rv32im(const int8_t *__restrict__ pSrc,
                              int8_t *__restrict__ pDst,
                              uint32_t numSamples);

/**
 * @brief Complex dot product of two float32 vectors, un-suffixed entry point.
 *
 * The result is returned through two separate output pointers. None of the
 * pointers is restrict-qualified in this declaration.
 * NOTE(review): the data layout is not visible here; presumably interleaved
 * real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcA      first input vector (read-only)
 * @param pSrcB      second input vector (read-only)
 * @param numSamples presumably the number of complex samples per vector -- TODO confirm
 * @param realResult receives the real part of the result
 * @param imagResult receives the imaginary part of the result
 */
void plp_cmplx_dot_prod_f32(const float32_t *pSrcA,
                            const float32_t *pSrcB,
                            uint32_t numSamples,
                            float32_t *realResult,
                            float32_t *imagResult);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_f32().
 */
void plp_cmplx_dot_prod_f32_xpulpv2(const float32_t *pSrcA,
                                    const float32_t *pSrcB,
                                    uint32_t numSamples,
                                    float32_t *realResult,
                                    float32_t *imagResult);

/**
 * @brief Complex dot product of two int32 vectors, un-suffixed entry point.
 *
 * The result is returned through two separate output pointers. None of the
 * pointers is restrict-qualified in this declaration.
 * NOTE(review): the data layout is not visible here; presumably interleaved
 * real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcA      first input vector (read-only)
 * @param pSrcB      second input vector (read-only)
 * @param numSamples presumably the number of complex samples per vector -- TODO confirm
 * @param realResult receives the real part of the result
 * @param imagResult receives the imaginary part of the result
 */
void plp_cmplx_dot_prod_i32(const int32_t *pSrcA,
                            const int32_t *pSrcB,
                            uint32_t numSamples,
                            int32_t *realResult,
                            int32_t *imagResult);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_i32().
 */
void plp_cmplx_dot_prod_i32_xpulpv2(const int32_t *pSrcA,
                                    const int32_t *pSrcB,
                                    uint32_t numSamples,
                                    int32_t *realResult,
                                    int32_t *imagResult);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_i32().
 */
void plp_cmplx_dot_prod_i32_rv32im(const int32_t *pSrcA,
                                   const int32_t *pSrcB,
                                   uint32_t numSamples,
                                   int32_t *realResult,
                                   int32_t *imagResult);

/**
 * @brief Complex dot product of two int16 vectors, un-suffixed entry point.
 *
 * The result is returned through two separate int16_t output pointers. None of
 * the pointers is restrict-qualified in this declaration.
 * NOTE(review): the data layout is not visible here; presumably interleaved
 * real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcA      first input vector (read-only)
 * @param pSrcB      second input vector (read-only)
 * @param numSamples presumably the number of complex samples per vector -- TODO confirm
 * @param realResult receives the real part of the result
 * @param imagResult receives the imaginary part of the result
 */
void plp_cmplx_dot_prod_i16(const int16_t *pSrcA,
                            const int16_t *pSrcB,
                            uint32_t numSamples,
                            int16_t *realResult,
                            int16_t *imagResult);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_i16().
 */
void plp_cmplx_dot_prod_i16_xpulpv2(const int16_t *pSrcA,
                                    const int16_t *pSrcB,
                                    uint32_t numSamples,
                                    int16_t *realResult,
                                    int16_t *imagResult);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_i16().
 */
void plp_cmplx_dot_prod_i16_rv32im(const int16_t *pSrcA,
                                   const int16_t *pSrcB,
                                   uint32_t numSamples,
                                   int16_t *realResult,
                                   int16_t *imagResult);

/**
 * @brief Complex dot product of two int8 vectors, un-suffixed entry point.
 *
 * The result is returned through two separate int8_t output pointers. None of
 * the pointers is restrict-qualified in this declaration.
 * NOTE(review): the data layout is not visible here; presumably interleaved
 * real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcA      first input vector (read-only)
 * @param pSrcB      second input vector (read-only)
 * @param numSamples presumably the number of complex samples per vector -- TODO confirm
 * @param realResult receives the real part of the result
 * @param imagResult receives the imaginary part of the result
 */
void plp_cmplx_dot_prod_i8(const int8_t *pSrcA,
                           const int8_t *pSrcB,
                           uint32_t numSamples,
                           int8_t *realResult,
                           int8_t *imagResult);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_i8().
 */
void plp_cmplx_dot_prod_i8_xpulpv2(const int8_t *pSrcA,
                                   const int8_t *pSrcB,
                                   uint32_t numSamples,
                                   int8_t *realResult,
                                   int8_t *imagResult);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_i8().
 */
void plp_cmplx_dot_prod_i8_rv32im(const int8_t *pSrcA,
                                  const int8_t *pSrcB,
                                  uint32_t numSamples,
                                  int8_t *realResult,
                                  int8_t *imagResult);

/**
 * @brief Complex dot product of two 32-bit fixed-point vectors, un-suffixed entry point.
 *
 * The result is returned through two separate output pointers. None of the
 * pointers is restrict-qualified in this declaration.
 * NOTE(review): the data layout is not visible here; presumably interleaved
 * real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcA      first input vector (read-only)
 * @param pSrcB      second input vector (read-only)
 * @param numSamples presumably the number of complex samples per vector -- TODO confirm
 * @param deciPoint  presumably the position of the fixed-point decimal point
 *                   (fractional bit count) -- TODO confirm
 * @param realResult receives the real part of the result
 * @param imagResult receives the imaginary part of the result
 */
void plp_cmplx_dot_prod_q32(const int32_t *pSrcA,
                            const int32_t *pSrcB,
                            uint32_t numSamples,
                            uint32_t deciPoint,
                            int32_t *realResult,
                            int32_t *imagResult);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_q32().
 */
void plp_cmplx_dot_prod_q32_xpulpv2(const int32_t *pSrcA,
                                    const int32_t *pSrcB,
                                    uint32_t numSamples,
                                    uint32_t deciPoint,
                                    int32_t *realResult,
                                    int32_t *imagResult);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_q32().
 */
void plp_cmplx_dot_prod_q32_rv32im(const int32_t *pSrcA,
                                   const int32_t *pSrcB,
                                   uint32_t numSamples,
                                   uint32_t deciPoint,
                                   int32_t *realResult,
                                   int32_t *imagResult);

/**
 * @brief Complex dot product of two 16-bit fixed-point vectors, un-suffixed entry point.
 *
 * The result is returned through two separate output pointers. None of the
 * pointers is restrict-qualified in this declaration.
 * NOTE(review): the data layout is not visible here; presumably interleaved
 * real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcA      first input vector (read-only)
 * @param pSrcB      second input vector (read-only)
 * @param numSamples presumably the number of complex samples per vector -- TODO confirm
 * @param deciPoint  presumably the position of the fixed-point decimal point
 *                   (fractional bit count) -- TODO confirm
 * @param realResult receives the real part of the result
 * @param imagResult receives the imaginary part of the result
 */
void plp_cmplx_dot_prod_q16(const int16_t *pSrcA,
                            const int16_t *pSrcB,
                            uint32_t numSamples,
                            uint32_t deciPoint,
                            int16_t *realResult,
                            int16_t *imagResult);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_q16().
 */
void plp_cmplx_dot_prod_q16_xpulpv2(const int16_t *pSrcA,
                                    const int16_t *pSrcB,
                                    uint32_t numSamples,
                                    uint32_t deciPoint,
                                    int16_t *realResult,
                                    int16_t *imagResult);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_dot_prod_q16().
 */
void plp_cmplx_dot_prod_q16_rv32im(const int16_t *pSrcA,
                                   const int16_t *pSrcB,
                                   uint32_t numSamples,
                                   uint32_t deciPoint,
                                   int16_t *realResult,
                                   int16_t *imagResult);

/**
 * @brief Multiplication of a complex float32 vector by a real float32 vector,
 *        un-suffixed entry point.
 *
 * All pointers are __restrict__-qualified, so the three buffers must not overlap.
 * NOTE(review): the data layout is not visible here; presumably pSrcCmplx and pDst
 * hold interleaved real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcCmplx  complex input vector (read-only)
 * @param pSrcReal   real input vector (read-only)
 * @param pDst       output vector
 * @param numSamples presumably the number of samples per vector -- TODO confirm
 */
void plp_cmplx_mult_real_f32(const float32_t *__restrict__ pSrcCmplx,
                             const float32_t *__restrict__ pSrcReal,
                             float32_t *__restrict__ pDst,
                             uint32_t numSamples);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_f32().
 */
void plp_cmplx_mult_real_f32_xpulpv2(const float32_t *__restrict__ pSrcCmplx,
                                     const float32_t *__restrict__ pSrcReal,
                                     float32_t *__restrict__ pDst,
                                     uint32_t numSamples);

/**
 * @brief Multiplication of a complex int32 vector by a real int32 vector,
 *        un-suffixed entry point.
 *
 * All pointers are __restrict__-qualified, so the three buffers must not overlap.
 * NOTE(review): the data layout is not visible here; presumably pSrcCmplx and pDst
 * hold interleaved real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcCmplx  complex input vector (read-only)
 * @param pSrcReal   real input vector (read-only)
 * @param pDst       output vector
 * @param numSamples presumably the number of samples per vector -- TODO confirm
 */
void plp_cmplx_mult_real_i32(const int32_t *__restrict__ pSrcCmplx,
                             const int32_t *__restrict__ pSrcReal,
                             int32_t *__restrict__ pDst,
                             uint32_t numSamples);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_i32().
 */
void plp_cmplx_mult_real_i32_xpulpv2(const int32_t *__restrict__ pSrcCmplx,
                                     const int32_t *__restrict__ pSrcReal,
                                     int32_t *__restrict__ pDst,
                                     uint32_t numSamples);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_i32().
 */
void plp_cmplx_mult_real_i32_rv32im(const int32_t *__restrict__ pSrcCmplx,
                                    const int32_t *__restrict__ pSrcReal,
                                    int32_t *__restrict__ pDst,
                                    uint32_t numSamples);

/**
 * @brief Multiplication of a complex int16 vector by a real int16 vector,
 *        un-suffixed entry point.
 *
 * All pointers are __restrict__-qualified, so the three buffers must not overlap.
 * NOTE(review): the data layout is not visible here; presumably pSrcCmplx and pDst
 * hold interleaved real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcCmplx  complex input vector (read-only)
 * @param pSrcReal   real input vector (read-only)
 * @param pDst       output vector
 * @param numSamples presumably the number of samples per vector -- TODO confirm
 */
void plp_cmplx_mult_real_i16(const int16_t *__restrict__ pSrcCmplx,
                             const int16_t *__restrict__ pSrcReal,
                             int16_t *__restrict__ pDst,
                             uint32_t numSamples);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_i16().
 */
void plp_cmplx_mult_real_i16_xpulpv2(const int16_t *__restrict__ pSrcCmplx,
                                     const int16_t *__restrict__ pSrcReal,
                                     int16_t *__restrict__ pDst,
                                     uint32_t numSamples);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_i16().
 */
void plp_cmplx_mult_real_i16_rv32im(const int16_t *__restrict__ pSrcCmplx,
                                    const int16_t *__restrict__ pSrcReal,
                                    int16_t *__restrict__ pDst,
                                    uint32_t numSamples);

/**
 * @brief Multiplication of a complex int8 vector by a real int8 vector,
 *        un-suffixed entry point.
 *
 * All pointers are __restrict__-qualified, so the three buffers must not overlap.
 * NOTE(review): the data layout is not visible here; presumably pSrcCmplx and pDst
 * hold interleaved real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcCmplx  complex input vector (read-only)
 * @param pSrcReal   real input vector (read-only)
 * @param pDst       output vector
 * @param numSamples presumably the number of samples per vector -- TODO confirm
 */
void plp_cmplx_mult_real_i8(const int8_t *__restrict__ pSrcCmplx,
                            const int8_t *__restrict__ pSrcReal,
                            int8_t *__restrict__ pDst,
                            uint32_t numSamples);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_i8().
 */
void plp_cmplx_mult_real_i8_xpulpv2(const int8_t *__restrict__ pSrcCmplx,
                                    const int8_t *__restrict__ pSrcReal,
                                    int8_t *__restrict__ pDst,
                                    uint32_t numSamples);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_i8().
 */
void plp_cmplx_mult_real_i8_rv32im(const int8_t *__restrict__ pSrcCmplx,
                                   const int8_t *__restrict__ pSrcReal,
                                   int8_t *__restrict__ pDst,
                                   uint32_t numSamples);

/**
 * @brief Multiplication of a complex 32-bit fixed-point vector by a real one,
 *        un-suffixed entry point.
 *
 * All pointers are __restrict__-qualified, so the three buffers must not overlap.
 * Note the argument order: @p deciPoint comes after @p pDst and before @p numSamples.
 * NOTE(review): the data layout is not visible here; presumably pSrcCmplx and pDst
 * hold interleaved real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcCmplx  complex input vector (read-only)
 * @param pSrcReal   real input vector (read-only)
 * @param pDst       output vector
 * @param deciPoint  presumably the position of the fixed-point decimal point
 *                   (fractional bit count) -- TODO confirm
 * @param numSamples presumably the number of samples per vector -- TODO confirm
 */
void plp_cmplx_mult_real_q32(const int32_t *__restrict__ pSrcCmplx,
                             const int32_t *__restrict__ pSrcReal,
                             int32_t *__restrict__ pDst,
                             uint32_t deciPoint,
                             uint32_t numSamples);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_q32().
 */
void plp_cmplx_mult_real_q32_xpulpv2(const int32_t *__restrict__ pSrcCmplx,
                                     const int32_t *__restrict__ pSrcReal,
                                     int32_t *__restrict__ pDst,
                                     uint32_t deciPoint,
                                     uint32_t numSamples);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_q32().
 */
void plp_cmplx_mult_real_q32_rv32im(const int32_t *__restrict__ pSrcCmplx,
                                    const int32_t *__restrict__ pSrcReal,
                                    int32_t *__restrict__ pDst,
                                    uint32_t deciPoint,
                                    uint32_t numSamples);

/**
 * @brief Multiplication of a complex 16-bit fixed-point vector by a real one,
 *        un-suffixed entry point.
 *
 * All pointers are __restrict__-qualified, so the three buffers must not overlap.
 * Note the argument order: @p deciPoint comes after @p pDst and before @p numSamples.
 * NOTE(review): the data layout is not visible here; presumably pSrcCmplx and pDst
 * hold interleaved real/imaginary pairs -- TODO confirm.
 *
 * @param pSrcCmplx  complex input vector (read-only)
 * @param pSrcReal   real input vector (read-only)
 * @param pDst       output vector
 * @param deciPoint  presumably the position of the fixed-point decimal point
 *                   (fractional bit count) -- TODO confirm
 * @param numSamples presumably the number of samples per vector -- TODO confirm
 */
void plp_cmplx_mult_real_q16(const int16_t *__restrict__ pSrcCmplx,
                             const int16_t *__restrict__ pSrcReal,
                             int16_t *__restrict__ pDst,
                             uint32_t deciPoint,
                             uint32_t numSamples);

/**
 * @brief XpulpV2 kernel variant (per the `_xpulpv2` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_q16().
 */
void plp_cmplx_mult_real_q16_xpulpv2(const int16_t *__restrict__ pSrcCmplx,
                                     const int16_t *__restrict__ pSrcReal,
                                     int16_t *__restrict__ pDst,
                                     uint32_t deciPoint,
                                     uint32_t numSamples);

/**
 * @brief RV32IM kernel variant (per the `_rv32im` suffix -- TODO confirm);
 *        same parameters as plp_cmplx_mult_real_q16().
 */
void plp_cmplx_mult_real_q16_rv32im(const int16_t *__restrict__ pSrcCmplx,
                                    const int16_t *__restrict__ pSrcReal,
                                    int16_t *__restrict__ pDst,
                                    uint32_t deciPoint,
                                    uint32_t numSamples);

void plp_cmplx_mult_real_q8(const int8_t *__restrict__ pSrcCmplx,
                            const int8_t *__restrict__ pSrcReal,
                            int8_t *__restrict__ pDst,
                            uint32_t deciPoint,
                            uint32_t numSamples);

void plp_cmplx_mult_real_q8_xpulpv2(const int8_t *__restrict__ pSrcCmplx,
                                    const int8_t *__restrict__ pSrcReal,
                                    int8_t *__restrict__ pDst,
                                    uint32_t deciPoint,
                                    uint32_t numSamples);

void plp_cmplx_mult_real_q8_rv32im(const int8_t *__restrict__ pSrcCmplx,
                                   const int8_t *__restrict__ pSrcReal,
                                   int8_t *__restrict__ pDst,
                                   uint32_t deciPoint,
                                   uint32_t numSamples);

/*
 * plp_cmplx_mag_squared_f32 and its _xpulpv2 variant.
 *
 *   pSrc        const, __restrict__-qualified float32_t input pointer
 *   pDst        non-const, __restrict__-qualified float32_t output pointer
 *   numSamples  unsigned 32-bit scalar
 * Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name this presumably writes
 * one squared magnitude per complex input sample; the input layout and the
 * exact meaning of numSamples should be confirmed against the sources.
 */
void plp_cmplx_mag_squared_f32(
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mag_squared_f32_xpulpv2(
    const float32_t *__restrict__ pSrc,
    float32_t *__restrict__ pDst,
    uint32_t numSamples);

/*
 * plp_cmplx_mag_squared_i16 / _i32 with their _rv32im and _xpulpv2 variants.
 *
 *   pSrc        const, __restrict__-qualified input pointer
 *   pDst        non-const, __restrict__-qualified output pointer
 *   numSamples  unsigned 32-bit scalar
 * Input and output use the same element type (int16_t or int32_t, matching
 * the suffix). Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name these presumably write
 * one squared magnitude per complex input sample; layout and overflow
 * behaviour are not derivable from the prototypes. Confirm in the sources.
 */
void plp_cmplx_mag_squared_i16(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mag_squared_i16_rv32im(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mag_squared_i16_xpulpv2(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mag_squared_i32(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mag_squared_i32_rv32im(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mag_squared_i32_xpulpv2(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t numSamples);

/*
 * plp_cmplx_mag_squared_i8 with its _rv32im and _xpulpv2 variants.
 *
 *   pSrc        const, __restrict__-qualified int8_t input pointer
 *   pDst        non-const, __restrict__-qualified int8_t output pointer
 *   numSamples  unsigned 32-bit scalar
 * Nothing is returned. Each function is declared exactly once, in the same
 * order as the i16 / i32 siblings (entry point, rv32im, xpulpv2).
 *
 * NOTE(review): bodies are not visible here. By name these presumably write
 * one squared magnitude per complex input sample; layout and overflow
 * behaviour in 8-bit storage must be confirmed against the sources.
 */
void plp_cmplx_mag_squared_i8(const int8_t *__restrict__ pSrc,
                              int8_t *__restrict__ pDst,
                              uint32_t numSamples);

void plp_cmplx_mag_squared_i8_rv32im(const int8_t *__restrict__ pSrc,
                                     int8_t *__restrict__ pDst,
                                     uint32_t numSamples);

void plp_cmplx_mag_squared_i8_xpulpv2(const int8_t *__restrict__ pSrc,
                                      int8_t *__restrict__ pDst,
                                      uint32_t numSamples);

/*
 * plp_cmplx_mag_squared_q{32,16,8} with their _rv32im and _xpulpv2 variants.
 *
 *   pSrc        const, __restrict__-qualified input pointer
 *   pDst        non-const, __restrict__-qualified output pointer
 *   deciPoint   unsigned 32-bit scalar
 *   numSamples  unsigned 32-bit scalar
 * Input and output share the element type selected by the suffix
 * (int32_t / int16_t / int8_t). Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name these presumably
 * compute fixed-point squared magnitudes with deciPoint as the binary-point
 * position; rounding and saturation behaviour cannot be read from the
 * prototypes. Confirm in the sources.
 */
void plp_cmplx_mag_squared_q32(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mag_squared_q32_rv32im(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mag_squared_q32_xpulpv2(
    const int32_t *__restrict__ pSrc,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mag_squared_q16(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mag_squared_q16_rv32im(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mag_squared_q16_xpulpv2(
    const int16_t *__restrict__ pSrc,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mag_squared_q8(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mag_squared_q8_rv32im(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mag_squared_q8_xpulpv2(
    const int8_t *__restrict__ pSrc,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

/*
 * plp_cmplx_mult_cmplx_f32 and its _xpulpv2 variant.
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified float32_t input pointers
 *   pDst          non-const, __restrict__-qualified float32_t output pointer
 *   numSamples    unsigned 32-bit scalar
 * Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name this presumably
 * multiplies two complex vectors element by element; the data layout and the
 * unit of numSamples should be confirmed against the sources.
 */
void plp_cmplx_mult_cmplx_f32(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    float32_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_f32_xpulpv2(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    float32_t *__restrict__ pDst,
    uint32_t numSamples);

/*
 * plp_cmplx_mult_cmplx_i{32,16,8} with their _xpulpv2 and _rv32im variants.
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified input pointers
 *   pDst          non-const, __restrict__-qualified output pointer
 *   numSamples    unsigned 32-bit scalar
 * Inputs and output share the element type selected by the suffix
 * (int32_t / int16_t / int8_t). Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name these presumably
 * multiply two complex integer vectors element by element; layout and
 * overflow behaviour cannot be read from the prototypes. Confirm in the
 * sources.
 */
void plp_cmplx_mult_cmplx_i32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_i32_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_i32_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_i16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_i16_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_i16_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_i8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_i8_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_i8_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t numSamples);

/*
 * plp_cmplx_mult_cmplx_q{32,16,8} with their _xpulpv2 and _rv32im variants.
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified input pointers
 *   pDst          non-const, __restrict__-qualified output pointer
 *   deciPoint     unsigned 32-bit scalar
 *   numSamples    unsigned 32-bit scalar
 * Inputs and output share the element type selected by the suffix
 * (int32_t / int16_t / int8_t). Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name these presumably
 * multiply two complex fixed-point vectors element by element, with
 * deciPoint as the binary-point position; rounding and saturation cannot be
 * read from the prototypes. Confirm in the sources.
 */
void plp_cmplx_mult_cmplx_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_q32_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_q32_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    int32_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_q16_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_q16_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    int16_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_q8(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_q8_xpulpv2(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);

void plp_cmplx_mult_cmplx_q8_rv32im(
    const int8_t *__restrict__ pSrcA,
    const int8_t *__restrict__ pSrcB,
    int8_t *__restrict__ pDst,
    uint32_t deciPoint,
    uint32_t numSamples);


/*
 * plp_euclidean_distance_q32_parallel
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified int32_t input pointers
 *   blockSize     unsigned 32-bit scalar
 *   fracBits      unsigned 32-bit scalar
 *   nPE           unsigned 32-bit scalar
 *   pRes          non-const, __restrict__-qualified uint32_t output pointer
 * Nothing is returned.
 *
 * NOTE(review): the body is not visible here. By name this presumably
 * computes a fixed-point Euclidean distance across nPE processing elements,
 * with blockSize as the vector length and fracBits as the number of
 * fractional bits; confirm against the sources.
 * NOTE(review): pRes is uint32_t * here, whereas plp_euclidean_distance_q32
 * and plp_cosine_distance_q32_parallel declare int32_t *. Check whether the
 * difference is intentional.
 */
void plp_euclidean_distance_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    uint32_t nPE,
    uint32_t *__restrict__ pRes);

/*
 * plp_euclidean_distance_f32_parallel
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified float32_t input pointers
 *   blockSize     unsigned 32-bit scalar
 *   nPE           unsigned 32-bit scalar
 *   pRes          non-const, __restrict__-qualified float32_t output pointer
 * Nothing is returned.
 *
 * NOTE(review): the body is not visible here. By name this presumably
 * computes a floating-point Euclidean distance across nPE processing
 * elements, with blockSize as the vector length; confirm against the
 * sources.
 */
void plp_euclidean_distance_f32_parallel(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t nPE,
    float32_t *__restrict__ pRes);

/*
 * plp_euclidean_distance_q32p_xpulpv2 / plp_euclidean_distance_f32p_xpulpv2
 *
 * Both take a single untyped pointer S and return nothing.
 *
 * NOTE(review): the type S must point to is not visible from these
 * prototypes. Presumably it is an argument/instance structure consumed by a
 * per-core worker of the corresponding *_parallel function; confirm against
 * the sources before calling these directly.
 */
void plp_euclidean_distance_q32p_xpulpv2(
    void *S);

void plp_euclidean_distance_f32p_xpulpv2(
    void *S);

/*
 * plp_euclidean_distance_q32 with its q32s_xpulpv2 and q32s_rv32im variants.
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified int32_t input pointers
 *   blockSize     unsigned 32-bit scalar
 *   fracBits      unsigned 32-bit scalar
 *   pRes          non-const, __restrict__-qualified int32_t output pointer
 * All three prototypes have identical parameter lists. Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name these presumably
 * compute a fixed-point Euclidean distance over blockSize elements with
 * fracBits fractional bits, the "s" variants being single-core kernels for a
 * specific ISA; confirm against the sources.
 */
void plp_euclidean_distance_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes);

void plp_euclidean_distance_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes);

void plp_euclidean_distance_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes);


/*
 * plp_euclidean_distance_q16 with its q16s_xpulpv2 and q16s_rv32im variants.
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified int16_t input pointers
 *   blockSize     unsigned scalar (uint16_t on the entry point, uint32_t on
 *                 the two suffixed functions)
 *   fracBits      unsigned scalar (same widths as blockSize)
 *   pRes          non-const, __restrict__-qualified int32_t output pointer
 * Nothing is returned. The fourth parameter is named fracBits on all three
 * prototypes, matching the other distance declarations in this header.
 *
 * NOTE(review): bodies are not visible here. By name these presumably
 * compute a fixed-point Euclidean distance over blockSize elements with
 * fracBits fractional bits; confirm against the sources.
 * NOTE(review): the entry point takes 16-bit blockSize/fracBits while the
 * suffixed functions take 32-bit ones. If the entry point forwards to them,
 * vector lengths above 65535 cannot be requested through it. Verify that
 * this is intended; the types are left as declared.
 */
void plp_euclidean_distance_q16(const int16_t *__restrict__ pSrcA,
                                const int16_t *__restrict__ pSrcB,
                                uint16_t blockSize,
                                uint16_t fracBits,
                                int32_t *__restrict__ pRes);

void plp_euclidean_distance_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
                                         const int16_t *__restrict__ pSrcB,
                                         uint32_t blockSize,
                                         uint32_t fracBits,
                                         int32_t *__restrict__ pRes);

void plp_euclidean_distance_q16s_rv32im(const int16_t *__restrict__ pSrcA,
                                        const int16_t *__restrict__ pSrcB,
                                        uint32_t blockSize,
                                        uint32_t fracBits,
                                        int32_t *__restrict__ pRes);

/*
 * plp_euclidean_distance_f32 with its f32s_xpulpv2 and f32s_rv32im variants.
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified float32_t input pointers
 *   blockSize     unsigned 32-bit scalar
 *   pRes          non-const, __restrict__-qualified float32_t output pointer
 * All three prototypes have identical parameter lists. Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name these presumably
 * compute a floating-point Euclidean distance over blockSize elements;
 * confirm against the sources.
 */
void plp_euclidean_distance_f32(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes);

void plp_euclidean_distance_f32s_xpulpv2(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes);

void plp_euclidean_distance_f32s_rv32im(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes);

/*
 * plp_cosine_distance_q32_parallel
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified int32_t input pointers
 *   blockSize     unsigned 32-bit scalar
 *   fracBits      unsigned 32-bit scalar
 *   nPE           unsigned 32-bit scalar
 *   pRes          non-const, __restrict__-qualified int32_t output pointer
 * Nothing is returned.
 *
 * NOTE(review): the body is not visible here. By name this presumably
 * computes a fixed-point cosine distance across nPE processing elements,
 * with blockSize as the vector length and fracBits as the number of
 * fractional bits; confirm against the sources.
 */
void plp_cosine_distance_q32_parallel(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    uint32_t nPE,
    int32_t *__restrict__ pRes);

/*
 * plp_cosine_distance_f32_parallel
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified float32_t input pointers
 *   blockSize     unsigned 32-bit scalar
 *   nPE           unsigned 32-bit scalar
 *   pRes          non-const, __restrict__-qualified float32_t output pointer
 * Nothing is returned.
 *
 * NOTE(review): the body is not visible here. By name this presumably
 * computes a floating-point cosine distance across nPE processing elements,
 * with blockSize as the vector length; confirm against the sources.
 */
void plp_cosine_distance_f32_parallel(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t nPE,
    float32_t *__restrict__ pRes);

/*
 * plp_cosine_distance_f32p_xpulpv2
 *
 * Takes a single untyped pointer S and returns nothing.
 *
 * NOTE(review): the type S must point to is not visible from this
 * prototype. Presumably it is an argument/instance structure consumed by a
 * per-core worker of plp_cosine_distance_f32_parallel; confirm against the
 * sources before calling this directly.
 */
void plp_cosine_distance_f32p_xpulpv2(
    void *S);


/*
 * plp_cosine_distance_f32 with its f32s_rv32im and f32s_xpulpv2 variants.
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified float32_t input pointers
 *   blockSize     unsigned 32-bit scalar
 *   pRes          non-const, __restrict__-qualified float32_t output pointer
 * All three prototypes have identical parameter lists. Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name these presumably
 * compute a floating-point cosine distance over blockSize elements; whether
 * the result is a similarity or 1 - similarity is not derivable from the
 * prototypes. Confirm against the sources.
 */
void plp_cosine_distance_f32(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes);

void plp_cosine_distance_f32s_rv32im(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes);

void plp_cosine_distance_f32s_xpulpv2(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    float32_t *__restrict__ pRes);

/*
 * plp_cosine_distance_q32 with its q32s_rv32im and q32s_xpulpv2 variants.
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified int32_t input pointers
 *   blockSize     unsigned 32-bit scalar
 *   fracBits      unsigned 32-bit scalar
 *   pRes          non-const, __restrict__-qualified int32_t output pointer
 * All three prototypes have identical parameter lists. Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name these presumably
 * compute a fixed-point cosine distance over blockSize elements with
 * fracBits fractional bits; the output scaling is not derivable from the
 * prototypes. Confirm against the sources.
 */
void plp_cosine_distance_q32(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes);

void plp_cosine_distance_q32s_rv32im(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes);

void plp_cosine_distance_q32s_xpulpv2(
    const int32_t *__restrict__ pSrcA,
    const int32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes);

/*
 * plp_cosine_distance_q16 with its q16s_rv32im and q16s_xpulpv2 variants.
 *
 *   pSrcA, pSrcB  const, __restrict__-qualified int16_t input pointers
 *   blockSize     unsigned scalar (uint16_t on the entry point, uint32_t on
 *                 the two suffixed functions)
 *   fracBits      unsigned scalar (same widths as blockSize)
 *   pRes          non-const, __restrict__-qualified int32_t output pointer
 * Nothing is returned.
 *
 * NOTE(review): bodies are not visible here. By name these presumably
 * compute a fixed-point cosine distance over blockSize elements with
 * fracBits fractional bits; confirm against the sources.
 * NOTE(review): the entry point takes 16-bit blockSize/fracBits while the
 * suffixed functions take 32-bit ones. Verify that the narrower entry-point
 * types are intended.
 */
void plp_cosine_distance_q16(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint16_t blockSize,
    uint16_t fracBits,
    int32_t *__restrict__ pRes);

void plp_cosine_distance_q16s_rv32im(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes);

void plp_cosine_distance_q16s_xpulpv2(
    const int16_t *__restrict__ pSrcA,
    const int16_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t fracBits,
    int32_t *__restrict__ pRes);



#endif // __PLP_MATH_H__

Updated on 2023-03-01 at 16:16:34 +0000