/github/workspace/include/plp_math.h
Public header file for PULP DSP Library. More...
Classes
Name | |
---|---|
struct | plp_dot_prod_instance_i32 Instance structure for integer parallel dot product. |
struct | plp_dot_prod_instance_q32 Instance structure for fixed point parallel dot product. |
struct | plp_dot_prod_instance_f32 Instance structure for float parallel dot product. |
struct | plp_mult_instance_f32 Instance structure for float parallel multiplication. |
struct | plp_log_instance_f32 Instance structure for float parallel log. |
struct | plp_conv_instance_i32 Instance structure for basic integer convolution. |
struct | plp_conv_instance_i16 Instance structure for basic integer convolution. |
struct | plp_conv_instance_i8 Instance structure for basic integer convolution. |
struct | plp_conv_tree_add_instance Instance structure for basic integer convolution. |
struct | plp_cfft_instance_q16 Instance structure for the fixed-point CFFT/CIFFT function. |
struct | plp_cfft_instance_q16_parallel Instance structure for the parallel CFFT Q16. |
struct | plp_cfft_instance_q32 Instance structure for the fixed-point CFFT/CIFFT function. |
struct | plp_cfft_instance_q32_parallel Instance structure for the parallel CFFT Q32. |
struct | plp_cfft_instance_f32 Instance structure for the floating-point CFFT/CIFFT function. |
struct | plp_cfft_instance_f32_parallel Instance structure for floating-point FFT (parallel version) |
struct | plp_fft_instance_f32 Instance structure for floating-point FFT. |
struct | plp_fft_fast_instance_f32 Instance structure for floating-point FFT. |
struct | plp_fft_fast_instance_f32_parallel Instance structure for floating-point FFT. |
struct | plp_fft_instance_f32_parallel Instance structure for floating-point FFT (parallel version) |
struct | plp_triangular_filter_f32 structure containing non-zero values of triangular filterbanks |
struct | Complex_type_f32 Helper type to represent complex values with float32 components. |
struct | plp_mat_mult_instance_i8 Instance structure for integer parallel matrix multiplication. |
struct | plp_mat_mult_instance_i16 Instance structure for integer parallel matrix multiplication. |
struct | plp_mat_mult_instance_i32 Instance structure for integer parallel matrix multiplication. |
struct | plp_mat_mult_instance_f32 Instance structure for floating-point parallel matrix multiplication. |
struct | plp_mat_mult_instance_q8 Instance structure for 8-bit fix-point parallel matrix multiplication. |
struct | plp_mat_mult_instance_q16 Instance structure for 16-bit fix-point parallel matrix multiplication. |
struct | plp_mat_mult_instance_q32 Instance structure for 32-bit fix-point parallel matrix multiplication. |
struct | plp_mat_mult_cmplx_instance_i8 Instance structure for integer parallel complex matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_instance_i16 Instance structure for integer parallel complex matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_instance_i32 Instance structure for integer parallel complex matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_instance_f32 Instance structure for floating-point parallel complex matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_instance_q8 Instance structure for 8-bit fix-point parallel complex matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_instance_q16 Instance structure for 16-bit fix-point parallel complex matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_instance_q32 Instance structure for 32-bit fix-point parallel complex matrix matrix multiplication. |
struct | plp_mat_add_instance_i8 Instance structure for integer parallel matrix addition. |
struct | plp_mat_add_instance_i16 Instance structure for integer parallel matrix addition. |
struct | plp_mat_add_instance_i32 Instance structure for integer parallel matrix addition. |
struct | plp_mat_add_instance_f32 Instance structure for floating-point parallel matrix addition. |
struct | plp_mat_sub_instance_i8 Instance structure for integer parallel matrix subtraction. |
struct | plp_mat_sub_instance_i16 Instance structure for integer parallel matrix subtraction. |
struct | plp_mat_sub_instance_i32 Instance structure for integer parallel matrix subtraction. |
struct | plp_mat_sub_instance_f32 Instance structure for floating-point parallel matrix subtraction. |
struct | plp_mat_scale_instance_i8 Instance structure for integer parallel matrix scale. |
struct | plp_mat_scale_instance_i16 Instance structure for integer parallel matrix scale. |
struct | plp_mat_scale_instance_i32 Instance structure for integer parallel matrix scale. |
struct | plp_mat_scale_instance_f32 Instance structure for floating-point parallel matrix scale. |
struct | plp_mat_trans_instance_i8 Instance structure for integer parallel matrix transpose. |
struct | plp_mat_trans_instance_i16 Instance structure for integer parallel matrix transpose. |
struct | plp_mat_trans_instance_i32 Instance structure for integer parallel matrix transpose. |
struct | plp_mat_fill_I_instance_i8 Instance structure for integer parallel identity matrix creation. |
struct | plp_mat_fill_I_instance_i16 Instance structure for integer parallel identity matrix creation. |
struct | plp_mat_fill_I_instance_i32 Instance structure for integer parallel identity matrix creation. |
struct | plp_mat_fill_I_instance_f32 Instance structure for floating-point parallel identity matrix creation. |
struct | plp_mat_fill_I_instance_q8 Instance structure for fix-point parallel identity matrix creation. |
struct | plp_mat_fill_I_instance_q16 Instance structure for fix-point parallel identity matrix creation. |
struct | plp_mat_fill_I_instance_q32 Instance structure for fix-point parallel identity matrix creation. |
struct | plp_mat_inv_instance_f32 Instance structure for floating-point parallel matrix inversion. |
struct | plp_mat_mult_stride_instance_i8 Instance structure for strided integer parallel matrix multiplication. |
struct | plp_mat_mult_stride_instance_i16 Instance structure for strided integer parallel matrix multiplication. |
struct | plp_mat_mult_stride_instance_i32 Instance structure for strided integer parallel matrix multiplication. |
struct | plp_mat_mult_stride_instance_f32 Instance structure for strided floating-point parallel matrix multiplication. |
struct | plp_mat_mult_stride_instance_q8 Instance structure for strided 8-bit fix-point parallel matrix multiplication. |
struct | plp_mat_mult_stride_instance_q16 Instance structure for strided 16-bit fix-point parallel matrix multiplication. |
struct | plp_mat_mult_stride_instance_q32 Instance structure for strided 32-bit fix-point parallel matrix multiplication. |
struct | plp_mat_mult_cmplx_stride_instance_i8 Instance structure for integer parallel complex strided matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_stride_instance_i16 Instance structure for integer parallel complex strided matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_stride_instance_i32 Instance structure for integer parallel complex strided matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_stride_instance_f32 Instance structure for floating-point parallel complex strided matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_stride_instance_q8 Instance structure for 8-bit fix-point parallel complex strided matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_stride_instance_q16 Instance structure for 16-bit fix-point parallel complex strided matrix matrix multiplication. |
struct | plp_mat_mult_cmplx_stride_instance_q32 Instance structure for 32-bit fix-point parallel complex strided matrix matrix multiplication. |
struct | plp_mat_add_stride_instance_i8 Instance structure for strided integer parallel matrix addition. |
struct | plp_mat_add_stride_instance_i16 Instance structure for strided integer parallel matrix addition. |
struct | plp_mat_add_stride_instance_i32 Instance structure for strided integer parallel matrix addition. |
struct | plp_mat_add_stride_instance_f32 Instance structure for strided floating-point parallel matrix addition. |
struct | plp_mat_sub_stride_instance_i8 Instance structure for strided integer parallel matrix subtraction. |
struct | plp_mat_sub_stride_instance_i16 Instance structure for strided integer parallel matrix subtraction. |
struct | plp_mat_sub_stride_instance_i32 Instance structure for strided integer parallel matrix subtraction. |
struct | plp_mat_sub_stride_instance_f32 Instance structure for strided floating-point parallel matrix subtraction. |
struct | plp_mat_scale_stride_instance_i8 Instance structure for strided integer parallel matrix scale. |
struct | plp_mat_scale_stride_instance_i16 Instance structure for strided integer parallel matrix scale. |
struct | plp_mat_scale_stride_instance_i32 Instance structure for strided integer parallel matrix scale. |
struct | plp_mat_scale_stride_instance_f32 Instance structure for strided floating-point parallel matrix scale. |
struct | plp_mat_fill_I_stride_instance_i8 Instance structure for integer parallel strided identity matrix creation. |
struct | plp_mat_fill_I_stride_instance_i16 Instance structure for integer parallel strided identity matrix creation. |
struct | plp_mat_fill_I_stride_instance_i32 Instance structure for integer parallel strided identity matrix creation. |
struct | plp_mat_fill_I_stride_instance_f32 Instance structure for floating-point parallel strided identity matrix creation. |
struct | plp_mat_fill_I_stride_instance_q8 Instance structure for 8-bit fix-point parallel strided identity matrix creation. |
struct | plp_mat_fill_I_stride_instance_q16 Instance structure for 16-bit fix-point parallel strided identity matrix creation. |
struct | plp_mat_fill_I_stride_instance_q32 Instance structure for 32-bit fix-point parallel strided identity matrix creation. |
struct | plp_mat_fill_stride_instance_i8 Instance structure for filling an integer matrix in parallel. |
struct | plp_mat_fill_stride_instance_i16 Instance structure for filling an integer matrix in parallel. |
struct | plp_mat_fill_stride_instance_i32 Instance structure for filling an integer matrix in parallel. |
struct | plp_mat_fill_stride_instance_f32 Instance structure for filling a floating-point matrix in parallel. |
struct | plp_mat_copy_stride_instance_i8 Instance structure for integer parallel strided matrix copy. |
struct | plp_mat_copy_stride_instance_i16 Instance structure for integer parallel strided matrix copy. |
struct | plp_mat_copy_stride_instance_i32 Instance structure for integer parallel strided matrix copy. |
struct | plp_mat_copy_stride_instance_f32 Instance structure for floating-point parallel strided matrix copy. |
struct | plp_euclidean_distance_instance_f32 Instance structure for float parallel Euclidean distance. |
struct | plp_euclidean_distance_instance_q32 Instance structure for fixed point parallel Euclidean distance. |
struct | plp_cosine_distance_instance_f32 Instance structure for float parallel cosine distance. |
struct | plp_power_instance_q32 Instance structure for fixed point parallel power. |
struct | plp_power_instance_f32 Instance structure for float parallel power. |
struct | plp_dwt_wavelet_f32 |
struct | plp_dwt_wavelet_q32 |
struct | plp_dwt_wavelet_q16 |
struct | plp_dwt_wavelet_q8 |
struct | plp_dwt_instance_f32 Instance structure for float parallel dwt. |
struct | plp_dwt_instance_q32 Instance structure for Q32 parallel dwt. |
struct | plp_dwt_instance_q16 Instance structure for Q16 parallel dwt. |
struct | plp_dwt_instance_q8 Instance structure for Q8 parallel dwt. |
Types
Name | |
---|---|
enum | plp_dwt_wavelet_type { PLP_DWT_WAVELET_OTHER, PLP_DWT_WAVELET_HAAR, PLP_DWT_WAVELET_DB1, PLP_DWT_WAVELET_DB2, PLP_DWT_WAVELET_DB3, PLP_DWT_WAVELET_DB4, PLP_DWT_WAVELET_DB5, PLP_DWT_WAVELET_DB6, PLP_DWT_WAVELET_DB7, PLP_DWT_WAVELET_DB8, PLP_DWT_WAVELET_DB9, PLP_DWT_WAVELET_DB10, PLP_DWT_WAVELET_DB11, PLP_DWT_WAVELET_DB12, PLP_DWT_WAVELET_DB13, PLP_DWT_WAVELET_DB14, PLP_DWT_WAVELET_DB15, PLP_DWT_WAVELET_DB16, PLP_DWT_WAVELET_DB17, PLP_DWT_WAVELET_DB18, PLP_DWT_WAVELET_DB19, PLP_DWT_WAVELET_DB20, PLP_DWT_WAVELET_SYM2, PLP_DWT_WAVELET_SYM3, PLP_DWT_WAVELET_SYM4, PLP_DWT_WAVELET_SYM5, PLP_DWT_WAVELET_SYM6, PLP_DWT_WAVELET_SYM7, PLP_DWT_WAVELET_SYM8, PLP_DWT_WAVELET_SYM9, PLP_DWT_WAVELET_SYM10, PLP_DWT_WAVELET_SYM11, PLP_DWT_WAVELET_SYM12, PLP_DWT_WAVELET_SYM13, PLP_DWT_WAVELET_SYM14, PLP_DWT_WAVELET_SYM15, PLP_DWT_WAVELET_SYM16, PLP_DWT_WAVELET_SYM17, PLP_DWT_WAVELET_SYM18, PLP_DWT_WAVELET_SYM19, PLP_DWT_WAVELET_SYM20, PLP_DWT_WAVELET_COIF1, PLP_DWT_WAVELET_COIF2, PLP_DWT_WAVELET_COIF3, PLP_DWT_WAVELET_COIF4, PLP_DWT_WAVELET_COIF5, PLP_DWT_WAVELET_COIF6, PLP_DWT_WAVELET_COIF7, PLP_DWT_WAVELET_COIF8, PLP_DWT_WAVELET_COIF9, PLP_DWT_WAVELET_COIF10, PLP_DWT_WAVELET_COIF11, PLP_DWT_WAVELET_COIF12, PLP_DWT_WAVELET_COIF13, PLP_DWT_WAVELET_COIF14, PLP_DWT_WAVELET_COIF15, PLP_DWT_WAVELET_COIF16, PLP_DWT_WAVELET_COIF17} |
enum | plp_dwt_extension_mode { PLP_DWT_MODE_ZERO, PLP_DWT_MODE_CONSTANT, PLP_DWT_MODE_SYMMETRIC, PLP_DWT_MODE_REFLECT, PLP_DWT_MODE_PERIODIC, PLP_DWT_MODE_ANTISYMMETRIC, PLP_DWT_MODE_ANTIREFLECT} |
typedef float | float32_t |
Functions
Name | |
---|---|
uint32_t | plp_dwt_max_level(uint32_t sig_len, uint32_t wavelet_len) Computes maximum available decomposition level for a signal length and wavelet length. |
uint32_t | plp_dwt_dec_len(uint32_t sig_len, uint32_t wavelet_len, uint32_t level) Calculates decomposition output length given a level. |
void | plp_dot_prod_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t nPE, int32_t *restrict pRes) Glue code for parallel dot product of 32-bit integer vectors. |
void | plp_dot_prod_q32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, uint32_t nPE, int32_t *restrict pRes) Glue code for parallel dot product of 32-bit fixed point vectors. |
void | plp_dot_prod_f32_parallel(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, uint32_t nPE, float32_t *restrict pRes) Glue code for parallel dot product of 32-bit float vectors. |
void | plp_dot_prod_i32p_xpulpv2(void * S) Parallel dot product with interleaved access of 32-bit integer vectors kernel for XPULPV2 extension. |
void | plp_dot_prod_q32p_xpulpv2(void * S) Parallel dot product with interleaved access of 32-bit fixed point vectors kernel for XPULPV2 extension. |
void | plp_dot_prod_f32p_xpulpv2(void * S) Parallel dot product with interleaved access of 32-bit float vectors kernel for XPULPV2 extension. |
void | plp_dot_prod_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Glue code for dot product of 32-bit integer vectors. |
void | plp_dot_prod_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Scalar dot product of 32-bit integer vectors kernel for RV32IM extension. |
void | plp_dot_prod_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Scalar dot product of 32-bit integer vectors kernel for XPULPV2 extension. |
void | plp_dot_prod_q32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Glue code for dot product of 32-bit fixed point vectors. |
void | plp_dot_prod_q32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Scalar dot product of 32-bit fixed point vectors kernel for RV32IM extension. |
void | plp_dot_prod_q32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Scalar dot product of 32-bit fixed point vectors kernel for XPULPV2 extension. |
void | plp_dot_prod_f32(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) Glue code for dot product of 32-bit float vectors. |
void | plp_dot_prod_f32s_xpulpv2(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) Scalar dot product of 32-bit float vectors kernel for XPULPV2 extension. |
void | plp_dot_prod_f32s_rv32im(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) Scalar dot product of 32-bit float vectors kernel for RV32IM extension. |
void | plp_dot_prod_i16(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t blockSize, int32_t *restrict pRes) Glue code for dot product of 16-bit integer vectors. |
void | plp_dot_prod_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Vectorized dot product of 16-bit integer vectors kernel for RV32IM extension. |
void | plp_dot_prod_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Vectorized dot product of 16-bit integer vectors kernel singlecore for XPULPV2 extension. |
void | plp_dot_prod_q16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Glue code for dot product of 16-bit fixed point vectors. |
void | plp_dot_prod_q16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Scalar dot product of 16-bit fixed point vectors kernel for RV32IM extension. |
void | plp_dot_prod_q16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Vectorized dot product of 16-bit fixed point vectors singlecore kernel for XPULPV2 extension. |
void | plp_dot_prod_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Glue code for dot product of 8-bit integer vectors. |
void | plp_dot_prod_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Vectorized dot product of 8-bit integer vectors kernel for RV32IM extension. |
void | plp_dot_prod_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, int32_t *restrict pRes) Vectorized dot product of 8-bit integer vectors singlecore kernel for XPULPV2 extension. |
void | plp_dot_prod_q8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Glue code for dot product of 8-bit fixed point vectors. |
void | plp_dot_prod_q8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Scalar dot product of 8-bit fixed point vectors kernel for RV32IM extension. |
void | plp_dot_prod_q8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Scalar dot product of 8-bit fixed point vectors singlecore kernel for XPULPV2 extension. |
void | plp_abs_i32(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) Glue code for absolute value of 32-bit integer vectors. |
void | plp_abs_i32s_rv32im(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) Element-by-element absolute value of 32-bit integer vectors kernel for RV32IM extension. |
void | plp_abs_i32s_xpulpv2(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) Element-by-element absolute value of 32-bit integer vectors kernel for XPULPV2 extension. |
void | plp_abs_i16(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) Glue code for absolute value of 16-bit integer vectors. |
void | plp_abs_i16s_rv32im(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) Element-by-element absolute value of 16-bit integer vectors kernel for RV32IM extension. |
void | plp_abs_i16s_xpulpv2(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) Element-by-element absolute value of 16-bit integer vectors kernel for XPULPV2 extension. |
void | plp_abs_i8(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) Glue code for absolute value of 8-bit integer vectors. |
void | plp_abs_i8s_rv32im(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) Element-by-element absolute value of 8-bit integer vectors kernel for RV32IM extension. |
void | plp_abs_i8s_xpulpv2(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) Element-by-element absolute value of 8-bit integer vectors kernel for XPULPV2 extension. |
void | plp_add_i32(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element addition of 32-bit integer vectors. |
void | plp_add_i32s_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 32-bit integer vectors kernel for RV32IM extension. |
void | plp_add_i32s_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 32-bit integer vectors kernel for XPULPV2 extension. |
void | plp_add_i16(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element addition of 16-bit integer vectors. |
void | plp_add_i16s_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 16-bit integer vectors kernel for RV32IM extension. |
void | plp_add_i16s_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 16-bit integer vectors kernel for XPULPV2 extension. |
void | plp_add_i8(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element addition of 8-bit integer vectors. |
void | plp_add_i8s_rv32im(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 8-bit integer vectors kernel for RV32IM extension. |
void | plp_add_i8s_xpulpv2(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element addition of 8-bit integer vectors kernel for XPULPV2 extension. |
void | plp_mult_i32(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element multiplication of 32-bit integer vectors. |
void | plp_mult_i32s_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 32-bit integer vectors kernel for RV32IM extension. |
void | plp_mult_i32s_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 32-bit integer vectors kernel for XPULPV2 extension. |
void | plp_mult_i16(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element multiplication of 16-bit integer vectors. |
void | plp_mult_i16s_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 16-bit integer vectors kernel for RV32IM extension. |
void | plp_mult_i16s_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 16-bit integer vectors kernel for XPULPV2 extension. |
void | plp_mult_i8(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for element-by-element multiplication of 8-bit integer vectors. |
void | plp_mult_i8s_rv32im(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 8-bit integer vectors kernel for RV32IM extension. |
void | plp_mult_i8s_xpulpv2(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 8-bit integer vectors kernel for XPULPV2 extension. |
void | plp_mult_f32(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize) Glue code for element-by-element multiplication of 32-bit float vectors. |
void | plp_mult_f32s_xpulpv2(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize) Element-by-element multiplication of 32-bit float vectors kernel for XPULPV2 extension. |
void | plp_mult_f32_parallel(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, uint32_t nPE, float32_t *restrict pDst) Glue code for parallel multiplication of 32-bit float vectors. |
void | plp_mult_f32p_xpulpv2(void * S) Parallel multiplication with interleaved access of 32-bit float vectors kernel for XPULPV2 extension. |
void | plp_log_f32_parallel(const float32_t *restrict pSrc, uint32_t blockSize, uint32_t nPE, float32_t *restrict pDst) Glue code for parallel log of 32-bit float vectors. |
void | plp_log_f32p_xpulpv2(void * S) Parallel log with interleaved access of 32-bit float vectors kernel for XPULPV2 extension. |
void | plp_negate_i32(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) Glue code for negating the elements of a vector for 32-bit integers. |
void | plp_negate_i32s_rv32im(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) negate the elements of a vector for 32-bit integers on RV32IM |
void | plp_negate_i32s_xpulpv2(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize) negate the elements of a vector for 32-bit integers on XpulpV2 |
void | plp_negate_i16(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) Glue code for negating the elements of a vector for 16-bit integers. |
void | plp_negate_i16s_rv32im(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) negate the elements of a vector for 16-bit integers on RV32IM |
void | plp_negate_i16s_xpulpv2(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize) negate the elements of a vector for 16-bit integers on XpulpV2 |
void | plp_negate_i8(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) Glue code for negating the elements of a vector for 8-bit integers. |
void | plp_negate_i8s_rv32im(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) negate the elements of a vector for 8-bit integers on RV32IM |
void | plp_negate_i8s_xpulpv2(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize) negate the elements of a vector for 8-bit integers on XpulpV2 |
void | plp_negate_f32(const float32_t * pSrc, float32_t * pDst, uint32_t blockSize) Glue code for negating the elements of a vector for 32-bit floats. |
void | plp_negate_f32s_xpulpv2(const float32_t * pSrc, float32_t * pDst, uint32_t blockSize) negate the elements of a vector for 32-bit floats on XpulpV2 |
void | plp_offset_i32(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize) Glue code for adding a constant offset to a vector for 32-bit integers. |
void | plp_offset_i32s_rv32im(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize) add a constant offset to a vector for 32-bit integers on RV32IM |
void | plp_offset_i32s_xpulpv2(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize) add a constant offset to a vector for 32-bit integers on XpulpV2 |
void | plp_offset_i16(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize) Glue code for adding a constant offset to a vector for 16-bit integers. |
void | plp_offset_i16s_rv32im(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize) add a constant offset to a vector for 16-bit integers on RV32IM |
void | plp_offset_i16s_xpulpv2(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize) add a constant offset to a vector for 16-bit integers on XpulpV2 |
void | plp_offset_i8(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize) Glue code for adding a constant offset to a vector for 8-bit integers. |
void | plp_offset_i8s_rv32im(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize) add a constant offset to a vector for 8-bit integers on RV32IM |
void | plp_offset_i8s_xpulpv2(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize) add a constant offset to a vector for 8-bit integers on XpulpV2 |
void | plp_offset_f32(const float32_t * pSrc, float32_t offset, float32_t * pDst, uint32_t blockSize) Glue code for adding a constant offset to a vector for 32-bit floats. |
void | plp_offset_f32s_xpulpv2(const float32_t * pSrc, float32_t offset, float32_t * pDst, uint32_t blockSize) add a constant offset to a vector for 32-bit floats on XpulpV2 |
void | plp_sub_i32(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for vector subtraction for 32-bit integers. |
void | plp_sub_i32s_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 32-bit integers on RV32IM |
void | plp_sub_i32s_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 32-bit integers on XpulpV2 |
void | plp_sub_i16(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for vector subtraction for 16-bit integers. |
void | plp_sub_i16s_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 16-bit integers on RV32IM |
void | plp_sub_i16s_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 16-bit integers on XpulpV2 |
void | plp_sub_i8(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) Glue code for vector subtraction for 8-bit integers. |
void | plp_sub_i8s_rv32im(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 8-bit integers on RV32IM |
void | plp_sub_i8s_xpulpv2(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize) vector subtraction for 8-bit integers on XpulpV2 |
void | plp_sub_f32(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize) Glue code for vector subtraction for 32-bit floats. |
void | plp_sub_f32s_xpulpv2(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize) vector subtraction for 32-bit floats on XpulpV2 |
void | plp_scale_i32(const int32_t *restrict pSrc, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst, uint32_t blockSize) Glue code for multiplying a vector by a scalar for 32-bit integers. |
void | plp_scale_i32s_rv32im(const int32_t *restrict pSrc, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 32-bit integers on RV32IM |
void | plp_scale_i32s_xpulpv2(const int32_t *restrict pSrc, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 32-bit integers on XpulpV2 |
void | plp_scale_i16(const int16_t *restrict pSrc, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst, uint32_t blockSize) Glue code for multiplying a vector by a scalar for 16-bit integers. |
void | plp_scale_i16s_rv32im(const int16_t *restrict pSrc, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 16-bit integers on RV32IM |
void | plp_scale_i16s_xpulpv2(const int16_t *restrict pSrc, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 16-bit integers on XpulpV2 |
void | plp_scale_i8(const int8_t *restrict pSrc, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst, uint32_t blockSize) Glue code for multiplying a vector by a scalar for 8-bit integers. |
void | plp_scale_i8s_rv32im(const int8_t *restrict pSrc, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 8-bit integers on RV32IM |
void | plp_scale_i8s_xpulpv2(const int8_t *restrict pSrc, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 8-bit integers on XpulpV2 |
void | plp_scale_f32(const float32_t *restrict pSrc, float32_t scaleFactor, float32_t *restrict pDst, uint32_t blockSize) Glue code for multiplying a vector by a scalar for 32-bit floats. |
void | plp_scale_f32s_xpulpv2(const float32_t *restrict pSrc, float32_t scaleFactor, float32_t *restrict pDst, uint32_t blockSize) multiply a vector by a scalar for 32-bit floats on XpulpV2 |
void | plp_fill_i32(int32_t value, int32_t *restrict pDst, uint32_t blockSize) Glue code for filling a constant value into a 32-bit integer vector. |
void | plp_fill_i32s_rv32im(int32_t value, int32_t *restrict pDst, uint32_t blockSize) Fills a constant value into a 32-bit integer vector for RV32IM extension. |
void | plp_fill_i32s_xpulpv2(int32_t value, int32_t *restrict pDst, uint32_t blockSize) Fills a constant value into a 32-bit integer vector for XPULPV2 extension. |
void | plp_copy_i32(int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t blockSize) Glue code for copying the elements of a 32-bit integer vector. |
void | plp_copy_i32s_rv32im(int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t blockSize) Copies the elements of a 32-bit integer vector for RV32IM extension. |
void | plp_copy_i32s_xpulpv2(int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t blockSize) Copies the elements of a 32-bit integer vector for XPULPV2 extension. |
void | plp_copy_f32(float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t blockSize) Glue code for copying the elements of a 32-bit float vector. |
void | plp_copy_f32s_xpulpv2(float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t blockSize) Copies the elements of a 32-bit float vector for XPULPV2 extension. |
void | plp_copy_f32s_rv32im(float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t blockSize) Copies the elements of a 32-bit float vector for RV32IM extension. |
void | plp_mean_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for mean value of a 32-bit float vector. |
void | plp_mean_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Mean value of a 32-bit float vector for XPULPV2 extension. |
void | plp_mean_i32(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for mean value of a 32-bit integer vector. |
void | plp_mean_i32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Mean value of a 32-bit integer vector for RV32IM extension. |
void | plp_mean_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Mean value of a 32-bit integer vector for XPULPV2 extension. |
void | plp_mean_i16(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Glue code for mean value of a 16-bit integer vector. |
void | plp_mean_i16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Mean value of a 16-bit integer vector for RV32IM extension. |
void | plp_mean_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Mean value of a 16-bit integer vector for XPULPV2 extension. |
void | plp_mean_i8(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Glue code for mean value of an 8-bit integer vector. |
void | plp_mean_i8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Mean value of an 8-bit integer vector for RV32IM extension. |
void | plp_mean_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Mean value of an 8-bit integer vector for XPULPV2 extension. |
void | plp_max_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for max value of a 32-bit float vector. |
void | plp_max_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for max value of a 32-bit float vector. |
void | plp_max_i32(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for max value of a 32-bit integer vector. |
void | plp_max_i32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Max value of a 32-bit integer vector for RV32IM extension. |
void | plp_max_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Max value of a 32-bit integer vector for XPULPV2 extension. |
void | plp_max_i16(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Glue code for max value of a 16-bit integer vector. |
void | plp_max_i16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Max value of a 16-bit integer vector for RV32IM extension. |
void | plp_max_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Max value of a 16-bit integer vector for XPULPV2 extension. |
void | plp_max_i8(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Glue code for max value of an 8-bit integer vector. |
void | plp_max_i8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Max value of an 8-bit integer vector for RV32IM extension. |
void | plp_max_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Max value of an 8-bit integer vector for XPULPV2 extension. |
void | plp_min_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for min value of a 32-bit float vector. |
void | plp_min_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for min value of a 32-bit float vector. |
void | plp_min_i32(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for min value of a 32-bit integer vector. |
void | plp_min_i32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Min value of a 32-bit integer vector for RV32IM extension. |
void | plp_min_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Min value of a 32-bit integer vector for XPULPV2 extension. |
void | plp_min_i16(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Glue code for min value of a 16-bit integer vector. |
void | plp_min_i16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Min value of a 16-bit integer vector for RV32IM extension. |
void | plp_min_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, int16_t *restrict pRes) Min value of a 16-bit integer vector for XPULPV2 extension. |
void | plp_min_i8(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Glue code for min value of an 8-bit integer vector. |
void | plp_min_i8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Min value of an 8-bit integer vector for RV32IM extension. |
void | plp_min_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, int8_t *restrict pRes) Min value of an 8-bit integer vector for XPULPV2 extension. |
void | plp_power_f32_parallel(const float32_t *restrict pSrc, uint32_t blockSize, uint32_t nPE, float32_t *restrict pRes) Glue code for parallel power of 32-bit floating point vectors. |
void | plp_power_f32p_xpulpv2(void * S) Parallel sum of squares of a 32-bit float vector for XPULPV2 extension. |
void | plp_power_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for Sum of squares of a 32-bit float vector. |
void | plp_power_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for Sum of squares of a 32-bit float vector. |
void | plp_power_f32s_rv32im(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Sum of squares of a 32-bit float vector for RV32IM. |
void | plp_power_i32(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for Sum of squares of a 32-bit integer vector. |
void | plp_power_i32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of a 32-bit integer vector for RV32IM extension. |
void | plp_power_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of a 32-bit integer vector for XPULPV2 extension. |
void | plp_power_i16(const int16_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for Sum of squares of a 16-bit integer vector. |
void | plp_power_i16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of a 16-bit integer vector for RV32IM extension. |
void | plp_power_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of a 16-bit integer vector for XPULPV2 extension. |
void | plp_power_i8(const int8_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Glue code for Sum of squares of an 8-bit integer vector. |
void | plp_power_i8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of an 8-bit integer vector for RV32IM extension. |
void | plp_power_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, int32_t *restrict pRes) Sum of squares of an 8-bit integer vector for XPULPV2 extension. |
void | plp_power_q32_parallel(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, uint32_t nPE, int32_t *restrict pRes) Glue code for parallel power of 32-bit fixed point vectors. |
void | plp_power_q32p_xpulpv2(void * S) Parallel sum of squares of a 32-bit fixed-point vector for XPULPV2 extension. |
void | plp_power_q32(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Sum of squares of a 32-bit fixed point vector. |
void | plp_power_q32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of a 32-bit fixed point vector for RV32IM extension. |
void | plp_power_q32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of a 32-bit fixed point vector for XPULPV2 extension. |
void | plp_power_q16(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Sum of squares of a 16-bit fixed point vector. |
void | plp_power_q16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of a 16-bit fixed point vector for RV32IM extension. |
void | plp_power_q16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of a 16-bit fixed point vector for XPULPV2 extension. |
void | plp_power_q8(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Sum of squares of an 8-bit fixed point vector. |
void | plp_power_q8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of an 8-bit fixed point vector for RV32IM extension. |
void | plp_power_q8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Sum of squares of an 8-bit fixed point vector for XPULPV2 extension. |
void | plp_var_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for Statistical variance of a 32-bit float vector. |
void | plp_var_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for Statistical variance of a 32-bit float vector. |
void | plp_var_q32(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Statistical variance of a 32-bit fixed point vector. |
void | plp_var_q32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Statistical variance of a 32-bit fixed point vector for RV32IM extension. |
void | plp_var_q32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Statistical variance of a 32-bit fixed point vector for XPULPV2 extension. |
void | plp_var_q16(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Glue code for Statistical variance of a 16-bit fixed point vector. |
void | plp_var_q16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Statistical variance of a 16-bit fixed point vector for RV32IM extension. |
void | plp_var_q16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Statistical variance of a 16-bit fixed point vector for XPULPV2 extension. |
void | plp_var_q8(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Glue code for Statistical variance of an 8-bit fixed point vector. |
void | plp_var_q8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Statistical variance of an 8-bit fixed point vector for RV32IM extension. |
void | plp_var_q8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Statistical variance of an 8-bit fixed point vector for XPULPV2 extension. |
void | plp_std_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for Statistical standard deviation of a 32-bit floating point vector. |
void | plp_std_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for Statistical standard deviation of a 32-bit float vector. |
void | plp_std_q32(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Statistical standard deviation of a 32-bit fixed point vector. |
void | plp_std_q32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Statistical standard deviation of a 32-bit fixed point vector for RV32IM extension. |
void | plp_std_q32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Statistical standard deviation of a 32-bit fixed point vector for XPULPV2 extension. |
void | plp_std_q16(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Glue code for Statistical standard deviation of a 16-bit fixed point vector. |
void | plp_std_q16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Statistical standard deviation of a 16-bit fixed point vector for RV32IM extension. |
void | plp_std_q16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Statistical standard deviation of a 16-bit fixed point vector for XPULPV2 extension. |
void | plp_std_q8(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Glue code for Statistical standard deviation of an 8-bit fixed point vector. |
void | plp_std_q8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Statistical standard deviation of an 8-bit fixed point vector for RV32IM extension. |
void | plp_std_q8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Statistical standard deviation of an 8-bit fixed point vector for XPULPV2 extension. |
void | plp_rms_f32(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Glue code for root mean square (RMS) of a 32-bit floating point vector. |
void | plp_rms_f32s_xpulpv2(const float *restrict pSrc, uint32_t blockSize, float *restrict pRes) Kernel for root mean square (RMS) of a 32-bit float vector. |
void | plp_rms_q32(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for root mean square (RMS) of a 32-bit fixed point vector. |
void | plp_rms_q32s_rv32im(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Root mean square (RMS) of a 32-bit fixed point vector for RV32IM extension. |
void | plp_rms_q32s_xpulpv2(const int32_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Root mean square (RMS) of a 32-bit fixed point vector for XPULPV2 extension. |
void | plp_rms_q16(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Glue code for root mean square (RMS) of a 16-bit fixed point vector. |
void | plp_rms_q16s_rv32im(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Root mean square (RMS) of a 16-bit fixed point vector for RV32IM extension. |
void | plp_rms_q16s_xpulpv2(const int16_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int16_t *restrict pRes) Root mean square (RMS) of a 16-bit fixed point vector for XPULPV2 extension. |
void | plp_rms_q8(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Glue code for root mean square (RMS) of an 8-bit fixed point vector. |
void | plp_rms_q8s_rv32im(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Root mean square (RMS) of an 8-bit fixed point vector for RV32IM extension. |
void | plp_rms_q8s_xpulpv2(const int8_t *restrict pSrc, uint32_t blockSize, uint32_t fracBits, int8_t *restrict pRes) Root mean square (RMS) of an 8-bit fixed point vector for XPULPV2 extension. |
void | plp_sqrt_q32(const int32_t *restrict pSrc, const uint32_t fracBits, int32_t *restrict pRes) Glue code for square root of a 32-bit fixed point number. |
void | plp_sqrt_q32s_rv32im(const int32_t *restrict pSrc, const uint32_t fracBits, int32_t *restrict pRes) Square root of a 32-bit fixed point number for RV32IM extension. |
void | plp_sqrt_q32s_xpulpv2(const int32_t *restrict pSrc, const uint32_t fracBits, int32_t *restrict pRes) Square root of a 32-bit fixed point number for XPULPV2 extension. |
void | plp_sqrt_q16(const int16_t *restrict pSrc, const uint32_t fracBits, int16_t *restrict pRes) Glue code for square root of a 16-bit fixed point number. |
void | plp_sqrt_q16s_rv32im(const int16_t *restrict pSrc, const uint32_t fracBits, int16_t *restrict pRes) Square root of a 16-bit fixed point number for RV32IM extension. |
void | plp_sqrt_q16s_xpulpv2(const int16_t *restrict pSrc, const uint32_t fracBits, int16_t *restrict pRes) Square root of a 16-bit fixed point number for XPULPV2 extension. |
void | plp_sqrt_f32(const float *restrict pSrc, float *restrict pRes) Glue code for square root of a 32-bit floating point number. |
void | plp_sqrt_f32s_rv32im(const float *restrict pSrc, float *restrict pRes) Square root of a 32-bit floating point number for RV32IM. |
void | plp_sqrt_f32s_xpulpv2(const float *restrict pSrc, float *restrict pRes) Kernel for square root of a 32-bit floating point number. |
int32_t | plp_cos_q32(int32_t x) Glue code for q32 cosine function. |
int32_t | plp_cos_q32s_rv32im(int32_t x) q32 cosine function for RV32IM |
int32_t | plp_cos_q32s_xpulpv2(int32_t x) q32 cosine function for XPULPV2 |
int16_t | plp_cos_q16(int16_t x) Glue code for q16 cosine function. |
int16_t | plp_cos_q16s_rv32im(int16_t x) q16 cosine function for RV32IM |
int16_t | plp_cos_q16s_xpulpv2(int16_t x) q16 cosine function for XPULPV2 |
float32_t | plp_cos_f32(float32_t x) Glue code for f32 cosine function. |
float32_t | plp_cos_f32s_xpulpv2(float32_t x) F32 cosine function for XPULPV2. |
int32_t | plp_sin_q32(int32_t x) Glue code for q32 sine function. |
int32_t | plp_sin_q32s_rv32im(int32_t x) q32 sine function for RV32IM |
int32_t | plp_sin_q32s_xpulpv2(int32_t x) q32 sine function for XPULPV2 |
int16_t | plp_sin_q16(int16_t x) Glue code for q16 sine function. |
int16_t | plp_sin_q16s_rv32im(int16_t x) q16 sine function for RV32IM |
int16_t | plp_sin_q16s_xpulpv2(int16_t x) q16 sine function for XPULPV2 |
float32_t | plp_sin_f32(float32_t x) Glue code for f32 sine function. |
float32_t | plp_sin_f32s_xpulpv2(float32_t x) F32 sine function for XPULPV2. |
void | plp_correlate_i32(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for correlation of 32-bit integer vectors. |
void | plp_correlate_i32s_rv32im(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Correlation of 32-bit integer vectors kernel for RV32IM extension. |
void | plp_correlate_i32s_xpulpv2(const int32_t *restrict pSrcA, const uint32_t srcALen, const int32_t *restrict pSrcB, const uint32_t srcBLen, int32_t *restrict pRes) Correlation of 32-bit integer vectors kernel for XPULPV2 extension. |
void | plp_correlate_i16(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for correlation of 16-bit integer vectors. |
void | plp_correlate_i16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Correlation of 16-bit integer vectors kernel for XPULPV2 extension. |
void | plp_correlate_i16s_rv32im(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Correlation of 16-bit integer vectors kernel for RV32IM extension. |
void | plp_correlate_i8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for correlation of 8-bit integer vectors. |
void | plp_correlate_valid_i8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for correlation (valid) of 8-bit integer vectors. |
void | plp_correlate_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Correlation of 8-bit integer vectors kernel for XPULPV2 extension. |
void | plp_correlate_i8s_rv32im(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Correlation of 8-bit integer vectors kernel for RV32IM extension. |
void | plp_correlate_q32(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Glue code for correlation of 32-bit fixed point vectors. |
void | plp_correlate_q32s_rv32im(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Correlation of 32-bit fixed point vectors kernel for RV32IM extension. |
void | plp_correlate_q32s_xpulpv2(const int32_t *restrict pSrcA, const uint32_t srcALen, const int32_t *restrict pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t *restrict pRes) Correlation of 32-bit fixed point vectors kernel for XPULPV2 extension. |
void | plp_correlate_q16(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Glue code for correlation of 16-bit fixed point vectors. |
void | plp_correlate_q16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Correlation of 16-bit fixed point vectors kernel for XPULPV2 extension. |
void | plp_correlate_q16s_rv32im(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Correlation of 16-bit fixed point vectors kernel for RV32IM extension. |
void | plp_correlate_q8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Glue code for correlation of 8-bit fixed point vectors. |
void | plp_correlate_valid_q8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Glue code for correlation (valid) of 8-bit fixed point vectors. |
void | plp_correlate_q8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Correlation of 8-bit fixed point vectors kernel for XPULPV2 extension. |
void | plp_correlate_q8s_rv32im(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, const uint32_t fracBits, int32_t * pRes) Correlation of 8-bit fixed point vectors kernel for RV32IM extension. |
void | plp_conv_i32(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution of 32-bit integer vectors. |
void | plp_conv_valid_i32(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution (valid) of 32-bit integer vectors. |
void | plp_conv_i32s_rv32im(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 32-bit integer vectors kernel for RV32IM extension. |
void | plp_conv_i32s_xpulpv2(const int32_t *restrict pSrcA, const uint32_t srcALen, const int32_t *restrict pSrcB, const uint32_t srcBLen, int32_t *restrict pRes) Convolution of 32-bit integer vectors kernel for XPULPV2 extension. |
void | plp_conv_valid_i32s_xpulpv2(const int32_t *restrict pSrcA, const uint32_t srcALen, const int32_t *restrict pSrcB, const uint32_t srcBLen, int32_t *restrict pRes) Convolution (valid) of 32-bit integer vectors kernel for XPULPV2 extension. |
void | plp_conv_i16(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution of 16-bit integer vectors. |
void | plp_conv_valid_i16(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution (valid) of 16-bit integer vectors. |
void | plp_conv_valid_rep_i16(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution (valid with replication) of 16-bit integer vectors. |
void | plp_conv_i16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 16-bit integer vectors kernel for XPULPV2 extension. |
void | plp_conv_valid_i16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution (valid) of 16-bit integer vectors kernel for XPULPV2 extension. |
void | plp_conv_valid_rep_i16s_xpulpv2(const int16_t * pSrcA, const uint32_t srcALen, const uint32_t srcAMem, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution (valid with data replication) of 16-bit integer vectors kernel for XPULPV2 extension. |
void | plp_conv_i16s_rv32im(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 16-bit integer vectors kernel for RV32IM extension. |
void | plp_conv_i8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution of 8-bit integer vectors. |
void | plp_conv_valid_i8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution (valid) of 8-bit integer vectors. |
void | plp_conv_valid_rep_i8(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Glue code for convolution (valid with data replication) of 8-bit integer vectors. |
void | plp_conv_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 8-bit integer vectors kernel for XPULPV2 extension. |
void | plp_conv_valid_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution (valid) of 8-bit integer vectors kernel for XPULPV2 extension. |
void | plp_conv_valid_rep_i8s_xpulpv2(const int8_t * pSrcA, const uint32_t srcALen, const uint32_t srcAMem, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution (valid with data replication) of 8-bit integer vectors kernel for XPULPV2 extension. |
void | plp_conv_i8s_rv32im(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, int32_t * pRes) Convolution of 8-bit integer vectors kernel for RV32IM extension. |
void | plp_conv_i32_parallel(const int32_t * pSrcA, const uint32_t srcALen, const int32_t * pSrcB, const uint32_t srcBLen, const uint8_t nPE, int32_t * pRes) Glue code for parallel convolution of 32-bit integer vectors. |
void | plp_conv_i32p_xpulpv2(void * task_args) Setup code for parallel convolution of 32-bit integer vectors. |
void | plp_conv_i16_parallel(const int16_t * pSrcA, const uint32_t srcALen, const int16_t * pSrcB, const uint32_t srcBLen, const uint8_t nPE, int32_t * pRes) Glue code for parallel convolution of 16-bit integer vectors. |
void | plp_conv_i16p_xpulpv2(void * task_args) Setup code for parallel convolution of 16-bit integer vectors. |
void | plp_conv_i8_parallel(const int8_t * pSrcA, const uint32_t srcALen, const int8_t * pSrcB, const uint32_t srcBLen, const uint8_t nPE, int32_t * pRes) Glue code for parallel convolution of 8-bit integer vectors. |
void | plp_conv_i8p_xpulpv2(void * task_args) Setup code for parallel convolution of 8-bit integer vectors. |
void | plp_conv_parallel_OLA(uint32_t nPE, uint32_t srcALen, uint32_t srcBLen, int32_t * resultsBuffer) Helper function for parallelized overlap-adding of partial convolution results. |
void | plp_conv_parallel_OLA_kernel(void * task_args) Helper function for parallelized overlap-adding of partial convolution results. |
void | plp_mat_mult_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix-matrix multiplication of 32-bit integer matrices. |
void | plp_mat_mult_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 32-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix-matrix multiplication of 16-bit integer matrices. |
void | plp_mat_mult_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 16-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix-matrix multiplication of 8-bit integer matrices. |
void | plp_mat_mult_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 8-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix-matrix multiplication of 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix-matrix multiplication of 32-bit integer matrices. |
void | plp_mat_mult_i32p_xpulpv2(void * args) Parallel matrix-matrix multiplication of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix-matrix multiplication of 16-bit integer matrices. |
void | plp_mat_mult_i16p_xpulpv2(void * args) Parallel matrix multiplication of 16-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix-matrix multiplication of 8-bit integer matrices. |
void | plp_mat_mult_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Glue code for matrix matrix multiplication of a 32-bit floating-point matrices. |
void | plp_mat_mult_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Matrix matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension. |
void | plp_mat_mult_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, float *restrict pDstC) Glue code for parallel matrix matrix multiplication of a 32-bit floating-point matrices. |
void | plp_mat_mult_f32p_xpulpv2(void * args) Parallel matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_i8p_xpulpv2(void * args) Parallel matrix multiplication of 8-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Glue code for matrix matrix multiplication of a 32-bit fix-point matrices. |
void | plp_mat_mult_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix matrix multiplication of a 32-bit fix-point matrices. |
void | plp_mat_mult_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Matrix matrix multiplication of a 32-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Matrix matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_q32p_xpulpv2(void * args) Parallel matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Glue code for matrix matrix multiplication of a 16-bit fix-point matrices. |
void | plp_mat_mult_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code for parallel matrix matrix multiplication of a 16-bit fix-point matrices. |
void | plp_mat_mult_q16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Matrix matrix multiplication of a 16-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_q16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Matrix matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_q16p_xpulpv2(void * args) Parallel matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Glue code for matrix matrix multiplication of a 8-bit fix-point matrices. |
void | plp_mat_mult_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code for parallel matrix matrix multiplication of a 8-bit fix-point matrices. |
void | plp_mat_mult_q8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Matrix matrix multiplication of a 8-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_q8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Matrix matrix multiplication of a 8-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_q8p_xpulpv2(void * args) Parallel matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_cmplx_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 32-bit integers. |
void | plp_mat_mult_cmplx_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 32-bit integers on RV32IM. |
void | plp_mat_mult_cmplx_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 32-bit integers on XpulpV2. |
void | plp_mat_mult_cmplx_i32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 32-bit integers. |
void | plp_mat_mult_cmplx_i32p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 32-bit integers on XpulpV2 |
void | plp_mat_mult_cmplx_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 16-bit integers. |
void | plp_mat_mult_cmplx_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 16-bit integers on RV32IM. |
void | plp_mat_mult_cmplx_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 16-bit integers on XpulpV2. |
void | plp_mat_mult_cmplx_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 16-bit integers. |
void | plp_mat_mult_cmplx_i16p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 16-bit integers on XpulpV2 |
void | plp_mat_mult_cmplx_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 8-bit integers. |
void | plp_mat_mult_cmplx_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 8-bit integers on RV32IM. |
void | plp_mat_mult_cmplx_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix matrix multiplication for complex 8-bit integers on XpulpV2. |
void | plp_mat_mult_cmplx_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 8-bit integers. |
void | plp_mat_mult_cmplx_i8p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 8-bit integers on XpulpV2 |
void | plp_mat_mult_cmplx_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Glue code of matrix matrix multiplication for complex 32-bit floats. |
void | plp_mat_mult_cmplx_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Matrix matrix multiplication for complex 32-bit floats on XpulpV2. |
void | plp_mat_mult_cmplx_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, float *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 32-bit floats. |
void | plp_mat_mult_cmplx_f32p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 32-bit floats on XpulpV2 |
void | plp_mat_mult_cmplx_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 32-bit fix-point. |
void | plp_mat_mult_cmplx_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Matrix matrix multiplication for complex 32-bit fix-point on RV32IM. |
void | plp_mat_mult_cmplx_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Matrix matrix multiplication for complex 32-bit fix-point on XpulpV2. |
void | plp_mat_mult_cmplx_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 32-bit fix-point. |
void | plp_mat_mult_cmplx_q32p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 32-bit fix-point on XpulpV2 |
void | plp_mat_mult_cmplx_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 16-bit fix-point. |
void | plp_mat_mult_cmplx_q16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Matrix matrix multiplication for complex 16-bit fix-point on RV32IM. |
void | plp_mat_mult_cmplx_q16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Matrix matrix multiplication for complex 16-bit fix-point on XpulpV2. |
void | plp_mat_mult_cmplx_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 16-bit fix-point. |
void | plp_mat_mult_cmplx_q16p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 16-bit fix-point on XpulpV2 |
void | plp_mat_mult_cmplx_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Glue code of matrix matrix multiplication for complex 8-bit fix-point. |
void | plp_mat_mult_cmplx_q8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Matrix matrix multiplication for complex 8-bit fix-point on RV32IM. |
void | plp_mat_mult_cmplx_q8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Matrix matrix multiplication for complex 8-bit fix-point on XpulpV2. |
void | plp_mat_mult_cmplx_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code of parallel matrix matrix multiplication for complex 8-bit fix-point. |
void | plp_mat_mult_cmplx_q8p_xpulpv2(void * args) parallel matrix matrix multiplication for complex 8-bit fix-point on XpulpV2 |
void | plp_mat_mult_trans_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 32-bit integer matrices. |
void | plp_mat_mult_trans_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 32-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_trans_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 16-bit integer matrices. |
void | plp_mat_mult_trans_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 16-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_trans_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 8-bit integer matrices. |
void | plp_mat_mult_trans_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 8-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_trans_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_i32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 32-bit integer matrices. |
void | plp_mat_mult_trans_i32p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 16-bit integer matrices. |
void | plp_mat_mult_trans_i16p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 8-bit integer matrices. |
void | plp_mat_mult_trans_i8p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 32-bit fix-point matrices. |
void | plp_mat_mult_trans_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 32-bit fix-point matrices. |
void | plp_mat_mult_trans_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) matrix transposed matrix multiplication of a 32-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_trans_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) matrix transposed matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_q32p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_trans_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 16-bit fix-point matrices. |
void | plp_mat_mult_trans_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 16-bit fix-point matrices. |
void | plp_mat_mult_trans_q16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) matrix transposed matrix multiplication of a 16-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_trans_q16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) matrix transposed matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_q16p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_trans_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 8-bit fix-point matrices. |
void | plp_mat_mult_trans_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 8-bit fix-point matrices. |
void | plp_mat_mult_trans_q8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) matrix transposed matrix multiplication of a 8-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_trans_q8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) matrix transposed matrix multiplication of a 8-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_q8p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_trans_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Glue code for matrix transposed matrix multiplication of a 32-bit floating-point matrices. |
void | plp_mat_mult_trans_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) matrix transposed matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, float *restrict pDstC) Glue code for parallel matrix transposed matrix multiplication of a 32-bit floating-point matrices. |
void | plp_mat_mult_trans_f32p_xpulpv2(void * args) Parallel matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_trans_cmplx_i32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 32-bit integers. |
void | plp_mat_mult_trans_cmplx_i32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 32-bit integers on RV32IM |
void | plp_mat_mult_trans_cmplx_i32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_i32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 32-bit integers. |
void | plp_mat_mult_trans_cmplx_i32p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 16-bit integers. |
void | plp_mat_mult_trans_cmplx_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 16-bit integers on RV32IM |
void | plp_mat_mult_trans_cmplx_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_i16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 16-bit integers. |
void | plp_mat_mult_trans_cmplx_i16p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 8-bit integers. |
void | plp_mat_mult_trans_cmplx_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 8-bit integers on RV32IM |
void | plp_mat_mult_trans_cmplx_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_i8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 8-bit integers. |
void | plp_mat_mult_trans_cmplx_i8p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_f32(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 32-bit floats. |
void | plp_mat_mult_trans_cmplx_f32s_xpulpv2(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, float *restrict pDstC) matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2 |
void | plp_mat_mult_trans_cmplx_f32_parallel(const float restrict pSrcA, const float restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t nPE, float *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 32-bit floats. |
void | plp_mat_mult_trans_cmplx_f32p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2 |
void | plp_mat_mult_trans_cmplx_q32(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 32-bit fix-point. |
void | plp_mat_mult_trans_cmplx_q32s_rv32im(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM |
void | plp_mat_mult_trans_cmplx_q32s_xpulpv2(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int32_t *restrict pDstC) matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_q32_parallel(const int32_t restrict pSrcA, const int32_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 32-bit fix-point. |
void | plp_mat_mult_trans_cmplx_q32p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_q16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 16-bit fix-point. |
void | plp_mat_mult_trans_cmplx_q16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM |
void | plp_mat_mult_trans_cmplx_q16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int16_t *restrict pDstC) matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_q16_parallel(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 16-bit fix-point. |
void | plp_mat_mult_trans_cmplx_q16p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_q8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) Glue code of matrix transpose matrix multiplication for complex 8-bit fix-point. |
void | plp_mat_mult_trans_cmplx_q8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM |
void | plp_mat_mult_trans_cmplx_q8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, int8_t *restrict pDstC) matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_q8_parallel(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code of parallel matrix transpose matrix multiplication for complex 8-bit fix-point. |
void | plp_mat_mult_trans_cmplx_q8p_xpulpv2(void * args) parallel matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2 |
void | plp_cmplx_mag_f32(const float32_t * pSrc, float32_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in float32. |
void | plp_cmplx_mag_f32s_xpulpv2(const float32_t * pSrc, float32_t * pRes, uint32_t numSamples) complex magnitude for float32 on XPULPV2 |
void | plp_cmplx_mag_q32(const int32_t * pSrc, const uint32_t fracBits, int32_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation for 32 bit fixpoint. |
void | plp_cmplx_mag_q32s_rv32im(const int32_t * pSrc, const uint32_t fracBits, int32_t * pRes, uint32_t numSamples) complex magnitude for q32 on RV32IM |
void | plp_cmplx_mag_q32s_xpulpv2(const int32_t * pSrc, const uint32_t fracBits, int32_t * pRes, uint32_t numSamples) complex magnitude for q32 on XPULPV2 |
void | plp_cmplx_mag_q8(const int8_t * pSrc, const uint32_t fracBits, int8_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation for 8 bit fixpoint. |
void | plp_cmplx_mag_q8s_rv32im(const int8_t * pSrc, const uint32_t fracBits, int8_t * pRes, uint32_t numSamples) complex magnitude for q8 on RV32IM |
void | plp_cmplx_mag_q8s_xpulpv2(const int8_t * pSrc, const uint32_t fracBits, int8_t * pRes, uint32_t numSamples) complex magnitude for q8 on XPULPV2 |
void | plp_cmplx_mag_i16(const int16_t * pSrc, int16_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in 16-bit integer. |
void | plp_cmplx_mag_i16s_rv32im(const int16_t * pSrc, int16_t * pRes, uint32_t numSamples) complex magnitude for i16 on RV32IM |
void | plp_cmplx_mag_i16s_xpulpv2(const int16_t * pSrc, int16_t * pRes, uint32_t numSamples) complex magnitude for i16 on XPULPV2 |
void | plp_cmplx_mag_i32(const int32_t * pSrc, int32_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in 32-bit integer. |
void | plp_cmplx_mag_i32s_rv32im(const int32_t * pSrc, int32_t * pRes, uint32_t numSamples) complex magnitude for i32 on RV32IM |
void | plp_cmplx_mag_i32s_xpulpv2(const int32_t * pSrc, int32_t * pRes, uint32_t numSamples) complex magnitude for i32 on XPULPV2 |
void | plp_cmplx_mag_i8(const int8_t * pSrc, int8_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in 8-bit integer. |
void | plp_cmplx_mag_i8s_rv32im(const int8_t * pSrc, int8_t * pRes, uint32_t numSamples) complex magnitude for i8 on RV32IM |
void | plp_cmplx_mag_i8s_xpulpv2(const int8_t * pSrc, int8_t * pRes, uint32_t numSamples) complex magnitude for i8 on XPULPV2 |
void | plp_cmplx_mag_q16(const int16_t * pSrc, const uint32_t fracBits, int16_t * pRes, uint32_t numSamples) Glue code for complex magnitude calculation in 16-bit quantized integer. |
void | plp_cmplx_mag_q16s_rv32im(const int16_t * pSrc, const uint32_t fracBits, int16_t * pRes, uint32_t numSamples) complex magnitude for q16 on RV32IM |
void | plp_cmplx_mag_q16s_xpulpv2(const int16_t * pSrc, const uint32_t fracBits, int16_t * pRes, uint32_t numSamples) complex magnitude for q16 on XPULPV2 |
void | plp_bitreversal_16s_rv32im(uint16_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab) In-place 16 bit reversal function for RV32IM. |
void | plp_bitreversal_16s_xpulpv2(uint16_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab) In-place 16 bit reversal function for XPULPV2. |
void | plp_bitreversal_16p_xpulpv2(uint16_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab, uint32_t nPE) Parallel in-place 16 bit reversal function for XPULPV2. |
void | plp_cfft_q16(const plp_cfft_instance_q16 * S, int16_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t deciPoint) Glue code for quantized 16 bit complex fast fourier transform. |
void | plp_cfft_q16_parallel(const plp_cfft_instance_q16 * S, int16_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t deciPoint, uint32_t nPE) Glue code for quantized 16 bit complex fast fourier transform. |
void | plp_cfft_q16s_rv32im(const plp_cfft_instance_q16 * S, int16_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t deciPoint) Quantized 16 bit complex fast fourier transform for RV32IM. |
void | plp_cfft_q16s_xpulpv2(const plp_cfft_instance_q16 * S, int16_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t deciPoint) Quantized 16 bit complex fast fourier transform for XPULPV2. |
void | plp_cfft_q16p_xpulpv2(void * args) Parallel quantized 16 bit complex fast fourier transform for XPULPV2. |
void | plp_bitreversal_32s_rv32im(uint32_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab) In-place 32 bit reversal function for RV32IM. |
void | plp_bitreversal_32s_xpulpv2(uint32_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab) In-place 32 bit reversal function for XPULPV2. |
void | plp_bitreversal_32p_xpulpv2(uint32_t * pSrc, const uint16_t bitRevLen, const uint16_t * pBitRevTab, uint32_t nPE) In-place 32 bit reversal function for XPULPV2. |
void | plp_cfft_q32(const plp_cfft_instance_q32 * S, int32_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t fracBits) Glue code for quantized 32-bit complex fast fourier transform. |
void | plp_cfft_q32_parallel(const plp_cfft_instance_q32 * S, int32_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t fracBits, uint32_t nPE) Quantized 32-bit complex fast fourier transform for XPULPV2. |
void | plp_cfft_q32s_rv32im(const plp_cfft_instance_q32 * S, int32_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t fracBits) Quantized 32-bit complex fast fourier transform for RV32IM. |
void | plp_cfft_q32s_xpulpv2(const plp_cfft_instance_q32 * S, int32_t * p1, uint8_t ifftFlag, uint8_t bitReverseFlag, uint32_t fracBits) Quantized 32-bit complex fast fourier transform for XPULPV2. |
void | plp_cfft_q32p_xpulpv2(void * args) Parallel quantized 32 bit complex fast fourier transform for XPULPV2. |
void | plp_rfft_f32(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, float32_t *restrict pDst) Floating-point FFT on real input data. |
void | plp_rfft_f32_parallel(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, const uint32_t nPE, float32_t *restrict pDst) Floating-point FFT on real input data (parallel version). |
void | plp_rfft_f32s_xpulpv2(const plp_fft_instance_f32 * S, const float32_t *restrict pSrc, float32_t *restrict pDst) Floating-point FFT on real input data for XPULPV2 extension. |
void | plp_rfft_f32p_xpulpv2(void * arg) Floating-point FFT on real input data for XPULPV2 extension (parallel version). |
void | plp_rfftfast_f32(const plp_fft_fast_instance_f32 * S, const float32_t *restrict pSrc, float32_t *restrict pDst) Floating-point FFT on real input data. |
void | plp_rfftfast_f32_parallel(const plp_fft_fast_instance_f32 * S, float32_t *restrict pSrc, float32_t *restrict pDst, const uint32_t nPE) Floating-point parallel FFT on real input data. |
void | plp_rfftfast_f32s_xpulpv2(const plp_fft_fast_instance_f32 * S, float32_t * pSrc, float32_t * pDst) Floating-point FFT on real input data for XPULPV2 extension. |
void | plp_rfftfast_f32p_xpulpv2(void * arg) Floating-point parallel FFT on real input data for XPULPV2 extension. |
void | plp_cfft_f32(const plp_cfft_instance_f32 * S, float32_t * pSrc, uint8_t ifftFlag, uint8_t bitReverseFlag) Floating-point FFT on complex input data. |
void | plp_cfft_f32_parallel(const plp_cfft_instance_f32 * S, const float32_t * pSrc, uint8_t ifftFlag, uint8_t bitReverseFlag, const uint32_t nPE) Floating-point FFT on complex input data (parallel version). |
void | plp_cfft_f32s_xpulpv2(const plp_cfft_instance_f32 * S, const float32_t * pSrc, uint8_t ifftFlag, uint8_t bitReverseFlag) Floating-point FFT on complex input data for XPULPV2 extension. |
void | plp_cfft_f32p_xpulpv2(void * arg) Floating-point FFT on complex input data for XPULPV2 extension (parallel version). |
void | plp_dct2_f32(const plp_fft_instance_f32 * S, const Complex_type_f32 * pShift, const uint8_t orthoNorm, const float32_t *restrict pSrc, float32_t *restrict pBuf, float32_t *restrict pDst) Floating-point DCT on real input data. Implementation of John Makhoul's "A Fast Cosine Transform in One and Two Dimensions" 1980 IEEE paper. |
void | plp_dct2_f32_parallel(const plp_fft_instance_f32 * S, const Complex_type_f32 * pShift, const uint8_t orthoNorm, const float32_t *restrict pSrc, const uint32_t nPE, float32_t *restrict pBuf, float32_t *restrict pDst) Floating-point DCT on real input data. Implementation of John Makhoul's "A Fast Cosine Transform in One and Two Dimensions" 1980 IEEE paper. |
void | plp_mfcc_f32(const plp_fft_instance_f32 * SFFT, const plp_fft_instance_f32 * SDCT, const Complex_type_f32 * pShift, const plp_triangular_filter_f32 * filterBank, const float32_t * window, const uint8_t * orthoNorm, const float32_t *restrict pSrc, float32_t *restrict pDst) MFCC on real input data. |
void | plp_mfcc_f32_parallel(const plp_fft_instance_f32 * SFFT, const plp_fft_instance_f32 * SDCT, const Complex_type_f32 * pShift, const plp_triangular_filter_f32 * filterBank, const float32_t * window, const uint8_t * orthoNorm, const float32_t *restrict pSrc, const uint32_t nPE, float32_t *restrict pDst) MFCC on real input data. |
void | plp_dwt_f32(const float32_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_f32 wavelet, plp_dwt_extension_mode mode, float32_t *restrict pDstA, float32_t *restrict pDstD) Floating-point DWT on real input data. |
void | plp_dwt_q32(const int32_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q32 wavelet, plp_dwt_extension_mode mode, int32_t *restrict pDstA, int32_t *restrict pDstD) 32bit Fixed-point DWT for XPULPV2 extension. |
void | plp_dwt_q16(const int16_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q16 wavelet, plp_dwt_extension_mode mode, int16_t *restrict pDstA, int16_t *restrict pDstD) 16bit Fixed-point DWT for XPULPV2 extension. |
void | plp_dwt_q8(const int8_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q8 wavelet, plp_dwt_extension_mode mode, int8_t *restrict pDstA, int8_t *restrict pDstD) 8bit Fixed-point DWT for XPULPV2 extension. |
void | plp_dwt_dec_f32(const float32_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_f32 wavelet, plp_dwt_extension_mode mode, uint32_t level, float32_t *restrict pTmp, float32_t *restrict pDst) Floating-point n-level DWT for XPULPV2 extension. |
void | plp_dwt_dec_f32_parallel(const float32_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_f32 wavelet, plp_dwt_extension_mode mode, uint32_t level, uint32_t nPE, float32_t *restrict pTemp, float32_t *restrict pDst) Floating-point parallel n-level DWT for XPULPV2 extension. |
void | plp_dwt_f32s_xpulpv2(const float32_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_f32 wavelet, plp_dwt_extension_mode mode, float32_t *restrict pDstA, float32_t *restrict pDstD) Floating-point DWT on real input data for XPULPV2 extension. |
void | plp_dwt_haar_f32s_xpulpv2(const float32_t *restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, float32_t *restrict pDstA, float32_t *restrict pDstD) Floating-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension. |
void | plp_dwt_q32s_xpulpv2(const int32_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q32 wavelet, plp_dwt_extension_mode mode, int32_t *restrict pDstA, int32_t *restrict pDstD) 32bit Fixed-point DWT for XPULPV2 extension. |
void | plp_dwt_haar_q32s_xpulpv2(const int32_t *restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int32_t *restrict pDstA, int32_t *restrict pDstD) 32bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension. |
void | plp_dwt_q16s_xpulpv2(const int16_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q16 wavelet, plp_dwt_extension_mode mode, int16_t *restrict pDstA, int16_t *restrict pDstD) 16bit Fixed-point DWT for XPULPV2 extension. |
void | plp_dwt_haar_q16s_xpulpv2(const int16_t *restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int16_t *restrict pDstA, int16_t *restrict pDstD) 16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension. |
void | plp_dwt_q8s_xpulpv2(const int8_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q8 wavelet, plp_dwt_extension_mode mode, int8_t *restrict pDstA, int8_t *restrict pDstD) 8bit Fixed-point DWT for XPULPV2 extension. |
void | plp_dwt_haar_q8s_xpulpv2(const int8_t *restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int8_t *restrict pDstA, int8_t *restrict pDstD) 8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension. |
void | plp_dwt_f32_parallel(const float32_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_f32 wavelet, plp_dwt_extension_mode mode, uint32_t nPE, float32_t *restrict pDstA, float32_t *restrict pDstD) Parallel Floating-point DWT on real input data for XPULPV2 extension. |
void | plp_dwt_q8_parallel(const int8_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q8 wavelet, plp_dwt_extension_mode mode, uint32_t nPE, int8_t *restrict pDstA, int8_t *restrict pDstD) 8bit Parallel Fixed-point DWT on real input data for XPULPV2 extension. |
void | plp_dwt_q16_parallel(const int16_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q16 wavelet, plp_dwt_extension_mode mode, uint32_t nPE, int16_t *restrict pDstA, int16_t *restrict pDstD) 16bit Parallel Fixed-point DWT on real input data for XPULPV2 extension. |
void | plp_dwt_q32_parallel(const int32_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q32 wavelet, plp_dwt_extension_mode mode, uint32_t nPE, int32_t *restrict pDstA, int32_t *restrict pDstD) 32bit Parallel Fixed-point DWT on real input data for XPULPV2 extension. |
void | plp_dwt_f32p_xpulpv2(void * args) Floating-point DWT on real input data for XPULPV2 extension. |
void | plp_dwt_haar_f32p_xpulpv2(void * args) Floating-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension. |
void | plp_dwt_q8p_xpulpv2(void * args) Q7 fixed-point DWT for XPULPV2 extension. |
void | plp_dwt_haar_q8p_xpulpv2(void * args) q7 fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension. |
void | plp_dwt_q16p_xpulpv2(void * args) Q15 fixed-point DWT for XPULPV2 extension. |
void | plp_dwt_haar_q16p_xpulpv2(void * args) q15 fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension. |
void | plp_dwt_q32p_xpulpv2(void * arg) Q31 fixed-point DWT on real input data for XPULPV2 extension. |
void | plp_dwt_haar_q32p_xpulpv2(void * args) Q31 Fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension. |
void | plp_dwt_q32s_rv32im(const int32_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q32 wavelet, plp_dwt_extension_mode mode, int32_t *restrict pDstA, int32_t *restrict pDstD) 32bit Fixed-point DWT. |
void | plp_dwt_haar_q32s_rv32im(const int32_t *restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int32_t *restrict pDstA, int32_t *restrict pDstD) 32bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data. |
void | plp_dwt_q16s_rv32im(const int16_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q16 wavelet, plp_dwt_extension_mode mode, int16_t *restrict pDstA, int16_t *restrict pDstD) 16bit Fixed-point DWT. |
void | plp_dwt_haar_q16s_rv32im(const int16_t *restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int16_t *restrict pDstA, int16_t *restrict pDstD) 16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data. |
void | plp_dwt_q8s_rv32im(const int8_t *restrict pSrc, uint32_t length, const plp_dwt_wavelet_q8 wavelet, plp_dwt_extension_mode mode, int8_t *restrict pDstA, int8_t *restrict pDstD) 8bit Fixed-point DWT. |
void | plp_dwt_haar_q8s_rv32im(const int8_t *restrict pSrc, uint32_t length, plp_dwt_extension_mode mode, int8_t *restrict pDstA, int8_t *restrict pDstD) 8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data. |
void | plp_mat_add_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) Glue code for matrix addition of 32-bit integer matrices. |
void | plp_mat_add_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) matrix addition of a 32-bit integer matrices for RV32IM extension. |
void | plp_mat_add_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) matrix addition of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_add_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel matrix addition of a 32-bit integer matrices. |
void | plp_mat_add_i32p_xpulpv2(void * args) Parallel matrix addition of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_add_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) Glue code for matrix addition of a 16-bit integer matrices. |
void | plp_mat_add_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) matrix addition of a 16-bit integer matrices for RV32IM extension. |
void | plp_mat_add_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) matrix addition of a 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_add_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel matrix addition of a 16-bit integer matrices. |
void | plp_mat_add_i16p_xpulpv2(void * args) Parallel matrix addition of 16-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_add_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) Glue code for matrix addition of a 8-bit integer matrices. |
void | plp_mat_add_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) matrix addition of a 8-bit integer matrices for RV32IM extension. |
void | plp_mat_add_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) matrix addition of a 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_add_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel matrix addition of a 8-bit integer matrices. |
void | plp_mat_add_i8p_xpulpv2(void * args) Parallel matrix addition of 8-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_add_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, float *restrict pDst) Glue code for matrix addition of a 32-bit floating-point matrices. |
void | plp_mat_add_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, float *restrict pDst) matrix addition of a 32-bit floating-point matrices for XPULPV2 extension. |
void | plp_mat_add_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix addition of a 32-bit floating-point matrices. |
void | plp_mat_add_f32p_xpulpv2(void * args) Parallel matrix addition of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_sub_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) Glue code for matrix subtraction of a 32-bit integer matrices. |
void | plp_mat_sub_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) matrix subtraction of a 32-bit integer matrices for RV32IM extension. |
void | plp_mat_sub_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, int32_t *restrict pDst) matrix subtraction of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_sub_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel matrix subtraction of a 32-bit integer matrices. |
void | plp_mat_sub_i32p_xpulpv2(void * args) Parallel matrix subtraction of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_sub_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) Glue code for matrix subtraction of a 16-bit integer matrices. |
void | plp_mat_sub_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) matrix subtraction of a 16-bit integer matrices for RV32IM extension. |
void | plp_mat_sub_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, int16_t *restrict pDst) matrix subtraction of a 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_sub_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel matrix subtraction of a 16-bit integer matrices. |
void | plp_mat_sub_i16p_xpulpv2(void * args) Parallel matrix subtraction of 16-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_sub_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) Glue code for matrix subtraction of a 8-bit integer matrices. |
void | plp_mat_sub_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) matrix subtraction of a 8-bit integer matrices for RV32IM extension. |
void | plp_mat_sub_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, int8_t *restrict pDst) matrix subtraction of a 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_sub_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel matrix subtraction of a 8-bit integer matrices. |
void | plp_mat_sub_i8p_xpulpv2(void * args) Parallel matrix subtraction of 8-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_sub_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, float *restrict pDst) Glue code for matrix subtraction of a 32-bit floating-point matrices. |
void | plp_mat_sub_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, float *restrict pDst) matrix subtraction of a 32-bit floating-point matrices for XPULPV2 extension. |
void | plp_mat_sub_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix subtraction of a 32-bit floating-point matrices. |
void | plp_mat_sub_f32p_xpulpv2(void * args) Parallel matrix subtraction of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_scale_i32(const int32_t *restrict pSrc, uint32_t M, uint32_t N, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst) Glue code for matrix scale of a 32-bit integer matrices. |
void | plp_mat_scale_i32s_rv32im(const int32_t *restrict pSrc, uint32_t M, uint32_t N, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst) matrix scale of a 32-bit integer matrices for RV32IM extension. |
void | plp_mat_scale_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t M, uint32_t N, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst) matrix scale of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_scale_i32_parallel(const int32_t *restrict pSrc, uint32_t M, uint32_t N, int32_t scaleFactor, int32_t shift, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel matrix scale of a 32-bit integer matrices. |
void | plp_mat_scale_i32p_xpulpv2(void * args) Parallel matrix scale of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_scale_i16(const int16_t *restrict pSrc, uint32_t M, uint32_t N, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst) Glue code for matrix scale of a 16-bit integer matrices. |
void | plp_mat_scale_i16s_rv32im(const int16_t *restrict pSrc, uint32_t M, uint32_t N, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst) matrix scale of a 16-bit integer matrices for RV32IM extension. |
void | plp_mat_scale_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t M, uint32_t N, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst) matrix scale of a 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_scale_i16_parallel(const int16_t *restrict pSrc, uint32_t M, uint32_t N, int16_t scaleFactor, int32_t shift, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel matrix scale of a 16-bit integer matrices. |
void | plp_mat_scale_i16p_xpulpv2(void * args) Parallel matrix scale of 16-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_scale_i8(const int8_t *restrict pSrc, uint32_t M, uint32_t N, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst) Glue code for matrix scale of a 8-bit integer matrices. |
void | plp_mat_scale_i8s_rv32im(const int8_t *restrict pSrc, uint32_t M, uint32_t N, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst) matrix scale of a 8-bit integer matrices for RV32IM extension. |
void | plp_mat_scale_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t M, uint32_t N, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst) matrix scale of a 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_scale_i8_parallel(const int8_t *restrict pSrc, uint32_t M, uint32_t N, int8_t scaleFactor, int32_t shift, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel matrix scale of a 8-bit integer matrices. |
void | plp_mat_scale_i8p_xpulpv2(void * args) Parallel matrix scale of 8-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_scale_f32(const float *restrict pSrc, uint32_t M, uint32_t N, float scaleFactor, float *restrict pDst) Glue code for matrix scale of a 32-bit floating-point matrices. |
void | plp_mat_scale_f32s_xpulpv2(const float *restrict pSrc, uint32_t M, uint32_t N, float scaleFactor, float *restrict pDst) matrix scale of a 32-bit floating-point matrices for XPULPV2 extension. |
void | plp_mat_scale_f32_parallel(const float *restrict pSrc, uint32_t M, uint32_t N, float scaleFactor, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix scale of a 32-bit floating-point matrices. |
void | plp_mat_scale_f32p_xpulpv2(void * args) Parallel matrix scale of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_trans_i32(const int32_t *restrict pSrc, uint32_t M, uint32_t N, int32_t *restrict pDst) Glue code for matrix transpose of a 32-bit integer matrices. |
void | plp_mat_trans_i32s_rv32im(const int32_t *restrict pSrc, uint32_t M, uint32_t N, int32_t *restrict pDst) matrix transpose of a 32-bit integer matrices for RV32IM extension. |
void | plp_mat_trans_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t M, uint32_t N, int32_t *restrict pDst) matrix transpose of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_trans_i32_parallel(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel matrix transpose of a 32-bit integer matrices. |
void | plp_mat_trans_i32p_xpulpv2(void * args) Parallel matrix transpose of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_trans_i16(const int16_t *restrict pSrc, uint32_t M, uint32_t N, int16_t *restrict pDst) Glue code for matrix transpose of a 16-bit integer matrices. |
void | plp_mat_trans_i16s_rv32im(const int16_t *restrict pSrc, uint32_t M, uint32_t N, int16_t *restrict pDst) matrix transpose of a 16-bit integer matrices for RV32IM extension. |
void | plp_mat_trans_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t M, uint32_t N, int16_t *restrict pDst) matrix transpose of a 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_trans_i16_parallel(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel matrix transpose of a 16-bit integer matrices. |
void | plp_mat_trans_i16p_xpulpv2(void * args) Parallel matrix transpose of 16-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_trans_i8(const int8_t *restrict pSrc, uint32_t M, uint32_t N, int8_t *restrict pDst) Glue code for matrix transpose of a 8-bit integer matrices. |
void | plp_mat_trans_i8s_rv32im(const int8_t *restrict pSrc, uint32_t M, uint32_t N, int8_t *restrict pDst) matrix transpose of a 8-bit integer matrices for RV32IM extension. |
void | plp_mat_trans_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t M, uint32_t N, int8_t *restrict pDst) matrix transpose of a 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_trans_i8_parallel(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel matrix transpose of a 8-bit integer matrices. |
void | plp_mat_trans_i8p_xpulpv2(void * args) Parallel matrix transpose of 8-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_trans_f32(const float *restrict pSrc, uint32_t M, uint32_t N, float *restrict pDst) Glue code for matrix transpose of a 32-bit floating-point matrices. |
void | plp_mat_trans_f32_parallel(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix transpose of a 32-bit floating-point matrices. |
int | plp_mat_inv_f32(float *restrict pSrc, float *restrict pDst, uint32_t N) Glue code for matrix inverse of a 32-bit floating-point matrices. |
int | plp_mat_inv_f32s_xpulpv2(float *restrict pSrc, float *restrict pDst, uint32_t N) matrix inverse of a 32-bit floating-point matrices for XPULPV2 extension. |
int | plp_mat_inv_f32_parallel(float *restrict pSrc, float *restrict pDst, uint32_t N, uint32_t nPE) Glue code for parallel matrix inverse of a 32-bit floating-point matrices. |
int | plp_mat_inv_f32p_xpulpv2(void * args) Parallel matrix inverse of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_fill_I_i32(uint32_t N, int32_t *restrict pDst) Glue code for creating a 32-bit integer identity matrix. |
void | plp_mat_fill_I_i32s_rv32im(uint32_t N, int32_t *restrict pDst) Create a 32-bit integer identity matrix on RV32IM. |
void | plp_mat_fill_I_i32s_xpulpv2(uint32_t N, int32_t *restrict pDst) Create a 32-bit integer identity matrix on XpulpV2. |
void | plp_mat_fill_I_i32_parallel(uint32_t N, uint32_t nPE, int32_t *restrict pDst) Glue code for creating a 32-bit integer identity matrix in parallel. |
void | plp_mat_fill_I_i32p_xpulpv2(void * args) Create a 32-bit integer identity matrix in parallel on XpulpV2. |
void | plp_mat_fill_I_i16(uint32_t N, int16_t *restrict pDst) Glue code for creating a 16-bit integer identity matrix. |
void | plp_mat_fill_I_i16s_rv32im(uint32_t N, int16_t *restrict pDst) Create a 16-bit integer identity matrix on RV32IM. |
void | plp_mat_fill_I_i16s_xpulpv2(uint32_t N, int16_t *restrict pDst) Create a 16-bit integer identity matrix on XpulpV2. |
void | plp_mat_fill_I_i16_parallel(uint32_t N, uint32_t nPE, int16_t *restrict pDst) Glue code for creating a 16-bit integer identity matrix in parallel. |
void | plp_mat_fill_I_i16p_xpulpv2(void * args) Create a 16-bit integer identity matrix in parallel on XpulpV2. |
void | plp_mat_fill_I_i8(uint32_t N, int8_t *restrict pDst) Glue code for creating a 8-bit integer identity matrix. |
void | plp_mat_fill_I_i8s_rv32im(uint32_t N, int8_t *restrict pDst) Create a 8-bit integer identity matrix on RV32IM. |
void | plp_mat_fill_I_i8s_xpulpv2(uint32_t N, int8_t *restrict pDst) Create a 8-bit integer identity matrix on XpulpV2. |
void | plp_mat_fill_I_i8_parallel(uint32_t N, uint32_t nPE, int8_t *restrict pDst) Glue code for creating a 8-bit integer identity matrix in parallel. |
void | plp_mat_fill_I_i8p_xpulpv2(void * args) Create a 8-bit integer identity matrix in parallel on XpulpV2. |
void | plp_mat_fill_I_f32(uint32_t N, float *restrict pDst) Glue code for creating a 32-bit float identity matrix. |
void | plp_mat_fill_I_f32s_xpulpv2(uint32_t N, float *restrict pDst) Create a 32-bit float identity matrix on XpulpV2. |
void | plp_mat_fill_I_f32_parallel(uint32_t N, uint32_t nPE, float *restrict pDst) Glue code for creating a 32-bit float identity matrix in parallel. |
void | plp_mat_fill_I_f32p_xpulpv2(void * args) Create a 32-bit float identity matrix in parallel on XpulpV2. |
void | plp_mat_fill_I_q32(uint32_t N, int32_t fracBits, int32_t *restrict pDst) Glue code for creating a 32-bit fix-point identity matrix. |
void | plp_mat_fill_I_q32s_rv32im(uint32_t N, int32_t fracBits, int32_t *restrict pDst) Create a 32-bit fix-point identity matrix on RV32IM. |
void | plp_mat_fill_I_q32s_xpulpv2(uint32_t N, int32_t fracBits, int32_t *restrict pDst) Create a 32-bit fix-point identity matrix on XpulpV2. |
void | plp_mat_fill_I_q32_parallel(uint32_t N, int32_t fracBits, uint32_t nPE, int32_t *restrict pDst) Glue code for creating a 32-bit fix-point identity matrix in parallel. |
void | plp_mat_fill_I_q32p_xpulpv2(void * args) Create a 32-bit fix-point identity matrix in parallel on XpulpV2. |
void | plp_mat_fill_I_q16(uint32_t N, int32_t fracBits, int16_t *restrict pDst) Glue code for creating a 16-bit fix-point identity matrix. |
void | plp_mat_fill_I_q16s_rv32im(uint32_t N, int32_t fracBits, int16_t *restrict pDst) Create a 16-bit fix-point identity matrix on RV32IM. |
void | plp_mat_fill_I_q16s_xpulpv2(uint32_t N, int32_t fracBits, int16_t *restrict pDst) Create a 16-bit fix-point identity matrix on XpulpV2. |
void | plp_mat_fill_I_q16_parallel(uint32_t N, int32_t fracBits, uint32_t nPE, int16_t *restrict pDst) Glue code for creating a 16-bit fix-point identity matrix in parallel. |
void | plp_mat_fill_I_q16p_xpulpv2(void * args) Create a 16-bit fix-point identity matrix in parallel on XpulpV2. |
void | plp_mat_fill_I_q8(uint32_t N, int32_t fracBits, int8_t *restrict pDst) Glue code for creating a 8-bit fix-point identity matrix. |
void | plp_mat_fill_I_q8s_rv32im(uint32_t N, int32_t fracBits, int8_t *restrict pDst) Create a 8-bit fix-point identity matrix on RV32IM. |
void | plp_mat_fill_I_q8s_xpulpv2(uint32_t N, int32_t fracBits, int8_t *restrict pDst) Create a 8-bit fix-point identity matrix on XpulpV2. |
void | plp_mat_fill_I_q8_parallel(uint32_t N, int32_t fracBits, uint32_t nPE, int8_t *restrict pDst) Glue code for creating a 8-bit fix-point identity matrix in parallel. |
void | plp_mat_fill_I_q8p_xpulpv2(void * args) Create a 8-bit fix-point identity matrix in parallel on XpulpV2. |
void | plp_mat_mult_stride_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix matrix multiplication of a 32-bit integer matrices. |
void | plp_mat_mult_stride_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 32-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_stride_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_stride_i16(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix matrix multiplication of a 16-bit integer matrices. |
void | plp_mat_mult_stride_i16s_rv32im(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 16-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_stride_i16s_xpulpv2(const int16_t restrict pSrcA, const int16_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_stride_i8(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix matrix multiplication of a 8-bit integer matrices. |
void | plp_mat_mult_stride_i8s_rv32im(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 8-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_stride_i8s_xpulpv2(const int8_t restrict pSrcA, const int8_t restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix matrix multiplication of a 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_stride_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of 32-bit integer matrices. |
void | plp_mat_mult_stride_i32p_xpulpv2(void * args) Parallel strided matrix matrix multiplication of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_stride_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of 16-bit integer matrices. |
void | plp_mat_mult_stride_i16p_xpulpv2(void * args) Parallel strided matrix multiplication of 16-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_stride_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of 8-bit integer matrices. |
void | plp_mat_mult_stride_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) Glue code for strided matrix matrix multiplication of 32-bit floating-point matrices. |
void | plp_mat_mult_stride_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) strided matrix matrix multiplication of 32-bit floating-point matrices for XPULPV2 extension. |
void | plp_mat_mult_stride_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, float *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of 32-bit floating-point matrices. |
void | plp_mat_mult_stride_f32p_xpulpv2(void * args) Parallel strided matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_stride_i8p_xpulpv2(void * args) Parallel strided matrix multiplication of 8-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_stride_q32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Glue code for strided matrix matrix multiplication of 32-bit fix-point matrices. |
void | plp_mat_mult_stride_q32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of 32-bit fix-point matrices. |
void | plp_mat_mult_stride_q32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix matrix multiplication of 32-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_stride_q32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix matrix multiplication of 32-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_stride_q32p_xpulpv2(void * args) Parallel strided matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_stride_q16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Glue code for strided matrix matrix multiplication of 16-bit fix-point matrices. |
void | plp_mat_mult_stride_q16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of 16-bit fix-point matrices. |
void | plp_mat_mult_stride_q16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix matrix multiplication of 16-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_stride_q16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix matrix multiplication of 16-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_stride_q16p_xpulpv2(void * args) Parallel strided matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_stride_q8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Glue code for strided matrix matrix multiplication of 8-bit fix-point matrices. |
void | plp_mat_mult_stride_q8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code for parallel strided matrix matrix multiplication of 8-bit fix-point matrices. |
void | plp_mat_mult_stride_q8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix matrix multiplication of 8-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_stride_q8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_stride_q8p_xpulpv2(void * args) Parallel strided matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of 32-bit integer matrices. |
void | plp_mat_mult_trans_stride_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of 32-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_trans_stride_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of 16-bit integer matrices. |
void | plp_mat_mult_trans_stride_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of 16-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_trans_stride_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of 8-bit integer matrices. |
void | plp_mat_mult_trans_stride_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of 8-bit integer matrices for RV32IM extension. |
void | plp_mat_mult_trans_stride_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of 32-bit integer matrices. |
void | plp_mat_mult_trans_stride_i32p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of 16-bit integer matrices. |
void | plp_mat_mult_trans_stride_i16p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of 8-bit integer matrices. |
void | plp_mat_mult_trans_stride_i8p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_q32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of 32-bit fix-point matrices. |
void | plp_mat_mult_trans_stride_q32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of 32-bit fix-point matrices. |
void | plp_mat_mult_trans_stride_q32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of 32-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_trans_stride_q32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix transposed matrix multiplication of 32-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_q32p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_q16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of 16-bit fix-point matrices. |
void | plp_mat_mult_trans_stride_q16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of 16-bit fix-point matrices. |
void | plp_mat_mult_trans_stride_q16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix transposed matrix multiplication of 16-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_trans_stride_q16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix transposed matrix multiplication of 16-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_q16p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_q8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of 8-bit fix-point matrices. |
void | plp_mat_mult_trans_stride_q8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of 8-bit fix-point matrices. |
void | plp_mat_mult_trans_stride_q8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix transposed matrix multiplication of 8-bit fix-point matrices for RV32IM extension. |
void | plp_mat_mult_trans_stride_q8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix transposed matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_q8p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) Glue code for strided matrix transposed matrix multiplication of 32-bit floating-point matrices. |
void | plp_mat_mult_trans_stride_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) strided matrix transposed matrix multiplication of 32-bit floating-point matrices for XPULPV2 extension. |
void | plp_mat_mult_trans_stride_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, float *restrict pDstC) Glue code for parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices. |
void | plp_mat_mult_trans_stride_f32p_xpulpv2(void * args) Parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_mult_cmplx_stride_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 32-bit integers. |
void | plp_mat_mult_cmplx_stride_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 32-bit integers on RV32IM. |
void | plp_mat_mult_cmplx_stride_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 32-bit integers on XpulpV2. |
void | plp_mat_mult_cmplx_stride_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 32-bit integers. |
void | plp_mat_mult_cmplx_stride_i32p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 32-bit integers on XpulpV2 |
void | plp_mat_mult_cmplx_stride_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 16-bit integers. |
void | plp_mat_mult_cmplx_stride_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 16-bit integers on RV32IM. |
void | plp_mat_mult_cmplx_stride_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 16-bit integers on XpulpV2. |
void | plp_mat_mult_cmplx_stride_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 16-bit integers. |
void | plp_mat_mult_cmplx_stride_i16p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 16-bit integers on XpulpV2 |
void | plp_mat_mult_cmplx_stride_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 8-bit integers. |
void | plp_mat_mult_cmplx_stride_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 8-bit integers on RV32IM. |
void | plp_mat_mult_cmplx_stride_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 8-bit integers on XpulpV2. |
void | plp_mat_mult_cmplx_stride_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 8-bit integers. |
void | plp_mat_mult_cmplx_stride_i8p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 8-bit integers on XpulpV2 |
void | plp_mat_mult_cmplx_stride_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 32-bit floats. |
void | plp_mat_mult_cmplx_stride_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) Strided matrix matrix multiplication for complex 32-bit floats on XpulpV2. |
void | plp_mat_mult_cmplx_stride_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, float *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 32-bit floats. |
void | plp_mat_mult_cmplx_stride_f32p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 32-bit floats on XpulpV2 |
void | plp_mat_mult_cmplx_stride_q32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 32-bit fix-point. |
void | plp_mat_mult_cmplx_stride_q32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 32-bit fix-point on RV32IM. |
void | plp_mat_mult_cmplx_stride_q32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Strided matrix matrix multiplication for complex 32-bit fix-point on XpulpV2. |
void | plp_mat_mult_cmplx_stride_q32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 32-bit fix-point. |
void | plp_mat_mult_cmplx_stride_q32p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 32-bit fix-point on XpulpV2 |
void | plp_mat_mult_cmplx_stride_q16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 16-bit fix-point. |
void | plp_mat_mult_cmplx_stride_q16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Strided matrix matrix multiplication for complex 16-bit fix-point on RV32IM. |
void | plp_mat_mult_cmplx_stride_q16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Strided matrix matrix multiplication for complex 16-bit fix-point on XpulpV2. |
void | plp_mat_mult_cmplx_stride_q16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 16-bit fix-point. |
void | plp_mat_mult_cmplx_stride_q16p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 16-bit fix-point on XpulpV2 |
void | plp_mat_mult_cmplx_stride_q8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Glue code of strided matrix matrix multiplication for complex 8-bit fix-point. |
void | plp_mat_mult_cmplx_stride_q8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Strided matrix matrix multiplication for complex 8-bit fix-point on RV32IM. |
void | plp_mat_mult_cmplx_stride_q8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Strided matrix matrix multiplication for complex 8-bit fix-point on XpulpV2. |
void | plp_mat_mult_cmplx_stride_q8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code of parallel strided matrix matrix multiplication for complex 8-bit fix-point. |
void | plp_mat_mult_cmplx_stride_q8p_xpulpv2(void * args) parallel strided matrix matrix multiplication for complex 8-bit fix-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 32-bit integers. |
void | plp_mat_mult_trans_cmplx_stride_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 32-bit integers on RV32IM |
void | plp_mat_mult_trans_cmplx_stride_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit integers. |
void | plp_mat_mult_trans_cmplx_stride_i32p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 16-bit integers. |
void | plp_mat_mult_trans_cmplx_stride_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 16-bit integers on RV32IM |
void | plp_mat_mult_trans_cmplx_stride_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 16-bit integers. |
void | plp_mat_mult_trans_cmplx_stride_i16p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 8-bit integers. |
void | plp_mat_mult_trans_cmplx_stride_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 8-bit integers on RV32IM |
void | plp_mat_mult_trans_cmplx_stride_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 8-bit integers. |
void | plp_mat_mult_trans_cmplx_stride_i8p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 32-bit floats. |
void | plp_mat_mult_trans_cmplx_stride_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, float *restrict pDstC) strided matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t nPE, float *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit floats. |
void | plp_mat_mult_trans_cmplx_stride_f32p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_q32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 32-bit fixed-point. |
void | plp_mat_mult_trans_cmplx_stride_q32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 32-bit fixed-point on RV32IM |
void | plp_mat_mult_trans_cmplx_stride_q32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int32_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 32-bit fixed-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_q32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int32_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit fixed-point. |
void | plp_mat_mult_trans_cmplx_stride_q32p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 32-bit fixed-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_q16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 16-bit fixed-point. |
void | plp_mat_mult_trans_cmplx_stride_q16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 16-bit fixed-point on RV32IM |
void | plp_mat_mult_trans_cmplx_stride_q16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int16_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 16-bit fixed-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_q16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int16_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 16-bit fixed-point. |
void | plp_mat_mult_trans_cmplx_stride_q16p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 16-bit fixed-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_q8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) Glue code of strided matrix transpose matrix multiplication for complex 8-bit fixed-point. |
void | plp_mat_mult_trans_cmplx_stride_q8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 8-bit fixed-point on RV32IM |
void | plp_mat_mult_trans_cmplx_stride_q8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, int8_t *restrict pDstC) strided matrix transpose matrix multiplication for complex 8-bit fixed-point on XpulpV2 |
void | plp_mat_mult_trans_cmplx_stride_q8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t O, uint32_t strideA, uint32_t strideB, uint32_t strideC, uint32_t shift, uint32_t nPE, int8_t *restrict pDstC) Glue code of parallel strided matrix transpose matrix multiplication for complex 8-bit fixed-point. |
void | plp_mat_mult_trans_cmplx_stride_q8p_xpulpv2(void * args) parallel strided matrix transpose matrix multiplication for complex 8-bit fixed-point on XpulpV2 |
void | plp_mat_add_stride_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) Glue code for matrix addition of 32-bit integer matrices. |
void | plp_mat_add_stride_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) matrix addition of 32-bit integer matrices for RV32IM extension. |
void | plp_mat_add_stride_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) matrix addition of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_add_stride_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel matrix addition of 32-bit integer matrices. |
void | plp_mat_add_stride_i32p_xpulpv2(void * args) Parallel matrix addition of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_add_stride_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) Glue code for matrix addition of 16-bit integer matrices. |
void | plp_mat_add_stride_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) matrix addition of 16-bit integer matrices for RV32IM extension. |
void | plp_mat_add_stride_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) matrix addition of 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_add_stride_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel matrix addition of 16-bit integer matrices. |
void | plp_mat_add_stride_i16p_xpulpv2(void * args) Parallel matrix addition of 16-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_add_stride_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) Glue code for matrix addition of 8-bit integer matrices. |
void | plp_mat_add_stride_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) matrix addition of 8-bit integer matrices for RV32IM extension. |
void | plp_mat_add_stride_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) matrix addition of 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_add_stride_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel matrix addition of 8-bit integer matrices. |
void | plp_mat_add_stride_i8p_xpulpv2(void * args) Parallel matrix addition of 8-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_add_stride_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, float *restrict pDst) Glue code for matrix addition of 32-bit floating-point matrices. |
void | plp_mat_add_stride_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, float *restrict pDst) matrix addition of 32-bit floating-point matrices for XPULPV2 extension. |
void | plp_mat_add_stride_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix addition of 32-bit floating-point matrices. |
void | plp_mat_add_stride_f32p_xpulpv2(void * args) Parallel matrix addition of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_sub_stride_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) Glue code for matrix subtraction of 32-bit integer matrices. |
void | plp_mat_sub_stride_i32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) matrix subtraction of 32-bit integer matrices for RV32IM extension. |
void | plp_mat_sub_stride_i32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int32_t *restrict pDst) matrix subtraction of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_sub_stride_i32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel matrix subtraction of 32-bit integer matrices. |
void | plp_mat_sub_stride_i32p_xpulpv2(void * args) Parallel matrix subtraction of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_sub_stride_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) Glue code for matrix subtraction of 16-bit integer matrices. |
void | plp_mat_sub_stride_i16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) matrix subtraction of 16-bit integer matrices for RV32IM extension. |
void | plp_mat_sub_stride_i16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int16_t *restrict pDst) matrix subtraction of 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_sub_stride_i16_parallel(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel matrix subtraction of 16-bit integer matrices. |
void | plp_mat_sub_stride_i16p_xpulpv2(void * args) Parallel matrix subtraction of 16-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_sub_stride_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) Glue code for matrix subtraction of 8-bit integer matrices. |
void | plp_mat_sub_stride_i8s_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) matrix subtraction of 8-bit integer matrices for RV32IM extension. |
void | plp_mat_sub_stride_i8s_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, int8_t *restrict pDst) matrix subtraction of 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_sub_stride_i8_parallel(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel matrix subtraction of 8-bit integer matrices. |
void | plp_mat_sub_stride_i8p_xpulpv2(void * args) Parallel matrix subtraction of 8-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_sub_stride_f32(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, float *restrict pDst) Glue code for matrix subtraction of 32-bit floating-point matrices. |
void | plp_mat_sub_stride_f32s_xpulpv2(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, float *restrict pDst) matrix subtraction of 32-bit floating-point matrices for XPULPV2 extension. |
void | plp_mat_sub_stride_f32_parallel(const float *restrict pSrcA, const float *restrict pSrcB, uint32_t M, uint32_t N, uint32_t strideA, uint32_t strideB, uint32_t strideY, uint32_t nPE, float *restrict pDst) Glue code for parallel matrix subtraction of 32-bit floating-point matrices. |
void | plp_mat_sub_stride_f32p_xpulpv2(void * args) Parallel matrix subtraction of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_scale_stride_i32(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst) Glue code for strided matrix scale of 32-bit integer matrices. |
void | plp_mat_scale_stride_i32s_rv32im(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst) strided matrix scale of 32-bit integer matrices for RV32IM extension. |
void | plp_mat_scale_stride_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t scaleFactor, int32_t shift, int32_t *restrict pDst) strided matrix scale of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_scale_stride_i32_parallel(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t scaleFactor, int32_t shift, uint32_t nPE, int32_t *restrict pDst) Glue code for parallel strided matrix scale of 32-bit integer matrices. |
void | plp_mat_scale_stride_i32p_xpulpv2(void * args) Parallel strided matrix scale of 32-bit integer matrices for XPULPV2 extension. |
void | plp_mat_scale_stride_i16(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst) Glue code for strided matrix scale of 16-bit integer matrices. |
void | plp_mat_scale_stride_i16s_rv32im(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst) strided matrix scale of 16-bit integer matrices for RV32IM extension. |
void | plp_mat_scale_stride_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t scaleFactor, int32_t shift, int16_t *restrict pDst) strided matrix scale of 16-bit integer matrices for XPULPV2 extension. |
void | plp_mat_scale_stride_i16_parallel(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t scaleFactor, int32_t shift, uint32_t nPE, int16_t *restrict pDst) Glue code for parallel strided matrix scale of 16-bit integer matrices. |
void | plp_mat_scale_stride_i16p_xpulpv2(void * args) Parallel strided matrix scale of 16-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_scale_stride_i8(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst) Glue code for strided matrix scale of 8-bit integer matrices. |
void | plp_mat_scale_stride_i8s_rv32im(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst) strided matrix scale of 8-bit integer matrices for RV32IM extension. |
void | plp_mat_scale_stride_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t scaleFactor, int32_t shift, int8_t *restrict pDst) strided matrix scale of 8-bit integer matrices for XPULPV2 extension. |
void | plp_mat_scale_stride_i8_parallel(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t scaleFactor, int32_t shift, uint32_t nPE, int8_t *restrict pDst) Glue code for parallel strided matrix scale of 8-bit integer matrices. |
void | plp_mat_scale_stride_i8p_xpulpv2(void * args) Parallel strided matrix scale of 8-bit integer matrices kernel for XPULPV2 extension. |
void | plp_mat_scale_stride_f32(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, float scaleFactor, float *restrict pDst) Glue code for strided matrix scale of 32-bit floating-point matrices. |
void | plp_mat_scale_stride_f32s_xpulpv2(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, float scaleFactor, float *restrict pDst) strided matrix scale of 32-bit floating-point matrices for XPULPV2 extension. |
void | plp_mat_scale_stride_f32_parallel(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, float scaleFactor, uint32_t nPE, float *restrict pDst) Glue code for parallel strided matrix scale of 32-bit floating-point matrices. |
void | plp_mat_scale_stride_f32p_xpulpv2(void * args) Parallel strided matrix scale of 32-bit floating-point matrices kernel for XPULPV2 extension. |
void | plp_mat_fill_I_stride_i32(uint32_t N, uint32_t stride, int32_t *restrict pDst) Glue code for creating a strided 32-bit integers identity matrix. |
void | plp_mat_fill_I_stride_i32s_rv32im(uint32_t N, uint32_t stride, int32_t *restrict pDst) Create a strided 32-bit integers identity matrix on RV32IM. |
void | plp_mat_fill_I_stride_i32s_xpulpv2(uint32_t N, uint32_t stride, int32_t *restrict pDst) Create a strided 32-bit integers identity matrix on XpulpV2. |
void | plp_mat_fill_I_stride_i32_parallel(uint32_t N, uint32_t stride, uint32_t nPE, int32_t *restrict pDst) Glue code for creating a strided 32-bit integers identity matrix in parallel. |
void | plp_mat_fill_I_stride_i32p_xpulpv2(void * args) Create a strided 32-bit integers identity matrix on XpulpV2 in parallel. |
void | plp_mat_fill_I_stride_i16(uint32_t N, uint32_t stride, int16_t *restrict pDst) Glue code for creating a strided 16-bit integers identity matrix. |
void | plp_mat_fill_I_stride_i16s_rv32im(uint32_t N, uint32_t stride, int16_t *restrict pDst) Create a strided 16-bit integers identity matrix on RV32IM. |
void | plp_mat_fill_I_stride_i16s_xpulpv2(uint32_t N, uint32_t stride, int16_t *restrict pDst) Create a strided 16-bit integers identity matrix on XpulpV2. |
void | plp_mat_fill_I_stride_i16_parallel(uint32_t N, uint32_t stride, uint32_t nPE, int16_t *restrict pDst) Glue code for creating a strided 16-bit integers identity matrix in parallel. |
void | plp_mat_fill_I_stride_i16p_xpulpv2(void * args) Create a strided 16-bit integers identity matrix on XpulpV2 in parallel. |
void | plp_mat_fill_I_stride_i8(uint32_t N, uint32_t stride, int8_t *restrict pDst) Glue code for creating a strided 8-bit integers identity matrix. |
void | plp_mat_fill_I_stride_i8s_rv32im(uint32_t N, uint32_t stride, int8_t *restrict pDst) Create a strided 8-bit integers identity matrix on RV32IM. |
void | plp_mat_fill_I_stride_i8s_xpulpv2(uint32_t N, uint32_t stride, int8_t *restrict pDst) Create a strided 8-bit integers identity matrix on XpulpV2. |
void | plp_mat_fill_I_stride_i8_parallel(uint32_t N, uint32_t stride, uint32_t nPE, int8_t *restrict pDst) Glue code for creating a strided 8-bit integers identity matrix in parallel. |
void | plp_mat_fill_I_stride_i8p_xpulpv2(void * args) Create a strided 8-bit integers identity matrix on XpulpV2 in parallel. |
void | plp_mat_fill_I_stride_f32(uint32_t N, uint32_t stride, float *restrict pDst) Glue code for creating a strided 32-bit floats identity matrix. |
void | plp_mat_fill_I_stride_f32s_xpulpv2(uint32_t N, uint32_t stride, float *restrict pDst) Create a strided 32-bit floats identity matrix on XpulpV2. |
void | plp_mat_fill_I_stride_f32_parallel(uint32_t N, uint32_t stride, uint32_t nPE, float *restrict pDst) Glue code for creating a strided 32-bit floats identity matrix in parallel. |
void | plp_mat_fill_I_stride_f32p_xpulpv2(void * args) Create a strided 32-bit floats identity matrix on XpulpV2 in parallel. |
void | plp_mat_fill_I_stride_q32(uint32_t N, uint32_t stride, int32_t fracBits, int32_t *restrict pDst) Glue code for creating a strided 32-bit fixed-point identity matrix. |
void | plp_mat_fill_I_stride_q32s_rv32im(uint32_t N, uint32_t stride, int32_t fracBits, int32_t *restrict pDst) Create a strided 32-bit fixed-point identity matrix on RV32IM. |
void | plp_mat_fill_I_stride_q32s_xpulpv2(uint32_t N, uint32_t stride, int32_t fracBits, int32_t *restrict pDst) Create a strided 32-bit fixed-point identity matrix on XpulpV2. |
void | plp_mat_fill_I_stride_q32_parallel(uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int32_t *restrict pDst) Glue code for creating a strided 32-bit fixed-point identity matrix in parallel. |
void | plp_mat_fill_I_stride_q32p_xpulpv2(void * args) Create a strided 32-bit fixed-point identity matrix on XpulpV2 in parallel. |
void | plp_mat_fill_I_stride_q16(uint32_t N, uint32_t stride, int32_t fracBits, int16_t *restrict pDst) Glue code for creating a strided 16-bit fixed-point identity matrix. |
void | plp_mat_fill_I_stride_q16s_rv32im(uint32_t N, uint32_t stride, int32_t fracBits, int16_t *restrict pDst) Create a strided 16-bit fixed-point identity matrix on RV32IM. |
void | plp_mat_fill_I_stride_q16s_xpulpv2(uint32_t N, uint32_t stride, int32_t fracBits, int16_t *restrict pDst) Create a strided 16-bit fixed-point identity matrix on XpulpV2. |
void | plp_mat_fill_I_stride_q16_parallel(uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int16_t *restrict pDst) Glue code for creating a strided 16-bit fixed-point identity matrix in parallel. |
void | plp_mat_fill_I_stride_q16p_xpulpv2(void * args) Create a strided 16-bit fixed-point identity matrix on XpulpV2 in parallel. |
void | plp_mat_fill_I_stride_q8(uint32_t N, uint32_t stride, int32_t fracBits, int8_t *restrict pDst) Glue code for creating a strided 8-bit fixed-point identity matrix. |
void | plp_mat_fill_I_stride_q8s_rv32im(uint32_t N, uint32_t stride, int32_t fracBits, int8_t *restrict pDst) Create a strided 8-bit fixed-point identity matrix on RV32IM. |
void | plp_mat_fill_I_stride_q8s_xpulpv2(uint32_t N, uint32_t stride, int32_t fracBits, int8_t *restrict pDst) Create a strided 8-bit fixed-point identity matrix on XpulpV2. |
void | plp_mat_fill_I_stride_q8_parallel(uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int8_t *restrict pDst) Glue code for creating a strided 8-bit fixed-point identity matrix in parallel. |
void | plp_mat_fill_I_stride_q8p_xpulpv2(void * args) Create a strided 8-bit fixed-point identity matrix on XpulpV2 in parallel. |
void | plp_mat_fill_stride_i32(uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *restrict pDst) Glue code for filling an MxN strided 32-bit integers matrix. |
void | plp_mat_fill_stride_i32s_rv32im(uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *restrict pDst) Fill an MxN strided 32-bit integers matrix on RV32IM. |
void | plp_mat_fill_stride_i32s_xpulpv2(uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *restrict pDst) Fill an MxN strided 32-bit integers matrix on XpulpV2. |
void | plp_mat_fill_stride_i32_parallel(uint32_t M, uint32_t N, uint32_t stride, int32_t value, uint32_t nPE, int32_t *restrict pDst) Glue code for filling an MxN strided 32-bit integers matrix in parallel. |
void | plp_mat_fill_stride_i32p_xpulpv2(void * args) Fill an MxN strided 32-bit integers matrix on XpulpV2 in parallel. |
void | plp_mat_fill_stride_i16(uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *restrict pDst) Glue code for filling an MxN strided 16-bit integers matrix. |
void | plp_mat_fill_stride_i16s_rv32im(uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *restrict pDst) Fill an MxN strided 16-bit integers matrix on RV32IM. |
void | plp_mat_fill_stride_i16s_xpulpv2(uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *restrict pDst) Fill an MxN strided 16-bit integers matrix on XpulpV2. |
void | plp_mat_fill_stride_i16_parallel(uint32_t M, uint32_t N, uint32_t stride, int16_t value, uint32_t nPE, int16_t *restrict pDst) Glue code for filling an MxN strided 16-bit integers matrix in parallel. |
void | plp_mat_fill_stride_i16p_xpulpv2(void * args) Fill an MxN strided 16-bit integers matrix on XpulpV2 in parallel. |
void | plp_mat_fill_stride_i8(uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *restrict pDst) Glue code for filling an MxN strided 8-bit integers matrix. |
void | plp_mat_fill_stride_i8s_rv32im(uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *restrict pDst) Fill an MxN strided 8-bit integers matrix on RV32IM. |
void | plp_mat_fill_stride_i8s_xpulpv2(uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *restrict pDst) Fill an MxN strided 8-bit integers matrix on XpulpV2. |
void | plp_mat_fill_stride_i8_parallel(uint32_t M, uint32_t N, uint32_t stride, int8_t value, uint32_t nPE, int8_t *restrict pDst) Glue code for filling an MxN strided 8-bit integers matrix in parallel. |
void | plp_mat_fill_stride_i8p_xpulpv2(void * args) Fill an MxN strided 8-bit integers matrix on XpulpV2 in parallel. |
void | plp_mat_fill_stride_f32(uint32_t M, uint32_t N, uint32_t stride, float value, float *restrict pDst) Glue code for filling an MxN strided 32-bit floats matrix. |
void | plp_mat_fill_stride_f32s_xpulpv2(uint32_t M, uint32_t N, uint32_t stride, float value, float *restrict pDst) Fill an MxN strided 32-bit floats matrix on XpulpV2. |
void | plp_mat_fill_stride_f32_parallel(uint32_t M, uint32_t N, uint32_t stride, float value, uint32_t nPE, float *restrict pDst) Glue code for filling an MxN strided 32-bit floats matrix in parallel. |
void | plp_mat_fill_stride_f32p_xpulpv2(void * args) Fill an MxN strided 32-bit floats matrix on XpulpV2 in parallel. |
void | plp_mat_copy_stride_i32(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t *restrict pDst) Glue code to copy an MxN strided 32-bit integers matrix. |
void | plp_mat_copy_stride_i32s_rv32im(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t *restrict pDst) Copy an MxN strided 32-bit integers matrix on RV32IM. |
void | plp_mat_copy_stride_i32s_xpulpv2(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int32_t *restrict pDst) Copy an MxN strided 32-bit integers matrix on XpulpV2. |
void | plp_mat_copy_stride_i32_parallel(const int32_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, uint32_t nPE, int32_t *restrict pDst) Glue code to copy an MxN strided 32-bit integers matrix in parallel. |
void | plp_mat_copy_stride_i32p_xpulpv2(void * args) Copy an MxN strided 32-bit integers matrix on XpulpV2 in parallel. |
void | plp_mat_copy_stride_i16(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t *restrict pDst) Glue code to copy an MxN strided 16-bit integers matrix. |
void | plp_mat_copy_stride_i16s_rv32im(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t *restrict pDst) Copy an MxN strided 16-bit integers matrix on RV32IM. |
void | plp_mat_copy_stride_i16s_xpulpv2(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int16_t *restrict pDst) Copy an MxN strided 16-bit integers matrix on XpulpV2. |
void | plp_mat_copy_stride_i16_parallel(const int16_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, uint32_t nPE, int16_t *restrict pDst) Glue code to copy an MxN strided 16-bit integers matrix in parallel. |
void | plp_mat_copy_stride_i16p_xpulpv2(void * args) Copy an MxN strided 16-bit integers matrix on XpulpV2 in parallel. |
void | plp_mat_copy_stride_i8(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t *restrict pDst) Glue code to copy an MxN strided 8-bit integers matrix. |
void | plp_mat_copy_stride_i8s_rv32im(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t *restrict pDst) Copy an MxN strided 8-bit integers matrix on RV32IM. |
void | plp_mat_copy_stride_i8s_xpulpv2(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, int8_t *restrict pDst) Copy an MxN strided 8-bit integers matrix on XpulpV2. |
void | plp_mat_copy_stride_i8_parallel(const int8_t *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, uint32_t nPE, int8_t *restrict pDst) Glue code to copy an MxN strided 8-bit integers matrix in parallel. |
void | plp_mat_copy_stride_i8p_xpulpv2(void * args) Copy an MxN strided 8-bit integers matrix on XpulpV2 in parallel. |
void | plp_mat_copy_stride_f32(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, float *restrict pDst) Glue code to copy an MxN strided 32-bit floats matrix. |
void | plp_mat_copy_stride_f32s_xpulpv2(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, float *restrict pDst) Copy an MxN strided 32-bit floats matrix on XpulpV2. |
void | plp_mat_copy_stride_f32_parallel(const float *restrict pSrc, uint32_t M, uint32_t N, uint32_t strideSrc, uint32_t strideDst, uint32_t nPE, float *restrict pDst) Glue code to copy an MxN strided 32-bit floats matrix in parallel. |
void | plp_mat_copy_stride_f32p_xpulpv2(void * args) Copy an MxN strided 32-bit floats matrix on XpulpV2 in parallel. |
void | plp_cmplx_conj_f32(const float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t numSamples) Glue code for complex conjugate of 32-bit float vectors. |
void | plp_cmplx_conj_f32_xpulpv2(const float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t numSamples) Floating-point complex conjugate. |
void | plp_cmplx_conj_i32(const int32_t restrict pSrc, int32_t restrict pDst, uint32_t numSamples) Glue code for complex conjugate of 32-bit integer vectors. |
void | plp_cmplx_conj_i32_xpulpv2(const int32_t restrict pSrc, int32_t restrict pDst, uint32_t numSamples) 32-bit integer complex conjugate. |
void | plp_cmplx_conj_i32_rv32im(const int32_t restrict pSrc, int32_t restrict pDst, uint32_t numSamples) 32-bit integer complex conjugate. |
void | plp_cmplx_conj_i16(const int16_t restrict pSrc, int16_t restrict pDst, uint32_t numSamples) Glue code for complex conjugate of 16-bit integer vectors. |
void | plp_cmplx_conj_i16_xpulpv2(const int16_t restrict pSrc, int16_t restrict pDst, uint32_t numSamples) 16-bit integer complex conjugate. |
void | plp_cmplx_conj_i16_rv32im(const int16_t restrict pSrc, int16_t restrict pDst, uint32_t numSamples) 16-bit integer complex conjugate. |
void | plp_cmplx_conj_i8(const int8_t restrict pSrc, int8_t restrict pDst, uint32_t numSamples) Glue code for complex conjugate of 8-bit integer vectors. |
void | plp_cmplx_conj_i8_xpulpv2(const int8_t restrict pSrc, int8_t restrict pDst, uint32_t numSamples) 8-bit integer complex conjugate. |
void | plp_cmplx_conj_i8_rv32im(const int8_t restrict pSrc, int8_t restrict pDst, uint32_t numSamples) 8-bit integer complex conjugate. |
void | plp_cmplx_dot_prod_f32(const float32_t * pSrcA, const float32_t * pSrcB, uint32_t numSamples, float32_t * realResult, float32_t * imagResult) Glue code for complex dot product of 32-bit float vectors. |
void | plp_cmplx_dot_prod_f32_xpulpv2(const float32_t * pSrcA, const float32_t * pSrcB, uint32_t numSamples, float32_t * realResult, float32_t * imagResult) Floating-point complex dot product. |
void | plp_cmplx_dot_prod_i32(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, int32_t * realResult, int32_t * imagResult) Glue code for complex dot product of 32-bit integer vectors. |
void | plp_cmplx_dot_prod_i32_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, int32_t * realResult, int32_t * imagResult) 32-bit integer complex dot product. |
void | plp_cmplx_dot_prod_i32_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, int32_t * realResult, int32_t * imagResult) 32-bit integer complex dot product. |
void | plp_cmplx_dot_prod_i16(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, int16_t * realResult, int16_t * imagResult) Glue code for complex dot product of 16-bit integer vectors. |
void | plp_cmplx_dot_prod_i16_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, int16_t * realResult, int16_t * imagResult) 16-bit integer complex dot product. |
void | plp_cmplx_dot_prod_i16_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, int16_t * realResult, int16_t * imagResult) 16-bit integer complex dot product. |
void | plp_cmplx_dot_prod_i8(const int8_t * pSrcA, const int8_t * pSrcB, uint32_t numSamples, int8_t * realResult, int8_t * imagResult) Glue code for complex dot product of 8-bit integer vectors. |
void | plp_cmplx_dot_prod_i8_xpulpv2(const int8_t * pSrcA, const int8_t * pSrcB, uint32_t numSamples, int8_t * realResult, int8_t * imagResult) 8-bit integer complex dot product. |
void | plp_cmplx_dot_prod_i8_rv32im(const int8_t * pSrcA, const int8_t * pSrcB, uint32_t numSamples, int8_t * realResult, int8_t * imagResult) 8-bit integer complex dot product. |
void | plp_cmplx_dot_prod_q32(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int32_t * realResult, int32_t * imagResult) Glue code for complex dot product of 32-bit fixed-point vectors. |
void | plp_cmplx_dot_prod_q32_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int32_t * realResult, int32_t * imagResult) 32-bit fixed-point complex dot product. |
void | plp_cmplx_dot_prod_q32_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int32_t * realResult, int32_t * imagResult) 32-bit fixed-point complex dot product. |
void | plp_cmplx_dot_prod_q16(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int16_t * realResult, int16_t * imagResult) Glue code for complex dot product of 16-bit fixed-point vectors. |
void | plp_cmplx_dot_prod_q16_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int16_t * realResult, int16_t * imagResult) 16-bit fixed-point complex dot product. |
void | plp_cmplx_dot_prod_q16_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, uint32_t numSamples, uint32_t deciPoint, int16_t * realResult, int16_t * imagResult) 16-bit fixed-point complex dot product. |
void | plp_cmplx_mult_real_f32(const float32_t *restrict pSrcCmplx, const float32_t *restrict pSrcReal, float32_t *restrict pDst, uint32_t numSamples) Glue code for complex multiplied with real of 32-bit float vectors. |
void | plp_cmplx_mult_real_f32_xpulpv2(const float32_t *restrict pSrcCmplx, const float32_t *restrict pSrcReal, float32_t *restrict pDst, uint32_t numSamples) Floating-point complex multiplied with real. |
void | plp_cmplx_mult_real_i32(const int32_t *restrict pSrcCmplx, const int32_t *restrict pSrcReal, int32_t *restrict pDst, uint32_t numSamples) Glue code for complex multiplied with real of 32-bit integer vectors. |
void | plp_cmplx_mult_real_i32_xpulpv2(const int32_t *restrict pSrcCmplx, const int32_t *restrict pSrcReal, int32_t *restrict pDst, uint32_t numSamples) 32-bit integer complex multiplied with real. |
void | plp_cmplx_mult_real_i32_rv32im(const int32_t *restrict pSrcCmplx, const int32_t *restrict pSrcReal, int32_t *restrict pDst, uint32_t numSamples) 32-bit integer complex multiplied with real. |
void | plp_cmplx_mult_real_i16(const int16_t *restrict pSrcCmplx, const int16_t *restrict pSrcReal, int16_t *restrict pDst, uint32_t numSamples) Glue code for complex multiplied with real of 16-bit integer vectors. |
void | plp_cmplx_mult_real_i16_xpulpv2(const int16_t *restrict pSrcCmplx, const int16_t *restrict pSrcReal, int16_t *restrict pDst, uint32_t numSamples) 16-bit integer complex multiplied with real. |
void | plp_cmplx_mult_real_i16_rv32im(const int16_t *restrict pSrcCmplx, const int16_t *restrict pSrcReal, int16_t *restrict pDst, uint32_t numSamples) 16-bit integer complex multiplied with real. |
void | plp_cmplx_mult_real_i8(const int8_t *restrict pSrcCmplx, const int8_t *restrict pSrcReal, int8_t *restrict pDst, uint32_t numSamples) Glue code for complex multiplied with real of 8-bit integer vectors. |
void | plp_cmplx_mult_real_i8_xpulpv2(const int8_t *restrict pSrcCmplx, const int8_t *restrict pSrcReal, int8_t *restrict pDst, uint32_t numSamples) 8-bit integer complex multiplied with real. |
void | plp_cmplx_mult_real_i8_rv32im(const int8_t *restrict pSrcCmplx, const int8_t *restrict pSrcReal, int8_t *restrict pDst, uint32_t numSamples) 8-bit integer complex multiplied with real. |
void | plp_cmplx_mult_real_q32(const int32_t *restrict pSrcCmplx, const int32_t *restrict pSrcReal, int32_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied with real of 32-bit fixed-point vectors. |
void | plp_cmplx_mult_real_q32_xpulpv2(const int32_t *restrict pSrcCmplx, const int32_t *restrict pSrcReal, int32_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex multiplied with real. |
void | plp_cmplx_mult_real_q32_rv32im(const int32_t *restrict pSrcCmplx, const int32_t *restrict pSrcReal, int32_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex multiplied with real. |
void | plp_cmplx_mult_real_q16(const int16_t *restrict pSrcCmplx, const int16_t *restrict pSrcReal, int16_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied with real of 16-bit fixed-point vectors. |
void | plp_cmplx_mult_real_q16_xpulpv2(const int16_t *restrict pSrcCmplx, const int16_t *restrict pSrcReal, int16_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex multiplied with real. |
void | plp_cmplx_mult_real_q16_rv32im(const int16_t *restrict pSrcCmplx, const int16_t *restrict pSrcReal, int16_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex multiplied with real. |
void | plp_cmplx_mult_real_q8(const int8_t *restrict pSrcCmplx, const int8_t *restrict pSrcReal, int8_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied with real of 8-bit fixed-point vectors. |
void | plp_cmplx_mult_real_q8_xpulpv2(const int8_t *restrict pSrcCmplx, const int8_t *restrict pSrcReal, int8_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex multiplied with real. |
void | plp_cmplx_mult_real_q8_rv32im(const int8_t *restrict pSrcCmplx, const int8_t *restrict pSrcReal, int8_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex multiplied with real. |
void | plp_cmplx_mag_squared_f32(const float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t numSamples) Glue code for complex squared magnitude of 32-bit float vectors. |
void | plp_cmplx_mag_squared_f32_xpulpv2(const float32_t *restrict pSrc, float32_t *restrict pDst, uint32_t numSamples) Floating-point complex squared magnitude. |
void | plp_cmplx_mag_squared_i16(const int16_t *restrict pSrc, int16_t *restrict pDst, uint32_t numSamples) Glue code for complex squared magnitude of 16-bit integer vectors. |
void | plp_cmplx_mag_squared_i16_rv32im(const int16_t *restrict pSrc, int16_t *restrict pDst, uint32_t numSamples) 16-bit integer complex squared magnitude. |
void | plp_cmplx_mag_squared_i16_xpulpv2(const int16_t *restrict pSrc, int16_t *restrict pDst, uint32_t numSamples) 16-bit integer complex squared magnitude. |
void | plp_cmplx_mag_squared_i32(const int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t numSamples) Glue code for complex squared magnitude of 32-bit integer vectors. |
void | plp_cmplx_mag_squared_i32_rv32im(const int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t numSamples) 32-bit integer complex squared magnitude. |
void | plp_cmplx_mag_squared_i32_xpulpv2(const int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t numSamples) 32-bit integer complex squared magnitude. |
void | plp_cmplx_mag_squared_i8_xpulpv2(const int8_t *restrict pSrc, int8_t *restrict pDst, uint32_t numSamples) 8-bit integer complex squared magnitude. |
void | plp_cmplx_mag_squared_i8(const int8_t *restrict pSrc, int8_t *restrict pDst, uint32_t numSamples) Glue code for complex squared magnitude of 8-bit integer vectors. |
void | plp_cmplx_mag_squared_i8_rv32im(const int8_t *restrict pSrc, int8_t *restrict pDst, uint32_t numSamples) 8-bit integer complex squared magnitude. |
void | plp_cmplx_mag_squared_q32(const int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex squared magnitude of 32-bit fixed-point vectors. |
void | plp_cmplx_mag_squared_q32_rv32im(const int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex squared magnitude. |
void | plp_cmplx_mag_squared_q32_xpulpv2(const int32_t *restrict pSrc, int32_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex squared magnitude. |
void | plp_cmplx_mag_squared_q16(const int16_t *restrict pSrc, int16_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex squared magnitude of 16-bit fixed-point vectors. |
void | plp_cmplx_mag_squared_q16_rv32im(const int16_t *restrict pSrc, int16_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex squared magnitude. |
void | plp_cmplx_mag_squared_q16_xpulpv2(const int16_t *restrict pSrc, int16_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex squared magnitude. |
void | plp_cmplx_mag_squared_q8(const int8_t *restrict pSrc, int8_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex squared magnitude of 8-bit fixed-point vectors. |
void | plp_cmplx_mag_squared_q8_rv32im(const int8_t *restrict pSrc, int8_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex squared magnitude. |
void | plp_cmplx_mag_squared_q8_xpulpv2(const int8_t *restrict pSrc, int8_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex squared magnitude. |
void | plp_cmplx_mult_cmplx_f32(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, float32_t *restrict pDst, uint32_t numSamples) Glue code for complex multiplied by complex of 32-bit float vectors. |
void | plp_cmplx_mult_cmplx_f32_xpulpv2(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, float32_t *restrict pDst, uint32_t numSamples) Floating-point complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_i32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, int32_t *restrict pDst, uint32_t numSamples) Glue code for complex multiplied by complex of 32-bit integer vectors. |
void | plp_cmplx_mult_cmplx_i32_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, int32_t *restrict pDst, uint32_t numSamples) 32-bit integer complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_i32_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, int32_t *restrict pDst, uint32_t numSamples) 32-bit integer complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_i16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, int16_t *restrict pDst, uint32_t numSamples) Glue code for complex multiplied by complex of 16-bit integer vectors. |
void | plp_cmplx_mult_cmplx_i16_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, int16_t *restrict pDst, uint32_t numSamples) 16-bit integer complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_i16_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, int16_t *restrict pDst, uint32_t numSamples) 16-bit integer complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_i8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, int8_t *restrict pDst, uint32_t numSamples) Glue code for complex multiplied by complex of 8-bit integer vectors. |
void | plp_cmplx_mult_cmplx_i8_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, int8_t *restrict pDst, uint32_t numSamples) 8-bit integer complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_i8_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, int8_t *restrict pDst, uint32_t numSamples) 8-bit integer complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_q32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, int32_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied by complex of 32-bit fixed-point vectors. |
void | plp_cmplx_mult_cmplx_q32_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, int32_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_q32_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, int32_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 32-bit fixed-point complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_q16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, int16_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied by complex of 16-bit fixed-point vectors. |
void | plp_cmplx_mult_cmplx_q16_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, int16_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_q16_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, int16_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 16-bit fixed-point complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_q8(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, int8_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) Glue code for complex multiplied by complex of 8-bit fixed-point vectors. |
void | plp_cmplx_mult_cmplx_q8_xpulpv2(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, int8_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex multiplied by complex. |
void | plp_cmplx_mult_cmplx_q8_rv32im(const int8_t *restrict pSrcA, const int8_t *restrict pSrcB, int8_t *restrict pDst, uint32_t deciPoint, uint32_t numSamples) 8-bit fixed-point complex multiplied by complex. |
void | plp_euclidean_distance_q32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, uint32_t nPE, uint32_t *restrict pRes) Glue code for parallel Euclidean distance of 32-bit fixed point vectors. |
void | plp_euclidean_distance_f32_parallel(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, uint32_t nPE, float32_t *restrict pRes) Glue code for parallel Euclidean distance between 32-bit float vectors. |
void | plp_euclidean_distance_q32p_xpulpv2(void * S) Parallel Euclidean distance with interleaved access of 32-bit fixed point vectors kernel for XPULPV2 extension. |
void | plp_euclidean_distance_f32p_xpulpv2(void * S) 32-bit floating-point parallel Euclidean distance between two vectors |
void | plp_euclidean_distance_q32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for Euclidean distance of 32-bit fixed point vectors. |
void | plp_euclidean_distance_q32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Euclidean distance of 32-bit fixed point vectors kernel for XPULPV2 extension. |
void | plp_euclidean_distance_q32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Euclidean distance of 32-bit fixed point vectors. |
void | plp_euclidean_distance_q16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint16_t blockSize, uint16_t fracBits, int32_t *restrict pRes) Glue code for Euclidean distance of 16-bit fixed point vectors. |
void | plp_euclidean_distance_q16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, uint32_t deciPoint, int32_t *restrict pRes) Euclidean distance of 16-bit fixed point vectors kernel for XPULPV2. |
void | plp_euclidean_distance_q16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Euclidean distance of 16-bit fixed point vectors kernel for RV32IM extension. |
void | plp_euclidean_distance_f32(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) Glue code for Euclidean distance between 32-bit float vectors. |
void | plp_euclidean_distance_f32s_xpulpv2(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) 32-bit floating point Euclidean distance between two vectors |
void | plp_euclidean_distance_f32s_rv32im(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) 32-bit floating point Euclidean distance between two vectors |
void | plp_cosine_distance_q32_parallel(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, uint32_t nPE, int32_t *restrict pRes) Glue code for parallel cosine distance between 32-bit fixed-precision vectors. |
void | plp_cosine_distance_f32_parallel(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, uint32_t nPE, float32_t *restrict pRes) Glue code for parallel cosine distance between 32-bit float vectors. |
void | plp_cosine_distance_f32p_xpulpv2(void * S) 32-bit floating-point parallel cosine distance between two vectors (computes power in parallel) |
void | plp_cosine_distance_f32(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) Glue code for cosine distance between 32-bit float vectors. |
void | plp_cosine_distance_f32s_rv32im(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) 32-bit floating point cosine distance between two vectors |
void | plp_cosine_distance_f32s_xpulpv2(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, float32_t *restrict pRes) 32-bit floating point cosine distance between two vectors |
void | plp_cosine_distance_q32(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Glue code for cosine distance of 32-bit fixed point vectors. |
void | plp_cosine_distance_q32s_rv32im(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Cosine distance of 32-bit fixed point vectors. |
void | plp_cosine_distance_q32s_xpulpv2(const int32_t *restrict pSrcA, const int32_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Cosine distance of 32-bit fixed point vectors kernel for XPULPV2 extension. |
void | plp_cosine_distance_q16(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint16_t blockSize, uint16_t fracBits, int32_t *restrict pRes) Glue code for cosine distance of 16-bit fixed point vectors. |
void | plp_cosine_distance_q16s_rv32im(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Cosine distance of 16-bit fixed point vectors kernel for RV32IM extension. |
void | plp_cosine_distance_q16s_xpulpv2(const int16_t *restrict pSrcA, const int16_t *restrict pSrcB, uint32_t blockSize, uint32_t fracBits, int32_t *restrict pRes) Cosine distance of 16-bit fixed point vectors kernel for XPULPV2. |
Defines
Name | |
---|---|
PLP_MATH_IBEX | |
PLP_MATH_LOOPUNROLL | |
PLP_DWT_DEC_LEN(SIG_LEN, WAVELET, LEVEL) | |
PLP_DWT_DEC_TEMP_LEN(SRC_LEN, WAVELET_LEN) | |
PLP_DWT_OUTPUT_LENGTH(SIG_LEN, WAVELET_LEN) | |
FAST_MATH_TABLE_SIZE Glue code for square root of a 32-bit floating point number. | |
FAST_MATH_Q32_SHIFT | |
FAST_MATH_Q16_SHIFT | |
CONTROLLER_Q32_SHIFT | |
TABLE_SPACING_Q32 | |
TABLE_SPACING_Q16 |
Detailed Description
Public header file for PULP DSP Library.
Version: V0
==========================================================================
@date 16. May 2019
Types Documentation
enum plp_dwt_wavelet_type
Enumerator | Value | Description |
---|---|---|
PLP_DWT_WAVELET_OTHER | ||
PLP_DWT_WAVELET_HAAR | ||
PLP_DWT_WAVELET_DB1 | ||
PLP_DWT_WAVELET_DB2 | ||
PLP_DWT_WAVELET_DB3 | ||
PLP_DWT_WAVELET_DB4 | ||
PLP_DWT_WAVELET_DB5 | ||
PLP_DWT_WAVELET_DB6 | ||
PLP_DWT_WAVELET_DB7 | ||
PLP_DWT_WAVELET_DB8 | ||
PLP_DWT_WAVELET_DB9 | ||
PLP_DWT_WAVELET_DB10 | ||
PLP_DWT_WAVELET_DB11 | ||
PLP_DWT_WAVELET_DB12 | ||
PLP_DWT_WAVELET_DB13 | ||
PLP_DWT_WAVELET_DB14 | ||
PLP_DWT_WAVELET_DB15 | ||
PLP_DWT_WAVELET_DB16 | ||
PLP_DWT_WAVELET_DB17 | ||
PLP_DWT_WAVELET_DB18 | ||
PLP_DWT_WAVELET_DB19 | ||
PLP_DWT_WAVELET_DB20 | ||
PLP_DWT_WAVELET_SYM2 | ||
PLP_DWT_WAVELET_SYM3 | ||
PLP_DWT_WAVELET_SYM4 | ||
PLP_DWT_WAVELET_SYM5 | ||
PLP_DWT_WAVELET_SYM6 | ||
PLP_DWT_WAVELET_SYM7 | ||
PLP_DWT_WAVELET_SYM8 | ||
PLP_DWT_WAVELET_SYM9 | ||
PLP_DWT_WAVELET_SYM10 | ||
PLP_DWT_WAVELET_SYM11 | ||
PLP_DWT_WAVELET_SYM12 | ||
PLP_DWT_WAVELET_SYM13 | ||
PLP_DWT_WAVELET_SYM14 | ||
PLP_DWT_WAVELET_SYM15 | ||
PLP_DWT_WAVELET_SYM16 | ||
PLP_DWT_WAVELET_SYM17 | ||
PLP_DWT_WAVELET_SYM18 | ||
PLP_DWT_WAVELET_SYM19 | ||
PLP_DWT_WAVELET_SYM20 | ||
PLP_DWT_WAVELET_COIF1 | ||
PLP_DWT_WAVELET_COIF2 | ||
PLP_DWT_WAVELET_COIF3 | ||
PLP_DWT_WAVELET_COIF4 | ||
PLP_DWT_WAVELET_COIF5 | ||
PLP_DWT_WAVELET_COIF6 | ||
PLP_DWT_WAVELET_COIF7 | ||
PLP_DWT_WAVELET_COIF8 | ||
PLP_DWT_WAVELET_COIF9 | ||
PLP_DWT_WAVELET_COIF10 | ||
PLP_DWT_WAVELET_COIF11 | ||
PLP_DWT_WAVELET_COIF12 | ||
PLP_DWT_WAVELET_COIF13 | ||
PLP_DWT_WAVELET_COIF14 | ||
PLP_DWT_WAVELET_COIF15 | ||
PLP_DWT_WAVELET_COIF16 | ||
PLP_DWT_WAVELET_COIF17 |
enum plp_dwt_extension_mode
Enumerator | Value | Description |
---|---|---|
PLP_DWT_MODE_ZERO | ||
PLP_DWT_MODE_CONSTANT | ||
PLP_DWT_MODE_SYMMETRIC | ||
PLP_DWT_MODE_REFLECT | ||
PLP_DWT_MODE_PERIODIC | ||
PLP_DWT_MODE_ANTISYMMETRIC | ||
PLP_DWT_MODE_ANTIREFLECT |
typedef float32_t
typedef float float32_t;
Functions Documentation
function plp_dwt_max_level
uint32_t plp_dwt_max_level(
uint32_t sig_len,
uint32_t wavelet_len
)
Computes maximum available decomposition level for a signal length and wavelet length.
Parameters:
- sig_len length of input signal
- wavelet_len wavelet length
Return: Maximal decomposition level
function plp_dwt_dec_len
uint32_t plp_dwt_dec_len(
uint32_t sig_len,
uint32_t wavelet_len,
uint32_t level
)
Calculates decomposition output length given a level.
Parameters:
- sig_len length of input signal
- wavelet_len wavelet length
- level decomposition level (0 for maximal decomposition)
Return: Length of decomposition output buffer
function plp_dot_prod_i32_parallel
void plp_dot_prod_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t nPE,
int32_t *__restrict__ pRes
)
Glue code for parallel dot product of 32-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- nPE number of parallel processing units
- pRes output result returned here
Return:
- none
function plp_dot_prod_q32_parallel
void plp_dot_prod_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
uint32_t nPE,
int32_t *__restrict__ pRes
)
Glue code for parallel dot product of 32-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- nPE number of parallel processing units
- pRes output result returned here
Return:
- none
function plp_dot_prod_f32_parallel
void plp_dot_prod_f32_parallel(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pRes
)
Glue code for parallel dot product of 32-bit float vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- nPE number of parallel processing units
- pRes output result returned here
Return:
- none
function plp_dot_prod_i32p_xpulpv2
void plp_dot_prod_i32p_xpulpv2(
void * S
)
Parallel dot product with interleaved access of 32-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- S points to the instance structure for integer parallel dot product
Return:
- none
function plp_dot_prod_q32p_xpulpv2
void plp_dot_prod_q32p_xpulpv2(
void * S
)
Parallel dot product with interleaved access of 32-bit fixed point vectors kernel for XPULPV2 extension.
Parameters:
- S points to the instance structure for fixed point parallel dot product
Return:
- none
function plp_dot_prod_f32p_xpulpv2
void plp_dot_prod_f32p_xpulpv2(
void * S
)
Parallel dot product with interleaved access of 32-bit float vectors kernel for XPULPV2 extension.
Parameters:
- S points to the instance structure for float parallel dot product
Return:
- none
function plp_dot_prod_i32
void plp_dot_prod_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Glue code for dot product of 32-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- pRes output result returned here
Return: none
function plp_dot_prod_i32s_rv32im
void plp_dot_prod_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Scalar dot product of 32-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- pRes output result returned here
Return: none
function plp_dot_prod_i32s_xpulpv2
void plp_dot_prod_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Scalar dot product of 32-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- pRes output result returned here
Return: none
function plp_dot_prod_q32
void plp_dot_prod_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes
)
Glue code for dot product of 32-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here
Return:
- none
function plp_dot_prod_q32s_rv32im
void plp_dot_prod_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes
)
Scalar dot product of 32-bit fixed point vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here
Return:
- none
function plp_dot_prod_q32s_xpulpv2
void plp_dot_prod_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes
)
Scalar dot product of 32-bit fixed point vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here
Return:
- none
- none
function plp_dot_prod_f32
void plp_dot_prod_f32(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes
)
Glue code for dot product of 32-bit float vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- pRes output result returned here
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- pRes output result returned here
Return:
- none
- none
function plp_dot_prod_f32s_xpulpv2
void plp_dot_prod_f32s_xpulpv2(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes
)
Dot product of 32-bit float vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- pRes output result returned here
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- pRes output result returned here
Return:
- none
- none
function plp_dot_prod_f32s_rv32im
void plp_dot_prod_f32s_rv32im(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes
)
Dot product of 32-bit float vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- pRes output result returned here
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- pRes output result returned here
Return:
- none
- none
function plp_dot_prod_i16
void plp_dot_prod_i16(
const int16_t * pSrcA,
const int16_t * pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Glue code for dot product of 16-bit integer vectors.
Parameters:
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- pRes output result returned here [32 bit]
Par: Exploiting SIMD instructions
When the ISA supports it, the 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.
function plp_dot_prod_i16s_rv32im
void plp_dot_prod_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Scalar dot product of 16-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- pRes output result returned here [32 bit]
Return:
- none
Par:
- Exploiting SIMD instructions
When the ISA supports it, the 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).
function plp_dot_prod_i16s_xpulpv2
void plp_dot_prod_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Vectorized dot product of 16-bit integer vectors kernel singlecore for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- pRes output result returned here [32 bit]
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- pRes output result returned here [32 bit]
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.
function plp_dot_prod_q16
void plp_dot_prod_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes
)
Glue code for dot product of 16-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
Return:
- none
- none
Par:
- Exploiting SIMD instructions
When the ISA supports it, the 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.
function plp_dot_prod_q16s_rv32im
void plp_dot_prod_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes
)
Scalar dot product of 16-bit fixed point vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
Return:
- none
- none
Par:
- Exploiting SIMD instructions
When the ISA supports it, the 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).
function plp_dot_prod_q16s_xpulpv2
void plp_dot_prod_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes
)
Vectorized dot product of 16-bit fixed point vectors singlecore kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.
function plp_dot_prod_i8
void plp_dot_prod_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Glue code for dot product of 8-bit integer vectors.
Parameters:
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- pRes output result returned here [32 bit]
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- pRes output result returned here [32 bit]
Return:
- none
- none
Par:
- Exploiting SIMD instructions
When the ISA supports it, the 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.
function plp_dot_prod_i8s_rv32im
void plp_dot_prod_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Scalar dot product of 8-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- pRes output result returned here [32 bit]
Return:
- none
Par:
- Exploiting SIMD instructions
When the ISA supports it, the 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).
function plp_dot_prod_i8s_xpulpv2
void plp_dot_prod_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Vectorized dot product of 8-bit integer vectors singlecore kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- pRes output result returned here [32 bit]
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- pRes output result returned here [32 bit]
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_dot_prod_q8
void plp_dot_prod_q8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes
)
Glue code for dot product of 8-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
Return:
- none
- none
Par:
- Exploiting SIMD instructions
When the ISA supports it, the 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.
function plp_dot_prod_q8s_rv32im
void plp_dot_prod_q8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes
)
Scalar dot product of 8-bit fixed point vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
Return:
- none
- none
Par:
- Exploiting SIMD instructions
When the ISA supports it, the 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).
function plp_dot_prod_q8s_xpulpv2
void plp_dot_prod_q8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes
)
Vectorized dot product of 8-bit fixed point vectors singlecore kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
- pSrcA points to the first input vector [8 bit]
- pSrcB points to the second input vector [8 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four by four into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_abs_i32
void plp_abs_i32(
const int32_t * pSrc,
int32_t * pDst,
uint32_t blockSize
)
Glue code for absolute value of 32-bit integer vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_abs_i32s_rv32im
void plp_abs_i32s_rv32im(
const int32_t * pSrc,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element absolute value of 32-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_abs_i32s_xpulpv2
void plp_abs_i32s_xpulpv2(
const int32_t * pSrc,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element absolute value of 32-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_abs_i16
void plp_abs_i16(
const int16_t * pSrc,
int16_t * pDst,
uint32_t blockSize
)
Glue code for absolute value of 16-bit integer vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_abs_i16s_rv32im
void plp_abs_i16s_rv32im(
const int16_t * pSrc,
int16_t * pDst,
uint32_t blockSize
)
Element-by-element absolute value of 16-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_abs_i16s_xpulpv2
void plp_abs_i16s_xpulpv2(
const int16_t * pSrc,
int16_t * pDst,
uint32_t blockSize
)
Element-by-element absolute value of 16-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_abs_i8
void plp_abs_i8(
const int8_t * pSrc,
int8_t * pDst,
uint32_t blockSize
)
Glue code for absolute value of 8-bit integer vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_abs_i8s_rv32im
void plp_abs_i8s_rv32im(
const int8_t * pSrc,
int8_t * pDst,
uint32_t blockSize
)
Element-by-element absolute value of 8-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_abs_i8s_xpulpv2
void plp_abs_i8s_xpulpv2(
const int8_t * pSrc,
int8_t * pDst,
uint32_t blockSize
)
Element-by-element absolute value of 8-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_add_i32
void plp_add_i32(
const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Glue code for element-by-element addition of 32-bit integer vectors.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_add_i32s_rv32im
void plp_add_i32s_rv32im(
const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element addition of 32-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_add_i32s_xpulpv2
void plp_add_i32s_xpulpv2(
const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element addition of 32-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_add_i16
void plp_add_i16(
const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Glue code for element-by-element addition of 16-bit integer vectors.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_add_i16s_rv32im
void plp_add_i16s_rv32im(
const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element addition of 16-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_add_i16s_xpulpv2
void plp_add_i16s_xpulpv2(
const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element addition of 16-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_add_i8
void plp_add_i8(
const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Glue code for element-by-element addition of 8-bit integer vectors.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_add_i8s_rv32im
void plp_add_i8s_rv32im(
const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element addition of 8-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_add_i8s_xpulpv2
void plp_add_i8s_xpulpv2(
const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element addition of 8-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_i32
void plp_mult_i32(
const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Glue code for element-by-element multiplication of 32-bit integer vectors.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_i32s_rv32im
void plp_mult_i32s_rv32im(
const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element multiplication of 32-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_i32s_xpulpv2
void plp_mult_i32s_xpulpv2(
const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element multiplication of 32-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_i16
void plp_mult_i16(
const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Glue code for element-by-element multiplication of 16-bit integer vectors.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_i16s_rv32im
void plp_mult_i16s_rv32im(
const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element multiplication of 16-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_i16s_xpulpv2
void plp_mult_i16s_xpulpv2(
const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element multiplication of 16-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_i8
void plp_mult_i8(
const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Glue code for element-by-element multiplication of 8-bit integer vectors.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_i8s_rv32im
void plp_mult_i8s_rv32im(
const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element multiplication of 8-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_i8s_xpulpv2
void plp_mult_i8s_xpulpv2(
const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Element-by-element multiplication of 8-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_f32
void plp_mult_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize
)
Glue code for element-by-element multiplication of 32-bit float vectors.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mult_f32s_xpulpv2
void plp_mult_f32s_xpulpv2(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize
)
Element-by-element multiplication of 32-bit float vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
Element-by-element multiplication of 32-bit float vectors kernel for XPULPV2 extension.
function plp_mult_f32_parallel
void plp_mult_f32_parallel(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pDst
)
Glue code for parallel element-by-element multiplication of 32-bit float vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- nPE number of parallel processing units
- pDst points to output vector
Return: none
function plp_mult_f32p_xpulpv2
void plp_mult_f32p_xpulpv2(
void * S
)
Parallel multiplication with interleaved access of 32-bit float vectors kernel for XPULPV2 extension.
Parameters:
- S points to the instance structure for float parallel multiplication
Return: none
function plp_log_f32_parallel
void plp_log_f32_parallel(
const float32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pDst
)
Glue code for parallel log of 32-bit float vectors.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in each vector
- nPE number of parallel processing units
- pDst points to output vector
Return: none
function plp_log_f32p_xpulpv2
void plp_log_f32p_xpulpv2(
void * S
)
Parallel log with interleaved access of 32-bit float vectors kernel for XPULPV2 extension.
Parameters:
- S points to the instance structure for float parallel log
Return: none
function plp_negate_i32
void plp_negate_i32(
const int32_t * pSrc,
int32_t * pDst,
uint32_t blockSize
)
Glue code for negating the elements of a vector of 32-bit integers.
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
function plp_negate_i32s_rv32im
void plp_negate_i32s_rv32im(
const int32_t * pSrc,
int32_t * pDst,
uint32_t blockSize
)
negate the elements of a vector for 32-bit integers on RV32IM
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
function plp_negate_i32s_xpulpv2
void plp_negate_i32s_xpulpv2(
const int32_t * pSrc,
int32_t * pDst,
uint32_t blockSize
)
negate the elements of a vector for 32-bit integers on XpulpV2
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
function plp_negate_i16
void plp_negate_i16(
const int16_t * pSrc,
int16_t * pDst,
uint32_t blockSize
)
Glue code for negating the elements of a vector of 16-bit integers.
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
function plp_negate_i16s_rv32im
void plp_negate_i16s_rv32im(
const int16_t * pSrc,
int16_t * pDst,
uint32_t blockSize
)
negate the elements of a vector for 16-bit integers on RV32IM
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
function plp_negate_i16s_xpulpv2
void plp_negate_i16s_xpulpv2(
const int16_t * pSrc,
int16_t * pDst,
uint32_t blockSize
)
negate the elements of a vector for 16-bit integers on XpulpV2
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two negations are performed on 32 bit vectors.
function plp_negate_i8
void plp_negate_i8(
const int8_t * pSrc,
int8_t * pDst,
uint32_t blockSize
)
Glue code for negating the elements of a vector of 8-bit integers.
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
function plp_negate_i8s_rv32im
void plp_negate_i8s_rv32im(
const int8_t * pSrc,
int8_t * pDst,
uint32_t blockSize
)
negate the elements of a vector for 8-bit integers on RV32IM
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
function plp_negate_i8s_xpulpv2
void plp_negate_i8s_xpulpv2(
const int8_t * pSrc,
int8_t * pDst,
uint32_t blockSize
)
negate the elements of a vector for 8-bit integers on XpulpV2
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four negations are performed on 32 bit vectors.
function plp_negate_f32
void plp_negate_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize
)
Glue code for negating the elements of a vector of 32-bit floats.
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
function plp_negate_f32s_xpulpv2
void plp_negate_f32s_xpulpv2(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize
)
negate the elements of a vector for 32-bit floats on XpulpV2
Parameters:
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
- pSrc points to input vector.
- pDst points to output vector.
- blockSize number of samples in each vector.
Return:
- none
- none
function plp_offset_i32
void plp_offset_i32(
const int32_t * pSrc,
int32_t offset,
int32_t * pDst,
uint32_t blockSize
)
Glue code for adding a constant offset to a vector of 32-bit integers.
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_offset_i32s_rv32im
void plp_offset_i32s_rv32im(
const int32_t * pSrc,
int32_t offset,
int32_t * pDst,
uint32_t blockSize
)
add a constant offset to a vector for 32-bit integers on RV32IM
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_offset_i32s_xpulpv2
void plp_offset_i32s_xpulpv2(
const int32_t * pSrc,
int32_t offset,
int32_t * pDst,
uint32_t blockSize
)
add a constant offset to a vector for 32-bit integers on XpulpV2
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_offset_i16
void plp_offset_i16(
const int16_t * pSrc,
int16_t offset,
int16_t * pDst,
uint32_t blockSize
)
Glue code for adding a constant offset to a vector of 16-bit integers.
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_offset_i16s_rv32im
void plp_offset_i16s_rv32im(
const int16_t * pSrc,
int16_t offset,
int16_t * pDst,
uint32_t blockSize
)
add a constant offset to a vector for 16-bit integers on RV32IM
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_offset_i16s_xpulpv2
void plp_offset_i16s_xpulpv2(
const int16_t * pSrc,
int16_t offset,
int16_t * pDst,
uint32_t blockSize
)
add a constant offset to a vector for 16-bit integers on XpulpV2
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two additions are performed on 32 bit vectors.
function plp_offset_i8
void plp_offset_i8(
const int8_t * pSrc,
int8_t offset,
int8_t * pDst,
uint32_t blockSize
)
Glue code for adding a constant offset to a vector of 8-bit integers.
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_offset_i8s_rv32im
void plp_offset_i8s_rv32im(
const int8_t * pSrc,
int8_t offset,
int8_t * pDst,
uint32_t blockSize
)
add a constant offset to a vector for 8-bit integers on RV32IM
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_offset_i8s_xpulpv2
void plp_offset_i8s_xpulpv2(
const int8_t * pSrc,
int8_t offset,
int8_t * pDst,
uint32_t blockSize
)
add a constant offset to a vector for 8-bit integers on XpulpV2
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four additions are performed on 32 bit vectors.
function plp_offset_f32
void plp_offset_f32(
const float32_t * pSrc,
float32_t offset,
float32_t * pDst,
uint32_t blockSize
)
Glue code for adding a constant offset to a vector of 32-bit floats.
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_offset_f32s_xpulpv2
void plp_offset_f32s_xpulpv2(
const float32_t * pSrc,
float32_t offset,
float32_t * pDst,
uint32_t blockSize
)
add a constant offset to a vector for 32-bit floats on XpulpV2
Parameters:
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- offset is the offset to be added
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_sub_i32
void plp_sub_i32(
const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Glue code for vector subtraction of 32-bit integers.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_sub_i32s_rv32im
void plp_sub_i32s_rv32im(
const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
vector subtraction for 32-bit integers on RV32IM
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_sub_i32s_xpulpv2
void plp_sub_i32s_xpulpv2(
const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
vector subtraction for 32-bit integers on XpulpV2
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_sub_i16
void plp_sub_i16(
const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Glue code for vector subtraction of 16-bit integers.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_sub_i16s_rv32im
void plp_sub_i16s_rv32im(
const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
vector subtraction for 16-bit integers on RV32IM
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_sub_i16s_xpulpv2
void plp_sub_i16s_xpulpv2(
const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
vector subtraction for 16-bit integers on XpulpV2
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two subtractions are performed on 32 bit vectors.
function plp_sub_i8
void plp_sub_i8(
const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
Glue code for vector subtraction of 8-bit integers.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_sub_i8s_rv32im
void plp_sub_i8s_rv32im(
const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
vector subtraction for 8-bit integers on RV32IM
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_sub_i8s_xpulpv2
void plp_sub_i8s_xpulpv2(
const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize
)
vector subtraction for 8-bit integers on XpulpV2
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four subtractions are performed on 32 bit vectors.
function plp_sub_f32
void plp_sub_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize
)
Glue code for vector subtraction of 32-bit floats.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_sub_f32s_xpulpv2
void plp_sub_f32s_xpulpv2(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize
)
vector subtraction for 32-bit floats on XpulpV2
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_scale_i32
void plp_scale_i32(
const int32_t *__restrict__ pSrc,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst,
uint32_t blockSize
)
Glue code for multiplying a vector of 32-bit integers by a scalar.
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_scale_i32s_rv32im
void plp_scale_i32s_rv32im(
const int32_t *__restrict__ pSrc,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst,
uint32_t blockSize
)
multiply a vector by a scalar for 32-bit integers on RV32IM
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_scale_i32s_xpulpv2
void plp_scale_i32s_xpulpv2(
const int32_t *__restrict__ pSrc,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst,
uint32_t blockSize
)
multiply a vector by a scalar for 32-bit integers on XpulpV2
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_scale_i16
void plp_scale_i16(
const int16_t *__restrict__ pSrc,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst,
uint32_t blockSize
)
Glue code for multiplying a vector of 16-bit integers by a scalar.
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_scale_i16s_rv32im
void plp_scale_i16s_rv32im(
const int16_t *__restrict__ pSrc,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst,
uint32_t blockSize
)
multiply a vector by a scalar for 16-bit integers on RV32IM
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_scale_i16s_xpulpv2
void plp_scale_i16s_xpulpv2(
const int16_t *__restrict__ pSrc,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst,
uint32_t blockSize
)
multiply a vector by a scalar for 16-bit integers on XpulpV2
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two multiplications are performed on 32 bit vectors.
function plp_scale_i8
void plp_scale_i8(
const int8_t *__restrict__ pSrc,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst,
uint32_t blockSize
)
Glue code for multiplying a vector of 8-bit integers by a scalar.
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_scale_i8s_rv32im
void plp_scale_i8s_rv32im(
const int8_t *__restrict__ pSrc,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst,
uint32_t blockSize
)
multiply a vector by a scalar for 8-bit integers on RV32IM
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_scale_i8s_xpulpv2
void plp_scale_i8s_xpulpv2(
const int8_t *__restrict__ pSrc,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst,
uint32_t blockSize
)
multiply a vector by a scalar for 8-bit integers on XpulpV2
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements before shifting
- shift number of bits to shift the result by
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four multiplications are performed on 32 bit vectors.
function plp_scale_f32
void plp_scale_f32(
const float32_t *__restrict__ pSrc,
float32_t scaleFactor,
float32_t *__restrict__ pDst,
uint32_t blockSize
)
Glue code for multiplying a vector of 32-bit floats by a scalar.
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_scale_f32s_xpulpv2
void plp_scale_f32s_xpulpv2(
const float32_t *__restrict__ pSrc,
float32_t scaleFactor,
float32_t *__restrict__ pDst,
uint32_t blockSize
)
multiply a vector by a scalar for 32-bit floats on XpulpV2
Parameters:
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements
- pDst points to the output vector
- blockSize number of samples in each vector
- pSrc points to the input vector
- scaleFactor Factor to multiply all elements
- pDst points to the output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_fill_i32
void plp_fill_i32(
int32_t value,
int32_t *__restrict__ pDst,
uint32_t blockSize
)
Glue code for filling a constant value into a 32-bit integer vector.
Parameters:
- value input value to be filled
- pDst points to output vector
- blockSize number of samples in each vector
- value input value to be filled
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_fill_i32s_rv32im
void plp_fill_i32s_rv32im(
int32_t value,
int32_t *__restrict__ pDst,
uint32_t blockSize
)
Fills a constant value into a 32-bit integer vector for RV32IM extension.
Parameters:
- value input value to be filled
- pDst points to output vector
- blockSize number of samples in each vector
- value input value to be filled
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_fill_i32s_xpulpv2
void plp_fill_i32s_xpulpv2(
int32_t value,
int32_t *__restrict__ pDst,
uint32_t blockSize
)
Fills a constant value into a 32-bit integer vector for XPULPV2 extension.
Parameters:
- value input value to be filled
- pDst points to output vector
- blockSize number of samples in each vector
- value input value to be filled
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_copy_i32
void plp_copy_i32(
int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t blockSize
)
Glue code for copying the elements of a 32-bit integer vector.
Parameters:
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_copy_i32s_rv32im
void plp_copy_i32s_rv32im(
int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t blockSize
)
Copies the elements of a 32-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_copy_i32s_xpulpv2
void plp_copy_i32s_xpulpv2(
int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t blockSize
)
Copies the elements of a 32-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_copy_f32
void plp_copy_f32(
float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t blockSize
)
Glue code for copying the elements of a 32-bit float vector.
Parameters:
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
Glue code for copying the elements of a 32-bit float vector.
function plp_copy_f32s_xpulpv2
void plp_copy_f32s_xpulpv2(
float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t blockSize
)
Copies the elements of a 32-bit float vector for XPULPV2 extension.
Parameters:
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_copy_f32s_rv32im
void plp_copy_f32s_rv32im(
float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t blockSize
)
Copies the elements of a 32-bit float vector for RV32IM extension.
Parameters:
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- blockSize number of samples in each vector
Return:
- none
- none
function plp_mean_f32
void plp_mean_f32(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Glue code for mean value of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
function plp_mean_f32s_xpulpv2
void plp_mean_f32s_xpulpv2(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Mean value of a 32-bit float vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
Glue code for mean value of a 32-bit float vector.
function plp_mean_i32
void plp_mean_i32(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Glue code for mean value of a 32-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
function plp_mean_i32s_rv32im
void plp_mean_i32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Mean value of a 32-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
function plp_mean_i32s_xpulpv2
void plp_mean_i32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Mean value of a 32-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
function plp_mean_i16
void plp_mean_i16(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes
)
Glue code for mean value of a 16-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
function plp_mean_i16s_rv32im
void plp_mean_i16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes
)
Mean value of a 16-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
function plp_mean_i16s_xpulpv2
void plp_mean_i16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes
)
Mean value of a 16-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
function plp_mean_i8
void plp_mean_i8(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes
)
Glue code for mean value of an 8-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
function plp_mean_i8s_rv32im
void plp_mean_i8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes
)
Mean value of an 8-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
function plp_mean_i8s_xpulpv2
void plp_mean_i8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes
)
Mean value of an 8-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes mean value returned here
Return:
- none
- none
function plp_max_f32
void plp_max_f32(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Glue code for max value of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
function plp_max_f32s_xpulpv2
void plp_max_f32s_xpulpv2(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Kernel for max value of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
Kernel for max value of a 32-bit float vector.
function plp_max_i32
void plp_max_i32(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Glue code for max value of a 32-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
function plp_max_i32s_rv32im
void plp_max_i32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Max value of a 32-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
function plp_max_i32s_xpulpv2
void plp_max_i32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Max value of a 32-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
function plp_max_i16
void plp_max_i16(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes
)
Glue code for max value of a 16-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
function plp_max_i16s_rv32im
void plp_max_i16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes
)
Max value of a 16-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
function plp_max_i16s_xpulpv2
void plp_max_i16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes
)
Max value of a 16-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
function plp_max_i8
void plp_max_i8(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes
)
Glue code for max value of an 8-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
function plp_max_i8s_rv32im
void plp_max_i8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes
)
Max value of an 8-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
function plp_max_i8s_xpulpv2
void plp_max_i8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes
)
Max value of an 8-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes max value returned here
Return:
- none
- none
function plp_min_f32
void plp_min_f32(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Glue code for min value of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
function plp_min_f32s_xpulpv2
void plp_min_f32s_xpulpv2(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Kernel for min value of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
Kernel for min value of a 32-bit float vector.
function plp_min_i32
void plp_min_i32(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Glue code for min value of a 32-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
function plp_min_i32s_rv32im
void plp_min_i32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Min value of a 32-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
function plp_min_i32s_xpulpv2
void plp_min_i32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Min value of a 32-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
function plp_min_i16
void plp_min_i16(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes
)
Glue code for min value of a 16-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
function plp_min_i16s_rv32im
void plp_min_i16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes
)
Min value of a 16-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
function plp_min_i16s_xpulpv2
void plp_min_i16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes
)
Min value of a 16-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
function plp_min_i8
void plp_min_i8(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes
)
Glue code for min value of an 8-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
function plp_min_i8s_rv32im
void plp_min_i8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes
)
Min value of an 8-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
function plp_min_i8s_xpulpv2
void plp_min_i8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes
)
Min value of an 8-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes min value returned here
Return:
- none
- none
function plp_power_f32_parallel
void plp_power_f32_parallel(
const float32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pRes
)
Glue code for parallel power of 32-bit floating point vectors.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- nPE number of parallel processing units
- pRes output result returned here
Return: none
function plp_power_f32p_xpulpv2
void plp_power_f32p_xpulpv2(
void * S
)
Parallel sum of squares of a 32-bit float vector for XPULPV2 extension.
Parameters:
- S points to the instance structure for floating-point parallel power
Return: none
function plp_power_f32
void plp_power_f32(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Glue code for Sum of squares of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
Glue code for Sum of squares of a 32-bit float vector.
function plp_power_f32s_xpulpv2
void plp_power_f32s_xpulpv2(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Kernel for Sum of squares of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
Kernel for Sum of squares of a 32-bit float vector.
function plp_power_f32s_rv32im
void plp_power_f32s_rv32im(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Sum of squares of a 32-bit float vector for RV32IM.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return: none
function plp_power_i32
void plp_power_i32(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Glue code for Sum of squares of a 32-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
Glue code for Sum of squares of a 32-bit integer vector.
function plp_power_i32s_rv32im
void plp_power_i32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Sum of squares of a 32-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_i32s_xpulpv2
void plp_power_i32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Sum of squares of a 32-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_i16
void plp_power_i16(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Glue code for Sum of squares of a 16-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
Glue code for Sum of squares of a 16-bit integer vector.
function plp_power_i16s_rv32im
void plp_power_i16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Sum of squares of a 16-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_i16s_xpulpv2
void plp_power_i16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Sum of squares of a 16-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_i8
void plp_power_i8(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Glue code for Sum of squares of an 8-bit integer vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
Glue code for Sum of squares of an 8-bit integer vector.
function plp_power_i8s_rv32im
void plp_power_i8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Sum of squares of an 8-bit integer vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_i8s_xpulpv2
void plp_power_i8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes
)
Sum of squares of an 8-bit integer vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_q32_parallel
void plp_power_q32_parallel(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
uint32_t nPE,
int32_t *__restrict__ pRes
)
Glue code for parallel power of 32-bit fixed point vectors.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- nPE number of parallel processing units
- pRes output result returned here
- pSrc points to the input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- nPE number of parallel processing units
- pRes output result returned here
Return:
- none
- none
Glue code for parallel power of 32-bit fixed point vectors.
function plp_power_q32p_xpulpv2
void plp_power_q32p_xpulpv2(
void * S
)
Parallel sum of squares of a 32-bit fixed-point vector for XPULPV2 extension.
Parameters:
- S points to the instance structure for fixed-point parallel power
- S points to the instance structure for fixed-point parallel power
Return:
- none
- none
Parallel sum of squares of a 32-bit fixed-point vector for XPULPV2 extension.
function plp_power_q32
void plp_power_q32(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for Sum of squares of a 32-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
Glue code for Sum of squares of a 32-bit fixed point vector.
function plp_power_q32s_rv32im
void plp_power_q32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Sum of squares of a 32-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_q32s_xpulpv2
void plp_power_q32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Sum of squares of a 32-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_q16
void plp_power_q16(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for Sum of squares of a 16-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
Glue code for Sum of squares of a 16-bit fixed point vector.
function plp_power_q16s_rv32im
void plp_power_q16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Sum of squares of a 16-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_q16s_xpulpv2
void plp_power_q16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Sum of squares of a 16-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_q8
void plp_power_q8(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for Sum of squares of an 8-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
Glue code for Sum of squares of an 8-bit fixed point vector.
function plp_power_q8s_rv32im
void plp_power_q8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Sum of squares of an 8-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_power_q8s_xpulpv2
void plp_power_q8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Sum of squares of an 8-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Sum of squares value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes sum of squares returned here
Return:
- none
- none
function plp_var_f32
void plp_var_f32(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Glue code for Statistical variance of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes variance value returned here
Return:
- none
- none
Glue code for Statistical variance of a 32-bit float vector.
function plp_var_f32s_xpulpv2
void plp_var_f32s_xpulpv2(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Kernel for Statistical variance of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes variance value returned here
Return:
- none
- none
Kernel for Statistical variance of a 32-bit float vector.
function plp_var_q32
void plp_var_q32(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for Statistical variance of a 32-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes variance value returned here
Return:
- none
- none
Glue code for Statistical variance of a 32-bit fixed point vector.
function plp_var_q32s_rv32im
void plp_var_q32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Statistical variance of a 32-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Variance returned here
Return:
- none
- none
Statistical variance of a 32-bit fixed point vector for RV32IM extension.
function plp_var_q32s_xpulpv2
void plp_var_q32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Statistical variance of a 32-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes variance returned here
Return:
- none
- none
Statistical variance of a 32-bit fixed point vector for XPULPV2 extension.
function plp_var_q16
void plp_var_q16(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes
)
Glue code for Statistical variance of a 16-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes variance value returned here
Return:
- none
- none
Glue code for Statistical variance of a 16-bit fixed point vector.
function plp_var_q16s_rv32im
void plp_var_q16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes
)
Statistical variance of a 16-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes variance returned here
Return:
- none
- none
Statistical variance of a 16-bit fixed point vector for RV32IM extension.
function plp_var_q16s_xpulpv2
void plp_var_q16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes
)
Statistical variance of a 16-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes variance returned here
Return:
- none
- none
Statistical variance of a 16-bit fixed point vector for XPULPV2 extension.
function plp_var_q8
void plp_var_q8(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes
)
Glue code for Statistical variance of an 8-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes variance value returned here
Return:
- none
- none
Glue code for Statistical variance of an 8-bit fixed point vector.
function plp_var_q8s_rv32im
void plp_var_q8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes
)
Statistical variance of an 8-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes variance returned here
Return:
- none
- none
Statistical variance of an 8-bit fixed point vector for RV32IM extension.
function plp_var_q8s_xpulpv2
void plp_var_q8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes
)
Statistical variance of an 8-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical variance value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes variance returned here
Return:
- none
- none
Statistical variance of an 8-bit fixed point vector for XPULPV2 extension.
function plp_std_f32
void plp_std_f32(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Glue code for Statistical standard deviation of a 32-bit floating point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical standard deviation returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation returned here
Return:
- none
- none
Glue code for Statistical standard deviation of a 32-bit floating point vector.
function plp_std_f32s_xpulpv2
void plp_std_f32s_xpulpv2(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Kernel for Statistical standard deviation of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult Statistical standard deviation returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation returned here
Return:
- none
- none
Kernel for Statistical standard deviation of a 32-bit float vector.
function plp_std_q32
void plp_std_q32(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for Statistical standard deviation of a 32-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult Statistical standard deviation returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation value returned here
Return:
- none
- none
Glue code for Statistical standard deviation of a 32-bit fixed point vector.
function plp_std_q32s_rv32im
void plp_std_q32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Statistical standard deviation of a 32-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult Statistical standard deviation value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation returned here
Return:
- none
- none
Statistical standard deviation of a 32-bit fixed point vector for RV32IM extension.
function plp_std_q32s_xpulpv2
void plp_std_q32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Statistical standard deviation of a 32-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical standard deviation returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation returned here
Return:
- none
- none
Statistical standard deviation of a 32-bit fixed point vector for XPULPV2 extension.
function plp_std_q16
void plp_std_q16(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes
)
Glue code for Statistical standard deviation of a 16-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult Statistical standard deviation returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation value returned here
Return:
- none
- none
Glue code for Statistical standard deviation of a 16-bit fixed point vector.
function plp_std_q16s_rv32im
void plp_std_q16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes
)
Statistical standard deviation of a 16-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult Statistical standard deviation returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation returned here
Return:
- none
- none
Statistical standard deviation of a 16-bit fixed point vector for RV32IM extension.
function plp_std_q16s_xpulpv2
void plp_std_q16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes
)
Statistical standard deviation of a 16-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical standard deviation returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation returned here
Return:
- none
- none
Statistical standard deviation of a 16-bit fixed point vector for XPULPV2 extension.
function plp_std_q8
void plp_std_q8(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes
)
Glue code for Statistical standard deviation of an 8-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult Statistical standard deviation returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation value returned here
Return:
- none
- none
Glue code for Statistical standard deviation of an 8-bit fixed point vector.
function plp_std_q8s_rv32im
void plp_std_q8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes
)
Statistical standard deviation of an 8-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult Statistical standard deviation returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation returned here
Return:
- none
- none
Statistical standard deviation of an 8-bit fixed point vector for RV32IM extension.
function plp_std_q8s_xpulpv2
void plp_std_q8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes
)
Statistical standard deviation of an 8-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes Statistical standard deviation value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes standard deviation returned here
Return:
- none
- none
Statistical standard deviation of an 8-bit fixed point vector for XPULPV2 extension.
function plp_rms_f32
void plp_rms_f32(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Glue code for RMS value of a 32-bit floating point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
Glue code for RMS value of a 32-bit floating point vector.
function plp_rms_f32s_xpulpv2
void plp_rms_f32s_xpulpv2(
const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes
)
Kernel for RMS value of a 32-bit float vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
Kernel for RMS value of a 32-bit float vector.
function plp_rms_q32
void plp_rms_q32(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for RMS value of a 32-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
Glue code for RMS value of a 32-bit fixed point vector.
function plp_rms_q32s_rv32im
void plp_rms_q32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
RMS value of a 32-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
RMS value of a 32-bit fixed point vector for RV32IM extension.
function plp_rms_q32s_xpulpv2
void plp_rms_q32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
RMS value of a 32-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
RMS value of a 32-bit fixed point vector for XPULPV2 extension.
function plp_rms_q16
void plp_rms_q16(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes
)
Glue code for RMS value of a 16-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
Glue code for RMS value of a 16-bit fixed point vector.
function plp_rms_q16s_rv32im
void plp_rms_q16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes
)
RMS value of a 16-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
RMS value of a 16-bit fixed point vector for RV32IM extension.
function plp_rms_q16s_xpulpv2
void plp_rms_q16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes
)
RMS value of a 16-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
RMS value of a 16-bit fixed point vector for XPULPV2 extension.
function plp_rms_q8
void plp_rms_q8(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes
)
Glue code for RMS value of an 8-bit fixed point vector.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
Glue code for RMS value of an 8-bit fixed point vector.
function plp_rms_q8s_rv32im
void plp_rms_q8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes
)
RMS value of an 8-bit fixed point vector for RV32IM extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pResult RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
RMS value of an 8-bit fixed point vector for RV32IM extension.
function plp_rms_q8s_xpulpv2
void plp_rms_q8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes
)
RMS value of an 8-bit fixed point vector for XPULPV2 extension.
Parameters:
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
- pSrc points to the input vector
- blockSize number of samples in input vector
- pRes RMS value returned here
Return:
- none
- none
RMS value of an 8-bit fixed point vector for XPULPV2 extension.
function plp_sqrt_q32
void plp_sqrt_q32(
const int32_t *__restrict__ pSrc,
const uint32_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for square root of a 32-bit fixed point number.
Parameters:
- in 32-Bit input integer
- out Square root of the input
- pSrc points to the input vector
- pRes Square root returned here
Return:
- none
- none
function plp_sqrt_q32s_rv32im
void plp_sqrt_q32s_rv32im(
const int32_t *__restrict__ pSrc,
const uint32_t fracBits,
int32_t *__restrict__ pRes
)
Square root of a 32-bit fixed point number for RV32IM extension.
Parameters:
- in 32-Bit input integer
- out Square root of the input
- pSrc points to the input vector
- pRes Square root returned here
Return:
- none
- none
Square root of a 32-bit fixed point number for RV32IM extension.
function plp_sqrt_q32s_xpulpv2
void plp_sqrt_q32s_xpulpv2(
const int32_t *__restrict__ pSrc,
const uint32_t fracBits,
int32_t *__restrict__ pRes
)
Square root of a 32-bit fixed point number for XPULPV2 extension.
Parameters:
- in 32-Bit input integer
- out Square root of the input
- pSrc points to the input vector
- pRes Square root returned here
Return:
- none
- none
function plp_sqrt_q16
void plp_sqrt_q16(
const int16_t *__restrict__ pSrc,
const uint32_t fracBits,
int16_t *__restrict__ pRes
)
Glue code for square root of a 16-bit fixed point number.
Parameters:
- in 16-Bit input integer
- out Square root of the input
- pSrc points to the input vector
- pRes Square root returned here
Return:
- none
- none
function plp_sqrt_q16s_rv32im
void plp_sqrt_q16s_rv32im(
const int16_t *__restrict__ pSrc,
const uint32_t fracBits,
int16_t *__restrict__ pRes
)
Square root of a 16-bit fixed point number for RV32IM extension.
Parameters:
- in 16-Bit input integer
- out Square root of the input
- pSrc points to the input vector
- pRes Square root returned here
Return:
- none
- none
Square root of a 16-bit fixed point number for RV32IM extension.
function plp_sqrt_q16s_xpulpv2
void plp_sqrt_q16s_xpulpv2(
const int16_t *__restrict__ pSrc,
const uint32_t fracBits,
int16_t *__restrict__ pRes
)
Square root of a 16-bit fixed point number for XPULPV2 extension.
Parameters:
- in 16-Bit input integer
- out Square root of the input
- pSrc points to the input vector
- pRes Square root returned here
Return:
- none
- none
function plp_sqrt_f32
void plp_sqrt_f32(
const float *__restrict__ pSrc,
float *__restrict__ pRes
)
Glue code for square root of a 32-bit floating point number.
Parameters:
- pSrc points to the input vector
- pRes Square root returned here
- pSrc points to the input vector
- pRes Square root returned here
Return:
- none
- none
function plp_sqrt_f32s_rv32im
void plp_sqrt_f32s_rv32im(
const float *__restrict__ pSrc,
float *__restrict__ pRes
)
Square root of a 32-bit floating point number for RV32IM.
Parameters:
- pSrc points to the input vector
- pRes Square root returned here
Return: none
function plp_sqrt_f32s_xpulpv2
void plp_sqrt_f32s_xpulpv2(
const float *__restrict__ pSrc,
float *__restrict__ pRes
)
Kernel for square root of a 32-bit floating point number.
Parameters:
- pSrc points to the input vector
- pRes Square root returned here
- pSrc points to the input vector
- pRes Square root returned here
Return:
- none
- none
Kernel for square root of a 32-bit floating point number.
function plp_cos_q32
int32_t plp_cos_q32(
int32_t x
)
Glue code for q32 cosine function.
Parameters:
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- cos(x)
- cos(x)
function plp_cos_q32s_rv32im
int32_t plp_cos_q32s_rv32im(
int32_t x
)
q32 cosine function for RV32IM
Parameters:
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- cos(x)
- cos(x)
function plp_cos_q32s_xpulpv2
int32_t plp_cos_q32s_xpulpv2(
int32_t x
)
q32 cosine function for XPULPV2
Parameters:
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- cos(x)
- cos(x)
function plp_cos_q16
int16_t plp_cos_q16(
int16_t x
)
Glue code for q16 cosine function.
Parameters:
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- cos(x)
- cos(x)
function plp_cos_q16s_rv32im
int16_t plp_cos_q16s_rv32im(
int16_t x
)
q16 cosine function for RV32IM
Parameters:
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- cos(x)
- cos(x)
function plp_cos_q16s_xpulpv2
int16_t plp_cos_q16s_xpulpv2(
int16_t x
)
q16 cosine function for XPULPV2
Parameters:
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- cos(x)
- cos(x)
function plp_cos_f32
float32_t plp_cos_f32(
float32_t x
)
Glue code for f32 cosine function.
Parameters:
- x input value in radians
- x input value in radians
Return:
- cos(x)
- cos(x)
function plp_cos_f32s_xpulpv2
float32_t plp_cos_f32s_xpulpv2(
float32_t x
)
F32 cosine function for XPULPV2.
Parameters:
- x input value in radians
- x input value in radians
Return:
- cos(x)
- cos(x)
F32 cosine function for XPULPV2.
function plp_sin_q32
int32_t plp_sin_q32(
int32_t x
)
Glue code for q32 sine function.
Parameters:
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- sin(x)
- sin(x)
function plp_sin_q32s_rv32im
int32_t plp_sin_q32s_rv32im(
int32_t x
)
q32 sine function for RV32IM
Parameters:
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- sin(x)
- sin(x)
function plp_sin_q32s_xpulpv2
int32_t plp_sin_q32s_xpulpv2(
int32_t x
)
q32 sine function for XPULPV2
Parameters:
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.31 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- sin(x)
- sin(x)
function plp_sin_q16
int16_t plp_sin_q16(
int16_t x
)
Glue code for q16 sine function.
Parameters:
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- sin(x)
- sin(x)
function plp_sin_q16s_rv32im
int16_t plp_sin_q16s_rv32im(
int16_t x
)
q16 sine function for RV32IM
Parameters:
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- sin(x)
- sin(x)
function plp_sin_q16s_xpulpv2
int16_t plp_sin_q16s_xpulpv2(
int16_t x
)
q16 sine function for XPULPV2
Parameters:
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
- x Scaled input value: Q1.15 value in range [0, +0.9999] and is mapped to [0, 2*PI)
Return:
- sin(x)
- sin(x)
function plp_sin_f32
float32_t plp_sin_f32(
float32_t x
)
Glue code for f32 sine function.
Parameters:
- x input value in radians
- x input value in radians
Return:
- sin(x)
- sin(x)
function plp_sin_f32s_xpulpv2
float32_t plp_sin_f32s_xpulpv2(
float32_t x
)
F32 sine function for XPULPV2.
Parameters:
- x input value in radians
- x input value in radians
Return:
- sin(x)
- sin(x)
function plp_correlate_i32
void plp_correlate_i32(
const int32_t * pSrcA,
const uint32_t srcALen,
const int32_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for correlation of 32-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
function plp_correlate_i32s_rv32im
void plp_correlate_i32s_rv32im(
const int32_t * pSrcA,
const uint32_t srcALen,
const int32_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Correlation of 32-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_correlate_i32s_xpulpv2
void plp_correlate_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const uint32_t srcALen,
const int32_t *__restrict__ pSrcB,
const uint32_t srcBLen,
int32_t *__restrict__ pRes
)
Correlation of 32-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
function plp_correlate_i16
void plp_correlate_i16(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for correlation of 16-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes result returned here
Return: none
function plp_correlate_i16s_xpulpv2
void plp_correlate_i16s_xpulpv2(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Correlation of 16-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_correlate_i16s_rv32im
void plp_correlate_i16s_rv32im(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Correlation of 16-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_correlate_i8
void plp_correlate_i8(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for correlation of 8-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
function plp_correlate_valid_i8
void plp_correlate_valid_i8(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for correlation (valid) of 8-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
function plp_correlate_i8s_xpulpv2
void plp_correlate_i8s_xpulpv2(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Correlation of 8-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_correlate_i8s_rv32im
void plp_correlate_i8s_rv32im(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Correlation of 8-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_correlate_q32
void plp_correlate_q32(
const int32_t * pSrcA,
const uint32_t srcALen,
const int32_t * pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t * pRes
)
Glue code for correlation of 32-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
function plp_correlate_q32s_rv32im
void plp_correlate_q32s_rv32im(
const int32_t * pSrcA,
const uint32_t srcALen,
const int32_t * pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t * pRes
)
Correlation of 32-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
Correlation of 32-bit integer vectors kernel for RV32IM extension.
function plp_correlate_q32s_xpulpv2
void plp_correlate_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const uint32_t srcALen,
const int32_t *__restrict__ pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *__restrict__ pRes
)
Correlation of 32-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
function plp_correlate_q16
void plp_correlate_q16(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t * pRes
)
Glue code for correlation of 16-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes result returned here
Return: none
function plp_correlate_q16s_xpulpv2
void plp_correlate_q16s_xpulpv2(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t * pRes
)
Correlation of 16-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
Correlation of 16-bit integer vectors kernel for XPULPV2 extension.
function plp_correlate_q16s_rv32im
void plp_correlate_q16s_rv32im(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t * pRes
)
Correlation of 16-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
Correlation of 16-bit integer vectors kernel for RV32IM extension.
function plp_correlate_q8
void plp_correlate_q8(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t * pRes
)
Glue code for correlation of 8-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
function plp_correlate_valid_q8
void plp_correlate_valid_q8(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t * pRes
)
Glue code for correlation (valid) of 8-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
function plp_correlate_q8s_xpulpv2
void plp_correlate_q8s_xpulpv2(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t * pRes
)
Correlation of 8-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
Correlation of 8-bit integer vectors kernel for XPULPV2 extension.
function plp_correlate_q8s_rv32im
void plp_correlate_q8s_rv32im(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t * pRes
)
Correlation of 8-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
Correlation of 8-bit integer vectors kernel for RV32IM extension.
function plp_conv_i32
void plp_conv_i32(
const int32_t * pSrcA,
const uint32_t srcALen,
const int32_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for convolution of 32-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes result returned here
Return: none
function plp_conv_valid_i32
void plp_conv_valid_i32(
const int32_t * pSrcA,
const uint32_t srcALen,
const int32_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for convolution (valid) of 32-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here, of size |srcALen - srcBLen| + 1
Return:
- none
- none
Glue code for convolution (valid) of 32-bit integer vectors.
function plp_conv_i32s_rv32im
void plp_conv_i32s_rv32im(
const int32_t * pSrcA,
const uint32_t srcALen,
const int32_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution of 32-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_conv_i32s_xpulpv2
void plp_conv_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const uint32_t srcALen,
const int32_t *__restrict__ pSrcB,
const uint32_t srcBLen,
int32_t *__restrict__ pRes
)
Convolution of 32-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
function plp_conv_valid_i32s_xpulpv2
void plp_conv_valid_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const uint32_t srcALen,
const int32_t *__restrict__ pSrcB,
const uint32_t srcBLen,
int32_t *__restrict__ pRes
)
Convolution (valid) of 32-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return: none
function plp_conv_i16
void plp_conv_i16(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for convolution of 16-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_conv_valid_i16
void plp_conv_valid_i16(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for convolution (valid) of 16-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here, of size |srcALen - srcBLen| + 1
Return:
- none
- none
Glue code for convolution (valid) of 16-bit integer vectors.
function plp_conv_valid_rep_i16
void plp_conv_valid_rep_i16(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for convolution (valid with replication) of 16-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector, must be on L2
- srcALen Length of the first input vector
- pSrcB points to the second input vector, must be on L2
- srcBLen Length of the second input vector
- pRes output result returned here, of size |srcALen - srcBLen| + 1, preferably in L1
Return:
- none
- none
Glue code for convolution (valid with replication) of 16-bit integer vectors.
function plp_conv_i16s_xpulpv2
void plp_conv_i16s_xpulpv2(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution of 16-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_conv_valid_i16s_xpulpv2
void plp_conv_valid_i16s_xpulpv2(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution (valid) of 16-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
Convolution (valid) of 16-bit integer vectors kernel for XPULPV2 extension.
function plp_conv_valid_rep_i16s_xpulpv2
void plp_conv_valid_rep_i16s_xpulpv2(
const int16_t * pSrcA,
const uint32_t srcALen,
const uint32_t srcAMem,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution (valid with data replication) of 16-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector of the replicated data
- srcALen Number of elements in (unreplicated) vector a
- srcAMem Number of elements between each replication
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector of the replicated data
- srcALen Number of elements in (unreplicated) vector a
- srcAMem Number of elements between each replication
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
Convolution (valid with data replication) of 16-bit integer vectors kernel for XPULPV2 extension.
function plp_conv_i16s_rv32im
void plp_conv_i16s_rv32im(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution of 16-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_conv_i8
void plp_conv_i8(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for convolution of 8-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_conv_valid_i8
void plp_conv_valid_i8(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for convolution (valid) of 8-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here, of size |srcALen - srcBLen| + 1
Return:
- none
- none
Glue code for convolution (valid) of 8-bit integer vectors.
function plp_conv_valid_rep_i8
void plp_conv_valid_rep_i8(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Glue code for convolution (valid with data replication) of 8-bit integer vectors.
Parameters:
- pSrcA points to the first input vector (in L2)
- srcALen Length of the first input vector
- pSrcB points to the second input vector (in L2)
- srcBLen Length of the second input vector
- pRes output result returned here (preferably in L1)
- pSrcA points to the first input vector, must be on L2
- srcALen Length of the first input vector
- pSrcB points to the second input vector, must be on L2
- srcBLen Length of the second input vector
- pRes output result returned here, of size |srcALen - srcBLen| + 1, preferably in L1
Return:
- none
- none
Glue code for convolution (valid with data replication) of 8-bit integer vectors.
function plp_conv_i8s_xpulpv2
void plp_conv_i8s_xpulpv2(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution of 8-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_conv_valid_i8s_xpulpv2
void plp_conv_valid_i8s_xpulpv2(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution (valid) of 8-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
Convolution (valid) of 8-bit integer vectors kernel for XPULPV2 extension.
function plp_conv_valid_rep_i8s_xpulpv2
void plp_conv_valid_rep_i8s_xpulpv2(
const int8_t * pSrcA,
const uint32_t srcALen,
const uint32_t srcAMem,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution (valid with data replication) of 8-bit integer vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector of the replicated data
- srcALen Number of elements in (unreplicated) vector a
- srcAMem Number of elements between each replication
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector of the replicated data
- srcALen Number of elements in (unreplicated) vector a
- srcAMem Number of elements between each replication
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
Convolution (valid with data replication) of 8-bit integer vectors kernel for XPULPV2 extension.
function plp_conv_i8s_rv32im
void plp_conv_i8s_rv32im(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
int32_t * pRes
)
Convolution of 8-bit integer vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- pRes output result returned here
Return:
- none
- none
function plp_conv_i32_parallel
void plp_conv_i32_parallel(
const int32_t * pSrcA,
const uint32_t srcALen,
const int32_t * pSrcB,
const uint32_t srcBLen,
const uint8_t nPE,
int32_t * pRes
)
Glue code for parallel convolution of 32-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- nPE Number of cores to compute on
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- nPE Number of cores to compute on
- pRes output result returned here
Return:
- none
- none
function plp_conv_i32p_xpulpv2
void plp_conv_i32p_xpulpv2(
void * task_args
)
Setup code for parallel convolution of 32-bit integer vectors.
Parameters:
- task_args pointer to plp_conv_instance_i32 struct initialized by plp_conv_i32_parallel
- task_args pointer to plp_conv_instance_i32 struct initialized by plp_conv_i32_parallel
Return:
- none
- none
Setup code for parallel convolution of 32-bit integer vectors.
function plp_conv_i16_parallel
void plp_conv_i16_parallel(
const int16_t * pSrcA,
const uint32_t srcALen,
const int16_t * pSrcB,
const uint32_t srcBLen,
const uint8_t nPE,
int32_t * pRes
)
Glue code for parallel convolution of 16-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- nPE Number of cores to compute on
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- nPE Number of cores to compute on
- pRes output result returned here
Return:
- none
- none
function plp_conv_i16p_xpulpv2
void plp_conv_i16p_xpulpv2(
void * task_args
)
Setup code for parallel convolution of 16-bit integer vectors.
Parameters:
- task_args pointer to plp_conv_instance_i16 struct initialized by plp_conv_i16_parallel
- task_args pointer to plp_conv_instance_i16 struct initialized by plp_conv_i16_parallel
Return:
- none
- none
Setup code for parallel convolution of 16-bit integer vectors.
function plp_conv_i8_parallel
void plp_conv_i8_parallel(
const int8_t * pSrcA,
const uint32_t srcALen,
const int8_t * pSrcB,
const uint32_t srcBLen,
const uint8_t nPE,
int32_t * pRes
)
Glue code for parallel convolution of 8-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- nPE Number of cores to compute on
- pRes output result returned here
- pSrcA points to the first input vector
- srcALen Length of the first input vector
- pSrcB points to the second input vector
- srcBLen Length of the second input vector
- nPE Number of cores to compute on
- pRes output result returned here
Return:
- none
- none
function plp_conv_i8p_xpulpv2
void plp_conv_i8p_xpulpv2(
void * task_args
)
Setup code for parallel convolution of 8-bit integer vectors.
Parameters:
- task_args pointer to plp_conv_instance_i8 struct initialized by plp_conv_i8_parallel
- task_args pointer to plp_conv_instance_i8 struct initialized by plp_conv_i8_parallel
Return:
- none
- none
Setup code for parallel convolution of 8-bit integer vectors.
function plp_conv_parallel_OLA
void plp_conv_parallel_OLA(
uint32_t nPE,
uint32_t srcALen,
uint32_t srcBLen,
int32_t * resultsBuffer
)
Helper function for parallelized overlap-adding of partial convolution results.
Parameters:
- nPE Number of processing cores
- srcALen Length of the first original input vector
- srcBLen Length of the second original input vector
- resultsBuffer resultsBuffer array from plp_conv_i[XX]_parallel
- nPE Number of processing cores
- srcALen Length of the first original input vector
- srcBLen Length of the second original input vector
- resultsBuffer resultsBuffer array from plp_conv_i[XX]_parallel
Return:
- none
- none
function plp_conv_parallel_OLA_kernel
void plp_conv_parallel_OLA_kernel(
void * task_args
)
Helper function for parallelized overlap-adding of partial convolution results.
Parameters:
- task_args Holds the plp_conv_tree_add_instance that describes the vector parameters
- task_args Holds the plp_conv_tree_add_instance that describes the vector parameters
Return:
- none
- none
function plp_mat_mult_i32
void plp_mat_mult_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code for matrix matrix multiplication of a 32-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Glue code for matrix matrix multiplication of a 32-bit integer matrices.
function plp_mat_mult_i32s_rv32im
void plp_mat_mult_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication of a 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Matrix matrix multiplication of a 32-bit integer matrices for RV32IM extension.
function plp_mat_mult_i32s_xpulpv2
void plp_mat_mult_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_i16
void plp_mat_mult_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code for matrix matrix multiplication of a 16-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Glue code for matrix matrix multiplication of a 16-bit integer matrices.
function plp_mat_mult_i16s_rv32im
void plp_mat_mult_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication of a 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Matrix matrix multiplication of a 16-bit integer matrices for RV32IM extension.
function plp_mat_mult_i16s_xpulpv2
void plp_mat_mult_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
Matrix matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_i8
void plp_mat_mult_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code for matrix matrix multiplication of a 8-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Glue code for matrix matrix multiplication of a 8-bit integer matrices.
function plp_mat_mult_i8s_rv32im
void plp_mat_mult_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication of a 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Matrix matrix multiplication of a 8-bit integer matrices for RV32IM extension.
function plp_mat_mult_i8s_xpulpv2
void plp_mat_mult_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
Return: none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_i32_parallel
void plp_mat_mult_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel matrix matrix multiplication of a 32-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel matrix matrix multiplication of a 32-bit integer matrices.
function plp_mat_mult_i32p_xpulpv2
void plp_mat_mult_i32p_xpulpv2(
void * args
)
Parallel matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_i32 struct initialized by plp_mat_mult_i32_parallel
- args pointer to plp_mat_mult_instance_i32 struct initialized by plp_mat_mult_i32_parallel
Return:
- none
- none
Parallel matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_i16_parallel
void plp_mat_mult_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel matrix matrix multiplication of a 16-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel matrix matrix multiplication of a 16-bit integer matrices.
function plp_mat_mult_i16p_xpulpv2
void plp_mat_mult_i16p_xpulpv2(
void * args
)
Parallel matrix multiplication of 16-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_i16 struct initialized by plp_mat_mult_i16_parallel
- args pointer to plp_mat_mult_instance_i16 struct initialized by plp_mat_mult_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_i8_parallel
void plp_mat_mult_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel matrix matrix multiplication of a 8-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel matrix matrix multiplication of a 8-bit integer matrices.
function plp_mat_mult_f32
void plp_mat_mult_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC
)
Glue code for matrix matrix multiplication of a 32-bit floating-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Glue code for matrix matrix multiplication of a 32-bit floating-point matrices.
function plp_mat_mult_f32s_xpulpv2
void plp_mat_mult_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC
)
Matrix matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Matrix matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_mult_f32_parallel
void plp_mat_mult_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
float *__restrict__ pDstC
)
Glue code for parallel matrix-matrix multiplication of 32-bit floating-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel matrix-matrix multiplication of 32-bit floating-point matrices.
function plp_mat_mult_f32p_xpulpv2
void plp_mat_mult_f32p_xpulpv2(
void * args
)
Parallel matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_f32 struct initialized by plp_mat_mult_f32_parallel
- args pointer to plp_mat_mult_instance_f32 struct initialized by plp_mat_mult_f32_parallel
Return:
- none
- none
function plp_mat_mult_i8p_xpulpv2
void plp_mat_mult_i8p_xpulpv2(
void * args
)
Parallel matrix multiplication of 8-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_i8 struct initialized by plp_mat_mult_i8_parallel
Return: none
Par: Exploiting SIMD instructions
The 8-bit values are packed in groups of four into 32-bit vectors, and the four dot products are then performed on these 32-bit vectors with a 32-bit accumulator.
function plp_mat_mult_q32
void plp_mat_mult_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Glue code for matrix-matrix multiplication of 32-bit fixed-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Glue code for matrix-matrix multiplication of 32-bit fixed-point matrices.
function plp_mat_mult_q32_parallel
void plp_mat_mult_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel matrix-matrix multiplication of 32-bit fixed-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Glue code for parallel matrix-matrix multiplication of 32-bit fixed-point matrices.
function plp_mat_mult_q32s_rv32im
void plp_mat_mult_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Matrix-matrix multiplication of 32-bit fixed-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Matrix-matrix multiplication of 32-bit fixed-point matrices for RV32IM extension.
function plp_mat_mult_q32s_xpulpv2
void plp_mat_mult_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Matrix-matrix multiplication of 32-bit fixed-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Matrix-matrix multiplication of 32-bit fixed-point matrices for XPULPV2 extension.
function plp_mat_mult_q32p_xpulpv2
void plp_mat_mult_q32p_xpulpv2(
void * args
)
Parallel matrix multiplication of 32-bit fixed-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_q32 struct initialized by plp_mat_mult_q32_parallel
- args pointer to plp_mat_mult_instance_q32 struct initialized by plp_mat_mult_q32_parallel
Return:
- none
- none
Par: Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel matrix multiplication of 32-bit fixed-point matrices kernel for XPULPV2 extension.
function plp_mat_mult_q16
void plp_mat_mult_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Glue code for matrix-matrix multiplication of 16-bit fixed-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for matrix-matrix multiplication of 16-bit fixed-point matrices.
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_q16_parallel
void plp_mat_mult_q16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC
)
Glue code for parallel matrix-matrix multiplication of 16-bit fixed-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for parallel matrix-matrix multiplication of 16-bit fixed-point matrices.
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_q16s_rv32im
void plp_mat_mult_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Matrix-matrix multiplication of 16-bit fixed-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
Matrix-matrix multiplication of 16-bit fixed-point matrices for RV32IM extension.
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_q16s_xpulpv2
void plp_mat_mult_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Matrix-matrix multiplication of 16-bit fixed-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
Matrix-matrix multiplication of 16-bit fixed-point matrices for XPULPV2 extension.
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_q16p_xpulpv2
void plp_mat_mult_q16p_xpulpv2(
void * args
)
Parallel matrix multiplication of 16-bit fixed-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_q16 struct initialized by plp_mat_mult_q16_parallel
- args pointer to plp_mat_mult_instance_q16 struct initialized by plp_mat_mult_q16_parallel
Return:
- none
- none
Par: Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel matrix multiplication of 16-bit fixed-point matrices kernel for XPULPV2 extension.
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_q8
void plp_mat_mult_q8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Glue code for matrix-matrix multiplication of 8-bit fixed-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for matrix-matrix multiplication of 8-bit fixed-point matrices.
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_q8_parallel
void plp_mat_mult_q8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC
)
Glue code for parallel matrix-matrix multiplication of 8-bit fixed-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for parallel matrix-matrix multiplication of 8-bit fixed-point matrices.
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_q8s_rv32im
void plp_mat_mult_q8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Matrix-matrix multiplication of 8-bit fixed-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Matrix-matrix multiplication of 8-bit fixed-point matrices for RV32IM extension.
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_q8s_xpulpv2
void plp_mat_mult_q8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Matrix-matrix multiplication of 8-bit fixed-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Matrix-matrix multiplication of 8-bit fixed-point matrices for XPULPV2 extension.
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_q8p_xpulpv2
void plp_mat_mult_q8p_xpulpv2(
void * args
)
Parallel matrix multiplication of 8-bit fixed-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_q8 struct initialized by plp_mat_mult_q8_parallel
- args pointer to plp_mat_mult_instance_q8 struct initialized by plp_mat_mult_q8_parallel
Return:
- none
- none
Par: Fixed-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel matrix multiplication of 8-bit fixed-point matrices kernel for XPULPV2 extension.
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_cmplx_i32
void plp_mat_mult_cmplx_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code of matrix matrix multiplication for complex 32-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_i32s_rv32im
void plp_mat_mult_cmplx_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 32-bit integers on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_i32s_xpulpv2
void plp_mat_mult_cmplx_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 32-bit integers on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_i32_parallel
void plp_mat_mult_cmplx_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel matrix matrix multiplication for complex 32-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_i32p_xpulpv2
void plp_mat_mult_cmplx_i32p_xpulpv2(
void * args
)
parallel matrix matrix multiplication for complex 32-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_i32 struct initialized by plp_mat_mult_cmplx_i32_parallel
- args pointer to plp_mat_mult_cmplx_instance_i32 struct initialized by plp_mat_mult_cmplx_i32_parallel
Return:
- none
- none
function plp_mat_mult_cmplx_i16
void plp_mat_mult_cmplx_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code of matrix matrix multiplication for complex 16-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_i16s_rv32im
void plp_mat_mult_cmplx_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 16-bit integers on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_i16s_xpulpv2
void plp_mat_mult_cmplx_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 16-bit integers on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_i16_parallel
void plp_mat_mult_cmplx_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel matrix matrix multiplication for complex 16-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_i16p_xpulpv2
void plp_mat_mult_cmplx_i16p_xpulpv2(
void * args
)
parallel matrix matrix multiplication for complex 16-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_i16 struct initialized by plp_mat_mult_cmplx_i16_parallel
- args pointer to plp_mat_mult_cmplx_instance_i16 struct initialized by plp_mat_mult_cmplx_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_i8
void plp_mat_mult_cmplx_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code of matrix matrix multiplication for complex 8-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_i8s_rv32im
void plp_mat_mult_cmplx_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 8-bit integers on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_i8s_xpulpv2
void plp_mat_mult_cmplx_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 8-bit integers on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_i8_parallel
void plp_mat_mult_cmplx_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel matrix matrix multiplication for complex 8-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_i8p_xpulpv2
void plp_mat_mult_cmplx_i8p_xpulpv2(
void * args
)
parallel matrix matrix multiplication for complex 8-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_i8 struct initialized by plp_mat_mult_cmplx_i8_parallel
- args pointer to plp_mat_mult_cmplx_instance_i8 struct initialized by plp_mat_mult_cmplx_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_f32
void plp_mat_mult_cmplx_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC
)
Glue code of matrix matrix multiplication for complex 32-bit floats.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_f32s_xpulpv2
void plp_mat_mult_cmplx_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC
)
Matrix matrix multiplication for complex 32-bit floats on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_f32_parallel
void plp_mat_mult_cmplx_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
float *__restrict__ pDstC
)
Glue code of parallel matrix matrix multiplication for complex 32-bit floats.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_f32p_xpulpv2
void plp_mat_mult_cmplx_f32p_xpulpv2(
void * args
)
parallel matrix matrix multiplication for complex 32-bit floats on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_f32 struct initialized by plp_mat_mult_cmplx_f32_parallel
- args pointer to plp_mat_mult_cmplx_instance_f32 struct initialized by plp_mat_mult_cmplx_f32_parallel
Return:
- none
- none
function plp_mat_mult_cmplx_q32
void plp_mat_mult_cmplx_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Glue code of matrix matrix multiplication for complex 32-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q32s_rv32im
void plp_mat_mult_cmplx_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 32-bit fix-point on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q32s_xpulpv2
void plp_mat_mult_cmplx_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 32-bit fix-point on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q32_parallel
void plp_mat_mult_cmplx_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel matrix matrix multiplication for complex 32-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q32p_xpulpv2
void plp_mat_mult_cmplx_q32p_xpulpv2(
void * args
)
parallel matrix matrix multiplication for complex 32-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_q32 struct initialized by plp_mat_mult_cmplx_q32_parallel
- args pointer to plp_mat_mult_cmplx_instance_q32 struct initialized by plp_mat_mult_cmplx_q32_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q16
void plp_mat_mult_cmplx_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Glue code of matrix matrix multiplication for complex 16-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q16s_rv32im
void plp_mat_mult_cmplx_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 16-bit fix-point on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q16s_xpulpv2
void plp_mat_mult_cmplx_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 16-bit fix-point on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_q16_parallel
void plp_mat_mult_cmplx_q16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC
)
Glue code of parallel matrix matrix multiplication for complex 16-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q16p_xpulpv2
void plp_mat_mult_cmplx_q16p_xpulpv2(
void * args
)
parallel matrix matrix multiplication for complex 16-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_q16 struct initialized by plp_mat_mult_cmplx_q16_parallel
- args pointer to plp_mat_mult_cmplx_instance_q16 struct initialized by plp_mat_mult_cmplx_q16_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_q8
void plp_mat_mult_cmplx_q8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Glue code of matrix matrix multiplication for complex 8-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q8s_rv32im
void plp_mat_mult_cmplx_q8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 8-bit fix-point on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q8s_xpulpv2
void plp_mat_mult_cmplx_q8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Matrix matrix multiplication for complex 8-bit fix-point on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q8_parallel
void plp_mat_mult_cmplx_q8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC
)
Glue code of parallel matrix matrix multiplication for complex 8-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_q8p_xpulpv2
void plp_mat_mult_cmplx_q8p_xpulpv2(
void * args
)
parallel matrix matrix multiplication for complex 8-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_q8 struct initialized by plp_mat_mult_cmplx_q8_parallel
- args pointer to plp_mat_mult_cmplx_instance_q8 struct initialized by plp_mat_mult_cmplx_q8_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_trans_i32
void plp_mat_mult_trans_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code for matrix transposed matrix multiplication of 32-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Glue code for matrix transposed matrix multiplication of 32-bit integer matrices.
function plp_mat_mult_trans_i32s_rv32im
void plp_mat_mult_trans_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix transposed matrix multiplication of 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Matrix transposed matrix multiplication of 32-bit integer matrices for RV32IM extension.
function plp_mat_mult_trans_i32s_xpulpv2
void plp_mat_mult_trans_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_trans_i16
void plp_mat_mult_trans_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code for matrix transposed matrix multiplication of 16-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Glue code for matrix transposed matrix multiplication of 16-bit integer matrices.
function plp_mat_mult_trans_i16s_rv32im
void plp_mat_mult_trans_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix transposed matrix multiplication of 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Matrix transposed matrix multiplication of 16-bit integer matrices for RV32IM extension.
function plp_mat_mult_trans_i16s_xpulpv2
void plp_mat_mult_trans_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix transposed matrix multiplication of 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
Matrix transposed matrix multiplication of 16-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_trans_i8
void plp_mat_mult_trans_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code for matrix transposed matrix multiplication of 8-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Glue code for matrix transposed matrix multiplication of 8-bit integer matrices.
function plp_mat_mult_trans_i8s_rv32im
void plp_mat_mult_trans_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix transposed matrix multiplication of 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Matrix transposed matrix multiplication of 8-bit integer matrices for RV32IM extension.
function plp_mat_mult_trans_i8s_xpulpv2
void plp_mat_mult_trans_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Matrix transposed matrix multiplication of 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
Matrix transposed matrix multiplication of 8-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_trans_i32_parallel
void plp_mat_mult_trans_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel matrix transposed matrix multiplication of 32-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel matrix transposed matrix multiplication of 32-bit integer matrices.
function plp_mat_mult_trans_i32p_xpulpv2
void plp_mat_mult_trans_i32p_xpulpv2(
void * args
)
Parallel matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_i32 struct initialized by plp_mat_mult_trans_i32_parallel
- args pointer to plp_mat_mult_instance_i32 struct initialized by plp_mat_mult_trans_i32_parallel
Return:
- none
- none
function plp_mat_mult_trans_i16_parallel
void plp_mat_mult_trans_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel matrix transposed matrix multiplication of 16-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel matrix transposed matrix multiplication of 16-bit integer matrices.
function plp_mat_mult_trans_i16p_xpulpv2
void plp_mat_mult_trans_i16p_xpulpv2(
void * args
)
Parallel matrix transposed matrix multiplication of 16-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_i16 struct initialized by plp_mat_mult_trans_i16_parallel
- args pointer to plp_mat_mult_instance_i16 struct initialized by plp_mat_mult_trans_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_trans_i8_parallel
void plp_mat_mult_trans_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel matrix transposed matrix multiplication of 8-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel matrix transposed matrix multiplication of 8-bit integer matrices.
function plp_mat_mult_trans_i8p_xpulpv2
void plp_mat_mult_trans_i8p_xpulpv2(
void * args
)
Parallel matrix transposed matrix multiplication of 8-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_i8 struct initialized by plp_mat_mult_trans_i8_parallel
- args pointer to plp_mat_mult_instance_i8 struct initialized by plp_mat_mult_trans_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_trans_q32
void plp_mat_mult_trans_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Glue code for matrix transposed matrix multiplication of 32-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Glue code for matrix transposed matrix multiplication of 32-bit fix-point matrices.
function plp_mat_mult_trans_q32_parallel
void plp_mat_mult_trans_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel matrix transposed matrix multiplication of 32-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Glue code for parallel matrix transposed matrix multiplication of 32-bit fix-point matrices.
function plp_mat_mult_trans_q32s_rv32im
void plp_mat_mult_trans_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Matrix transposed matrix multiplication of 32-bit fix-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Matrix transposed matrix multiplication of 32-bit fix-point matrices for RV32IM extension.
function plp_mat_mult_trans_q32s_xpulpv2
void plp_mat_mult_trans_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Matrix transposed matrix multiplication of 32-bit fix-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Matrix transposed matrix multiplication of 32-bit fix-point matrices for XPULPV2 extension.
function plp_mat_mult_trans_q32p_xpulpv2
void plp_mat_mult_trans_q32p_xpulpv2(
void * args
)
Parallel matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_q32 struct initialized by plp_mat_mult_trans_q32_parallel
- args pointer to plp_mat_mult_instance_q32 struct initialized by plp_mat_mult_trans_q32_parallel
Return:
- none
- none
Par: Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.
function plp_mat_mult_trans_q16
void plp_mat_mult_trans_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Glue code for matrix transposed matrix multiplication of 16-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.
Glue code for matrix transposed matrix multiplication of 16-bit fix-point matrices.
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_trans_q16_parallel
void plp_mat_mult_trans_q16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC
)
Glue code for parallel matrix transposed matrix multiplication of 16-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.
Glue code for parallel matrix transposed matrix multiplication of 16-bit fix-point matrices.
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_trans_q16s_rv32im
void plp_mat_mult_trans_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
matrix transposed matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
matrix transposed matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_q16s_xpulpv2
void plp_mat_mult_trans_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
matrix transposed matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
matrix transposed matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_q16p_xpulpv2
void plp_mat_mult_trans_q16p_xpulpv2(
void * args
)
Parallel matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_q16 struct initialized by plp_mat_mult_trans_q16_parallel
- args pointer to plp_mat_mult_instance_q16 struct initialized by plp_mat_mult_trans_q16_parallel
Return:
- none
- none
Par: Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_q8
void plp_mat_mult_trans_q8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Glue code for matrix transposed matrix multiplication of a 8-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for matrix transposed matrix multiplication of a 8-bit fix-point matrices.
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_q8_parallel
void plp_mat_mult_trans_q8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC
)
Glue code for parallel matrix transposed matrix multiplication of a 8-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for parallel matrix transposed matrix multiplication of a 8-bit fix-point matrices.
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_q8s_rv32im
void plp_mat_mult_trans_q8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
matrix transposed matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
matrix transposed matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_q8s_xpulpv2
void plp_mat_mult_trans_q8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
matrix transposed matrix multiplication of a 8-bit fix-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
matrix transposed matrix multiplication of a 8-bit fix-point matrices for XPULPV2 extension.
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_q8p_xpulpv2
void plp_mat_mult_trans_q8p_xpulpv2(
void * args
)
Parallel matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_q8 struct initialized by plp_mat_mult_trans_q8_parallel
- args pointer to plp_mat_mult_instance_q8 struct initialized by plp_mat_mult_trans_q8_parallel
Return:
- none
- none
Par: Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_f32
void plp_mat_mult_trans_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC
)
Glue code for matrix transposed matrix multiplication of a 32-bit floating-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
Glue code for matrix transposed matrix multiplication of a 32-bit floating-point matrices.
function plp_mat_mult_trans_f32s_xpulpv2
void plp_mat_mult_trans_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC
)
matrix transposed matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- pDstC points to the output matrix
Return:
- none
- none
matrix transposed matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_mult_trans_f32_parallel
void plp_mat_mult_trans_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
float *__restrict__ pDstC
)
Glue code for parallel matrix transposed matrix multiplication of a 32-bit floating-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix, stored transposed in memory
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel matrix transposed matrix multiplication of a 32-bit floating-point matrices.
function plp_mat_mult_trans_f32p_xpulpv2
void plp_mat_mult_trans_f32p_xpulpv2(
void * args
)
Parallel matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_instance_f32 struct initialized by plp_mat_mult_trans_f32_parallel
- args pointer to plp_mat_mult_instance_f32 struct initialized by plp_mat_mult_trans_f32_parallel
Return:
- none
- none
Parallel matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
function plp_mat_mult_trans_cmplx_i32
void plp_mat_mult_trans_cmplx_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code of matrix transpose matrix multiplication for complex 32-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_i32s_rv32im
void plp_mat_mult_trans_cmplx_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 32-bit integers on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
matrix transpose matrix multiplication for complex 32-bit integers on RV32IM
function plp_mat_mult_trans_cmplx_i32s_xpulpv2
void plp_mat_mult_trans_cmplx_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2
function plp_mat_mult_trans_cmplx_i32_parallel
void plp_mat_mult_trans_cmplx_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel matrix transpose matrix multiplication for complex 32-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_i32p_xpulpv2
void plp_mat_mult_trans_cmplx_i32p_xpulpv2(
void * args
)
parallel matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_i32 struct initialized by plp_mat_mult_trans_cmplx_i32_parallel
- args pointer to plp_mat_mult_cmplx_instance_i32 struct initialized by plp_mat_mult_trans_cmplx_i32_parallel
Return:
- none
- none
function plp_mat_mult_trans_cmplx_i16
void plp_mat_mult_trans_cmplx_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code of matrix transpose matrix multiplication for complex 16-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_i16s_rv32im
void plp_mat_mult_trans_cmplx_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 16-bit integers on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
matrix transpose matrix multiplication for complex 16-bit integers on RV32IM
function plp_mat_mult_trans_cmplx_i16s_xpulpv2
void plp_mat_mult_trans_cmplx_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2
function plp_mat_mult_trans_cmplx_i16_parallel
void plp_mat_mult_trans_cmplx_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel matrix transpose matrix multiplication for complex 16-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_i16p_xpulpv2
void plp_mat_mult_trans_cmplx_i16p_xpulpv2(
void * args
)
parallel matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_i16 struct initialized by plp_mat_mult_trans_cmplx_i16_parallel
- args pointer to plp_mat_mult_cmplx_instance_i16 struct initialized by plp_mat_mult_trans_cmplx_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_trans_cmplx_i8
void plp_mat_mult_trans_cmplx_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
Glue code of matrix transpose matrix multiplication for complex 8-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_i8s_rv32im
void plp_mat_mult_trans_cmplx_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 8-bit integers on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
matrix transpose matrix multiplication for complex 8-bit integers on RV32IM
function plp_mat_mult_trans_cmplx_i8s_xpulpv2
void plp_mat_mult_trans_cmplx_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2
function plp_mat_mult_trans_cmplx_i8_parallel
void plp_mat_mult_trans_cmplx_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel matrix transpose matrix multiplication for complex 8-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_i8p_xpulpv2
void plp_mat_mult_trans_cmplx_i8p_xpulpv2(
void * args
)
parallel matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_i8 struct initialized by plp_mat_mult_trans_cmplx_i8_parallel
- args pointer to plp_mat_mult_cmplx_instance_i8 struct initialized by plp_mat_mult_trans_cmplx_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_trans_cmplx_f32
void plp_mat_mult_trans_cmplx_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC
)
Glue code of matrix transpose matrix multiplication for complex 32-bit floats.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_f32s_xpulpv2
void plp_mat_mult_trans_cmplx_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2
function plp_mat_mult_trans_cmplx_f32_parallel
void plp_mat_mult_trans_cmplx_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
float *__restrict__ pDstC
)
Glue code of parallel matrix transpose matrix multiplication for complex 32-bit floats.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_f32p_xpulpv2
void plp_mat_mult_trans_cmplx_f32p_xpulpv2(
void * args
)
parallel matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_f32 struct initialized by plp_mat_mult_trans_cmplx_f32_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_f32 struct initialized by plp_mat_mult_trans_cmplx_f32_parallel
Return:
- none
- none
function plp_mat_mult_trans_cmplx_q32
void plp_mat_mult_trans_cmplx_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Glue code of matrix transpose matrix multiplication for complex 32-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_q32s_rv32im
void plp_mat_mult_trans_cmplx_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM
function plp_mat_mult_trans_cmplx_q32s_xpulpv2
void plp_mat_mult_trans_cmplx_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2
function plp_mat_mult_trans_cmplx_q32_parallel
void plp_mat_mult_trans_cmplx_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel matrix transpose matrix multiplication for complex 32-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_q32p_xpulpv2
void plp_mat_mult_trans_cmplx_q32p_xpulpv2(
void * args
)
parallel matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_q32 struct initialized by plp_mat_mult_trans_cmplx_q32_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_q32 struct initialized by plp_mat_mult_trans_cmplx_q32_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_q16
void plp_mat_mult_trans_cmplx_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Glue code of matrix transpose matrix multiplication for complex 16-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_q16s_rv32im
void plp_mat_mult_trans_cmplx_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM
function plp_mat_mult_trans_cmplx_q16s_xpulpv2
void plp_mat_mult_trans_cmplx_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2
function plp_mat_mult_trans_cmplx_q16_parallel
void plp_mat_mult_trans_cmplx_q16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC
)
Glue code of parallel matrix transpose matrix multiplication for complex 16-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_q16p_xpulpv2
void plp_mat_mult_trans_cmplx_q16p_xpulpv2(
void * args
)
parallel matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_q16 struct initialized by plp_mat_mult_trans_cmplx_q16_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_q16 struct initialized by plp_mat_mult_trans_cmplx_q16_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_trans_cmplx_q8
void plp_mat_mult_trans_cmplx_q8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Glue code of matrix transpose matrix multiplication for complex 8-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_q8s_rv32im
void plp_mat_mult_trans_cmplx_q8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM
function plp_mat_mult_trans_cmplx_q8s_xpulpv2
void plp_mat_mult_trans_cmplx_q8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC
)
matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2
function plp_mat_mult_trans_cmplx_q8_parallel
void plp_mat_mult_trans_cmplx_q8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC
)
Glue code of parallel matrix transpose matrix multiplication for complex 8-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_q8p_xpulpv2
void plp_mat_mult_trans_cmplx_q8p_xpulpv2(
void * args
)
parallel matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_instance_q8 struct initialized by plp_mat_mult_trans_cmplx_q8_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_q8 struct initialized by plp_mat_mult_trans_cmplx_q8_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_cmplx_mag_f32
void plp_cmplx_mag_f32(
const float32_t * pSrc,
float32_t * pRes,
uint32_t numSamples
)
Glue code for complex magnitude calculation in float32.
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_f32s_xpulpv2
void plp_cmplx_mag_f32s_xpulpv2(
const float32_t * pSrc,
float32_t * pRes,
uint32_t numSamples
)
complex magnitude for float32 on XPULPV2
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_q32
void plp_cmplx_mag_q32(
const int32_t * pSrc,
const uint32_t fracBits,
int32_t * pRes,
uint32_t numSamples
)
Glue code for complex magnitude calculation for 32 bit fixpoint.
Parameters:
- pSrc pointer to source
- fracBits fractional bits -> Q(32-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- fracBits fractional bits -> Q(32-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_q32s_rv32im
void plp_cmplx_mag_q32s_rv32im(
const int32_t * pSrc,
const uint32_t fracBits,
int32_t * pRes,
uint32_t numSamples
)
complex magnitude for q32 on RV32IM
Parameters:
- pSrc pointer to source
- fracBits fractional bits -> Q(32-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- fracBits fractional bits -> Q(32-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_q32s_xpulpv2
void plp_cmplx_mag_q32s_xpulpv2(
const int32_t * pSrc,
const uint32_t fracBits,
int32_t * pRes,
uint32_t numSamples
)
complex magnitude for q32 on XPULPV2
Parameters:
- pSrc pointer to source
- fracBits fractional bits -> Q(32-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- fracBits fractional bits -> Q(32-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_q8
void plp_cmplx_mag_q8(
const int8_t * pSrc,
const uint32_t fracBits,
int8_t * pRes,
uint32_t numSamples
)
Glue code for complex magnitude calculation for 8 bit fixpoint.
Parameters:
- pSrc pointer to source
- fracBits fractional bits -> Q(8-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- fracBits fractional bits -> Q(8-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_q8s_rv32im
void plp_cmplx_mag_q8s_rv32im(
const int8_t * pSrc,
const uint32_t fracBits,
int8_t * pRes,
uint32_t numSamples
)
complex magnitude for q8 on RV32IM
Parameters:
- pSrc pointer to source
- fracBits fractional bits -> Q(8-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- fracBits fractional bits -> Q(8-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_q8s_xpulpv2
void plp_cmplx_mag_q8s_xpulpv2(
const int8_t * pSrc,
const uint32_t fracBits,
int8_t * pRes,
uint32_t numSamples
)
complex magnitude for q8 on XPULPV2
Parameters:
- pSrc pointer to source
- fracBits fractional bits -> Q(8-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- fracBits fractional bits -> Q(8-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_i16
void plp_cmplx_mag_i16(
const int16_t * pSrc,
int16_t * pRes,
uint32_t numSamples
)
Glue code for complex magnitude calculation in 16-bit integer.
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_i16s_rv32im
void plp_cmplx_mag_i16s_rv32im(
const int16_t * pSrc,
int16_t * pRes,
uint32_t numSamples
)
complex magnitude for i16 on RV32IM
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_i16s_xpulpv2
void plp_cmplx_mag_i16s_xpulpv2(
const int16_t * pSrc,
int16_t * pRes,
uint32_t numSamples
)
complex magnitude for i16 on XPULPV2
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_i32
void plp_cmplx_mag_i32(
const int32_t * pSrc,
int32_t * pRes,
uint32_t numSamples
)
Glue code for complex magnitude calculation in 32-bit integer.
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_i32s_rv32im
void plp_cmplx_mag_i32s_rv32im(
const int32_t * pSrc,
int32_t * pRes,
uint32_t numSamples
)
complex magnitude for i32 on RV32IM
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_i32s_xpulpv2
void plp_cmplx_mag_i32s_xpulpv2(
const int32_t * pSrc,
int32_t * pRes,
uint32_t numSamples
)
complex magnitude for i32 on XPULPV2
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_i8
void plp_cmplx_mag_i8(
const int8_t * pSrc,
int8_t * pRes,
uint32_t numSamples
)
Glue code for complex magnitude calculation in 8-bit integer.
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_i8s_rv32im
void plp_cmplx_mag_i8s_rv32im(
const int8_t * pSrc,
int8_t * pRes,
uint32_t numSamples
)
complex magnitude for i8 on RV32IM
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_i8s_xpulpv2
void plp_cmplx_mag_i8s_xpulpv2(
const int8_t * pSrc,
int8_t * pRes,
uint32_t numSamples
)
complex magnitude for i8 on XPULPV2
Parameters:
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_q16
void plp_cmplx_mag_q16(
const int16_t * pSrc,
const uint32_t fracBits,
int16_t * pRes,
uint32_t numSamples
)
Glue code for complex magnitude calculation in 16-bit quantized integer.
Parameters:
- pSrc pointer to source
- fracBits fractional bits -> Q(16-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- fracBits fractional bits -> Q(16-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_q16s_rv32im
void plp_cmplx_mag_q16s_rv32im(
const int16_t * pSrc,
const uint32_t fracBits,
int16_t * pRes,
uint32_t numSamples
)
complex magnitude for q16 on RV32IM
Parameters:
- pSrc pointer to source
- fracBits fractional bits -> Q(16-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- fracBits fractional bits -> Q(16-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
function plp_cmplx_mag_q16s_xpulpv2
void plp_cmplx_mag_q16s_xpulpv2(
const int16_t * pSrc,
const uint32_t fracBits,
int16_t * pRes,
uint32_t numSamples
)
complex magnitude for q16 on XPULPV2
Parameters:
- pSrc pointer to source
- fracBits fractional bits -> Q(16-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
- pSrc pointer to source
- fracBits fractional bits -> Q(16-fracBits).fracBits
- pRes pointer to result
- numSamples The number of samples
function plp_bitreversal_16s_rv32im
void plp_bitreversal_16s_rv32im(
uint16_t * pSrc,
const uint16_t bitRevLen,
const uint16_t * pBitRevTab
)
In-place 16 bit reversal function for RV32IM.
Parameters:
- pSrc points to in-place buffer of unknown 16-bit data type
- bitRevLen bit reversal table length
- pBitRevTab points to bit reversal table
- pSrc points to in-place buffer of unknown 16-bit data type
- bitRevLen bit reversal table length
- pBitRevTab points to bit reversal table
Return:
- none
- none
In-place 16 bit reversal function for RV32IM.
function plp_bitreversal_16s_xpulpv2
void plp_bitreversal_16s_xpulpv2(
uint16_t * pSrc,
const uint16_t bitRevLen,
const uint16_t * pBitRevTab
)
In-place 16 bit reversal function for XPULPV2.
Parameters:
- pSrc points to in-place buffer of unknown 16-bit data type
- bitRevLen bit reversal table length
- pBitRevTab points to bit reversal table
- pSrc points to in-place buffer of unknown 16-bit data type
- bitRevLen bit reversal table length
- pBitRevTab points to bit reversal table
Return:
- none
- none
In-place 16 bit reversal function for XPULPV2.
function plp_bitreversal_16p_xpulpv2
void plp_bitreversal_16p_xpulpv2(
uint16_t * pSrc,
const uint16_t bitRevLen,
const uint16_t * pBitRevTab,
uint32_t nPE
)
In-place 16 bit reversal function.
Parameters:
- pSrc points to in-place buffer of unknown 16-bit data type
- bitRevLen bit reversal table length
- pBitRevTab points to bit reversal table
- nPE number of cores
Return: none
function plp_cfft_q16
void plp_cfft_q16(
const plp_cfft_instance_q16 * S,
int16_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t deciPoint
)
Glue code for quantized 16 bit complex fast fourier transform.
Parameters:
- S points to an instance of the 16bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- deciPoint decimal point for right shift
- S points to an instance of the 16bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- deciPoint decimal point for right shift
Fixed point units input -> output dependent on length: len=16: Q1.15 -> Q5.11 len=32: Q1.15 -> Q6.10 len=64: Q1.15 -> Q7.9 len=128: Q1.15 -> Q8.8 len=256: Q1.15 -> Q9.7 len=512: Q1.15 -> Q10.6 len=1024: Q1.15 -> Q11.5 len=2048: Q1.15 -> Q12.4 len=4096: Q1.15 -> Q13.3
function plp_cfft_q16_parallel
void plp_cfft_q16_parallel(
const plp_cfft_instance_q16 * S,
int16_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t deciPoint,
uint32_t nPE
)
Glue code for quantized 16 bit complex fast fourier transform.
Parameters:
- S points to an instance of the 16bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- deciPoint decimal point for right shift
- nPE Number of cores to use
- S points to an instance of the 16bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- deciPoint decimal point for right shift
- nPE Number of cores to use
Fixed point units input -> output dependent on length: len=16: Q1.15 -> Q5.11 len=32: Q1.15 -> Q6.10 len=64: Q1.15 -> Q7.9 len=128: Q1.15 -> Q8.8 len=256: Q1.15 -> Q9.7 len=512: Q1.15 -> Q10.6 len=1024: Q1.15 -> Q11.5 len=2048: Q1.15 -> Q12.4 len=4096: Q1.15 -> Q13.3
Fixed point units input -> output dependent on length: len=16: Q1.15 -> Q5.11 len=32: Q1.15 -> Q6.10 len=64: Q1.15 -> Q7.9 len=128: Q1.15 -> Q8.8 len=256: Q1.15 -> Q9.7 len=512: Q1.15 -> Q10.6 len=1024: Q1.15 -> Q11.5 len=2048: Q1.15 -> Q12.4 len=4096: Q1.15 -> Q13.3
function plp_cfft_q16s_rv32im
void plp_cfft_q16s_rv32im(
const plp_cfft_instance_q16 * S,
int16_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t deciPoint
)
Quantized 16 bit complex fast fourier transform for RV32IM.
Parameters:
- S points to an instance of the 16bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- deciPoint decimal point for right shift
function plp_cfft_q16s_xpulpv2
void plp_cfft_q16s_xpulpv2(
const plp_cfft_instance_q16 * S,
int16_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t deciPoint
)
Quantized 16 bit complex fast fourier transform for XPULPV2.
Parameters:
- S points to an instance of the 16bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- deciPoint decimal point for right shift
function plp_cfft_q16p_xpulpv2
void plp_cfft_q16p_xpulpv2(
void * args
)
Parallel quantized 16 bit complex fast fourier transform for XPULPV2.
Parameters:
- args points to the plp_cfft_instance_q16_parallel
function plp_bitreversal_32s_rv32im
void plp_bitreversal_32s_rv32im(
uint32_t * pSrc,
const uint16_t bitRevLen,
const uint16_t * pBitRevTab
)
In-place 32 bit reversal function for RV32IM.
Parameters:
- pSrc points to in-place buffer of unknown 32-bit data type
- bitRevLen bit reversal table length
- pBitRevTab points to bit reversal table
- pSrc points to in-place buffer of unknown 32-bit data type
- bitRevLen bit reversal table length
- pBitRevTab points to bit reversal table
Return:
- none
- none
In-place 32 bit reversal function for RV32IM.
function plp_bitreversal_32s_xpulpv2
void plp_bitreversal_32s_xpulpv2(
uint32_t * pSrc,
const uint16_t bitRevLen,
const uint16_t * pBitRevTab
)
In-place 32 bit reversal function for XPULPV2.
Parameters:
- pSrc points to in-place buffer of unknown 32-bit data type
- bitRevLen bit reversal table length
- pBitRevTab points to bit reversal table
Return: none
function plp_bitreversal_32p_xpulpv2
void plp_bitreversal_32p_xpulpv2(
uint32_t * pSrc,
const uint16_t bitRevLen,
const uint16_t * pBitRevTab,
uint32_t nPE
)
In-place 32 bit reversal function for XPULPV2.
Parameters:
- pSrc points to in-place buffer of unknown 32-bit data type
- bitRevLen bit reversal table length
- pBitRevTab points to bit reversal table
- nPE number of cores
Return: none
function plp_cfft_q32
void plp_cfft_q32(
const plp_cfft_instance_q32 * S,
int32_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t fracBits
)
Glue code for quantized 32-bit complex fast fourier transform.
Parameters:
- S points to an instance of the 32bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
- S points to an instance of the 32bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
Fixed point units input -> output dependent on length: len=16: Q1.31 -> Q5.27 len=32: Q1.31 -> Q6.26 len=64: Q1.31 -> Q7.25 len=128: Q1.31 -> Q8.24 len=256: Q1.31 -> Q9.23 len=512: Q1.31 -> Q10.22 len=1024: Q1.31 -> Q11.21 len=2048: Q1.31 -> Q12.20 len=4096: Q1.31 -> Q13.19
function plp_cfft_q32_parallel
void plp_cfft_q32_parallel(
const plp_cfft_instance_q32 * S,
int32_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t fracBits,
uint32_t nPE
)
Quantized 32-bit complex fast fourier transform for XPULPV2.
Parameters:
- S points to an instance of the 32bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
- nPE Number of cores to use
- S points to an instance of the 32bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
- nPE Number of cores to use
function plp_cfft_q32s_rv32im
void plp_cfft_q32s_rv32im(
const plp_cfft_instance_q32 * S,
int32_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t fracBits
)
Quantized 32-bit complex fast fourier transform for RV32IM.
Parameters:
- S points to an instance of the 32bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
- S points to an instance of the 32bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
function plp_cfft_q32s_xpulpv2
void plp_cfft_q32s_xpulpv2(
const plp_cfft_instance_q32 * S,
int32_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t fracBits
)
Quantized 32-bit complex fast fourier transform for XPULPV2.
Parameters:
- S points to an instance of the 32bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
- S points to an instance of the 32bit quantized CFFT structure
- p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- fracBits decimal point for right shift (input format Q(32-fracBits).fracBits)
function plp_cfft_q32p_xpulpv2
void plp_cfft_q32p_xpulpv2(
void * args
)
Parallel quantized 32 bit complex fast fourier transform for XPULPV2.
Parameters:
- args points to the plp_cfft_instance_q32_parallel
function plp_rfft_f32
void plp_rfft_f32(
const plp_fft_instance_f32 * S,
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst
)
Floating-point FFT on real input data.
Parameters:
- S points to an instance of the floating-point FFT structure
- pSrc points to the input buffer (real data)
- pDst points to the output buffer (complex data)
Return: none
function plp_rfft_f32_parallel
void plp_rfft_f32_parallel(
const plp_fft_instance_f32 * S,
const float32_t *__restrict__ pSrc,
const uint32_t nPE,
float32_t *__restrict__ pDst
)
Floating-point FFT on real input data (parallel version).
Parameters:
- S points to an instance of the floating-point FFT structure
- pSrc points to the input buffer (real data)
- nPE number of parallel processing units
- pDst points to the output buffer (complex data)
Return: none
function plp_rfft_f32s_xpulpv2
void plp_rfft_f32s_xpulpv2(
const plp_fft_instance_f32 * S,
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst
)
Floating-point FFT on real input data for XPULPV2 extension.
Parameters:
- S points to an instance of the floating-point FFT structure
- pSrc points to the input buffer (real data)
- pDst points to the output buffer (complex data)
- S points to an instance of the floating-point FFT structure
- pSrc points to the input buffer (real data)
- pDst points to the output buffer (complex data)
Return:
- none
- none
function plp_rfft_f32p_xpulpv2
void plp_rfft_f32p_xpulpv2(
void * arg
)
Floating-point FFT on real input data for XPULPV2 extension (parallel version).
Parameters:
- arg points to an instance of the floating-point FFT structure
- arg points to an instance of the floating-point FFT structure
Return:
- none
- none
Floating-point FFT on real input data for XPULPV2 extension (parallel version).
function plp_rfftfast_f32
void plp_rfftfast_f32(
const plp_fft_fast_instance_f32 * S,
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst
)
Floating-point FFT on real input data.
Parameters:
- S points to an instance of the floating-point FFT structure
- pSrc points to the input buffer (real data)
- pDst points to the output buffer (complex data)
Return: none
function plp_rfftfast_f32_parallel
void plp_rfftfast_f32_parallel(
const plp_fft_fast_instance_f32 * S,
float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
const uint32_t nPE
)
Floating-point parallel FFT on real input data.
Parameters:
- S points to an instance of the floating-point FFT structure
- pSrc points to the input buffer (real data)
- pDst points to the output buffer (complex data)
- S points to an instance of the floating-point FFT structure
- pSrc points to the input buffer (real data)
- pDst points to the output buffer (complex data)
Return:
- none
- none
Floating-point parallel FFT on real input data.
function plp_rfftfast_f32s_xpulpv2
void plp_rfftfast_f32s_xpulpv2(
const plp_fft_fast_instance_f32 * S,
float32_t * pSrc,
float32_t * pDst
)
Floating-point FFT on real input data for XPULPV2 extension.
Parameters:
- S points to an instance of the floating-point FFT structure
- pSrc points to the input buffer (real data)
- pDst points to the output buffer (complex data)
Return: none
function plp_rfftfast_f32p_xpulpv2
void plp_rfftfast_f32p_xpulpv2(
void * arg
)
Floating-point parallel FFT on real input data for XPULPV2 extension.
Parameters:
- arg points to an instance of the floating-point FFT structure
Return: none
function plp_cfft_f32
void plp_cfft_f32(
const plp_cfft_instance_f32 * S,
float32_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag
)
Floating-point FFT on complex input data.
Parameters:
- S points to an instance of the floating-point FFT structure
- pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
Return: none
function plp_cfft_f32_parallel
void plp_cfft_f32_parallel(
const plp_cfft_instance_f32 * S,
const float32_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
const uint32_t nPE
)
Floating-point FFT on complex input data (parallel version).
Parameters:
- S points to an instance of the floating-point FFT structure
- pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
- nPE number of parallel processing units
Return: none
function plp_cfft_f32s_xpulpv2
void plp_cfft_f32s_xpulpv2(
const plp_cfft_instance_f32 * S,
const float32_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag
)
Floating-point FFT on complex input data for XPULPV2 extension.
Parameters:
- S points to an instance of the floating-point FFT structure
- pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
- bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
Return: none
function plp_cfft_f32p_xpulpv2
void plp_cfft_f32p_xpulpv2(
void * arg
)
Floating-point FFT on complex input data for XPULPV2 extension (parallel version).
Parameters:
- arg points to an instance of the floating-point FFT structure
- arg points to an instance of the floating-point FFT structure
Return:
- none
- none
Floating-point FFT on complex input data for XPULPV2 extension (parallel version).
function plp_dct2_f32
void plp_dct2_f32(
const plp_fft_instance_f32 * S,
const Complex_type_f32 * pShift,
const uint8_t orthoNorm,
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pBuf,
float32_t *__restrict__ pDst
)
Floating-point DCT on real input data. Implementation of John Makhoul's "A Fast Cosine Transform in One and Two Dimensions" 1980 IEEE paper.
Parameters:
- S points to an instance of the floating-point FFT structure with FFTLength = DCTLength
- pShift points to twiddle coefficient table of 4*FFTLength, of which only the first quarter is necessary.
- pSrc points to the input buffer (real data) of size FFTLength
- pBuf points to buffer of size 2*FFTLength, used for computation.
- pDst points to output buffer (real data) of size FFTLength, may be the same as pSrc.
- S points to an instance of the floating-point FFT structure with FFTLength = DCTLength
- pShift points to twiddle coefficient table of 4*FFTLength, of which only the first quadrant of the complex unit circle is used. For example, if S contains twiddleCoef_rfft_32, pShift can be set to twiddleCoef_rfft_128.
- pSrc points to the input buffer (real data) of size FFTLength.
- pBuf points to buffer of size 2*FFTLength, used for computation.
- pDst points to output buffer (real data) of size FFTLength, may be the same as pSrc.
Return:
- none
- none
function plp_dct2_f32_parallel
void plp_dct2_f32_parallel(
const plp_fft_instance_f32 * S,
const Complex_type_f32 * pShift,
const uint8_t orthoNorm,
const float32_t *__restrict__ pSrc,
const uint32_t nPE,
float32_t *__restrict__ pBuf,
float32_t *__restrict__ pDst
)
Floating-point DCT on real input data. Implementation of John Makhoul's "A Fast Cosine Transform in One and Two Dimensions" 1980 IEEE paper.
Parameters:
- S points to an instance of the floating-point FFT structure with FFTLength = DCTLength
- pShift points to twiddle coefficient table of 4*FFTLength, of which only the first quarter is necessary.
- pSrc points to the input buffer (real data) of size FFTLength
- nPE number of parallel processing units
- pBuf points to buffer of size 2*FFTLength, used for computation.
- pDst points to output buffer (real data) of size FFTLength, may be the same as pSrc.
- S points to an instance of the floating-point FFT structure with FFTLength = DCTLength
- pShift points to twiddle coefficient table of 4*FFTLength, of which only the first quadrant of the complex unit circle is used. For example, if S contains twiddleCoef_rfft_32, pShift can be set to twiddleCoef_rfft_128.
- pSrc points to the input buffer (real data) of size FFTLength.
- pBuf points to buffer of size 2*FFTLength, used for computation.
- pDst points to output buffer (real data) of size FFTLength, may be the same as pSrc.
Return:
- none
- none
Floating-point DCT on real input data. Implementation of John Makhoul's "A Fast Cosine Transform in One and Two Dimensions" 1980 IEEE paper.
function plp_mfcc_f32
void plp_mfcc_f32(
const plp_fft_instance_f32 * SFFT,
const plp_fft_instance_f32 * SDCT,
const Complex_type_f32 * pShift,
const plp_triangular_filter_f32 * filterBank,
const float32_t * window,
const uint8_t * orthoNorm,
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst
)
MFCC on real input data.
Parameters:
- SFFT points to an instance of the floating-point FFT structure for the initial FFT (with FFTLength = n_fft). bitReverseFlag should be on.
- SDCT points to an instance of the floating-point FFT structure for the DCT (with FFTLength = n_mels). bitReverseFlag should be on.
- pShift points to twiddle coefficient table with FFTLength = 4*n_mels. Only first quarter necessary.
- filterBank points to plp_triangular_filter_f32 instance with nFilters = n_mels.
- window vector to use for windowing
- orthoNorm whether to use dct orthonormalisation or not
- pSrc points to the input buffer (real data, size n_fft)
- pDst points to the output buffer of length at least 3*n_fft. pSrc and pDst must not overlap, the calculation can not be done in place. MFCCs are returned in the first n_mels spots.
Return: none
function plp_mfcc_f32_parallel
void plp_mfcc_f32_parallel(
const plp_fft_instance_f32 * SFFT,
const plp_fft_instance_f32 * SDCT,
const Complex_type_f32 * pShift,
const plp_triangular_filter_f32 * filterBank,
const float32_t * window,
const uint8_t * orthoNorm,
const float32_t *__restrict__ pSrc,
const uint32_t nPE,
float32_t *__restrict__ pDst
)
MFCC on real input data.
Parameters:
- SFFT points to an instance of the floating-point FFT structure for the initial FFT (with FFTLength = n_fft). bitReverseFlag should be on.
- SDCT points to an instance of the floating-point FFT structure for the DCT (with FFTLength = n_mels). bitReverseFlag should be on.
- pShift points to twiddle coefficient table with FFTLength = 4*n_mels. Only first quarter necessary.
- filterBank points to plp_triangular_filter_f32 instance with nFilters = n_mels.
- window vector to use for windowing
- orthoNorm whether to use DCT orthonormalisation or not
- pSrc points to the input buffer (real data, size n_fft)
- nPE number of parallel processing units
- pDst points to the output buffer of length at least 3*n_fft. pSrc and pDst must not overlap; the calculation cannot be done in place. MFCCs are returned in the first n_mels entries.
Return: none
function plp_dwt_f32
void plp_dwt_f32(
const float32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_f32 wavelet,
plp_dwt_extension_mode mode,
float32_t *__restrict__ pDstA,
float32_t *__restrict__ pDstD
)
Glue code for floating-point DWT on real input data.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
- none
Floating-point DWT on real input data for XPULPV2 extension.
function plp_dwt_q32
void plp_dwt_q32(
const int32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q32 wavelet,
plp_dwt_extension_mode mode,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD
)
32bit Fixed-point DWT for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (q32)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
32bit Fixed-point DWT for XPULPV2 extension.
function plp_dwt_q16
void plp_dwt_q16(
const int16_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q16 wavelet,
plp_dwt_extension_mode mode,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD
)
16bit Fixed-point DWT for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (q16)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
16bit Fixed-point DWT for XPULPV2 extension.
function plp_dwt_q8
void plp_dwt_q8(
const int8_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q8 wavelet,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD
)
8bit Fixed-point DWT for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (q8)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
8bit Fixed-point DWT for XPULPV2 extension.
function plp_dwt_dec_f32
void plp_dwt_dec_f32(
const float32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_f32 wavelet,
plp_dwt_extension_mode mode,
uint32_t level,
float32_t *__restrict__ pTmp,
float32_t *__restrict__ pDst
)
Floating-point n-level DWT for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- level Levels of Wavelet decomposition
- pDst points to output buffer with Detailed coefficients and the final Approximate coefficients
Return: none
function plp_dwt_dec_f32_parallel
void plp_dwt_dec_f32_parallel(
const float32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_f32 wavelet,
plp_dwt_extension_mode mode,
uint32_t level,
uint32_t nPE,
float32_t *__restrict__ pTemp,
float32_t *__restrict__ pDst
)
Floating-point parallel n-level DWT for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- level Levels of Wavelet decomposition
- pDst points to output buffer with Detailed coefficients and the final Approximate coefficients
Return: none
function plp_dwt_f32s_xpulpv2
void plp_dwt_f32s_xpulpv2(
const float32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_f32 wavelet,
plp_dwt_extension_mode mode,
float32_t *__restrict__ pDstA,
float32_t *__restrict__ pDstD
)
Floating-point DWT on real input data for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return: none
function plp_dwt_haar_f32s_xpulpv2
void plp_dwt_haar_f32s_xpulpv2(
const float32_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
float32_t *__restrict__ pDstA,
float32_t *__restrict__ pDstD
)
Floating-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return: none
function plp_dwt_q32s_xpulpv2
void plp_dwt_q32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q32 wavelet,
plp_dwt_extension_mode mode,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD
)
32bit Fixed-point DWT for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
32bit Fixed-point DWT for XPULPV2 extension.
function plp_dwt_haar_q32s_xpulpv2
void plp_dwt_haar_q32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD
)
32bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return: none
function plp_dwt_q16s_xpulpv2
void plp_dwt_q16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q16 wavelet,
plp_dwt_extension_mode mode,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD
)
16bit Fixed-point DWT for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (q15)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
16bit Fixed-point DWT for XPULPV2 extension.
function plp_dwt_haar_q16s_xpulpv2
void plp_dwt_haar_q16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD
)
16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (q15)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
function plp_dwt_q8s_xpulpv2
void plp_dwt_q8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q8 wavelet,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD
)
8bit Fixed-point DWT for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (q7)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
8bit Fixed-point DWT for XPULPV2 extension.
function plp_dwt_haar_q8s_xpulpv2
void plp_dwt_haar_q8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD
)
8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (q7)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
function plp_dwt_f32_parallel
void plp_dwt_f32_parallel(
const float32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_f32 wavelet,
plp_dwt_extension_mode mode,
uint32_t nPE,
float32_t *__restrict__ pDstA,
float32_t *__restrict__ pDstD
)
Parallel Floating-point DWT on real input data for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- nPE Number of cores to use
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return: none
function plp_dwt_q8_parallel
void plp_dwt_q8_parallel(
const int8_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q8 wavelet,
plp_dwt_extension_mode mode,
uint32_t nPE,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD
)
8bit Parallel Fixed-point DWT on real input data for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (q8)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- nPE Number of cores to use
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return: none
function plp_dwt_q16_parallel
void plp_dwt_q16_parallel(
const int16_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q16 wavelet,
plp_dwt_extension_mode mode,
uint32_t nPE,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD
)
16bit Parallel Fixed-point DWT on real input data for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (q16)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- nPE Number of cores to use
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return: none
function plp_dwt_q32_parallel
void plp_dwt_q32_parallel(
const int32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q32 wavelet,
plp_dwt_extension_mode mode,
uint32_t nPE,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD
)
32bit Parallel Fixed-point DWT on real input data for XPULPV2 extension.
Parameters:
- pSrc points to the input buffer (q32)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- nPE Number of cores to use
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return: none
function plp_dwt_f32p_xpulpv2
void plp_dwt_f32p_xpulpv2(
void * args
)
Floating-point DWT on real input data for XPULPV2 extension.
Parameters:
- args points to the plp_dwt_instance_f32
Return: none
function plp_dwt_haar_f32p_xpulpv2
void plp_dwt_haar_f32p_xpulpv2(
void * args
)
Floating-point DWT kernel optimized for Haar Wavelet on real input data for XPULPV2 extension.
Parameters:
- args points to the plp_dwt_instance_f32
Return: none
function plp_dwt_q8p_xpulpv2
void plp_dwt_q8p_xpulpv2(
void * args
)
Q7 fixed-point DWT for XPULPV2 extension.
Parameters:
- args points to the plp_dwt_instance_q8
Return: none
function plp_dwt_haar_q8p_xpulpv2
void plp_dwt_haar_q8p_xpulpv2(
void * args
)
Q7 fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension.
Parameters:
- args points to the plp_dwt_instance_q8
Return: none
function plp_dwt_q16p_xpulpv2
void plp_dwt_q16p_xpulpv2(
void * args
)
Q15 fixed-point DWT for XPULPV2 extension.
Parameters:
- args points to the plp_dwt_instance_q16
Return: none
function plp_dwt_haar_q16p_xpulpv2
void plp_dwt_haar_q16p_xpulpv2(
void * args
)
Q15 fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension.
Parameters:
- args points to the plp_dwt_instance_q16
Return: none
function plp_dwt_q32p_xpulpv2
void plp_dwt_q32p_xpulpv2(
void * arg
)
Q31 fixed-point DWT on real input data for XPULPV2 extension.
Parameters:
- args points to the plp_dwt_instance_q32
Return: none
function plp_dwt_haar_q32p_xpulpv2
void plp_dwt_haar_q32p_xpulpv2(
void * args
)
Q31 Fixed-point DWT kernel optimized for Haar Wavelet for XPULPV2 extension.
Parameters:
- args points to the plp_dwt_instance_q32
Return: none
function plp_dwt_q32s_rv32im
void plp_dwt_q32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q32 wavelet,
plp_dwt_extension_mode mode,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD
)
32bit Fixed-point DWT.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
32bit Fixed-point DWT.
function plp_dwt_haar_q32s_rv32im
void plp_dwt_haar_q32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD
)
32bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return: none
function plp_dwt_q16s_rv32im
void plp_dwt_q16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q16 wavelet,
plp_dwt_extension_mode mode,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD
)
16bit Fixed-point DWT.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
16bit Fixed-point DWT.
function plp_dwt_haar_q16s_rv32im
void plp_dwt_haar_q16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD
)
16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (real data)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
16bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.
function plp_dwt_q8s_rv32im
void plp_dwt_q8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q8 wavelet,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD
)
8bit Fixed-point DWT.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (real data)
- length length of input buffer
- wavelet wavelet structure for calculating DWT
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
8bit Fixed-point DWT.
function plp_dwt_haar_q8s_rv32im
void plp_dwt_haar_q8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD
)
8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.
Parameters:
- pSrc points to the input buffer (real data)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
- pSrc points to the input buffer (real data)
- length length of input buffer
- mode boundary extension mode
- pDstA points to output buffer with Approximate coefficients
- pDstD points to output buffer with Detailed coefficients
Return:
- none
- none
8bit Fixed-point DWT kernel optimized for Haar Wavelet on real input data.
function plp_mat_add_i32
void plp_mat_add_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst
)
Glue code for matrix addition of 32-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return: none
function plp_mat_add_i32s_rv32im
void plp_mat_add_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst
)
matrix addition of a 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
matrix addition of a 32-bit integer matrices for RV32IM extension.
function plp_mat_add_i32s_xpulpv2
void plp_mat_add_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst
)
matrix addition of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
matrix addition of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_add_i32_parallel
void plp_mat_add_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for parallel matrix addition of a 32-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix addition of a 32-bit integer matrices.
function plp_mat_add_i32p_xpulpv2
void plp_mat_add_i32p_xpulpv2(
void * args
)
Parallel matrix addition of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_add_instance_i32 struct initialized by plp_mat_add_i32_parallel
- args pointer to plp_mat_add_instance_i32 struct initialized by plp_mat_add_i32_parallel
Return:
- none
- none
Parallel matrix addition of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_add_i16
void plp_mat_add_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst
)
Glue code for matrix addition of a 16-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix addition of a 16-bit integer matrices.
function plp_mat_add_i16s_rv32im
void plp_mat_add_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst
)
matrix addition of a 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
matrix addition of a 16-bit integer matrices for RV32IM extension.
function plp_mat_add_i16s_xpulpv2
void plp_mat_add_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst
)
matrix addition of a 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix addition of a 16-bit integer matrices for XPULPV2 extension.
function plp_mat_add_i16_parallel
void plp_mat_add_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for parallel matrix addition of a 16-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix addition of a 16-bit integer matrices.
function plp_mat_add_i16p_xpulpv2
void plp_mat_add_i16p_xpulpv2(
void * args
)
Parallel matrix addition of 16-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_add_instance_i16 struct initialized by plp_mat_add_i16_parallel
- args pointer to plp_mat_add_instance_i16 struct initialized by plp_mat_add_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_add_i8
void plp_mat_add_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst
)
Glue code for matrix addition of a 8-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix addition of a 8-bit integer matrices.
function plp_mat_add_i8s_rv32im
void plp_mat_add_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst
)
matrix addition of a 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
matrix addition of a 8-bit integer matrices for RV32IM extension.
function plp_mat_add_i8s_xpulpv2
void plp_mat_add_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst
)
matrix addition of a 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix addition of a 8-bit integer matrices for XPULPV2 extension.
function plp_mat_add_i8_parallel
void plp_mat_add_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for parallel matrix addition of a 8-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix addition of a 8-bit integer matrices.
function plp_mat_add_i8p_xpulpv2
void plp_mat_add_i8p_xpulpv2(
void * args
)
Parallel matrix addition of 8-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_add_instance_i8 struct initialized by plp_mat_add_i8_parallel
- args pointer to plp_mat_add_instance_i8 struct initialized by plp_mat_add_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_add_f32
void plp_mat_add_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
float *__restrict__ pDst
)
Glue code for matrix addition of a 32-bit floating-point matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix addition of a 32-bit floating-point matrices.
function plp_mat_add_f32s_xpulpv2
void plp_mat_add_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
float *__restrict__ pDst
)
matrix addition of a 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
matrix addition of a 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_add_f32_parallel
void plp_mat_add_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code for parallel matrix addition of a 32-bit floating-point matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix addition of a 32-bit floating-point matrices.
function plp_mat_add_f32p_xpulpv2
void plp_mat_add_f32p_xpulpv2(
void * args
)
Parallel matrix addition of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_add_instance_f32 struct initialized by plp_mat_add_f32_parallel
- args pointer to plp_mat_add_instance_f32 struct initialized by plp_mat_add_f32_parallel
Return:
- none
- none
function plp_mat_sub_i32
void plp_mat_sub_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst
)
Glue code for matrix subtraction of a 32-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix subtraction of a 32-bit integer matrices.
function plp_mat_sub_i32s_rv32im
void plp_mat_sub_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst
)
matrix subtraction of a 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
matrix subtraction of a 32-bit integer matrices for RV32IM extension.
function plp_mat_sub_i32s_xpulpv2
void plp_mat_sub_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst
)
matrix subtraction of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
matrix subtraction of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_sub_i32_parallel
void plp_mat_sub_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for parallel matrix subtraction of a 32-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix subtraction of a 32-bit integer matrices.
function plp_mat_sub_i32p_xpulpv2
void plp_mat_sub_i32p_xpulpv2(
void * args
)
Parallel matrix subtraction of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_sub_instance_i32 struct initialized by plp_mat_sub_i32_parallel
- args pointer to plp_mat_sub_instance_i32 struct initialized by plp_mat_sub_i32_parallel
Return:
- none
- none
Parallel matrix subtraction of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_sub_i16
void plp_mat_sub_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst
)
Glue code for matrix subtraction of a 16-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix subtraction of a 16-bit integer matrices.
function plp_mat_sub_i16s_rv32im
void plp_mat_sub_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst
)
matrix subtraction of a 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
matrix subtraction of a 16-bit integer matrices for RV32IM extension.
function plp_mat_sub_i16s_xpulpv2
void plp_mat_sub_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst
)
matrix subtraction of a 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix subtraction of a 16-bit integer matrices for XPULPV2 extension.
function plp_mat_sub_i16_parallel
void plp_mat_sub_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for parallel matrix subtraction of a 16-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix subtraction of a 16-bit integer matrices.
function plp_mat_sub_i16p_xpulpv2
void plp_mat_sub_i16p_xpulpv2(
void * args
)
Parallel matrix subtraction of 16-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_sub_instance_i16 struct initialized by plp_mat_sub_i16_parallel
- args pointer to plp_mat_sub_instance_i16 struct initialized by plp_mat_sub_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_sub_i8
void plp_mat_sub_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst
)
Glue code for matrix subtraction of a 8-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix subtraction of a 8-bit integer matrices.
function plp_mat_sub_i8s_rv32im
void plp_mat_sub_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst
)
matrix subtraction of a 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
matrix subtraction of a 8-bit integer matrices for RV32IM extension.
function plp_mat_sub_i8s_xpulpv2
void plp_mat_sub_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst
)
matrix subtraction of a 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix subtraction of a 8-bit integer matrices for XPULPV2 extension.
function plp_mat_sub_i8_parallel
void plp_mat_sub_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for parallel matrix subtraction of a 8-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix subtraction of a 8-bit integer matrices.
function plp_mat_sub_i8p_xpulpv2
void plp_mat_sub_i8p_xpulpv2(
void * args
)
Parallel matrix subtraction of 8-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_sub_instance_i8 struct initialized by plp_mat_sub_i8_parallel
- args pointer to plp_mat_sub_instance_i8 struct initialized by plp_mat_sub_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_sub_f32
void plp_mat_sub_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
float *__restrict__ pDst
)
Glue code for matrix subtraction of a 32-bit floating-point matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix subtraction of a 32-bit floating-point matrices.
function plp_mat_sub_f32s_xpulpv2
void plp_mat_sub_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
float *__restrict__ pDst
)
matrix subtraction of a 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- pDst Points to the output matrix
Return:
- none
- none
matrix subtraction of a 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_sub_f32_parallel
void plp_mat_sub_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code for parallel matrix subtraction of a 32-bit floating-point matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix subtraction of a 32-bit floating-point matrices.
function plp_mat_sub_f32p_xpulpv2
void plp_mat_sub_f32p_xpulpv2(
void * args
)
Parallel matrix subtraction of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_sub_instance_f32 struct initialized by plp_mat_sub_f32_parallel
- args pointer to plp_mat_sub_instance_f32 struct initialized by plp_mat_sub_f32_parallel
Return:
- none
- none
function plp_mat_scale_i32
void plp_mat_scale_i32(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst
)
Glue code for matrix scale of a 32-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix scale of a 32-bit integer matrices.
function plp_mat_scale_i32s_rv32im
void plp_mat_scale_i32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst
)
matrix scale of a 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
matrix scale of a 32-bit integer matrices for RV32IM extension.
function plp_mat_scale_i32s_xpulpv2
void plp_mat_scale_i32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst
)
matrix scale of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
matrix scale of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_scale_i32_parallel
void plp_mat_scale_i32_parallel(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t scaleFactor,
int32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for parallel matrix scale of a 32-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- nPE Number of cores to use for computation
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix scale of a 32-bit integer matrices.
function plp_mat_scale_i32p_xpulpv2
void plp_mat_scale_i32p_xpulpv2(
void * args
)
Parallel matrix scale of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_scale_instance_i32 struct initialized by plp_mat_scale_i32_parallel
- args pointer to plp_mat_scale_instance_i32 struct initialized by plp_mat_scale_i32_parallel
Return:
- none
- none
Parallel matrix scale of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_scale_i16
void plp_mat_scale_i16(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst
)
Glue code for matrix scale of a 16-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix scale of a 16-bit integer matrices.
function plp_mat_scale_i16s_rv32im
void plp_mat_scale_i16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst
)
matrix scale of a 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
matrix scale of a 16-bit integer matrices for RV32IM extension.
function plp_mat_scale_i16s_xpulpv2
void plp_mat_scale_i16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst
)
matrix scale of a 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix scale of a 16-bit integer matrices for XPULPV2 extension.
function plp_mat_scale_i16_parallel
void plp_mat_scale_i16_parallel(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t scaleFactor,
int32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for parallel matrix scale of a 16-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix scale of a 16-bit integer matrices.
function plp_mat_scale_i16p_xpulpv2
void plp_mat_scale_i16p_xpulpv2(
void * args
)
Parallel matrix scale of 16-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_scale_instance_i16 struct initialized by plp_mat_scale_i16_parallel
- args pointer to plp_mat_scale_instance_i16 struct initialized by plp_mat_scale_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_scale_i8
void plp_mat_scale_i8(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst
)
Glue code for matrix scale of a 8-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix scale of a 8-bit integer matrices.
function plp_mat_scale_i8s_rv32im
void plp_mat_scale_i8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst
)
matrix scale of a 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
matrix scale of a 8-bit integer matrices for RV32IM extension.
function plp_mat_scale_i8s_xpulpv2
void plp_mat_scale_i8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst
)
matrix scale of a 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix scale of a 8-bit integer matrices for XPULPV2 extension.
function plp_mat_scale_i8_parallel
void plp_mat_scale_i8_parallel(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t scaleFactor,
int32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for parallel matrix scale of a 8-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix scale of a 8-bit integer matrices.
function plp_mat_scale_i8p_xpulpv2
void plp_mat_scale_i8p_xpulpv2(
void * args
)
Parallel matrix scale of 8-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_scale_instance_i8 struct initialized by plp_mat_scale_i8_parallel
- args pointer to plp_mat_scale_instance_i8 struct initialized by plp_mat_scale_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_scale_f32
void plp_mat_scale_f32(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
float scaleFactor,
float *__restrict__ pDst
)
Glue code for matrix scale of a 32-bit floating-point matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix scale of a 32-bit floating-point matrices.
function plp_mat_scale_f32s_xpulpv2
void plp_mat_scale_f32s_xpulpv2(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
float scaleFactor,
float *__restrict__ pDst
)
matrix scale of a 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements
- pDst Points to the output matrix
Return:
- none
- none
matrix scale of a 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_scale_f32_parallel
void plp_mat_scale_f32_parallel(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
float scaleFactor,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code for parallel matrix scale of a 32-bit floating-point matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- scaleFactor Factor to multiply all elements
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix scale of a 32-bit floating-point matrices.
function plp_mat_scale_f32p_xpulpv2
void plp_mat_scale_f32p_xpulpv2(
void * args
)
Parallel matrix scale of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_scale_instance_f32 struct initialized by plp_mat_scale_f32_parallel
- args pointer to plp_mat_scale_instance_f32 struct initialized by plp_mat_scale_f32_parallel
Return:
- none
- none
function plp_mat_trans_i32
void plp_mat_trans_i32(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst
)
Glue code for matrix transpose of a 32-bit integer matrices.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
Glue code for matrix transpose of a 32-bit integer matrices.
function plp_mat_trans_i32s_rv32im
void plp_mat_trans_i32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst
)
matrix transpose of a 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
matrix transpose of a 32-bit integer matrices for RV32IM extension.
function plp_mat_trans_i32s_xpulpv2
void plp_mat_trans_i32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst
)
matrix transpose of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
matrix transpose of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_trans_i32_parallel
void plp_mat_trans_i32_parallel(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for parallel matrix transpose of a 32-bit integer matrices.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxM
Return: none
Glue code for parallel matrix transpose of a 32-bit integer matrices.
function plp_mat_trans_i32p_xpulpv2
void plp_mat_trans_i32p_xpulpv2(
void * args
)
Parallel matrix transpose of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_trans_instance_i32 struct initialized by plp_mat_trans_i32_parallel
- args pointer to plp_mat_trans_instance_i32 struct initialized by plp_mat_trans_i32_parallel
Return:
- none
- none
Parallel matrix transpose of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_trans_i16
void plp_mat_trans_i16(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst
)
Glue code for matrix transpose of a 16-bit integer matrices.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
Glue code for matrix transpose of a 16-bit integer matrices.
function plp_mat_trans_i16s_rv32im
void plp_mat_trans_i16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst
)
matrix transpose of a 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
matrix transpose of a 16-bit integer matrices for RV32IM extension.
function plp_mat_trans_i16s_xpulpv2
void plp_mat_trans_i16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst
)
matrix transpose of a 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix transpose of a 16-bit integer matrices for XPULPV2 extension.
function plp_mat_trans_i16_parallel
void plp_mat_trans_i16_parallel(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for parallel matrix transpose of a 16-bit integer matrices.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
Glue code for parallel matrix transpose of a 16-bit integer matrices.
function plp_mat_trans_i16p_xpulpv2
void plp_mat_trans_i16p_xpulpv2(
void * args
)
Parallel matrix transpose of 16-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_trans_instance_i16 struct initialized by plp_mat_trans_i16_parallel
- args pointer to plp_mat_trans_instance_i16 struct initialized by plp_mat_trans_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_trans_i8
void plp_mat_trans_i8(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst
)
Glue code for matrix transpose of a 8-bit integer matrices.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
Glue code for matrix transpose of a 8-bit integer matrices.
function plp_mat_trans_i8s_rv32im
void plp_mat_trans_i8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst
)
matrix transpose of a 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
matrix transpose of a 8-bit integer matrices for RV32IM extension.
function plp_mat_trans_i8s_xpulpv2
void plp_mat_trans_i8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst
)
matrix transpose of a 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix transpose of a 8-bit integer matrices for XPULPV2 extension.
function plp_mat_trans_i8_parallel
void plp_mat_trans_i8_parallel(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for parallel matrix transpose of a 8-bit integer matrices.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
Glue code for parallel matrix transpose of a 8-bit integer matrices.
function plp_mat_trans_i8p_xpulpv2
void plp_mat_trans_i8p_xpulpv2(
void * args
)
Parallel matrix transpose of 8-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_trans_instance_i8 struct initialized by plp_mat_trans_i8_parallel
- args pointer to plp_mat_trans_instance_i8 struct initialized by plp_mat_trans_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_trans_f32
void plp_mat_trans_f32(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
float *__restrict__ pDst
)
Glue code for matrix transpose of 32-bit floating-point matrices.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
Par:
- This function will use plp_mat_trans_i32s_xpulpv2 for its computation.
- This function will use plp_mat_trans_i32s_xpulpv2 for its computation.
Glue code for matrix transpose of 32-bit floating-point matrices.
function plp_mat_trans_f32_parallel
void plp_mat_trans_f32_parallel(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code for parallel matrix transpose of 32-bit floating-point matrices.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxM
- pSrc Points to the input matrix of shape MxN
- M Height of the input matrix and width of the output matrix
- N Width of the input matrix and height of the output matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxM
Return:
- none
- none
Par:
- This function will use plp_mat_trans_i32p_xpulpv2 for its computation.
- This function will use plp_mat_trans_i32p_xpulpv2 for its computation.
Glue code for parallel matrix transpose of 32-bit floating-point matrices.
function plp_mat_inv_f32
int plp_mat_inv_f32(
float *__restrict__ pSrc,
float *__restrict__ pDst,
uint32_t N
)
Glue code for matrix inverse of a 32-bit floating-point matrices.
Parameters:
- pSrc Points to the input matrix. pSrc is modified by this function
- N Width and height of both matrices
- pDst Points to the output matrix
- pSrc Points to the input matrix. pSrc is modified by this function
- N Width and height of both matrices
- pDst Points to the output matrix
Return:
- 0: Success, 1: Matrix is singular, 2: operation not supported
- 0: Success, 1: Matrix is singular, 2: operation not supported
Par: This function will use plp_mat_inv_f32s_xpulpv2 for its computation.
Glue code for matrix inverse of a 32-bit floating-point matrices.
function plp_mat_inv_f32s_xpulpv2
int plp_mat_inv_f32s_xpulpv2(
float *__restrict__ pSrc,
float *__restrict__ pDst,
uint32_t N
)
matrix inverse of a 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix. pSrc is modified by this function
- N Width and height of both matrices
- pDst Points to the output matrix
- pSrc Points to the input matrix. pSrc is modified by this function
- N Width and height of both matrices
- pDst Points to the output matrix
Return:
- 0: Success, 1: Matrix is singular
- 0: Success, 1: Matrix is singular
matrix inverse of a 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_inv_f32_parallel
int plp_mat_inv_f32_parallel(
float *__restrict__ pSrc,
float *__restrict__ pDst,
uint32_t N,
uint32_t nPE
)
Glue code for parallel matrix inverse of a 32-bit floating-point matrices.
Parameters:
- pSrc Points to the input matrix. pSrc is modified by this function
- pDst Points to the output matrix
- N Width and height of both matrices
- nPE Number of cores to use for computation
- pSrc Points to the input matrix, pSrc is modified by this function
- N Width and height of both matrices
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- 0: Success, 1: Matrix is singular, 2: operation not supported
- 0: Success, 1: Matrix is singular, 2: operation not supported
Glue code for parallel matrix inverse of a 32-bit floating-point matrices.
Warning: This function is not yet implemented in parallel, and it will call the single-core implementation!
function plp_mat_inv_f32p_xpulpv2
int plp_mat_inv_f32p_xpulpv2(
void * args
)
Parallel matrix inverse of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_inv_instance_f32 struct initialized by plp_mat_inv_f32_parallel
- args pointer to plp_mat_inv_instance_f32 struct initialized by plp_mat_inv_f32_parallel
Return:
- 0: Success, 1: Matrix is singular
- 0: Success, 1: Matrix is singular
Parallel matrix inverse of 32-bit floating-point matrices kernel for XPULPV2 extension.
Warning: Not yet implemented.
function plp_mat_fill_I_i32
void plp_mat_fill_I_i32(
uint32_t N,
int32_t *__restrict__ pDst
)
Glue code for creating a 32-bit integer identity matrix.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_i32s_rv32im
void plp_mat_fill_I_i32s_rv32im(
uint32_t N,
int32_t *__restrict__ pDst
)
Create a 32-bit integer identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_i32s_xpulpv2
void plp_mat_fill_I_i32s_xpulpv2(
uint32_t N,
int32_t *__restrict__ pDst
)
Create a 32-bit integer identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_i32_parallel
void plp_mat_fill_I_i32_parallel(
uint32_t N,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for creating a 32-bit integer identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_i32p_xpulpv2
void plp_mat_fill_I_i32p_xpulpv2(
void * args
)
Create a 32-bit integer identity matrix in parallel on XpulpV2.
Parameters:
- args pointer to plp_mat_fill_I_instance_i32 struct initialized by plp_mat_fill_I_i32_parallel
- args pointer to plp_mat_fill_I_instance_i32 struct initialized by plp_mat_fill_I_i32_parallel
Return:
- none
- none
function plp_mat_fill_I_i16
void plp_mat_fill_I_i16(
uint32_t N,
int16_t *__restrict__ pDst
)
Glue code for creating a 16-bit integer identity matrix.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_i16s_rv32im
void plp_mat_fill_I_i16s_rv32im(
uint32_t N,
int16_t *__restrict__ pDst
)
Create a 16-bit integer identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_i16s_xpulpv2
void plp_mat_fill_I_i16s_xpulpv2(
uint32_t N,
int16_t *__restrict__ pDst
)
Create a 16-bit integer identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_I_i16_parallel
void plp_mat_fill_I_i16_parallel(
uint32_t N,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for creating a 16-bit integer identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_i16p_xpulpv2
void plp_mat_fill_I_i16p_xpulpv2(
void * args
)
Create a 16-bit integer identity matrix in parallel on XpulpV2.
Parameters:
- args pointer to plp_mat_fill_I_instance_i16 struct initialized by plp_mat_fill_I_i16_parallel
- args pointer to plp_mat_fill_I_instance_i16 struct initialized by plp_mat_fill_I_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_I_i8
void plp_mat_fill_I_i8(
uint32_t N,
int8_t *__restrict__ pDst
)
Glue code for creating an 8-bit integer identity matrix.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_i8s_rv32im
void plp_mat_fill_I_i8s_rv32im(
uint32_t N,
int8_t *__restrict__ pDst
)
Create an 8-bit integer identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_i8s_xpulpv2
void plp_mat_fill_I_i8s_xpulpv2(
uint32_t N,
int8_t *__restrict__ pDst
)
Create an 8-bit integer identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_I_i8_parallel
void plp_mat_fill_I_i8_parallel(
uint32_t N,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for creating an 8-bit integer identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_i8p_xpulpv2
void plp_mat_fill_I_i8p_xpulpv2(
void * args
)
Create an 8-bit integer identity matrix in parallel on XpulpV2.
Parameters:
- args pointer to plp_mat_fill_I_instance_i8 struct initialized by plp_mat_fill_I_i8_parallel
- args pointer to plp_mat_fill_I_instance_i8 struct initialized by plp_mat_fill_I_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_I_f32
void plp_mat_fill_I_f32(
uint32_t N,
float *__restrict__ pDst
)
Glue code for creating a 32-bit float identity matrix.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_f32s_xpulpv2
void plp_mat_fill_I_f32s_xpulpv2(
uint32_t N,
float *__restrict__ pDst
)
Create a 32-bit float identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_f32_parallel
void plp_mat_fill_I_f32_parallel(
uint32_t N,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code for creating a 32-bit float identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_f32p_xpulpv2
void plp_mat_fill_I_f32p_xpulpv2(
void * args
)
Create a 32-bit float identity matrix in parallel on XpulpV2.
Parameters:
- args pointer to plp_mat_fill_I_instance_f32 struct initialized by plp_mat_fill_I_f32_parallel
- args pointer to plp_mat_fill_I_instance_f32 struct initialized by plp_mat_fill_I_f32_parallel
Return:
- none
- none
function plp_mat_fill_I_q32
void plp_mat_fill_I_q32(
uint32_t N,
int32_t fracBits,
int32_t *__restrict__ pDst
)
Glue code for creating a 32-bit fixed-point identity matrix.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_q32s_rv32im
void plp_mat_fill_I_q32s_rv32im(
uint32_t N,
int32_t fracBits,
int32_t *__restrict__ pDst
)
Create a 32-bit fixed-point identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_q32s_xpulpv2
void plp_mat_fill_I_q32s_xpulpv2(
uint32_t N,
int32_t fracBits,
int32_t *__restrict__ pDst
)
Create a 32-bit fixed-point identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_q32_parallel
void plp_mat_fill_I_q32_parallel(
uint32_t N,
int32_t fracBits,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for creating a 32-bit fixed-point identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_q32p_xpulpv2
void plp_mat_fill_I_q32p_xpulpv2(
void * args
)
Create a 32-bit fixed-point identity matrix in parallel on XpulpV2.
Parameters:
- args pointer to plp_mat_fill_I_instance_q32 struct initialized by plp_mat_fill_I_q32_parallel
- args pointer to plp_mat_fill_I_instance_q32 struct initialized by plp_mat_fill_I_q32_parallel
Return:
- none
- none
function plp_mat_fill_I_q16
void plp_mat_fill_I_q16(
uint32_t N,
int32_t fracBits,
int16_t *__restrict__ pDst
)
Glue code for creating a 16-bit fixed-point identity matrix.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_q16s_rv32im
void plp_mat_fill_I_q16s_rv32im(
uint32_t N,
int32_t fracBits,
int16_t *__restrict__ pDst
)
Create a 16-bit fixed-point identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_q16s_xpulpv2
void plp_mat_fill_I_q16s_xpulpv2(
uint32_t N,
int32_t fracBits,
int16_t *__restrict__ pDst
)
Create a 16-bit fixed-point identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_I_q16_parallel
void plp_mat_fill_I_q16_parallel(
uint32_t N,
int32_t fracBits,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for creating a 16-bit fixed-point identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_q16p_xpulpv2
void plp_mat_fill_I_q16p_xpulpv2(
void * args
)
Create a 16-bit fixed-point identity matrix in parallel on XpulpV2.
Parameters:
- args pointer to plp_mat_fill_I_instance_q16 struct initialized by plp_mat_fill_I_q16_parallel
- args pointer to plp_mat_fill_I_instance_q16 struct initialized by plp_mat_fill_I_q16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_I_q8
void plp_mat_fill_I_q8(
uint32_t N,
int32_t fracBits,
int8_t *__restrict__ pDst
)
Glue code for creating an 8-bit fixed-point identity matrix.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_q8s_rv32im
void plp_mat_fill_I_q8s_rv32im(
uint32_t N,
int32_t fracBits,
int8_t *__restrict__ pDst
)
Create an 8-bit fixed-point identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_q8s_xpulpv2
void plp_mat_fill_I_q8s_xpulpv2(
uint32_t N,
int32_t fracBits,
int8_t *__restrict__ pDst
)
Create an 8-bit fixed-point identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_I_q8_parallel
void plp_mat_fill_I_q8_parallel(
uint32_t N,
int32_t fracBits,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for creating an 8-bit fixed-point identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- fracBits decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
- N Width and height of the matrix
- fracBits Decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix of shape NxN
Return:
- none
- none
function plp_mat_fill_I_q8p_xpulpv2
void plp_mat_fill_I_q8p_xpulpv2(
void * args
)
Create an 8-bit fixed-point identity matrix in parallel on XpulpV2.
Parameters:
- args pointer to plp_mat_fill_I_instance_q8 struct initialized by plp_mat_fill_I_q8_parallel
- args pointer to plp_mat_fill_I_instance_q8 struct initialized by plp_mat_fill_I_q8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_stride_i32
void plp_mat_mult_stride_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code for strided matrix matrix multiplication of a 32-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Glue code for strided matrix matrix multiplication of a 32-bit integer matrices.
function plp_mat_mult_stride_i32s_rv32im
void plp_mat_mult_stride_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix matrix multiplication of a 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
strided matrix matrix multiplication of a 32-bit integer matrices for RV32IM extension.
function plp_mat_mult_stride_i32s_xpulpv2
void plp_mat_mult_stride_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_stride_i16
void plp_mat_mult_stride_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code for strided matrix matrix multiplication of a 16-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Glue code for strided matrix matrix multiplication of a 16-bit integer matrices.
function plp_mat_mult_stride_i16s_rv32im
void plp_mat_mult_stride_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix matrix multiplication of a 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
strided matrix matrix multiplication of a 16-bit integer matrices for RV32IM extension.
function plp_mat_mult_stride_i16s_xpulpv2
void plp_mat_mult_stride_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
strided matrix matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_stride_i8
void plp_mat_mult_stride_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code for strided matrix matrix multiplication of a 8-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Glue code for strided matrix matrix multiplication of a 8-bit integer matrices.
function plp_mat_mult_stride_i8s_rv32im
void plp_mat_mult_stride_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix matrix multiplication of a 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
strided matrix matrix multiplication of a 8-bit integer matrices for RV32IM extension.
function plp_mat_mult_stride_i8s_xpulpv2
void plp_mat_mult_stride_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
strided matrix matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_stride_i32_parallel
void plp_mat_mult_stride_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel strided matrix matrix multiplication of a 32-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix matrix multiplication of a 32-bit integer matrices.
function plp_mat_mult_stride_i32p_xpulpv2
void plp_mat_mult_stride_i32p_xpulpv2(
void * args
)
Parallel strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_i32 struct initialized by plp_mat_mult_stride_i32_parallel
- args pointer to plp_mat_mult_stride_instance_i32 struct initialized by plp_mat_mult_stride_i32_parallel
Return:
- none
- none
Parallel strided matrix matrix multiplication of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_stride_i16_parallel
void plp_mat_mult_stride_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel strided matrix matrix multiplication of a 16-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix matrix multiplication of a 16-bit integer matrices.
function plp_mat_mult_stride_i16p_xpulpv2
void plp_mat_mult_stride_i16p_xpulpv2(
void * args
)
Parallel matrix multiplication of 16-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_i16 struct initialized by plp_mat_mult_stride_i16_parallel
- args pointer to plp_mat_mult_stride_instance_i16 struct initialized by plp_mat_mult_stride_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
Parallel matrix multiplication of 16-bit integer matrices kernel for XPULPV2 extension.
function plp_mat_mult_stride_i8_parallel
void plp_mat_mult_stride_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel strided matrix matrix multiplication of a 8-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix matrix multiplication of a 8-bit integer matrices.
function plp_mat_mult_stride_f32
void plp_mat_mult_stride_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC
)
Glue code for strided matrix matrix multiplication of a 32-bit floating-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Glue code for strided matrix matrix multiplication of a 32-bit floating-point matrices.
function plp_mat_mult_stride_f32s_xpulpv2
void plp_mat_mult_stride_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC
)
strided matrix matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
strided matrix matrix multiplication of a 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_mult_stride_f32_parallel
void plp_mat_mult_stride_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
float *__restrict__ pDstC
)
Glue code for parallel strided matrix matrix multiplication of a 32-bit floating-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix matrix multiplication of a 32-bit floating-point matrices.
function plp_mat_mult_stride_f32p_xpulpv2
void plp_mat_mult_stride_f32p_xpulpv2(
void * args
)
Parallel matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_f32 struct initialized by plp_mat_mult_stride_f32_parallel
- args pointer to plp_mat_mult_stride_instance_f32 struct initialized by plp_mat_mult_stride_f32_parallel
Return:
- none
- none
Parallel matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
function plp_mat_mult_stride_i8p_xpulpv2
void plp_mat_mult_stride_i8p_xpulpv2(
void * args
)
Parallel matrix multiplication of 8-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_i8 struct initialized by plp_mat_mult_stride_i8_parallel
- args pointer to plp_mat_mult_stride_instance_i8 struct initialized by plp_mat_mult_stride_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
Parallel matrix multiplication of 8-bit integer matrices kernel for XPULPV2 extension.
function plp_mat_mult_stride_q32
void plp_mat_mult_stride_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Glue code for strided matrix matrix multiplication of a 32-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Glue code for strided matrix matrix multiplication of a 32-bit fix-point matrices.
function plp_mat_mult_stride_q32_parallel
void plp_mat_mult_stride_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel strided matrix matrix multiplication of a 32-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Glue code for parallel strided matrix matrix multiplication of a 32-bit fix-point matrices.
function plp_mat_mult_stride_q32s_rv32im
void plp_mat_mult_stride_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
strided matrix matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
strided matrix matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.
function plp_mat_mult_stride_q32s_xpulpv2
void plp_mat_mult_stride_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
strided matrix matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
strided matrix matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.
function plp_mat_mult_stride_q32p_xpulpv2
void plp_mat_mult_stride_q32p_xpulpv2(
void * args
)
Parallel matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_q32 struct initialized by plp_mat_mult_stride_q32_parallel
- args pointer to plp_mat_mult_stride_instance_q32 struct initialized by plp_mat_mult_stride_q32_parallel
Return:
- none
- none
Par: Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.
function plp_mat_mult_stride_q16
void plp_mat_mult_stride_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Glue code for strided matrix matrix multiplication of a 16-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for strided matrix matrix multiplication of a 16-bit fix-point matrices.
The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_stride_q16_parallel
void plp_mat_mult_stride_q16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC
)
Glue code for parallel strided matrix matrix multiplication of 16-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for parallel strided matrix matrix multiplication of 16-bit fix-point matrices.
The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_stride_q16s_rv32im
void plp_mat_mult_stride_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
strided matrix matrix multiplication of 16-bit fix-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
strided matrix matrix multiplication of 16-bit fix-point matrices for RV32IM extension.
The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_stride_q16s_xpulpv2
void plp_mat_mult_stride_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
strided matrix matrix multiplication of 16-bit fix-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
strided matrix matrix multiplication of 16-bit fix-point matrices for XPULPV2 extension.
The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_stride_q16p_xpulpv2
void plp_mat_mult_stride_q16p_xpulpv2(
void * args
)
Parallel matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_q16 struct initialized by plp_mat_mult_stride_q16_parallel
- args pointer to plp_mat_mult_stride_instance_q16 struct initialized by plp_mat_mult_stride_q16_parallel
Return:
- none
- none
Par: Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.
The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_stride_q8
void plp_mat_mult_stride_q8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Glue code for strided matrix matrix multiplication of 8-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for strided matrix matrix multiplication of 8-bit fix-point matrices.
The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_stride_q8_parallel
void plp_mat_mult_stride_q8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC
)
Glue code for parallel strided matrix matrix multiplication of 8-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for parallel strided matrix matrix multiplication of 8-bit fix-point matrices.
The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_stride_q8s_rv32im
void plp_mat_mult_stride_q8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
strided matrix matrix multiplication of 8-bit fix-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
strided matrix matrix multiplication of 8-bit fix-point matrices for RV32IM extension.
The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_stride_q8s_xpulpv2
void plp_mat_mult_stride_q8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
strided matrix matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
strided matrix matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension.
The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_stride_q8p_xpulpv2
void plp_mat_mult_stride_q8p_xpulpv2(
void * args
)
Parallel matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_q8 struct initialized by plp_mat_mult_stride_q8_parallel
- args pointer to plp_mat_mult_stride_instance_q8 struct initialized by plp_mat_mult_stride_q8_parallel
Return:
- none
- none
Par: Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.
The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_i32
void plp_mat_mult_trans_stride_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code for strided matrix transposed matrix multiplication of 32-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Glue code for strided matrix transposed matrix multiplication of 32-bit integer matrices.
function plp_mat_mult_trans_stride_i32s_rv32im
void plp_mat_mult_trans_stride_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
strided matrix transposed matrix multiplication of 32-bit integer matrices for RV32IM extension.
function plp_mat_mult_trans_stride_i32s_xpulpv2
void plp_mat_mult_trans_stride_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
strided matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_trans_stride_i16
void plp_mat_mult_trans_stride_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code for strided matrix transposed matrix multiplication of 16-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Glue code for strided matrix transposed matrix multiplication of 16-bit integer matrices.
function plp_mat_mult_trans_stride_i16s_rv32im
void plp_mat_mult_trans_stride_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
strided matrix transposed matrix multiplication of 16-bit integer matrices for RV32IM extension.
function plp_mat_mult_trans_stride_i16s_xpulpv2
void plp_mat_mult_trans_stride_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
strided matrix transposed matrix multiplication of 16-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_trans_stride_i8
void plp_mat_mult_trans_stride_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code for strided matrix transposed matrix multiplication of 8-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Glue code for strided matrix transposed matrix multiplication of 8-bit integer matrices.
function plp_mat_mult_trans_stride_i8s_rv32im
void plp_mat_mult_trans_stride_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
strided matrix transposed matrix multiplication of 8-bit integer matrices for RV32IM extension.
function plp_mat_mult_trans_stride_i8s_xpulpv2
void plp_mat_mult_trans_stride_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
strided matrix transposed matrix multiplication of 8-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_trans_stride_i32_parallel
void plp_mat_mult_trans_stride_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel strided matrix transposed matrix multiplication of 32-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix transposed matrix multiplication of 32-bit integer matrices.
function plp_mat_mult_trans_stride_i32p_xpulpv2
void plp_mat_mult_trans_stride_i32p_xpulpv2(
void * args
)
Parallel strided matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_i32 struct initialized by plp_mat_mult_trans_stride_i32_parallel
- args pointer to plp_mat_mult_stride_instance_i32 struct initialized by plp_mat_mult_trans_stride_i32_parallel
Return:
- none
- none
Parallel strided matrix transposed matrix multiplication of 32-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_trans_stride_i16_parallel
void plp_mat_mult_trans_stride_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel strided matrix transposed matrix multiplication of a 16-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix transposed matrix multiplication of a 16-bit integer matrices.
function plp_mat_mult_trans_stride_i16p_xpulpv2
void plp_mat_mult_trans_stride_i16p_xpulpv2(
void * args
)
Parallel strided matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_i16 struct initialized by plp_mat_mult_trans_stride_i16_parallel
- args pointer to plp_mat_mult_stride_instance_i16 struct initialized by plp_mat_mult_trans_stride_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
Parallel strided matrix transposed matrix multiplication of a 16-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_trans_stride_i8_parallel
void plp_mat_mult_trans_stride_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel strided matrix transposed matrix multiplication of a 8-bit integer matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix transposed matrix multiplication of a 8-bit integer matrices.
function plp_mat_mult_trans_stride_i8p_xpulpv2
void plp_mat_mult_trans_stride_i8p_xpulpv2(
void * args
)
Parallel strided matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_i8 struct initialized by plp_mat_mult_trans_stride_i8_parallel
- args pointer to plp_mat_mult_stride_instance_i8 struct initialized by plp_mat_mult_trans_stride_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
Parallel strided matrix transposed matrix multiplication of a 8-bit integer matrices for XPULPV2 extension.
function plp_mat_mult_trans_stride_q32
void plp_mat_mult_trans_stride_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Glue code for strided matrix transposed matrix multiplication of a 32-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Glue code for strided matrix transposed matrix multiplication of a 32-bit fix-point matrices.
function plp_mat_mult_trans_stride_q32_parallel
void plp_mat_mult_trans_stride_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code for parallel strided matrix transposed matrix multiplication of a 32-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Glue code for parallel strided matrix transposed matrix multiplication of a 32-bit fix-point matrices.
function plp_mat_mult_trans_stride_q32s_rv32im
void plp_mat_mult_trans_stride_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
strided matrix transposed matrix multiplication of a 32-bit fix-point matrices for RV32IM extension.
function plp_mat_mult_trans_stride_q32s_xpulpv2
void plp_mat_mult_trans_stride_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
strided matrix transposed matrix multiplication of a 32-bit fix-point matrices for XPULPV2 extension.
function plp_mat_mult_trans_stride_q32p_xpulpv2
void plp_mat_mult_trans_stride_q32p_xpulpv2(
void * args
)
Parallel strided matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_q32 struct initialized by plp_mat_mult_trans_stride_q32_parallel
- args pointer to plp_mat_mult_stride_instance_q32 struct initialized by plp_mat_mult_trans_stride_q32_parallel
Return:
- none
- none
Par: Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel strided matrix transposed matrix multiplication of 32-bit fix-point matrices kernel for XPULPV2 extension.
function plp_mat_mult_trans_stride_q16
void plp_mat_mult_trans_stride_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Glue code for strided matrix transposed matrix multiplication of a 16-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for strided matrix transposed matrix multiplication of a 16-bit fix-point matrices.
The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_q16_parallel
void plp_mat_mult_trans_stride_q16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC
)
Glue code for parallel strided matrix transposed matrix multiplication of a 16-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for parallel strided matrix transposed matrix multiplication of a 16-bit fix-point matrices.
The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_q16s_rv32im
void plp_mat_mult_trans_stride_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
strided matrix transposed matrix multiplication of a 16-bit fix-point matrices for RV32IM extension.
The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_q16s_xpulpv2
void plp_mat_mult_trans_stride_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
strided matrix transposed matrix multiplication of a 16-bit fix-point matrices for XPULPV2 extension.
The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_q16p_xpulpv2
void plp_mat_mult_trans_stride_q16p_xpulpv2(
void * args
)
Parallel strided matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_q16 struct initialized by plp_mat_mult_trans_stride_q16_parallel
- args pointer to plp_mat_mult_stride_instance_q16 struct initialized by plp_mat_mult_trans_stride_q16_parallel
Return:
- none
- none
Par: Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel strided matrix transposed matrix multiplication of 16-bit fix-point matrices kernel for XPULPV2 extension.
The output of the strided matrix multiplication will also be stored as a 16-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_q8
void plp_mat_mult_trans_stride_q8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Glue code for strided matrix transposed matrix multiplication of a 8-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for strided matrix transposed matrix multiplication of a 8-bit fix-point matrices.
The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_q8_parallel
void plp_mat_mult_trans_stride_q8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC
)
Glue code for parallel strided matrix transposed matrix multiplication of a 8-bit fix-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Glue code for parallel strided matrix transposed matrix multiplication of a 8-bit fix-point matrices.
The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_q8s_rv32im
void plp_mat_mult_trans_stride_q8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
strided matrix transposed matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its last x digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
strided matrix transposed matrix multiplication of a 8-bit fix-point matrices for RV32IM extension.
The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_q8s_xpulpv2
void plp_mat_mult_trans_stride_q8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Strided matrix transposed matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- shift Amount to shift the result of each multiplication.
- pDstC points to the output matrix
Return:
- none
- none
Par:
- Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
* Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
The output of the matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
Strided matrix transposed matrix multiplication of 8-bit fix-point matrices for XPULPV2 extension.
The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_q8p_xpulpv2
void plp_mat_mult_trans_stride_q8p_xpulpv2(
void * args
)
Parallel strided matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_q8 struct initialized by plp_mat_mult_trans_stride_q8_parallel
- args pointer to plp_mat_mult_stride_instance_q8 struct initialized by plp_mat_mult_trans_stride_q8_parallel
Return:
- none
- none
Par: Fix-Point and Shifting
The result will be shifted by the parameter shift
to the right (multiplied by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x, and matrix B as pSrcB * 2^-y (in other words, A has its x last digits after the binary point). Then, the output is represented as pDstC * 2^-(x + y - shift).
Parallel strided matrix transposed matrix multiplication of 8-bit fix-point matrices kernel for XPULPV2 extension.
The output of the strided matrix multiplication will also be stored as an 8-bit array. Set the shift
parameter such that no overflow occurs.
function plp_mat_mult_trans_stride_f32
void plp_mat_mult_trans_stride_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC
)
Glue code for strided matrix transposed matrix multiplication of 32-bit floating-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Glue code for strided matrix transposed matrix multiplication of 32-bit floating-point matrices.
function plp_mat_mult_trans_stride_f32s_xpulpv2
void plp_mat_mult_trans_stride_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC
)
Strided matrix transposed matrix multiplication of 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- pDstC points to the output matrix
Return:
- none
- none
Strided matrix transposed matrix multiplication of 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_mult_trans_stride_f32_parallel
void plp_mat_mult_trans_stride_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
float *__restrict__ pDstC
)
Glue code for parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices.
Parameters:
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M Height of first matrix
- N Width of first and height of second matrix
- O Width of second matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC Output is written here
- pSrcA points to the first input matrix
- pSrcB points to the second input matrix
- M height of the first input matrix
- N width of the first input matrix and height of the second
- O width of the second input matrix
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideC Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDstC points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices.
function plp_mat_mult_trans_stride_f32p_xpulpv2
void plp_mat_mult_trans_stride_f32p_xpulpv2(
void * args
)
Parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_mult_stride_instance_f32 struct initialized by plp_mat_mult_trans_stride_f32_parallel
- args pointer to plp_mat_mult_stride_instance_f32 struct initialized by plp_mat_mult_trans_stride_f32_parallel
Return:
- none
- none
Parallel strided matrix transposed matrix multiplication of 32-bit floating-point matrices kernel for XPULPV2 extension.
function plp_mat_mult_cmplx_stride_i32
void plp_mat_mult_cmplx_stride_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code of strided matrix matrix multiplication for complex 32-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i32s_rv32im
void plp_mat_mult_cmplx_stride_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 32-bit integers on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i32s_xpulpv2
void plp_mat_mult_cmplx_stride_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 32-bit integers on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i32_parallel
void plp_mat_mult_cmplx_stride_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel strided matrix matrix multiplication for complex 32-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i32p_xpulpv2
void plp_mat_mult_cmplx_stride_i32p_xpulpv2(
void * args
)
parallel strided matrix matrix multiplication for complex 32-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_i32 struct initialized by plp_mat_mult_cmplx_stride_i32_parallel
- args pointer to plp_mat_mult_cmplx_stride_instance_i32 struct initialized by plp_mat_mult_cmplx_stride_i32_parallel
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i16
void plp_mat_mult_cmplx_stride_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code of strided matrix matrix multiplication for complex 16-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i16s_rv32im
void plp_mat_mult_cmplx_stride_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 16-bit integers on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i16s_xpulpv2
void plp_mat_mult_cmplx_stride_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 16-bit integers on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_stride_i16_parallel
void plp_mat_mult_cmplx_stride_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel strided matrix matrix multiplication for complex 16-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i16p_xpulpv2
void plp_mat_mult_cmplx_stride_i16p_xpulpv2(
void * args
)
parallel strided matrix matrix multiplication for complex 16-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_i16 struct initialized by plp_mat_mult_cmplx_stride_i16_parallel
- args pointer to plp_mat_mult_cmplx_stride_instance_i16 struct initialized by plp_mat_mult_cmplx_stride_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_stride_i8
void plp_mat_mult_cmplx_stride_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code of strided matrix matrix multiplication for complex 8-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i8s_rv32im
void plp_mat_mult_cmplx_stride_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 8-bit integers on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i8s_xpulpv2
void plp_mat_mult_cmplx_stride_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 8-bit integers on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_stride_i8_parallel
void plp_mat_mult_cmplx_stride_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel strided matrix matrix multiplication for complex 8-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_i8p_xpulpv2
void plp_mat_mult_cmplx_stride_i8p_xpulpv2(
void * args
)
parallel strided matrix matrix multiplication for complex 8-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_i8 struct initialized by plp_mat_mult_cmplx_stride_i8_parallel
- args pointer to plp_mat_mult_cmplx_stride_instance_i8 struct initialized by plp_mat_mult_cmplx_stride_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_stride_f32
void plp_mat_mult_cmplx_stride_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC
)
Glue code of strided matrix matrix multiplication for complex 32-bit floats.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_f32s_xpulpv2
void plp_mat_mult_cmplx_stride_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 32-bit floats on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_f32_parallel
void plp_mat_mult_cmplx_stride_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
float *__restrict__ pDstC
)
Glue code of parallel strided matrix matrix multiplication for complex 32-bit floats.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_cmplx_stride_f32p_xpulpv2
void plp_mat_mult_cmplx_stride_f32p_xpulpv2(
void * args
)
parallel strided matrix matrix multiplication for complex 32-bit floats on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_f32 struct initialized by plp_mat_mult_cmplx_stride_f32_parallel
- args pointer to plp_mat_mult_cmplx_stride_instance_f32 struct initialized by plp_mat_mult_cmplx_stride_f32_parallel
Return:
- none
- none
function plp_mat_mult_cmplx_stride_q32
void plp_mat_mult_cmplx_stride_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Glue code of strided matrix matrix multiplication for complex 32-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q32s_rv32im
void plp_mat_mult_cmplx_stride_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 32-bit fix-point on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q32s_xpulpv2
void plp_mat_mult_cmplx_stride_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 32-bit fix-point on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q32_parallel
void plp_mat_mult_cmplx_stride_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel strided matrix matrix multiplication for complex 32-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q32p_xpulpv2
void plp_mat_mult_cmplx_stride_q32p_xpulpv2(
void * args
)
parallel strided matrix matrix multiplication for complex 32-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_q32 struct initialized by plp_mat_mult_cmplx_stride_q32_parallel
- args pointer to plp_mat_mult_cmplx_stride_instance_q32 struct initialized by plp_mat_mult_cmplx_stride_q32_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q16
void plp_mat_mult_cmplx_stride_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Glue code of strided matrix matrix multiplication for complex 16-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q16s_rv32im
void plp_mat_mult_cmplx_stride_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 16-bit fix-point on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q16s_xpulpv2
void plp_mat_mult_cmplx_stride_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 16-bit fix-point on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q16_parallel
void plp_mat_mult_cmplx_stride_q16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC
)
Glue code of parallel strided matrix matrix multiplication for complex 16-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q16p_xpulpv2
void plp_mat_mult_cmplx_stride_q16p_xpulpv2(
void * args
)
parallel strided matrix matrix multiplication for complex 16-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_q16 struct initialized by plp_mat_mult_cmplx_stride_q16_parallel
- args pointer to plp_mat_mult_cmplx_stride_instance_q16 struct initialized by plp_mat_mult_cmplx_stride_q16_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_cmplx_stride_q8
void plp_mat_mult_cmplx_stride_q8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Glue code of strided matrix matrix multiplication for complex 8-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q8s_rv32im
void plp_mat_mult_cmplx_stride_q8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 8-bit fix-point on RV32IM.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q8s_xpulpv2
void plp_mat_mult_cmplx_stride_q8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Strided matrix matrix multiplication for complex 8-bit fix-point on XpulpV2.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q8_parallel
void plp_mat_mult_cmplx_stride_q8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC
)
Glue code of parallel strided matrix matrix multiplication for complex 8-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape NxO
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and height of matrix SrcB
- O Width of matrix SrcB and DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_cmplx_stride_q8p_xpulpv2
void plp_mat_mult_cmplx_stride_q8p_xpulpv2(
void * args
)
parallel strided matrix matrix multiplication for complex 8-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_q8 struct initialized by plp_mat_mult_cmplx_stride_q8_parallel
- args pointer to plp_mat_mat_mult_cmplx_instance_q8 struct initialized by plp_mat_mult_cmplx_stride_q8_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Fix-Point
Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_trans_cmplx_stride_i32
void plp_mat_mult_trans_cmplx_stride_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code of strided matrix transpose matrix multiplication for complex 32-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_stride_i32s_rv32im
void plp_mat_mult_trans_cmplx_stride_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 32-bit integers on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
strided matrix transpose matrix multiplication for complex 32-bit integers on RV32IM
function plp_mat_mult_trans_cmplx_stride_i32s_xpulpv2
void plp_mat_mult_trans_cmplx_stride_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
strided matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2
function plp_mat_mult_trans_cmplx_stride_i32_parallel
void plp_mat_mult_trans_cmplx_stride_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_stride_i32p_xpulpv2
void plp_mat_mult_trans_cmplx_stride_i32p_xpulpv2(
void * args
)
parallel strided matrix transpose matrix multiplication for complex 32-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_i32 struct initialized by plp_mat_mult_trans_cmplx_stride_i32_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_i32 struct initialized by plp_mat_mult_trans_cmplx_stride_i32_parallel
Return:
- none
- none
function plp_mat_mult_trans_cmplx_stride_i16
void plp_mat_mult_trans_cmplx_stride_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code of strided matrix transpose matrix multiplication for complex 16-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_stride_i16s_rv32im
void plp_mat_mult_trans_cmplx_stride_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 16-bit integers on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
strided matrix transpose matrix multiplication for complex 16-bit integers on RV32IM
function plp_mat_mult_trans_cmplx_stride_i16s_xpulpv2
void plp_mat_mult_trans_cmplx_stride_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
strided matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2
function plp_mat_mult_trans_cmplx_stride_i16_parallel
void plp_mat_mult_trans_cmplx_stride_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel strided matrix transpose matrix multiplication for complex 16-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_stride_i16p_xpulpv2
void plp_mat_mult_trans_cmplx_stride_i16p_xpulpv2(
void * args
)
parallel strided matrix transpose matrix multiplication for complex 16-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_i16 struct initialized by plp_mat_mult_trans_cmplx_stride_i16_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_i16 struct initialized by plp_mat_mult_trans_cmplx_stride_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_trans_cmplx_stride_i8
void plp_mat_mult_trans_cmplx_stride_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
Glue code of strided matrix transpose matrix multiplication for complex 8-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_stride_i8s_rv32im
void plp_mat_mult_trans_cmplx_stride_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 8-bit integers on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
strided matrix transpose matrix multiplication for complex 8-bit integers on RV32IM
function plp_mat_mult_trans_cmplx_stride_i8s_xpulpv2
void plp_mat_mult_trans_cmplx_stride_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
strided matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2
function plp_mat_mult_trans_cmplx_stride_i8_parallel
void plp_mat_mult_trans_cmplx_stride_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel strided matrix transpose matrix multiplication for complex 8-bit integers.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_stride_i8p_xpulpv2
void plp_mat_mult_trans_cmplx_stride_i8p_xpulpv2(
void * args
)
parallel strided matrix transpose matrix multiplication for complex 8-bit integers on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_i8 struct initialized by plp_mat_mult_trans_cmplx_stride_i8_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_i8 struct initialized by plp_mat_mult_trans_cmplx_stride_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_trans_cmplx_stride_f32
void plp_mat_mult_trans_cmplx_stride_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC
)
Glue code of strided matrix transpose matrix multiplication for complex 32-bit floats.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_stride_f32s_xpulpv2
void plp_mat_mult_trans_cmplx_stride_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
strided matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2
function plp_mat_mult_trans_cmplx_stride_f32_parallel
void plp_mat_mult_trans_cmplx_stride_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
float *__restrict__ pDstC
)
Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit floats.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
function plp_mat_mult_trans_cmplx_stride_f32p_xpulpv2
void plp_mat_mult_trans_cmplx_stride_f32p_xpulpv2(
void * args
)
parallel strided matrix transpose matrix multiplication for complex 32-bit floats on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_f32 struct initialized by plp_mat_mult_trans_cmplx_stride_f32_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_f32 struct initialized by plp_mat_mult_trans_cmplx_stride_f32_parallel
Return:
- none
- none
function plp_mat_mult_trans_cmplx_stride_q32
void plp_mat_mult_trans_cmplx_stride_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
Glue code of strided matrix transpose matrix multiplication for complex 32-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted to the right by the parameter shift (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_stride_q32s_rv32im
void plp_mat_mult_trans_cmplx_stride_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
strided matrix transpose matrix multiplication for complex 32-bit fix-point on RV32IM
function plp_mat_mult_trans_cmplx_stride_q32s_xpulpv2
void plp_mat_mult_trans_cmplx_stride_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
strided matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2
function plp_mat_mult_trans_cmplx_stride_q32_parallel
void plp_mat_mult_trans_cmplx_stride_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC
)
Glue code of parallel strided matrix transpose matrix multiplication for complex 32-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_stride_q32p_xpulpv2
void plp_mat_mult_trans_cmplx_stride_q32p_xpulpv2(
void * args
)
parallel strided matrix transpose matrix multiplication for complex 32-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_q32 struct initialized by plp_mat_mult_trans_cmplx_stride_q32_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_q32 struct initialized by plp_mat_mult_trans_cmplx_stride_q32_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_stride_q16
void plp_mat_mult_trans_cmplx_stride_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
Glue code of strided matrix transpose matrix multiplication for complex 16-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_stride_q16s_rv32im
void plp_mat_mult_trans_cmplx_stride_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
strided matrix transpose matrix multiplication for complex 16-bit fix-point on RV32IM
function plp_mat_mult_trans_cmplx_stride_q16s_xpulpv2
void plp_mat_mult_trans_cmplx_stride_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
strided matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2
function plp_mat_mult_trans_cmplx_stride_q16_parallel
void plp_mat_mult_trans_cmplx_stride_q16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC
)
Glue code of parallel strided matrix transpose matrix multiplication for complex 16-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_stride_q16p_xpulpv2
void plp_mat_mult_trans_cmplx_stride_q16p_xpulpv2(
void * args
)
parallel strided matrix transpose matrix multiplication for complex 16-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_q16 struct initialized by plp_mat_mult_trans_cmplx_stride_q16_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_q16 struct initialized by plp_mat_mult_trans_cmplx_stride_q16_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_mult_trans_cmplx_stride_q8
void plp_mat_mult_trans_cmplx_stride_q8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
Glue code of strided matrix transpose matrix multiplication for complex 8-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_stride_q8s_rv32im
void plp_mat_mult_trans_cmplx_stride_q8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
strided matrix transpose matrix multiplication for complex 8-bit fix-point on RV32IM
function plp_mat_mult_trans_cmplx_stride_q8s_xpulpv2
void plp_mat_mult_trans_cmplx_stride_q8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC
)
strided matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator. * Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
strided matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2
function plp_mat_mult_trans_cmplx_stride_q8_parallel
void plp_mat_mult_trans_cmplx_stride_q8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC
)
Glue code of parallel strided matrix transpose matrix multiplication for complex 8-bit fix-point.
Parameters:
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
- pSrcA Points to the first input matrix of shape MxN
- pSrcB Points to the second input matrix of shape OxN
- M Height of matrix SrcA and DstC
- N Width of matrix SrcA and SrcB
- O Height of matrix SrcB and width of matrix DstC
- strideA Stride of input matrix A (elements between each row)
- strideB Stride of input matrix B (elements between each row)
- strideC Stride of output matrix C (Elements between each row)
- shift Amount to shift the result of each multiplication to the right
- nPE Number of cores to use for computation
- pDstC Points to the output matrix of shape MxO
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter `shift` to the right (which corresponds to a multiplication by `2^-shift`). Assume that matrix A is represented as `pSrcA * 2^-x` and matrix B as `pSrcB * 2^-y` (which means that A has `x`, and B has `y` bits after the binary point). Then, the output matrix C is represented as `pDstC * 2^-(x + y - shift)`. The output matrix is also stored with the same number of bits as the inputs. Set the `shift` parameter such that no overflow occurs.
function plp_mat_mult_trans_cmplx_stride_q8p_xpulpv2
void plp_mat_mult_trans_cmplx_stride_q8p_xpulpv2(
void * args
)
parallel strided matrix transpose matrix multiplication for complex 8-bit fix-point on XpulpV2
Parameters:
- args pointer to plp_mat_mult_cmplx_stride_instance_q8 struct initialized by plp_mat_mult_trans_cmplx_stride_q8_parallel
- args pointer to plp_mat_mat_mult_trans_cmplx_instance_q8 struct initialized by plp_mat_mult_trans_cmplx_stride_q8_parallel
Return:
- none
- none
Par:
- Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Fix-Point
Fix-Point and Shifting: The result will be shifted by the parameter shift to the right (which corresponds to a multiplication by 2^-shift). Assume that matrix A is represented as pSrcA * 2^-x and matrix B as pSrcB * 2^-y (which means that A has x, and B has y bits after the binary point). Then, the output matrix C is represented as pDstC * 2^-(x + y - shift). The output matrix is also stored with the same number of bits as the inputs. Set the shift parameter such that no overflow occurs.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_add_stride_i32
void plp_mat_add_stride_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst
)
Glue code for matrix addition of 32-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix addition of 32-bit integer matrices.
function plp_mat_add_stride_i32s_rv32im
void plp_mat_add_stride_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst
)
matrix addition of 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
matrix addition of 32-bit integer matrices for RV32IM extension.
function plp_mat_add_stride_i32s_xpulpv2
void plp_mat_add_stride_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst
)
matrix addition of 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
matrix addition of 32-bit integer matrices for XPULPV2 extension.
function plp_mat_add_stride_i32_parallel
void plp_mat_add_stride_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for parallel matrix addition of 32-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix addition of 32-bit integer matrices.
function plp_mat_add_stride_i32p_xpulpv2
void plp_mat_add_stride_i32p_xpulpv2(
void * args
)
Parallel matrix addition of 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_add_stride_instance_i32 struct initialized by plp_mat_add_stride_i32_parallel
- args pointer to plp_mat_add_stride_instance_i32 struct initialized by plp_mat_add_stride_i32_parallel
Return:
- none
- none
Parallel matrix addition of 32-bit integer matrices for XPULPV2 extension.
function plp_mat_add_stride_i16
void plp_mat_add_stride_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst
)
Glue code for matrix addition of 16-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix addition of 16-bit integer matrices.
function plp_mat_add_stride_i16s_rv32im
void plp_mat_add_stride_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst
)
matrix addition of 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
matrix addition of 16-bit integer matrices for RV32IM extension.
function plp_mat_add_stride_i16s_xpulpv2
void plp_mat_add_stride_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst
)
matrix addition of 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix addition of 16-bit integer matrices for XPULPV2 extension.
function plp_mat_add_stride_i16_parallel
void plp_mat_add_stride_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for parallel matrix addition of 16-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix addition of 16-bit integer matrices.
function plp_mat_add_stride_i16p_xpulpv2
void plp_mat_add_stride_i16p_xpulpv2(
void * args
)
Parallel matrix addition of 16-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_add_stride_instance_i16 struct initialized by plp_mat_add_stride_i16_parallel
- args pointer to plp_mat_add_stride_instance_i16 struct initialized by plp_mat_add_stride_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
Parallel matrix addition of 16-bit integer matrices kernel for XPULPV2 extension.
function plp_mat_add_stride_i8
void plp_mat_add_stride_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst
)
Glue code for matrix addition of 8-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix addition of 8-bit integer matrices.
function plp_mat_add_stride_i8s_rv32im
void plp_mat_add_stride_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst
)
matrix addition of 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
matrix addition of 8-bit integer matrices for RV32IM extension.
function plp_mat_add_stride_i8s_xpulpv2
void plp_mat_add_stride_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst
)
matrix addition of 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix addition of 8-bit integer matrices for XPULPV2 extension.
function plp_mat_add_stride_i8_parallel
void plp_mat_add_stride_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for parallel matrix addition of 8-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix addition of 8-bit integer matrices.
function plp_mat_add_stride_i8p_xpulpv2
void plp_mat_add_stride_i8p_xpulpv2(
void * args
)
Parallel matrix addition of 8-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_add_stride_instance_i8 struct initialized by plp_mat_add_stride_i8_parallel
- args pointer to plp_mat_add_stride_instance_i8 struct initialized by plp_mat_add_stride_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
* Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
Parallel matrix addition of 8-bit integer matrices kernel for XPULPV2 extension.
function plp_mat_add_stride_f32
void plp_mat_add_stride_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
float *__restrict__ pDst
)
Glue code for matrix addition of 32-bit floating-point matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix addition of 32-bit floating-point matrices.
function plp_mat_add_stride_f32s_xpulpv2
void plp_mat_add_stride_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
float *__restrict__ pDst
)
matrix addition of 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
matrix addition of 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_add_stride_f32_parallel
void plp_mat_add_stride_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code for parallel matrix addition of 32-bit floating-point matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix addition of 32-bit floating-point matrices.
function plp_mat_add_stride_f32p_xpulpv2
void plp_mat_add_stride_f32p_xpulpv2(
void * args
)
Parallel matrix addition of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_add_stride_instance_f32 struct initialized by plp_mat_add_stride_f32_parallel
- args pointer to plp_mat_add_stride_instance_f32 struct initialized by plp_mat_add_stride_f32_parallel
Return:
- none
- none
Parallel matrix addition of 32-bit floating-point matrices kernel for XPULPV2 extension.
function plp_mat_sub_stride_i32
void plp_mat_sub_stride_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst
)
Glue code for matrix subtraction of 32-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix subtraction of 32-bit integer matrices.
function plp_mat_sub_stride_i32s_rv32im
void plp_mat_sub_stride_i32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst
)
matrix subtraction of 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
matrix subtraction of 32-bit integer matrices for RV32IM extension.
function plp_mat_sub_stride_i32s_xpulpv2
void plp_mat_sub_stride_i32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst
)
matrix subtraction of 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
matrix subtraction of 32-bit integer matrices for XPULPV2 extension.
function plp_mat_sub_stride_i32_parallel
void plp_mat_sub_stride_i32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for parallel matrix subtraction of 32-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix subtraction of 32-bit integer matrices.
function plp_mat_sub_stride_i32p_xpulpv2
void plp_mat_sub_stride_i32p_xpulpv2(
void * args
)
Parallel matrix subtraction of 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_sub_stride_instance_i32 struct initialized by plp_mat_sub_stride_i32_parallel
- args pointer to plp_mat_sub_stride_instance_i32 struct initialized by plp_mat_sub_stride_i32_parallel
Return:
- none
- none
Parallel matrix subtraction of 32-bit integer matrices for XPULPV2 extension.
function plp_mat_sub_stride_i16
void plp_mat_sub_stride_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst
)
Glue code for matrix subtraction of 16-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix subtraction of 16-bit integer matrices.
function plp_mat_sub_stride_i16s_rv32im
void plp_mat_sub_stride_i16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst
)
matrix subtraction of 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
matrix subtraction of 16-bit integer matrices for RV32IM extension.
function plp_mat_sub_stride_i16s_xpulpv2
void plp_mat_sub_stride_i16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst
)
matrix subtraction of 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix subtraction of 16-bit integer matrices for XPULPV2 extension.
function plp_mat_sub_stride_i16_parallel
void plp_mat_sub_stride_i16_parallel(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for parallel matrix subtraction of 16-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix subtraction of 16-bit integer matrices.
function plp_mat_sub_stride_i16p_xpulpv2
void plp_mat_sub_stride_i16p_xpulpv2(
void * args
)
Parallel matrix subtraction of 16-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_sub_stride_instance_i16 struct initialized by plp_mat_sub_stride_i16_parallel
- args pointer to plp_mat_sub_stride_instance_i16 struct initialized by plp_mat_sub_stride_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
Parallel matrix subtraction of 16-bit integer matrices kernel for XPULPV2 extension.
function plp_mat_sub_stride_i8
void plp_mat_sub_stride_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst
)
Glue code for matrix subtraction of a 8-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix subtraction of a 8-bit integer matrices.
function plp_mat_sub_stride_i8s_rv32im
void plp_mat_sub_stride_i8s_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst
)
matrix subtraction of a 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
matrix subtraction of a 8-bit integer matrices for RV32IM extension.
function plp_mat_sub_stride_i8s_xpulpv2
void plp_mat_sub_stride_i8s_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst
)
matrix subtraction of a 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
matrix subtraction of a 8-bit integer matrices for XPULPV2 extension.
function plp_mat_sub_stride_i8_parallel
void plp_mat_sub_stride_i8_parallel(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for parallel matrix subtraction of a 8-bit integer matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix subtraction of a 8-bit integer matrices.
function plp_mat_sub_stride_i8p_xpulpv2
void plp_mat_sub_stride_i8p_xpulpv2(
void * args
)
Parallel matrix subtraction of 8-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_sub_stride_instance_i8 struct initialized by plp_mat_sub_stride_i8_parallel
- args pointer to plp_mat_sub_stride_instance_i8 struct initialized by plp_mat_sub_stride_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
Parallel matrix subtraction of 8-bit integer matrices kernel for XPULPV2 extension.
function plp_mat_sub_stride_f32
void plp_mat_sub_stride_f32(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
float *__restrict__ pDst
)
Glue code for matrix subtraction of a 32-bit floating-point matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Glue code for matrix subtraction of a 32-bit floating-point matrices.
function plp_mat_sub_stride_f32s_xpulpv2
void plp_mat_sub_stride_f32s_xpulpv2(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
float *__restrict__ pDst
)
matrix subtraction of a 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
matrix subtraction of a 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_sub_stride_f32_parallel
void plp_mat_sub_stride_f32_parallel(
const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code for parallel matrix subtraction of a 32-bit floating-point matrices.
Parameters:
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of the matrices
- N Width of the matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use
- pDst Points to the output matrix
- pSrcA Points to the first input matrix
- pSrcB Points to the second input matrix
- M Height of all matrices
- N Width of all matrices
- strideA Stride of matrix A (elements between each row)
- strideB Stride of matrix B (elements between each row)
- strideY Stride of output matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel matrix subtraction of a 32-bit floating-point matrices.
function plp_mat_sub_stride_f32p_xpulpv2
void plp_mat_sub_stride_f32p_xpulpv2(
void * args
)
Parallel matrix subtraction of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_sub_stride_instance_f32 struct initialized by plp_mat_sub_stride_f32_parallel
- args pointer to plp_mat_sub_stride_instance_f32 struct initialized by plp_mat_sub_stride_f32_parallel
Return:
- none
- none
Parallel matrix subtraction of 32-bit floating-point matrices kernel for XPULPV2 extension.
function plp_mat_scale_stride_i32
void plp_mat_scale_stride_i32(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst
)
Glue code for strided matrix scale of a 32-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Glue code for strided matrix scale of a 32-bit integer matrices.
function plp_mat_scale_stride_i32s_rv32im
void plp_mat_scale_stride_i32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst
)
strided matrix scale of a 32-bit integer matrices for RV32IM extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
strided matrix scale of a 32-bit integer matrices for RV32IM extension.
function plp_mat_scale_stride_i32s_xpulpv2
void plp_mat_scale_stride_i32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst
)
strided matrix scale of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
strided matrix scale of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_scale_stride_i32_parallel
void plp_mat_scale_stride_i32_parallel(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t scaleFactor,
int32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for parallel strided matrix scale of a 32-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- nPE Number of cores to use for computation
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix scale of a 32-bit integer matrices.
function plp_mat_scale_stride_i32p_xpulpv2
void plp_mat_scale_stride_i32p_xpulpv2(
void * args
)
Parallel strided matrix scale of a 32-bit integer matrices for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_scale_stride_instance_i32 struct initialized by plp_mat_scale_stride_i32_parallel
- args pointer to plp_mat_scale_stride_instance_i32 struct initialized by plp_mat_scale_stride_i32_parallel
Return:
- none
- none
Parallel strided matrix scale of a 32-bit integer matrices for XPULPV2 extension.
function plp_mat_scale_stride_i16
void plp_mat_scale_stride_i16(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst
)
Glue code for strided matrix scale of a 16-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Glue code for strided matrix scale of a 16-bit integer matrices.
function plp_mat_scale_stride_i16s_rv32im
void plp_mat_scale_stride_i16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst
)
strided matrix scale of a 16-bit integer matrices for RV32IM extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
strided matrix scale of a 16-bit integer matrices for RV32IM extension.
function plp_mat_scale_stride_i16s_xpulpv2
void plp_mat_scale_stride_i16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst
)
strided matrix scale of a 16-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
strided matrix scale of a 16-bit integer matrices for XPULPV2 extension.
function plp_mat_scale_stride_i16_parallel
void plp_mat_scale_stride_i16_parallel(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t scaleFactor,
int32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for parallel strided matrix scale of a 16-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix scale of a 16-bit integer matrices.
function plp_mat_scale_stride_i16p_xpulpv2
void plp_mat_scale_stride_i16p_xpulpv2(
void * args
)
Parallel strided matrix scale of 16-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_scale_stride_instance_i16 struct initialized by plp_mat_scale_stride_i16_parallel
- args pointer to plp_mat_scale_stride_instance_i16 struct initialized by plp_mat_scale_stride_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_scale_stride_i8
void plp_mat_scale_stride_i8(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst
)
Glue code for strided matrix scale of a 8-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Glue code for strided matrix scale of a 8-bit integer matrices.
function plp_mat_scale_stride_i8s_rv32im
void plp_mat_scale_stride_i8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst
)
strided matrix scale of a 8-bit integer matrices for RV32IM extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
strided matrix scale of a 8-bit integer matrices for RV32IM extension.
function plp_mat_scale_stride_i8s_xpulpv2
void plp_mat_scale_stride_i8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst
)
strided matrix scale of a 8-bit integer matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
strided matrix scale of a 8-bit integer matrices for XPULPV2 extension.
function plp_mat_scale_stride_i8_parallel
void plp_mat_scale_stride_i8_parallel(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t scaleFactor,
int32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for parallel strided matrix scale of a 8-bit integer matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements before shifting
- shift Amount to shift each element
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix scale of a 8-bit integer matrices.
function plp_mat_scale_stride_i8p_xpulpv2
void plp_mat_scale_stride_i8p_xpulpv2(
void * args
)
Parallel strided matrix scale of 8-bit integer matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_scale_stride_instance_i8 struct initialized by plp_mat_scale_stride_i8_parallel
- args pointer to plp_mat_scale_stride_instance_i8 struct initialized by plp_mat_scale_stride_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_scale_stride_f32
void plp_mat_scale_stride_f32(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
float scaleFactor,
float *__restrict__ pDst
)
Glue code for strided matrix scale of a 32-bit floating-point matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements
- pDst Points to the output matrix
Return:
- none
- none
Glue code for strided matrix scale of a 32-bit floating-point matrices.
function plp_mat_scale_stride_f32s_xpulpv2
void plp_mat_scale_stride_f32s_xpulpv2(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
float scaleFactor,
float *__restrict__ pDst
)
strided matrix scale of a 32-bit floating-point matrices for XPULPV2 extension.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements
- pDst Points to the output matrix
Return:
- none
- none
strided matrix scale of a 32-bit floating-point matrices for XPULPV2 extension.
function plp_mat_scale_stride_f32_parallel
void plp_mat_scale_stride_f32_parallel(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
float scaleFactor,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code for parallel strided matrix scale of a 32-bit floating-point matrices.
Parameters:
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride for input matrix (elements between each row)
- strideDst Stride for output matrix (elements between each row)
- scaleFactor Factor to multiply all elements
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- pSrc Points to the input matrix
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of input matrix (elements between each row)
- strideDst Stride of output matrix (elements between each row)
- scaleFactor Factor to multiply all elements
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Glue code for parallel strided matrix scale of a 32-bit floating-point matrices.
function plp_mat_scale_stride_f32p_xpulpv2
void plp_mat_scale_stride_f32p_xpulpv2(
void * args
)
Parallel strided matrix scale of 32-bit floating-point matrices kernel for XPULPV2 extension.
Parameters:
- args pointer to plp_mat_scale_stride_instance_f32 struct initialized by plp_mat_scale_stride_f32_parallel
- args pointer to plp_mat_scale_stride_instance_f32 struct initialized by plp_mat_scale_stride_f32_parallel
Return:
- none
- none
function plp_mat_fill_I_stride_i32
void plp_mat_fill_I_stride_i32(
uint32_t N,
uint32_t stride,
int32_t *__restrict__ pDst
)
Glue code for creating a strided 32-bit integers identity matrix.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_I_stride_i32s_rv32im
void plp_mat_fill_I_stride_i32s_rv32im(
uint32_t N,
uint32_t stride,
int32_t *__restrict__ pDst
)
Create a strided 32-bit integers identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Create a strided 32-bit integers identity matrix on RV32IM.
function plp_mat_fill_I_stride_i32s_xpulpv2
void plp_mat_fill_I_stride_i32s_xpulpv2(
uint32_t N,
uint32_t stride,
int32_t *__restrict__ pDst
)
Create a strided 32-bit integers identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_I_stride_i32_parallel
void plp_mat_fill_I_stride_i32_parallel(
uint32_t N,
uint32_t stride,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for creating a strided 32-bit integers identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_I_stride_i32p_xpulpv2
void plp_mat_fill_I_stride_i32p_xpulpv2(
void * args
)
Create a strided 32-bit integers identity matrix on XpulpV2 in parallel.
Parameters:
- args Pointer to the plp_mat_fill_I_stride_instance_i32 struct initialized by plp_mat_fill_I_stride_i32_parallel
Return:
- none
- none
function plp_mat_fill_I_stride_i16
void plp_mat_fill_I_stride_i16(
uint32_t N,
uint32_t stride,
int16_t *__restrict__ pDst
)
Glue code for creating a strided 16-bit integers identity matrix.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_I_stride_i16s_rv32im
void plp_mat_fill_I_stride_i16s_rv32im(
uint32_t N,
uint32_t stride,
int16_t *__restrict__ pDst
)
Create a strided 16-bit integers identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Create a strided 16-bit integers identity matrix on RV32IM.
function plp_mat_fill_I_stride_i16s_xpulpv2
void plp_mat_fill_I_stride_i16s_xpulpv2(
uint32_t N,
uint32_t stride,
int16_t *__restrict__ pDst
)
Create a strided 16-bit integers identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_I_stride_i16_parallel
void plp_mat_fill_I_stride_i16_parallel(
uint32_t N,
uint32_t stride,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for creating a strided 16-bit integers identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_I_stride_i16p_xpulpv2
void plp_mat_fill_I_stride_i16p_xpulpv2(
void * args
)
Create a strided 16-bit integers identity matrix on XpulpV2 in parallel.
Parameters:
- args Pointer to the plp_mat_fill_I_stride_instance_i16 struct initialized by plp_mat_fill_I_stride_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16-bit values are packed two each into 32-bit vectors and then the two dot products are performed on 32-bit vectors, with a 32-bit accumulator.
function plp_mat_fill_I_stride_i8
void plp_mat_fill_I_stride_i8(
uint32_t N,
uint32_t stride,
int8_t *__restrict__ pDst
)
Glue code for creating a strided 8-bit integers identity matrix.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_I_stride_i8s_rv32im
void plp_mat_fill_I_stride_i8s_rv32im(
uint32_t N,
uint32_t stride,
int8_t *__restrict__ pDst
)
Create a strided 8-bit integers identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Create a strided 8-bit integers identity matrix on RV32IM.
function plp_mat_fill_I_stride_i8s_xpulpv2
void plp_mat_fill_I_stride_i8s_xpulpv2(
uint32_t N,
uint32_t stride,
int8_t *__restrict__ pDst
)
Create a strided 8-bit integers identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_I_stride_i8_parallel
void plp_mat_fill_I_stride_i8_parallel(
uint32_t N,
uint32_t stride,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for creating a strided 8-bit integers identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_I_stride_i8p_xpulpv2
void plp_mat_fill_I_stride_i8p_xpulpv2(
void * args
)
Create a strided 8-bit integers identity matrix on XpulpV2 in parallel.
Parameters:
- args Pointer to the plp_mat_fill_I_stride_instance_i8 struct initialized by plp_mat_fill_I_stride_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8-bit values are packed four each into 32-bit vectors and then the four dot products are performed on 32-bit vectors, with a 32-bit accumulator.
function plp_mat_fill_I_stride_f32
void plp_mat_fill_I_stride_f32(
uint32_t N,
uint32_t stride,
float *__restrict__ pDst
)
Glue code for creating a strided 32-bit floats identity matrix.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_I_stride_f32s_xpulpv2
void plp_mat_fill_I_stride_f32s_xpulpv2(
uint32_t N,
uint32_t stride,
float *__restrict__ pDst
)
Create a strided 32-bit floats identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_I_stride_f32_parallel
void plp_mat_fill_I_stride_f32_parallel(
uint32_t N,
uint32_t stride,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code for creating a strided 32-bit floats identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_I_stride_f32p_xpulpv2
void plp_mat_fill_I_stride_f32p_xpulpv2(
void * args
)
Create a strided 32-bit floats identity matrix on XpulpV2 in parallel.
Parameters:
- args Pointer to the plp_mat_fill_I_stride_instance_f32 struct initialized by plp_mat_fill_I_stride_f32_parallel
Return:
- none
- none
function plp_mat_fill_I_stride_q32
void plp_mat_fill_I_stride_q32(
uint32_t N,
uint32_t stride,
int32_t fracBits,
int32_t *__restrict__ pDst
)
Glue code for creating a strided 32-bit fix-point identity matrix.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
function plp_mat_fill_I_stride_q32s_rv32im
void plp_mat_fill_I_stride_q32s_rv32im(
uint32_t N,
uint32_t stride,
int32_t fracBits,
int32_t *__restrict__ pDst
)
Create a strided 32-bit fix-point identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
Create a strided 32-bit fix-point identity matrix on RV32IM.
function plp_mat_fill_I_stride_q32s_xpulpv2
void plp_mat_fill_I_stride_q32s_xpulpv2(
uint32_t N,
uint32_t stride,
int32_t fracBits,
int32_t *__restrict__ pDst
)
Create a strided 32-bit fix-point identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
function plp_mat_fill_I_stride_q32_parallel
void plp_mat_fill_I_stride_q32_parallel(
uint32_t N,
uint32_t stride,
int32_t fracBits,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for creating a strided 32-bit fix-point identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
function plp_mat_fill_I_stride_q32p_xpulpv2
void plp_mat_fill_I_stride_q32p_xpulpv2(
void * args
)
Create a strided 32-bit fix-point identity matrix on XpulpV2 in parallel.
Parameters:
- args Pointer to the plp_mat_fill_I_stride_instance_q32 struct initialized by plp_mat_fill_I_stride_q32_parallel
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
function plp_mat_fill_I_stride_q16
void plp_mat_fill_I_stride_q16(
uint32_t N,
uint32_t stride,
int32_t fracBits,
int16_t *__restrict__ pDst
)
Glue code for creating a strided 16-bit fix-point identity matrix.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
function plp_mat_fill_I_stride_q16s_rv32im
void plp_mat_fill_I_stride_q16s_rv32im(
uint32_t N,
uint32_t stride,
int32_t fracBits,
int16_t *__restrict__ pDst
)
Create a strided 16-bit fix-point identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
Create a strided 16-bit fix-point identity matrix on RV32IM.
function plp_mat_fill_I_stride_q16s_xpulpv2
void plp_mat_fill_I_stride_q16s_xpulpv2(
uint32_t N,
uint32_t stride,
int32_t fracBits,
int16_t *__restrict__ pDst
)
Create a strided 16-bit fix-point identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
- Exploiting SIMD instructions
The 16-bit values are packed two each into 32-bit vectors and then the two dot products are performed on 32-bit vectors, with a 32-bit accumulator.
function plp_mat_fill_I_stride_q16_parallel
void plp_mat_fill_I_stride_q16_parallel(
uint32_t N,
uint32_t stride,
int32_t fracBits,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for creating a strided 16-bit fix-point identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
function plp_mat_fill_I_stride_q16p_xpulpv2
void plp_mat_fill_I_stride_q16p_xpulpv2(
void * args
)
Create a strided 16-bit fix-point identity matrix on XpulpV2 in parallel.
Parameters:
- args Pointer to the plp_mat_fill_I_stride_instance_q16 struct initialized by plp_mat_fill_I_stride_q16_parallel
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
- Exploiting SIMD instructions
The 16-bit values are packed two each into 32-bit vectors and then the two dot products are performed on 32-bit vectors, with a 32-bit accumulator.
function plp_mat_fill_I_stride_q8
void plp_mat_fill_I_stride_q8(
uint32_t N,
uint32_t stride,
int32_t fracBits,
int8_t *__restrict__ pDst
)
Glue code for creating a strided 8-bit fix-point identity matrix.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
function plp_mat_fill_I_stride_q8s_rv32im
void plp_mat_fill_I_stride_q8s_rv32im(
uint32_t N,
uint32_t stride,
int32_t fracBits,
int8_t *__restrict__ pDst
)
Create a strided 8-bit fix-point identity matrix on RV32IM.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
Create a strided 8-bit fix-point identity matrix on RV32IM.
function plp_mat_fill_I_stride_q8s_xpulpv2
void plp_mat_fill_I_stride_q8s_xpulpv2(
uint32_t N,
uint32_t stride,
int32_t fracBits,
int8_t *__restrict__ pDst
)
Create a strided 8-bit fix-point identity matrix on XpulpV2.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
- Exploiting SIMD instructions
The 8-bit values are packed four each into 32-bit vectors and then the four dot products are performed on 32-bit vectors, with a 32-bit accumulator.
function plp_mat_fill_I_stride_q8_parallel
void plp_mat_fill_I_stride_q8_parallel(
uint32_t N,
uint32_t stride,
int32_t fracBits,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for creating a strided 8-bit fix-point identity matrix in parallel.
Parameters:
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix
- N Width and height of the matrix
- stride Stride of the matrix (elements between each row)
- fracBits Decimal point for the appropriate scale
- nPE Number of cores to use for computation
- pDst Points to the output matrix
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
function plp_mat_fill_I_stride_q8p_xpulpv2
void plp_mat_fill_I_stride_q8p_xpulpv2(
void * args
)
Create a strided 8-bit fix-point identity matrix on XpulpV2 in parallel.
Parameters:
- args Pointer to the plp_mat_fill_I_stride_instance_q8 struct initialized by plp_mat_fill_I_stride_q8_parallel
Return:
- none
- none
Par:
- Fixed-Point
The diagonal elements will be filled with the value `1 << fracBits`.
- Exploiting SIMD instructions
The 8-bit values are packed four each into 32-bit vectors and then the four dot products are performed on 32-bit vectors, with a 32-bit accumulator.
function plp_mat_fill_stride_i32
void plp_mat_fill_stride_i32(
uint32_t M,
uint32_t N,
uint32_t stride,
int32_t value,
int32_t *__restrict__ pDst
)
Glue code for filling an MxN strided 32-bit integers matrix.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_i32s_rv32im
void plp_mat_fill_stride_i32s_rv32im(
uint32_t M,
uint32_t N,
uint32_t stride,
int32_t value,
int32_t *__restrict__ pDst
)
Fill an MxN strided 32-bit integers matrix on RV32IM.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_i32s_xpulpv2
void plp_mat_fill_stride_i32s_xpulpv2(
uint32_t M,
uint32_t N,
uint32_t stride,
int32_t value,
int32_t *__restrict__ pDst
)
Fill an MxN strided 32-bit integers matrix on XpulpV2.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_i32_parallel
void plp_mat_fill_stride_i32_parallel(
uint32_t M,
uint32_t N,
uint32_t stride,
int32_t value,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code for filling an MxN strided 32-bit integers matrix in parallel.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- nPE Number of cores to use for processing
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_i32p_xpulpv2
void plp_mat_fill_stride_i32p_xpulpv2(
void * args
)
Fill an MxN strided 32-bit integers matrix on XpulpV2 in parallel.
Parameters:
- args Pointer to the plp_mat_fill_stride_instance_i32 struct initialized by plp_mat_fill_stride_i32_parallel
Return:
- none
- none
function plp_mat_fill_stride_i16
void plp_mat_fill_stride_i16(
uint32_t M,
uint32_t N,
uint32_t stride,
int16_t value,
int16_t *__restrict__ pDst
)
Glue code for filling an MxN strided 16-bit integers matrix.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_i16s_rv32im
void plp_mat_fill_stride_i16s_rv32im(
uint32_t M,
uint32_t N,
uint32_t stride,
int16_t value,
int16_t *__restrict__ pDst
)
Fill an MxN strided 16-bit integers matrix on RV32IM.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_i16s_xpulpv2
void plp_mat_fill_stride_i16s_xpulpv2(
uint32_t M,
uint32_t N,
uint32_t stride,
int16_t value,
int16_t *__restrict__ pDst
)
Fill an MxN strided 16-bit integers matrix on XpulpV2.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_stride_i16_parallel
void plp_mat_fill_stride_i16_parallel(
uint32_t M,
uint32_t N,
uint32_t stride,
int16_t value,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code for filling an MxN strided 16-bit integers matrix in parallel.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- nPE Number of cores to use for processing
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_i16p_xpulpv2
void plp_mat_fill_stride_i16p_xpulpv2(
void * args
)
Fill an MxN strided 16-bit integers matrix on XpulpV2 in parallel.
Parameters:
- args Pointer to the plp_mat_fill_stride_instance_i16 struct initialized by plp_mat_fill_stride_i16_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16-bit values are packed two each into 32-bit vectors and then the two dot products are performed on 32-bit vectors, with a 32-bit accumulator.
function plp_mat_fill_stride_i8
void plp_mat_fill_stride_i8(
uint32_t M,
uint32_t N,
uint32_t stride,
int8_t value,
int8_t *__restrict__ pDst
)
Glue code for filling an MxN strided 8-bit integers matrix.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_i8s_rv32im
void plp_mat_fill_stride_i8s_rv32im(
uint32_t M,
uint32_t N,
uint32_t stride,
int8_t value,
int8_t *__restrict__ pDst
)
Fill an MxN strided 8-bit integers matrix on RV32IM.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_i8s_xpulpv2
void plp_mat_fill_stride_i8s_xpulpv2(
uint32_t M,
uint32_t N,
uint32_t stride,
int8_t value,
int8_t *__restrict__ pDst
)
Fill an MxN strided 8-bit integers matrix on XpulpV2.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_fill_stride_i8_parallel
void plp_mat_fill_stride_i8_parallel(
uint32_t M,
uint32_t N,
uint32_t stride,
int8_t value,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code for filling an MxN strided 8-bit integers matrix in parallel.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- nPE Number of cores to use for processing
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_i8p_xpulpv2
void plp_mat_fill_stride_i8p_xpulpv2(
void * args
)
Fill an MxN strided 8-bit integers matrix on XpulpV2 in parallel.
Parameters:
- args Pointer to the plp_mat_fill_stride_instance_i8 struct initialized by plp_mat_fill_stride_i8_parallel
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 8-bit values are packed four each into 32-bit vectors and then the four dot products are performed on 32-bit vectors, with a 32-bit accumulator.
function plp_mat_fill_stride_f32
void plp_mat_fill_stride_f32(
uint32_t M,
uint32_t N,
uint32_t stride,
float value,
float *__restrict__ pDst
)
Glue code for filling an MxN strided 32-bit floats matrix.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_f32s_xpulpv2
void plp_mat_fill_stride_f32s_xpulpv2(
uint32_t M,
uint32_t N,
uint32_t stride,
float value,
float *__restrict__ pDst
)
Fill an MxN strided 32-bit floats matrix on XpulpV2.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_f32_parallel
void plp_mat_fill_stride_f32_parallel(
uint32_t M,
uint32_t N,
uint32_t stride,
float value,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code for filling an MxN strided 32-bit floats matrix in parallel.
Parameters:
- M Height of the matrix
- N Width of the matrix
- stride Stride of the matrix (elements between each row)
- value Value used to fill the matrix
- nPE Number of cores to use for processing
- pDst Points to the output matrix
Return:
- none
- none
function plp_mat_fill_stride_f32p_xpulpv2
void plp_mat_fill_stride_f32p_xpulpv2(
void * args
)
Fill an MxN strided 32-bit floats matrix on XpulpV2 in parallel.
Parameters:
- args pointer to plp_mat_fill_stride_instance_f32 struct initialized by plp_mat_fill_stride_f32_parallel
Return:
- none
function plp_mat_copy_stride_i32
void plp_mat_copy_stride_i32(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t *__restrict__ pDst
)
Glue code to copy an MxN strided 32-bit integers matrix.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_i32s_rv32im
void plp_mat_copy_stride_i32s_rv32im(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t *__restrict__ pDst
)
Copy an MxN strided 32-bit integers matrix on RV32IM.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_i32s_xpulpv2
void plp_mat_copy_stride_i32s_xpulpv2(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t *__restrict__ pDst
)
Copy an MxN strided 32-bit integers matrix on XpulpV2.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_i32_parallel
void plp_mat_copy_stride_i32_parallel(
const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
uint32_t nPE,
int32_t *__restrict__ pDst
)
Glue code to copy an MxN strided 32-bit integers matrix in parallel.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- nPE Number of cores to use for processing
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_i32p_xpulpv2
void plp_mat_copy_stride_i32p_xpulpv2(
void * args
)
Copy an MxN strided 32-bit integers matrix on XpulpV2 in parallel.
Parameters:
- args pointer to plp_mat_copy_stride_instance_i32 struct initialized by plp_mat_copy_stride_i32_parallel
Return:
- none
function plp_mat_copy_stride_i16
void plp_mat_copy_stride_i16(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t *__restrict__ pDst
)
Glue code to copy an MxN strided 16-bit integers matrix.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_i16s_rv32im
void plp_mat_copy_stride_i16s_rv32im(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t *__restrict__ pDst
)
Copy an MxN strided 16-bit integers matrix on RV32IM.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_i16s_xpulpv2
void plp_mat_copy_stride_i16s_xpulpv2(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t *__restrict__ pDst
)
Copy an MxN strided 16-bit integers matrix on XpulpV2.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_copy_stride_i16_parallel
void plp_mat_copy_stride_i16_parallel(
const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
uint32_t nPE,
int16_t *__restrict__ pDst
)
Glue code to copy an MxN strided 16-bit integers matrix in parallel.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- nPE Number of cores to use for processing
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_i16p_xpulpv2
void plp_mat_copy_stride_i16p_xpulpv2(
void * args
)
Copy an MxN strided 16-bit integers matrix on XpulpV2 in parallel.
Parameters:
- args pointer to plp_mat_copy_stride_instance_i16 struct initialized by plp_mat_copy_stride_i16_parallel
Return:
- none
Par: Exploiting SIMD instructions
The 16 bit values are packed two each into 32 bit vectors and then the two dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_copy_stride_i8
void plp_mat_copy_stride_i8(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t *__restrict__ pDst
)
Glue code to copy an MxN strided 8-bit integers matrix.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_i8s_rv32im
void plp_mat_copy_stride_i8s_rv32im(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t *__restrict__ pDst
)
Copy an MxN strided 8-bit integers matrix on RV32IM.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_i8s_xpulpv2
void plp_mat_copy_stride_i8s_xpulpv2(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t *__restrict__ pDst
)
Copy an MxN strided 8-bit integers matrix on XpulpV2.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_copy_stride_i8_parallel
void plp_mat_copy_stride_i8_parallel(
const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
uint32_t nPE,
int8_t *__restrict__ pDst
)
Glue code to copy an MxN strided 8-bit integers matrix in parallel.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- nPE Number of cores to use for processing
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_i8p_xpulpv2
void plp_mat_copy_stride_i8p_xpulpv2(
void * args
)
Copy an MxN strided 8-bit integers matrix on XpulpV2 in parallel.
Parameters:
- args pointer to plp_mat_copy_stride_instance_i8 struct initialized by plp_mat_copy_stride_i8_parallel
Return:
- none
Par: Exploiting SIMD instructions
The 8 bit values are packed four each into 32 bit vectors and then the four dot products are performed on 32 bit vectors, with 32 bit accumulator.
function plp_mat_copy_stride_f32
void plp_mat_copy_stride_f32(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
float *__restrict__ pDst
)
Glue code to copy an MxN strided 32-bit floats matrix.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_f32s_xpulpv2
void plp_mat_copy_stride_f32s_xpulpv2(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
float *__restrict__ pDst
)
Copy an MxN strided 32-bit floats matrix on XpulpV2.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_f32_parallel
void plp_mat_copy_stride_f32_parallel(
const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
uint32_t nPE,
float *__restrict__ pDst
)
Glue code to copy an MxN strided 32-bit floats matrix in parallel.
Parameters:
- pSrc Points to the input matrix of shape MxN
- M Height of both matrices
- N Width of both matrices
- strideSrc Stride of the input matrix (elements between each row)
- strideDst Stride of the output matrix (elements between each row)
- nPE Number of cores to use for processing
- pDst Points to the output matrix of shape MxN
Return:
- none
function plp_mat_copy_stride_f32p_xpulpv2
void plp_mat_copy_stride_f32p_xpulpv2(
void * args
)
Copy an MxN strided 32-bit floats matrix on XpulpV2 in parallel.
Parameters:
- args pointer to plp_mat_copy_stride_instance_f32 struct initialized by plp_mat_copy_stride_f32_parallel
Return:
- none
function plp_cmplx_conj_f32
void plp_cmplx_conj_f32(
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex conjugate of 32-bit float vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_conj_f32_xpulpv2
void plp_cmplx_conj_f32_xpulpv2(
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t numSamples
)
Floating-point complex conjugate.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_conj_i32
void plp_cmplx_conj_i32(
const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex conjugate of 32-bit integer vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_conj_i32_xpulpv2
void plp_cmplx_conj_i32_xpulpv2(
const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
32-bit integer complex conjugate.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_conj_i32_rv32im
void plp_cmplx_conj_i32_rv32im(
const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
32-bit integer complex conjugate.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_conj_i16
void plp_cmplx_conj_i16(
const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex conjugate of 16-bit integer vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_conj_i16_xpulpv2
void plp_cmplx_conj_i16_xpulpv2(
const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
16-bit integer complex conjugate.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_conj_i16_rv32im
void plp_cmplx_conj_i16_rv32im(
const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
16-bit integer complex conjugate.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_conj_i8
void plp_cmplx_conj_i8(
const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex conjugate of 8-bit integer vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_conj_i8_xpulpv2
void plp_cmplx_conj_i8_xpulpv2(
const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
8-bit integer complex conjugate.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_conj_i8_rv32im
void plp_cmplx_conj_i8_rv32im(
const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
8-bit integer complex conjugate.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
Return: none
function plp_cmplx_dot_prod_f32
void plp_cmplx_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t numSamples,
float32_t * realResult,
float32_t * imagResult
)
Glue code for complex dot product of 32-bit float vectors.
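Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return: none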
function plp_cmplx_dot_prod_f32_xpulpv2
void plp_cmplx_dot_prod_f32_xpulpv2(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t numSamples,
float32_t * realResult,
float32_t * imagResult
)
Floating-point complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
function plp_cmplx_dot_prod_i32
void plp_cmplx_dot_prod_i32(
const int32_t * pSrcA,
const int32_t * pSrcB,
uint32_t numSamples,
int32_t * realResult,
int32_t * imagResult
)
Glue code for complex dot product of 32-bit integer vectors.
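Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return: none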
function plp_cmplx_dot_prod_i32_xpulpv2
void plp_cmplx_dot_prod_i32_xpulpv2(
const int32_t * pSrcA,
const int32_t * pSrcB,
uint32_t numSamples,
int32_t * realResult,
int32_t * imagResult
)
32-bit integer complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
function plp_cmplx_dot_prod_i32_rv32im
void plp_cmplx_dot_prod_i32_rv32im(
const int32_t * pSrcA,
const int32_t * pSrcB,
uint32_t numSamples,
int32_t * realResult,
int32_t * imagResult
)
32-bit integer complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
function plp_cmplx_dot_prod_i16
void plp_cmplx_dot_prod_i16(
const int16_t * pSrcA,
const int16_t * pSrcB,
uint32_t numSamples,
int16_t * realResult,
int16_t * imagResult
)
Glue code for complex dot product of 16-bit integer vectors.
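Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return: none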
function plp_cmplx_dot_prod_i16_xpulpv2
void plp_cmplx_dot_prod_i16_xpulpv2(
const int16_t * pSrcA,
const int16_t * pSrcB,
uint32_t numSamples,
int16_t * realResult,
int16_t * imagResult
)
16-bit integer complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
function plp_cmplx_dot_prod_i16_rv32im
void plp_cmplx_dot_prod_i16_rv32im(
const int16_t * pSrcA,
const int16_t * pSrcB,
uint32_t numSamples,
int16_t * realResult,
int16_t * imagResult
)
16-bit integer complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
function plp_cmplx_dot_prod_i8
void plp_cmplx_dot_prod_i8(
const int8_t * pSrcA,
const int8_t * pSrcB,
uint32_t numSamples,
int8_t * realResult,
int8_t * imagResult
)
Glue code for complex dot product of 8-bit integer vectors.
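Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return: none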
function plp_cmplx_dot_prod_i8_xpulpv2
void plp_cmplx_dot_prod_i8_xpulpv2(
const int8_t * pSrcA,
const int8_t * pSrcB,
uint32_t numSamples,
int8_t * realResult,
int8_t * imagResult
)
8-bit integer complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
function plp_cmplx_dot_prod_i8_rv32im
void plp_cmplx_dot_prod_i8_rv32im(
const int8_t * pSrcA,
const int8_t * pSrcB,
uint32_t numSamples,
int8_t * realResult,
int8_t * imagResult
)
8-bit integer complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
function plp_cmplx_dot_prod_q32
void plp_cmplx_dot_prod_q32(
const int32_t * pSrcA,
const int32_t * pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int32_t * realResult,
int32_t * imagResult
)
Glue code for complex dot product of 32-bit fixed-point vectors.
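Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- deciPoint decimal point for right shift
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return: none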
function plp_cmplx_dot_prod_q32_xpulpv2
void plp_cmplx_dot_prod_q32_xpulpv2(
const int32_t * pSrcA,
const int32_t * pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int32_t * realResult,
int32_t * imagResult
)
32-bit fixed-point complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- deciPoint decimal point for right shift
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
32-bit fixed-point complex dot product.
function plp_cmplx_dot_prod_q32_rv32im
void plp_cmplx_dot_prod_q32_rv32im(
const int32_t * pSrcA,
const int32_t * pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int32_t * realResult,
int32_t * imagResult
)
32-bit fixed-point complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- deciPoint decimal point for right shift
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
function plp_cmplx_dot_prod_q16
void plp_cmplx_dot_prod_q16(
const int16_t * pSrcA,
const int16_t * pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int16_t * realResult,
int16_t * imagResult
)
Glue code for complex dot product of 16-bit fixed-point vectors.
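Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- deciPoint decimal point for right shift
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return: none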
function plp_cmplx_dot_prod_q16_xpulpv2
void plp_cmplx_dot_prod_q16_xpulpv2(
const int16_t * pSrcA,
const int16_t * pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int16_t * realResult,
int16_t * imagResult
)
16-bit fixed-point complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- deciPoint decimal point for right shift
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
16-bit fixed-point complex dot product.
function plp_cmplx_dot_prod_q16_rv32im
void plp_cmplx_dot_prod_q16_rv32im(
const int16_t * pSrcA,
const int16_t * pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int16_t * realResult,
int16_t * imagResult
)
16-bit fixed-point complex dot product.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- numSamples number of samples in each vector
- deciPoint decimal point for right shift
- realResult real part of the result returned here
- imagResult imaginary part of the result returned here
Return:
- none
16-bit fixed-point complex dot product.
function plp_cmplx_mult_real_f32
void plp_cmplx_mult_real_f32(
const float32_t *__restrict__ pSrcCmplx,
const float32_t *__restrict__ pSrcReal,
float32_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex multiplied with real of 32-bit float vectors.
Parameters:
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
Glue code for complex multiplied with real of 32-bit float vectors.
function plp_cmplx_mult_real_f32_xpulpv2
void plp_cmplx_mult_real_f32_xpulpv2(
const float32_t *__restrict__ pSrcCmplx,
const float32_t *__restrict__ pSrcReal,
float32_t *__restrict__ pDst,
uint32_t numSamples
)
Floating-point complex multiplied with real.
Parameters:
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
Floating-point complex multiplied with real.
function plp_cmplx_mult_real_i32
void plp_cmplx_mult_real_i32(
const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex multiplied with real of 32-bit integer vectors.
Parameters:
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
Glue code for complex multiplied with real of 32-bit integer vectors.
function plp_cmplx_mult_real_i32_xpulpv2
void plp_cmplx_mult_real_i32_xpulpv2(
const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
32-bit integer complex multiplied with real.
Parameters:
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
32-bit integer complex multiplied with real.
function plp_cmplx_mult_real_i32_rv32im
void plp_cmplx_mult_real_i32_rv32im(
const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
32-bit integer complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
- none
32-bit integer complex multiplied with real.
function plp_cmplx_mult_real_i16
void plp_cmplx_mult_real_i16(
const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex multiplied with real of 16-bit integer vectors.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied with real of 16-bit integer vectors.
function plp_cmplx_mult_real_i16_xpulpv2
void plp_cmplx_mult_real_i16_xpulpv2(
const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
16-bit integer complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
- none
16-bit integer complex multiplied with real.
function plp_cmplx_mult_real_i16_rv32im
void plp_cmplx_mult_real_i16_rv32im(
const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
16-bit integer complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
- none
16-bit integer complex multiplied with real.
function plp_cmplx_mult_real_i8
void plp_cmplx_mult_real_i8(
const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex multiplied with real of 8-bit integer vectors.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied with real of 8-bit integer vectors.
function plp_cmplx_mult_real_i8_xpulpv2
void plp_cmplx_mult_real_i8_xpulpv2(
const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
8-bit integer complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
- none
8-bit integer complex multiplied with real.
function plp_cmplx_mult_real_i8_rv32im
void plp_cmplx_mult_real_i8_rv32im(
const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
8-bit integer complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- numSamples number of samples in each vector
Return:
- none
- none
8-bit integer complex multiplied with real.
function plp_cmplx_mult_real_q32
void plp_cmplx_mult_real_q32(
const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
Glue code for complex multiplied with real of 32-bit fixed-point vectors.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied with real of 32-bit fixed-point vectors.
function plp_cmplx_mult_real_q32_xpulpv2
void plp_cmplx_mult_real_q32_xpulpv2(
const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
32-bit fixed-point complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
32-bit fixed-point complex multiplied with real.
function plp_cmplx_mult_real_q32_rv32im
void plp_cmplx_mult_real_q32_rv32im(
const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
32-bit fixed-point complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
32-bit fixed-point complex multiplied with real.
function plp_cmplx_mult_real_q16
void plp_cmplx_mult_real_q16(
const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
Glue code for complex multiplied with real of 16-bit fixed-point vectors.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied with real of 16-bit fixed-point vectors.
function plp_cmplx_mult_real_q16_xpulpv2
void plp_cmplx_mult_real_q16_xpulpv2(
const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
16-bit fixed-point complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
16-bit fixed-point complex multiplied with real.
function plp_cmplx_mult_real_q16_rv32im
void plp_cmplx_mult_real_q16_rv32im(
const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
16-bit fixed-point complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
16-bit fixed-point complex multiplied with real.
function plp_cmplx_mult_real_q8
void plp_cmplx_mult_real_q8(
const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
Glue code for complex multiplied with real of 8-bit fixed-point vectors.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied with real of 8-bit fixed-point vectors.
function plp_cmplx_mult_real_q8_xpulpv2
void plp_cmplx_mult_real_q8_xpulpv2(
const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
8-bit fixed-point complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
8-bit fixed-point complex multiplied with real.
function plp_cmplx_mult_real_q8_rv32im
void plp_cmplx_mult_real_q8_rv32im(
const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
8-bit fixed-point complex multiplied with real.
Parameters:
- pSrcCmplx points to the complex input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcCmplx points to complex input vector
- pSrcReal points to real input vector
- pDst points to complex output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
8-bit fixed-point complex multiplied with real.
function plp_cmplx_mag_squared_f32
void plp_cmplx_mag_squared_f32(
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex squared magnitude of 32-bit float vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex squared magnitude of 32-bit float vectors.
function plp_cmplx_mag_squared_f32_xpulpv2
void plp_cmplx_mag_squared_f32_xpulpv2(
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t numSamples
)
Floating-point complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
Floating-point complex squared magnitude.
function plp_cmplx_mag_squared_i16
void plp_cmplx_mag_squared_i16(
const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex squared magnitude of 16-bit integer vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex squared magnitude of 16-bit integer vectors.
function plp_cmplx_mag_squared_i16_rv32im
void plp_cmplx_mag_squared_i16_rv32im(
const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
16-bit integer complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
16-bit integer complex squared magnitude.
function plp_cmplx_mag_squared_i16_xpulpv2
void plp_cmplx_mag_squared_i16_xpulpv2(
const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
16-bit integer complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
16-bit integer complex squared magnitude.
function plp_cmplx_mag_squared_i32
void plp_cmplx_mag_squared_i32(
const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex squared magnitude of 32-bit integer vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex squared magnitude of 32-bit integer vectors.
function plp_cmplx_mag_squared_i32_rv32im
void plp_cmplx_mag_squared_i32_rv32im(
const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
32-bit integer complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
32-bit integer complex squared magnitude.
function plp_cmplx_mag_squared_i32_xpulpv2
void plp_cmplx_mag_squared_i32_xpulpv2(
const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
32-bit integer complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
32-bit integer complex squared magnitude.
function plp_cmplx_mag_squared_i8_xpulpv2
void plp_cmplx_mag_squared_i8_xpulpv2(
const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
8-bit integer complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
8-bit integer complex squared magnitude.
function plp_cmplx_mag_squared_i8
void plp_cmplx_mag_squared_i8(
const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex squared magnitude of 8-bit integer vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex squared magnitude of 8-bit integer vectors.
function plp_cmplx_mag_squared_i8_rv32im
void plp_cmplx_mag_squared_i8_rv32im(
const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
8-bit integer complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
8-bit integer complex squared magnitude.
function plp_cmplx_mag_squared_q32
void plp_cmplx_mag_squared_q32(
const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
Glue code for complex squared magnitude of 32-bit fixed-point vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex squared magnitude of 32-bit fixed-point vectors.
function plp_cmplx_mag_squared_q32_rv32im
void plp_cmplx_mag_squared_q32_rv32im(
const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
32-bit fixed-point complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
32-bit fixed-point complex squared magnitude.
function plp_cmplx_mag_squared_q32_xpulpv2
void plp_cmplx_mag_squared_q32_xpulpv2(
const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
32-bit fixed-point complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
32-bit fixed-point complex squared magnitude.
function plp_cmplx_mag_squared_q16
void plp_cmplx_mag_squared_q16(
const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
Glue code for complex squared magnitude of 16-bit fixed-point vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex squared magnitude of 16-bit fixed-point vectors.
function plp_cmplx_mag_squared_q16_rv32im
void plp_cmplx_mag_squared_q16_rv32im(
const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
16-bit fixed-point complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
16-bit fixed-point complex squared magnitude.
function plp_cmplx_mag_squared_q16_xpulpv2
void plp_cmplx_mag_squared_q16_xpulpv2(
const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
16-bit fixed-point complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
16-bit fixed-point complex squared magnitude.
function plp_cmplx_mag_squared_q8
void plp_cmplx_mag_squared_q8(
const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
Glue code for complex squared magnitude of 8-bit fixed-point vectors.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex squared magnitude of 8-bit fixed-point vectors.
function plp_cmplx_mag_squared_q8_rv32im
void plp_cmplx_mag_squared_q8_rv32im(
const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
8-bit fixed-point complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
8-bit fixed-point complex squared magnitude.
function plp_cmplx_mag_squared_q8_xpulpv2
void plp_cmplx_mag_squared_q8_xpulpv2(
const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
8-bit fixed-point complex squared magnitude.
Parameters:
- pSrc points to the input vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrc points to input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
8-bit fixed-point complex squared magnitude.
function plp_cmplx_mult_cmplx_f32
void plp_cmplx_mult_cmplx_f32(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
float32_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex multiplied by complex of 32-bit float vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied by complex of 32-bit float vectors.
function plp_cmplx_mult_cmplx_f32_xpulpv2
void plp_cmplx_mult_cmplx_f32_xpulpv2(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
float32_t *__restrict__ pDst,
uint32_t numSamples
)
Floating-point complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
Floating-point complex multiplied by complex.
function plp_cmplx_mult_cmplx_i32
void plp_cmplx_mult_cmplx_i32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex multiplied by complex of 32-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied by complex of 32-bit integer vectors.
function plp_cmplx_mult_cmplx_i32_xpulpv2
void plp_cmplx_mult_cmplx_i32_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
32-bit integer complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
32-bit integer complex multiplied by complex.
function plp_cmplx_mult_cmplx_i32_rv32im
void plp_cmplx_mult_cmplx_i32_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t numSamples
)
32-bit integer complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
32-bit integer complex multiplied by complex.
function plp_cmplx_mult_cmplx_i16
void plp_cmplx_mult_cmplx_i16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex multiplied by complex of 16-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied by complex of 16-bit integer vectors.
function plp_cmplx_mult_cmplx_i16_xpulpv2
void plp_cmplx_mult_cmplx_i16_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
16-bit integer complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
16-bit integer complex multiplied by complex.
function plp_cmplx_mult_cmplx_i16_rv32im
void plp_cmplx_mult_cmplx_i16_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t numSamples
)
16-bit integer complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
16-bit integer complex multiplied by complex.
function plp_cmplx_mult_cmplx_i8
void plp_cmplx_mult_cmplx_i8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
Glue code for complex multiplied by complex of 8-bit integer vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied by complex of 8-bit integer vectors.
function plp_cmplx_mult_cmplx_i8_xpulpv2
void plp_cmplx_mult_cmplx_i8_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
8-bit integer complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
8-bit integer complex multiplied by complex.
function plp_cmplx_mult_cmplx_i8_rv32im
void plp_cmplx_mult_cmplx_i8_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t numSamples
)
8-bit integer complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- numSamples number of samples in each vector
Return:
- none
- none
8-bit integer complex multiplied by complex.
function plp_cmplx_mult_cmplx_q32
void plp_cmplx_mult_cmplx_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
Glue code for complex multiplied by complex of 32-bit fixed-point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied by complex of 32-bit fixed-point vectors.
function plp_cmplx_mult_cmplx_q32_xpulpv2
void plp_cmplx_mult_cmplx_q32_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
32-bit fixed-point complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
32-bit fixed-point complex multiplied by complex.
function plp_cmplx_mult_cmplx_q32_rv32im
void plp_cmplx_mult_cmplx_q32_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
32-bit fixed-point complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
32-bit fixed-point complex multiplied by complex.
function plp_cmplx_mult_cmplx_q16
void plp_cmplx_mult_cmplx_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
Glue code for complex multiplied by complex of 16-bit fixed-point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied by complex of 16-bit fixed-point vectors.
function plp_cmplx_mult_cmplx_q16_xpulpv2
void plp_cmplx_mult_cmplx_q16_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
16-bit fixed-point complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
16-bit fixed-point complex multiplied by complex.
function plp_cmplx_mult_cmplx_q16_rv32im
void plp_cmplx_mult_cmplx_q16_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
16-bit fixed-point complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
16-bit fixed-point complex multiplied by complex.
function plp_cmplx_mult_cmplx_q8
void plp_cmplx_mult_cmplx_q8(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
Glue code for complex multiplied by complex of 8-bit fixed-point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
Glue code for complex multiplied by complex of 8-bit fixed-point vectors.
function plp_cmplx_mult_cmplx_q8_xpulpv2
void plp_cmplx_mult_cmplx_q8_xpulpv2(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
8-bit fixed-point complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
8-bit fixed-point complex multiplied by complex.
function plp_cmplx_mult_cmplx_q8_rv32im
void plp_cmplx_mult_cmplx_q8_rv32im(
const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples
)
8-bit fixed-point complex multiplied by complex.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second vector
- pDst points to the output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
- pSrcA points to first input vector
- pSrcB points to second input vector
- pDst points to output vector
- deciPoint decimal point for right shift
- numSamples number of samples in each vector
Return:
- none
- none
8-bit fixed-point complex multiplied by complex.
function plp_euclidean_distance_q32_parallel
void plp_euclidean_distance_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
uint32_t nPE,
uint32_t *__restrict__ pRes
)
Glue code for parallel Euclidean distance of 32-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- nPE number of parallel processing units
- pRes output result returned here
Return: none
function plp_euclidean_distance_f32_parallel
void plp_euclidean_distance_f32_parallel(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pRes
)
Glue code for parallel Euclidean distance between 32-bit float vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- nPE number of parallel processing units
- pRes output result returned here
Return: none
function plp_euclidean_distance_q32p_xpulpv2
void plp_euclidean_distance_q32p_xpulpv2(
void * S
)
Parallel Euclidean distance with interleaved access of 32-bit fixed point vectors; kernel for XPULPV2 extension.
Parameters:
- S points to the instance structure for integer parallel Euclidean distance
- S points to the instance structure for integer parallel Euclidean distance
Return:
- none
- none
function plp_euclidean_distance_f32p_xpulpv2
void plp_euclidean_distance_f32p_xpulpv2(
void * S
)
32-bit floating-point parallel Euclidean distance between two vectors
Parameters:
- S points to the instance structure for float euclidean distance
Return: none
function plp_euclidean_distance_q32
void plp_euclidean_distance_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for euclidean distance of 32-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- pRes output result returned here
Return: none
function plp_euclidean_distance_q32s_xpulpv2
void plp_euclidean_distance_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Euclidean distance of 32-bit fixed point vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- pRes output result returned here
Return: none
function plp_euclidean_distance_q32s_rv32im
void plp_euclidean_distance_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Euclidean distance of 32-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- pRes output result returned here
Return: none
function plp_euclidean_distance_q16
void plp_euclidean_distance_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint16_t blockSize,
uint16_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for euclidean distance of 16-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- pRes output result returned here
Return: none
function plp_euclidean_distance_q16s_xpulpv2
void plp_euclidean_distance_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes
)
Euclidean distance of 16-bit fixed point vectors kernel for XPULPV2.
Parameters:
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- deciPoint decimal point for right shift
- pRes output result returned here [32 bit]
Return: none
Par: Exploiting SIMD instructions
The 16 bit values are packed two by two into 32 bit vectors and then the sums and products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.
function plp_euclidean_distance_q16s_rv32im
void plp_euclidean_distance_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Euclidean distance of 16-bit fixed point vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- fracBits decimal point for right shift
- pRes output result returned here [32 bit]
Return: none
Par: Exploiting SIMD instructions
When the ISA supports it, the 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).
function plp_euclidean_distance_f32
void plp_euclidean_distance_f32(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes
)
Glue code for Euclidean distance between 32-bit float vectors.
Parameters:
- pSrcA First vector
- pSrcB Second vector
- blockSize vector length
- pRes output result returned here
Return: none
function plp_euclidean_distance_f32s_xpulpv2
void plp_euclidean_distance_f32s_xpulpv2(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes
)
32-bit floating point Euclidean distance between two vectors
Parameters:
- pSrcA First vector
- pSrcB Second vector
- blockSize vector length
- pRes output result returned here
Return: none
function plp_euclidean_distance_f32s_rv32im
void plp_euclidean_distance_f32s_rv32im(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes
)
32-bit floating point Euclidean distance between two vectors
Parameters:
- pSrcA First vector
- pSrcB Second vector
- blockSize vector length
- pRes output result returned here
Return: none
function plp_cosine_distance_q32_parallel
void plp_cosine_distance_q32_parallel(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
uint32_t nPE,
int32_t *__restrict__ pRes
)
Glue code for parallel cosine distance between 32-bit fixed-precision vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- nPE number of parallel processing units
- pRes output result returned here
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- nPE number of parallel processing units
- pRes output result returned here
Return:
- none
- none
Glue code for parallel cosine distance between 32-bit fixed-precision vectors.
function plp_cosine_distance_f32_parallel
void plp_cosine_distance_f32_parallel(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pRes
)
Glue code for parallel cosine distance between 32-bit float vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- nPE number of parallel processing units
- pRes output result returned here
Return: none
function plp_cosine_distance_f32p_xpulpv2
void plp_cosine_distance_f32p_xpulpv2(
void * S
)
32-bit floating-point parallel cosine distance between two vectors (computes power in parallel)
Parameters:
- S points to the instance structure for float cosine distance
Return: none
function plp_cosine_distance_f32
void plp_cosine_distance_f32(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes
)
Glue code for cosine distance between 32-bit float vectors.
Parameters:
- pSrcA First vector
- pSrcB Second vector
- blockSize vector length
- pRes output result returned here
- pSrcA First vector
- pSrcB Second vector
- blockSize vector length
Return:
- none
- none
Glue code for cosine distance between 32-bit float vectors.
function plp_cosine_distance_f32s_rv32im
void plp_cosine_distance_f32s_rv32im(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes
)
32-bit floating point cosine distance between two vectors
Parameters:
- pSrcA First vector
- pSrcB Second vector
- blockSize vector length
- pRes output result returned here
Return: none
function plp_cosine_distance_f32s_xpulpv2
void plp_cosine_distance_f32s_xpulpv2(
const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes
)
32-bit floating point cosine distance between two vectors
Parameters:
- pSrcA First vector
- pSrcB Second vector
- blockSize vector length
- pRes output result returned here
Return: none
function plp_cosine_distance_q32
void plp_cosine_distance_q32(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for cosine distance of 32-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- pRes output result returned here
Return: none
function plp_cosine_distance_q32s_rv32im
void plp_cosine_distance_q32s_rv32im(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
cosine distance of 32-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- pRes output result returned here
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed-point fractional bits
- pRes output result returned here
Return:
- none
- none
function plp_cosine_distance_q32s_xpulpv2
void plp_cosine_distance_q32s_xpulpv2(
const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
cosine distance of 32-bit fixed point vectors kernel for XPULPV2 extension.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- pRes output result returned here
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- pRes output result returned here
Return:
- none
- none
cosine distance of 32-bit fixed point vectors kernel for XPULPV2 extension.
function plp_cosine_distance_q16
void plp_cosine_distance_q16(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint16_t blockSize,
uint16_t fracBits,
int32_t *__restrict__ pRes
)
Glue code for cosine distance of 16-bit fixed point vectors.
Parameters:
- pSrcA points to the first input vector
- pSrcB points to the second input vector
- blockSize number of samples in each vector
- fracBits number of fixed point fractional bits
- pRes output result returned here
Return: none
function plp_cosine_distance_q16s_rv32im
void plp_cosine_distance_q16s_rv32im(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
cosine distance of 16-bit fixed point vectors kernel for RV32IM extension.
Parameters:
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- fracBits decimal point for right shift
- pRes output result returned here [32 bit]
Return: none
Par: Exploiting SIMD instructions
When the ISA supports it, the 16 bit values are packed two by two into 32 bit vectors and then the two dot products are performed simultaneously on 32 bit vectors, with 32 bit accumulator. RV32IM doesn't support SIMD. For SIMD, check out other ISA extensions (e.g. XPULPV2).
function plp_cosine_distance_q16s_xpulpv2
void plp_cosine_distance_q16s_xpulpv2(
const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes
)
cosine distance of 16-bit fixed point vectors kernel for XPULPV2.
Parameters:
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- fracBits decimal point for right shift
- pRes output result returned here [32 bit]
- pSrcA points to the first input vector [16 bit]
- pSrcB points to the second input vector [16 bit]
- blockSize number of samples in each vector
- fracBits decimal point for right shift
- pRes output result returned here [32 bit]
Return:
- none
- none
Par:
- Exploiting SIMD instructions
The 16 bit values are packed two by two into 32 bit vectors and then the sums and products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.
- Exploiting SIMD instructions
The 16 bit values are packed two by two into 32 bit vectors and then the sums and products are performed simultaneously on 32 bit vectors, with 32 bit accumulator.
cosine distance of 16-bit fixed point vectors kernel for XPULPV2.
Macros Documentation
define PLP_MATH_IBEX
#define PLP_MATH_IBEX
define PLP_MATH_LOOPUNROLL
#define PLP_MATH_LOOPUNROLL
define PLP_DWT_DEC_LEN
#define PLP_DWT_DEC_LEN(
SIG_LEN,
WAVELET,
LEVEL
)
plp_dwt_dec_len(SIG_LEN, WAVELET.length, LEVEL)
define PLP_DWT_DEC_TEMP_LEN
#define PLP_DWT_DEC_TEMP_LEN(
SRC_LEN,
WAVELET_LEN
)
(((SRC_LEN+WAVELET_LEN-1)/2 + ((SRC_LEN+WAVELET_LEN-1)/2 + WAVELET_LEN-1))/2)
define PLP_DWT_OUTPUT_LENGTH
#define PLP_DWT_OUTPUT_LENGTH(
SIG_LEN,
WAVELET_LEN
)
((SIG_LEN + WAVELET_LEN - 1) >> 1)
define FAST_MATH_TABLE_SIZE
#define FAST_MATH_TABLE_SIZE 512
Glue code for square root of a 32-bit floating point number.
Parameters:
- pSrc points to the input vector
- pRes Square root returned here
Return: none
Macros required for SINE and COSINE Fast math approximations
define FAST_MATH_Q32_SHIFT
#define FAST_MATH_Q32_SHIFT (32 - 10)
define FAST_MATH_Q16_SHIFT
#define FAST_MATH_Q16_SHIFT (16 - 10)
define CONTROLLER_Q32_SHIFT
#define CONTROLLER_Q32_SHIFT (32 - 9)
define TABLE_SPACING_Q32
#define TABLE_SPACING_Q32 0x400000
define TABLE_SPACING_Q16
#define TABLE_SPACING_Q16 0x80
Source code
/*
* Copyright (C) 2019 ETH Zurich and University of Bologna. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __PLP_MATH_H__
#define __PLP_MATH_H__
#include "math.h"
#include "rtos_hal.h"
typedef float float32_t;
#define PLP_MATH_IBEX // previously called zero-riscy
//#define PLP_MATH_RISCY
#define PLP_MATH_LOOPUNROLL
/** Instance structure for integer parallel dot product. */
typedef struct {
int32_t *pSrcA; // pointer to the first vector
int32_t *pSrcB; // pointer to the second vector
uint32_t blkSizePE; // number of samples in each vector (name suggests a per-processing-unit share — TODO confirm)
uint32_t nPE; // number of processing units
int32_t *resBuffer; // pointer to result vector
} plp_dot_prod_instance_i32;
/** Instance structure for fixed point parallel dot product. */
typedef struct {
int32_t *pSrcA; // pointer to the first vector
int32_t *pSrcB; // pointer to the second vector
uint32_t blkSizePE; // number of samples in each vector (name suggests a per-processing-unit share — TODO confirm)
uint32_t deciPoint; // decimal point for right shift
uint32_t nPE; // number of processing units
int32_t *resBuffer; // pointer to result vector
} plp_dot_prod_instance_q32;
/** Instance structure for float parallel dot product. */
typedef struct {
const float32_t *pSrcA; // pointer to the first vector (read-only)
const float32_t *pSrcB; // pointer to the second vector (read-only)
uint32_t blkSizePE; // number of samples in each vector (name suggests a per-processing-unit share — TODO confirm)
uint32_t nPE; // number of processing units
float32_t *resBuffer; // pointer to result vector
} plp_dot_prod_instance_f32;
/** Instance structure for float parallel multiplication. */
typedef struct {
const float32_t *pSrcA; // pointer to the first vector (read-only)
const float32_t *pSrcB; // pointer to the second vector (read-only)
uint32_t blkSizePE; // number of samples in each vector (name suggests a per-processing-unit share — TODO confirm)
uint32_t nPE; // number of processing units
float32_t *pDst; // pointer to result vector
} plp_mult_instance_f32;
/** Instance structure for float parallel log. */
typedef struct {
const float32_t *pSrc; // pointer to the input vector (read-only)
uint32_t blkSizePE; // number of samples in each vector (name suggests a per-processing-unit share — TODO confirm)
uint32_t nPE; // number of processing units
float32_t *pDst; // pointer to result vector
} plp_log_instance_f32;
/** Instance structure for basic integer convolution (32-bit inputs). */
typedef struct {
const int32_t *pSrcA; // pointer to the first vector
uint32_t srcALen; // presumably the number of samples in the first vector — TODO confirm
const int32_t *pSrcB; // pointer to the second vector
uint32_t srcBLen; // presumably the number of samples in the second vector (original comment read "in each vector") — TODO confirm
uint8_t nPE; // number of processing units (note: 8-bit here)
int32_t *pRes; // pointer to result vector
} plp_conv_instance_i32;
/** Instance structure for basic integer convolution (16-bit inputs, 32-bit result). */
typedef struct {
const int16_t *pSrcA; // pointer to the first vector
uint32_t srcALen; // presumably the number of samples in the first vector — TODO confirm
const int16_t *pSrcB; // pointer to the second vector
uint32_t srcBLen; // presumably the number of samples in the second vector (original comment read "in each vector") — TODO confirm
uint8_t nPE; // number of processing units (note: 8-bit here)
int32_t *pRes; // pointer to result vector (32-bit elements)
} plp_conv_instance_i16;
/** Instance structure for basic integer convolution (8-bit inputs, 32-bit result). */
typedef struct {
const int8_t *pSrcA; // pointer to the first vector
uint32_t srcALen; // presumably the number of samples in the first vector — TODO confirm
const int8_t *pSrcB; // pointer to the second vector
uint32_t srcBLen; // presumably the number of samples in the second vector (original comment read "in each vector") — TODO confirm
uint8_t nPE; // number of processing units (note: 8-bit here)
int32_t *pRes; // pointer to result vector (32-bit elements)
} plp_conv_instance_i8;
/**
 * Listed in the file summary as an "instance structure for basic integer
 * convolution". NOTE(review): the type and field names suggest it describes a
 * tree-style addition of partial convolution results; the consuming code is
 * not visible here, so every field meaning below is an assumption to confirm.
 */
typedef struct {
uint32_t addOffset; // presumably an offset used when adding partial results — TODO confirm
uint32_t addLengthfirst; // presumably the length of the first addition range — TODO confirm
uint32_t addLengthsecond; // presumably the length of the second addition range — TODO confirm
uint32_t numVectors; // presumably the number of partial-result vectors — TODO confirm
uint32_t blockOffset; // presumably the offset between blocks in pRes — TODO confirm
int32_t *pRes; // result buffer (32-bit elements)
uint8_t coresPerVector; // presumably the number of cores assigned per vector — TODO confirm
} plp_conv_tree_add_instance;
/** Instance structure for the fixed-point CFFT/CIFFT function (16-bit twiddle factors). */
typedef struct {
uint16_t fftLen; /**< length of the FFT. */
const int16_t *pTwiddle; /**< points to the Twiddle factor table. */
const int16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t bitRevLength; /**< bit reversal table length. */
} plp_cfft_instance_q16;
/** Instance structure for the parallel CFFT Q16. */
typedef struct {
plp_cfft_instance_q16 *S; // underlying q16 CFFT instance
int16_t *p1; // 16-bit data buffer; presumably the complex data processed by the FFT — TODO confirm
uint8_t ifftFlag; // presumably selects inverse (non-zero) vs forward transform — TODO confirm
uint8_t bitReverseFlag; // presumably enables bit reversal of the output — TODO confirm
uint32_t deciPoint; // presumably the decimal point for right shift, as in other q-format structs here — TODO confirm
uint32_t nPE; // number of processing units
} plp_cfft_instance_q16_parallel;
/** Instance structure for the fixed-point CFFT/CIFFT function (32-bit twiddle factors). */
typedef struct {
uint16_t fftLen; /**< length of the FFT. */
const int32_t *pTwiddle; /**< points to the Twiddle factor table. */
const int16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t bitRevLength; /**< bit reversal table length. */
} plp_cfft_instance_q32;
/**
 * Instance structure for the parallel fixed-point CFFT.
 * NOTE(review): the file summary labels this "parallel CFFT Q16", but the
 * struct wraps a plp_cfft_instance_q32 and an int32_t buffer — confirm the
 * intended description.
 */
typedef struct {
plp_cfft_instance_q32 *S; // underlying q32 CFFT instance
int32_t *p1; // 32-bit data buffer; presumably the complex data processed by the FFT — TODO confirm
uint8_t ifftFlag; // presumably selects inverse (non-zero) vs forward transform — TODO confirm
uint8_t bitReverseFlag; // presumably enables bit reversal of the output — TODO confirm
uint32_t fracBits; // presumably the number of fixed-point fractional bits — TODO confirm
uint32_t nPE; // number of processing units
} plp_cfft_instance_q32_parallel;
/** Instance structure for the floating-point CFFT/CIFFT function. */
typedef struct {
uint32_t fftLen; // length of the FFT
const float32_t *pTwiddle; // twiddle factor table (read-only)
const uint16_t *pBitRevTable; // bit reversal table (read-only)
uint16_t bitRevLength; // bit reversal table length
} plp_cfft_instance_f32;
/** Instance structure for floating-point FFT (parallel version). */
typedef struct {
plp_cfft_instance_f32 *S; // underlying f32 CFFT instance
const float32_t *pSrc; // input data (read-only through this pointer)
uint8_t ifftFlag; // presumably selects inverse (non-zero) vs forward transform — TODO confirm
uint8_t bitReverseFlag; // presumably enables bit reversal of the output — TODO confirm
const uint32_t nPE; // number of processing units; const member, so it must be set at initialization
} plp_cfft_instance_f32_parallel;
/** Instance structure for floating-point FFT. */
typedef struct {
uint32_t FFTLength; // length of the FFT
uint8_t bitReverseFlag; // presumably enables bit reversal of the output — TODO confirm
const float32_t *pTwiddleFactors; // twiddle factor table (read-only)
const uint16_t *pBitReverseLUT; // bit reversal lookup table (read-only)
} plp_fft_instance_f32;
/** Instance structure for floating-point FFT ("fast" variant built on a CFFT instance). */
typedef struct {
plp_cfft_instance_f32 *Sint; // internal complex FFT instance
uint32_t FFTLengthRFFT; // presumably the length of the real FFT — TODO confirm
const float32_t *pTwiddleFactorsRFFT; // presumably the twiddle factors specific to the real FFT stage — TODO confirm
} plp_fft_fast_instance_f32;
/** Instance structure for floating-point FFT (argument bundle wrapping plp_fft_fast_instance_f32 with a core count). */
typedef struct {
plp_fft_fast_instance_f32* S; // underlying fast FFT instance
float32_t *__restrict__ pSrc; // input buffer; non-const, so the kernel may modify it — TODO confirm
float32_t *__restrict__ pDst; // output buffer
const uint32_t nPE; // number of processing units; const member, so it must be set at initialization
} plp_fft_fast_instance_f32_parallel;
/** Instance structure for floating-point FFT (parallel version). */
typedef struct {
plp_fft_instance_f32 *S; // underlying FFT instance
const float32_t *pSrc; // input data (read-only through this pointer)
const uint32_t nPE; // number of processing units; const member, so it must be set at initialization
float32_t *pDst; // output buffer
} plp_fft_instance_f32_parallel;
/** Structure containing non-zero values of triangular filterbanks. */
typedef struct {
const float32_t *V; // presumably the concatenated non-zero filter values — TODO confirm
const uint16_t *firstValue; // presumably, per filter, the index of its first non-zero value — TODO confirm
const uint16_t *filterLength; // presumably, per filter, the count of non-zero values — TODO confirm
const uint8_t nFilters; // presumably the number of filters; const member, so it must be set at initialization
} plp_triangular_filter_f32;
/** Helper type to represent complex values with float32 components. */
typedef struct {
float32_t re; // real part
float32_t im; // imaginary part
} Complex_type_f32;
/** Instance structure for integer parallel matrix multiplication (8-bit inputs, 32-bit output). */
typedef struct {
const int8_t *__restrict__ pSrcA; // first input matrix (read-only)
const int8_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix (32-bit elements)
} plp_mat_mult_instance_i8;
/** Instance structure, presumably for integer parallel matrix multiplication (16-bit inputs, 32-bit output); purpose inferred from the type name. */
typedef struct {
const int16_t *__restrict__ pSrcA; // first input matrix (read-only)
const int16_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix (32-bit elements)
} plp_mat_mult_instance_i16;
/** Instance structure, presumably for integer parallel matrix multiplication (32-bit inputs and output); purpose inferred from the type name. */
typedef struct {
const int32_t *__restrict__ pSrcA; // first input matrix (read-only)
const int32_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix
} plp_mat_mult_instance_i32;
/** Instance structure, presumably for float parallel matrix multiplication; purpose inferred from the type name. */
typedef struct {
const float *__restrict__ pSrcA; // first input matrix (read-only)
const float *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t nPE; // number of processing units
float *__restrict__ pDstC; // output matrix
} plp_mat_mult_instance_f32;
/** Instance structure, presumably for fixed-point parallel matrix multiplication (8-bit inputs and output); purpose inferred from the type name. */
typedef struct {
const int8_t *__restrict__ pSrcA; // first input matrix (read-only)
const int8_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t shift; // shift amount; presumably a right shift applied to results for fixed-point scaling — TODO confirm
uint32_t nPE; // number of processing units
int8_t *__restrict__ pDstC; // output matrix (8-bit elements)
} plp_mat_mult_instance_q8;
/** Instance structure, presumably for fixed-point parallel matrix multiplication (16-bit inputs and output); purpose inferred from the type name. */
typedef struct {
const int16_t *__restrict__ pSrcA; // first input matrix (read-only)
const int16_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t shift; // shift amount; presumably a right shift applied to results for fixed-point scaling — TODO confirm
uint32_t nPE; // number of processing units
int16_t *__restrict__ pDstC; // output matrix (16-bit elements)
} plp_mat_mult_instance_q16;
/** Instance structure, presumably for fixed-point parallel matrix multiplication (32-bit inputs and output); purpose inferred from the type name. */
typedef struct {
const int32_t *__restrict__ pSrcA; // first input matrix (read-only)
const int32_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t shift; // shift amount; presumably a right shift applied to results for fixed-point scaling — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix
} plp_mat_mult_instance_q32;
/**
 * Instance structure, presumably for integer parallel complex matrix
 * multiplication (8-bit inputs, 32-bit output); purpose inferred from the type
 * name. How real and imaginary parts are laid out in the buffers is not
 * visible here.
 */
typedef struct {
const int8_t *__restrict__ pSrcA; // first input matrix (read-only)
const int8_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix (32-bit elements)
} plp_mat_mult_cmplx_instance_i8;
/**
 * Instance structure, presumably for integer parallel complex matrix
 * multiplication (16-bit inputs, 32-bit output); purpose inferred from the type
 * name. How real and imaginary parts are laid out in the buffers is not
 * visible here.
 */
typedef struct {
const int16_t *__restrict__ pSrcA; // first input matrix (read-only)
const int16_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix (32-bit elements)
} plp_mat_mult_cmplx_instance_i16;
/**
 * Instance structure, presumably for integer parallel complex matrix
 * multiplication (32-bit inputs and output); purpose inferred from the type
 * name. How real and imaginary parts are laid out in the buffers is not
 * visible here.
 */
typedef struct {
const int32_t *__restrict__ pSrcA; // first input matrix (read-only)
const int32_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix
} plp_mat_mult_cmplx_instance_i32;
/**
 * Instance structure, presumably for float parallel complex matrix
 * multiplication; purpose inferred from the type name. How real and imaginary
 * parts are laid out in the buffers is not visible here.
 */
typedef struct {
const float *__restrict__ pSrcA; // first input matrix (read-only)
const float *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t nPE; // number of processing units
float *__restrict__ pDstC; // output matrix
} plp_mat_mult_cmplx_instance_f32;
/**
 * Instance structure, presumably for fixed-point parallel complex matrix
 * multiplication (8-bit inputs and output); purpose inferred from the type
 * name. How real and imaginary parts are laid out in the buffers is not
 * visible here.
 */
typedef struct {
const int8_t *__restrict__ pSrcA; // first input matrix (read-only)
const int8_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t shift; // shift amount; presumably a right shift applied to results for fixed-point scaling — TODO confirm
uint32_t nPE; // number of processing units
int8_t *__restrict__ pDstC; // output matrix (8-bit elements)
} plp_mat_mult_cmplx_instance_q8;
/**
 * Instance structure, presumably for fixed-point parallel complex matrix
 * multiplication (16-bit inputs and output); purpose inferred from the type
 * name. How real and imaginary parts are laid out in the buffers is not
 * visible here.
 */
typedef struct {
const int16_t *__restrict__ pSrcA; // first input matrix (read-only)
const int16_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t shift; // shift amount; presumably a right shift applied to results for fixed-point scaling — TODO confirm
uint32_t nPE; // number of processing units
int16_t *__restrict__ pDstC; // output matrix (16-bit elements)
} plp_mat_mult_cmplx_instance_q16;
/**
 * Instance structure, presumably for fixed-point parallel complex matrix
 * multiplication (32-bit inputs and output); purpose inferred from the type
 * name. How real and imaginary parts are laid out in the buffers is not
 * visible here.
 */
typedef struct {
const int32_t *__restrict__ pSrcA; // first input matrix (read-only)
const int32_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t shift; // shift amount; presumably a right shift applied to results for fixed-point scaling — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix
} plp_mat_mult_cmplx_instance_q32;
/** Instance structure, presumably for parallel matrix addition on 8-bit integers; purpose inferred from the type name. */
typedef struct {
const int8_t *__restrict__ pSrcA; // first input matrix (read-only)
const int8_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
uint32_t nPE; // number of processing units
int8_t *__restrict__ pDst; // output matrix
} plp_mat_add_instance_i8;
/** Instance structure, presumably for parallel matrix addition on 16-bit integers; purpose inferred from the type name. */
typedef struct {
const int16_t *__restrict__ pSrcA; // first input matrix (read-only)
const int16_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
uint32_t nPE; // number of processing units
int16_t *__restrict__ pDst; // output matrix
} plp_mat_add_instance_i16;
/** Instance structure, presumably for parallel matrix addition on 32-bit integers; purpose inferred from the type name. */
typedef struct {
const int32_t *__restrict__ pSrcA; // first input matrix (read-only)
const int32_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDst; // output matrix
} plp_mat_add_instance_i32;
/** Instance structure, presumably for parallel matrix addition on floats; purpose inferred from the type name. */
typedef struct {
const float *__restrict__ pSrcA; // first input matrix (read-only)
const float *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
uint32_t nPE; // number of processing units
float *__restrict__ pDst; // output matrix
} plp_mat_add_instance_f32;
/** Instance structure, presumably for parallel matrix subtraction on 8-bit integers; purpose inferred from the type name. */
typedef struct {
const int8_t *__restrict__ pSrcA; // first input matrix (read-only); presumably the minuend — TODO confirm
const int8_t *__restrict__ pSrcB; // second input matrix (read-only); presumably the subtrahend — TODO confirm
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
uint32_t nPE; // number of processing units
int8_t *__restrict__ pDst; // output matrix
} plp_mat_sub_instance_i8;
/** Instance structure, presumably for parallel matrix subtraction on 16-bit integers; purpose inferred from the type name. */
typedef struct {
const int16_t *__restrict__ pSrcA; // first input matrix (read-only); presumably the minuend — TODO confirm
const int16_t *__restrict__ pSrcB; // second input matrix (read-only); presumably the subtrahend — TODO confirm
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
uint32_t nPE; // number of processing units
int16_t *__restrict__ pDst; // output matrix
} plp_mat_sub_instance_i16;
/** Instance structure, presumably for parallel matrix subtraction on 32-bit integers; purpose inferred from the type name. */
typedef struct {
const int32_t *__restrict__ pSrcA; // first input matrix (read-only); presumably the minuend — TODO confirm
const int32_t *__restrict__ pSrcB; // second input matrix (read-only); presumably the subtrahend — TODO confirm
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDst; // output matrix
} plp_mat_sub_instance_i32;
/** Instance structure, presumably for parallel matrix subtraction on floats; purpose inferred from the type name. */
typedef struct {
const float *__restrict__ pSrcA; // first input matrix (read-only); presumably the minuend — TODO confirm
const float *__restrict__ pSrcB; // second input matrix (read-only); presumably the subtrahend — TODO confirm
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
uint32_t nPE; // number of processing units
float *__restrict__ pDst; // output matrix
} plp_mat_sub_instance_f32;
/** Instance structure, presumably for parallel matrix scaling on 8-bit integers; purpose inferred from the type name. */
typedef struct {
const int8_t *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
int8_t scaleFactor; // scale factor, same element type as the matrix
int32_t shift; // signed shift amount; presumably applied after multiplying by scaleFactor — TODO confirm direction and sign convention
uint32_t nPE; // number of processing units
int8_t *__restrict__ pDst; // output matrix
} plp_mat_scale_instance_i8;
/** Instance structure, presumably for parallel matrix scaling on 16-bit integers; purpose inferred from the type name. */
typedef struct {
const int16_t *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
int16_t scaleFactor; // scale factor, same element type as the matrix
int32_t shift; // signed shift amount; presumably applied after multiplying by scaleFactor — TODO confirm direction and sign convention
uint32_t nPE; // number of processing units
int16_t *__restrict__ pDst; // output matrix
} plp_mat_scale_instance_i16;
/** Instance structure, presumably for parallel matrix scaling on 32-bit integers; purpose inferred from the type name. */
typedef struct {
const int32_t *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
int32_t scaleFactor; // scale factor, same element type as the matrix
int32_t shift; // signed shift amount; presumably applied after multiplying by scaleFactor — TODO confirm direction and sign convention
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDst; // output matrix
} plp_mat_scale_instance_i32;
/** Instance structure, presumably for parallel matrix scaling on floats; purpose inferred from the type name. Unlike the integer variants it has no shift field. */
typedef struct {
const float *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // matrix dimension; presumably the number of rows — TODO confirm
uint32_t N; // matrix dimension; presumably the number of columns — TODO confirm
float scaleFactor; // scale factor
uint32_t nPE; // number of processing units
float *__restrict__ pDst; // output matrix
} plp_mat_scale_instance_f32;
/** Instance structure, presumably for parallel matrix transpose on 8-bit integers; purpose inferred from the type name. */
typedef struct {
const int8_t *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of the input — TODO confirm
uint32_t N; // matrix dimension; presumably columns of the input — TODO confirm
uint32_t nPE; // number of processing units
int8_t *__restrict__ pDst; // output matrix
} plp_mat_trans_instance_i8;
/** Instance structure, presumably for parallel matrix transpose on 16-bit integers; purpose inferred from the type name. */
typedef struct {
const int16_t *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of the input — TODO confirm
uint32_t N; // matrix dimension; presumably columns of the input — TODO confirm
uint32_t nPE; // number of processing units
int16_t *__restrict__ pDst; // output matrix
} plp_mat_trans_instance_i16;
/** Instance structure, presumably for parallel matrix transpose on 32-bit integers; purpose inferred from the type name. */
typedef struct {
const int32_t *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of the input — TODO confirm
uint32_t N; // matrix dimension; presumably columns of the input — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDst; // output matrix
} plp_mat_trans_instance_i32;
/** Instance structure, presumably for filling an 8-bit integer matrix with the identity in parallel ("fill_I"); purpose inferred from the type name. */
typedef struct {
uint32_t N; // matrix size; presumably the matrix is N x N — TODO confirm
uint32_t nPE; // number of processing units
int8_t *__restrict__ pDst; // output matrix
} plp_mat_fill_I_instance_i8;
/** Instance structure, presumably for filling a 16-bit integer matrix with the identity in parallel ("fill_I"); purpose inferred from the type name. */
typedef struct {
uint32_t N; // matrix size; presumably the matrix is N x N — TODO confirm
uint32_t nPE; // number of processing units
int16_t *__restrict__ pDst; // output matrix
} plp_mat_fill_I_instance_i16;
/** Instance structure, presumably for filling a 32-bit integer matrix with the identity in parallel ("fill_I"); purpose inferred from the type name. */
typedef struct {
uint32_t N; // matrix size; presumably the matrix is N x N — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDst; // output matrix
} plp_mat_fill_I_instance_i32;
/** Instance structure, presumably for filling a float matrix with the identity in parallel ("fill_I"); purpose inferred from the type name. */
typedef struct {
uint32_t N; // matrix size; presumably the matrix is N x N — TODO confirm
uint32_t nPE; // number of processing units
float *__restrict__ pDst; // output matrix
} plp_mat_fill_I_instance_f32;
/**
 * Instance structure, presumably for filling an 8-bit fixed-point matrix with
 * the identity in parallel ("fill_I"); purpose inferred from the type name.
 * NOTE(review): fracBits is int32_t here, whereas the q16 and q32 variants in
 * this header declare it as uint32_t — confirm whether the signed type is
 * intentional.
 */
typedef struct {
uint32_t N; // matrix size; presumably the matrix is N x N — TODO confirm
int32_t fracBits; // presumably the number of fixed-point fractional bits — TODO confirm
uint32_t nPE; // number of processing units
int8_t *__restrict__ pDst; // output matrix
} plp_mat_fill_I_instance_q8;
/** Instance structure, presumably for filling a 16-bit fixed-point matrix with the identity in parallel ("fill_I"); purpose inferred from the type name. */
typedef struct {
uint32_t N; // matrix size; presumably the matrix is N x N — TODO confirm
uint32_t fracBits; // presumably the number of fixed-point fractional bits — TODO confirm
uint32_t nPE; // number of processing units
int16_t *__restrict__ pDst; // output matrix
} plp_mat_fill_I_instance_q16;
/** Instance structure, presumably for filling a 32-bit fixed-point matrix with the identity in parallel ("fill_I"); purpose inferred from the type name. */
typedef struct {
uint32_t N; // matrix size; presumably the matrix is N x N — TODO confirm
uint32_t fracBits; // presumably the number of fixed-point fractional bits — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDst; // output matrix
} plp_mat_fill_I_instance_q32;
/** Instance structure, presumably for parallel float matrix inversion; purpose inferred from the type name. */
typedef struct {
float *__restrict__ pSrc; // input matrix; non-const, so the kernel may modify it — TODO confirm
float *__restrict__ pDst; // output matrix
uint32_t *__restrict__ flag; // pointer to a flag word; NOTE(review): meaning not visible here (possibly shared status between cores) — confirm
uint32_t N; // matrix size; presumably the matrix is N x N — TODO confirm
uint32_t nPE; // number of processing units
} plp_mat_inv_instance_f32;
/** Instance structure, presumably for integer parallel strided matrix multiplication (8-bit inputs, 32-bit output); purpose inferred from the type name. */
typedef struct {
const int8_t *__restrict__ pSrcA; // first input matrix (read-only)
const int8_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t strideA; // stride for matrix A; presumably elements between consecutive rows — TODO confirm
uint32_t strideB; // stride for matrix B; presumably elements between consecutive rows — TODO confirm
uint32_t strideC; // stride for matrix C; presumably elements between consecutive rows — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix (32-bit elements)
} plp_mat_mult_stride_instance_i8;
/** Instance structure, presumably for integer parallel strided matrix multiplication (16-bit inputs, 32-bit output); purpose inferred from the type name. */
typedef struct {
const int16_t *__restrict__ pSrcA; // first input matrix (read-only)
const int16_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t strideA; // stride for matrix A; presumably elements between consecutive rows — TODO confirm
uint32_t strideB; // stride for matrix B; presumably elements between consecutive rows — TODO confirm
uint32_t strideC; // stride for matrix C; presumably elements between consecutive rows — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix (32-bit elements)
} plp_mat_mult_stride_instance_i16;
/** Instance structure, presumably for integer parallel strided matrix multiplication (32-bit inputs and output); purpose inferred from the type name. */
typedef struct {
const int32_t *__restrict__ pSrcA; // first input matrix (read-only)
const int32_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t strideA; // stride for matrix A; presumably elements between consecutive rows — TODO confirm
uint32_t strideB; // stride for matrix B; presumably elements between consecutive rows — TODO confirm
uint32_t strideC; // stride for matrix C; presumably elements between consecutive rows — TODO confirm
uint32_t nPE; // number of processing units
int32_t *__restrict__ pDstC; // output matrix
} plp_mat_mult_stride_instance_i32;
/** Instance structure, presumably for float parallel strided matrix multiplication; purpose inferred from the type name. */
typedef struct {
const float *__restrict__ pSrcA; // first input matrix (read-only)
const float *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t strideA; // stride for matrix A; presumably elements between consecutive rows — TODO confirm
uint32_t strideB; // stride for matrix B; presumably elements between consecutive rows — TODO confirm
uint32_t strideC; // stride for matrix C; presumably elements between consecutive rows — TODO confirm
uint32_t nPE; // number of processing units
float *__restrict__ pDstC; // output matrix
} plp_mat_mult_stride_instance_f32;
/** Instance structure, presumably for fixed-point parallel strided matrix multiplication (8-bit inputs and output); purpose inferred from the type name. */
typedef struct {
const int8_t *__restrict__ pSrcA; // first input matrix (read-only)
const int8_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t strideA; // stride for matrix A; presumably elements between consecutive rows — TODO confirm
uint32_t strideB; // stride for matrix B; presumably elements between consecutive rows — TODO confirm
uint32_t strideC; // stride for matrix C; presumably elements between consecutive rows — TODO confirm
uint32_t shift; // shift amount; presumably a right shift applied to results for fixed-point scaling — TODO confirm
uint32_t nPE; // number of processing units
int8_t *__restrict__ pDstC; // output matrix (8-bit elements)
} plp_mat_mult_stride_instance_q8;
/** Instance structure, presumably for fixed-point parallel strided matrix multiplication (16-bit inputs and output); purpose inferred from the type name. */
typedef struct {
const int16_t *__restrict__ pSrcA; // first input matrix (read-only)
const int16_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // matrix dimension; presumably rows of A — TODO confirm
uint32_t N; // matrix dimension; presumably columns of A / rows of B — TODO confirm
uint32_t O; // matrix dimension; presumably columns of B — TODO confirm
uint32_t strideA; // stride for matrix A; presumably elements between consecutive rows — TODO confirm
uint32_t strideB; // stride for matrix B; presumably elements between consecutive rows — TODO confirm
uint32_t strideC; // stride for matrix C; presumably elements between consecutive rows — TODO confirm
uint32_t shift; // shift amount; presumably a right shift applied to results for fixed-point scaling — TODO confirm
uint32_t nPE; // number of processing units
int16_t *__restrict__ pDstC; // output matrix (16-bit elements)
} plp_mat_mult_stride_instance_q16;
/* Parameter block for the parallel strided 32-bit fixed-point matrix
 * multiplication; compared with the i32 variant a shift amount is carried
 * along. Dimension and stride roles are inferred from the field names --
 * TODO confirm against the kernel using this struct. */
typedef struct {
const int32_t *__restrict__ pSrcA; // first input matrix (read-only)
const int32_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // presumably rows of A
uint32_t N; // presumably columns of A and rows of B
uint32_t O; // presumably columns of B
uint32_t strideA; // presumably element distance between consecutive rows of A
uint32_t strideB; // presumably element distance between consecutive rows of B
uint32_t strideC; // presumably element distance between consecutive rows of the result
uint32_t shift; // presumably the right shift applied to each accumulated result -- TODO confirm
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDstC; // result matrix
} plp_mat_mult_stride_instance_q32;
/* Parameter block for the parallel strided complex matrix multiplication
 * with int8 inputs and an int32 result. How real and imaginary parts are
 * laid out, and whether sizes/strides count complex or scalar elements, is
 * not visible here -- TODO confirm against the kernel. */
typedef struct {
const int8_t *__restrict__ pSrcA; // first input matrix (read-only)
const int8_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // presumably rows of A
uint32_t N; // presumably columns of A and rows of B
uint32_t O; // presumably columns of B
uint32_t strideA; // presumably distance between consecutive rows of A
uint32_t strideB; // presumably distance between consecutive rows of B
uint32_t strideC; // presumably distance between consecutive rows of the result
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDstC; // result matrix
} plp_mat_mult_cmplx_stride_instance_i8;
/* Parameter block for the parallel strided complex matrix multiplication
 * with int16 inputs and an int32 result. How real and imaginary parts are
 * laid out, and whether sizes/strides count complex or scalar elements, is
 * not visible here -- TODO confirm against the kernel. */
typedef struct {
const int16_t *__restrict__ pSrcA; // first input matrix (read-only)
const int16_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // presumably rows of A
uint32_t N; // presumably columns of A and rows of B
uint32_t O; // presumably columns of B
uint32_t strideA; // presumably distance between consecutive rows of A
uint32_t strideB; // presumably distance between consecutive rows of B
uint32_t strideC; // presumably distance between consecutive rows of the result
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDstC; // result matrix
} plp_mat_mult_cmplx_stride_instance_i16;
/* Parameter block for the parallel strided complex matrix multiplication
 * with int32 inputs and an int32 result. How real and imaginary parts are
 * laid out, and whether sizes/strides count complex or scalar elements, is
 * not visible here -- TODO confirm against the kernel. */
typedef struct {
const int32_t *__restrict__ pSrcA; // first input matrix (read-only)
const int32_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // presumably rows of A
uint32_t N; // presumably columns of A and rows of B
uint32_t O; // presumably columns of B
uint32_t strideA; // presumably distance between consecutive rows of A
uint32_t strideB; // presumably distance between consecutive rows of B
uint32_t strideC; // presumably distance between consecutive rows of the result
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDstC; // result matrix
} plp_mat_mult_cmplx_stride_instance_i32;
/* Parameter block for the parallel strided complex float matrix
 * multiplication. How real and imaginary parts are laid out, and whether
 * sizes/strides count complex or scalar elements, is not visible here --
 * TODO confirm against the kernel. */
typedef struct {
const float *__restrict__ pSrcA; // first input matrix (read-only)
const float *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // presumably rows of A
uint32_t N; // presumably columns of A and rows of B
uint32_t O; // presumably columns of B
uint32_t strideA; // presumably distance between consecutive rows of A
uint32_t strideB; // presumably distance between consecutive rows of B
uint32_t strideC; // presumably distance between consecutive rows of the result
uint32_t nPE; // number of processing units to use
float *__restrict__ pDstC; // result matrix
} plp_mat_mult_cmplx_stride_instance_f32;
/* Parameter block for the parallel strided complex 8-bit fixed-point matrix
 * multiplication (int8 result, with a shift amount). Complex storage layout
 * and the unit of sizes/strides are not visible here -- TODO confirm. */
typedef struct {
const int8_t *__restrict__ pSrcA; // first input matrix (read-only)
const int8_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // presumably rows of A
uint32_t N; // presumably columns of A and rows of B
uint32_t O; // presumably columns of B
uint32_t strideA; // presumably distance between consecutive rows of A
uint32_t strideB; // presumably distance between consecutive rows of B
uint32_t strideC; // presumably distance between consecutive rows of the result
uint32_t shift; // presumably the right shift applied to each accumulated result -- TODO confirm
uint32_t nPE; // number of processing units to use
int8_t *__restrict__ pDstC; // result matrix
} plp_mat_mult_cmplx_stride_instance_q8;
/* Parameter block for the parallel strided complex 16-bit fixed-point matrix
 * multiplication (int16 result, with a shift amount). Complex storage layout
 * and the unit of sizes/strides are not visible here -- TODO confirm. */
typedef struct {
const int16_t *__restrict__ pSrcA; // first input matrix (read-only)
const int16_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // presumably rows of A
uint32_t N; // presumably columns of A and rows of B
uint32_t O; // presumably columns of B
uint32_t strideA; // presumably distance between consecutive rows of A
uint32_t strideB; // presumably distance between consecutive rows of B
uint32_t strideC; // presumably distance between consecutive rows of the result
uint32_t shift; // presumably the right shift applied to each accumulated result -- TODO confirm
uint32_t nPE; // number of processing units to use
int16_t *__restrict__ pDstC; // result matrix
} plp_mat_mult_cmplx_stride_instance_q16;
/* Parameter block for the parallel strided complex 32-bit fixed-point matrix
 * multiplication (int32 result, with a shift amount). Complex storage layout
 * and the unit of sizes/strides are not visible here -- TODO confirm. */
typedef struct {
const int32_t *__restrict__ pSrcA; // first input matrix (read-only)
const int32_t *__restrict__ pSrcB; // second input matrix (read-only)
uint32_t M; // presumably rows of A
uint32_t N; // presumably columns of A and rows of B
uint32_t O; // presumably columns of B
uint32_t strideA; // presumably distance between consecutive rows of A
uint32_t strideB; // presumably distance between consecutive rows of B
uint32_t strideC; // presumably distance between consecutive rows of the result
uint32_t shift; // presumably the right shift applied to each accumulated result -- TODO confirm
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDstC; // result matrix
} plp_mat_mult_cmplx_stride_instance_q32;
/* Parameter block for the parallel strided int8 matrix addition.
 * Dimension and stride roles are inferred from the field names -- TODO
 * confirm against the kernel using this struct. */
typedef struct {
const int8_t *__restrict__ pSrcA; // first operand matrix (read-only)
const int8_t *__restrict__ pSrcB; // second operand matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideA; // presumably element distance between consecutive rows of A
uint32_t strideB; // presumably element distance between consecutive rows of B
uint32_t strideY; // presumably element distance between consecutive rows of the output
uint32_t nPE; // number of processing units to use
int8_t *__restrict__ pDst; // output matrix
} plp_mat_add_stride_instance_i8;
/* Parameter block for the parallel strided int16 matrix addition.
 * Dimension and stride roles are inferred from the field names -- TODO
 * confirm against the kernel using this struct. */
typedef struct {
const int16_t *__restrict__ pSrcA; // first operand matrix (read-only)
const int16_t *__restrict__ pSrcB; // second operand matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideA; // presumably element distance between consecutive rows of A
uint32_t strideB; // presumably element distance between consecutive rows of B
uint32_t strideY; // presumably element distance between consecutive rows of the output
uint32_t nPE; // number of processing units to use
int16_t *__restrict__ pDst; // output matrix
} plp_mat_add_stride_instance_i16;
/* Parameter block for the parallel strided int32 matrix addition.
 * Dimension and stride roles are inferred from the field names -- TODO
 * confirm against the kernel using this struct. */
typedef struct {
const int32_t *__restrict__ pSrcA; // first operand matrix (read-only)
const int32_t *__restrict__ pSrcB; // second operand matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideA; // presumably element distance between consecutive rows of A
uint32_t strideB; // presumably element distance between consecutive rows of B
uint32_t strideY; // presumably element distance between consecutive rows of the output
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDst; // output matrix
} plp_mat_add_stride_instance_i32;
/* Parameter block for the parallel strided float matrix addition.
 * Dimension and stride roles are inferred from the field names -- TODO
 * confirm against the kernel using this struct. */
typedef struct {
const float *__restrict__ pSrcA; // first operand matrix (read-only)
const float *__restrict__ pSrcB; // second operand matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideA; // presumably element distance between consecutive rows of A
uint32_t strideB; // presumably element distance between consecutive rows of B
uint32_t strideY; // presumably element distance between consecutive rows of the output
uint32_t nPE; // number of processing units to use
float *__restrict__ pDst; // output matrix
} plp_mat_add_stride_instance_f32;
/* Parameter block for the parallel strided int8 matrix subtraction
 * (presumably A - B -- TODO confirm operand order). Dimension and stride
 * roles are inferred from the field names. */
typedef struct {
const int8_t *__restrict__ pSrcA; // first operand matrix (read-only)
const int8_t *__restrict__ pSrcB; // second operand matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideA; // presumably element distance between consecutive rows of A
uint32_t strideB; // presumably element distance between consecutive rows of B
uint32_t strideY; // presumably element distance between consecutive rows of the output
uint32_t nPE; // number of processing units to use
int8_t *__restrict__ pDst; // output matrix
} plp_mat_sub_stride_instance_i8;
/* Parameter block for the parallel strided int16 matrix subtraction
 * (presumably A - B -- TODO confirm operand order). Dimension and stride
 * roles are inferred from the field names. */
typedef struct {
const int16_t *__restrict__ pSrcA; // first operand matrix (read-only)
const int16_t *__restrict__ pSrcB; // second operand matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideA; // presumably element distance between consecutive rows of A
uint32_t strideB; // presumably element distance between consecutive rows of B
uint32_t strideY; // presumably element distance between consecutive rows of the output
uint32_t nPE; // number of processing units to use
int16_t *__restrict__ pDst; // output matrix
} plp_mat_sub_stride_instance_i16;
/* Parameter block for the parallel strided int32 matrix subtraction
 * (presumably A - B -- TODO confirm operand order). Dimension and stride
 * roles are inferred from the field names. */
typedef struct {
const int32_t *__restrict__ pSrcA; // first operand matrix (read-only)
const int32_t *__restrict__ pSrcB; // second operand matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideA; // presumably element distance between consecutive rows of A
uint32_t strideB; // presumably element distance between consecutive rows of B
uint32_t strideY; // presumably element distance between consecutive rows of the output
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDst; // output matrix
} plp_mat_sub_stride_instance_i32;
/* Parameter block for the parallel strided float matrix subtraction
 * (presumably A - B -- TODO confirm operand order). Dimension and stride
 * roles are inferred from the field names. */
typedef struct {
const float *__restrict__ pSrcA; // first operand matrix (read-only)
const float *__restrict__ pSrcB; // second operand matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideA; // presumably element distance between consecutive rows of A
uint32_t strideB; // presumably element distance between consecutive rows of B
uint32_t strideY; // presumably element distance between consecutive rows of the output
uint32_t nPE; // number of processing units to use
float *__restrict__ pDst; // output matrix
} plp_mat_sub_stride_instance_f32;
/* Parameter block for the parallel strided int8 matrix scaling. The exact
 * formula (presumably (src * scaleFactor) >> shift) is not visible here --
 * TODO confirm against the kernel. */
typedef struct {
const int8_t *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideSrc; // presumably element distance between consecutive input rows
uint32_t strideDst; // presumably element distance between consecutive output rows
int8_t scaleFactor; // multiplier, same element type as the matrix
int32_t shift; // shift amount (signed); direction/semantics to be confirmed
uint32_t nPE; // number of processing units to use
int8_t *__restrict__ pDst; // output matrix
} plp_mat_scale_stride_instance_i8;
/* Parameter block for the parallel strided int16 matrix scaling. The exact
 * formula (presumably (src * scaleFactor) >> shift) is not visible here --
 * TODO confirm against the kernel. */
typedef struct {
const int16_t *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideSrc; // presumably element distance between consecutive input rows
uint32_t strideDst; // presumably element distance between consecutive output rows
int16_t scaleFactor; // multiplier, same element type as the matrix
int32_t shift; // shift amount (signed); direction/semantics to be confirmed
uint32_t nPE; // number of processing units to use
int16_t *__restrict__ pDst; // output matrix
} plp_mat_scale_stride_instance_i16;
/* Parameter block for the parallel strided int32 matrix scaling. The exact
 * formula (presumably (src * scaleFactor) >> shift) is not visible here --
 * TODO confirm against the kernel. */
typedef struct {
const int32_t *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideSrc; // presumably element distance between consecutive input rows
uint32_t strideDst; // presumably element distance between consecutive output rows
int32_t scaleFactor; // multiplier, same element type as the matrix
int32_t shift; // shift amount (signed); direction/semantics to be confirmed
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDst; // output matrix
} plp_mat_scale_stride_instance_i32;
/* Parameter block for the parallel strided float matrix scaling. Unlike the
 * integer variants there is no shift field. Dimension and stride roles are
 * inferred from the field names -- TODO confirm against the kernel. */
typedef struct {
const float *__restrict__ pSrc; // input matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideSrc; // presumably element distance between consecutive input rows
uint32_t strideDst; // presumably element distance between consecutive output rows
float scaleFactor; // multiplier
uint32_t nPE; // number of processing units to use
float *__restrict__ pDst; // output matrix
} plp_mat_scale_stride_instance_f32;
/* Parameter block for the parallel strided int8 "fill_I" matrix operation
 * (presumably: write an identity matrix into pDst -- TODO confirm). */
typedef struct {
uint32_t N; // matrix dimension; presumably the matrix is N x N -- TODO confirm
uint32_t stride; // presumably element distance between consecutive rows of pDst
uint32_t nPE; // number of processing units to use
int8_t *__restrict__ pDst; // destination matrix
} plp_mat_fill_I_stride_instance_i8;
/* Parameter block for the parallel strided int16 "fill_I" matrix operation
 * (presumably: write an identity matrix into pDst -- TODO confirm). */
typedef struct {
uint32_t N; // matrix dimension; presumably the matrix is N x N -- TODO confirm
uint32_t stride; // presumably element distance between consecutive rows of pDst
uint32_t nPE; // number of processing units to use
int16_t *__restrict__ pDst; // destination matrix
} plp_mat_fill_I_stride_instance_i16;
/* Parameter block for the parallel strided int32 "fill_I" matrix operation
 * (presumably: write an identity matrix into pDst -- TODO confirm). */
typedef struct {
uint32_t N; // matrix dimension; presumably the matrix is N x N -- TODO confirm
uint32_t stride; // presumably element distance between consecutive rows of pDst
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDst; // destination matrix
} plp_mat_fill_I_stride_instance_i32;
/* Parameter block for the parallel strided float "fill_I" matrix operation
 * (presumably: write an identity matrix into pDst -- TODO confirm). */
typedef struct {
uint32_t N; // matrix dimension; presumably the matrix is N x N -- TODO confirm
uint32_t stride; // presumably element distance between consecutive rows of pDst
uint32_t nPE; // number of processing units to use
float *__restrict__ pDst; // destination matrix
} plp_mat_fill_I_stride_instance_f32;
/* Parameter block for the parallel strided 8-bit fixed-point "fill_I" matrix
 * operation (presumably: write an identity matrix into pDst -- TODO confirm). */
typedef struct {
uint32_t N; // matrix dimension; presumably the matrix is N x N -- TODO confirm
uint32_t stride; // presumably element distance between consecutive rows of pDst
uint32_t nPE; // number of processing units to use
int32_t fracBits; // number of fractional bits of the fixed-point format (signed type)
int8_t *__restrict__ pDst; // destination matrix
} plp_mat_fill_I_stride_instance_q8;
/* Parameter block for the parallel strided 16-bit fixed-point "fill_I" matrix
 * operation (presumably: write an identity matrix into pDst -- TODO confirm). */
typedef struct {
uint32_t N; // matrix dimension; presumably the matrix is N x N -- TODO confirm
uint32_t stride; // presumably element distance between consecutive rows of pDst
uint32_t nPE; // number of processing units to use
int32_t fracBits; // number of fractional bits of the fixed-point format (signed type)
int16_t *__restrict__ pDst; // destination matrix
} plp_mat_fill_I_stride_instance_q16;
/* Parameter block for the parallel strided 32-bit fixed-point "fill_I" matrix
 * operation (presumably: write an identity matrix into pDst -- TODO confirm). */
typedef struct {
uint32_t N; // matrix dimension; presumably the matrix is N x N -- TODO confirm
uint32_t stride; // presumably element distance between consecutive rows of pDst
uint32_t nPE; // number of processing units to use
int32_t fracBits; // number of fractional bits of the fixed-point format (signed type)
int32_t *__restrict__ pDst; // destination matrix
} plp_mat_fill_I_stride_instance_q32;
/* Parameter block for the parallel strided int8 matrix fill (presumably
 * every element of the M x N region is set to `value` -- TODO confirm). */
typedef struct {
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t stride; // presumably element distance between consecutive rows of pDst
int8_t value; // fill value
uint32_t nPE; // number of processing units to use
int8_t *__restrict__ pDst; // destination matrix
} plp_mat_fill_stride_instance_i8;
/* Parameter block for the parallel strided int16 matrix fill (presumably
 * every element of the M x N region is set to `value` -- TODO confirm). */
typedef struct {
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t stride; // presumably element distance between consecutive rows of pDst
int16_t value; // fill value
uint32_t nPE; // number of processing units to use
int16_t *__restrict__ pDst; // destination matrix
} plp_mat_fill_stride_instance_i16;
/* Parameter block for the parallel strided int32 matrix fill (presumably
 * every element of the M x N region is set to `value` -- TODO confirm). */
typedef struct {
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t stride; // presumably element distance between consecutive rows of pDst
int32_t value; // fill value
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDst; // destination matrix
} plp_mat_fill_stride_instance_i32;
/* Parameter block for the parallel strided float matrix fill (presumably
 * every element of the M x N region is set to `value` -- TODO confirm). */
typedef struct {
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t stride; // presumably element distance between consecutive rows of pDst
float value; // fill value
uint32_t nPE; // number of processing units to use
float *__restrict__ pDst; // destination matrix
} plp_mat_fill_stride_instance_f32;
/* Parameter block for the parallel strided int8 matrix copy. Dimension and
 * stride roles are inferred from the field names -- TODO confirm. */
typedef struct {
const int8_t *__restrict__ pSrc; // source matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideSrc; // presumably element distance between consecutive source rows
uint32_t strideDst; // presumably element distance between consecutive destination rows
uint32_t nPE; // number of processing units to use
int8_t *__restrict__ pDst; // destination matrix
} plp_mat_copy_stride_instance_i8;
/* Parameter block for the parallel strided int16 matrix copy. Dimension and
 * stride roles are inferred from the field names -- TODO confirm. */
typedef struct {
const int16_t *__restrict__ pSrc; // source matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideSrc; // presumably element distance between consecutive source rows
uint32_t strideDst; // presumably element distance between consecutive destination rows
uint32_t nPE; // number of processing units to use
int16_t *__restrict__ pDst; // destination matrix
} plp_mat_copy_stride_instance_i16;
/* Parameter block for the parallel strided int32 matrix copy. Dimension and
 * stride roles are inferred from the field names -- TODO confirm. */
typedef struct {
const int32_t *__restrict__ pSrc; // source matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideSrc; // presumably element distance between consecutive source rows
uint32_t strideDst; // presumably element distance between consecutive destination rows
uint32_t nPE; // number of processing units to use
int32_t *__restrict__ pDst; // destination matrix
} plp_mat_copy_stride_instance_i32;
/* Parameter block for the parallel strided float matrix copy. Dimension and
 * stride roles are inferred from the field names -- TODO confirm. */
typedef struct {
const float *__restrict__ pSrc; // source matrix (read-only)
uint32_t M; // presumably number of rows
uint32_t N; // presumably number of columns
uint32_t strideSrc; // presumably element distance between consecutive source rows
uint32_t strideDst; // presumably element distance between consecutive destination rows
uint32_t nPE; // number of processing units to use
float *__restrict__ pDst; // destination matrix
} plp_mat_copy_stride_instance_f32;
/* Parameter block for the parallel float32 Euclidean distance computation. */
typedef struct {
const float32_t *pSrcA; // pointer to the first input vector (read-only)
const float32_t *pSrcB; // pointer to the second input vector (read-only)
uint32_t blkSizePE; // number of samples; NOTE(review): name suggests the per-PE share -- confirm
uint32_t nPE; // number of processing units
float32_t *resBuffer; // pointer to the result buffer; presumably one partial result per PE -- TODO confirm
} plp_euclidean_distance_instance_f32;
/* Parameter block for the parallel 32-bit fixed-point Euclidean distance
 * computation. */
typedef struct {
const int32_t *pSrcA; // pointer to the first input vector (read-only)
const int32_t *pSrcB; // pointer to the second input vector (read-only)
uint32_t blkSizePE; // number of samples; NOTE(review): name suggests the per-PE share -- confirm
uint32_t nPE; // number of processing units
uint32_t fracBits; // number of fixed-point fractional bits
int32_t *resBuffer; // pointer to the result buffer; presumably one partial result per PE -- TODO confirm
} plp_euclidean_distance_instance_q32;
/* Parameter block for the parallel float32 cosine distance computation.
 * Three separate result buffers are carried; their exact contents are not
 * visible here (presumably partial sums for |A|^2, |B|^2 and A.B -- TODO
 * confirm against the kernel). */
typedef struct {
const float32_t *pSrcA; // pointer to the first input vector (read-only)
const float32_t *pSrcB; // pointer to the second input vector (read-only)
uint32_t blkSizePE; // number of samples; NOTE(review): name suggests the per-PE share -- confirm
uint32_t nPE; // number of processing units
float32_t *resBuffer_A; // result buffer associated with vector A
float32_t *resBuffer_B; // result buffer associated with vector B
float32_t *resBuffer_dot; // result buffer associated with the dot product of A and B
} plp_cosine_distance_instance_f32;
/* Parameter block for the parallel 32-bit fixed-point power computation.
 * NOTE(review): pSrc is not const-qualified here although the float variant
 * (plp_power_instance_f32) and plp_power_q32_parallel() take a const
 * pointer -- confirm whether the input is ever written. */
typedef struct {
int32_t *pSrc; // pointer to the input vector
uint32_t blkSizePE; // number of samples; NOTE(review): name suggests the per-PE share -- confirm
uint32_t fracBits; // number of fractional bits, used for the right shift
uint32_t nPE; // number of processing units
int32_t *resBuffer; // pointer to the result buffer; presumably one partial result per PE -- TODO confirm
} plp_power_instance_q32;
/* Parameter block for the parallel float32 power computation. */
typedef struct {
const float32_t *pSrc; // pointer to the input vector (read-only)
uint32_t blkSizePE; // number of samples; NOTE(review): name suggests the per-PE share -- confirm
uint32_t nPE; // number of processing units
float32_t *resBuffer; // pointer to the result buffer; presumably one partial result per PE -- TODO confirm
} plp_power_instance_f32;
/* Identifier of the wavelet stored in a plp_dwt_wavelet_* structure.
 * Enumerators take the implicit values 0, 1, 2, ... in declaration order,
 * so PLP_DWT_WAVELET_OTHER is 0; do not reorder entries if the numeric
 * values are used anywhere (e.g. as table indices -- not visible here).
 * The DB / SYM / COIF prefixes presumably denote the Daubechies, Symlet and
 * Coiflet families, with the trailing number as the order -- TODO confirm. */
typedef enum {
PLP_DWT_WAVELET_OTHER, /* presumably a user-supplied wavelet -- TODO confirm */
PLP_DWT_WAVELET_HAAR,
/* DB1 .. DB20 */
PLP_DWT_WAVELET_DB1,
PLP_DWT_WAVELET_DB2,
PLP_DWT_WAVELET_DB3,
PLP_DWT_WAVELET_DB4,
PLP_DWT_WAVELET_DB5,
PLP_DWT_WAVELET_DB6,
PLP_DWT_WAVELET_DB7,
PLP_DWT_WAVELET_DB8,
PLP_DWT_WAVELET_DB9,
PLP_DWT_WAVELET_DB10,
PLP_DWT_WAVELET_DB11,
PLP_DWT_WAVELET_DB12,
PLP_DWT_WAVELET_DB13,
PLP_DWT_WAVELET_DB14,
PLP_DWT_WAVELET_DB15,
PLP_DWT_WAVELET_DB16,
PLP_DWT_WAVELET_DB17,
PLP_DWT_WAVELET_DB18,
PLP_DWT_WAVELET_DB19,
PLP_DWT_WAVELET_DB20,
/* SYM2 .. SYM20 (there is no SYM1 entry) */
PLP_DWT_WAVELET_SYM2,
PLP_DWT_WAVELET_SYM3,
PLP_DWT_WAVELET_SYM4,
PLP_DWT_WAVELET_SYM5,
PLP_DWT_WAVELET_SYM6,
PLP_DWT_WAVELET_SYM7,
PLP_DWT_WAVELET_SYM8,
PLP_DWT_WAVELET_SYM9,
PLP_DWT_WAVELET_SYM10,
PLP_DWT_WAVELET_SYM11,
PLP_DWT_WAVELET_SYM12,
PLP_DWT_WAVELET_SYM13,
PLP_DWT_WAVELET_SYM14,
PLP_DWT_WAVELET_SYM15,
PLP_DWT_WAVELET_SYM16,
PLP_DWT_WAVELET_SYM17,
PLP_DWT_WAVELET_SYM18,
PLP_DWT_WAVELET_SYM19,
PLP_DWT_WAVELET_SYM20,
/* COIF1 .. COIF17 */
PLP_DWT_WAVELET_COIF1,
PLP_DWT_WAVELET_COIF2,
PLP_DWT_WAVELET_COIF3,
PLP_DWT_WAVELET_COIF4,
PLP_DWT_WAVELET_COIF5,
PLP_DWT_WAVELET_COIF6,
PLP_DWT_WAVELET_COIF7,
PLP_DWT_WAVELET_COIF8,
PLP_DWT_WAVELET_COIF9,
PLP_DWT_WAVELET_COIF10,
PLP_DWT_WAVELET_COIF11,
PLP_DWT_WAVELET_COIF12,
PLP_DWT_WAVELET_COIF13,
PLP_DWT_WAVELET_COIF14,
PLP_DWT_WAVELET_COIF15,
PLP_DWT_WAVELET_COIF16,
PLP_DWT_WAVELET_COIF17
} plp_dwt_wavelet_type;
/* Float32 wavelet description: its type tag plus pointers to the four
 * filter coefficient arrays. `length` is presumably the number of
 * coefficients in each filter -- TODO confirm. */
typedef struct {
uint32_t length; /* filter length (see note above) */
plp_dwt_wavelet_type type; /* which wavelet these filters belong to */
float32_t *dec_lo; /* decomposition lowpass */
float32_t *dec_hi; /* decomposition highpass */
float32_t *rec_lo; /* reconstruction lowpass */
float32_t *rec_hi; /* reconstruction highpass */
} plp_dwt_wavelet_f32;
/* 32-bit fixed-point wavelet description. Only the decomposition filters
 * are carried; unlike the float32 variant there are no reconstruction
 * filter pointers. `length` is presumably the number of coefficients in
 * each filter -- TODO confirm. */
typedef struct {
uint32_t length; /* filter length (see note above) */
plp_dwt_wavelet_type type; /* which wavelet these filters belong to */
int32_t *dec_lo; /* decomposition lowpass */
int32_t *dec_hi; /* decomposition highpass */
} plp_dwt_wavelet_q32;
/* 16-bit fixed-point wavelet description. Only the decomposition filters
 * are carried; unlike the float32 variant there are no reconstruction
 * filter pointers. `length` is presumably the number of coefficients in
 * each filter -- TODO confirm. */
typedef struct {
uint32_t length; /* filter length (see note above) */
plp_dwt_wavelet_type type; /* which wavelet these filters belong to */
int16_t *dec_lo; /* decomposition lowpass */
int16_t *dec_hi; /* decomposition highpass */
} plp_dwt_wavelet_q16;
/* 8-bit fixed-point wavelet description. Only the decomposition filters
 * are carried; unlike the float32 variant there are no reconstruction
 * filter pointers. `length` is presumably the number of coefficients in
 * each filter -- TODO confirm. */
typedef struct {
uint32_t length; /* filter length (see note above) */
plp_dwt_wavelet_type type; /* which wavelet these filters belong to */
int8_t *dec_lo; /* decomposition lowpass */
int8_t *dec_hi; /* decomposition highpass */
} plp_dwt_wavelet_q8;
/* Boundary extension mode selected in the plp_dwt_instance_* structures.
 * Enumerators take the implicit values 0..6 in declaration order. The
 * per-mode descriptions are inferred from the names and follow the usual
 * DWT signal-extension terminology -- TODO confirm against the kernels. */
typedef enum {
PLP_DWT_MODE_ZERO, /* presumably pads with zeros */
PLP_DWT_MODE_CONSTANT, /* presumably repeats the edge sample */
PLP_DWT_MODE_SYMMETRIC, /* presumably mirrors including the edge sample */
PLP_DWT_MODE_REFLECT, /* presumably mirrors excluding the edge sample */
PLP_DWT_MODE_PERIODIC, /* presumably wraps the signal around */
PLP_DWT_MODE_ANTISYMMETRIC, /* presumably mirrors with negated samples */
PLP_DWT_MODE_ANTIREFLECT /* presumably mirrors anti-symmetrically about the edge sample */
} plp_dwt_extension_mode;
/* Parameter block for the parallel float32 DWT. The wavelet description is
 * embedded by value, not referenced through a pointer. */
typedef struct {
const float32_t *pSrc; // points to the input buffer (read-only)
uint32_t length; // length of the input buffer
plp_dwt_wavelet_f32 wavelet; // wavelet structure for calculating the DWT
plp_dwt_extension_mode mode; // boundary extension mode
uint32_t nPE; // number of processing units
float32_t *pDstA; // output buffer for the approximation coefficients
float32_t *pDstD; // output buffer for the detail coefficients
} plp_dwt_instance_f32;
/* Parameter block for the parallel 32-bit fixed-point DWT. The wavelet
 * description is embedded by value, not referenced through a pointer. */
typedef struct {
const int32_t *pSrc; // points to the input buffer (read-only)
uint32_t length; // length of the input buffer
plp_dwt_wavelet_q32 wavelet; // wavelet structure for calculating the DWT
plp_dwt_extension_mode mode; // boundary extension mode
uint32_t nPE; // number of processing units
int32_t *pDstA; // output buffer for the approximation coefficients
int32_t *pDstD; // output buffer for the detail coefficients
} plp_dwt_instance_q32;
/* Parameter block for the parallel 16-bit fixed-point DWT. The wavelet
 * description is embedded by value, not referenced through a pointer. */
typedef struct {
const int16_t *pSrc; // points to the input buffer (read-only)
uint32_t length; // length of the input buffer
plp_dwt_wavelet_q16 wavelet; // wavelet structure for calculating the DWT
plp_dwt_extension_mode mode; // boundary extension mode
uint32_t nPE; // number of processing units
int16_t *pDstA; // output buffer for the approximation coefficients
int16_t *pDstD; // output buffer for the detail coefficients
} plp_dwt_instance_q16;
/* Parameter block for the parallel 8-bit fixed-point DWT. The wavelet
 * description is embedded by value, not referenced through a pointer. */
typedef struct {
const int8_t *pSrc; // points to the input buffer (read-only)
uint32_t length; // length of the input buffer
plp_dwt_wavelet_q8 wavelet; // wavelet structure for calculating the DWT
plp_dwt_extension_mode mode; // boundary extension mode
uint32_t nPE; // number of processing units
int8_t *pDstA; // output buffer for the approximation coefficients
int8_t *pDstD; // output buffer for the detail coefficients
} plp_dwt_instance_q8;
/*
 * DWT length helper macros.
 *
 * Every macro parameter is wrapped in parentheses so that compound argument
 * expressions (shifts, ternaries, pointer dereferences, ...) expand as a
 * single operand instead of being re-associated by operator precedence.
 * Arguments may still be evaluated more than once, so pass expressions
 * without side effects.
 */

/* Forwards to plp_dwt_dec_len() using the `length` member of WAVELET, which
 * must be a plp_dwt_wavelet_* object (any expression yielding one, e.g. *p). */
#define PLP_DWT_DEC_LEN(SIG_LEN, WAVELET, LEVEL) \
    plp_dwt_dec_len((SIG_LEN), (WAVELET).length, (LEVEL))

/* With X = (SRC_LEN + WAVELET_LEN - 1) / 2, evaluates to
 * (X + (X + WAVELET_LEN - 1)) / 2 using integer division. Presumably the
 * scratch-buffer length needed by the decomposition -- TODO confirm. */
#define PLP_DWT_DEC_TEMP_LEN(SRC_LEN, WAVELET_LEN) \
    ((((SRC_LEN) + (WAVELET_LEN) - 1) / 2 + (((SRC_LEN) + (WAVELET_LEN) - 1) / 2 + (WAVELET_LEN) - 1)) / 2)

/* (SIG_LEN + WAVELET_LEN - 1) halved with a right shift. */
#define PLP_DWT_OUTPUT_LENGTH(SIG_LEN, WAVELET_LEN) (((SIG_LEN) + (WAVELET_LEN) - 1) >> 1)
/* DWT sizing helpers; both take plain lengths and return a uint32_t.
 * plp_dwt_max_level: presumably the largest useful decomposition level for a
 * signal of sig_len samples and a filter of wavelet_len taps -- TODO confirm.
 * plp_dwt_dec_len: presumably the output length of a `level`-deep
 * decomposition; this is the function PLP_DWT_DEC_LEN expands to. */
uint32_t plp_dwt_max_level(uint32_t sig_len, uint32_t wavelet_len);
uint32_t plp_dwt_dec_len(uint32_t sig_len, uint32_t wavelet_len, uint32_t level);
/* Parallel dot-product entry points for int32, q32 (fixed point) and float32.
 * pSrcA / pSrcB are read-only inputs declared non-aliasing, blockSize is
 * presumably the number of elements per vector, nPE the number of processing
 * units to use, and the result is returned through pRes. The q32 variant
 * additionally takes deciPoint, presumably the fixed-point position used to
 * rescale the products -- TODO confirm. */
void plp_dot_prod_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t nPE,
int32_t *__restrict__ pRes);
void plp_dot_prod_q32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
uint32_t nPE,
int32_t *__restrict__ pRes);
void plp_dot_prod_f32_parallel(const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pRes);
/* Parallel dot-product kernels taking a single untyped argument. S is
 * presumably a pointer to the matching plp_dot_prod_instance_{i32,q32,f32}
 * structure -- TODO confirm; the type is not checked by the compiler. */
void plp_dot_prod_i32p_xpulpv2(void *S);
void plp_dot_prod_q32p_xpulpv2(void *S);
void plp_dot_prod_f32p_xpulpv2(void *S);
/* int32 dot product: generic entry point followed by the variants suffixed
 * s_rv32im and s_xpulpv2 (presumably single-core kernels for the RV32IM and
 * XpulpV2 instruction sets -- TODO confirm). All three share one signature:
 * two read-only non-aliasing inputs, blockSize (presumably the element
 * count) and an int32 result written through pRes. */
void plp_dot_prod_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_dot_prod_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_dot_prod_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes);
/* q32 fixed-point dot product: generic entry point plus the s_rv32im and
 * s_xpulpv2 variants (presumably single-core kernels per instruction set --
 * TODO confirm). Same shape as the int32 versions with one extra parameter,
 * deciPoint, presumably the fixed-point position used for rescaling. */
void plp_dot_prod_q32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes);
void plp_dot_prod_q32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes);
void plp_dot_prod_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes);
/* float32 dot product: generic entry point plus the s_xpulpv2 and s_rv32im
 * variants (presumably single-core kernels per instruction set -- TODO
 * confirm). Inputs are read-only and non-aliasing; the float32 result is
 * written through pRes. */
void plp_dot_prod_f32(const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes);
void plp_dot_prod_f32s_xpulpv2(const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes);
void plp_dot_prod_f32s_rv32im(const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes);
/* int16 dot product, generic entry point. The inputs are read-only 16-bit
 * vectors and the result is written as int32 through pRes; blockSize is
 * presumably the number of elements per vector -- TODO confirm.
 * The input pointers carry __restrict__ like every other dot-product
 * prototype in this header (including the i16s_rv32im / i16s_xpulpv2
 * variants), so the no-alias contract is stated uniformly. Qualifiers on the
 * parameters themselves do not change the function type, so this remains
 * compatible with a definition declared without them. */
void plp_dot_prod_i16(const int16_t *__restrict__ pSrcA,
                      const int16_t *__restrict__ pSrcB,
                      uint32_t blockSize,
                      int32_t *__restrict__ pRes);
/* int16 dot product, s_rv32im and s_xpulpv2 variants (presumably single-core
 * kernels for the RV32IM and XpulpV2 instruction sets -- TODO confirm).
 * 16-bit read-only, non-aliasing inputs; int32 result through pRes. */
void plp_dot_prod_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_dot_prod_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes);
/* q16 fixed-point dot product: generic entry point plus the s_rv32im and
 * s_xpulpv2 variants (presumably single-core kernels per instruction set --
 * TODO confirm). 16-bit inputs, int32 result; deciPoint is presumably the
 * fixed-point position used for rescaling. */
void plp_dot_prod_q16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes);
void plp_dot_prod_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes);
void plp_dot_prod_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes);
/* int8 dot product: generic entry point plus the s_rv32im and s_xpulpv2
 * variants (presumably single-core kernels per instruction set -- TODO
 * confirm). 8-bit read-only, non-aliasing inputs; int32 result through pRes. */
void plp_dot_prod_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_dot_prod_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_dot_prod_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
int32_t *__restrict__ pRes);
/* q8 fixed-point dot product: generic entry point plus the s_rv32im and
 * s_xpulpv2 variants (presumably single-core kernels per instruction set --
 * TODO confirm). 8-bit inputs, int32 result; deciPoint is presumably the
 * fixed-point position used for rescaling. */
void plp_dot_prod_q8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes);
void plp_dot_prod_q8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes);
void plp_dot_prod_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes);
/* Element-wise absolute value for int32, int16 and int8 vectors: for each
 * width a generic entry point plus s_rv32im / s_xpulpv2 variants (presumably
 * single-core kernels per instruction set -- TODO confirm). pSrc is
 * read-only, pDst has the same element type, blockSize is presumably the
 * number of elements. The pointers are not restrict-qualified; whether
 * in-place use (pDst == pSrc) is supported is not visible here. */
void plp_abs_i32(const int32_t * pSrc,
int32_t * pDst,
uint32_t blockSize);
void plp_abs_i32s_rv32im(const int32_t * pSrc,
int32_t * pDst,
uint32_t blockSize);
void plp_abs_i32s_xpulpv2(const int32_t * pSrc,
int32_t * pDst,
uint32_t blockSize);
void plp_abs_i16(const int16_t * pSrc,
int16_t * pDst,
uint32_t blockSize);
void plp_abs_i16s_rv32im(const int16_t * pSrc,
int16_t * pDst,
uint32_t blockSize);
void plp_abs_i16s_xpulpv2(const int16_t * pSrc,
int16_t * pDst,
uint32_t blockSize);
void plp_abs_i8(const int8_t * pSrc,
int8_t * pDst,
uint32_t blockSize);
void plp_abs_i8s_rv32im(const int8_t * pSrc,
int8_t * pDst,
uint32_t blockSize);
void plp_abs_i8s_xpulpv2(const int8_t * pSrc,
int8_t * pDst,
uint32_t blockSize);
/* Element-wise vector addition for int32, int16 and int8 inputs: for each
 * width a generic entry point plus s_rv32im / s_xpulpv2 variants (presumably
 * single-core kernels per instruction set -- TODO confirm). Note that the
 * destination is int32_t for every input width, including the 16- and 8-bit
 * versions. blockSize is presumably the number of elements. */
void plp_add_i32(const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_add_i32s_rv32im(const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_add_i32s_xpulpv2(const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_add_i16(const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_add_i16s_rv32im(const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_add_i16s_xpulpv2(const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_add_i8(const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_add_i8s_rv32im(const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_add_i8s_xpulpv2(const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
/* Element-wise vector multiplication for int32, int16 and int8 inputs: for
 * each width a generic entry point plus s_rv32im / s_xpulpv2 variants
 * (presumably single-core kernels per instruction set -- TODO confirm).
 * The destination is int32_t for every input width. blockSize is presumably
 * the number of elements. */
void plp_mult_i32(const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_mult_i32s_rv32im(const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_mult_i32s_xpulpv2(const int32_t * pSrcA,
const int32_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_mult_i16(const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_mult_i16s_rv32im(const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_mult_i16s_xpulpv2(const int16_t * pSrcA,
const int16_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_mult_i8(const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_mult_i8s_rv32im(const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
void plp_mult_i8s_xpulpv2(const int8_t * pSrcA,
const int8_t * pSrcB,
int32_t * pDst,
uint32_t blockSize);
/* float32 element-wise vector multiplication: generic entry point, the
 * s_xpulpv2 variant (no s_rv32im variant is declared here), the parallel
 * entry point taking the number of processing units nPE, and the parallel
 * kernel taking an untyped argument S (presumably a pointer to a
 * plp_mult_instance_f32 -- TODO confirm). Note the parameter order differs:
 * the parallel entry point takes pDst last. */
void plp_mult_f32(const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize);
void plp_mult_f32s_xpulpv2(const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize);
void plp_mult_f32_parallel(const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pDst);
void plp_mult_f32p_xpulpv2(void *S);
/* Parallel float32 logarithm (presumably element-wise; the base is not
 * visible here -- TODO confirm). The entry point takes a read-only input,
 * blockSize (presumably the element count), the number of processing units
 * nPE and the destination; the p_xpulpv2 kernel takes an untyped argument S,
 * presumably a pointer to a plp_log_instance_f32 -- TODO confirm. */
void plp_log_f32_parallel(const float32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pDst);
void plp_log_f32p_xpulpv2(void *S);
/* Element-wise negation for int32, int16, int8 and float32 vectors. The
 * integer widths have a generic entry point plus s_rv32im / s_xpulpv2
 * variants; float32 has only the generic and s_xpulpv2 declarations
 * (presumably single-core kernels per instruction set -- TODO confirm).
 * pDst has the same element type as pSrc; blockSize is presumably the
 * number of elements. */
void plp_negate_i32(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize);
void plp_negate_i32s_rv32im(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize);
void plp_negate_i32s_xpulpv2(const int32_t * pSrc, int32_t * pDst, uint32_t blockSize);
void plp_negate_i16(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize);
void plp_negate_i16s_rv32im(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize);
void plp_negate_i16s_xpulpv2(const int16_t * pSrc, int16_t * pDst, uint32_t blockSize);
void plp_negate_i8(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize);
void plp_negate_i8s_rv32im(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize);
void plp_negate_i8s_xpulpv2(const int8_t * pSrc, int8_t * pDst, uint32_t blockSize);
void plp_negate_f32(const float32_t * pSrc, float32_t * pDst, uint32_t blockSize);
void plp_negate_f32s_xpulpv2(const float32_t * pSrc, float32_t * pDst, uint32_t blockSize);
/* Vector offset for int32, int16, int8 and float32 (presumably adds the
 * scalar `offset` to every element -- TODO confirm). The integer widths have
 * a generic entry point plus s_rv32im / s_xpulpv2 variants; float32 has only
 * the generic and s_xpulpv2 declarations. `offset` and pDst use the same
 * element type as pSrc; blockSize is presumably the number of elements. */
void plp_offset_i32(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize);
void plp_offset_i32s_rv32im(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize);
void plp_offset_i32s_xpulpv2(const int32_t * pSrc, int32_t offset, int32_t * pDst, uint32_t blockSize);
void plp_offset_i16(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize);
void plp_offset_i16s_rv32im(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize);
void plp_offset_i16s_xpulpv2(const int16_t * pSrc, int16_t offset, int16_t * pDst, uint32_t blockSize);
void plp_offset_i8(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize);
void plp_offset_i8s_rv32im(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize);
void plp_offset_i8s_xpulpv2(const int8_t * pSrc, int8_t offset, int8_t * pDst, uint32_t blockSize);
void plp_offset_f32(const float32_t * pSrc, float32_t offset, float32_t * pDst, uint32_t blockSize);
void plp_offset_f32s_xpulpv2(const float32_t * pSrc, float32_t offset, float32_t * pDst, uint32_t blockSize);
/* Element-wise vector subtraction (presumably pSrcA - pSrcB -- TODO confirm
 * operand order) for int32, int16, int8 and float32. The integer widths have
 * a generic entry point plus s_rv32im / s_xpulpv2 variants; float32 has only
 * the generic and s_xpulpv2 declarations. The integer versions write int32_t
 * results for every input width. blockSize is presumably the element count. */
void plp_sub_i32(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize);
void plp_sub_i32s_rv32im(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize);
void plp_sub_i32s_xpulpv2(const int32_t * pSrcA, const int32_t * pSrcB, int32_t * pDst, uint32_t blockSize);
void plp_sub_i16(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize);
void plp_sub_i16s_rv32im(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize);
void plp_sub_i16s_xpulpv2(const int16_t * pSrcA, const int16_t * pSrcB, int32_t * pDst, uint32_t blockSize);
void plp_sub_i8(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize);
void plp_sub_i8s_rv32im(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize);
void plp_sub_i8s_xpulpv2(const int8_t * pSrcA, const int8_t * pSrcB, int32_t * pDst, uint32_t blockSize);
void plp_sub_f32(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize);
void plp_sub_f32s_xpulpv2(const float32_t * pSrcA, const float32_t * pSrcB, float32_t * pDst, uint32_t blockSize);
/* Vector scaling for int32, int16, int8 and float32. The integer versions
 * take a scaleFactor of the element type and a signed shift (presumably
 * dst = (src * scaleFactor) >> shift -- TODO confirm); the float32 versions
 * take only a scaleFactor. The integer widths have a generic entry point
 * plus s_rv32im / s_xpulpv2 variants; float32 has only the generic and
 * s_xpulpv2 declarations. pSrc and pDst are declared non-aliasing and pDst
 * keeps the element type of pSrc. */
void plp_scale_i32(const int32_t *__restrict__ pSrc, int32_t scaleFactor, int32_t shift, int32_t *__restrict__ pDst, uint32_t blockSize);
void plp_scale_i32s_rv32im(const int32_t *__restrict__ pSrc, int32_t scaleFactor, int32_t shift, int32_t *__restrict__ pDst, uint32_t blockSize);
void plp_scale_i32s_xpulpv2(const int32_t *__restrict__ pSrc, int32_t scaleFactor, int32_t shift, int32_t *__restrict__ pDst, uint32_t blockSize);
void plp_scale_i16(const int16_t *__restrict__ pSrc, int16_t scaleFactor, int32_t shift, int16_t *__restrict__ pDst, uint32_t blockSize);
void plp_scale_i16s_rv32im(const int16_t *__restrict__ pSrc, int16_t scaleFactor, int32_t shift, int16_t *__restrict__ pDst, uint32_t blockSize);
void plp_scale_i16s_xpulpv2(const int16_t *__restrict__ pSrc, int16_t scaleFactor, int32_t shift, int16_t *__restrict__ pDst, uint32_t blockSize);
void plp_scale_i8(const int8_t *__restrict__ pSrc, int8_t scaleFactor, int32_t shift, int8_t *__restrict__ pDst, uint32_t blockSize);
void plp_scale_i8s_rv32im(const int8_t *__restrict__ pSrc, int8_t scaleFactor, int32_t shift, int8_t *__restrict__ pDst, uint32_t blockSize);
void plp_scale_i8s_xpulpv2(const int8_t *__restrict__ pSrc, int8_t scaleFactor, int32_t shift, int8_t *__restrict__ pDst, uint32_t blockSize);
void plp_scale_f32(const float32_t *__restrict__ pSrc, float32_t scaleFactor, float32_t *__restrict__ pDst, uint32_t blockSize);
void plp_scale_f32s_xpulpv2(const float32_t *__restrict__ pSrc, float32_t scaleFactor, float32_t *__restrict__ pDst, uint32_t blockSize);
/* int32 vector fill (presumably writes `value` to blockSize elements of
 * pDst -- TODO confirm): generic entry point plus s_rv32im / s_xpulpv2
 * variants (presumably single-core kernels per instruction set). */
void plp_fill_i32(int32_t value, int32_t *__restrict__ pDst, uint32_t blockSize);
void plp_fill_i32s_rv32im(int32_t value, int32_t *__restrict__ pDst, uint32_t blockSize);
void plp_fill_i32s_xpulpv2(int32_t value, int32_t *__restrict__ pDst, uint32_t blockSize);
/* Vector copy for int32 and float32: generic entry points plus s_rv32im /
 * s_xpulpv2 variants (presumably single-core kernels per instruction set --
 * TODO confirm). Source and destination are declared non-aliasing, so
 * overlapping buffers must not be passed.
 * NOTE(review): pSrc is not const-qualified in these prototypes although a
 * copy would normally only read it -- confirm before relying on the source
 * staying untouched. */
void plp_copy_i32(int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t blockSize);
void plp_copy_i32s_rv32im(int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t blockSize);
void plp_copy_i32s_xpulpv2(int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t blockSize);
void plp_copy_f32(float32_t *__restrict__ pSrc, float32_t *__restrict__ pDst, uint32_t blockSize);
void plp_copy_f32s_xpulpv2(float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t blockSize);
void plp_copy_f32s_rv32im(float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t blockSize);
/* Mean value of a vector for float, int32, int16 and int8. The integer
 * widths have a generic entry point plus s_rv32im / s_xpulpv2 variants;
 * float has only the generic and s_xpulpv2 declarations (presumably
 * single-core kernels per instruction set -- TODO confirm). The result is
 * written through pRes and has the same type as the input elements, so the
 * integer versions cannot return a fractional mean. blockSize is presumably
 * the number of elements. */
void plp_mean_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);
void plp_mean_f32s_xpulpv2(const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes);
void plp_mean_i32(const int32_t *__restrict__ pSrc, uint32_t blockSize, int32_t *__restrict__ pRes);
void plp_mean_i32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_mean_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_mean_i16(const int16_t *__restrict__ pSrc, uint32_t blockSize, int16_t *__restrict__ pRes);
void plp_mean_i16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes);
void plp_mean_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes);
void plp_mean_i8(const int8_t *__restrict__ pSrc, uint32_t blockSize, int8_t *__restrict__ pRes);
void plp_mean_i8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes);
void plp_mean_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes);
/* Maximum value of a vector for float, int32, int16 and int8. The integer
 * widths have a generic entry point plus s_rv32im / s_xpulpv2 variants;
 * float has only the generic and s_xpulpv2 declarations (presumably
 * single-core kernels per instruction set -- TODO confirm). Only a value is
 * returned through pRes (same type as the elements); no index output is
 * part of these signatures. blockSize is presumably the element count. */
void plp_max_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);
void plp_max_f32s_xpulpv2(const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes);
void plp_max_i32(const int32_t *__restrict__ pSrc, uint32_t blockSize, int32_t *__restrict__ pRes);
void plp_max_i32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_max_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_max_i16(const int16_t *__restrict__ pSrc, uint32_t blockSize, int16_t *__restrict__ pRes);
void plp_max_i16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes);
void plp_max_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes);
void plp_max_i8(const int8_t *__restrict__ pSrc, uint32_t blockSize, int8_t *__restrict__ pRes);
void plp_max_i8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes);
void plp_max_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes);
/* Minimum value of a vector for float, int32, int16 and int8. The integer
 * widths have a generic entry point plus s_rv32im / s_xpulpv2 variants;
 * float has only the generic and s_xpulpv2 declarations (presumably
 * single-core kernels per instruction set -- TODO confirm). Only a value is
 * returned through pRes (same type as the elements); no index output is
 * part of these signatures. blockSize is presumably the element count. */
void plp_min_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);
void plp_min_f32s_xpulpv2(const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes);
void plp_min_i32(const int32_t *__restrict__ pSrc, uint32_t blockSize, int32_t *__restrict__ pRes);
void plp_min_i32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_min_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_min_i16(const int16_t *__restrict__ pSrc, uint32_t blockSize, int16_t *__restrict__ pRes);
void plp_min_i16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes);
void plp_min_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int16_t *__restrict__ pRes);
void plp_min_i8(const int8_t *__restrict__ pSrc, uint32_t blockSize, int8_t *__restrict__ pRes);
void plp_min_i8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes);
void plp_min_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int8_t *__restrict__ pRes);
/* Float power of a vector (presumably the sum of squared elements -- TODO
 * confirm). Declared here: the parallel entry point taking the number of
 * processing units nPE, the parallel kernel taking an untyped argument S
 * (presumably a pointer to a plp_power_instance_f32 -- TODO confirm), the
 * single-core entry point and its s_xpulpv2 / s_rv32im variants. The result
 * is written through pRes. Note the parallel prototype spells the element
 * type float32_t while the single-core ones use float. */
void plp_power_f32_parallel(const float32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pRes);
void plp_power_f32p_xpulpv2(void* S);
void plp_power_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);
void plp_power_f32s_xpulpv2(const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes);
void plp_power_f32s_rv32im(const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes);
/* Integer power of a vector (presumably the sum of squared elements -- TODO
 * confirm) for int32, int16 and int8: for each width a generic entry point
 * plus s_rv32im / s_xpulpv2 variants (presumably single-core kernels per
 * instruction set). The result is int32_t for every input width and is
 * written through pRes; blockSize is presumably the element count. */
void plp_power_i32(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_power_i32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_power_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_power_i16(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_power_i16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_power_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_power_i8(const int8_t *__restrict__ pSrc, uint32_t blockSize, int32_t *__restrict__ pRes);
void plp_power_i8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
void plp_power_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
int32_t *__restrict__ pRes);
/* Fixed-point power of a vector (presumably the sum of squared elements,
 * rescaled by fracBits -- TODO confirm) for q32, q16 and q8. q32 also has a
 * parallel entry point taking nPE and a parallel kernel taking an untyped
 * argument S (presumably a pointer to a plp_power_instance_q32 -- TODO
 * confirm); every width has a generic single-core entry point plus
 * s_rv32im / s_xpulpv2 variants. fracBits is the number of fractional bits;
 * the result is int32_t for every input width and is written through pRes. */
void plp_power_q32_parallel(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
uint32_t nPE,
int32_t *__restrict__ pRes);
void plp_power_q32p_xpulpv2(void *S);
void plp_power_q32(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_power_q32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_power_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_power_q16(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_power_q16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_power_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_power_q8(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_power_q8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_power_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
/**
 * Float `plp_var` entry point and its `s_xpulpv2` variant.
 *
 * Both take a read-only float input `pSrc`, a `blockSize` value and a float
 * result pointer `pRes`; the pointers are `__restrict__`-qualified. No
 * `s_rv32im` variant is declared for the float version here.
 *
 * NOTE(review): presumably computes the variance of `blockSize` samples —
 * confirm against the implementation.
 */
void plp_var_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);
void plp_var_f32s_xpulpv2(const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes);
/**
 * Fixed-point `plp_var` declarations for 32-, 16- and 8-bit data.
 *
 * Each width has a generic entry point plus `s_rv32im` and `s_xpulpv2`
 * variants with an identical signature. The result pointer has the same
 * element type as the input (int32_t, int16_t or int8_t respectively).
 *
 * NOTE(review): presumably `fracBits` is the number of fractional bits and
 * `blockSize` the element count of `pSrc` — confirm.
 */
void plp_var_q32(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_var_q32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_var_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_var_q16(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_var_q16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_var_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_var_q8(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes);
void plp_var_q8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes);
void plp_var_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes);
/**
 * Float `plp_std` entry point and its `s_xpulpv2` variant.
 *
 * Both take a read-only float input `pSrc`, a `blockSize` value and a float
 * result pointer `pRes`; the pointers are `__restrict__`-qualified.
 *
 * NOTE(review): presumably computes the standard deviation of `blockSize`
 * samples — confirm against the implementation.
 */
void plp_std_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);
void plp_std_f32s_xpulpv2(const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes);
/**
 * Fixed-point `plp_std` declarations for 32-, 16- and 8-bit data.
 *
 * Each width has a generic entry point plus `s_rv32im` and `s_xpulpv2`
 * variants with an identical signature. The result pointer has the same
 * element type as the input.
 *
 * NOTE(review): presumably `fracBits` is the number of fractional bits and
 * `blockSize` the element count of `pSrc` — confirm.
 */
void plp_std_q32(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_std_q32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_std_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_std_q16(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_std_q16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_std_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_std_q8(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes);
void plp_std_q8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes);
void plp_std_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes);
/**
 * Float `plp_rms` entry point and its `s_xpulpv2` variant.
 *
 * Both take a read-only float input `pSrc`, a `blockSize` value and a float
 * result pointer `pRes`; the pointers are `__restrict__`-qualified.
 *
 * NOTE(review): presumably computes the root mean square of `blockSize`
 * samples — confirm against the implementation.
 */
void plp_rms_f32(const float *__restrict__ pSrc, uint32_t blockSize, float *__restrict__ pRes);
void plp_rms_f32s_xpulpv2(const float *__restrict__ pSrc,
uint32_t blockSize,
float *__restrict__ pRes);
/**
 * Fixed-point `plp_rms` declarations for 32-, 16- and 8-bit data.
 *
 * Each width has a generic entry point plus `s_rv32im` and `s_xpulpv2`
 * variants with an identical signature. The result pointer has the same
 * element type as the input.
 *
 * NOTE(review): presumably `fracBits` is the number of fractional bits and
 * `blockSize` the element count of `pSrc` — confirm.
 */
void plp_rms_q32(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_rms_q32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_rms_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_rms_q16(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_rms_q16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_rms_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_rms_q8(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes);
void plp_rms_q8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes);
void plp_rms_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t blockSize,
uint32_t fracBits,
int8_t *__restrict__ pRes);
/**
 * Fixed-point `plp_sqrt` declarations for 32- and 16-bit data.
 *
 * Each width has a generic entry point plus `s_rv32im` and `s_xpulpv2`
 * variants with an identical signature: a read-only input pointer, a
 * `fracBits` value and a result pointer of the same element type. No length
 * argument is taken.
 *
 * NOTE(review): the absence of a length suggests a single value is read from
 * `pSrc` and a single result written to `pRes` — confirm against the
 * implementation.
 */
void plp_sqrt_q32(const int32_t *__restrict__ pSrc,
const uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_sqrt_q32s_rv32im(const int32_t *__restrict__ pSrc,
const uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_sqrt_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
const uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_sqrt_q16(const int16_t *__restrict__ pSrc,
const uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_sqrt_q16s_rv32im(const int16_t *__restrict__ pSrc,
const uint32_t fracBits,
int16_t *__restrict__ pRes);
void plp_sqrt_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
const uint32_t fracBits,
int16_t *__restrict__ pRes);
/**
 * Float `plp_sqrt` entry point with `s_rv32im` and `s_xpulpv2` variants.
 *
 * All three take a read-only float pointer `pSrc` and a float result pointer
 * `pRes`, both `__restrict__`-qualified, and no length argument.
 *
 * NOTE(review): presumably a single value is read and a single square root
 * written — confirm against the implementation.
 */
void plp_sqrt_f32(const float *__restrict__ pSrc,
float *__restrict__ pRes);
void plp_sqrt_f32s_rv32im(const float *__restrict__ pSrc, float *__restrict__ pRes);
void plp_sqrt_f32s_xpulpv2(const float *__restrict__ pSrc,
float *__restrict__ pRes);
/*
 * Constants for the fast-math routines.
 *
 * NOTE(review): their use is not visible in this header; presumably they
 * parameterise the lookup tables behind the sin/cos functions — confirm
 * against the implementation.
 */
/* Table size constant (512 == 2^9). */
#define FAST_MATH_TABLE_SIZE 512
/* Shift amounts: 22 bits for the 32-bit format, 6 bits for the 16-bit format. */
#define FAST_MATH_Q32_SHIFT (32 - 10)
#define FAST_MATH_Q16_SHIFT (16 - 10)
/* Evaluates to 23. */
#define CONTROLLER_Q32_SHIFT (32 - 9)
/* Spacing constants: 0x400000 == 2^22 and 0x80 == 2^7. */
#define TABLE_SPACING_Q32 0x400000
#define TABLE_SPACING_Q16 0x80
/**
 * `plp_cos` declarations for 32-bit and 16-bit fixed-point and float32 data.
 *
 * Every function takes its argument by value and returns a value of the same
 * type. The fixed-point versions have `s_rv32im` and `s_xpulpv2` variants;
 * the float version has only `s_xpulpv2`.
 *
 * NOTE(review): the input scaling of the fixed-point argument (how the
 * integer range maps to an angle) is not visible here — confirm against the
 * implementation before relying on it.
 */
int32_t plp_cos_q32(int32_t x);
int32_t plp_cos_q32s_rv32im(int32_t x);
int32_t plp_cos_q32s_xpulpv2(int32_t x);
int16_t plp_cos_q16(int16_t x);
int16_t plp_cos_q16s_rv32im(int16_t x);
int16_t plp_cos_q16s_xpulpv2(int16_t x);
float32_t plp_cos_f32(float32_t x);
float32_t plp_cos_f32s_xpulpv2(float32_t x);
/**
 * `plp_sin` declarations for 32-bit and 16-bit fixed-point and float32 data.
 *
 * Every function takes its argument by value and returns a value of the same
 * type. The fixed-point versions have `s_rv32im` and `s_xpulpv2` variants;
 * the float version has only `s_xpulpv2`.
 *
 * NOTE(review): the input scaling of the fixed-point argument (how the
 * integer range maps to an angle) is not visible here — confirm against the
 * implementation before relying on it.
 */
int32_t plp_sin_q32(int32_t x);
int32_t plp_sin_q32s_rv32im(int32_t x);
int32_t plp_sin_q32s_xpulpv2(int32_t x);
int16_t plp_sin_q16(int16_t x);
int16_t plp_sin_q16s_rv32im(int16_t x);
int16_t plp_sin_q16s_xpulpv2(int16_t x);
float32_t plp_sin_f32(float32_t x);
float32_t plp_sin_f32s_xpulpv2(float32_t x);
/**
 * Integer `plp_correlate` declarations for 32-, 16- and 8-bit inputs.
 *
 * Every function takes two read-only input pointers with their lengths
 * (`pSrcA`/`srcALen`, `pSrcB`/`srcBLen`) and an `int32_t *` result pointer,
 * regardless of input width. Only `plp_correlate_i32s_xpulpv2` declares its
 * pointers `__restrict__`. The 8-bit family additionally declares
 * `plp_correlate_valid_i8` with the same signature.
 *
 * NOTE(review): the required size of the `pRes` buffer and the meaning of
 * the `valid` variant are not visible here; presumably full correlation
 * yields srcALen + srcBLen - 1 outputs — confirm against the implementation.
 */
void plp_correlate_i32(const int32_t *pSrcA,
const uint32_t srcALen,
const int32_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_correlate_i32s_rv32im(const int32_t *pSrcA,
const uint32_t srcALen,
const int32_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_correlate_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const uint32_t srcALen,
const int32_t *__restrict__ pSrcB,
const uint32_t srcBLen,
int32_t *__restrict__ pRes);
void plp_correlate_i16(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_correlate_i16s_xpulpv2(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_correlate_i16s_rv32im(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_correlate_i8(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_correlate_valid_i8(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_correlate_i8s_xpulpv2(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_correlate_i8s_rv32im(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
/**
 * Fixed-point `plp_correlate` declarations for 32-, 16- and 8-bit inputs.
 *
 * Signatures match the integer correlate functions with an extra `fracBits`
 * argument placed before the result pointer. The result pointer is
 * `int32_t *` for every input width. Only `plp_correlate_q32s_xpulpv2`
 * declares its pointers `__restrict__`. The 8-bit family additionally
 * declares `plp_correlate_valid_q8`.
 *
 * NOTE(review): presumably `fracBits` is the number of fractional bits of
 * the fixed-point format; required `pRes` size is not visible here —
 * confirm against the implementation.
 */
void plp_correlate_q32(const int32_t *pSrcA,
const uint32_t srcALen,
const int32_t *pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *pRes);
void plp_correlate_q32s_rv32im(const int32_t *pSrcA,
const uint32_t srcALen,
const int32_t *pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *pRes);
void plp_correlate_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const uint32_t srcALen,
const int32_t *__restrict__ pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_correlate_q16(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *pRes);
void plp_correlate_q16s_xpulpv2(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *pRes);
void plp_correlate_q16s_rv32im(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *pRes);
void plp_correlate_q8(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *pRes);
void plp_correlate_valid_q8(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *pRes);
void plp_correlate_q8s_xpulpv2(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *pRes);
void plp_correlate_q8s_rv32im(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
const uint32_t fracBits,
int32_t *pRes);
/**
 * 32-bit integer convolution declarations.
 *
 * All five functions take two read-only input pointers with their lengths
 * and an `int32_t *` result pointer. `plp_conv_i32` and `plp_conv_valid_i32`
 * are the unsuffixed entry points; the `s_rv32im` / `s_xpulpv2` functions
 * are the suffixed variants. Only the xpulpv2 variants declare their
 * pointers `__restrict__`; no `plp_conv_valid_i32s_rv32im` is declared here.
 *
 * NOTE(review): presumably the plain form produces the full convolution
 * (srcALen + srcBLen - 1 outputs) and `valid` only the fully-overlapping
 * part — confirm against the implementation.
 */
void plp_conv_i32(const int32_t *pSrcA,
const uint32_t srcALen,
const int32_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_valid_i32(const int32_t *pSrcA,
const uint32_t srcALen,
const int32_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_i32s_rv32im(const int32_t *pSrcA,
const uint32_t srcALen,
const int32_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const uint32_t srcALen,
const int32_t *__restrict__ pSrcB,
const uint32_t srcBLen,
int32_t *__restrict__ pRes);
void plp_conv_valid_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const uint32_t srcALen,
const int32_t *__restrict__ pSrcB,
const uint32_t srcBLen,
int32_t *__restrict__ pRes);
/**
 * 16-bit integer convolution declarations.
 *
 * Inputs are `const int16_t *` with explicit lengths; results are written
 * through an `int32_t *`. Besides the plain and `valid` forms there is a
 * `valid_rep` form. Note that `plp_conv_valid_rep_i16s_xpulpv2` takes an
 * extra `srcAMem` argument after `srcALen` that the unsuffixed
 * `plp_conv_valid_rep_i16` does not have.
 *
 * NOTE(review): the meaning of `valid_rep` and of `srcAMem` is not visible
 * in this header — confirm against the implementation before calling the
 * kernel directly.
 */
void plp_conv_i16(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_valid_i16(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_valid_rep_i16(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_i16s_xpulpv2(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_valid_i16s_xpulpv2(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_valid_rep_i16s_xpulpv2(const int16_t *pSrcA,
const uint32_t srcALen,
const uint32_t srcAMem,
const int16_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_i16s_rv32im(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
/**
 * 8-bit integer convolution declarations.
 *
 * Inputs are `const int8_t *` with explicit lengths; results are written
 * through an `int32_t *`. Besides the plain and `valid` forms there is a
 * `valid_rep` form. Note that `plp_conv_valid_rep_i8s_xpulpv2` takes an
 * extra `srcAMem` argument after `srcALen` that the unsuffixed
 * `plp_conv_valid_rep_i8` does not have.
 *
 * NOTE(review): the meaning of `valid_rep` and of `srcAMem` is not visible
 * in this header — confirm against the implementation before calling the
 * kernel directly.
 */
void plp_conv_i8(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_valid_i8(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_valid_rep_i8(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_i8s_xpulpv2(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_valid_i8s_xpulpv2(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_valid_rep_i8s_xpulpv2(const int8_t *pSrcA,
const uint32_t srcALen,
const uint32_t srcAMem,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
void plp_conv_i8s_rv32im(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
int32_t *pRes);
/**
 * Parallel integer convolution declarations (32-, 16- and 8-bit inputs).
 *
 * Each `_parallel` entry point mirrors its single-core counterpart with an
 * added `nPE` argument, which is a `uint8_t` here. Each `p_xpulpv2` function
 * takes one untyped `task_args` pointer. `plp_conv_parallel_OLA` and
 * `plp_conv_parallel_OLA_kernel` operate on an `int32_t` results buffer and
 * an untyped argument pointer respectively.
 *
 * NOTE(review): presumably `nPE` is the number of cores, `task_args` points
 * to the matching `plp_conv_instance_*` structure, and "OLA" stands for an
 * overlap-add step that merges per-core partial results — none of this is
 * visible in the header; confirm against the implementation.
 */
void plp_conv_i32_parallel(const int32_t *pSrcA,
const uint32_t srcALen,
const int32_t *pSrcB,
const uint32_t srcBLen,
const uint8_t nPE,
int32_t *pRes);
void plp_conv_i32p_xpulpv2(void *task_args);
void plp_conv_i16_parallel(const int16_t *pSrcA,
const uint32_t srcALen,
const int16_t *pSrcB,
const uint32_t srcBLen,
const uint8_t nPE,
int32_t *pRes);
void plp_conv_i16p_xpulpv2(void *task_args);
void plp_conv_i8_parallel(const int8_t *pSrcA,
const uint32_t srcALen,
const int8_t *pSrcB,
const uint32_t srcBLen,
const uint8_t nPE,
int32_t *pRes);
void plp_conv_i8p_xpulpv2(void *task_args);
void plp_conv_parallel_OLA(uint32_t nPE,
uint32_t srcALen,
uint32_t srcBLen,
int32_t *resultsBuffer);
void plp_conv_parallel_OLA_kernel(void *task_args);
/**
 * Single-core integer matrix multiplication declarations for 32-, 16- and
 * 8-bit inputs.
 *
 * Each width has a generic entry point plus `s_rv32im` and `s_xpulpv2`
 * variants with an identical signature: two read-only operand pointers,
 * three dimension values `M`, `N`, `O`, and an `int32_t *` destination for
 * every input width. All pointers are `__restrict__`-qualified, so the
 * destination must not overlap either operand.
 *
 * NOTE(review): the roles of the dimensions are not visible here; presumably
 * A is M x N, B is N x O and C is M x O — confirm against the
 * implementation.
 */
void plp_mat_mult_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
/**
 * Parallel integer matrix multiplication declarations.
 *
 * Each `_parallel` entry point mirrors its single-core counterpart with an
 * added `nPE` argument before the destination pointer. The `p_xpulpv2`
 * functions take one untyped `args` pointer.
 *
 * NOTE(review): presumably `nPE` is the number of cores and `args` points to
 * a `plp_mat_mult_instance_*` structure — confirm against the
 * implementation.
 */
void plp_mat_mult_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_i32p_xpulpv2(void *args);
void plp_mat_mult_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_i16p_xpulpv2(void *args);
void plp_mat_mult_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
/**
 * Float matrix multiplication declarations.
 *
 * `plp_mat_mult_f32` and `plp_mat_mult_f32s_xpulpv2` share one signature:
 * two read-only float operands, dimensions `M`, `N`, `O` and a float
 * destination, all pointers `__restrict__`-qualified.
 * `plp_mat_mult_f32_parallel` adds `nPE` before the destination, and
 * `plp_mat_mult_f32p_xpulpv2` takes one untyped `args` pointer.
 *
 * NOTE(review): presumably A is M x N, B is N x O, C is M x O, `nPE` is the
 * number of cores and `args` points to an instance structure — confirm
 * against the implementation.
 */
void plp_mat_mult_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC);
void plp_mat_mult_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC);
void plp_mat_mult_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
float *__restrict__ pDstC);
void plp_mat_mult_f32p_xpulpv2(void *args);
/**
 * 8-bit integer parallel matrix multiplication function taking one untyped
 * `args` pointer.
 *
 * NOTE(review): presumably the per-core kernel behind
 * `plp_mat_mult_i8_parallel`, with `args` pointing to a
 * `plp_mat_mult_instance_i8` — confirm against the implementation.
 */
void plp_mat_mult_i8p_xpulpv2(void *args);
/**
 * 32-bit fixed-point matrix multiplication declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature:
 * two read-only `int32_t` operands, dimensions `M`, `N`, `O`, a `shift`
 * value and an `int32_t` destination. The `_parallel` entry point adds `nPE`
 * after `shift`; the `p_xpulpv2` function takes one untyped `args` pointer.
 *
 * NOTE(review): presumably `shift` is a right shift applied to each
 * accumulated result to rescale it, and A is M x N, B is N x O, C is M x O —
 * confirm against the implementation.
 */
void plp_mat_mult_q32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_q32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_q32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_q32p_xpulpv2(void *args);
/**
 * 16-bit fixed-point matrix multiplication declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature:
 * two read-only `int16_t` operands, dimensions `M`, `N`, `O`, a `shift`
 * value and an `int16_t` destination (unlike the integer versions, the
 * output is not widened). The `_parallel` entry point adds `nPE` after
 * `shift`; the `p_xpulpv2` function takes one untyped `args` pointer.
 *
 * NOTE(review): presumably `shift` is a right shift applied to each
 * accumulated result before it is narrowed to 16 bits — confirm against the
 * implementation.
 */
void plp_mat_mult_q16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_q16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC);
void plp_mat_mult_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_q16p_xpulpv2(void *args);
/**
 * 8-bit fixed-point matrix multiplication declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature:
 * two read-only `int8_t` operands, dimensions `M`, `N`, `O`, a `shift`
 * value and an `int8_t` destination (unlike the integer versions, the
 * output is not widened). The `_parallel` entry point adds `nPE` after
 * `shift`; the `p_xpulpv2` function takes one untyped `args` pointer.
 *
 * NOTE(review): presumably `shift` is a right shift applied to each
 * accumulated result before it is narrowed to 8 bits — confirm against the
 * implementation.
 */
void plp_mat_mult_q8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_q8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC);
void plp_mat_mult_q8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_q8p_xpulpv2(void *args);
/**
 * 32-bit integer complex matrix multiplication declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int32_t` operands, `M`, `N`, `O`, `int32_t` destination);
 * `_parallel` adds `nPE`, and `p_xpulpv2` takes one untyped `args` pointer.
 *
 * NOTE(review): the complex data layout is not visible in this header;
 * presumably real and imaginary parts are interleaved so each matrix holds
 * twice as many scalars as its dimensions suggest — confirm before sizing
 * buffers.
 */
void plp_mat_mult_cmplx_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i32p_xpulpv2(void *args);
/**
 * 16-bit integer complex matrix multiplication declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int16_t` operands, `M`, `N`, `O`, `int32_t` destination);
 * `_parallel` adds `nPE`, and `p_xpulpv2` takes one untyped `args` pointer.
 *
 * NOTE(review): the complex data layout is not visible in this header;
 * presumably real and imaginary parts are interleaved — confirm before
 * sizing buffers.
 */
void plp_mat_mult_cmplx_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i16p_xpulpv2(void *args);
/**
 * 8-bit integer complex matrix multiplication declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int8_t` operands, `M`, `N`, `O`, `int32_t` destination);
 * `_parallel` adds `nPE`, and `p_xpulpv2` takes one untyped `args` pointer.
 *
 * NOTE(review): the complex data layout is not visible in this header;
 * presumably real and imaginary parts are interleaved — confirm before
 * sizing buffers.
 */
void plp_mat_mult_cmplx_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_i8p_xpulpv2(void *args);
/**
 * Float complex matrix multiplication declarations.
 *
 * The generic and `s_xpulpv2` functions share one signature (two read-only
 * float operands, `M`, `N`, `O`, float destination); `_parallel` adds `nPE`,
 * and `p_xpulpv2` takes one untyped `args` pointer. No `s_rv32im` variant is
 * declared here.
 *
 * NOTE(review): the complex data layout is not visible in this header;
 * presumably real and imaginary parts are interleaved — confirm before
 * sizing buffers.
 */
void plp_mat_mult_cmplx_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC);
void plp_mat_mult_cmplx_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC);
void plp_mat_mult_cmplx_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
float *__restrict__ pDstC);
void plp_mat_mult_cmplx_f32p_xpulpv2(void *args);
/**
 * 32-bit fixed-point complex matrix multiplication declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int32_t` operands, `M`, `N`, `O`, `shift`, `int32_t`
 * destination); `_parallel` adds `nPE` after `shift`, and `p_xpulpv2` takes
 * one untyped `args` pointer.
 *
 * NOTE(review): presumably `shift` rescales each accumulated result and the
 * complex values are stored interleaved — confirm against the
 * implementation.
 */
void plp_mat_mult_cmplx_q32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q32p_xpulpv2(void *args);
/**
 * 16-bit fixed-point complex matrix multiplication declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int16_t` operands, `M`, `N`, `O`, `shift`, `int16_t`
 * destination); `_parallel` adds `nPE` after `shift`, and `p_xpulpv2` takes
 * one untyped `args` pointer.
 *
 * NOTE(review): presumably `shift` rescales each accumulated result and the
 * complex values are stored interleaved — confirm against the
 * implementation.
 */
void plp_mat_mult_cmplx_q16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q16p_xpulpv2(void *args);
/**
 * 8-bit fixed-point complex matrix multiplication declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int8_t` operands, `M`, `N`, `O`, `shift`, `int8_t`
 * destination); `_parallel` adds `nPE` after `shift`, and `p_xpulpv2` takes
 * one untyped `args` pointer.
 *
 * NOTE(review): presumably `shift` rescales each accumulated result and the
 * complex values are stored interleaved — confirm against the
 * implementation.
 */
void plp_mat_mult_cmplx_q8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_q8p_xpulpv2(void *args);
/**
 * Single-core integer `mat_mult_trans` declarations for 32-, 16- and 8-bit
 * inputs.
 *
 * Each width has a generic entry point plus `s_rv32im` and `s_xpulpv2`
 * variants with an identical signature: two read-only operand pointers,
 * dimensions `M`, `N`, `O`, and an `int32_t *` destination for every input
 * width. All pointers are `__restrict__`-qualified.
 *
 * NOTE(review): which operand is transposed and how `M`, `N`, `O` map to the
 * stored layouts is not visible here; presumably the second operand is
 * supplied already transposed (O x N) — confirm against the implementation.
 */
void plp_mat_mult_trans_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
/**
 * Parallel integer `mat_mult_trans` declarations for 32-, 16- and 8-bit
 * inputs.
 *
 * Each `_parallel` entry point mirrors its single-core counterpart with an
 * added `nPE` argument before the destination pointer; each `p_xpulpv2`
 * function takes one untyped `args` pointer.
 *
 * NOTE(review): presumably `nPE` is the number of cores and `args` points to
 * an instance structure holding the same arguments — confirm against the
 * implementation.
 */
void plp_mat_mult_trans_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i32p_xpulpv2(void *args);
void plp_mat_mult_trans_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i16p_xpulpv2(void *args);
void plp_mat_mult_trans_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_i8p_xpulpv2(void *args);
/**
 * 32-bit fixed-point `mat_mult_trans` declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int32_t` operands, `M`, `N`, `O`, `shift`, `int32_t`
 * destination); `_parallel` adds `nPE` after `shift`, and `p_xpulpv2` takes
 * one untyped `args` pointer.
 *
 * NOTE(review): presumably the second operand is supplied transposed and
 * `shift` rescales each accumulated result — confirm against the
 * implementation.
 */
void plp_mat_mult_trans_q32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_q32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_q32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_q32p_xpulpv2(void *args);
/**
 * 16-bit fixed-point `mat_mult_trans` declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int16_t` operands, `M`, `N`, `O`, `shift`, `int16_t`
 * destination); `_parallel` adds `nPE` after `shift`, and `p_xpulpv2` takes
 * one untyped `args` pointer.
 *
 * NOTE(review): presumably the second operand is supplied transposed and
 * `shift` rescales each accumulated result — confirm against the
 * implementation.
 */
void plp_mat_mult_trans_q16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_q16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_q16p_xpulpv2(void *args);
/**
 * 8-bit fixed-point `mat_mult_trans` declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int8_t` operands, `M`, `N`, `O`, `shift`, `int8_t`
 * destination); `_parallel` adds `nPE` after `shift`, and `p_xpulpv2` takes
 * one untyped `args` pointer.
 *
 * NOTE(review): presumably the second operand is supplied transposed and
 * `shift` rescales each accumulated result — confirm against the
 * implementation.
 */
void plp_mat_mult_trans_q8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_q8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_q8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_q8p_xpulpv2(void *args);
/**
 * Float `mat_mult_trans` declarations.
 *
 * The generic and `s_xpulpv2` functions share one signature (two read-only
 * float operands, `M`, `N`, `O`, float destination); `_parallel` adds `nPE`,
 * and `p_xpulpv2` takes one untyped `args` pointer. No `s_rv32im` variant is
 * declared here.
 *
 * NOTE(review): presumably the second operand is supplied transposed —
 * confirm against the implementation.
 */
void plp_mat_mult_trans_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC);
void plp_mat_mult_trans_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC);
void plp_mat_mult_trans_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
float *__restrict__ pDstC);
void plp_mat_mult_trans_f32p_xpulpv2(void *args);
/**
 * 32-bit integer complex `mat_mult_trans` declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int32_t` operands, `M`, `N`, `O`, `int32_t` destination);
 * `_parallel` adds `nPE`, and `p_xpulpv2` takes one untyped `args` pointer.
 *
 * NOTE(review): presumably the second operand is supplied transposed and
 * complex values are stored as interleaved real/imaginary pairs — confirm
 * against the implementation.
 */
void plp_mat_mult_trans_cmplx_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i32p_xpulpv2(void *args);
/**
 * 16-bit integer complex `mat_mult_trans` declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int16_t` operands, `M`, `N`, `O`, `int32_t` destination);
 * `_parallel` adds `nPE`, and `p_xpulpv2` takes one untyped `args` pointer.
 *
 * NOTE(review): presumably the second operand is supplied transposed and
 * complex values are stored as interleaved real/imaginary pairs — confirm
 * against the implementation.
 */
void plp_mat_mult_trans_cmplx_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i16p_xpulpv2(void *args);
/**
 * 8-bit integer complex `mat_mult_trans` declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int8_t` operands, `M`, `N`, `O`, `int32_t` destination);
 * `_parallel` adds `nPE`, and `p_xpulpv2` takes one untyped `args` pointer.
 *
 * NOTE(review): presumably the second operand is supplied transposed and
 * complex values are stored as interleaved real/imaginary pairs — confirm
 * against the implementation.
 */
void plp_mat_mult_trans_cmplx_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_i8p_xpulpv2(void *args);
/**
 * Float complex `mat_mult_trans` declarations.
 *
 * The generic and `s_xpulpv2` functions share one signature (two read-only
 * float operands, `M`, `N`, `O`, float destination); `_parallel` adds `nPE`,
 * and `p_xpulpv2` takes one untyped `args` pointer. No `s_rv32im` variant is
 * declared here.
 *
 * NOTE(review): presumably the second operand is supplied transposed and
 * complex values are stored as interleaved real/imaginary pairs — confirm
 * against the implementation.
 */
void plp_mat_mult_trans_cmplx_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
float *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t nPE,
float *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_f32p_xpulpv2(void *args);
/**
 * 32-bit fixed-point complex `mat_mult_trans` declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int32_t` operands, `M`, `N`, `O`, `shift`, `int32_t`
 * destination); `_parallel` adds `nPE` after `shift`, and `p_xpulpv2` takes
 * one untyped `args` pointer.
 *
 * NOTE(review): presumably the second operand is supplied transposed,
 * complex values are interleaved and `shift` rescales each accumulated
 * result — confirm against the implementation.
 */
void plp_mat_mult_trans_cmplx_q32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q32p_xpulpv2(void *args);
/**
 * 16-bit fixed-point complex `mat_mult_trans` declarations.
 *
 * The generic, `s_rv32im` and `s_xpulpv2` functions share one signature
 * (two read-only `int16_t` operands, `M`, `N`, `O`, `shift`, `int16_t`
 * destination); `_parallel` adds `nPE` after `shift`, and `p_xpulpv2` takes
 * one untyped `args` pointer.
 *
 * NOTE(review): presumably the second operand is supplied transposed,
 * complex values are interleaved and `shift` rescales each accumulated
 * result — confirm against the implementation.
 */
void plp_mat_mult_trans_cmplx_q16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q16p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_mult_trans_cmplx_q8 family.
 *
 * Shared parameters: const int8_t sources pSrcA / pSrcB, dimensions M, N, O,
 * an unsigned `shift`, and an int8_t destination pDstC (same width as input).
 * Variants declared here: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and
 * the p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): `shift` is presumably the fixed-point right shift applied to
 * each accumulated result - confirm against the implementation.
 */
void plp_mat_mult_trans_cmplx_q8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_q8p_xpulpv2(void *args);
/**
 * Prototypes for plp_cmplx_mag_f32 and its s_xpulpv2 variant.
 *
 * Both read from a const float32_t buffer (pSrc), write to pRes, and take a
 * sample count (numSamples). Unlike the matrix prototypes, the pointers are
 * not __restrict__-qualified.
 *
 * NOTE(review): presumably pSrc holds numSamples interleaved (re, im) pairs
 * and pRes receives numSamples magnitudes - confirm against the implementation.
 */
void plp_cmplx_mag_f32(const float32_t *pSrc,
float32_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_f32s_xpulpv2(const float32_t *pSrc,
float32_t *pRes,
uint32_t numSamples);
/**
 * Prototypes for plp_cmplx_mag_q32 (plain, s_rv32im, s_xpulpv2).
 *
 * All three take a const int32_t source (pSrc), a fracBits value, an int32_t
 * result buffer (pRes) and a sample count (numSamples).
 *
 * NOTE(review): fracBits presumably gives the number of fractional bits of the
 * fixed-point format - confirm against the implementation.
 */
void plp_cmplx_mag_q32(const int32_t *pSrc,
const uint32_t fracBits,
int32_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_q32s_rv32im(const int32_t *pSrc,
const uint32_t fracBits,
int32_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_q32s_xpulpv2(const int32_t *pSrc,
const uint32_t fracBits,
int32_t *pRes,
uint32_t numSamples);
/**
 * Prototypes for plp_cmplx_mag_q8 (plain, s_rv32im, s_xpulpv2).
 *
 * All three take a const int8_t source (pSrc), a fracBits value, an int8_t
 * result buffer (pRes) and a sample count (numSamples).
 *
 * NOTE(review): fracBits presumably gives the number of fractional bits of the
 * fixed-point format - confirm against the implementation.
 */
void plp_cmplx_mag_q8(const int8_t *pSrc,
const uint32_t fracBits,
int8_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_q8s_rv32im(const int8_t *pSrc,
const uint32_t fracBits,
int8_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_q8s_xpulpv2(const int8_t *pSrc,
const uint32_t fracBits,
int8_t *pRes,
uint32_t numSamples);
/**
 * Prototypes for plp_cmplx_mag_i16 (plain, s_rv32im, s_xpulpv2).
 *
 * All three take a const int16_t source (pSrc), an int16_t result buffer
 * (pRes) and a sample count (numSamples). The integer variants take no
 * fracBits argument.
 */
void plp_cmplx_mag_i16(const int16_t *pSrc,
int16_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_i16s_rv32im(const int16_t *pSrc,
int16_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_i16s_xpulpv2(const int16_t *pSrc,
int16_t *pRes,
uint32_t numSamples);
/**
 * Prototypes for plp_cmplx_mag_i32 (plain, s_rv32im, s_xpulpv2).
 *
 * All three take a const int32_t source (pSrc), an int32_t result buffer
 * (pRes) and a sample count (numSamples). The integer variants take no
 * fracBits argument.
 */
void plp_cmplx_mag_i32(const int32_t *pSrc,
int32_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_i32s_rv32im(const int32_t *pSrc,
int32_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_i32s_xpulpv2(const int32_t *pSrc,
int32_t *pRes,
uint32_t numSamples);
/**
 * Prototypes for plp_cmplx_mag_i8 (plain, s_rv32im, s_xpulpv2).
 *
 * All three take a const int8_t source (pSrc), an int8_t result buffer
 * (pRes) and a sample count (numSamples). The integer variants take no
 * fracBits argument.
 */
void plp_cmplx_mag_i8(const int8_t *pSrc,
int8_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_i8s_rv32im(const int8_t *pSrc,
int8_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_i8s_xpulpv2(const int8_t *pSrc,
int8_t *pRes,
uint32_t numSamples);
/**
 * Prototypes for plp_cmplx_mag_q16 (plain, s_rv32im, s_xpulpv2).
 *
 * All three take a const int16_t source (pSrc), a fracBits value, an int16_t
 * result buffer (pRes) and a sample count (numSamples).
 *
 * NOTE(review): fracBits presumably gives the number of fractional bits of the
 * fixed-point format - confirm against the implementation.
 */
void plp_cmplx_mag_q16(const int16_t *pSrc,
const uint32_t fracBits,
int16_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_q16s_rv32im(const int16_t *pSrc,
const uint32_t fracBits,
int16_t *pRes,
uint32_t numSamples);
void plp_cmplx_mag_q16s_xpulpv2(const int16_t *pSrc,
const uint32_t fracBits,
int16_t *pRes,
uint32_t numSamples);
/**
 * Prototypes for the 16-bit bit-reversal helpers.
 *
 * Each takes a mutable uint16_t buffer (pSrc), a table length (bitRevLen) and
 * a const uint16_t table (pBitRevTab). pSrc is not const, so the buffer is
 * expected to be modified by the callee. The p_xpulpv2 variant adds nPE.
 *
 * NOTE(review): presumably the reordering happens in place using index pairs
 * from pBitRevTab - confirm against the implementation.
 */
void plp_bitreversal_16s_rv32im(uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab);
void plp_bitreversal_16s_xpulpv2(uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab);
void plp_bitreversal_16p_xpulpv2(uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab,
uint32_t nPE);
/**
 * Prototypes for the plp_cfft_q16 family (fixed-point CFFT/CIFFT).
 *
 * Shared parameters: a const plp_cfft_instance_q16 pointer (S), a mutable
 * int16_t data buffer (p1), two uint8_t flags (ifftFlag, bitReverseFlag) and
 * deciPoint. The _parallel variant adds nPE; the p_xpulpv2 entry point takes
 * `void *args`.
 *
 * NOTE(review): p1 is presumably transformed in place, and deciPoint is
 * presumably the position of the fixed-point decimal point - confirm.
 */
void plp_cfft_q16(const plp_cfft_instance_q16 *S,
int16_t *p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t deciPoint);
void plp_cfft_q16_parallel(const plp_cfft_instance_q16 *S,
int16_t *p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t deciPoint,
uint32_t nPE);
void plp_cfft_q16s_rv32im(const plp_cfft_instance_q16 *S,
int16_t *p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t deciPoint);
void plp_cfft_q16s_xpulpv2(const plp_cfft_instance_q16 *S,
int16_t *p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t deciPoint);
void plp_cfft_q16p_xpulpv2(void *args);
/**
 * Prototypes for the 32-bit bit-reversal helpers.
 *
 * Each takes a mutable uint32_t buffer (pSrc), a table length (bitRevLen) and
 * a const uint16_t table (pBitRevTab); the table element type stays 16-bit
 * even though the data is 32-bit. The p_xpulpv2 variant adds nPE.
 */
void plp_bitreversal_32s_rv32im(uint32_t *pSrc, const uint16_t bitRevLen, const uint16_t *pBitRevTab);
void plp_bitreversal_32s_xpulpv2(uint32_t *pSrc, const uint16_t bitRevLen, const uint16_t *pBitRevTab);
void plp_bitreversal_32p_xpulpv2(uint32_t *pSrc, const uint16_t bitRevLen, const uint16_t *pBitRevTab, uint32_t nPE);
/**
 * Prototypes for the plp_cfft_q32 family (fixed-point CFFT/CIFFT).
 *
 * Shared parameters: a const plp_cfft_instance_q32 pointer (S), a mutable
 * int32_t data buffer (p1), two uint8_t flags (ifftFlag, bitReverseFlag) and
 * fracBits. The _parallel variant adds nPE; the p_xpulpv2 entry point takes
 * `void *args`.
 *
 * NOTE(review): p1 is presumably transformed in place - confirm against the
 * implementation.
 */
void plp_cfft_q32(const plp_cfft_instance_q32 *S,
int32_t *p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t fracBits);
void plp_cfft_q32_parallel( const plp_cfft_instance_q32 *S,
int32_t *p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t fracBits,
uint32_t nPE );
void plp_cfft_q32s_rv32im(const plp_cfft_instance_q32 *S,
int32_t *p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t fracBits);
void plp_cfft_q32s_xpulpv2(const plp_cfft_instance_q32 *S,
int32_t *p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
uint32_t fracBits);
void plp_cfft_q32p_xpulpv2(void *args);
/**
 * Prototypes for the plp_rfft_f32 family.
 *
 * Shared parameters: a const plp_fft_instance_f32 pointer (S), a const
 * float32_t input (pSrc) and a separate float32_t output (pDst); both data
 * pointers are __restrict__-qualified, so input and output must not overlap.
 * The _parallel variant adds nPE (placed before pDst); the p_xpulpv2 entry
 * point takes `void *arg`.
 */
void plp_rfft_f32(const plp_fft_instance_f32 *S,
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst);
void plp_rfft_f32_parallel(const plp_fft_instance_f32 *S,
const float32_t *__restrict__ pSrc,
const uint32_t nPE,
float32_t *__restrict__ pDst);
void plp_rfft_f32s_xpulpv2(const plp_fft_instance_f32 *S,
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst);
void plp_rfft_f32p_xpulpv2(void *arg);
/**
 * Prototypes for the plp_rfftfast_f32 family.
 *
 * Shared parameters: a const plp_fft_fast_instance_f32 pointer (S), an input
 * buffer (pSrc) and an output buffer (pDst). The _parallel variant adds nPE
 * as its last parameter; the p_xpulpv2 entry point takes `void *arg`.
 *
 * NOTE(review): pSrc is const only in plp_rfftfast_f32; the _parallel and
 * s_xpulpv2 variants take a non-const pSrc, and s_xpulpv2 drops __restrict__.
 * This may mean the kernels use pSrc as scratch space - confirm before relying
 * on the input being preserved.
 */
void plp_rfftfast_f32( const plp_fft_fast_instance_f32 *S,
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst);
void plp_rfftfast_f32_parallel( const plp_fft_fast_instance_f32 *S,
float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
const uint32_t nPE);
void plp_rfftfast_f32s_xpulpv2( const plp_fft_fast_instance_f32 *S,
float32_t *pSrc,
float32_t *pDst);
void plp_rfftfast_f32p_xpulpv2( void *arg );
/**
 * Prototypes for the plp_cfft_f32 family (floating-point CFFT/CIFFT).
 *
 * Shared parameters: a const plp_cfft_instance_f32 pointer (S), a data buffer
 * (pSrc) and two uint8_t flags (ifftFlag, bitReverseFlag). There is no
 * separate destination parameter. The _parallel variant adds nPE; the
 * p_xpulpv2 entry point takes `void *arg`.
 *
 * NOTE(review): pSrc is non-const in plp_cfft_f32 but const in the _parallel
 * and s_xpulpv2 variants. With no output parameter the transform presumably
 * writes back into pSrc, which would make the const qualifier misleading -
 * confirm against the definitions before changing either side.
 */
void plp_cfft_f32( const plp_cfft_instance_f32 *S,
float32_t *pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
void plp_cfft_f32_parallel( const plp_cfft_instance_f32 *S,
const float32_t *pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag,
const uint32_t nPE);
void plp_cfft_f32s_xpulpv2( const plp_cfft_instance_f32 *S,
const float32_t *pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
void plp_cfft_f32p_xpulpv2(void *arg);
/**
 * Prototypes for plp_dct2_f32 and its _parallel variant.
 *
 * Parameters: a const plp_fft_instance_f32 pointer (S), a const
 * Complex_type_f32 pointer (pShift), a by-value uint8_t orthoNorm, a const
 * float32_t input (pSrc), and two writable float32_t buffers (pBuf, pDst).
 * The three data pointers are __restrict__-qualified and must not overlap.
 * The _parallel variant adds nPE.
 *
 * NOTE(review): pBuf looks like caller-provided scratch memory; its required
 * size is not visible from the prototype - confirm against the implementation.
 */
void plp_dct2_f32(const plp_fft_instance_f32 *S,
const Complex_type_f32 *pShift,
const uint8_t orthoNorm,
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pBuf,
float32_t *__restrict__ pDst);
void plp_dct2_f32_parallel(const plp_fft_instance_f32 *S,
const Complex_type_f32 *pShift,
const uint8_t orthoNorm,
const float32_t *__restrict__ pSrc,
const uint32_t nPE,
float32_t *__restrict__ pBuf,
float32_t *__restrict__ pDst);
/**
 * Prototypes for plp_mfcc_f32 and its _parallel variant.
 *
 * Parameters: two const plp_fft_instance_f32 pointers (SFFT, SDCT), a const
 * Complex_type_f32 pointer (pShift), a const plp_triangular_filter_f32 pointer
 * (filterBank), a const float32_t window, a const uint8_t pointer orthoNorm,
 * a const float32_t input (pSrc) and a float32_t output (pDst).
 * The _parallel variant adds nPE.
 *
 * NOTE(review): orthoNorm is a pointer here, whereas plp_dct2_f32 takes
 * orthoNorm by value - confirm whether the pointer is intentional.
 */
void plp_mfcc_f32(const plp_fft_instance_f32 *SFFT,
const plp_fft_instance_f32 *SDCT,
const Complex_type_f32 *pShift,
const plp_triangular_filter_f32 *filterBank,
const float32_t *window,
const uint8_t *orthoNorm,
const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst);
void plp_mfcc_f32_parallel(const plp_fft_instance_f32 *SFFT,
const plp_fft_instance_f32 *SDCT,
const Complex_type_f32 *pShift,
const plp_triangular_filter_f32 *filterBank,
const float32_t *window,
const uint8_t *orthoNorm,
const float32_t *__restrict__ pSrc,
const uint32_t nPE,
float32_t *__restrict__ pDst);
/**
 * Prototypes for the plp_dwt_{f32,q32,q16,q8} entry points.
 *
 * Each takes a const input (pSrc), its length, a wavelet descriptor of the
 * matching plp_dwt_wavelet_<type> type, a plp_dwt_extension_mode, and two
 * output buffers (pDstA, pDstD) of the same element type as the input.
 * All data pointers are __restrict__-qualified and must not overlap.
 *
 * NOTE(review): pDstA / pDstD presumably receive the approximation and detail
 * coefficients; required output sizes are not visible here - confirm.
 */
void plp_dwt_f32(const float32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_f32 wavelet,
plp_dwt_extension_mode mode,
float32_t *__restrict__ pDstA,
float32_t *__restrict__ pDstD);
void plp_dwt_q32(const int32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q32 wavelet,
plp_dwt_extension_mode mode,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD);
void plp_dwt_q16(const int16_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q16 wavelet,
plp_dwt_extension_mode mode,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD);
void plp_dwt_q8(const int8_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q8 wavelet,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD);
/**
 * Prototypes for plp_dwt_dec_f32 and its _parallel variant.
 *
 * Parameters: a const float32_t input (pSrc), its length, a
 * plp_dwt_wavelet_f32 descriptor, a plp_dwt_extension_mode, a `level` count,
 * a writable temporary buffer and a single output buffer (pDst).
 * The _parallel variant adds nPE. Note the temporary buffer is named pTmp in
 * the first prototype and pTemp in the second.
 *
 * NOTE(review): `level` is presumably the number of decomposition levels, with
 * all coefficients packed into pDst - confirm layout and buffer sizes.
 */
void plp_dwt_dec_f32(const float32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_f32 wavelet,
plp_dwt_extension_mode mode,
uint32_t level,
float32_t *__restrict__ pTmp,
float32_t *__restrict__ pDst);
void plp_dwt_dec_f32_parallel(const float32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_f32 wavelet,
plp_dwt_extension_mode mode,
uint32_t level,
uint32_t nPE,
float32_t *__restrict__ pTemp,
float32_t *__restrict__ pDst);
/**
 * Prototypes for the s_xpulpv2 DWT kernels, in f32, q32, q16 and q8.
 *
 * Each element type has two prototypes:
 *  - plp_dwt_<type>s_xpulpv2: takes pSrc, length, a wavelet descriptor, an
 *    extension mode and the two outputs pDstA / pDstD.
 *  - plp_dwt_haar_<type>s_xpulpv2: same list without the wavelet parameter.
 *
 * NOTE(review): the haar variants presumably hard-code the Haar wavelet, which
 * would explain the missing descriptor - confirm against the implementation.
 */
void plp_dwt_f32s_xpulpv2(const float32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_f32 wavelet,
plp_dwt_extension_mode mode,
float32_t *__restrict__ pDstA,
float32_t *__restrict__ pDstD);
void plp_dwt_haar_f32s_xpulpv2(const float32_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
float32_t *__restrict__ pDstA,
float32_t *__restrict__ pDstD);
void plp_dwt_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q32 wavelet,
plp_dwt_extension_mode mode,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD);
void plp_dwt_haar_q32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD);
void plp_dwt_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q16 wavelet,
plp_dwt_extension_mode mode,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD);
void plp_dwt_haar_q16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD);
void plp_dwt_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q8 wavelet,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD);
void plp_dwt_haar_q8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD);
/**
 * Prototypes for the plp_dwt_<type>_parallel entry points (f32, q8, q16, q32).
 *
 * Each takes a const input (pSrc), its length, a wavelet descriptor of the
 * matching type, a plp_dwt_extension_mode, nPE, and two outputs
 * (pDstA, pDstD) of the same element type as the input.
 *
 * NOTE(review): nPE is presumably the number of cores to use - confirm.
 */
void plp_dwt_f32_parallel(const float32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_f32 wavelet,
plp_dwt_extension_mode mode,
uint32_t nPE,
float32_t *__restrict__ pDstA,
float32_t *__restrict__ pDstD);
void plp_dwt_q8_parallel(const int8_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q8 wavelet,
plp_dwt_extension_mode mode,
uint32_t nPE,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD);
void plp_dwt_q16_parallel(const int16_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q16 wavelet,
plp_dwt_extension_mode mode,
uint32_t nPE,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD);
void plp_dwt_q32_parallel(const int32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q32 wavelet,
plp_dwt_extension_mode mode,
uint32_t nPE,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD);
/**
 * Prototypes for the p_xpulpv2 DWT entry points (generic and haar, per type).
 *
 * Every function takes a single opaque pointer and returns void. The
 * parameter is named `args` everywhere except plp_dwt_q32p_xpulpv2, which
 * names it `arg`; the name has no effect on callers.
 *
 * NOTE(review): the pointer presumably refers to a per-type argument struct
 * defined elsewhere in this header - confirm which struct each kernel expects.
 */
void plp_dwt_f32p_xpulpv2(void *args);
void plp_dwt_haar_f32p_xpulpv2(void *args);
void plp_dwt_q8p_xpulpv2(void *args);
void plp_dwt_haar_q8p_xpulpv2(void *args);
void plp_dwt_q16p_xpulpv2(void *args);
void plp_dwt_haar_q16p_xpulpv2(void *args);
void plp_dwt_q32p_xpulpv2(void *arg);
void plp_dwt_haar_q32p_xpulpv2(void *args);
/**
 * Prototypes for the s_rv32im DWT kernels, in q32, q16 and q8.
 *
 * Each element type has two prototypes:
 *  - plp_dwt_<type>s_rv32im: takes pSrc, length, a wavelet descriptor, an
 *    extension mode and the two outputs pDstA / pDstD.
 *  - plp_dwt_haar_<type>s_rv32im: same list without the wavelet parameter.
 * No f32 rv32im kernel is declared in this group.
 */
void plp_dwt_q32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q32 wavelet,
plp_dwt_extension_mode mode,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD);
void plp_dwt_haar_q32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int32_t *__restrict__ pDstA,
int32_t *__restrict__ pDstD);
void plp_dwt_q16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q16 wavelet,
plp_dwt_extension_mode mode,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD);
void plp_dwt_haar_q16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int16_t *__restrict__ pDstA,
int16_t *__restrict__ pDstD);
void plp_dwt_q8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t length,
const plp_dwt_wavelet_q8 wavelet,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD);
void plp_dwt_haar_q8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t length,
plp_dwt_extension_mode mode,
int8_t *__restrict__ pDstA,
int8_t *__restrict__ pDstD);
/**
 * Prototypes for the plp_mat_add_i32 family.
 *
 * Shared parameters: const int32_t sources pSrcA / pSrcB, dimensions M and N,
 * and an int32_t destination pDst. All pointers are __restrict__-qualified.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably element-wise addition of two M x N matrices;
 * overflow behaviour is not visible from the prototype - confirm.
 */
void plp_mat_add_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst);
void plp_mat_add_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst);
void plp_mat_add_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst);
void plp_mat_add_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_add_i32p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_add_i16 family.
 *
 * Shared parameters: const int16_t sources pSrcA / pSrcB, dimensions M and N,
 * and an int16_t destination pDst. All pointers are __restrict__-qualified.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably element-wise addition of two M x N matrices;
 * overflow behaviour is not visible from the prototype - confirm.
 */
void plp_mat_add_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst);
void plp_mat_add_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst);
void plp_mat_add_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst);
void plp_mat_add_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_add_i16p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_add_i8 family.
 *
 * Shared parameters: const int8_t sources pSrcA / pSrcB, dimensions M and N,
 * and an int8_t destination pDst. All pointers are __restrict__-qualified.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably element-wise addition of two M x N matrices;
 * overflow behaviour is not visible from the prototype - confirm.
 */
void plp_mat_add_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst);
void plp_mat_add_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst);
void plp_mat_add_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst);
void plp_mat_add_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int8_t *__restrict__ pDst);
void plp_mat_add_i8p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_add_f32 family.
 *
 * Shared parameters: const float sources pSrcA / pSrcB, dimensions M and N,
 * and a float destination pDst. All pointers are __restrict__-qualified.
 * Variants: plain, s_xpulpv2, _parallel (adds nPE) and the p_xpulpv2 entry
 * point taking `void *args`. No s_rv32im variant is declared for f32.
 */
void plp_mat_add_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
float *__restrict__ pDst);
void plp_mat_add_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
float *__restrict__ pDst);
void plp_mat_add_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
float *__restrict__ pDst);
void plp_mat_add_f32p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_sub_i32 family.
 *
 * Shared parameters: const int32_t sources pSrcA / pSrcB, dimensions M and N,
 * and an int32_t destination pDst. All pointers are __restrict__-qualified.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably computes pSrcA - pSrcB element-wise; operand order
 * is not visible from the prototype - confirm.
 */
void plp_mat_sub_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst);
void plp_mat_sub_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst);
void plp_mat_sub_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst);
void plp_mat_sub_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_sub_i32p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_sub_i16 family.
 *
 * Shared parameters: const int16_t sources pSrcA / pSrcB, dimensions M and N,
 * and an int16_t destination pDst. All pointers are __restrict__-qualified.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably computes pSrcA - pSrcB element-wise; operand order
 * is not visible from the prototype - confirm.
 */
void plp_mat_sub_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst);
void plp_mat_sub_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst);
void plp_mat_sub_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst);
void plp_mat_sub_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_sub_i16p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_sub_i8 family.
 *
 * Shared parameters: const int8_t sources pSrcA / pSrcB, dimensions M and N,
 * and an int8_t destination pDst. All pointers are __restrict__-qualified.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably computes pSrcA - pSrcB element-wise; operand order
 * is not visible from the prototype - confirm.
 */
void plp_mat_sub_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst);
void plp_mat_sub_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst);
void plp_mat_sub_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst);
void plp_mat_sub_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
int8_t *__restrict__ pDst);
void plp_mat_sub_i8p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_sub_f32 family.
 *
 * Shared parameters: const float sources pSrcA / pSrcB, dimensions M and N,
 * and a float destination pDst. All pointers are __restrict__-qualified.
 * Variants: plain, s_xpulpv2, _parallel (adds nPE) and the p_xpulpv2 entry
 * point taking `void *args`. No s_rv32im variant is declared for f32.
 */
void plp_mat_sub_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
float *__restrict__ pDst);
void plp_mat_sub_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
float *__restrict__ pDst);
void plp_mat_sub_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t nPE,
float *__restrict__ pDst);
void plp_mat_sub_f32p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_scale_i32 family.
 *
 * Shared parameters: a const int32_t source (pSrc), dimensions M and N, an
 * int32_t scaleFactor, a signed int32_t shift, and an int32_t destination.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably each element is multiplied by scaleFactor and then
 * shifted right by `shift`; the meaning of a negative shift is not visible
 * from the prototype - confirm.
 */
void plp_mat_scale_i32(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst);
void plp_mat_scale_i32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst);
void plp_mat_scale_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst);
void plp_mat_scale_i32_parallel(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t scaleFactor,
int32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_scale_i32p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_scale_i16 family.
 *
 * Shared parameters: a const int16_t source (pSrc), dimensions M and N, an
 * int16_t scaleFactor, a signed int32_t shift, and an int16_t destination.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably each element is multiplied by scaleFactor and then
 * shifted right by `shift`; the meaning of a negative shift is not visible
 * from the prototype - confirm.
 */
void plp_mat_scale_i16(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst);
void plp_mat_scale_i16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst);
void plp_mat_scale_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst);
void plp_mat_scale_i16_parallel(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t scaleFactor,
int32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_scale_i16p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_scale_i8 family.
 *
 * Shared parameters: a const int8_t source (pSrc), dimensions M and N, an
 * int8_t scaleFactor, a signed int32_t shift, and an int8_t destination.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably each element is multiplied by scaleFactor and then
 * shifted right by `shift`; the meaning of a negative shift is not visible
 * from the prototype - confirm.
 */
void plp_mat_scale_i8(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst);
void plp_mat_scale_i8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst);
void plp_mat_scale_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst);
void plp_mat_scale_i8_parallel(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t scaleFactor,
int32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDst);
void plp_mat_scale_i8p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_scale_f32 family.
 *
 * Shared parameters: a const float source (pSrc), dimensions M and N, a float
 * scaleFactor and a float destination (pDst). Unlike the integer variants
 * there is no `shift` parameter.
 * Variants: plain, s_xpulpv2, _parallel (adds nPE) and the p_xpulpv2 entry
 * point taking `void *args`.
 */
void plp_mat_scale_f32(const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
float scaleFactor,
float *__restrict__ pDst);
void plp_mat_scale_f32s_xpulpv2(const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
float scaleFactor,
float *__restrict__ pDst);
void plp_mat_scale_f32_parallel(const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
float scaleFactor,
uint32_t nPE,
float *__restrict__ pDst);
void plp_mat_scale_f32p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_trans_i32 family.
 *
 * Shared parameters: a const int32_t source (pSrc), dimensions M and N, and an
 * int32_t destination (pDst). Source and destination are __restrict__, so the
 * operation cannot be requested in place.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably transposes an M x N matrix into an N x M one -
 * confirm which of M / N is the row count.
 */
void plp_mat_trans_i32(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst);
void plp_mat_trans_i32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst);
void plp_mat_trans_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int32_t *__restrict__ pDst);
void plp_mat_trans_i32_parallel(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_trans_i32p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_trans_i16 family.
 *
 * Shared parameters: a const int16_t source (pSrc), dimensions M and N, and an
 * int16_t destination (pDst). Source and destination are __restrict__, so the
 * operation cannot be requested in place.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably transposes an M x N matrix into an N x M one -
 * confirm which of M / N is the row count.
 */
void plp_mat_trans_i16(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst);
void plp_mat_trans_i16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst);
void plp_mat_trans_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int16_t *__restrict__ pDst);
void plp_mat_trans_i16_parallel(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_trans_i16p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_trans_i8 family.
 *
 * Shared parameters: a const int8_t source (pSrc), dimensions M and N, and an
 * int8_t destination (pDst). Source and destination are __restrict__, so the
 * operation cannot be requested in place.
 * Variants: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably transposes an M x N matrix into an N x M one -
 * confirm which of M / N is the row count.
 */
void plp_mat_trans_i8(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst);
void plp_mat_trans_i8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst);
void plp_mat_trans_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
int8_t *__restrict__ pDst);
void plp_mat_trans_i8_parallel(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t nPE,
int8_t *__restrict__ pDst);
void plp_mat_trans_i8p_xpulpv2(void *args);
/**
 * Prototypes for plp_mat_trans_f32 and plp_mat_trans_f32_parallel.
 *
 * Both take a const float source (pSrc), dimensions M and N and a float
 * destination (pDst); the _parallel variant adds nPE.
 *
 * NOTE(review): unlike the integer transpose families, no s_xpulpv2 or
 * p_xpulpv2 prototype follows these two declarations - check whether the f32
 * kernels are declared elsewhere or are missing from the header.
 */
void plp_mat_trans_f32(const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
float *__restrict__ pDst);
void plp_mat_trans_f32_parallel(
const float *__restrict__ pSrc, uint32_t M, uint32_t N, uint32_t nPE, float *__restrict__ pDst);
/**
 * Prototypes for the plp_mat_inv_f32 family.
 *
 * These are the only prototypes in this part of the header that return int
 * rather than void. Each data-taking variant receives a non-const float
 * source (pSrc), a float destination (pDst) and a single dimension N; the
 * _parallel variant adds nPE, and the p_xpulpv2 entry point takes
 * `void *args`.
 *
 * NOTE(review): pSrc is not const, so the input matrix may be overwritten.
 * The return value is presumably a status code (e.g. for a singular matrix) -
 * confirm its values against the implementation.
 */
int plp_mat_inv_f32(float *__restrict__ pSrc, float *__restrict__ pDst, uint32_t N);
int plp_mat_inv_f32s_xpulpv2(float *__restrict__ pSrc, float *__restrict__ pDst, uint32_t N);
int plp_mat_inv_f32_parallel( float *__restrict__ pSrc,
float *__restrict__ pDst,
uint32_t N,
uint32_t nPE);
int plp_mat_inv_f32p_xpulpv2(void *args);
/**
 * Prototypes for the integer plp_mat_fill_I families (i32, i16, i8).
 *
 * Each data-taking variant receives a single dimension N and a destination
 * buffer of the matching integer type; there is no source operand.
 * Per type: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): presumably writes an N x N identity matrix into pDst -
 * confirm against the implementation.
 */
void plp_mat_fill_I_i32(uint32_t N, int32_t *__restrict__ pDst);
void plp_mat_fill_I_i32s_rv32im(uint32_t N, int32_t *__restrict__ pDst);
void plp_mat_fill_I_i32s_xpulpv2(uint32_t N, int32_t *__restrict__ pDst);
void plp_mat_fill_I_i32_parallel(uint32_t N, uint32_t nPE, int32_t *__restrict__ pDst);
void plp_mat_fill_I_i32p_xpulpv2(void *args);
void plp_mat_fill_I_i16(uint32_t N, int16_t *__restrict__ pDst);
void plp_mat_fill_I_i16s_rv32im(uint32_t N, int16_t *__restrict__ pDst);
void plp_mat_fill_I_i16s_xpulpv2(uint32_t N, int16_t *__restrict__ pDst);
void plp_mat_fill_I_i16_parallel(uint32_t N, uint32_t nPE, int16_t *__restrict__ pDst);
void plp_mat_fill_I_i16p_xpulpv2(void *args);
void plp_mat_fill_I_i8(uint32_t N, int8_t *__restrict__ pDst);
void plp_mat_fill_I_i8s_rv32im(uint32_t N, int8_t *__restrict__ pDst);
void plp_mat_fill_I_i8s_xpulpv2(uint32_t N, int8_t *__restrict__ pDst);
void plp_mat_fill_I_i8_parallel(uint32_t N, uint32_t nPE, int8_t *__restrict__ pDst);
void plp_mat_fill_I_i8p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_fill_I_f32 family.
 *
 * Each data-taking variant receives a dimension N and a float destination;
 * the _parallel variant adds nPE and the p_xpulpv2 entry point takes
 * `void *args`. No s_rv32im variant is declared for f32.
 */
void plp_mat_fill_I_f32(uint32_t N, float *__restrict__ pDst);
void plp_mat_fill_I_f32s_xpulpv2(uint32_t N, float *__restrict__ pDst);
void plp_mat_fill_I_f32_parallel(uint32_t N, uint32_t nPE, float *__restrict__ pDst);
void plp_mat_fill_I_f32p_xpulpv2(void *args);
/**
 * Prototypes for the fixed-point plp_mat_fill_I families (q32, q16, q8).
 *
 * Each data-taking variant receives a dimension N, a signed int32_t fracBits
 * and a destination buffer of the matching integer width.
 * Per type: plain, s_rv32im, s_xpulpv2, _parallel (adds nPE) and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): fracBits presumably selects the fixed-point representation of
 * 1.0 written on the diagonal (e.g. 1 << fracBits) - confirm, including what
 * happens when fracBits exceeds the element width.
 */
void plp_mat_fill_I_q32(uint32_t N, int32_t fracBits, int32_t *__restrict__ pDst);
void plp_mat_fill_I_q32s_rv32im(uint32_t N, int32_t fracBits, int32_t *__restrict__ pDst);
void plp_mat_fill_I_q32s_xpulpv2(uint32_t N, int32_t fracBits, int32_t *__restrict__ pDst);
void plp_mat_fill_I_q32_parallel(uint32_t N,
int32_t fracBits,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_fill_I_q32p_xpulpv2(void *args);
void plp_mat_fill_I_q16(uint32_t N, int32_t fracBits, int16_t *__restrict__ pDst);
void plp_mat_fill_I_q16s_rv32im(uint32_t N, int32_t fracBits, int16_t *__restrict__ pDst);
void plp_mat_fill_I_q16s_xpulpv2(uint32_t N, int32_t fracBits, int16_t *__restrict__ pDst);
void plp_mat_fill_I_q16_parallel(uint32_t N,
int32_t fracBits,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_fill_I_q16p_xpulpv2(void *args);
void plp_mat_fill_I_q8(uint32_t N, int32_t fracBits, int8_t *__restrict__ pDst);
void plp_mat_fill_I_q8s_rv32im(uint32_t N, int32_t fracBits, int8_t *__restrict__ pDst);
void plp_mat_fill_I_q8s_xpulpv2(uint32_t N, int32_t fracBits, int8_t *__restrict__ pDst);
void plp_mat_fill_I_q8_parallel(uint32_t N,
int32_t fracBits,
uint32_t nPE,
int8_t *__restrict__ pDst);
void plp_mat_fill_I_q8p_xpulpv2(void *args);
/**
 * Prototypes for the single-core plp_mat_mult_stride_i32 functions
 * (plain, s_rv32im, s_xpulpv2).
 *
 * Parameters: const int32_t sources pSrcA / pSrcB, dimensions M, N, O, one
 * stride per matrix (strideA, strideB, strideC) and an int32_t destination.
 *
 * NOTE(review): each stride is presumably the element distance between
 * consecutive rows of the corresponding matrix - confirm the unit (elements
 * vs. bytes) against the implementation.
 */
void plp_mat_mult_stride_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
/**
 * Prototypes for the single-core plp_mat_mult_stride_i16 functions
 * (plain, s_rv32im, s_xpulpv2).
 *
 * Parameters: const int16_t sources pSrcA / pSrcB, dimensions M, N, O, one
 * stride per matrix (strideA, strideB, strideC) and an int32_t destination,
 * i.e. the output element type is wider than the inputs.
 *
 * NOTE(review): each stride is presumably the element distance between
 * consecutive rows of the corresponding matrix - confirm the unit.
 */
void plp_mat_mult_stride_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
/**
 * Prototypes for the single-core plp_mat_mult_stride_i8 functions
 * (plain, s_rv32im, s_xpulpv2).
 *
 * Parameters: const int8_t sources pSrcA / pSrcB, dimensions M, N, O, one
 * stride per matrix (strideA, strideB, strideC) and an int32_t destination,
 * i.e. the output element type is wider than the inputs.
 *
 * NOTE(review): each stride is presumably the element distance between
 * consecutive rows of the corresponding matrix - confirm the unit.
 */
void plp_mat_mult_stride_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
/**
 * Prototypes for the integer plp_mat_mult_stride _parallel entry points
 * (i32, i16, i8) and the i32 / i16 p_xpulpv2 entry points.
 *
 * Each _parallel function takes the two const sources, dimensions M, N, O,
 * the three strides, nPE, and an int32_t destination regardless of the input
 * width. The p_xpulpv2 entry points take `void *args`.
 * The i8 p_xpulpv2 prototype is not in this group; it is declared separately
 * further down in the header.
 */
void plp_mat_mult_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_i32p_xpulpv2(void *args);
void plp_mat_mult_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_i16p_xpulpv2(void *args);
void plp_mat_mult_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
/**
 * Prototypes for the plp_mat_mult_stride_f32 family.
 *
 * Shared parameters: const float sources pSrcA / pSrcB, dimensions M, N, O,
 * one stride per matrix (strideA, strideB, strideC) and a float destination.
 * Variants: plain, s_xpulpv2, _parallel (adds nPE) and the p_xpulpv2 entry
 * point taking `void *args`. No s_rv32im variant is declared for f32.
 */
void plp_mat_mult_stride_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC);
void plp_mat_mult_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC);
void plp_mat_mult_stride_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
float *__restrict__ pDstC);
void plp_mat_mult_stride_f32p_xpulpv2(void *args);
/**
 * p_xpulpv2 entry point of the i8 strided matrix multiplication; takes one
 * opaque `void *args`.
 * NOTE(review): declared here, apart from the other i8 stride prototypes;
 * args presumably points to the i8 matrix-multiplication instance struct -
 * confirm.
 */
void plp_mat_mult_stride_i8p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_mult_stride_q32 family.
 *
 * Shared parameters: const int32_t sources pSrcA / pSrcB, dimensions M, N, O,
 * the three strides, an unsigned `shift`, and an int32_t destination.
 * Variants: plain, _parallel (adds nPE), s_rv32im, s_xpulpv2 and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): `shift` is presumably the fixed-point right shift applied to
 * each accumulated result - confirm against the implementation.
 */
void plp_mat_mult_stride_q32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_q32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_q32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_stride_q32p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_mult_stride_q16 family.
 *
 * Shared parameters: const int16_t sources pSrcA / pSrcB, dimensions M, N, O,
 * the three strides, an unsigned `shift`, and an int16_t destination.
 * Variants: plain, _parallel (adds nPE), s_rv32im, s_xpulpv2 and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): `shift` is presumably the fixed-point right shift applied to
 * each accumulated result - confirm against the implementation.
 */
void plp_mat_mult_stride_q16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_stride_q16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC);
void plp_mat_mult_stride_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_stride_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_stride_q16p_xpulpv2(void *args);
/**
 * Prototypes for the plp_mat_mult_stride_q8 family.
 *
 * Shared parameters: const int8_t sources pSrcA / pSrcB, dimensions M, N, O,
 * the three strides, an unsigned `shift`, and an int8_t destination.
 * Variants: plain, _parallel (adds nPE), s_rv32im, s_xpulpv2 and the
 * p_xpulpv2 entry point taking `void *args`.
 *
 * NOTE(review): `shift` is presumably the fixed-point right shift applied to
 * each accumulated result - confirm against the implementation.
 */
void plp_mat_mult_stride_q8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_stride_q8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC);
void plp_mat_mult_stride_q8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_stride_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_stride_q8p_xpulpv2(void *args);
/**
 * Prototypes for the single-core plp_mat_mult_trans_stride_i32 functions
 * (plain, s_rv32im, s_xpulpv2).
 *
 * Parameters: const int32_t sources pSrcA / pSrcB, dimensions M, N, O, one
 * stride per matrix (strideA, strideB, strideC) and an int32_t destination.
 *
 * NOTE(review): the name suggests pSrcB is supplied already transposed, so
 * strideB would refer to the transposed layout - confirm.
 */
void plp_mat_mult_trans_stride_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
/**
 * Prototypes for the single-core plp_mat_mult_trans_stride_i16 functions
 * (plain, s_rv32im, s_xpulpv2).
 *
 * Parameters: const int16_t sources pSrcA / pSrcB, dimensions M, N, O, one
 * stride per matrix (strideA, strideB, strideC) and an int32_t destination,
 * i.e. the output element type is wider than the inputs.
 *
 * NOTE(review): the name suggests pSrcB is supplied already transposed, so
 * strideB would refer to the transposed layout - confirm.
 */
void plp_mat_mult_trans_stride_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
/**
 * Single-core prototypes of the `plp_mat_mult_trans_stride_i8` family.
 *
 * The three functions (`..._i8`, `..._i8s_rv32im`, `..._i8s_xpulpv2`) share
 * one signature: two const `int8_t` sources, `uint32_t` M, N, O, strideA,
 * strideB, strideC, and a destination of the wider type `int32_t`. All
 * pointers are `__restrict__`-qualified, so the buffers must not overlap.
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided matrix multiplication with a transposed second operand on 8-bit
 * integers -- confirm the dimension and stride conventions against the
 * implementation.
 */
void plp_mat_mult_trans_stride_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
/**
 * Parallel prototypes of the `plp_mat_mult_trans_stride_i32` family.
 *
 * `..._i32_parallel` takes two const `int32_t` sources, `uint32_t` M, N, O,
 * strideA, strideB, strideC, nPE, and an `int32_t` destination (all pointers
 * `__restrict__`). `..._i32p_xpulpv2` takes a single untyped `void *args`.
 *
 * NOTE(review): by name `nPE` is presumably the number of cores to use and
 * `args` presumably points to a packed argument structure consumed by each
 * core; neither is visible in this header excerpt -- confirm.
 */
void plp_mat_mult_trans_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_i32p_xpulpv2(void *args);
/**
 * Parallel prototypes of the `plp_mat_mult_trans_stride_i16` family.
 *
 * `..._i16_parallel` takes two const `int16_t` sources, `uint32_t` M, N, O,
 * strideA, strideB, strideC, nPE, and an `int32_t` destination (all pointers
 * `__restrict__`). `..._i16p_xpulpv2` takes a single untyped `void *args`.
 *
 * NOTE(review): by name `nPE` is presumably the number of cores to use and
 * `args` presumably points to a packed argument structure consumed by each
 * core; neither is visible in this header excerpt -- confirm.
 */
void plp_mat_mult_trans_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_i16p_xpulpv2(void *args);
/**
 * Parallel prototypes of the `plp_mat_mult_trans_stride_i8` family.
 *
 * `..._i8_parallel` takes two const `int8_t` sources, `uint32_t` M, N, O,
 * strideA, strideB, strideC, nPE, and an `int32_t` destination (all pointers
 * `__restrict__`). `..._i8p_xpulpv2` takes a single untyped `void *args`.
 *
 * NOTE(review): by name `nPE` is presumably the number of cores to use and
 * `args` presumably points to a packed argument structure consumed by each
 * core; neither is visible in this header excerpt -- confirm.
 */
void plp_mat_mult_trans_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_i8p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_stride_q32` family.
 *
 * - `..._q32`, `..._q32s_rv32im`, `..._q32s_xpulpv2`: two const `int32_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, shift, and an
 *   `int32_t` destination.
 * - `..._q32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._q32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided fixed-point matrix multiplication with a transposed second operand,
 * `shift` being the fixed-point scaling and `nPE` a core count -- confirm
 * against the implementation.
 */
void plp_mat_mult_trans_stride_q32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_stride_q16` family.
 *
 * - `..._q16`, `..._q16s_rv32im`, `..._q16s_xpulpv2`: two const `int16_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, shift, and an
 *   `int16_t` destination.
 * - `..._q16_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._q16p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided fixed-point matrix multiplication with a transposed second operand,
 * `shift` being the fixed-point scaling and `nPE` a core count -- confirm
 * against the implementation.
 */
void plp_mat_mult_trans_stride_q16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q16p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_stride_q8` family.
 *
 * - `..._q8`, `..._q8s_rv32im`, `..._q8s_xpulpv2`: two const `int8_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, shift, and an
 *   `int8_t` destination.
 * - `..._q8_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._q8p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided fixed-point matrix multiplication with a transposed second operand,
 * `shift` being the fixed-point scaling and `nPE` a core count -- confirm
 * against the implementation.
 */
void plp_mat_mult_trans_stride_q8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_stride_q8p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_stride_f32` family.
 *
 * - `..._f32`, `..._f32s_xpulpv2`: two const `float` sources, `uint32_t` M,
 *   N, O, strideA, strideB, strideC, and a `float` destination.
 * - `..._f32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._f32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * No `rv32im` variant is declared for this data type in this block. All
 * pointer parameters are `__restrict__`-qualified (no overlapping buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided single-precision matrix multiplication with a transposed second
 * operand, `nPE` being a core count -- confirm against the implementation.
 */
void plp_mat_mult_trans_stride_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC);
void plp_mat_mult_trans_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC);
void plp_mat_mult_trans_stride_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
float *__restrict__ pDstC);
void plp_mat_mult_trans_stride_f32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_cmplx_stride_i32` family.
 *
 * - `..._i32`, `..._i32s_rv32im`, `..._i32s_xpulpv2`: two const `int32_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, and an `int32_t`
 *   destination.
 * - `..._i32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued integer matrices; the storage
 * layout of the complex elements and the unit of the strides cannot be told
 * from the declarations -- confirm against the implementation.
 */
void plp_mat_mult_cmplx_stride_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_cmplx_stride_i16` family.
 *
 * - `..._i16`, `..._i16s_rv32im`, `..._i16s_xpulpv2`: two const `int16_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, and a destination
 *   of the wider type `int32_t`.
 * - `..._i16_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i16p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued integer matrices; the storage
 * layout of the complex elements and the unit of the strides cannot be told
 * from the declarations -- confirm against the implementation.
 */
void plp_mat_mult_cmplx_stride_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i16p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_cmplx_stride_i8` family.
 *
 * - `..._i8`, `..._i8s_rv32im`, `..._i8s_xpulpv2`: two const `int8_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, and a destination
 *   of the wider type `int32_t`.
 * - `..._i8_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i8p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued integer matrices; the storage
 * layout of the complex elements and the unit of the strides cannot be told
 * from the declarations -- confirm against the implementation.
 */
void plp_mat_mult_cmplx_stride_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_i8p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_cmplx_stride_f32` family.
 *
 * - `..._f32`, `..._f32s_xpulpv2`: two const `float` sources, `uint32_t` M,
 *   N, O, strideA, strideB, strideC, and a `float` destination.
 * - `..._f32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._f32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * No `rv32im` variant is declared for this data type in this block. All
 * pointer parameters are `__restrict__`-qualified (no overlapping buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued float matrices; the storage
 * layout of the complex elements and the unit of the strides cannot be told
 * from the declarations -- confirm against the implementation.
 */
void plp_mat_mult_cmplx_stride_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
float *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_f32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_cmplx_stride_q32` family.
 *
 * - `..._q32`, `..._q32s_rv32im`, `..._q32s_xpulpv2`: two const `int32_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, shift, and an
 *   `int32_t` destination.
 * - `..._q32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._q32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued fixed-point matrices, `shift`
 * being the fixed-point scaling; the complex storage layout cannot be told
 * from the declarations -- confirm against the implementation.
 */
void plp_mat_mult_cmplx_stride_q32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_cmplx_stride_q16` family.
 *
 * - `..._q16`, `..._q16s_rv32im`, `..._q16s_xpulpv2`: two const `int16_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, shift, and an
 *   `int16_t` destination.
 * - `..._q16_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._q16p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued fixed-point matrices, `shift`
 * being the fixed-point scaling; the complex storage layout cannot be told
 * from the declarations -- confirm against the implementation.
 */
void plp_mat_mult_cmplx_stride_q16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q16p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_cmplx_stride_q8` family.
 *
 * - `..._q8`, `..._q8s_rv32im`, `..._q8s_xpulpv2`: two const `int8_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, shift, and an
 *   `int8_t` destination.
 * - `..._q8_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._q8p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued fixed-point matrices, `shift`
 * being the fixed-point scaling; the complex storage layout cannot be told
 * from the declarations -- confirm against the implementation.
 */
void plp_mat_mult_cmplx_stride_q8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC);
void plp_mat_mult_cmplx_stride_q8p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_cmplx_stride_i32` family.
 *
 * - `..._i32`, `..._i32s_rv32im`, `..._i32s_xpulpv2`: two const `int32_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, and an `int32_t`
 *   destination.
 * - `..._i32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued integer matrices with a
 * transposed second operand; whether "trans" also conjugates, and how complex
 * elements are stored, cannot be told from the declarations -- confirm.
 */
void plp_mat_mult_trans_cmplx_stride_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_cmplx_stride_i16` family.
 *
 * - `..._i16`, `..._i16s_rv32im`, `..._i16s_xpulpv2`: two const `int16_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, and a destination
 *   of the wider type `int32_t`.
 * - `..._i16_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i16p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued integer matrices with a
 * transposed second operand; whether "trans" also conjugates, and how complex
 * elements are stored, cannot be told from the declarations -- confirm.
 */
void plp_mat_mult_trans_cmplx_stride_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i16p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_cmplx_stride_i8` family.
 *
 * - `..._i8`, `..._i8s_rv32im`, `..._i8s_xpulpv2`: two const `int8_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, and a destination
 *   of the wider type `int32_t`.
 * - `..._i8_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i8p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued integer matrices with a
 * transposed second operand; whether "trans" also conjugates, and how complex
 * elements are stored, cannot be told from the declarations -- confirm.
 */
void plp_mat_mult_trans_cmplx_stride_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_i8p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_cmplx_stride_f32` family.
 *
 * - `..._f32`, `..._f32s_xpulpv2`: two const `float` sources, `uint32_t` M,
 *   N, O, strideA, strideB, strideC, and a `float` destination.
 * - `..._f32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._f32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * No `rv32im` variant is declared for this data type in this block. All
 * pointer parameters are `__restrict__`-qualified (no overlapping buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued float matrices with a transposed
 * second operand; whether "trans" also conjugates, and how complex elements
 * are stored, cannot be told from the declarations -- confirm.
 */
void plp_mat_mult_trans_cmplx_stride_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
float *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t nPE,
float *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_f32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_cmplx_stride_q32` family.
 *
 * - `..._q32`, `..._q32s_rv32im`, `..._q32s_xpulpv2`: two const `int32_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, shift, and an
 *   `int32_t` destination.
 * - `..._q32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._q32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued fixed-point matrices with a
 * transposed second operand, `shift` being the fixed-point scaling; the
 * complex storage layout cannot be told from the declarations -- confirm.
 */
void plp_mat_mult_trans_cmplx_stride_q32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_cmplx_stride_q16` family.
 *
 * - `..._q16`, `..._q16s_rv32im`, `..._q16s_xpulpv2`: two const `int16_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, shift, and an
 *   `int16_t` destination.
 * - `..._q16_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._q16p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued fixed-point matrices with a
 * transposed second operand, `shift` being the fixed-point scaling; the
 * complex storage layout cannot be told from the declarations -- confirm.
 */
void plp_mat_mult_trans_cmplx_stride_q16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q16p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_mult_trans_cmplx_stride_q8` family.
 *
 * - `..._q8`, `..._q8s_rv32im`, `..._q8s_xpulpv2`: two const `int8_t`
 *   sources, `uint32_t` M, N, O, strideA, strideB, strideC, shift, and an
 *   `int8_t` destination.
 * - `..._q8_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._q8p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified (no overlapping
 * buffers).
 *
 * NOTE(review): semantics are not visible here. By name this is presumably a
 * strided multiplication of complex-valued fixed-point matrices with a
 * transposed second operand, `shift` being the fixed-point scaling; the
 * complex storage layout cannot be told from the declarations -- confirm.
 */
void plp_mat_mult_trans_cmplx_stride_q8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t O,
uint32_t strideA,
uint32_t strideB,
uint32_t strideC,
uint32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDstC);
void plp_mat_mult_trans_cmplx_stride_q8p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_add_stride_i32` family.
 *
 * - `..._i32`, `..._i32s_rv32im`, `..._i32s_xpulpv2`: two const `int32_t`
 *   sources, `uint32_t` M, N, strideA, strideB, strideY, and an `int32_t`
 *   destination `pDst`.
 * - `..._i32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified, so in-place use
 * (destination aliasing a source) is not permitted by the declaration.
 *
 * NOTE(review): semantics are not visible here. By name this is presumably an
 * element-wise addition of two MxN matrices with per-matrix row strides,
 * `strideY` belonging to the destination -- confirm against the
 * implementation.
 */
void plp_mat_add_stride_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst);
void plp_mat_add_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst);
void plp_mat_add_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst);
void plp_mat_add_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_add_stride_i32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_add_stride_i16` family.
 *
 * - `..._i16`, `..._i16s_rv32im`, `..._i16s_xpulpv2`: two const `int16_t`
 *   sources, `uint32_t` M, N, strideA, strideB, strideY, and an `int16_t`
 *   destination `pDst`.
 * - `..._i16_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i16p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified, so in-place use
 * (destination aliasing a source) is not permitted by the declaration.
 *
 * NOTE(review): semantics are not visible here. By name this is presumably an
 * element-wise addition of two MxN matrices with per-matrix row strides;
 * overflow behaviour of the 16-bit result is not specified here -- confirm
 * against the implementation.
 */
void plp_mat_add_stride_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst);
void plp_mat_add_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst);
void plp_mat_add_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst);
void plp_mat_add_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_add_stride_i16p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_add_stride_i8` family.
 *
 * - `..._i8`, `..._i8s_rv32im`, `..._i8s_xpulpv2`: two const `int8_t`
 *   sources, `uint32_t` M, N, strideA, strideB, strideY, and an `int8_t`
 *   destination `pDst`.
 * - `..._i8_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i8p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified, so in-place use
 * (destination aliasing a source) is not permitted by the declaration.
 *
 * NOTE(review): semantics are not visible here. By name this is presumably an
 * element-wise addition of two MxN matrices with per-matrix row strides;
 * overflow behaviour of the 8-bit result is not specified here -- confirm
 * against the implementation.
 */
void plp_mat_add_stride_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst);
void plp_mat_add_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst);
void plp_mat_add_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst);
void plp_mat_add_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int8_t *__restrict__ pDst);
void plp_mat_add_stride_i8p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_add_stride_f32` family.
 *
 * - `..._f32`, `..._f32s_xpulpv2`: two const `float` sources, `uint32_t` M,
 *   N, strideA, strideB, strideY, and a `float` destination `pDst`.
 * - `..._f32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._f32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * No `rv32im` variant is declared for this data type in this block. All
 * pointer parameters are `__restrict__`-qualified, so in-place use is not
 * permitted by the declaration.
 *
 * NOTE(review): semantics are not visible here. By name this is presumably an
 * element-wise addition of two MxN float matrices with per-matrix row
 * strides -- confirm against the implementation.
 */
void plp_mat_add_stride_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
float *__restrict__ pDst);
void plp_mat_add_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
float *__restrict__ pDst);
void plp_mat_add_stride_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
float *__restrict__ pDst);
void plp_mat_add_stride_f32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_sub_stride_i32` family.
 *
 * - `..._i32`, `..._i32s_rv32im`, `..._i32s_xpulpv2`: two const `int32_t`
 *   sources, `uint32_t` M, N, strideA, strideB, strideY, and an `int32_t`
 *   destination `pDst`.
 * - `..._i32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified, so in-place use
 * (destination aliasing a source) is not permitted by the declaration.
 *
 * NOTE(review): semantics are not visible here. By name this is presumably an
 * element-wise subtraction of two MxN matrices (operand order presumably
 * A - B) with per-matrix row strides -- confirm against the implementation.
 */
void plp_mat_sub_stride_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst);
void plp_mat_sub_stride_i32s_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst);
void plp_mat_sub_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int32_t *__restrict__ pDst);
void plp_mat_sub_stride_i32_parallel(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_sub_stride_i32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_sub_stride_i16` family.
 *
 * - `..._i16`, `..._i16s_rv32im`, `..._i16s_xpulpv2`: two const `int16_t`
 *   sources, `uint32_t` M, N, strideA, strideB, strideY, and an `int16_t`
 *   destination `pDst`.
 * - `..._i16_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i16p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified, so in-place use
 * (destination aliasing a source) is not permitted by the declaration.
 *
 * NOTE(review): semantics are not visible here. By name this is presumably an
 * element-wise subtraction of two MxN matrices (operand order presumably
 * A - B) with per-matrix row strides -- confirm against the implementation.
 */
void plp_mat_sub_stride_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst);
void plp_mat_sub_stride_i16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst);
void plp_mat_sub_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int16_t *__restrict__ pDst);
void plp_mat_sub_stride_i16_parallel(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_sub_stride_i16p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_sub_stride_i8` family.
 *
 * - `..._i8`, `..._i8s_rv32im`, `..._i8s_xpulpv2`: two const `int8_t`
 *   sources, `uint32_t` M, N, strideA, strideB, strideY, and an `int8_t`
 *   destination `pDst`.
 * - `..._i8_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i8p_xpulpv2`: takes a single untyped `void *args`.
 *
 * All pointer parameters are `__restrict__`-qualified, so in-place use
 * (destination aliasing a source) is not permitted by the declaration.
 *
 * NOTE(review): semantics are not visible here. By name this is presumably an
 * element-wise subtraction of two MxN matrices (operand order presumably
 * A - B) with per-matrix row strides -- confirm against the implementation.
 */
void plp_mat_sub_stride_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst);
void plp_mat_sub_stride_i8s_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst);
void plp_mat_sub_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
int8_t *__restrict__ pDst);
void plp_mat_sub_stride_i8_parallel(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
int8_t *__restrict__ pDst);
void plp_mat_sub_stride_i8p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_sub_stride_f32` family.
 *
 * - `..._f32`, `..._f32s_xpulpv2`: two const `float` sources, `uint32_t` M,
 *   N, strideA, strideB, strideY, and a `float` destination `pDst`.
 * - `..._f32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._f32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * No `rv32im` variant is declared for this data type in this block. All
 * pointer parameters are `__restrict__`-qualified, so in-place use is not
 * permitted by the declaration.
 *
 * NOTE(review): semantics are not visible here. By name this is presumably an
 * element-wise subtraction of two MxN float matrices (operand order
 * presumably A - B) with per-matrix row strides -- confirm against the
 * implementation.
 */
void plp_mat_sub_stride_f32(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
float *__restrict__ pDst);
void plp_mat_sub_stride_f32s_xpulpv2(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
float *__restrict__ pDst);
void plp_mat_sub_stride_f32_parallel(const float *__restrict__ pSrcA,
const float *__restrict__ pSrcB,
uint32_t M,
uint32_t N,
uint32_t strideA,
uint32_t strideB,
uint32_t strideY,
uint32_t nPE,
float *__restrict__ pDst);
void plp_mat_sub_stride_f32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_scale_stride_i32` family.
 *
 * - `..._i32`, `..._i32s_rv32im`, `..._i32s_xpulpv2`: one const `int32_t`
 *   source, `uint32_t` M, N, strideSrc, strideDst, an `int32_t` scaleFactor,
 *   an `int32_t` shift, and an `int32_t` destination.
 * - `..._i32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * Both pointers are `__restrict__`-qualified, so in-place scaling is not
 * permitted by the declaration. Note that `shift` is signed here.
 *
 * NOTE(review): semantics are not visible here. By name this presumably
 * multiplies every element of an MxN matrix by `scaleFactor` and then shifts
 * the product by `shift`; the shift direction and the handling of negative
 * `shift` values are not stated -- confirm against the implementation.
 */
void plp_mat_scale_stride_i32(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst);
void plp_mat_scale_stride_i32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst);
void plp_mat_scale_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t scaleFactor,
int32_t shift,
int32_t *__restrict__ pDst);
void plp_mat_scale_stride_i32_parallel(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t scaleFactor,
int32_t shift,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_scale_stride_i32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_scale_stride_i16` family.
 *
 * - `..._i16`, `..._i16s_rv32im`, `..._i16s_xpulpv2`: one const `int16_t`
 *   source, `uint32_t` M, N, strideSrc, strideDst, an `int16_t` scaleFactor,
 *   an `int32_t` shift, and an `int16_t` destination.
 * - `..._i16_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i16p_xpulpv2`: takes a single untyped `void *args`.
 *
 * Both pointers are `__restrict__`-qualified, so in-place scaling is not
 * permitted by the declaration. Note that `shift` is signed here.
 *
 * NOTE(review): semantics are not visible here. By name this presumably
 * multiplies every element of an MxN matrix by `scaleFactor` and then shifts
 * the product by `shift`; the shift direction and the handling of negative
 * `shift` values are not stated -- confirm against the implementation.
 */
void plp_mat_scale_stride_i16(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst);
void plp_mat_scale_stride_i16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst);
void plp_mat_scale_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t scaleFactor,
int32_t shift,
int16_t *__restrict__ pDst);
void plp_mat_scale_stride_i16_parallel(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t scaleFactor,
int32_t shift,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_scale_stride_i16p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_scale_stride_i8` family.
 *
 * - `..._i8`, `..._i8s_rv32im`, `..._i8s_xpulpv2`: one const `int8_t`
 *   source, `uint32_t` M, N, strideSrc, strideDst, an `int8_t` scaleFactor,
 *   an `int32_t` shift, and an `int8_t` destination.
 * - `..._i8_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i8p_xpulpv2`: takes a single untyped `void *args`.
 *
 * Both pointers are `__restrict__`-qualified, so in-place scaling is not
 * permitted by the declaration. Note that `shift` is signed here.
 *
 * NOTE(review): semantics are not visible here. By name this presumably
 * multiplies every element of an MxN matrix by `scaleFactor` and then shifts
 * the product by `shift`; the shift direction and the handling of negative
 * `shift` values are not stated -- confirm against the implementation.
 */
void plp_mat_scale_stride_i8(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst);
void plp_mat_scale_stride_i8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst);
void plp_mat_scale_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t scaleFactor,
int32_t shift,
int8_t *__restrict__ pDst);
void plp_mat_scale_stride_i8_parallel(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t scaleFactor,
int32_t shift,
uint32_t nPE,
int8_t *__restrict__ pDst);
void plp_mat_scale_stride_i8p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_scale_stride_f32` family.
 *
 * - `..._f32`, `..._f32s_xpulpv2`: one const `float` source, `uint32_t` M, N,
 *   strideSrc, strideDst, a `float` scaleFactor, and a `float` destination.
 *   No `shift` parameter is declared for this data type.
 * - `..._f32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._f32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * No `rv32im` variant is declared for this data type in this block. Both
 * pointers are `__restrict__`-qualified, so in-place scaling is not
 * permitted by the declaration.
 *
 * NOTE(review): semantics are not visible here. By name this presumably
 * multiplies every element of an MxN float matrix by `scaleFactor` --
 * confirm against the implementation.
 */
void plp_mat_scale_stride_f32(const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
float scaleFactor,
float *__restrict__ pDst);
void plp_mat_scale_stride_f32s_xpulpv2(const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
float scaleFactor,
float *__restrict__ pDst);
void plp_mat_scale_stride_f32_parallel(const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
float scaleFactor,
uint32_t nPE,
float *__restrict__ pDst);
void plp_mat_scale_stride_f32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_fill_I_stride_i32` family.
 *
 * - `..._i32`, `..._i32s_rv32im`, `..._i32s_xpulpv2`: `uint32_t` N and
 *   stride, and an `int32_t` destination pointer (`__restrict__`).
 * - `..._i32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * NOTE(review): semantics are not visible here. By name this presumably
 * writes an NxN identity matrix into `pDst` using `stride` as row stride;
 * whether off-diagonal elements are written is not stated -- confirm against
 * the implementation.
 */
void plp_mat_fill_I_stride_i32(uint32_t N, uint32_t stride, int32_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i32s_rv32im(uint32_t N, uint32_t stride, int32_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i32s_xpulpv2(uint32_t N, uint32_t stride, int32_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i32_parallel(uint32_t N,
uint32_t stride,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_fill_I_stride_i16` family.
 *
 * - `..._i16`, `..._i16s_rv32im`, `..._i16s_xpulpv2`: `uint32_t` N and
 *   stride, and an `int16_t` destination pointer (`__restrict__`).
 * - `..._i16_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i16p_xpulpv2`: takes a single untyped `void *args`.
 *
 * NOTE(review): semantics are not visible here. By name this presumably
 * writes an NxN identity matrix into `pDst` using `stride` as row stride;
 * whether off-diagonal elements are written is not stated -- confirm against
 * the implementation.
 */
void plp_mat_fill_I_stride_i16(uint32_t N, uint32_t stride, int16_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i16s_rv32im(uint32_t N, uint32_t stride, int16_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i16s_xpulpv2(uint32_t N, uint32_t stride, int16_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i16_parallel(uint32_t N,
uint32_t stride,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i16p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_fill_I_stride_i8` family.
 *
 * - `..._i8`, `..._i8s_rv32im`, `..._i8s_xpulpv2`: `uint32_t` N and stride,
 *   and an `int8_t` destination pointer (`__restrict__`).
 * - `..._i8_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._i8p_xpulpv2`: takes a single untyped `void *args`.
 *
 * NOTE(review): semantics are not visible here. By name this presumably
 * writes an NxN identity matrix into `pDst` using `stride` as row stride;
 * whether off-diagonal elements are written is not stated -- confirm against
 * the implementation.
 */
void plp_mat_fill_I_stride_i8(uint32_t N, uint32_t stride, int8_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i8s_rv32im(uint32_t N, uint32_t stride, int8_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i8s_xpulpv2(uint32_t N, uint32_t stride, int8_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i8_parallel(uint32_t N,
uint32_t stride,
uint32_t nPE,
int8_t *__restrict__ pDst);
void plp_mat_fill_I_stride_i8p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_fill_I_stride_f32` family.
 *
 * - `..._f32`, `..._f32s_xpulpv2`: `uint32_t` N and stride, and a `float`
 *   destination pointer (`__restrict__`).
 * - `..._f32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._f32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * No `rv32im` variant is declared for this data type in this block.
 *
 * NOTE(review): semantics are not visible here. By name this presumably
 * writes an NxN identity matrix into `pDst` using `stride` as row stride;
 * whether off-diagonal elements are written is not stated -- confirm against
 * the implementation.
 */
void plp_mat_fill_I_stride_f32(uint32_t N, uint32_t stride, float *__restrict__ pDst);
void plp_mat_fill_I_stride_f32s_xpulpv2(uint32_t N, uint32_t stride, float *__restrict__ pDst);
void plp_mat_fill_I_stride_f32_parallel(uint32_t N,
uint32_t stride,
uint32_t nPE,
float *__restrict__ pDst);
void plp_mat_fill_I_stride_f32p_xpulpv2(void *args);
/**
 * Prototypes of the `plp_mat_fill_I_stride_q32` family.
 *
 * - `..._q32`, `..._q32s_rv32im`, `..._q32s_xpulpv2`: `uint32_t` N and
 *   stride, an `int32_t` fracBits, and an `int32_t` destination pointer
 *   (`__restrict__`).
 * - `..._q32_parallel`: same list plus `uint32_t nPE` before the destination.
 * - `..._q32p_xpulpv2`: takes a single untyped `void *args`.
 *
 * NOTE(review): semantics are not visible here. By name this presumably
 * writes an NxN fixed-point identity matrix, with `fracBits` selecting the
 * fixed-point representation of 1 (presumably `1 << fracBits`); the valid
 * range of the signed `fracBits` is not stated -- confirm against the
 * implementation.
 */
void plp_mat_fill_I_stride_q32(uint32_t N,
uint32_t stride,
int32_t fracBits,
int32_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q32s_rv32im(uint32_t N,
uint32_t stride,
int32_t fracBits,
int32_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q32s_xpulpv2(uint32_t N,
uint32_t stride,
int32_t fracBits,
int32_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q32_parallel(
uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int32_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q32p_xpulpv2(void *args);
void plp_mat_fill_I_stride_q16(uint32_t N,
uint32_t stride,
int32_t fracBits,
int16_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q16s_rv32im(uint32_t N,
uint32_t stride,
int32_t fracBits,
int16_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q16s_xpulpv2(uint32_t N,
uint32_t stride,
int32_t fracBits,
int16_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q16_parallel(
uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int16_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q16p_xpulpv2(void *args);
void plp_mat_fill_I_stride_q8(uint32_t N,
uint32_t stride,
int32_t fracBits,
int8_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q8s_rv32im(uint32_t N,
uint32_t stride,
int32_t fracBits,
int8_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q8s_xpulpv2(uint32_t N,
uint32_t stride,
int32_t fracBits,
int8_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q8_parallel(
uint32_t N, uint32_t stride, int32_t fracBits, uint32_t nPE, int8_t *__restrict__ pDst);
void plp_mat_fill_I_stride_q8p_xpulpv2(void *args);
/*
 * plp_mat_fill_stride_* prototypes for int32_t, int16_t, int8_t and float.
 *
 * Every non-worker variant takes (M, N, stride, value, pDst), where `value`
 * has the same element type as pDst; the _parallel entry adds `uint32_t nPE`
 * before pDst, and the p_xpulpv2 worker takes one opaque `void *args`.
 * The float group declares no s_rv32im variant.
 *
 * NOTE(review): from the names, these presumably set every element of an
 * M x N matrix to `value`, with `stride` elements between row starts --
 * confirm against the implementation, which is not visible here.
 */

/* int32_t elements */
void plp_mat_fill_stride_i32(
uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *__restrict__ pDst);
void plp_mat_fill_stride_i32s_rv32im(
uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *__restrict__ pDst);
void plp_mat_fill_stride_i32s_xpulpv2(
uint32_t M, uint32_t N, uint32_t stride, int32_t value, int32_t *__restrict__ pDst);
void plp_mat_fill_stride_i32_parallel(uint32_t M,
uint32_t N,
uint32_t stride,
int32_t value,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_fill_stride_i32p_xpulpv2(void *args);

/* int16_t elements */
void plp_mat_fill_stride_i16(
uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *__restrict__ pDst);
void plp_mat_fill_stride_i16s_rv32im(
uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *__restrict__ pDst);
void plp_mat_fill_stride_i16s_xpulpv2(
uint32_t M, uint32_t N, uint32_t stride, int16_t value, int16_t *__restrict__ pDst);
void plp_mat_fill_stride_i16_parallel(uint32_t M,
uint32_t N,
uint32_t stride,
int16_t value,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_fill_stride_i16p_xpulpv2(void *args);

/* int8_t elements */
void plp_mat_fill_stride_i8(
uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *__restrict__ pDst);
void plp_mat_fill_stride_i8s_rv32im(
uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *__restrict__ pDst);
void plp_mat_fill_stride_i8s_xpulpv2(
uint32_t M, uint32_t N, uint32_t stride, int8_t value, int8_t *__restrict__ pDst);
void plp_mat_fill_stride_i8_parallel(
uint32_t M, uint32_t N, uint32_t stride, int8_t value, uint32_t nPE, int8_t *__restrict__ pDst);
void plp_mat_fill_stride_i8p_xpulpv2(void *args);

/* float elements */
void plp_mat_fill_stride_f32(
uint32_t M, uint32_t N, uint32_t stride, float value, float *__restrict__ pDst);
void plp_mat_fill_stride_f32s_xpulpv2(
uint32_t M, uint32_t N, uint32_t stride, float value, float *__restrict__ pDst);
void plp_mat_fill_stride_f32_parallel(
uint32_t M, uint32_t N, uint32_t stride, float value, uint32_t nPE, float *__restrict__ pDst);
void plp_mat_fill_stride_f32p_xpulpv2(void *args);
/*
 * plp_mat_copy_stride_* prototypes for int32_t, int16_t, int8_t and float.
 *
 * Every non-worker variant takes (pSrc, M, N, strideSrc, strideDst, pDst)
 * with a const source pointer and both pointers restrict-qualified; the
 * _parallel entry adds `uint32_t nPE` before pDst, and the p_xpulpv2 worker
 * takes one opaque `void *args`. The float group declares no s_rv32im variant.
 *
 * NOTE(review): from the names, these presumably copy an M x N matrix from
 * pSrc to pDst where strideSrc/strideDst are the element distances between
 * row starts of source and destination -- confirm against the implementation.
 */

/* int32_t elements */
void plp_mat_copy_stride_i32(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t *__restrict__ pDst);
void plp_mat_copy_stride_i32s_rv32im(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t *__restrict__ pDst);
void plp_mat_copy_stride_i32s_xpulpv2(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int32_t *__restrict__ pDst);
void plp_mat_copy_stride_i32_parallel(const int32_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
uint32_t nPE,
int32_t *__restrict__ pDst);
void plp_mat_copy_stride_i32p_xpulpv2(void *args);

/* int16_t elements */
void plp_mat_copy_stride_i16(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t *__restrict__ pDst);
void plp_mat_copy_stride_i16s_rv32im(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t *__restrict__ pDst);
void plp_mat_copy_stride_i16s_xpulpv2(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int16_t *__restrict__ pDst);
void plp_mat_copy_stride_i16_parallel(const int16_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
uint32_t nPE,
int16_t *__restrict__ pDst);
void plp_mat_copy_stride_i16p_xpulpv2(void *args);

/* int8_t elements */
void plp_mat_copy_stride_i8(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t *__restrict__ pDst);
void plp_mat_copy_stride_i8s_rv32im(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t *__restrict__ pDst);
void plp_mat_copy_stride_i8s_xpulpv2(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
int8_t *__restrict__ pDst);
void plp_mat_copy_stride_i8_parallel(const int8_t *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
uint32_t nPE,
int8_t *__restrict__ pDst);
void plp_mat_copy_stride_i8p_xpulpv2(void *args);

/* float elements */
void plp_mat_copy_stride_f32(const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
float *__restrict__ pDst);
void plp_mat_copy_stride_f32s_xpulpv2(const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
float *__restrict__ pDst);
void plp_mat_copy_stride_f32_parallel(const float *__restrict__ pSrc,
uint32_t M,
uint32_t N,
uint32_t strideSrc,
uint32_t strideDst,
uint32_t nPE,
float *__restrict__ pDst);
void plp_mat_copy_stride_f32p_xpulpv2(void *args);
/*
 * plp_cmplx_conj_* prototypes for float32_t, int32_t, int16_t and int8_t.
 *
 * All variants take (pSrc, pDst, numSamples) with a const, restrict-qualified
 * source and a restrict-qualified destination of the same element type.
 * Each integer type has an unsuffixed entry plus _xpulpv2 and _rv32im
 * variants; the float group declares only the entry and _xpulpv2.
 *
 * NOTE(review): presumably computes the complex conjugate of numSamples
 * complex values; the memory layout of a complex sample (e.g. interleaved
 * real/imag) is not visible in these prototypes -- confirm in the sources.
 */
void plp_cmplx_conj_f32(const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_conj_f32_xpulpv2(const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t numSamples);

/* int32_t elements */
void plp_cmplx_conj_i32(const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_conj_i32_xpulpv2(const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_conj_i32_rv32im(const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples);

/* int16_t elements */
void plp_cmplx_conj_i16(const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_conj_i16_xpulpv2(const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_conj_i16_rv32im(const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples);

/* int8_t elements */
void plp_cmplx_conj_i8(const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_conj_i8_xpulpv2(const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_conj_i8_rv32im(const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples);
/*
 * plp_cmplx_dot_prod_* prototypes for float32_t, int32_t, int16_t, int8_t,
 * q32 and q16 (no q8 variant is declared in this list).
 *
 * Argument list: (pSrcA, pSrcB, numSamples, [deciPoint,] realResult,
 * imagResult). Both inputs are const; the result is returned through two
 * separate pointers of the input element type. Unlike the neighbouring
 * complex-math prototypes, none of these pointers is restrict-qualified.
 * The q32/q16 groups add `uint32_t deciPoint` before the result pointers.
 *
 * NOTE(review): presumably the complex dot product of two vectors of
 * numSamples complex values, with deciPoint the fixed-point fractional
 * position; whether the second operand is conjugated and how samples are
 * laid out cannot be told from the prototypes -- confirm in the sources.
 */
void plp_cmplx_dot_prod_f32(const float32_t *pSrcA,
const float32_t *pSrcB,
uint32_t numSamples,
float32_t *realResult,
float32_t *imagResult);
void plp_cmplx_dot_prod_f32_xpulpv2(const float32_t *pSrcA,
const float32_t *pSrcB,
uint32_t numSamples,
float32_t *realResult,
float32_t *imagResult);

/* int32_t elements */
void plp_cmplx_dot_prod_i32(const int32_t *pSrcA,
const int32_t *pSrcB,
uint32_t numSamples,
int32_t *realResult,
int32_t *imagResult);
void plp_cmplx_dot_prod_i32_xpulpv2(const int32_t *pSrcA,
const int32_t *pSrcB,
uint32_t numSamples,
int32_t *realResult,
int32_t *imagResult);
void plp_cmplx_dot_prod_i32_rv32im(const int32_t *pSrcA,
const int32_t *pSrcB,
uint32_t numSamples,
int32_t *realResult,
int32_t *imagResult);

/* int16_t elements */
void plp_cmplx_dot_prod_i16(const int16_t *pSrcA,
const int16_t *pSrcB,
uint32_t numSamples,
int16_t *realResult,
int16_t *imagResult);
void plp_cmplx_dot_prod_i16_xpulpv2(const int16_t *pSrcA,
const int16_t *pSrcB,
uint32_t numSamples,
int16_t *realResult,
int16_t *imagResult);
void plp_cmplx_dot_prod_i16_rv32im(const int16_t *pSrcA,
const int16_t *pSrcB,
uint32_t numSamples,
int16_t *realResult,
int16_t *imagResult);

/* int8_t elements */
void plp_cmplx_dot_prod_i8(const int8_t *pSrcA,
const int8_t *pSrcB,
uint32_t numSamples,
int8_t *realResult,
int8_t *imagResult);
void plp_cmplx_dot_prod_i8_xpulpv2(const int8_t *pSrcA,
const int8_t *pSrcB,
uint32_t numSamples,
int8_t *realResult,
int8_t *imagResult);
void plp_cmplx_dot_prod_i8_rv32im(const int8_t *pSrcA,
const int8_t *pSrcB,
uint32_t numSamples,
int8_t *realResult,
int8_t *imagResult);

/* q32: int32_t elements plus deciPoint */
void plp_cmplx_dot_prod_q32(const int32_t *pSrcA,
const int32_t *pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int32_t *realResult,
int32_t *imagResult);
void plp_cmplx_dot_prod_q32_xpulpv2(const int32_t *pSrcA,
const int32_t *pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int32_t *realResult,
int32_t *imagResult);
void plp_cmplx_dot_prod_q32_rv32im(const int32_t *pSrcA,
const int32_t *pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int32_t *realResult,
int32_t *imagResult);

/* q16: int16_t elements plus deciPoint */
void plp_cmplx_dot_prod_q16(const int16_t *pSrcA,
const int16_t *pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int16_t *realResult,
int16_t *imagResult);
void plp_cmplx_dot_prod_q16_xpulpv2(const int16_t *pSrcA,
const int16_t *pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int16_t *realResult,
int16_t *imagResult);
void plp_cmplx_dot_prod_q16_rv32im(const int16_t *pSrcA,
const int16_t *pSrcB,
uint32_t numSamples,
uint32_t deciPoint,
int16_t *realResult,
int16_t *imagResult);
/*
 * plp_cmplx_mult_real_* prototypes for float32_t, int32_t, int16_t, int8_t,
 * q32, q16 and q8.
 *
 * Argument list: (pSrcCmplx, pSrcReal, pDst, [deciPoint,] numSamples). Both
 * sources are const and all three pointers are restrict-qualified. The
 * q32/q16/q8 groups add `uint32_t deciPoint` between pDst and numSamples.
 * Each integer/fixed type has an entry plus _xpulpv2 and _rv32im variants;
 * the float group declares only the entry and _xpulpv2.
 *
 * NOTE(review): presumably multiplies each complex sample of pSrcCmplx by
 * the matching real sample of pSrcReal and stores a complex result in pDst;
 * sample layout and the exact meaning of deciPoint are not visible here --
 * confirm in the sources.
 */
void plp_cmplx_mult_real_f32(const float32_t *__restrict__ pSrcCmplx,
const float32_t *__restrict__ pSrcReal,
float32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_real_f32_xpulpv2(const float32_t *__restrict__ pSrcCmplx,
const float32_t *__restrict__ pSrcReal,
float32_t *__restrict__ pDst,
uint32_t numSamples);

/* int32_t elements */
void plp_cmplx_mult_real_i32(const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_real_i32_xpulpv2(const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_real_i32_rv32im(const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t numSamples);

/* int16_t elements */
void plp_cmplx_mult_real_i16(const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_real_i16_xpulpv2(const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_real_i16_rv32im(const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t numSamples);

/* int8_t elements */
void plp_cmplx_mult_real_i8(const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_real_i8_xpulpv2(const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_real_i8_rv32im(const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t numSamples);

/* q32: int32_t elements plus deciPoint */
void plp_cmplx_mult_real_q32(const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_real_q32_xpulpv2(const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_real_q32_rv32im(const int32_t *__restrict__ pSrcCmplx,
const int32_t *__restrict__ pSrcReal,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);

/* q16: int16_t elements plus deciPoint */
void plp_cmplx_mult_real_q16(const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_real_q16_xpulpv2(const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_real_q16_rv32im(const int16_t *__restrict__ pSrcCmplx,
const int16_t *__restrict__ pSrcReal,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);

/* q8: int8_t elements plus deciPoint */
void plp_cmplx_mult_real_q8(const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_real_q8_xpulpv2(const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_real_q8_rv32im(const int8_t *__restrict__ pSrcCmplx,
const int8_t *__restrict__ pSrcReal,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
/*
 * plp_cmplx_mag_squared_* prototypes for float32_t, int16_t and int32_t.
 *
 * All variants take (pSrc, pDst, numSamples) with a const, restrict-qualified
 * source and a restrict-qualified destination of the same element type.
 * The integer groups list entry, _rv32im and _xpulpv2 variants; the float
 * group declares only the entry and _xpulpv2.
 *
 * NOTE(review): presumably writes re^2 + im^2 for each of numSamples complex
 * input samples; the input layout and whether pDst holds one value per
 * sample are not visible in the prototypes -- confirm in the sources.
 */
void plp_cmplx_mag_squared_f32(const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mag_squared_f32_xpulpv2(const float32_t *__restrict__ pSrc,
float32_t *__restrict__ pDst,
uint32_t numSamples);

/* int16_t elements */
void plp_cmplx_mag_squared_i16(const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mag_squared_i16_rv32im(const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mag_squared_i16_xpulpv2(const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t numSamples);

/* int32_t elements */
void plp_cmplx_mag_squared_i32(const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mag_squared_i32_rv32im(const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mag_squared_i32_xpulpv2(const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t numSamples);
/*
 * int8_t variants of plp_cmplx_mag_squared: the unsuffixed entry followed by
 * the _rv32im and _xpulpv2 variants, each declared once. All three share the
 * argument list (pSrc, pDst, numSamples): a const, restrict-qualified source
 * and a restrict-qualified destination, both int8_t, plus a sample count.
 *
 * NOTE(review): presumably writes re^2 + im^2 for each of numSamples complex
 * input samples; input layout and overflow behaviour for 8-bit data are not
 * visible in the prototypes -- confirm in the sources.
 */
void plp_cmplx_mag_squared_i8(const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mag_squared_i8_rv32im(const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mag_squared_i8_xpulpv2(const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t numSamples);
/*
 * Fixed-point plp_cmplx_mag_squared_* prototypes (q32, q16, q8).
 *
 * All variants take (pSrc, pDst, deciPoint, numSamples) with a const,
 * restrict-qualified source and a restrict-qualified destination of the
 * same integer element type. Each type lists entry, _rv32im and _xpulpv2.
 *
 * NOTE(review): presumably writes re^2 + im^2 per complex sample, rescaled
 * according to deciPoint (fixed-point fractional position); the precise
 * scaling and sample layout are not visible here -- confirm in the sources.
 */

/* q32: int32_t elements */
void plp_cmplx_mag_squared_q32(const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mag_squared_q32_rv32im(const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mag_squared_q32_xpulpv2(const int32_t *__restrict__ pSrc,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);

/* q16: int16_t elements */
void plp_cmplx_mag_squared_q16(const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mag_squared_q16_rv32im(const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mag_squared_q16_xpulpv2(const int16_t *__restrict__ pSrc,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);

/* q8: int8_t elements */
void plp_cmplx_mag_squared_q8(const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mag_squared_q8_rv32im(const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mag_squared_q8_xpulpv2(const int8_t *__restrict__ pSrc,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
/*
 * plp_cmplx_mult_cmplx_* prototypes for float32_t, int32_t, int16_t, int8_t,
 * q32, q16 and q8.
 *
 * Argument list: (pSrcA, pSrcB, pDst, [deciPoint,] numSamples). Both sources
 * are const and all three pointers are restrict-qualified. The q32/q16/q8
 * groups add `uint32_t deciPoint` between pDst and numSamples. Each
 * integer/fixed type has an entry plus _xpulpv2 and _rv32im variants; the
 * float group declares only the entry and _xpulpv2.
 *
 * NOTE(review): presumably the element-wise product of two complex vectors
 * of numSamples samples each; sample layout and deciPoint scaling are not
 * visible in the prototypes -- confirm in the sources.
 */
void plp_cmplx_mult_cmplx_f32(const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
float32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_f32_xpulpv2(const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
float32_t *__restrict__ pDst,
uint32_t numSamples);

/* int32_t elements */
void plp_cmplx_mult_cmplx_i32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_i32_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_i32_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t numSamples);

/* int16_t elements */
void plp_cmplx_mult_cmplx_i16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_i16_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_i16_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t numSamples);

/* int8_t elements */
void plp_cmplx_mult_cmplx_i8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_i8_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_i8_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t numSamples);

/* q32: int32_t elements plus deciPoint */
void plp_cmplx_mult_cmplx_q32(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_q32_xpulpv2(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_q32_rv32im(const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
int32_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);

/* q16: int16_t elements plus deciPoint */
void plp_cmplx_mult_cmplx_q16(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_q16_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_q16_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
int16_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);

/* q8: int8_t elements plus deciPoint */
void plp_cmplx_mult_cmplx_q8(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_q8_xpulpv2(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
void plp_cmplx_mult_cmplx_q8_rv32im(const int8_t *__restrict__ pSrcA,
const int8_t *__restrict__ pSrcB,
int8_t *__restrict__ pDst,
uint32_t deciPoint,
uint32_t numSamples);
/*
 * plp_euclidean_distance_* prototypes: parallel entries and their
 * p_xpulpv2 workers (q32, f32), then the single-core q32, q16 and f32
 * entries with their s_xpulpv2 / s_rv32im variants.
 *
 * Common shape: (pSrcA, pSrcB, blockSize, [fracBits,] [nPE,] pRes) with
 * const, restrict-qualified inputs and the result returned through pRes.
 * The q16 variants read int16_t inputs but return through an int32_t pointer.
 *
 * NOTE(review): presumably the Euclidean distance between two vectors of
 * blockSize elements, with fracBits the fixed-point fractional position and
 * nPE a core count -- confirm in the sources.
 *
 * NOTE(review): signature inconsistencies visible in this list, to be checked
 * against the definitions before changing anything:
 *   - plp_euclidean_distance_q32_parallel returns through `uint32_t *pRes`,
 *     while every other q32/q16 prototype here uses `int32_t *pRes`;
 *   - plp_euclidean_distance_q16 takes `uint16_t blockSize, uint16_t fracBits`,
 *     while its q16s_* variants take `uint32_t` for both;
 *   - plp_euclidean_distance_q16s_xpulpv2 names the fourth parameter
 *     `deciPoint` where the sibling prototypes use `fracBits`.
 */

/* Parallel entries and workers; the workers take one opaque `void *S`. */
void plp_euclidean_distance_q32_parallel( const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
uint32_t nPE,
uint32_t *__restrict__ pRes);
void plp_euclidean_distance_f32_parallel( const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pRes);
void plp_euclidean_distance_q32p_xpulpv2(void *S);
void plp_euclidean_distance_f32p_xpulpv2(void *S);

/* q32: int32_t inputs and result */
void plp_euclidean_distance_q32( const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_euclidean_distance_q32s_xpulpv2( const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_euclidean_distance_q32s_rv32im( const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);

/* q16: int16_t inputs, int32_t result */
void plp_euclidean_distance_q16( const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint16_t blockSize,
uint16_t fracBits,
int32_t *__restrict__ pRes);
void plp_euclidean_distance_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t deciPoint,
int32_t *__restrict__ pRes);
void plp_euclidean_distance_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);

/* f32: float32_t inputs and result (an s_rv32im variant is declared here) */
void plp_euclidean_distance_f32( const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes);
void plp_euclidean_distance_f32s_xpulpv2( const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes);
void plp_euclidean_distance_f32s_rv32im( const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes);
/*
 * plp_cosine_distance_* prototypes: parallel entries (q32, f32) and the f32
 * worker, then the single-core f32, q32 and q16 entries with their
 * s_rv32im / s_xpulpv2 variants.
 *
 * Common shape: (pSrcA, pSrcB, blockSize, [fracBits,] [nPE,] pRes) with
 * const, restrict-qualified inputs and the result returned through pRes.
 * The q16 variants read int16_t inputs but return through an int32_t pointer.
 *
 * NOTE(review): presumably the cosine distance between two vectors of
 * blockSize elements, with fracBits the fixed-point fractional position and
 * nPE a core count -- confirm in the sources.
 *
 * NOTE(review): things visible in this list that should be checked against
 * the definitions:
 *   - plp_cosine_distance_q32_parallel is declared, but only an f32 worker
 *     (plp_cosine_distance_f32p_xpulpv2) appears here; no q32p_xpulpv2
 *     prototype is listed;
 *   - plp_cosine_distance_q16 takes `uint16_t blockSize, uint16_t fracBits`,
 *     while its q16s_* variants take `uint32_t` for both.
 */

/* Parallel entries and worker; the worker takes one opaque `void *S`. */
void plp_cosine_distance_q32_parallel( const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
uint32_t nPE,
int32_t *__restrict__ pRes);
void plp_cosine_distance_f32_parallel( const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t nPE,
float32_t *__restrict__ pRes);
void plp_cosine_distance_f32p_xpulpv2(void *S);

/* f32: float32_t inputs and result */
void plp_cosine_distance_f32( const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes);
void plp_cosine_distance_f32s_rv32im( const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes);
void plp_cosine_distance_f32s_xpulpv2( const float32_t *__restrict__ pSrcA,
const float32_t *__restrict__ pSrcB,
uint32_t blockSize,
float32_t *__restrict__ pRes);

/* q32: int32_t inputs and result */
void plp_cosine_distance_q32( const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_cosine_distance_q32s_rv32im( const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_cosine_distance_q32s_xpulpv2( const int32_t *__restrict__ pSrcA,
const int32_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);

/* q16: int16_t inputs, int32_t result */
void plp_cosine_distance_q16( const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint16_t blockSize,
uint16_t fracBits,
int32_t *__restrict__ pRes);
void plp_cosine_distance_q16s_rv32im(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
void plp_cosine_distance_q16s_xpulpv2(const int16_t *__restrict__ pSrcA,
const int16_t *__restrict__ pSrcB,
uint32_t blockSize,
uint32_t fracBits,
int32_t *__restrict__ pRes);
#endif // __PLP_MATH_H__
Updated on 2023-03-01 at 16:16:34 +0000