applications/src/kernels/conv2d.h
Classes
Name | |
---|---|
struct | kernel_fp32 parameters for single-precision fusedconv kernel |
struct | kernel_fp64 parameters for double-precision fusedconv kernel |
Functions
Name | |
---|---|
void | occamy_conv_opt_fp64(kernel_fp64 * k) implementation of a double-precision fp convolutional kernel for DORY trials. Currently does a direct convolution without im2col. The memory layout of input/output feature map is HxWxC, resp. CoxFhxFwxCi. Fuses multiple layers together (Conv2d, Batchnorm, Relu) that can be enabled with a flag |
void | occamy_conv_opt_fp32(kernel_fp32 * k) implementation of a single-precision fp convolutional kernel for DORY trials. Currently does a direct convolution without im2col. The memory layout of input/output feature map is HxWxC, resp. CoxFhxFwxCi. Fuses multiple layers together (Conv2d, Batchnorm, Relu) that can be enabled with a flag |
void | occamy_conv_dw_opt_fp32(kernel_fp32 * k) implementation of a single-precision fp DEPTHWISE convolutional kernel for DORY trials. Currently does a direct convolution without im2col. The memory layout of input/output feature map is HxWxC, resp. CoxFhxFwxCi. Fuses multiple layers together (Conv2d, Batchnorm, Relu) that can be enabled with a flag |
void | occamy_conv_chw_opt_fp32(kernel_fp32 * k) implementation of a single-precision fp convolutional kernel for DORY trials. Currently does a direct convolution without im2col. The memory layout of input feature map is C x H x W, resp. Co x Fh x Fw x Ci for weights. However, the output memory layout is H x W x C. This kernel should be used for the first layers in a network where Ci is very small and usually odd numbered. Fuses multiple layers together (Conv2d, Batchnorm, Relu) that can be enabled with a flag |
void | bn_relu(const float * pBuffer, const uint16_t dim_x, const uint16_t dim_y, const uint16_t ch, float * kappa, float * lambda, int flag_relu, int flag_batch_norm) helper function that implements Batch Normalization and ReLU |
Functions Documentation
function occamy_conv_opt_fp64
void occamy_conv_opt_fp64(
kernel_fp64 * k
)
implementation of a double-precision fp convolutional kernel for DORY trials. Currently does a direct convolution without im2col. The memory layout of input/output feature map is HxWxC, resp. CoxFhxFwxCi. Fuses multiple layers together (Conv2d, Batchnorm, Relu) that can be enabled with a flag
Parameters:
- k kernel_fp64 struct reference that holds all parameters
function occamy_conv_opt_fp32
void occamy_conv_opt_fp32(
kernel_fp32 * k
)
implementation of a single-precision fp convolutional kernel for DORY trials. Currently does a direct convolution without im2col. The memory layout of input/output feature map is HxWxC, resp. CoxFhxFwxCi. Fuses multiple layers together (Conv2d, Batchnorm, Relu) that can be enabled with a flag
Parameters:
- k kernel_fp32 struct reference that holds all parameters
function occamy_conv_dw_opt_fp32
void occamy_conv_dw_opt_fp32(
kernel_fp32 * k
)
implementation of a single-precision fp DEPTHWISE convolutional kernel for DORY trials. Currently does a direct convolution without im2col. The memory layout of input/output feature map is HxWxC, resp. CoxFhxFwxCi. Fuses multiple layers together (Conv2d, Batchnorm, Relu) that can be enabled with a flag
Parameters:
- k kernel_fp32 struct reference that holds all parameters
function occamy_conv_chw_opt_fp32
void occamy_conv_chw_opt_fp32(
kernel_fp32 * k
)
implementation of a single-precision fp convolutional kernel for DORY trials. Currently does a direct convolution without im2col. The memory layout of input feature map is C x H x W, resp. Co x Fh x Fw x Ci for weights. However, the output memory layout is H x W x C. This kernel should be used for the first layers in a network where Ci is very small and usually odd numbered. Fuses multiple layers together (Conv2d, Batchnorm, Relu) that can be enabled with a flag
Parameters:
- k kernel_fp32 struct reference that holds all parameters
function bn_relu
void bn_relu(
const float * pBuffer,
const uint16_t dim_x,
const uint16_t dim_y,
const uint16_t ch,
float * kappa,
float * lambda,
int flag_relu,
int flag_batch_norm
)
helper function that implements Batch Normalization and ReLU
Parameters:
- pBuffer pointer to the feature map
- dim_x width of feature map
- dim_y height of feature map
- ch number of channels (SIMD requires the channel count to be a multiple of 2)
- kappa multiplication factor for BatchNorm
- lambda bias for BatchNorm
- flag_relu RELU activation flag
- flag_batch_norm BatchNorm flag
Source code
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "snrt.h"
/**
 * Parameter struct for the single-precision (fp32) fused convolution
 * kernels (Conv2d + optional BatchNorm + optional ReLU).
 * Input/output feature maps are laid out HxWxC; weights CoxFhxFwxCi
 * (CxHxW input for the _chw variant — see function docs).
 */
typedef struct {
    float *pInBuffer;         /**< pointer to the input feature map */
    uint16_t dim_in_x;        /**< input feature map width */
    uint16_t dim_in_y;        /**< input feature map height */
    uint16_t ch_in;           /**< number of input channels (Ci) */
    float *pWeight;           /**< pointer to the weights, Co x Fh x Fw x Ci */
    uint16_t ch_out;          /**< number of output channels (Co) */
    uint16_t dim_kernel_x;    /**< filter width (Fw) */
    uint16_t dim_kernel_y;    /**< filter height (Fh) */
    uint16_t padding_y_top;   /**< zero padding rows added at the top */
    uint16_t padding_y_bottom;/**< zero padding rows added at the bottom */
    uint16_t padding_x_left;  /**< zero padding columns added on the left */
    uint16_t padding_x_right; /**< zero padding columns added on the right */
    uint16_t stride_x;        /**< convolution stride along x */
    uint16_t stride_y;        /**< convolution stride along y */
    int8_t *bias;             /**< bias pointer; NOTE(review): int8_t in an fp32 kernel — confirm against the kernel implementation */
    uint16_t bias_shift;      /**< bias shift amount — presumably unused in the fp path; verify */
    uint16_t out_shift;       /**< output shift amount — presumably unused in the fp path; verify */
    uint16_t out_mult;        /**< output multiplier — presumably unused in the fp path; verify */
    float *pOutBuffer;        /**< pointer to the output feature map, H x W x C */
    uint16_t dim_out_x;       /**< output feature map width */
    uint16_t dim_out_y;       /**< output feature map height */
    float *kappa;             /**< per-channel multiplication factor for BatchNorm */
    float *lambda;            /**< per-channel bias for BatchNorm */
    uint8_t *pIm2ColBuffer;   /**< im2col scratch buffer — unused by the current direct-convolution implementations (see function docs) */
    int flag_relu;            /**< non-zero: apply ReLU to the output */
    int flag_batch_norm;      /**< non-zero: apply BatchNorm (kappa/lambda) to the output */
    int flag_y_accumulate_start; /**< accumulation control flag — assumed to mark the first output tile (init vs. accumulate); TODO confirm */
    int flag_y_accumulate_end;   /**< accumulation control flag — assumed to mark the last output tile (apply fused BN/ReLU); TODO confirm */
    unsigned int *memory_chan;   /**< memory/DMA channel handle — semantics not visible here; see kernel implementation */
} kernel_fp32;
/**
 * Parameter struct for the double-precision (fp64) fused convolution
 * kernel (Conv2d + optional BatchNorm + optional ReLU).
 * Field-for-field identical to kernel_fp32 with double-typed data pointers;
 * input/output feature maps are laid out HxWxC, weights CoxFhxFwxCi.
 */
typedef struct {
    double *pInBuffer;        /**< pointer to the input feature map */
    uint16_t dim_in_x;        /**< input feature map width */
    uint16_t dim_in_y;        /**< input feature map height */
    uint16_t ch_in;           /**< number of input channels (Ci) */
    double *pWeight;          /**< pointer to the weights, Co x Fh x Fw x Ci */
    uint16_t ch_out;          /**< number of output channels (Co) */
    uint16_t dim_kernel_x;    /**< filter width (Fw) */
    uint16_t dim_kernel_y;    /**< filter height (Fh) */
    uint16_t padding_y_top;   /**< zero padding rows added at the top */
    uint16_t padding_y_bottom;/**< zero padding rows added at the bottom */
    uint16_t padding_x_left;  /**< zero padding columns added on the left */
    uint16_t padding_x_right; /**< zero padding columns added on the right */
    uint16_t stride_x;        /**< convolution stride along x */
    uint16_t stride_y;        /**< convolution stride along y */
    int8_t *bias;             /**< bias pointer; NOTE(review): int8_t in an fp64 kernel — confirm against the kernel implementation */
    uint16_t bias_shift;      /**< bias shift amount — presumably unused in the fp path; verify */
    uint16_t out_shift;       /**< output shift amount — presumably unused in the fp path; verify */
    uint16_t out_mult;        /**< output multiplier — presumably unused in the fp path; verify */
    double *pOutBuffer;       /**< pointer to the output feature map, H x W x C */
    uint16_t dim_out_x;       /**< output feature map width */
    uint16_t dim_out_y;       /**< output feature map height */
    double *kappa;            /**< per-channel multiplication factor for BatchNorm */
    double *lambda;           /**< per-channel bias for BatchNorm */
    uint8_t *pIm2ColBuffer;   /**< im2col scratch buffer — unused by the current direct-convolution implementation (see function docs) */
    int flag_relu;            /**< non-zero: apply ReLU to the output */
    int flag_batch_norm;      /**< non-zero: apply BatchNorm (kappa/lambda) to the output */
    int flag_y_accumulate_start; /**< accumulation control flag — assumed to mark the first output tile (init vs. accumulate); TODO confirm */
    int flag_y_accumulate_end;   /**< accumulation control flag — assumed to mark the last output tile (apply fused BN/ReLU); TODO confirm */
    unsigned int *memory_chan;   /**< memory/DMA channel handle — semantics not visible here; see kernel implementation */
} kernel_fp64;
/**
 * @brief Double-precision fp fused convolutional kernel (Conv2d +
 * optional BatchNorm + optional ReLU, enabled via flags in @p k).
 * Direct convolution without im2col; input/output feature maps are
 * HxWxC, weights CoxFhxFwxCi.
 * @param k kernel_fp64 struct reference that holds all parameters
 */
void occamy_conv_opt_fp64(kernel_fp64 *k);
/**
 * @brief Single-precision fp fused convolutional kernel (Conv2d +
 * optional BatchNorm + optional ReLU, enabled via flags in @p k).
 * Direct convolution without im2col; input/output feature maps are
 * HxWxC, weights CoxFhxFwxCi.
 * @param k kernel_fp32 struct reference that holds all parameters
 */
void occamy_conv_opt_fp32(kernel_fp32 *k);
/**
 * @brief Single-precision fp DEPTHWISE fused convolutional kernel
 * (Conv2d + optional BatchNorm + optional ReLU, enabled via flags).
 * Direct convolution without im2col; input/output feature maps are
 * HxWxC, weights CoxFhxFwxCi.
 * @param k kernel_fp32 struct reference that holds all parameters
 */
void occamy_conv_dw_opt_fp32(kernel_fp32 *k);
/**
 * @brief Single-precision fp fused convolutional kernel with CxHxW
 * input layout (weights CoxFhxFwxCi, output HxWxC). Intended for the
 * first network layers where Ci is very small and usually odd.
 * Fuses Conv2d + optional BatchNorm + optional ReLU via flags.
 * @param k kernel_fp32 struct reference that holds all parameters
 */
void occamy_conv_chw_opt_fp32(kernel_fp32 *k);
/**
 * @brief Helper that applies Batch Normalization and/or ReLU over a
 * feature map.
 * NOTE(review): pBuffer is declared const yet is the only feature-map
 * pointer — an in-place BN/ReLU would have to write through it; confirm
 * the const qualifier against the definition.
 * @param pBuffer pointer to the feature map
 * @param dim_x width of the feature map
 * @param dim_y height of the feature map
 * @param ch number of channels (SIMD requires a multiple of 2)
 * @param kappa per-channel multiplication factor for BatchNorm
 * @param lambda per-channel bias for BatchNorm
 * @param flag_relu non-zero: apply ReLU
 * @param flag_batch_norm non-zero: apply BatchNorm
 */
void bn_relu(const float *pBuffer, const uint16_t dim_x, const uint16_t dim_y,
             const uint16_t ch, float *kappa, float *lambda, int flag_relu,
             int flag_batch_norm);
Updated on 2023-06-19 at 09:43:56 +0000