applications/src/kernels/gemm.h
Functions
Name | |
---|---|
void | gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double * A, uint32_t ldA, uint32_t ta, double * B, uint32_t ldB, uint32_t tb, double * C, uint32_t ldC, const double ALPHA) baseline implementation of a FP64 GEMM |
void | gemm_fp64_opt(uint32_t M, uint32_t N, uint32_t K, double * A, uint32_t ldA, uint32_t ta, double * B, uint32_t ldB, uint32_t tb, double * C, uint32_t ldC, const uint32_t * ALPHA, uint32_t setup_SSR) implementation of a FP64 GEMM with configured SSRs and frep loop. |
void | gemm_fp32_opt(const uint32_t M, const uint32_t N, const uint32_t K, float * A, const uint32_t ldA, float * B, const uint32_t ldB, float * C, const uint32_t ldC, const uint32_t * ALPHA, const uint32_t setup_SSR) implementation of a FP32 SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions. |
void | gemm_fp16_opt(uint32_t M, uint32_t N, uint32_t K, __fp16 * A, uint32_t ldA, __fp16 * B, uint32_t ldB, __fp16 * C, uint32_t ldC, const uint32_t * ALPHA, uint32_t setup_SSR) implementation of a non-expanding FP16 SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions. |
void | gemm_fp16_ex_opt(uint32_t M, uint32_t N, uint32_t K, __fp16 * A, uint32_t ldA, __fp16 * B, uint32_t ldB, __fp16 * C, uint32_t ldC, const uint32_t * ALPHA, uint32_t setup_SSR) implementation of a FP16 -> FP32 expanding SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions. |
void | gemm_fp8_ex_opt(uint32_t M, uint32_t N, uint32_t K, char * A, uint32_t ldA, char * B, uint32_t ldB, char * C, uint32_t ldC, const uint32_t * ALPHA, uint32_t setup_SSR) implementation of a FP8->FP16 expanding SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions. |
Functions Documentation
function gemm_fp64_baseline
void gemm_fp64_baseline(
uint32_t M,
uint32_t N,
uint32_t K,
double * A,
uint32_t ldA,
uint32_t ta,
double * B,
uint32_t ldB,
uint32_t tb,
double * C,
uint32_t ldC,
const double ALPHA
)
baseline implementation of a FP64 GEMM
Parameters:
- M number of rows of matrix A
- N number of columns of matrix B
- K number of columns of matrix A
- A pointer to matrix A
- ldA row stride in matrix A
- ta transposed memory layout for matrix A
- B pointer to matrix B
- ldB row stride in matrix B
- tb transposed memory layout for matrix B
- C pointer to matrix C
- ldC row stride in matrix C
- ALPHA accumulate factor of C
function gemm_fp64_opt
void gemm_fp64_opt(
uint32_t M,
uint32_t N,
uint32_t K,
double * A,
uint32_t ldA,
uint32_t ta,
double * B,
uint32_t ldB,
uint32_t tb,
double * C,
uint32_t ldC,
const uint32_t * ALPHA,
uint32_t setup_SSR
)
implementation of a FP64 GEMM with configured SSRs and frep loop.
Parameters:
- M number of rows of matrix A
- N number of columns of matrix B
- K number of columns of matrix A
- A pointer to matrix A
- ldA row stride in matrix A
- ta transposed memory layout for matrix A
- B pointer to matrix B
- ldB row stride in matrix B
- tb transposed memory layout for matrix B
- C pointer to matrix C
- ldC row stride in matrix C
- ALPHA accumulate factor of C
- setup_SSR setup SSR bounds and strides
function gemm_fp32_opt
void gemm_fp32_opt(
const uint32_t M,
const uint32_t N,
const uint32_t K,
float * A,
const uint32_t ldA,
float * B,
const uint32_t ldB,
float * C,
const uint32_t ldC,
const uint32_t * ALPHA,
const uint32_t setup_SSR
)
implementation of a FP32 SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.
Parameters:
- M number of rows of matrix A
- N number of columns of matrix B
- K number of columns of matrix A
- A pointer to matrix A
- ldA row stride in matrix A
- B pointer to matrix B
- ldB row stride in matrix B
- C pointer to matrix C
- ldC row stride in matrix C
- ALPHA accumulate factor of C
- setup_SSR setup SSR bounds and strides
Return: void
function gemm_fp16_opt
void gemm_fp16_opt(
uint32_t M,
uint32_t N,
uint32_t K,
__fp16 * A,
uint32_t ldA,
__fp16 * B,
uint32_t ldB,
__fp16 * C,
uint32_t ldC,
const uint32_t * ALPHA,
uint32_t setup_SSR
)
implementation of a non-expanding FP16 SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.
Parameters:
- M number of rows of matrix A
- N number of columns of matrix B
- K number of columns of matrix A
- A pointer to matrix A
- ldA row stride in matrix A
- B pointer to matrix B
- ldB row stride in matrix B
- C pointer to matrix C
- ldC row stride in matrix C
- ALPHA accumulate factor of C
- setup_SSR setup SSR bounds and strides
Return: void
function gemm_fp16_ex_opt
void gemm_fp16_ex_opt(
uint32_t M,
uint32_t N,
uint32_t K,
__fp16 * A,
uint32_t ldA,
__fp16 * B,
uint32_t ldB,
__fp16 * C,
uint32_t ldC,
const uint32_t * ALPHA,
uint32_t setup_SSR
)
implementation of a FP16 -> FP32 expanding SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.
Parameters:
- M number of rows of matrix A
- N number of columns of matrix B
- K number of columns of matrix A
- A pointer to matrix A
- ldA row stride in matrix A
- B pointer to matrix B
- ldB row stride in matrix B
- C pointer to matrix C
- ldC row stride in matrix C
- ALPHA accumulate factor of C
- setup_SSR setup SSR bounds and strides
Return: void
function gemm_fp8_ex_opt
void gemm_fp8_ex_opt(
uint32_t M,
uint32_t N,
uint32_t K,
char * A,
uint32_t ldA,
char * B,
uint32_t ldB,
char * C,
uint32_t ldC,
const uint32_t * ALPHA,
uint32_t setup_SSR
)
implementation of a FP8->FP16 expanding SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.
Parameters:
- M number of rows of matrix A
- N number of columns of matrix B
- K number of columns of matrix A
- A pointer to matrix A
- ldA row stride in matrix A
- B pointer to matrix B
- ldB row stride in matrix B
- C pointer to matrix C
- ldC row stride in matrix C
- ALPHA accumulate factor of C
- setup_SSR setup SSR bounds and strides
Return: void
Source code
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include <stdint.h>
/**
 * @brief Baseline implementation of an FP64 GEMM.
 *
 * @param M     number of rows of matrix A
 * @param N     number of columns of matrix B
 * @param K     number of columns of matrix A
 * @param A     pointer to matrix A
 * @param ldA   row stride in matrix A
 * @param ta    transposed memory layout for matrix A
 * @param B     pointer to matrix B
 * @param ldB   row stride in matrix B
 * @param tb    transposed memory layout for matrix B
 * @param C     pointer to matrix C
 * @param ldC   row stride in matrix C
 * @param ALPHA accumulate factor of C (passed by value as a double here)
 */
void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A,
uint32_t ldA, uint32_t ta, double* B, uint32_t ldB,
uint32_t tb, double* C, uint32_t ldC,
const double ALPHA);
/**
 * @brief Implementation of an FP64 GEMM with configured SSRs and frep loop.
 *
 * @param M         number of rows of matrix A
 * @param N         number of columns of matrix B
 * @param K         number of columns of matrix A
 * @param A         pointer to matrix A
 * @param ldA       row stride in matrix A
 * @param ta        transposed memory layout for matrix A
 * @param B         pointer to matrix B
 * @param ldB       row stride in matrix B
 * @param tb        transposed memory layout for matrix B
 * @param C         pointer to matrix C
 * @param ldC       row stride in matrix C
 * @param ALPHA     accumulate factor of C.
 *                  NOTE(review): declared as a pointer to uint32_t, whereas
 *                  gemm_fp64_baseline takes a double by value; confirm how
 *                  the pointed-to value is interpreted.
 * @param setup_SSR setup SSR bounds and strides
 */
void gemm_fp64_opt(uint32_t M, uint32_t N, uint32_t K, double* A, uint32_t ldA,
uint32_t ta, double* B, uint32_t ldB, uint32_t tb, double* C,
uint32_t ldC, const uint32_t* ALPHA, uint32_t setup_SSR);
/**
 * @brief Implementation of an FP32 SIMD GEMM with configured SSRs and frep
 *        loop. Matrix B has to be stored in transposed/consecutive memory
 *        layout in order to support SIMD instructions.
 *
 * @param M         number of rows of matrix A
 * @param N         number of columns of matrix B
 * @param K         number of columns of matrix A
 * @param A         pointer to matrix A
 * @param ldA       row stride in matrix A
 * @param B         pointer to matrix B
 * @param ldB       row stride in matrix B
 * @param C         pointer to matrix C
 * @param ldC       row stride in matrix C
 * @param ALPHA     accumulate factor of C (passed as a pointer to uint32_t;
 *                  TODO confirm how the pointed-to value is interpreted)
 * @param setup_SSR setup SSR bounds and strides
 */
void gemm_fp32_opt(const uint32_t M, const uint32_t N, const uint32_t K,
float* A, const uint32_t ldA, float* B, const uint32_t ldB,
float* C, const uint32_t ldC, const uint32_t* ALPHA,
const uint32_t setup_SSR);
/**
 * @brief Implementation of a non-expanding SIMD GEMM on __fp16 operands with
 *        configured SSRs and frep loop. Matrix B has to be stored in
 *        transposed/consecutive memory layout in order to support SIMD
 *        instructions.
 *
 * @param M         number of rows of matrix A
 * @param N         number of columns of matrix B
 * @param K         number of columns of matrix A
 * @param A         pointer to matrix A
 * @param ldA       row stride in matrix A
 * @param B         pointer to matrix B
 * @param ldB       row stride in matrix B
 * @param C         pointer to matrix C
 * @param ldC       row stride in matrix C
 * @param ALPHA     accumulate factor of C (passed as a pointer to uint32_t;
 *                  TODO confirm how the pointed-to value is interpreted)
 * @param setup_SSR setup SSR bounds and strides
 */
void gemm_fp16_opt(uint32_t M, uint32_t N, uint32_t K, __fp16* A, uint32_t ldA,
__fp16* B, uint32_t ldB, __fp16* C, uint32_t ldC,
const uint32_t* ALPHA, uint32_t setup_SSR);
/**
 * @brief Implementation of an FP16 -> FP32 expanding SIMD GEMM with
 *        configured SSRs and frep loop. Matrix B has to be stored in
 *        transposed/consecutive memory layout in order to support SIMD
 *        instructions.
 *
 * @param M         number of rows of matrix A
 * @param N         number of columns of matrix B
 * @param K         number of columns of matrix A
 * @param A         pointer to matrix A
 * @param ldA       row stride in matrix A
 * @param B         pointer to matrix B
 * @param ldB       row stride in matrix B
 * @param C         pointer to matrix C.
 *                  NOTE(review): declared as __fp16 * although the kernel is
 *                  described as FP16 -> FP32 expanding; confirm the element
 *                  type actually stored in C.
 * @param ldC       row stride in matrix C
 * @param ALPHA     accumulate factor of C (passed as a pointer to uint32_t;
 *                  TODO confirm how the pointed-to value is interpreted)
 * @param setup_SSR setup SSR bounds and strides
 */
void gemm_fp16_ex_opt(uint32_t M, uint32_t N, uint32_t K, __fp16* A,
uint32_t ldA, __fp16* B, uint32_t ldB, __fp16* C,
uint32_t ldC, const uint32_t* ALPHA, uint32_t setup_SSR);
/**
 * @brief Implementation of an FP8 -> FP16 expanding SIMD GEMM with configured
 *        SSRs and frep loop. Matrix B has to be stored in
 *        transposed/consecutive memory layout in order to support SIMD
 *        instructions.
 *
 * @param M         number of rows of matrix A
 * @param N         number of columns of matrix B
 * @param K         number of columns of matrix A
 * @param A         pointer to matrix A (declared as char *)
 * @param ldA       row stride in matrix A
 * @param B         pointer to matrix B (declared as char *)
 * @param ldB       row stride in matrix B
 * @param C         pointer to matrix C.
 *                  NOTE(review): declared as char * although the kernel is
 *                  described as FP8 -> FP16 expanding; confirm the element
 *                  type actually stored in C.
 * @param ldC       row stride in matrix C
 * @param ALPHA     accumulate factor of C (passed as a pointer to uint32_t;
 *                  TODO confirm how the pointed-to value is interpreted)
 * @param setup_SSR setup SSR bounds and strides
 */
void gemm_fp8_ex_opt(uint32_t M, uint32_t N, uint32_t K, char* A, uint32_t ldA,
char* B, uint32_t ldB, char* C, uint32_t ldC,
const uint32_t* ALPHA, uint32_t setup_SSR);
Updated on 2023-06-19 at 09:43:56 +0000