applications/src/kernels/gemm.h

Functions

	Name
void	gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double * A, uint32_t ldA, uint32_t ta, double * B, uint32_t ldB, uint32_t tb, double * C, uint32_t ldC, const double ALPHA) baseline implementation of a FP64 GEMM
void	gemm_fp64_opt(uint32_t M, uint32_t N, uint32_t K, double * A, uint32_t ldA, uint32_t ta, double * B, uint32_t ldB, uint32_t tb, double * C, uint32_t ldC, const uint32_t * ALPHA, uint32_t setup_SSR) implementation of a FP64 GEMM with configured SSRs and frep loop.
void	gemm_fp32_opt(const uint32_t M, const uint32_t N, const uint32_t K, float * A, const uint32_t ldA, float * B, const uint32_t ldB, float * C, const uint32_t ldC, const uint32_t * ALPHA, const uint32_t setup_SSR) implementation of a FP32 SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.
void	gemm_fp16_opt(uint32_t M, uint32_t N, uint32_t K, __fp16 * A, uint32_t ldA, __fp16 * B, uint32_t ldB, __fp16 * C, uint32_t ldC, const uint32_t * ALPHA, uint32_t setup_SSR) implementation of a non-expanding SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.
void	gemm_fp16_ex_opt(uint32_t M, uint32_t N, uint32_t K, __fp16 * A, uint32_t ldA, __fp16 * B, uint32_t ldB, __fp16 * C, uint32_t ldC, const uint32_t * ALPHA, uint32_t setup_SSR) implementation of a FP16 -> FP32 expanding SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.
void	gemm_fp8_ex_opt(uint32_t M, uint32_t N, uint32_t K, char * A, uint32_t ldA, char * B, uint32_t ldB, char * C, uint32_t ldC, const uint32_t * ALPHA, uint32_t setup_SSR) implementation of a FP8->FP16 expanding SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.

Functions Documentation

function gemm_fp64_baseline

void gemm_fp64_baseline(
    uint32_t M,
    uint32_t N,
    uint32_t K,
    double * A,
    uint32_t ldA,
    uint32_t ta,
    double * B,
    uint32_t ldB,
    uint32_t tb,
    double * C,
    uint32_t ldC,
    const double ALPHA
)

baseline implementation of a FP64 GEMM

Parameters:

M number of rows of matrix A
N number of columns of matrix B
K number of columns of matrix A
A pointer to matrix A
ldA row stride in matrix A
ta transposed memory layout for matrix A
B pointer to matrix B
ldB row stride in matrix B
tb transposed memory layout for matrix B
C pointer to matrix C
ldC row stride in matrix C
ALPHA accumulate factor of C

function gemm_fp64_opt

void gemm_fp64_opt(
    uint32_t M,
    uint32_t N,
    uint32_t K,
    double * A,
    uint32_t ldA,
    uint32_t ta,
    double * B,
    uint32_t ldB,
    uint32_t tb,
    double * C,
    uint32_t ldC,
    const uint32_t * ALPHA,
    uint32_t setup_SSR
)

implementation of a FP64 GEMM with configured SSRs and frep loop.

Parameters:

M number of rows of matrix A
N number of columns of matrix B
K number of columns of matrix A
A pointer to matrix A
ldA row stride in matrix A
ta transposed memory layout for matrix A
B pointer to matrix B
ldB row stride in matrix B
tb transposed memory layout for matrix B
C pointer to matrix C
ldC row stride in matrix C
ALPHA accumulate factor of C
setup_SSR setup SSR bounds and strides

function gemm_fp32_opt

void gemm_fp32_opt(
    const uint32_t M,
    const uint32_t N,
    const uint32_t K,
    float * A,
    const uint32_t ldA,
    float * B,
    const uint32_t ldB,
    float * C,
    const uint32_t ldC,
    const uint32_t * ALPHA,
    const uint32_t setup_SSR
)

implementation of a FP32 SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.

Parameters:

M number of rows of matrix A
N number of columns of matrix B
K number of columns of matrix A
A pointer to matrix A
ldA row stride in matrix A
B pointer to matrix B
ldB row stride in matrix B
C pointer to matrix C
ldC row stride in matrix C
ALPHA accumulate factor of C
setup_SSR setup SSR bounds and strides

Return: * void

function gemm_fp16_opt

void gemm_fp16_opt(
    uint32_t M,
    uint32_t N,
    uint32_t K,
    __fp16 * A,
    uint32_t ldA,
    __fp16 * B,
    uint32_t ldB,
    __fp16 * C,
    uint32_t ldC,
    const uint32_t * ALPHA,
    uint32_t setup_SSR
)

implementation of a non-expanding SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.

Parameters:

M number of rows of matrix A
N number of columns of matrix B
K number of columns of matrix A
A pointer to matrix A
ldA row stride in matrix A
B pointer to matrix B
ldB row stride in matrix B
C pointer to matrix C
ldC row stride in matrix C
ALPHA accumulate factor of C
setup_SSR setup SSR bounds and strides

Return: * void

function gemm_fp16_ex_opt

void gemm_fp16_ex_opt(
    uint32_t M,
    uint32_t N,
    uint32_t K,
    __fp16 * A,
    uint32_t ldA,
    __fp16 * B,
    uint32_t ldB,
    __fp16 * C,
    uint32_t ldC,
    const uint32_t * ALPHA,
    uint32_t setup_SSR
)

implementation of a FP16 -> FP32 expanding SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.

Parameters:

M number of rows of matrix A
N number of columns of matrix B
K number of columns of matrix A
A pointer to matrix A
ldA row stride in matrix A
B pointer to matrix B
ldB row stride in matrix B
C pointer to matrix C
ldC row stride in matrix C
ALPHA accumulate factor of C
setup_SSR setup SSR bounds and strides

Return: * void

function gemm_fp8_ex_opt

void gemm_fp8_ex_opt(
    uint32_t M,
    uint32_t N,
    uint32_t K,
    char * A,
    uint32_t ldA,
    char * B,
    uint32_t ldB,
    char * C,
    uint32_t ldC,
    const uint32_t * ALPHA,
    uint32_t setup_SSR
)

implementation of a FP8->FP16 expanding SIMD GEMM with configured SSRs and frep loop. Matrix B has to be stored in transposed/consecutive memory layout in order to support SIMD instructions.

Parameters:

M number of rows of matrix A
N number of columns of matrix B
K number of columns of matrix A
A pointer to matrix A
ldA row stride in matrix A
B pointer to matrix B
ldB row stride in matrix B
C pointer to matrix C
ldC row stride in matrix C
ALPHA accumulate factor of C
setup_SSR setup SSR bounds and strides

Return: * void

Source code

// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <stdint.h>

/**
 * @brief Baseline implementation of a FP64 GEMM.
 *
 * @param M     number of rows of matrix A
 * @param N     number of columns of matrix B
 * @param K     number of columns of matrix A
 * @param A     pointer to matrix A
 * @param ldA   row stride in matrix A
 * @param ta    transposed memory layout for matrix A
 * @param B     pointer to matrix B
 * @param ldB   row stride in matrix B
 * @param tb    transposed memory layout for matrix B
 * @param C     pointer to matrix C
 * @param ldC   row stride in matrix C
 * @param ALPHA accumulate factor of C
 */
void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A,
                        uint32_t ldA, uint32_t ta, double* B, uint32_t ldB,
                        uint32_t tb, double* C, uint32_t ldC,
                        const double ALPHA);

/**
 * @brief Implementation of a FP64 GEMM with configured SSRs and frep loop.
 *
 * @param M         number of rows of matrix A
 * @param N         number of columns of matrix B
 * @param K         number of columns of matrix A
 * @param A         pointer to matrix A
 * @param ldA       row stride in matrix A
 * @param ta        transposed memory layout for matrix A
 * @param B         pointer to matrix B
 * @param ldB       row stride in matrix B
 * @param tb        transposed memory layout for matrix B
 * @param C         pointer to matrix C
 * @param ldC       row stride in matrix C
 * @param ALPHA     accumulate factor of C
 * @param setup_SSR setup SSR bounds and strides
 *
 * NOTE(review): ALPHA is a pointer to uint32_t here, whereas
 * gemm_fp64_baseline takes it as a double by value - confirm the expected
 * encoding against the implementation.
 */
void gemm_fp64_opt(uint32_t M, uint32_t N, uint32_t K, double* A, uint32_t ldA,
                   uint32_t ta, double* B, uint32_t ldB, uint32_t tb, double* C,
                   uint32_t ldC, const uint32_t* ALPHA, uint32_t setup_SSR);

/**
 * @brief Implementation of a FP32 SIMD GEMM with configured SSRs and frep
 *        loop. Matrix B has to be stored in transposed/consecutive memory
 *        layout in order to support SIMD instructions.
 *
 * @param M         number of rows of matrix A
 * @param N         number of columns of matrix B
 * @param K         number of columns of matrix A
 * @param A         pointer to matrix A
 * @param ldA       row stride in matrix A
 * @param B         pointer to matrix B
 * @param ldB       row stride in matrix B
 * @param C         pointer to matrix C
 * @param ldC       row stride in matrix C
 * @param ALPHA     accumulate factor of C
 * @param setup_SSR setup SSR bounds and strides
 */
void gemm_fp32_opt(const uint32_t M, const uint32_t N, const uint32_t K,
                   float* A, const uint32_t ldA, float* B, const uint32_t ldB,
                   float* C, const uint32_t ldC, const uint32_t* ALPHA,
                   const uint32_t setup_SSR);

/**
 * @brief Implementation of a non-expanding SIMD GEMM with configured SSRs and
 *        frep loop. Matrix B has to be stored in transposed/consecutive
 *        memory layout in order to support SIMD instructions.
 *
 * @param M         number of rows of matrix A
 * @param N         number of columns of matrix B
 * @param K         number of columns of matrix A
 * @param A         pointer to matrix A
 * @param ldA       row stride in matrix A
 * @param B         pointer to matrix B
 * @param ldB       row stride in matrix B
 * @param C         pointer to matrix C
 * @param ldC       row stride in matrix C
 * @param ALPHA     accumulate factor of C
 * @param setup_SSR setup SSR bounds and strides
 */
void gemm_fp16_opt(uint32_t M, uint32_t N, uint32_t K, __fp16* A, uint32_t ldA,
                   __fp16* B, uint32_t ldB, __fp16* C, uint32_t ldC,
                   const uint32_t* ALPHA, uint32_t setup_SSR);
/**
 * @brief Implementation of a FP16 -> FP32 expanding SIMD GEMM with configured
 *        SSRs and frep loop. Matrix B has to be stored in
 *        transposed/consecutive memory layout in order to support SIMD
 *        instructions.
 *
 * @param M         number of rows of matrix A
 * @param N         number of columns of matrix B
 * @param K         number of columns of matrix A
 * @param A         pointer to matrix A
 * @param ldA       row stride in matrix A
 * @param B         pointer to matrix B
 * @param ldB       row stride in matrix B
 * @param C         pointer to matrix C
 * @param ldC       row stride in matrix C
 * @param ALPHA     accumulate factor of C
 * @param setup_SSR setup SSR bounds and strides
 *
 * NOTE(review): the kernel is described as expanding to FP32, yet C is
 * declared as __fp16* - confirm the element type actually written to C.
 */
void gemm_fp16_ex_opt(uint32_t M, uint32_t N, uint32_t K, __fp16* A,
                      uint32_t ldA, __fp16* B, uint32_t ldB, __fp16* C,
                      uint32_t ldC, const uint32_t* ALPHA, uint32_t setup_SSR);

/**
 * @brief Implementation of a FP8 -> FP16 expanding SIMD GEMM with configured
 *        SSRs and frep loop. Matrix B has to be stored in
 *        transposed/consecutive memory layout in order to support SIMD
 *        instructions.
 *
 * @param M         number of rows of matrix A
 * @param N         number of columns of matrix B
 * @param K         number of columns of matrix A
 * @param A         pointer to matrix A
 * @param ldA       row stride in matrix A
 * @param B         pointer to matrix B
 * @param ldB       row stride in matrix B
 * @param C         pointer to matrix C
 * @param ldC       row stride in matrix C
 * @param ALPHA     accumulate factor of C
 * @param setup_SSR setup SSR bounds and strides
 *
 * NOTE(review): all three matrices are passed as char*; presumably the FP8
 * (and FP16 result) data is carried as raw bytes - confirm the element width
 * of C against the implementation.
 */
void gemm_fp8_ex_opt(uint32_t M, uint32_t N, uint32_t K, char* A, uint32_t ldA,
                     char* B, uint32_t ldB, char* C, uint32_t ldC,
                     const uint32_t* ALPHA, uint32_t setup_SSR);

Updated on 2023-06-19 at 09:43:56 +0000