/github/workspace/src/DistanceFunctions/plp_cosine_distance/plp_cosine_distance_f32_parallel.c

Functions

	Name
void	plp_cosine_distance_f32_parallel(const float32_t *restrict pSrcA, const float32_t *restrict pSrcB, uint32_t blockSize, uint32_t nPE, float32_t *restrict pRes) Glue code for parallel cosine distance between 32-bit float vectors.

Functions Documentation

function plp_cosine_distance_f32_parallel

void plp_cosine_distance_f32_parallel(
    const float32_t *__restrict__ pSrcA,
    const float32_t *__restrict__ pSrcB,
    uint32_t blockSize,
    uint32_t nPE,
    float32_t *__restrict__ pRes
)

Glue code for parallel cosine distance between 32-bit float vectors.

Parameters:

pSrcA points to the first input vector
pSrcB points to the second input vector
blockSize number of samples in each vector
nPE number of parallel processing units
pRes output result returned here

Return: none

Source code

/* =====================================================================
 * Project:      PULP DSP Library
 * Title:        plp_cosine_distance_f32_parallel.c
 * Description:  32-bit floating point cosine distance kernel for RV32IM
 *
 * $Date:        21. March 2022
 * $Revision:    V0
 *
 * Target Processor: PULP cores
 * ===================================================================== */
/*
 * Copyright (C) 2022 ETH Zurich and University of Bologna.
 *
 * Author: Marco Bertuletti ETH Zurich
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plp_math.h"

/**
 * @brief Glue code for parallel cosine distance between 32-bit float vectors.
 *
 * Forks plp_cosine_distance_f32p_xpulpv2 onto a fixed team of two cores,
 * sums the per-core entries of the three result buffers, and writes
 * 1 - dot / sqrt(pwrA * pwrB) to the output.
 *
 * @param[in]  pSrcA     points to the first input vector
 * @param[in]  pSrcB     points to the second input vector
 * @param[in]  blockSize number of samples in each vector
 * @param[in]  nPE       number of parallel processing units; not read by this
 *                       implementation, which always forks onto two cores
 * @param[out] pRes      output result returned here; left untouched when the
 *                       function is called from the fabric controller
 *
 * @return none
 */
void plp_cosine_distance_f32_parallel(  const float32_t *__restrict__ pSrcA,
                                        const float32_t *__restrict__ pSrcB,
                                        uint32_t blockSize,
                                        uint32_t nPE,
                                        float32_t *__restrict__ pRes) {
    /* Size of the core team used by the fork below, and of every partial-result buffer. */
    enum { NUM_WORKERS = 2 };

    /* NOTE(review): nPE is part of the public signature but is not used here. */
    (void)nPE;

    /* Guard clause: the team fork is only available on the cluster side. */
    if (hal_cluster_id() == ARCHI_FC_CID) {
        printf("parallel processing supported only for cluster side\n");
        return;
    }

    /* One slot per worker core; presumably filled by the kernel with that
     * core's partial sums (power of A, power of B, dot product) — confirm
     * against plp_cosine_distance_f32p_xpulpv2. */
    float32_t partialPwrA[NUM_WORKERS];
    float32_t partialPwrB[NUM_WORKERS];
    float32_t partialDot[NUM_WORKERS];

    /* NOTE(review): blockSize / NUM_WORKERS truncates, and this glue code has
     * no tail loop. For an odd blockSize the last sample is only accounted for
     * if the kernel handles the remainder itself — confirm. */
    plp_cosine_distance_instance_f32 args = {
        .pSrcA = pSrcA,
        .pSrcB = pSrcB,
        .blkSizePE = blockSize / NUM_WORKERS,
        .nPE = NUM_WORKERS,
        .resBuffer_A = partialPwrA,
        .resBuffer_B = partialPwrB,
        .resBuffer_dot = partialDot,
    };

    hal_cl_team_fork(NUM_WORKERS, plp_cosine_distance_f32p_xpulpv2, (void *)&args);

    /* Reduce the per-core partial results, in core order. */
    float32_t powerA = 0.0f;
    float32_t powerB = 0.0f;
    float32_t dotAB = 0.0f;
    for (uint32_t core = 0; core < NUM_WORKERS; core++) {
        powerA += partialPwrA[core];
        powerB += partialPwrB[core];
        dotAB += partialDot[core];
    }

    /* distance = 1 - dot / sqrt(powerA * powerB); the square root is taken in place. */
    float32_t norm = powerA * powerB;
    plp_sqrt_f32(&norm, &norm);
    *pRes = 1.0f - dotAB / norm;
}

Updated on 2023-03-01 at 16:16:32 +0000