CUDA实现L2欧式距离

文章目录

前言

  本教程实现一段CUDA代码，计算向量 A[5] 与矩阵 B[3][5] 的每一行之间的欧氏距离的平方（代码中没有开平方），共得到 3 个结果。

#include <stdio.h>
#include <stdlib.h>

#define N 5            // number of elements in the vector A and in each row of B
#define D 3            // number of rows in B, i.e. number of distances produced
#define SIZE  (N * D)  // total element count of B; parenthesized so it expands safely inside larger expressions


// Forward declaration of the distance kernel defined directly below.
void __global__ cpt(int *da, int *db, int *dres);

/*
 * Computes the squared Euclidean (L2) distance between the vector `da`
 * and every row of the matrix `db`. No square root is taken.
 *
 * da   : N ints, the query vector.
 * db   : D rows of N ints stored row-major (row r starts at db[r * N]).
 * dres : D ints; dres[r] receives sum over i of (da[i] - db[r * N + i])^2.
 *
 * Launch layout: indexing uses threadIdx.x only, so the kernel is meant to
 * be launched with a single block; thread t handles row t of `db`.
 * Threads with t >= D return without touching memory.
 */
void __global__ cpt(int *da, int *db, int *dres)
{
    int tid = threadIdx.x;   // row of db handled by this thread

    // Guard against launches with more than D threads per block.
    if (tid >= D)
    {
        return;
    }

    int sum = 0;             // per-thread accumulator
    for (int i = 0; i < N; ++i)
    {
        // Compute the element-wise difference once and square it.
        int diff = da[i] - db[tid * N + i];
        sum += diff * diff;
    }
    dres[tid] = sum;
}

// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: fills A with ones and B with zeros, copies them to the
 * device, launches `cpt` with one block of D threads, copies the D results
 * back and prints the first one.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or
 * any CUDA call fails.
 */
int main(int arc, char *argv[])
{
    // Command-line arguments are not used.
    (void)arc;
    (void)argv;

    // host memory and assignment
    int *ha, *hb, *hres;
    ha = (int *)malloc(sizeof(int) * N);
    hb = (int *)malloc(sizeof(int) * SIZE);
    hres = (int *)malloc(sizeof(int) * D);
    if (ha == NULL || hb == NULL || hres == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(ha);
        free(hb);
        free(hres);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N; ++i)
    {
        ha[i] = 1;
    }

    for (int i = 0; i < SIZE; ++i)
    {
        hb[i] = 0;
    }

    for (int i = 0; i < D; ++i)
    {
        hres[i] = 0;
    }

    // device memory and copy
    int *da, *db, *dres;
    checkCuda(cudaMalloc((void **)&da, sizeof(int) * N), "cudaMalloc(da)");
    checkCuda(cudaMalloc((void **)&db, sizeof(int) * SIZE), "cudaMalloc(db)");
    checkCuda(cudaMalloc((void **)&dres, sizeof(int) * D), "cudaMalloc(dres)");

    checkCuda(cudaMemcpy(da, ha, sizeof(int) * N, cudaMemcpyHostToDevice), "cudaMemcpy(da)");
    checkCuda(cudaMemcpy(db, hb, sizeof(int) * SIZE, cudaMemcpyHostToDevice), "cudaMemcpy(db)");
    checkCuda(cudaMemcpy(dres, hres, sizeof(int) * D, cudaMemcpyHostToDevice), "cudaMemcpy(dres)");

    // set threads and launch the global kernel: one block, one thread per row of B
    const dim3 grid_size(1);
    const dim3 block_size(D);

    cpt<<<grid_size, block_size>>>(da, db, dres);
    // A kernel launch returns no status; query it explicitly.
    checkCuda(cudaGetLastError(), "cpt launch");

    // copy device to host; this blocking copy also reports errors raised while the kernel ran
    checkCuda(cudaMemcpy(hres, dres, sizeof(int) * D, cudaMemcpyDeviceToHost), "cudaMemcpy(hres)");

    printf("%d\n", hres[0]);

    // free memory
    free(ha);
    free(hb);
    free(hres);
    checkCuda(cudaFree(da), "cudaFree(da)");
    checkCuda(cudaFree(db), "cudaFree(db)");
    checkCuda(cudaFree(dres), "cudaFree(dres)");

    return 0;
}

猜你喜欢

转载自blog.csdn.net/wulele2/article/details/119043340
今日推荐