CUDA Basic编程

1.向量点积 cuda dot product

Question 1: Dot product
Dot product is a reduction from vectors to a scalar.
Please implement the kernel of dot product in CUDA. The host code is provided. Your work will be evaluated by accuracy and efficiency.
计算两个向量的点积经典的官方教程

#include <stdio.h>

#define imin(a,b) (a<b?a:b)

const int N = 4096 * 4096;
const int threadsPerBlock = 512;
const int blocksPerGrid =  imin(32, (N+threadsPerBlock-1) / threadsPerBlock);     //32768.99取整


__global__ void dot( float *a, float *b, float *c ) {
    // Shared memory for results of multiplication
    __shared__ float temp[threadsPerBlock];             //对于每个block缓存对应的thread的点积
    int tid = threadIdx.x + blockIdx.x * blockDim.x;   //tid为线程的偏移量 
    float move = 0;
    while (tid < N){
        move += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }
    temp[threadIdx.x] = move;

    //wait all threads calculate over
    __syncthreads();

    // Thread 0 sums the pairwise products
    // calculate current block. 而不是将thread_sum放在thread_0计算 
    int i = blockDim.x/2;
    while (i != 0){
        if (threadIdx.x < i)
            temp[threadIdx.x] += temp[threadIdx.x + i];
        __syncthreads();
        i /= 2;
    }
    if (threadIdx.x == 0)
        c[blockIdx.x] = temp[0];
}


int main( void ) {
    float   *a, *b, c, *partial_c;
    float   *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the cpu side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_a,
                              N*sizeof(float) ) ;
    cudaMalloc( (void**)&dev_b,
                              N*sizeof(float) ) ;
    cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) );        //注意这里开辟空间的大小

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) ;
    cudaMemcpy( dev_b, b, N*sizeof(float),
                              cudaMemcpyHostToDevice ) ; 

    //注意thread的wait
    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );

    // copy the array 'c' back from the GPU to the CPU
   cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) ;

    // finish up on the CPU side
    // 统计block_sum in grid
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    //#define sum_squares(x) (x*(x+1)*(2*x+1)/6)        //此程序相当于计算0,1,...N-1的平方和 
    //printf("Does GPU value %.10g = %.10g?\n", c, 2*sum_squares((float)(N-1)));
    //printf("Does GPU value %f = %f?\n", c, 2*sum_squares((float)(N-1)));

    // free memory on the gpu side
    cudaFree( dev_a ) ;
    cudaFree( dev_b ) ;
    cudaFree( dev_partial_c );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );
}

/*
blockDim.x,y,z gives the number of threads in a block, in the particular direction
gridDim.x,y,z gives the number of blocks in a grid, in the particular direction
blockDim.x * gridDim.x gives the number of threads in a grid (in the x direction, in this case)

1 block的官方examle
__global__ voiddot( int*a, int*b, int*c ) {
    __shared__ inttemp[N];
    temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
    __syncthreads();
    if( 0 == threadIdx.x ) {
        intsum = 0;
        for( inti= 0; i< N; i++ )
            sum += temp[i];
        *c = sum;
    }
}
*/

2.cuda实现枚举排序
枚举排序(Enumeration Sort)是一种最简单的排序算法，通常也称为秩排序(Rank Sort)。该算法的具体思想是（假设按关键字递增排序），对每一个待排序的元素统计小于它的所有元素的个数，从而得到该元素最终处于序列中的位置。假定待排序的n个数存在a[1], …, a[n]中。首先将a[1]与a[2],…, a[n]比较，记录比其小的数的个数，假设为k，则a[1]就被存入有序的数组b[1], b[2],…, b[n]的b[k+1]位置上；然后将a[2]与a[1], a[3],…, a[n]比较，记录比其小的数的个数，以此类推。这样的比较操作共n(n-1)次，所以串行秩排序的时间复杂度为O(n2). 请用CUDA并行枚举排序法

//感觉block没用好  是不是再遍历全部array之前有个combine操作？
//目前计算是9s    有点慢呢
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE      512

__global__ void RankSortKernel(const unsigned int *ori, unsigned int *sorted, unsigned int len)
{
    unsigned int final_location = 0;
    unsigned int same = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for(int i=0;i<len;i++){
        if(ori[tid] == ori[i])
            same ++;
        if(ori[tid] > ori[i])
            final_location ++;
    }
    for(int i=0;i<same;i++){
        sorted[final_location+i] = ori[tid];
    }
}

int main ( int argc, char *argv[] )
{
    unsigned int len  = 1024*1024;
    unsigned int *ori = new unsigned int[len];
    unsigned int *gpuSorted = new unsigned int[len];
    unsigned int *d_ori = NULL;
    unsigned int *d_sorted = NULL;
    unsigned int size = len*sizeof(unsigned int);
    unsigned int numBlock = len/BLOCK_SIZE;

    dim3 dimGrid(numBlock);                     //num of block in grid
    dim3 dimBlock(BLOCK_SIZE);
    cudaMalloc((void**)&d_ori, size);
    cudaMalloc((void**)&d_sorted, size);
    for (int i=0;i<len;i++)
        ori[i] = rand()%100;
    printf("init origin array ok\n");
    cudaMemcpy(d_ori, ori, size, cudaMemcpyHostToDevice);
    cudaMemset(d_sorted, 0, size);

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventRecord(start,0);
    RankSortKernel<<<dimGrid, dimBlock>>>(d_ori, d_sorted, len);            //cuda kernel函数
    //阻塞until device结束
    cudaThreadSynchronize();
    cudaEventCreate(&stop);
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start,stop);
    printf("sort_Elapsed time : %f ms\n" ,elapsedTime);


    cudaMemcpy(gpuSorted, d_sorted, len*sizeof(unsigned int), cudaMemcpyDeviceToHost);
    for (int i=0;i<len;i++){
        //printf("%d ",gpuSorted[i]);
        if (i==len-1)
            break;
        if(gpuSorted[i] > gpuSorted[i+1])
            printf("sort failed in %d\n",gpuSorted[i]);
    }
    printf("sort ok\n");
    cudaFree(d_ori);
    cudaFree(d_sorted);
    delete ori;
    delete gpuSorted;

    return 0;
}

/*
1.cuda中kernel函数<<<dim3>>>和<<<int,int>>>区别？
https://stackoverflow.com/questions/31141541/cuda-block-grid-dimensions-when-to-use-dim3

*/

猜你喜欢