Deep Learning Deployment (16): CUDA Runtime API vector-add: implementing vector addition with a CUDA kernel

1. Knowledge points

  1. The value of nthreads cannot exceed the maximum number of threads a block supports (1024 on most current GPUs). In practice, 512 or 256 is usually a good choice and performs well.
    • (input_size + block_size - 1) / block_size is integer division that rounds up (ceiling division)
  2. For a one-dimensional array, only the x dimension of the launch layout is needed. For two-dimensional data, such as an image, consider using both the x and y dimensions.
  3. About the calculation of the index when the data is regarded as one-dimensional
    • The general formula is as follows (see the sketch after this list)
    Pseudo code:
    position = 0
    for i in range(6):
        position *= dims[i]
        position += indexes[i]
    
    • For example, when only the x dimension is used, dims = [1, 1, gd, 1, 1, bd] and indexes = [0, 0, bi, 0, 0, ti]
      • Because of all the 0s and 1s, the loop simplifies to: idx = threadIdx.x + blockIdx.x * blockDim.x
      • That is: idx = ti + bi * bd
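
To make the general formula concrete, here is a minimal host-side sketch of my own (not from the original post; the values gd = 4, bd = 256, bi = 2, ti = 7 are arbitrary) showing that the 6-level flattening and the simplified 1D formula agree:

#include <stdio.h>

// Flatten a 6-level (dims, indexes) pair into one linear position,
// exactly as in the pseudocode above.
int flatten(const int dims[6], const int indexes[6]){
    int position = 0;
    for(int i = 0; i < 6; ++i){
        position = position * dims[i] + indexes[i];
    }
    return position;
}

int main(){
    // 1D launch: gridDim.x = gd = 4, blockDim.x = bd = 256,
    // thread with blockIdx.x = bi = 2 and threadIdx.x = ti = 7.
    int dims[6]    = {1, 1, 4, 1, 1, 256};
    int indexes[6] = {0, 0, 2, 0, 0, 7};

    // Both lines print 519, i.e. ti + bi * bd = 7 + 2 * 256.
    printf("general:    %d\n", flatten(dims, indexes));
    printf("simplified: %d\n", 7 + 2 * 256);
    return 0;
}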

2. main.cpp file

#include <cuda_runtime.h>
#include <stdio.h>

#define checkRuntime(op)  __check_cuda_runtime((op), #op, __FILE__, __LINE__)

bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
    if(code != cudaSuccess){
        // Report which call failed, where it failed, and why.
        const char* err_name    = cudaGetErrorName(code);
        const char* err_message = cudaGetErrorString(code);
        printf("runtime error %s:%d  %s failed. \n  code = %s, message = %s\n", file, line, op, err_name, err_message);
        return false;
    }
    return true;
}

// Implemented in the .cu file below; configures and launches the kernel.
void vector_add(const float* a, const float* b, float* c, int ndata);

int main(){
    const int size = 3;
    float vector_a[size] = {2, 3, 2};
    float vector_b[size] = {5, 3, 3};
    float vector_c[size] = {0};

    float* vector_a_device = nullptr;
    float* vector_b_device = nullptr;
    float* vector_c_device = nullptr;
    checkRuntime(cudaMalloc(&vector_a_device, size * sizeof(float)));
    checkRuntime(cudaMalloc(&vector_b_device, size * sizeof(float)));
    checkRuntime(cudaMalloc(&vector_c_device, size * sizeof(float)));

    checkRuntime(cudaMemcpy(vector_a_device, vector_a, size * sizeof(float), cudaMemcpyHostToDevice));
    checkRuntime(cudaMemcpy(vector_b_device, vector_b, size * sizeof(float), cudaMemcpyHostToDevice));

    vector_add(vector_a_device, vector_b_device, vector_c_device, size);
    checkRuntime(cudaMemcpy(vector_c, vector_c_device, size * sizeof(float), cudaMemcpyDeviceToHost));

    for(int i = 0; i < size; ++i){
        printf("vector_c[%d] = %f\n", i, vector_c[i]);
    }

    checkRuntime(cudaFree(vector_a_device));
    checkRuntime(cudaFree(vector_b_device));
    checkRuntime(cudaFree(vector_c_device));
    return 0;
}

First define three host arrays a, b, and c, then use cudaMalloc() to allocate three buffers on the GPU. The kernel computes a + b on the GPU and stores the result in c; cudaMemcpy then copies c from the GPU back to the host, where it is printed. Note that this device-to-host cudaMemcpy is synchronous, so it implicitly waits for the kernel to finish before copying.
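
As an aside, here is a minimal sketch of what checkRuntime reports when a call actually fails. The deliberately oversized allocation is a hypothetical example of mine, not part of the original program; it assumes the checkRuntime macro from main.cpp above.

// Inside main(), after the includes and macro above:
float* p = nullptr;
// Requesting ~0ull (the maximum size_t) bytes makes cudaMalloc fail,
// typically with cudaErrorMemoryAllocation, so checkRuntime prints
// something like (line number and exact message depend on the file and driver):
//   runtime error main.cpp:NN  cudaMalloc(&p, ~0ull) failed.
//     code = cudaErrorMemoryAllocation, message = out of memory
checkRuntime(cudaMalloc(&p, ~0ull));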

3. Case.cu file

#include <stdio.h>
#include <cuda_runtime.h>

__global__ void vector_add_kernel(const float* a, const float* b, float* c, int ndata){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx >= ndata) return;
    /*    dims                 indexes
        gridDim.z            blockIdx.z
        gridDim.y            blockIdx.y
        gridDim.x            blockIdx.x
        blockDim.z           threadIdx.z
        blockDim.y           threadIdx.y
        blockDim.x           threadIdx.x

        Pseudo code:
        position = 0
        for i in range(6):
            position *= dims[i]
            position += indexes[i]
    */
    c[idx] = a[idx] + b[idx];
}

void vector_add(const float* a, const float* b, float* c, int ndata){
    const int nthreads = 512;
    int block_size = ndata < nthreads ? ndata : nthreads;  // if ndata < nthreads, block_size = ndata is enough
    int grid_size = (ndata + block_size - 1) / block_size; // how many blocks are needed to cover all the work (ceiling division)
    printf("block_size = %d, grid_size = %d\n", block_size, grid_size);
    vector_add_kernel<<<grid_size, block_size, 0, nullptr>>>(a, b, c, ndata);

    // After the kernel launch, query the error code with cudaPeekAtLastError to know whether the launch failed.
    // Both cudaPeekAtLastError and cudaGetLastError return the error code.
    // cudaGetLastError returns the error code and clears it, so calling cudaGetLastError a second time returns cudaSuccess.
    // cudaPeekAtLastError returns the current error without clearing it, so a subsequent call to
    // cudaPeekAtLastError or cudaGetLastError still returns the same error.
    cudaError_t code = cudaPeekAtLastError();
    if(code != cudaSuccess){
        const char* err_name    = cudaGetErrorName(code);
        const char* err_message = cudaGetErrorString(code);
        printf("kernel error %s:%d  vector_add_kernel failed. \n  code = %s, message = %s\n", __FILE__, __LINE__, err_name, err_message);
    }
}
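
To see the difference between the two error-query calls described in the comments above, here is a minimal standalone sketch of my own (not from the original post). It assumes a device with the usual 1024-threads-per-block limit, deliberately launches with an invalid configuration, and then queries the error twice each way:

#include <cuda_runtime.h>
#include <stdio.h>

__global__ void noop_kernel(){ }

int main(){
    // 2048 threads per block exceeds the usual 1024-thread limit,
    // so the launch fails with cudaErrorInvalidConfiguration.
    noop_kernel<<<1, 2048>>>();

    // Peeking does not clear the error, so both peeks report it:
    printf("peek 1: %s\n", cudaGetErrorName(cudaPeekAtLastError()));
    printf("peek 2: %s\n", cudaGetErrorName(cudaPeekAtLastError()));

    // cudaGetLastError returns the error AND resets the state,
    // so the second call reports cudaSuccess:
    printf("get 1:  %s\n", cudaGetErrorName(cudaGetLastError()));
    printf("get 2:  %s\n", cudaGetErrorName(cudaGetLastError()));
    return 0;
}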

Two points to note

  1. This case only adds three numbers, so three threads are enough. Block sizes are typically 512 or 256, but when the array length is smaller than that, using the array length as the number of threads is sufficient; here block_size ends up being 3 (see the sketch after this list).

  2. If a thread's index is greater than or equal to the array length, the kernel returns immediately; otherwise it would read and write memory out of bounds.
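
To make the launch math in point 1 concrete, here is a small host-only sketch of my own (the ndata values are arbitrary) that prints the block_size and grid_size the launch logic above would choose:

#include <stdio.h>

int main(){
    const int nthreads = 512;
    int sizes[] = {3, 512, 513, 2000};
    for(int ndata : sizes){
        int block_size = ndata < nthreads ? ndata : nthreads;
        int grid_size  = (ndata + block_size - 1) / block_size;  // ceiling division
        // ndata=3    -> block_size=3,   grid_size=1
        // ndata=512  -> block_size=512, grid_size=1
        // ndata=513  -> block_size=512, grid_size=2 (1024 threads, 511 return early)
        // ndata=2000 -> block_size=512, grid_size=4 (2048 threads, 48 return early)
        printf("ndata=%d -> block_size=%d, grid_size=%d\n", ndata, block_size, grid_size);
    }
    return 0;
}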

Origin blog.csdn.net/bobchen1017/article/details/129490422