Deep learning deployment (11): CUDA Runtime API kernel functions

1. Kernel function

Kernel functions are the key to CUDA programming.

CUDA C++ code lives in .cu files; passing a .cu file to nvcc tells the compiler to recognize the CUDA syntax.

__global__ marks a kernel function, which is launched from the host and runs on the device. __device__ marks a device function, which is called from device code.

__host__ marks a host function, which is called from host code. __shared__ marks a variable as residing in shared memory.

The host calls the kernel function: function<<<gridDim, blockDim, sharedMemorySize, stream>>>(args…);

gridDim and blockDim tell the launch how many blocks and threads to start; stream selects the CUDA stream the kernel is enqueued on.

Only functions qualified with __global__ can be launched with the <<<>>> syntax.

Kernel arguments are passed by value, never by reference; classes, structs, and so on are copied to the device. A kernel can be a template, and its return type must be void.

Kernel execution is asynchronous: the launch call returns to the host immediately.

The thread layout is described by gridDim and blockDim.

Inside a kernel, the thread index is obtained from the built-in variables threadIdx, blockIdx, blockDim, and gridDim.
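
Putting these points together, here is a minimal sketch (the kernel name fill_kernel, its parameters, and the launcher are illustrative, not from the original code): a templated kernel whose arguments are passed by value, whose return type is void, and which computes its global index from the built-in variables.

#include <cuda_runtime.h>

// A templated kernel: arguments arrive by value and the return type must be void
template<typename T>
__global__ void fill_kernel(T* pdata, int ndata, T value){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < ndata)   // guard threads in the last, partially filled block
        pdata[idx] = value;
}

void fill(float* pdata_device, int ndata){
    int blockDim = 256;
    int gridDim  = (ndata + blockDim - 1) / blockDim;   // round up
    // <<<gridDim, blockDim, sharedMemorySize, stream>>>
    fill_kernel<float><<<gridDim, blockDim, 0, nullptr>>>(pdata_device, ndata, 1.0f);
}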

2. main.cpp file

#include <cuda_runtime.h>
#include <stdio.h>

#define checkRuntime(op)  __check_cuda_runtime((op), #op, __FILE__, __LINE__)

bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
    if(code != cudaSuccess){
        // Translate the error code into its name and description
        const char* err_name = cudaGetErrorName(code);
        const char* err_message = cudaGetErrorString(code);
        printf("runtime error %s:%d  %s failed. \n  code = %s, message = %s\n", file, line, op, err_name, err_message);
        return false;
    }
    return true;
}

void test_print(const float* pdata, int ndata);

int main(){

    // Define the device pointer and the host pointer
    float* parray_host = nullptr;
    float* parray_device = nullptr;
    int narray = 10;
    int array_bytes = sizeof(float) * narray;

    // Allocate GPU memory; the pointer returned by cudaMalloc points to the GPU
    checkRuntime(cudaMalloc(&parray_device, array_bytes));

    // Allocate host memory
    parray_host = new float[narray];

    // Put 10 numbers into host memory
    for (int i = 0; i < narray; i++){
        parray_host[i] = i;
    }

    // Copy the host memory to the device
    checkRuntime(cudaMemcpy(parray_device, parray_host, array_bytes, cudaMemcpyHostToDevice));

    // Print the data that now lives on the GPU
    test_print(parray_device, narray);

    checkRuntime(cudaDeviceSynchronize());

    // Free device memory, free host memory
    checkRuntime(cudaFree(parray_device));
    delete[] parray_host;
    return 0;
}

This code first declares two pointers, parray_host and parray_device, and an integer narray that are used throughout the program.

Then it uses the cudaMalloc function to allocate space for narray floats on the device for parray_device, which will hold the data on the device side. Afterwards it allocates the same amount of space on the host for parray_host, which will hold the input data.

Next, a for loop assigns the input values on the host side to parray_host one by one.

Then the cudaMemcpy function copies parray_host on the host into the parray_device buffer on the device; this call is synchronous and blocks the host until the copy finishes.

Then it calls test_print, passing parray_device and narray as arguments; test_print prints the contents of the device-side data.

Finally, cudaDeviceSynchronize makes the host wait for all outstanding device work to finish, preventing the program from exiting before the kernel has produced its output. The device and host allocations are then released to avoid memory leaks, and the program ends.
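
For contrast with the synchronous cudaMemcpy above, a copy can also be enqueued asynchronously on a stream. A minimal sketch, assuming pinned (page-locked) host memory, which cudaMemcpyAsync needs to overlap the transfer with host work; the function and variable names are illustrative:

#include <cuda_runtime.h>

void async_copy_example(float* parray_device, int array_bytes){
    // Pinned host memory lets the DMA engine overlap the copy with host work
    float* parray_pinned = nullptr;
    cudaMallocHost(&parray_pinned, array_bytes);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Enqueue the copy on the stream; the call returns immediately
    cudaMemcpyAsync(parray_device, parray_pinned, array_bytes, cudaMemcpyHostToDevice, stream);

    // Wait for all work in the stream to finish before reusing the buffers
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFreeHost(parray_pinned);
}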

3. kernel.cu file

#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

// Standard functions such as exp and printf are already provided for the NVIDIA GPU
__device__ __host__ float sigmoid(float x){
    return 1 / (1 + exp(-x));
}

__global__ void test_print_kernel(const float* pdata, int ndata){

    // Built-in variables give each thread its global index
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    /*    dims                 indexs
        gridDim.z    1        blockIdx.z      0
        gridDim.y    1        blockIdx.y      0
        gridDim.x    1        blockIdx.x      0
        blockDim.z   1        threadIdx.z     0
        blockDim.y   1        threadIdx.y     0
        blockDim.x   10       threadIdx.x     0-9

        Pseudo code:
        position = 0
        for i in 6:
            position *= dims[i]
            position += indexs[i]
    */
    // sigmoid is __device__ __host__, so it can be called from device code too
    float y = sigmoid(0.5f);
    printf("Element[%d] = %f, threadIdx.x=%d, blockIdx.x=%d, blockDim.x=%d\n", idx, pdata[idx], threadIdx.x, blockIdx.x, blockDim.x);
}

// Writing __host__ here or not makes no difference: this is an ordinary host function
__host__ void test_print(const float* pdata, int ndata){

    // sigmoid is __device__ __host__, so it can be called from host code as well
    float y = sigmoid(0.5f);
    // <<<gridDim, blockDim, bytes_of_shared_memory, stream>>>
    dim3 gridDim;
    dim3 blockDim;
    // Total number of threads = product of all six dimensions
    int nthreads = gridDim.x * gridDim.y * gridDim.z * blockDim.x * blockDim.y * blockDim.z;

    // Why not simply write nthreads = 10?
    // Because in practice we encounter multi-dimensional problems.
    // These are the hardware limits of the two dims:
    // gridDim(~2.1 billion, 65535, 65535)
    // blockDim(1024, 1024, 64), with blockDim.x * blockDim.y * blockDim.z <= 1024

    // nullptr here means the default stream; pass an explicit stream for asynchronous operation
    test_print_kernel<<<dim3(1), dim3(ndata), 0, nullptr>>>(pdata, ndata);

    // test_print_kernel<<<1, ndata, 0, nullptr>>>(pdata, ndata);

    // After the kernel launch, use cudaPeekAtLastError to fetch the error code and see whether the launch failed
    // Both cudaPeekAtLastError and cudaGetLastError return the last error code
    // cudaGetLastError returns the error and clears it, so calling cudaGetLastError again returns success
    // cudaPeekAtLastError returns the current error without clearing it, so the next cudaPeekAtLastError or cudaGetLastError still sees the same error
    // CUDA errors are sticky: if an error occurs here and is not cleared, every subsequent API call will return this error and fail
    cudaError_t code = cudaPeekAtLastError();
    if(code != cudaSuccess){
        const char* err_name    = cudaGetErrorName(code);
        const char* err_message = cudaGetErrorString(code);
        printf("kernel error %s:%d  test_print_kernel failed. \n  code = %s, message = %s\n", __FILE__, __LINE__, err_name, err_message);
    }
}
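
Following the comments above, the launch check can be wrapped in a macro in the same spirit as checkRuntime from main.cpp. This checkKernel macro is a sketch of my own, not part of the original code; it assumes the same stdio.h and cuda_runtime.h includes:

// Wrap a kernel launch to report launch errors with file/line context
#define checkKernel(...)                                                        \
    do{                                                                         \
        __VA_ARGS__;                                                            \
        cudaError_t code = cudaPeekAtLastError();                               \
        if(code != cudaSuccess){                                                \
            printf("kernel error %s:%d  code = %s, message = %s\n",             \
                   __FILE__, __LINE__, cudaGetErrorName(code), cudaGetErrorString(code)); \
        }                                                                       \
    }while(0)

// Usage:
// checkKernel(test_print_kernel<<<dim3(1), dim3(ndata), 0, nullptr>>>(pdata, ndata));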

The nvcc compiler supports all of the features of C++ and adds the CUDA C++ extensions on top.

__device__ lets the device call a function; __host__ lets the host call it.

If you want a function to be callable from both the host and the device, prefix it with both __device__ and __host__.

Why does this place not simply write nthreads = 10? Because in practice we encounter multi-dimensional problems, and the gridDim/blockDim layout generalizes to them.

The nullptr in the launch configuration is the default stream; to overlap work asynchronously, pass an explicit stream.

test_print_kernel<<<dim3(1), dim3(20), 0, nullptr>>>(pdata, ndata);

dim3 is a type provided by CUDA to represent a three-dimensional vector, and it has three member variables x, y, and z, which respectively represent the three components of the vector.

dim3(1) means there is only one thread block, that is, there is only one thread block in the x direction; dim3(20) means that there are 20 threads in each thread block, that is, there are 20 threads in the x direction.

Therefore, this kernel function has a total of 1 * 20 = 20 threads.
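
When the problem itself is two-dimensional, such as an image, dim3 makes the layout natural. A minimal sketch of my own (the kernel, its parameters, and the doubling operation are illustrative):

__global__ void image_kernel(float* pixels, int width, int height){
    // Map the 2D block/thread layout to 2D pixel coordinates
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= width || y >= height) return;   // guard the ragged edges
    pixels[y * width + x] *= 2.0f;
}

void launch(float* pixels_device, int width, int height){
    dim3 blockDim(32, 32);   // 32 * 32 = 1024 threads, the per-block maximum
    dim3 gridDim((width  + blockDim.x - 1) / blockDim.x,
                 (height + blockDim.y - 1) / blockDim.y);
    image_kernel<<<gridDim, blockDim, 0, nullptr>>>(pixels_device, width, height);
}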

Origin blog.csdn.net/bobchen1017/article/details/129461891