一、概述

使用CUDA编程的目的：在普通的加速手段(SIMD指令、C++多线程、OpenMP等)无法满足实际需求时，使用CUDA对算法的运行进行加速，以满足系统的实时性要求。
举个例子：立体匹配算法、深度学习训练与测试、三维重建等。
硬件需求：可以在官网上查到支持CUDA的显卡，以及显卡的运算能力。

二、CUDA安装

安装过程比较简单，大致分为以下三步：

1、准备好VS和官网下载的CUDA安装程序(我用的是10.2，现在有更高的版本了，可以试一试)。在安装过程中，安装程序会自动检测本机是否已经安装了与之配套的某个VS版本，如果VS版本和CUDA版本不匹配，安装将无法进行。另外，如果电脑安装了360杀毒(最好直接关掉)，安装过程中会不断弹出疑似病毒修改的提示，需要全部允许操作，否则无法安装。
2、安装完成后可以打开命令窗口，输入path，查看一下是否有相应的环境变量，如下图所示。

如果没有，可以自己添加环境变量。一般都有，因为在安装的时候都是默认添加的。使用nvcc -V命令查看相应的CUDA安装信息。
3、打开VS可以发现多了一个NVIDIA选项，选择之后即可新建CUDA工程。
4、参考：https://blog.csdn.net/HaleyDong/article/details/86093520

三、简单结构描述

以下是Host(CPU)和Device(可以称为CUDA或GPU)的简单数据流过程。
Grid是最外面的一层，称为网格，一般是三维的，其中gridDim.x,gridDim.y, gridDim.z表示网格每个维度的大小。
Block表示网格中的一个线程块，一般是三维的，其中blockDim.x,blockDim.y,blockDim.z表示线程块每个维度的大小；blockIdx.x,blockIdx.y,blockIdx.z表示线程块在网格中的索引。
最内部的就是实际使用的线程了，同样是三维分布，其中threadIdx.x,threadIdx.y,threadIdx.z表示每个线程在线程块中x,y,z方向的索引值。
线程和线程块的具体分布如下图所示：
注意事项：不同线程块内的线程不能相互影响！它们是物理隔离的！同一线程块中的线程可以通过共享内存进行交互。先记住这两点，具体后续会详细讨论。

四、标准例子

使用VS新建一个CUDA工程后，会默认出来一个kernel.cu 文件，下面对该文件进行详细的注解：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);


// The __global__ qualifier tells the compiler that this function is compiled
// to run on the GPU rather than on the CPU.
//
// Each thread adds exactly one pair of elements: c[t] = a[t] + b[t], where t
// is the thread's x index inside its block. Only threadIdx.x is used and
// there is no bounds check, so the launch must use a single block whose
// thread count does not exceed the length of the three arrays.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    // threadIdx.x is this thread's index along the x direction of its block.
    const int idx = threadIdx.x;
    const int sum = a[idx] + b[idx];
    c[idx] = sum;
}

// Entry point of the vector-add example: adds two fixed five-element vectors
// on the GPU and prints the result.
// Returns 0 on success, 1 if the GPU addition or the device reset fails.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // addWithCuda is the host-side entry into the GPU computation; it reports
    // its outcome as a cudaError_t.
    if (addWithCuda(c, a, b, arraySize) != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset releases all device memory that was allocated and
    // resets the device state.
    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Computes c[i] = a[i] + b[i] for i in [0, size) on GPU 0.
//
// Parameters:
//   c    - host output array with room for `size` ints.
//   a, b - host input arrays holding `size` ints each.
//   size - number of elements. The kernel is launched as one block of `size`
//          threads, so a size above the device's threads-per-block limit
//          cannot be launched; the launch error is reported and returned.
//
// Returns cudaSuccess on success (including size == 0, where there is nothing
// to do), otherwise the first CUDA error encountered. Any device buffers that
// were allocated are released before returning.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus = cudaSuccess;
    const size_t bytes = (size_t)size * sizeof(int);

    // An empty input needs no GPU work. Without this guard the launch below
    // would request a block of zero threads, which is an invalid
    // configuration and would be reported as a failure.
    if (size == 0) {
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    // This selects the GPU with ID 0 for all following runtime calls.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed: %s. Do you have a CUDA-capable GPU installed?\n",
            cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    // Device buffer for the result vector c.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc for dev_c failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Device buffer for input vector a.
    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc for dev_a failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Device buffer for input vector b.
    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc for dev_b failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy of a to the device failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy of b to the device failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    // "<<<1, size>>>" is the execution configuration: one thread block
    // containing `size` threads. Block and grid are one-dimensional here, so
    // blockIdx.x is 0 and threadIdx.x ranges over [0, size); each thread
    // performs the addition for one pair of elements.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel. A launch has no return
    // value, so the most recent runtime error is fetched explicitly.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize blocks until the kernel has finished and returns
    // any error encountered while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize failed after launching addKernel: %s\n",
            cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy of c to the host failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

Error:
    // Release the device buffers on both the success and the failure path;
    // pointers that were never allocated are still null here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

以上注释已经十分详细，就不再解释其具体含义了，实际运行结果如下：
一个hello_world小程序：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Kernel for the hello-world example: every GPU thread that runs it prints
// one greeting line through device-side printf.
__global__ void hello_world(void)
{
    printf("GPU: Hello world!\n");
}
// Entry point of the hello-world example: prints a greeting from the CPU,
// then launches hello_world on one block of 10 threads so that each GPU
// thread prints its own greeting.
// Returns 0 on success and 1 if the launch, the kernel execution, or the
// device reset reports a CUDA error.
int main(int argc, char **argv)
{
    printf("CPU: Hello world!\n");

    hello_world<<<1, 10>>>();

    // A kernel launch returns nothing, so a rejected launch has to be
    // detected by querying the last runtime error.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "hello_world launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Wait for the kernel to finish. This surfaces errors raised while it
    // ran, and the device-side printf buffer is flushed at this
    // synchronization point so the GPU greetings actually appear.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "hello_world execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Release device resources and reset the device state before exiting.
    status = cudaDeviceReset();
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}

结果输出：

五、参考

谭升的博客：

1、异构计算与CUDA
2、CUDA编程概述一

那些年CUDA编程那些事（一）

详解CUDA的第一个例程

一、概述

二、CUDA安装

三、简单结构描述

四、标准例子

五、参考

六其他

猜你喜欢

那些年CUDA编程那些事（一）

详解CUDA的第一个例程

一、概述

二、CUDA安装

三、简单结构描述

四、标准例子

五、参考

六 其他

猜你喜欢

六其他