CUDA (1): Basic Device Management Functions and Vector Addition

I had looked at a little GPU acceleration code before, but then realized that writing good CUDA code demands a solid understanding of the hardware, the memory hierarchy, and parallel computing, so I timidly gave up and didn't keep going. Besides, my lab is almost entirely CV people and very few classmates do HPC, so in this area I'm a genuine beginner.

Even when I went to GTC China a while back I was still a beginner; the only CUDA I had ever used were the two routine calls, cudaMalloc and cudaMemcpy. Only when I later had to port an entire algorithm to the GPU did I finally have to study CUDA properly.

But this may well turn out to be a good thing for me. The internet industry has been cooling for the past two years and hot money is drying up; AI is not a cure-all (some even pronounce it dead), and the bubble really was too big and needed squeezing. Meanwhile universities everywhere are opening AI schools, so you can imagine the market four years from now: top-tier talent will still be in short supply, but people doing only basic AI algorithm work will be oversupplied. If all you can do is download code from GitHub, retrain on a different dataset, and tune a few parameters, your competitiveness won't be strong. And beyond people there are the machines: AI chips, integrated AI frameworks, AutoML, meta-learning, and a whole stack of software and hardware will make deep learning extremely easy; eventually you won't even get the chance to tune parameters. So while I'm young and happen to have a matching project, I want to learn some "hard" material with a higher barrier to entry; that's how you keep a core competitive edge, and having what others don't is genuinely difficult. With the New Year coming up, I also hope I can settle down and study properly. My current boss is great, and I feel very lucky.

Back to the topic. When setting up a deep learning environment, the very first step is always installing CUDA, and I'm sure plenty of people have been tortured by that install. But what exactly is CUDA?

Baidu Baike puts it this way:

CUDA (Compute Unified Device Architecture) is a computing platform from the GPU vendor NVIDIA. CUDA™ is a general-purpose parallel computing architecture introduced by NVIDIA that enables GPUs to solve complex computational problems. It comprises the CUDA instruction set architecture (ISA) and the parallel compute engine inside the GPU. Developers can write programs for the CUDA™ architecture in C, the most widely used high-level programming language, and the resulting programs run at very high performance on CUDA™-capable processors. Starting with CUDA 3.0, C++ and Fortran are also supported.

Put another way: CUDA is a tool that lets programmers tap into the GPU's computing power!

What is cuDNN?
NVIDIA cuDNN is a GPU-accelerated library for deep neural networks. It emphasizes performance, ease of use, and low memory overhead. cuDNN can be integrated into higher-level machine learning frameworks such as Google's TensorFlow or UC Berkeley's popular Caffe. Its simple drop-in design lets developers focus on designing and implementing neural network models rather than on hand-tuning performance, while still getting high-performance modern parallel computing on the GPU.

I benchmarked with and without cuDNN, and inference really is faster with cuDNN enabled, with very good results. That's why every deep learning framework asks you to download the cuDNN library; the setup is simple too, so why not use it?
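For completeness, here is a minimal sketch of what "using cuDNN" looks like at the lowest level (my own example, not from any framework; it assumes cudnn.h is on the include path and the binary links against cudnn). The library is handle-based, and everything starts with cudnnCreate:

#include <cudnn.h>
#include <iostream>

int main(){
	cudnnHandle_t handle;
	cudnnStatus_t status = cudnnCreate(&handle); // creates a cuDNN context on the current device
	if (status != CUDNN_STATUS_SUCCESS){
		std::cerr << "cudnnCreate failed: " << cudnnGetErrorString(status) << std::endl;
		return 1;
	}
	std::cout << "cuDNN version: " << cudnnGetVersion() << std::endl;
	cudnnDestroy(handle); // releases the context
	return 0;
}

If this compiles, links, and prints a version number, the cuDNN install is working.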

My first CUDA program. It implements vector addition in three ways, shown below: a single block with a single thread (vector_add_gpu_1), a single block where each thread strides forward by blockDim.x (vector_add_gpu_2), and multiple blocks with a grid-stride loop (vector_add_gpu_3).

#include <cuda_runtime_api.h>
#include <cstdlib>   // rand(), malloc(), free()
#include <iostream>

#define RANDOM(x) (rand() % x)
#define MAX 10

// single block, single thread: one thread serially adds all n elements
__global__ void vector_add_gpu_1(int *d_a, int *d_b, int *d_c, int n){
	for(int i = 0; i < n; i++){
		d_c[i] = d_a[i] + d_b[i];
	}
}

// single block, multiple threads: each thread strides by blockDim.x (block-stride loop)
__global__ void vector_add_gpu_2(int *d_a, int *d_b, int *d_c, int n){
	int tid = threadIdx.x;
	const int t_n = blockDim.x;
	while(tid < n){
		d_c[tid] = d_a[tid] + d_b[tid];
		tid+=t_n;
	}
}

// multiple blocks, multiple threads: grid-stride loop with stride gridDim.x * blockDim.x
__global__ void vector_add_gpu_3(int *d_a, int *d_b, int *d_c, int n){
	const int tidx = threadIdx.x;
	const int bidx = blockIdx.x;
	const int t_n = gridDim.x*blockDim.x;
	int tid = bidx*blockDim.x+tidx;

	while(tid<n){
		d_c[tid] = d_a[tid]+d_b[tid];
		tid += t_n;
	}
}



int main(){

	int count;
	cudaGetDeviceCount(&count); // number of CUDA-capable GPUs

	int gpuid = 0; // choose GPU 0
	cudaSetDevice(gpuid); // select the GPU to use by index (default is 0)

	cudaGetDevice(&gpuid); // index of the GPU used by the current host thread

	struct cudaDeviceProp device_prop;
	cudaGetDeviceProperties(&device_prop, gpuid);

	std::cout << device_prop.name << std::endl;
	std::cout << (device_prop.totalGlobalMem/1024/1024) << " MB " << std::endl;
	std::cout << (device_prop.sharedMemPerBlock/1024) << " KB " << std::endl;
	
	/*** vector addition ***/
	int n = 5;
	int *a = (int *)malloc(sizeof(int)*n);
	int *b = (int *)malloc(sizeof(int)*n);
	int *c = (int *)malloc(sizeof(int)*n);

	for (int i = 0; i < n; i++){
		a[i] = RANDOM(MAX);
		b[i] = RANDOM(MAX);
		std::cout << a[i] << "   " << b[i] << std::endl;
	}

    cudaError_t  cudaStatus;

	// GPU memory allocate
	int *d_a, *d_b, *d_c;
	cudaMalloc((void **)&d_a, sizeof(int)*n);
	cudaMalloc((void **)&d_b, sizeof(int)*n);
	cudaMalloc((void **)&d_c, sizeof(int)*n);

	// data a and b copy to GPU
	cudaStatus = cudaMemcpy(d_a, a, sizeof(int)*n, cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		std::cerr << "Memory copy failed! error: " << cudaGetErrorString(cudaStatus) << std::endl;
	}
	cudaStatus = cudaMemcpy(d_b, b, sizeof(int)*n, cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		std::cerr << "Memory copy failed! error: " << cudaGetErrorString(cudaStatus) << std::endl;
	}

	//vector_add_gpu_1<<<1, 1>>>(d_a, d_b, d_c, n);

	//vector_add_gpu_2<<<1, 12>>>(d_a, d_b, d_c, n);

	vector_add_gpu_3<<<4, 3>>>(d_a, d_b, d_c, n); // 4 blocks x 3 threads; the grid-stride loop covers any n

	// copy the result back to the CPU (this synchronous cudaMemcpy on the
	// default stream also waits for the kernel above to finish)
	cudaMemcpy(c, d_c, sizeof(int)*n, cudaMemcpyDeviceToHost);
	std::cout << "the result of add is: " << std::endl;

	for (int i = 0; i < n; i++){
		std::cout << " " << c[i];
	}
	std::cout << std::endl;

	// GPU memory free
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
	free(a);
	free(b);
	free(c);

	return 0;
}
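Assuming the listing is saved as vector_add.cu (the file name is mine), it builds with plain nvcc:

nvcc vector_add.cu -o vector_add
./vector_add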

Output:

GeForce GTX 970M
3024 MB 
48 KB 
3   6
7   5
3   5
6   2
9   1
the result of add is: 
 9 12 8 8 10
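One thing the listing above glosses over: only the two host-to-device copies are checked, and each check repeats the same if-block. A common CUDA idiom is to wrap every runtime call in a checking macro. Here is a sketch of my own (CUDA_CHECK is a hypothetical helper, not part of the CUDA API):

#include <cuda_runtime_api.h>
#include <cstdio>
#include <cstdlib>

// Hypothetical helper: check the error code returned by a CUDA runtime call
// and abort with file/line information on failure.
#define CUDA_CHECK(call)                                          \
    do {                                                          \
        cudaError_t err_ = (call);                                \
        if (err_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s at %s:%d\n",           \
                    cudaGetErrorString(err_), __FILE__, __LINE__);\
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

// usage:
// CUDA_CHECK(cudaMalloc((void **)&d_a, sizeof(int)*n));
// CUDA_CHECK(cudaMemcpy(d_a, a, sizeof(int)*n, cudaMemcpyHostToDevice));

Note that a kernel launch itself returns no status; the usual pattern is to call CUDA_CHECK(cudaGetLastError()) immediately after the <<<...>>> launch.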

For CUDA's device management functions, you can look up the corresponding fields in the struct below:

/**
 * CUDA device properties
 */
struct __device_builtin__ cudaDeviceProp
{
    char         name[256];                  /**< ASCII string identifying device */
    cudaUUID_t   uuid;                       /**< 16-byte unique identifier */
    char         luid[8];                    /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
    unsigned int luidDeviceNodeMask;         /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
    size_t       totalGlobalMem;             /**< Global memory available on device in bytes */
    size_t       sharedMemPerBlock;          /**< Shared memory available per block in bytes */
    int          regsPerBlock;               /**< 32-bit registers available per block */
    int          warpSize;                   /**< Warp size in threads */
    size_t       memPitch;                   /**< Maximum pitch in bytes allowed by memory copies */
    int          maxThreadsPerBlock;         /**< Maximum number of threads per block */
    int          maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
    int          maxGridSize[3];             /**< Maximum size of each dimension of a grid */
    int          clockRate;                  /**< Clock frequency in kilohertz */
    size_t       totalConstMem;              /**< Constant memory available on device in bytes */
    int          major;                      /**< Major compute capability */
    int          minor;                      /**< Minor compute capability */
    size_t       textureAlignment;           /**< Alignment requirement for textures */
    size_t       texturePitchAlignment;      /**< Pitch alignment requirement for texture references bound to pitched memory */
    int          deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
    int          multiProcessorCount;        /**< Number of multiprocessors on device */
    int          kernelExecTimeoutEnabled;   /**< Specified whether there is a run time limit on kernels */
    int          integrated;                 /**< Device is integrated as opposed to discrete */
    int          canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
    int          computeMode;                /**< Compute mode (See ::cudaComputeMode) */
    int          maxTexture1D;               /**< Maximum 1D texture size */
    int          maxTexture1DMipmap;         /**< Maximum 1D mipmapped texture size */
    int          maxTexture1DLinear;         /**< Maximum size for 1D textures bound to linear memory */
    int          maxTexture2D[2];            /**< Maximum 2D texture dimensions */
    int          maxTexture2DMipmap[2];      /**< Maximum 2D mipmapped texture dimensions */
    int          maxTexture2DLinear[3];      /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
    int          maxTexture2DGather[2];      /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
    int          maxTexture3D[3];            /**< Maximum 3D texture dimensions */
    int          maxTexture3DAlt[3];         /**< Maximum alternate 3D texture dimensions */
    int          maxTextureCubemap;          /**< Maximum Cubemap texture dimensions */
    int          maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
    int          maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
    int          maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
    int          maxSurface1D;               /**< Maximum 1D surface size */
    int          maxSurface2D[2];            /**< Maximum 2D surface dimensions */
    int          maxSurface3D[3];            /**< Maximum 3D surface dimensions */
    int          maxSurface1DLayered[2];     /**< Maximum 1D layered surface dimensions */
    int          maxSurface2DLayered[3];     /**< Maximum 2D layered surface dimensions */
    int          maxSurfaceCubemap;          /**< Maximum Cubemap surface dimensions */
    int          maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
    size_t       surfaceAlignment;           /**< Alignment requirements for surfaces */
    int          concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */
    int          ECCEnabled;                 /**< Device has ECC support enabled */
    int          pciBusID;                   /**< PCI bus ID of the device */
    int          pciDeviceID;                /**< PCI device ID of the device */
    int          pciDomainID;                /**< PCI domain ID of the device */
    int          tccDriver;                  /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
    int          asyncEngineCount;           /**< Number of asynchronous engines */
    int          unifiedAddressing;          /**< Device shares a unified address space with the host */
    int          memoryClockRate;            /**< Peak memory clock frequency in kilohertz */
    int          memoryBusWidth;             /**< Global memory bus width in bits */
    int          l2CacheSize;                /**< Size of L2 cache in bytes */
    int          maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
    int          streamPrioritiesSupported;  /**< Device supports stream priorities */
    int          globalL1CacheSupported;     /**< Device supports caching globals in L1 */
    int          localL1CacheSupported;      /**< Device supports caching locals in L1 */
    size_t       sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
    int          regsPerMultiprocessor;      /**< 32-bit registers available per multiprocessor */
    int          managedMemory;              /**< Device supports allocating managed memory on this system */
    int          isMultiGpuBoard;            /**< Device is on a multi-GPU board */
    int          multiGpuBoardGroupID;       /**< Unique identifier for a group of devices on the same multi-GPU board */
    int          hostNativeAtomicSupported;  /**< Link between the device and the host supports native atomic operations */
    int          singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
    int          pageableMemoryAccess;       /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
    int          concurrentManagedAccess;    /**< Device can coherently access managed memory concurrently with the CPU */
    int          computePreemptionSupported; /**< Device supports Compute Preemption */
    int          canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
    int          cooperativeLaunch;          /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
    int          cooperativeMultiDeviceLaunch; /**< Device can participate in cooperative kernels launched via ::cudaLaunchCooperativeKernelMultiDevice */
    size_t       sharedMemPerBlockOptin;     /**< Per device maximum shared memory per block usable by special opt in */
    int          pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
    int          directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
};
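Putting cudaGetDeviceCount and cudaGetDeviceProperties together, here is a minimal sketch of my own that enumerates every GPU in the machine and prints a few of the fields from this struct:

#include <cuda_runtime_api.h>
#include <iostream>

int main(){
	int count = 0;
	cudaGetDeviceCount(&count);
	for (int i = 0; i < count; i++){
		cudaDeviceProp prop;
		cudaGetDeviceProperties(&prop, i);
		std::cout << "GPU " << i << ": " << prop.name
		          << ", compute capability " << prop.major << "." << prop.minor
		          << ", " << (prop.totalGlobalMem / 1024 / 1024) << " MB global memory"
		          << ", " << prop.multiProcessorCount << " SMs"
		          << ", max " << prop.maxThreadsPerBlock << " threads/block" << std::endl;
	}
	return 0;
}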

I had long hesitated to start writing CUDA code; I suppose this counts as the first step /(ㄒoㄒ)/~~

Reference:
https://baike.baidu.com/item/CUDA


Reposted from blog.csdn.net/Felaim/article/details/104044451