Those things about CUDA programming in those years (2)

I. Overview

1. The previous article gave a rough introduction to installing CUDA and to its overall structure, and demonstrated basic usage with a simple example. This article walks through the steps for implementing matrix operations in CUDA.
2. All source code comes from: https://github.com/Tony-Tan/CUDA_Freshman
3. The required freshman.h header file is as follows:

#ifndef FRESHMAN_H
#define FRESHMAN_H

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>

#ifdef _WIN32
#	include <windows.h>
#else
#	include <sys/time.h>
#endif
//Macro: check the return code of a CUDA runtime call and abort on failure
#define CHECK(call)\
{\
  const cudaError_t error=call;\
  if(error!=cudaSuccess)\
  {\
      printf("ERROR: %s:%d,",__FILE__,__LINE__);\
      printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\
      exit(1);\
  }\
}

#ifdef _WIN32
//Windows has no gettimeofday; emulate it with GetLocalTime
int gettimeofday(struct timeval *tp, void *tzp)
{
  time_t clock;
  struct tm tm;
  SYSTEMTIME wtm;
  GetLocalTime(&wtm);
  tm.tm_year  = wtm.wYear - 1900;
  tm.tm_mon   = wtm.wMonth - 1;
  tm.tm_mday  = wtm.wDay;
  tm.tm_hour  = wtm.wHour;
  tm.tm_min   = wtm.wMinute;
  tm.tm_sec   = wtm.wSecond;
  tm.tm_isdst = -1;
  clock = mktime(&tm);
  tp->tv_sec = clock;
  tp->tv_usec = wtm.wMilliseconds * 1000;
  return (0);
}
#endif

//Timing helper: current wall-clock time in seconds
double cpuSecond()
{
  struct timeval tp;
  gettimeofday(&tp,NULL);
  return((double)tp.tv_sec+(double)tp.tv_usec*1e-6);
}

//Fill an array with random floats
void initialData(float* ip,int size)
{
  //Seed with the current time, then draw from rand()
  //(see https://www.runoob.com/w3cnote/cpp-rand-srand.html)
  time_t t;
  srand((unsigned)time(&t));
  for(int i=0;i<size;i++)
  {
    ip[i]=(float)(rand()&0xffff)/1000.0f;
  }
}

//Fill an array with random integers
void initialData_int(int* ip, int size)
{
	time_t t;
	srand((unsigned)time(&t));
	for (int i = 0; i < size; i++)
	{
		ip[i] = int(rand() & 0xff);
	}
}

//Print an ny-by-nx matrix stored in row-major order
void printMatrix(float * C,const int nx,const int ny)
{
  float *ic=C;
  printf("Matrix<%d,%d>:\n",ny,nx);
  for(int i=0;i<ny;i++)
  {
    for(int j=0;j<nx;j++)
    {
      printf("%6f ",ic[j]);  //note: the original printed C[j], i.e. row 0 every time
    }
    ic+=nx;
    printf("\n");
  }
}

//Initialize the device
void initDevice(int devNum)
{
  int dev = devNum;

  //Print device information
  cudaDeviceProp deviceProp;
  CHECK(cudaGetDeviceProperties(&deviceProp,dev));
  printf("Using device %d: %s\n",dev,deviceProp.name);
  printf("Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
  printf("Maximum size of each dimension of a block: %d x %d x %d\n",deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
  printf("Maximum size of each dimension of a grid: %d x %d x %d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
  printf("Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
  printf("Total amount of global memory: %.2f GBytes (%llu bytes)\n",(float)deviceProp.totalGlobalMem / pow(1024.0, 3), deviceProp.totalGlobalMem);

  //Set the device ID
  CHECK(cudaSetDevice(dev));
}

//Compare the CPU and GPU results element by element
//(the original used long epsilon=1.0E-8, which truncates to 0, and compared
//integer-truncated differences; float comparison needs fabs and a double epsilon)
void checkResult(float * hostRef,float * gpuRef,const int N)
{
  double epsilon=1.0E-8;
  for(int i=0;i<N;i++)
  {
    if(fabs(hostRef[i]-gpuRef[i])>epsilon)
    {
      printf("Results don't match!\n");
      printf("%f(hostRef[%d]) != %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
      return;
    }
  }
  printf("Check result success!\n");
}
#endif//FRESHMAN_H

II. One-Dimensional Array Operations

  1. Initialize CUDA and select a device by ID; passing 0 means the GPU with ID 0 executes the algorithm (on a multi-GPU machine you can query the device count first; see the sketch just below).
	cudaSetDevice(0);
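(An aside, not in the original code: on a multi-GPU machine you can first query how many devices are present and then pick any valid ID. A minimal sketch using the runtime's cudaGetDeviceCount:)
	int deviceCount = 0;
	CHECK(cudaGetDeviceCount(&deviceCount));
	printf("Found %d CUDA device(s)\n", deviceCount);
	cudaSetDevice(0);  //any ID in [0, deviceCount) is valid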
  2. Allocate device memory for the kernel's variables, mainly pre-allocating buffers for the inputs and the output.
	//Size of the vectors to compute: 2^14
	int nElem = 1 << 14;
	printf("Vector size:%d\n", nElem);
	
	//Number of bytes of data
	int nByte = sizeof(float)*nElem;
	
	//The _d suffix marks device (GPU) pointers; a and b are inputs, res is the output
	float *a_d, *b_d, *res_d;
	cudaMalloc((float**)&a_d, nByte);
	cudaMalloc((float**)&b_d, nByte);
	cudaMalloc((float**)&res_d, nByte);
  3. Copy the data to be computed from the CPU to the GPU with the cudaMemcpy function.
	//Copy the data from the CPU to the GPU
	cudaMemcpy(a_d, a_h, nByte, cudaMemcpyHostToDevice);
	cudaMemcpy(b_d, b_h, nByte, cudaMemcpyHostToDevice);
  4. Allocate thread blocks and threads so that each thread adds one pair of elements (see the sketch after this step for the general case where the element count is not a multiple of the block size).
	// gridsize is the number of blocks; blocksize is the number of threads per block
	//(at most 1024 threads per block on current devices)
	//Define a one-dimensional thread block
	dim3 blocksize(1024);
	//Define a one-dimensional grid
	dim3 gridsize(nElem / blocksize.x);
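(Note, as an aside: gridsize = nElem / blocksize.x covers every element here only because 2^14 is an exact multiple of 1024. A minimal sketch of the general pattern, assuming a hypothetical extra length parameter n is added to the kernel, which the original version does not take:)
	//Round up so the last, possibly partial, block is still launched
	dim3 gridsize((nElem + blocksize.x - 1) / blocksize.x);

__global__ void sumArraysGPU_guarded(float* a, float* b, float* res, int n)
{
	int i = blockIdx.x*blockDim.x + threadIdx.x;
	if (i < n)  //surplus threads in the last block simply do nothing
		res[i] = a[i] + b[i];
}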
  5. Launch the kernel so that each CUDA thread performs one addition, making the operation parallel. Note: the launch is asynchronous; CUDA returns control to the CPU immediately, so you must wait for all threads to finish before performing subsequent operations.
	//"<<< >>>" is the execution-configuration syntax; here it launches gridsize
	//thread blocks with blocksize threads each
	sumArraysGPU<<<gridsize, blocksize>>>(a_d, b_d, res_d);  //Launch the kernel
	printf("Execution configuration<<<%d,%d>>>\n", gridsize.x, blocksize.x);
	
	//Wait for all threads to finish
	cudaDeviceSynchronize();
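(Another aside, not part of the original code: a kernel launch itself returns no error code, so a common pattern is to query the launch status explicitly with cudaGetLastError, here wrapped in the CHECK macro from freshman.h:)
	CHECK(cudaGetLastError());       //catches invalid launch configurations
	CHECK(cudaDeviceSynchronize());  //catches errors raised while the kernel ran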
  6. Copy the result of the kernel execution back to the CPU to complete the calculation.
	//Copy the data from the GPU back to the CPU
	cudaMemcpy(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost);
  7. After execution, free the device memory and reset the GPU.
	cudaFree(a_d);
	cudaFree(b_d);
	cudaFree(res_d);
	cudaDeviceReset();
  8. The complete code is as follows:
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"

//Plain CPU version of the summation, with the loop manually unrolled by four
//(this assumes size is a multiple of 4)
void sumArrays(float * a, float * b, float * res, const int size)
{
	for (int i = 0; i < size; i += 4)
	{
		res[i] = a[i] + b[i];
		res[i + 1] = a[i + 1] + b[i + 1];
		res[i + 2] = a[i + 2] + b[i + 2];
		res[i + 3] = a[i + 3] + b[i + 3];
	}
}

//GPU kernel for array summation
__global__ void sumArraysGPU(float*a, float*b, float*res)
{
	//Each thread handles the element at its global index:
	//block index * threads per block + thread index within the block
	int i = blockIdx.x*blockDim.x + threadIdx.x;
	res[i] = a[i] + b[i];
}


int main(int argc, char **argv)
{
	// Choose which GPU to run on; change this on a multi-GPU system.
	cudaSetDevice(0);
	
	//Size of the vectors to compute: 2^14
	int nElem = 1 << 14;
	printf("Vector size:%d\n", nElem);

	//Allocate host memory for the inputs and outputs; the _h suffix marks host (CPU) pointers
	int nByte = sizeof(float)*nElem;
	float *a_h = (float*)malloc(nByte);
	float *b_h = (float*)malloc(nByte);
	float *res_h = (float*)malloc(nByte);
	float *res_from_gpu_h = (float*)malloc(nByte);

	//Zero the result buffers (optional)
	memset(res_h, 0, nByte);
	memset(res_from_gpu_h, 0, nByte);

	//Allocate GPU memory, with each call checked through the CHECK macro
	//(for macros vs. inline functions see https://blog.csdn.net/qq_37934101/article/details/81266548)
	float *a_d, *b_d, *res_d;
	CHECK(cudaMalloc((float**)&a_d, nByte));
	CHECK(cudaMalloc((float**)&b_d, nByte));
	CHECK(cudaMalloc((float**)&res_d, nByte));

	//Generate random input data
	initialData(a_h, nElem);
	initialData(b_h, nElem);

	//Copy the data from the CPU to the GPU
	CHECK(cudaMemcpy(a_d, a_h, nByte, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(b_d, b_h, nByte, cudaMemcpyHostToDevice));

	// gridsize is the number of blocks; blocksize is the number of threads per block
	//(at most 1024 threads per block on current devices)
	//Define a one-dimensional thread block
	dim3 blocksize(1024);
	//Define a one-dimensional grid
	dim3 gridsize(nElem / blocksize.x);

	//"<<< >>>" is the execution-configuration syntax; here it launches gridsize
	//thread blocks with blocksize threads each
	sumArraysGPU<<<gridsize, blocksize>>>(a_d, b_d, res_d);  //Launch the kernel
	printf("Execution configuration<<<%d,%d>>>\n", gridsize.x, blocksize.x);

	CHECK(cudaDeviceSynchronize());  //Wait for all threads to finish

	//Copy the results from the GPU back to the CPU
	CHECK(cudaMemcpy(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost));
	sumArrays(a_h, b_h, res_h, nElem);  //Run the CPU version of the summation

	//Check that the two versions agree
	checkResult(res_h, res_from_gpu_h, nElem);

	//Free GPU memory
	cudaFree(a_d);
	cudaFree(b_d);
	cudaFree(res_d);

	//Free CPU memory
	free(a_h);
	free(b_h);
	free(res_h);
	free(res_from_gpu_h);

	CHECK(cudaDeviceReset());

	return 0;
}
  9. Result of running the program (output screenshot in the original post).

III. Two-Dimensional Matrix Operations

  1. The two-dimensional case is very similar to the one-dimensional one; the only difference lies in how thread blocks and threads are allocated. Only the key parts are emphasized here; memory allocation, copying, and release are the same as above. First, define the dimensions of the thread block:
	//Set the block dimensions in x and y, both 32
	int dimx = 32;
	int dimy = 32;
	
	//2D grid and 2D blocks (other combinations such as a 2D grid with 1D blocks
	//or a 1D grid with 1D blocks also work; see the complete code below)
	dim3 blockSize_0(dimx, dimy);
	//Rounding up ensures a final partial block is still launched when the
	//matrix size is not a multiple of the block dimension
	dim3 gridSize_0((nx - 1) / blockSize_0.x + 1, (ny - 1) / blockSize_0.y + 1);
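A quick worked example of the rounding: with nx = ny = 4096 and 32×32 blocks, gridSize_0 comes out to ((4096-1)/32+1, (4096-1)/32+1) = (128, 128). Had nx been 100 instead, (100-1)/32+1 = 4 blocks would supply 128 threads in x, and the 28 surplus threads are masked off by the bounds check inside the kernel.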
  2. Start the timer and launch the kernel:
	//Start timing
	iStart = cpuSecond();
	sumMatrix<<<gridSize_0, blockSize_0>>>(A_dev, B_dev, C_dev, nx, ny);
	cudaDeviceSynchronize();  //Wait for all threads to finish
	//Stop timing
	iElaps = cpuSecond() - iStart;
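(An aside, not in the original code: instead of cpuSecond, the CUDA runtime also offers event-based timing measured on the GPU itself. A minimal sketch:)
	cudaEvent_t start, stop;
	float ms = 0.0f;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));
	CHECK(cudaEventRecord(start));
	sumMatrix<<<gridSize_0, blockSize_0>>>(A_dev, B_dev, C_dev, nx, ny);
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));              //block until the stop event completes
	CHECK(cudaEventElapsedTime(&ms, start, stop));  //elapsed time in milliseconds
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));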
  3. The CUDA kernel that performs the matrix summation. Launching the kernel makes a large batch of threads execute this function at roughly (though not exactly) the same time:
__global__ void sumMatrix(float * MatA, float * MatB, float * MatC, int nx, int ny)
{
	//x coordinate of the thread within the grid
	int ix = threadIdx.x + blockDim.x*blockIdx.x;
	//y coordinate of the thread within the grid
	int iy = threadIdx.y + blockDim.y*blockIdx.y;

	//Row-major linear index of the thread within the grid, matching the index
	//of a matrix element (the original used iy*ny, which works here only because nx == ny)
	int idx = ix + iy*nx;
	if (ix < nx && iy < ny)
	{
		MatC[idx] = MatA[idx] + MatB[idx];
	}
}
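A concrete example of the index math: with 32×32 blocks, the thread at threadIdx = (3, 5) in block blockIdx = (2, 1) gets ix = 3 + 32*2 = 67 and iy = 5 + 32*1 = 37, so with nx = 4096 it handles idx = 67 + 37*4096 = 151619, i.e. row 37, column 67 of the matrix.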
  4. The complete code is as follows:
#include "freshman.h"

//CPU version of the summation, used to verify that the GPU result is correct
void sumMatrix2D_CPU(float * MatA, float * MatB, float * MatC, int nx, int ny)
{
	float * a = MatA;
	float * b = MatB;
	float * c = MatC;
	for (int j = 0; j < ny; j++)
	{
		for (int i = 0; i < nx; i++)
		{
			c[i] = a[i] + b[i];
		}
		c += nx;
		b += nx;
		a += nx;
	}
}

//GPU kernel for matrix summation; launching it makes a large batch of threads
//execute this function at roughly (though not exactly) the same time
__global__ void sumMatrix(float * MatA, float * MatB, float * MatC, int nx, int ny)
{
	//x coordinate of the thread within the grid
	int ix = threadIdx.x + blockDim.x*blockIdx.x;
	//y coordinate of the thread within the grid
	int iy = threadIdx.y + blockDim.y*blockIdx.y;

	//Row-major linear index of the thread within the grid, matching the index
	//of a matrix element (the original used iy*ny, which works here only because nx == ny)
	int idx = ix + iy*nx;
	if (ix < nx && iy < ny)
	{
		MatC[idx] = MatA[idx] + MatB[idx];
	}
}

int main(int argc, char** argv)
{
	printf("starting...\n");
	initDevice(0);

	//Set the matrix size and byte count
	int nx = 1 << 12;
	int ny = 1 << 12;
	int nxy = nx*ny;
	int nBytes = nxy * sizeof(float);

	//Allocate host (CPU) memory and initialize the data
	float* A_host = (float*)malloc(nBytes);
	float* B_host = (float*)malloc(nBytes);
	float* C_host = (float*)malloc(nBytes);
	float* C_from_gpu = (float*)malloc(nBytes);
	initialData(A_host, nxy);
	initialData(B_host, nxy);

	//Allocate device (GPU) memory
	float *A_dev = NULL;
	float *B_dev = NULL;
	float *C_dev = NULL;
	CHECK(cudaMalloc((void**)&A_dev, nBytes));
	CHECK(cudaMalloc((void**)&B_dev, nBytes));
	CHECK(cudaMalloc((void**)&C_dev, nBytes));

	//Copy the data to the GPU
	CHECK(cudaMemcpy(A_dev, A_host, nBytes, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(B_dev, B_host, nBytes, cudaMemcpyHostToDevice));


	//Set the block dimensions in x and y, both 32
	int dimx = 32;
	int dimy = 32;

	//CPU compute (no device-to-host copy is needed here; it happens after each kernel below)
	double iStart = cpuSecond();
	sumMatrix2D_CPU(A_host, B_host, C_host, nx, ny);
	double iElaps = cpuSecond() - iStart;
	printf("CPU Execution Time elapsed %f sec\n", iElaps);

	//2D grid and 2D blocks
	dim3 blockSize_0(dimx, dimy);
	//Round up so a final partial block is still launched
	dim3 gridSize_0((nx - 1) / blockSize_0.x + 1, (ny - 1) / blockSize_0.y + 1);
	iStart = cpuSecond();
	sumMatrix<<<gridSize_0, blockSize_0>>>(A_dev, B_dev, C_dev, nx, ny);
	CHECK(cudaDeviceSynchronize());  //Wait for all threads to finish

	iElaps = cpuSecond() - iStart;
	printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
		gridSize_0.x, gridSize_0.y, blockSize_0.x, blockSize_0.y, iElaps);
	CHECK(cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost));
	checkResult(C_host, C_from_gpu, nxy);

	//1D grid and 1D blocks: treat the matrix as one long array of nx*ny elements
	dimx = 32;
	dim3 blockSize_1(dimx);
	dim3 gridSize_1((nxy - 1) / blockSize_1.x + 1);
	iStart = cpuSecond();
	sumMatrix<<<gridSize_1, blockSize_1>>>(A_dev, B_dev, C_dev, nx*ny, 1);
	CHECK(cudaDeviceSynchronize());  //Wait for all threads to finish

	iElaps = cpuSecond() - iStart;
	printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
		gridSize_1.x, gridSize_1.y, blockSize_1.x, blockSize_1.y, iElaps);
	CHECK(cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost));
	checkResult(C_host, C_from_gpu, nxy);

	//2D grid and 1D blocks: each block covers 32 consecutive elements of one row
	dimx = 32;
	dim3 blockSize_2(dimx);
	dim3 gridSize_2((nx - 1) / blockSize_2.x + 1, ny);

	//Start timing
	iStart = cpuSecond();
	sumMatrix<<<gridSize_2, blockSize_2>>>(A_dev, B_dev, C_dev, nx, ny);
	CHECK(cudaDeviceSynchronize());  //Wait for all threads to finish

	//Stop timing once all threads have finished
	iElaps = cpuSecond() - iStart;
	printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
		gridSize_2.x, gridSize_2.y, blockSize_2.x, blockSize_2.y, iElaps);
	CHECK(cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost));
	checkResult(C_host, C_from_gpu, nxy);

	//Free memory
	cudaFree(A_dev);
	cudaFree(B_dev);
	cudaFree(C_dev);
	free(A_host);
	free(B_host);
	free(C_host);
	free(C_from_gpu);
	cudaDeviceReset();

	return 0;
}
  5. Result of running the program (output screenshot in the original post).

IV. Summary

1. Careful readers will notice that a CUDA program spends time copying data between the host and the device, and this step is actually quite time-consuming.
2. In the examples above, the number of threads launched equals the number of matrix elements to compute. How to handle the case where the allocated threads cannot cover the whole matrix, so that a single thread must process multiple elements, will be covered in a follow-up article (a preview sketch follows below).
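As a preview of that follow-up (a sketch of my own using the standard grid-stride-loop pattern, not the original author's code), a kernel can let each thread process several elements by stepping through the array in strides of the total thread count:

__global__ void sumArraysGridStride(float* a, float* b, float* res, int n)
{
	//Each thread starts at its global index and jumps forward by the total
	//number of launched threads until it runs off the end of the array
	int stride = blockDim.x * gridDim.x;
	for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride)
		res[i] = a[i] + b[i];
}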

V. Others

An example of printing thread indices:

#include "freshman.h"

//Kernel that prints each thread's indices and the matrix element it maps to
__global__ void printThreadIndex(float *A, const int nx, const int ny)
{
	int ix = threadIdx.x + blockIdx.x*blockDim.x;
	int iy = threadIdx.y + blockIdx.y*blockDim.y;
	unsigned int idx = iy*nx + ix;
	printf("thread_id(%d,%d) block_id(%d,%d) coordinate(%d,%d) "
		"global index %2u ival %.2f\n", threadIdx.x, threadIdx.y,
		blockIdx.x, blockIdx.y, ix, iy, idx, A[idx]);
}

int main(int argc, char** argv)
{
	initDevice(0);

	//Set the matrix size and byte count
	int nx = 8, ny = 6;
	int nxy = nx*ny;
	int nBytes = nxy * sizeof(float);

	//Allocate host (CPU) memory, initialize the data, and print the matrix
	float* A_host = (float*)malloc(nBytes);
	initialData(A_host, nxy);
	printMatrix(A_host, nx, ny);

	//Allocate device (GPU) memory
	float *A_dev = NULL;
	CHECK(cudaMalloc((void**)&A_dev, nBytes));

	//Copy the data to the GPU
	cudaMemcpy(A_dev, A_host, nBytes, cudaMemcpyHostToDevice);

	// gridsize is the number of blocks; blocksize is the number of threads per block
	//(at most 1024 threads per block on current devices)
	dim3 blocksize(4, 2);
	dim3 gridsize((nx - 1) / blocksize.x + 1, (ny - 1) / blocksize.y + 1);

	printThreadIndex<<<gridsize, blocksize>>>(A_dev, nx, ny);

	//Wait for all threads to finish
	CHECK(cudaDeviceSynchronize());

	//Free memory
	cudaFree(A_dev);
	free(A_host);

	//Reset the device
	cudaDeviceReset();
	return 0;
}

Origin: https://blog.csdn.net/qq_38589460/article/details/120235546