GPU programming thread configuration

1. Thread composition structure

From top to bottom, the thread hierarchy is organized as: Grid → Block → Warp → Thread.

Grid/Block/Warp/Thread are all software organizational structures, not hardware ones, so in principle we can arrange threads in any number of dimensions (one-, two-, or three-dimensional). On the hardware side there are only streaming processors (SPs), which have no notion of dimensionality; dimensionality is an abstraction introduced by the software.

Warp is the third level in the structure, with 32 threads as a group.

Insert picture description here

2. Determine the configuration of your graphics card

Before programming, it should be noted that the number of threads and blocks supported by the graphics card is limited. Therefore, first confirm the configuration of the graphics card. Otherwise, once the program exceeds the hardware limit of the graphics card, it will stop working.

CUDA routines for printing graphics card configuration:

/***
 *打印显卡属性
 *2017-11-8
 * ***/

#include<iostream>
#include <cuda_runtime.h>
#include<device_launch_parameters.h>

using namespace std;

int main()
{
	// Number of CUDA-capable devices present on this system.
	int deviceCount = 0;

	cudaError_t err = cudaGetDeviceCount(&deviceCount);	// Returns in *deviceCount the number of devices
	if (err != cudaSuccess)
	{
		cout << "cudaGetDeviceCount failed: " << cudaGetErrorString(err) << "\n";
		exit(EXIT_FAILURE);
	}
	cout << "deviceCount:   " << deviceCount << "\n\n";
	if (deviceCount == 0)
	{
		cout << "error: no devices supporting CUDA.\n";
		exit(EXIT_FAILURE);
	}

	int dev = 0;
	cudaSetDevice(dev);	// Sets device 0 as the current device for the calling host thread.

	cudaDeviceProp devProps;
	err = cudaGetDeviceProperties(&devProps, dev);
	if (err != cudaSuccess)
	{
		cout << "cudaGetDeviceProperties failed: " << cudaGetErrorString(err) << "\n";
		exit(EXIT_FAILURE);
	}

	// Helper: print an int-array property element by element. Several
	// cudaDeviceProp fields are int[2]/int[3]; streaming the bare field
	// name would decay it to a pointer and print an address, not the values.
	auto printDims = [](const char* label, const int* v, int n)
	{
		cout << label << ": ";
		for (int i = 0; i < n; ++i)
		{
			if (i) cout << " x ";
			cout << v[i];
		}
		cout << "\n";
	};

	cout << "name: " << devProps.name << "\n";
	cout << "totalGlobalMem: " << devProps.totalGlobalMem << "\n";
	cout << "regsPerBlock: " << devProps.regsPerBlock << "\n";
	cout << "warpSize: " << devProps.warpSize << "\n";
	cout << "memPitch: " << devProps.memPitch << "\n\n";

	cout << "一个线程块中可使用的最大共享内存\n";
	cout << "devProps.sharedMemPerBlock: " << devProps.sharedMemPerBlock << "\n\n";

	cout << "一个线程块中可包含的最大线程数量\n";
	cout << "maxThreadsPerBlock: " << devProps.maxThreadsPerBlock << "\n\n";

	cout << "多维线程块数组中每一维可包含的最大线程数量\n";
	cout << "maxThreadsDim[0]: " << devProps.maxThreadsDim[0] << "\n";
	cout << "maxThreadsDim[1]: " << devProps.maxThreadsDim[1] << "\n";
	cout << "maxThreadsDim[2]: " << devProps.maxThreadsDim[2] << "\n\n";

	cout << "一个线程格中每一维可包含的最大线程块数量\n";
	cout << "maxGridSize[0]: " << devProps.maxGridSize[0] << "\n";
	cout << "maxGridSize[1]: " << devProps.maxGridSize[1] << "\n";
	cout << "maxGridSize[2]: " << devProps.maxGridSize[2] << "\n\n";

	cout << "clockRate: " << devProps.clockRate << "\n";
	cout << "totalConstMem: " << devProps.totalConstMem << "\n";
	cout << "textureAlignment: " << devProps.textureAlignment << "\n\n";

	cout << "计算能力:" << devProps.major << "." << devProps.minor << "\n\n";

	cout << "minor: " << devProps.minor << "\n";
	cout << "texturePitchAlignment: " << devProps.texturePitchAlignment << "\n";
	cout << "deviceOverlap: " << devProps.deviceOverlap << "\n";
	cout << "multiProcessorCount: " << devProps.multiProcessorCount << "\n";
	cout << "kernelExecTimeoutEnabled: " << devProps.kernelExecTimeoutEnabled << "\n";
	cout << "integrated: " << devProps.integrated << "\n";
	cout << "canMapHostMemory: " << devProps.canMapHostMemory << "\n";
	cout << "computeMode: " << devProps.computeMode << "\n";
	cout << "maxTexture1D: " << devProps.maxTexture1D << "\n";
	cout << "maxTexture1DMipmap: " << devProps.maxTexture1DMipmap << "\n";
	cout << "maxTexture1DLinear: " << devProps.maxTexture1DLinear << "\n";
	// The following limits are arrays (one entry per dimension); print
	// every element instead of the decayed pointer.
	printDims("maxTexture2D", devProps.maxTexture2D, 2);
	printDims("maxTexture2DMipmap", devProps.maxTexture2DMipmap, 2);
	printDims("maxTexture2DLinear", devProps.maxTexture2DLinear, 3);
	printDims("maxTexture2DGather", devProps.maxTexture2DGather, 2);
	printDims("maxTexture3D", devProps.maxTexture3D, 3);
	printDims("maxTexture3DAlt", devProps.maxTexture3DAlt, 3);
	cout << "maxTextureCubemap: " << devProps.maxTextureCubemap << "\n";
	printDims("maxTexture1DLayered", devProps.maxTexture1DLayered, 2);
	printDims("maxTexture2DLayered", devProps.maxTexture2DLayered, 3);
	printDims("maxTextureCubemapLayered", devProps.maxTextureCubemapLayered, 2);
	cout << "maxSurface1D: " << devProps.maxSurface1D << "\n";
	printDims("maxSurface2D", devProps.maxSurface2D, 2);
	printDims("maxSurface3D", devProps.maxSurface3D, 3);
	printDims("maxSurface1DLayered", devProps.maxSurface1DLayered, 2);
	printDims("maxSurface2DLayered", devProps.maxSurface2DLayered, 3);
	cout << "maxSurfaceCubemap: " << devProps.maxSurfaceCubemap << "\n";
	printDims("maxSurfaceCubemapLayered", devProps.maxSurfaceCubemapLayered, 2);
	cout << "surfaceAlignment: " << devProps.surfaceAlignment << "\n";
	cout << "concurrentKernels: " << devProps.concurrentKernels << "\n";
	cout << "ECCEnabled: " << devProps.ECCEnabled << "\n";
	cout << "pciBusID: " << devProps.pciBusID << "\n";
	cout << "pciDeviceID: " << devProps.pciDeviceID << "\n";
	cout << "pciDomainID: " << devProps.pciDomainID << "\n";
	cout << "tccDriver: " << devProps.tccDriver << "\n";
	cout << "asyncEngineCount: " << devProps.asyncEngineCount << "\n";
	cout << "unifiedAddressing: " << devProps.unifiedAddressing << "\n";
	cout << "memoryClockRate: " << devProps.memoryClockRate << "\n";
	cout << "memoryBusWidth: " << devProps.memoryBusWidth << "\n";
	cout << "l2CacheSize: " << devProps.l2CacheSize << "\n";
	cout << "maxThreadsPerMultiProcessor: " << devProps.maxThreadsPerMultiProcessor << "\n";
	cout << "streamPrioritiesSupported: " << devProps.streamPrioritiesSupported << "\n";
	cout << "globalL1CacheSupported: " << devProps.globalL1CacheSupported << "\n";
	cout << "localL1CacheSupported: " << devProps.localL1CacheSupported << "\n";
	cout << "sharedMemPerMultiprocessor: " << devProps.sharedMemPerMultiprocessor << "\n";
	cout << "regsPerMultiprocessor: " << devProps.regsPerMultiprocessor << "\n";
	cout << "isMultiGpuBoard: " << devProps.isMultiGpuBoard << "\n";
	cout << "multiGpuBoardGroupID: " << devProps.multiGpuBoardGroupID << "\n";
	cout << "singleToDoublePrecisionPerfRatio: " << devProps.singleToDoublePrecisionPerfRatio << "\n";
	cout << "pageableMemoryAccess: " << devProps.pageableMemoryAccess << "\n";
	cout << "concurrentManagedAccess: " << devProps.concurrentManagedAccess << "\n";

	return 0;
}

Insert picture description here

3. CUDA kernel function parameters (Kernel Function)

To understand the tasks performed by threads in GPU programming.

/*
kernel函数的样例
*/
// CUDA核函数的定义
// CUDA kernel: element-wise vector addition, c[i] = a[i] + b[i].
// NOTE(review): no bounds check is performed — this sample assumes a
// single-block launch where blockDim.x equals the array length, as in
// the <<<1, N>>> call shown in the surrounding text.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int idx = threadIdx.x;   // one array element per thread
    c[idx] = a[idx] + b[idx];
}

// CUDA核函数调用
addKernel<<<Dg,Db, Ns, S>>>(c, a, b);

Need to pay attention to the following points:
1. The qualifier `__global__` before the function indicates that the function is executed on the GPU.
2. When the function is called, the number of threads must be determined by <<< >>>.
3. The meaning of each parameter when calling the kernel function:
Dg : int type or dim3 type (x, y, z), used to define how the blocks in a Grid are organized, if it is an int type, it means a one-dimensional organizational structure
Db : int type or dim3 type (x, y, z), used to define how the Threads in a Block are organized; if it is an int type, it means a one-dimensional organizational structure
Ns : size_t type, can be the default, the default is 0; It is used to set the maximum shared memory size that can be dynamically allocated for each block in addition to the statically allocated shared memory, in bytes. 0 means that no dynamic allocation is required.
S : cudaStream_t type, can be defaulted, the default is 0. Indicates which stream the kernel function is located in.

3.1 1d Grid 和 1d Block

The calling mode of cuda:
The figure below shows that a grid has 4 block blocks (numbers: 0-3), and a block block has 8 threads (numbers: 0-7).

kernel_name<<<4, 8>>>(...)

The thread index calculation method in the cuda core function is as follows:

int threadId = blockIdx.x * blockDim.x + threadIdx.x 

3.2 2d Grid 和 2d Block

The way to calculate the index for high dimensions is similar to calculating the number of data in a multidimensional array. It is necessary to calculate the number of Threads in all Blocks in front of Thread, and add the serial number of the Thread in this Block.

Call form:

dim3 grid(4,1), block(2,2);
kernel_name<<<grid, block>>>(...)

Index calculation method:

int x = threadIdx.x + blockIdx.x * blockDim.x; 
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; 

or

int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;

Note : In two-dimensional indexing, Thread(0,1) refers to the thread at row 1, column 0 — the opposite of the conventional (x, y) coordinate reading. Likewise, defining grid(4,2) means the first dimension has 4 index values and the second dimension has 2, i.e. 2 rows and 4 columns of blocks.
Insert picture description here
Note : dim3 is a custom integer vector type in NVIDIA's CUDA programming, based on uint3 for specifying dimensions.

E.g:

dim3 grid(num1,num2,num3);

A dim3 value is always a three-dimensional vector with components x, y, z; any component not specified at construction defaults to 1.

3.3 3d Grid 和 3d Block

3d is relatively rare, and its thread calculation formula is as follows:

int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) 
                       + (threadIdx.z * (blockDim.x * blockDim.y)) 
                       + (threadIdx.y * blockDim.x) + threadIdx.x;   

3.4 Summary of calculation formulas

One-dimensional Grid One-dimensional Block

blockId = blockIdx.x 
threadId = blockIdx.x *blockDim.x + threadIdx.x

One-dimensional Grid Two-dimensional Block

blockId = blockIdx.x 
threadId = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x

One-dimensional Grid Three-dimensional Block

blockId = blockIdx.x 
threadId = blockIdx.x * blockDim.x * blockDim.y * blockDim.z 
                        + threadIdx.z * blockDim.y * blockDim.x 
                        + threadIdx.y * blockDim.x + threadIdx.x

Two-dimensional Grid One-dimensional Block

int blockId = blockIdx.y * gridDim.x + blockIdx.x;  
int threadId = blockId * blockDim.x + threadIdx.x;

Two-dimensional Grid Two-dimensional Block

int blockId = blockIdx.x + blockIdx.y * gridDim.x;  
int threadId = blockId * (blockDim.x * blockDim.y) 
                       + (threadIdx.y * blockDim.x) + threadIdx.x;  

2D Grid 3D Block

int blockId = blockIdx.x + blockIdx.y * gridDim.x;  
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)  
                       + (threadIdx.z * (blockDim.x * blockDim.y))  
                       + (threadIdx.y * blockDim.x) + threadIdx.x;

Three-dimensional Grid One-dimensional Block

int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;  
int threadId = blockId * blockDim.x + threadIdx.x;  

3D Grid 2D Block

int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;  
int threadId = blockId * (blockDim.x * blockDim.y) 
                       + (threadIdx.y * blockDim.x) + threadIdx.x; 

3D Grid 3D Block

int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;  
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)  
                       + (threadIdx.z * (blockDim.x * blockDim.y))  
                       + (threadIdx.y * blockDim.x) + threadIdx.x;

Guess you like

Origin blog.csdn.net/daijingxin/article/details/109017574