GPU编程 线程配置

1.线程的组成结构

具体的结构从上至下可以分为:Grid-> Block-> Warp ->thread。

Grid/Block/Warp/Thread都是软件的组织结构,并不是硬件的,因此理论上我们可以以任意的维度(一维、二维、三维)去排列Thread;在硬件上就是一个个的SP,并没有维度这一说,只是软件上抽象成了具有维度的概念。

Warp是结构中的第三个层次,32个线程为一组。

在这里插入图片描述

2.确定自己显卡的配置

在进行程序设计前要注意显卡所支持的thread数量和block数量是有限制的。因此首先要对显卡的配置进行确认。否则一旦程序超过了显卡的硬件限制,便会停止工作。

打印显卡配置的cuda例程:

/***
 *打印显卡属性
 *2017-11-8
 * ***/

#include <cstdlib>
#include<iostream>
#include <cuda_runtime.h>
#include<device_launch_parameters.h>

using namespace std;

// Terminates the program with a readable message when a CUDA runtime call
// does not return cudaSuccess. `what` names the call for the error report.
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		std::cerr << "error: " << what << " failed: " << cudaGetErrorString(status) << "\n";
		std::exit(EXIT_FAILURE);
	}
}

// Prints an array-valued device property as "label: a x b x c".
// Streaming such an array directly would let it decay to a pointer and print
// an address instead of the dimensions.
template <typename T, std::size_t N>
static void printDims(const char *label, const T (&dims)[N])
{
	std::cout << label << ": ";
	for (std::size_t i = 0; i < N; ++i)
	{
		if (i != 0)
		{
			std::cout << " x ";
		}
		std::cout << dims[i];
	}
	std::cout << "\n";
}

// Queries device 0 and prints the fields of its cudaDeviceProp structure.
// Returns 0 on success; exits with EXIT_FAILURE if no CUDA device is present
// or if any CUDA runtime call fails.
// NOTE(review): some of the fields printed below are deprecated in newer CUDA
// toolkits — confirm against the cudaDeviceProp definition of the toolkit in use.
int main()
{
	// Initialised so the value is well defined even if the query fails.
	int deviceCount = 0;

	checkCuda(cudaGetDeviceCount(&deviceCount), "cudaGetDeviceCount");	// number of CUDA devices
	std::cout << "deviceCount:   " << deviceCount << "\n\n";
	if (deviceCount == 0)
	{
		std::cout << "error: no devices supporting CUDA.\n";
		std::exit(EXIT_FAILURE);
	}

	const int dev = 0;
	checkCuda(cudaSetDevice(dev), "cudaSetDevice");	// make device 0 current for this host thread

	cudaDeviceProp devProps;
	checkCuda(cudaGetDeviceProperties(&devProps, dev), "cudaGetDeviceProperties");

	std::cout << "name: " << devProps.name << "\n";
	std::cout << "totalGlobalMem: " << devProps.totalGlobalMem << "\n";
	std::cout << "regsPerBlock: " << devProps.regsPerBlock << "\n";
	std::cout << "warpSize: " << devProps.warpSize << "\n";
	std::cout << "memPitch: " << devProps.memPitch << "\n\n";

	std::cout << "一个线程块中可使用的最大共享内存\n";
	std::cout << "devProps.sharedMemPerBlock: " << devProps.sharedMemPerBlock << "\n\n";

	std::cout << "一个线程块中可包含的最大线程数量\n";
	std::cout << "maxThreadsPerBlock: " << devProps.maxThreadsPerBlock << "\n\n";

	std::cout << "多维线程块数组中每一维可包含的最大线程数量\n";
	std::cout << "maxThreadsDim[0]: " << devProps.maxThreadsDim[0] << "\n";
	std::cout << "maxThreadsDim[1]: " << devProps.maxThreadsDim[1] << "\n";
	std::cout << "maxThreadsDim[2]: " << devProps.maxThreadsDim[2] << "\n\n";

	std::cout << "一个线程格中每一维可包含的最大线程块数量\n";
	std::cout << "maxGridSize[0]: " << devProps.maxGridSize[0] << "\n";
	std::cout << "maxGridSize[1]: " << devProps.maxGridSize[1] << "\n";
	std::cout << "maxGridSize[2]: " << devProps.maxGridSize[2] << "\n\n";

	std::cout << "clockRate: " << devProps.clockRate << "\n";
	std::cout << "totalConstMem: " << devProps.totalConstMem << "\n";
	std::cout << "textureAlignment: " << devProps.textureAlignment << "\n\n";

	std::cout << "计算能力:" << devProps.major << "." << devProps.minor << "\n\n";

	std::cout << "minor: " << devProps.minor << "\n";
	std::cout << "texturePitchAlignment: " << devProps.texturePitchAlignment << "\n";
	std::cout << "deviceOverlap: " << devProps.deviceOverlap << "\n";
	std::cout << "multiProcessorCount: " << devProps.multiProcessorCount << "\n";
	std::cout << "kernelExecTimeoutEnabled: " << devProps.kernelExecTimeoutEnabled << "\n";
	std::cout << "integrated: " << devProps.integrated << "\n";
	std::cout << "canMapHostMemory: " << devProps.canMapHostMemory << "\n";
	std::cout << "computeMode: " << devProps.computeMode << "\n";

	// Texture limits: scalar fields are streamed directly, array fields go
	// through printDims so every dimension is shown.
	std::cout << "maxTexture1D: " << devProps.maxTexture1D << "\n";
	std::cout << "maxTexture1DMipmap: " << devProps.maxTexture1DMipmap << "\n";
	std::cout << "maxTexture1DLinear: " << devProps.maxTexture1DLinear << "\n";
	printDims("maxTexture2D", devProps.maxTexture2D);
	printDims("maxTexture2DMipmap", devProps.maxTexture2DMipmap);
	printDims("maxTexture2DLinear", devProps.maxTexture2DLinear);
	printDims("maxTexture2DGather", devProps.maxTexture2DGather);
	printDims("maxTexture3D", devProps.maxTexture3D);
	printDims("maxTexture3DAlt", devProps.maxTexture3DAlt);
	std::cout << "maxTextureCubemap: " << devProps.maxTextureCubemap << "\n";
	printDims("maxTexture1DLayered", devProps.maxTexture1DLayered);
	printDims("maxTexture2DLayered", devProps.maxTexture2DLayered);
	printDims("maxTextureCubemapLayered", devProps.maxTextureCubemapLayered);

	// Surface limits, same scalar/array split as the texture limits above.
	std::cout << "maxSurface1D: " << devProps.maxSurface1D << "\n";
	printDims("maxSurface2D", devProps.maxSurface2D);
	printDims("maxSurface3D", devProps.maxSurface3D);
	printDims("maxSurface1DLayered", devProps.maxSurface1DLayered);
	printDims("maxSurface2DLayered", devProps.maxSurface2DLayered);
	std::cout << "maxSurfaceCubemap: " << devProps.maxSurfaceCubemap << "\n";
	printDims("maxSurfaceCubemapLayered", devProps.maxSurfaceCubemapLayered);

	std::cout << "surfaceAlignment: " << devProps.surfaceAlignment << "\n";
	std::cout << "concurrentKernels: " << devProps.concurrentKernels << "\n";
	std::cout << "ECCEnabled: " << devProps.ECCEnabled << "\n";
	std::cout << "pciBusID: " << devProps.pciBusID << "\n";
	std::cout << "pciDeviceID: " << devProps.pciDeviceID << "\n";
	std::cout << "pciDomainID: " << devProps.pciDomainID << "\n";
	std::cout << "tccDriver: " << devProps.tccDriver << "\n";
	std::cout << "asyncEngineCount: " << devProps.asyncEngineCount << "\n";
	std::cout << "unifiedAddressing: " << devProps.unifiedAddressing << "\n";
	std::cout << "memoryClockRate: " << devProps.memoryClockRate << "\n";
	std::cout << "memoryBusWidth: " << devProps.memoryBusWidth << "\n";
	std::cout << "l2CacheSize: " << devProps.l2CacheSize << "\n";
	std::cout << "maxThreadsPerMultiProcessor: " << devProps.maxThreadsPerMultiProcessor << "\n";
	std::cout << "streamPrioritiesSupported: " << devProps.streamPrioritiesSupported << "\n";
	std::cout << "globalL1CacheSupported: " << devProps.globalL1CacheSupported << "\n";
	std::cout << "localL1CacheSupported: " << devProps.localL1CacheSupported << "\n";
	std::cout << "sharedMemPerMultiprocessor: " << devProps.sharedMemPerMultiprocessor << "\n";
	std::cout << "regsPerMultiprocessor: " << devProps.regsPerMultiprocessor << "\n";
	std::cout << "isMultiGpuBoard: " << devProps.isMultiGpuBoard << "\n";
	std::cout << "multiGpuBoardGroupID: " << devProps.multiGpuBoardGroupID << "\n";
	std::cout << "singleToDoublePrecisionPerfRatio: " << devProps.singleToDoublePrecisionPerfRatio << "\n";
	std::cout << "pageableMemoryAccess: " << devProps.pageableMemoryAccess << "\n";
	std::cout << "concurrentManagedAccess: " << devProps.concurrentManagedAccess << "\n";

	return 0;
}

在这里插入图片描述

3.CUDA核函数的参数(Kernel Function)

要对GPU编程中线程执行的任务进行了解。

/*
kernel函数的样例
*/
/**
 * CUDA kernel definition: element-wise integer addition, c[k] = a[k] + b[k].
 *
 * Each thread processes the one element selected by its threadIdx.x. The
 * block index is not used and no bounds check is performed, so every
 * threadIdx.x value in the launch must be a valid index into a, b and c.
 */
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int tid = threadIdx.x;
    const int sum = a[tid] + b[tid];
    c[tid] = sum;
}

// CUDA核函数调用
addKernel<<<Dg,Db, Ns, S>>>(c, a, b);

需要注意以下几点:
1.函数前的标识“__global__”表示该函数是核函数:由主机端调用,在GPU上执行。
2.在函数调用时,要通过<<< >>>来确定线程的数量。
3.调用kernel函数时各个参数的含义:
Dg:int型或者dim3类型(x,y,z),用于定义一个Grid中Block是如何组织的,如果是int型,则表示一维组织结构
Db:int型或者dim3类型(x,y,z),用于定义一个Block中Thread是如何组织的,如果是int型,则表示一维组织结构
Ns:size_t类型,可缺省,默认为0; 用于设置每个block除了静态分配的共享内存外,最多能动态分配的共享内存大小,单位为byte。 0表示不需要动态分配。
S:cudaStream_t类型,可缺省,默认为0。 表示该核函数位于哪个流。

3.1 1d Grid 和 1d Block

cuda的调用模式:
下图为1个grid有4个block块(编号:0-3),1个block块有8个线程(编号:0-7)。

kernel_name<<<4, 8>>>(...)

在cuda核函数中线程索引计算方式如下:

int threadId = blockIdx.x * blockDim.x + threadIdx.x 

3.2 2d Grid 和 2d Block

对于高维度的情况,计算索引的方式类似于计算多维数组中某个元素的线性偏移:需要先计算出该Thread之前所有Block中的Thread数量,再求出该Thread在本Block中的序号,两者相加即可。

调用形式:

dim3 grid(4,1), block(2,2);
kernel_name<<<grid, block>>>(...)

索引计算方式:

int x = threadIdx.x + blockIdx.x * blockDim.x; 
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; 

或者

// Linear index of this block within the 2D grid (x varies fastest).
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
// Threads in all preceding blocks plus this thread's offset inside its own 2D block.
int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;

注:在二维排序中,Thread(0,1)表示第1行第0列的Thread,这跟我们传统中理解的横坐标和纵坐标不太一样;我们定义grid(4,2)表示第一维度有4个索引值,第二个维度有2个索引值,即2行4列。
在这里插入图片描述
注:dim3是NVIDIA的CUDA编程中一种自定义的整型向量类型,基于uint3,用于指定维度;未显式指定的分量默认初始化为1。

例如:

dim3 grid(num1,num2,num3);

dim3类型最终设置的是一个三维向量,三维参数分别为x,y,z;

3.3 3d Grid 和 3d Block

3d的情况相对较少,它的线程计算公式如下:

// Linear index of this block within the 3D grid (x varies fastest, then y, then z).
int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
// Threads in all preceding blocks plus this thread's offset inside its own 3D block.
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
                       + (threadIdx.z * (blockDim.x * blockDim.y))
                       + (threadIdx.y * blockDim.x) + threadIdx.x;

3.4 计算公式汇总

一维Grid 一维Block

blockId = blockIdx.x 
threadId = blockIdx.x *blockDim.x + threadIdx.x

一维Grid 二维Block

blockId = blockIdx.x 
threadId = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x

一维Grid 三维Block

blockId = blockIdx.x 
threadId = blockIdx.x * blockDim.x * blockDim.y * blockDim.z 
                        + threadIdx.z * blockDim.y * blockDim.x 
                        + threadIdx.y * blockDim.x + threadIdx.x

二维Grid 一维Block

int blockId = blockIdx.y * gridDim.x + blockIdx.x;  
int threadId = blockId * blockDim.x + threadIdx.x;

二维Grid 二维Block

int blockId = blockIdx.x + blockIdx.y * gridDim.x;  
int threadId = blockId * (blockDim.x * blockDim.y) 
                       + (threadIdx.y * blockDim.x) + threadIdx.x;  

二维Grid 三维Block

int blockId = blockIdx.x + blockIdx.y * gridDim.x;  
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)  
                       + (threadIdx.z * (blockDim.x * blockDim.y))  
                       + (threadIdx.y * blockDim.x) + threadIdx.x;

三维Grid 一维Block

int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;  
int threadId = blockId * blockDim.x + threadIdx.x;  

三维Grid 二维Block

int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;  
int threadId = blockId * (blockDim.x * blockDim.y) 
                       + (threadIdx.y * blockDim.x) + threadIdx.x; 

三维Grid 三维Block

int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;  
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)  
                       + (threadIdx.z * (blockDim.x * blockDim.y))  
                       + (threadIdx.y * blockDim.x) + threadIdx.x;

猜你喜欢

转载自blog.csdn.net/daijingxin/article/details/109017574
今日推荐