CUDA two-dimensional and three-dimensional array traversal


A grid is divided into multiple blocks along each dimension; the number of blocks is gridDim.x * gridDim.y.
Traversal indices: blockIdx.x, blockIdx.y

A block is divided into multiple threads along each dimension; the number of threads per block is blockDim.x * blockDim.y. The thread is the smallest unit of execution.
Traversal indices: threadIdx.x, threadIdx.y
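
A minimal sketch of how these built-ins combine (the kernel name whoAmI is invented here for illustration): every thread can compute a unique global 2D coordinate, plus the total extent of the grid in threads.

#include <cstdio>

__global__ void whoAmI() {
    // Global (x, y) coordinate of this thread within the whole grid
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Total number of threads launched along each axis
    int nx = gridDim.x * blockDim.x;
    int ny = gridDim.y * blockDim.y;
    printf("thread (%d, %d) of (%d, %d)\n", x, y, nx, ny);
}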

2D data processing

cudaMemcpy2D(d_A,                     // destination pointer
             d_pitch,                 // destination pitch (bytes)
             bmp1,                    // source pointer
             sizeof(int) * 2,         // source pitch (bytes)
             sizeof(int) * 2,         // width of the copied region (bytes)
             2,                       // height of the copied region (rows)
             cudaMemcpyHostToDevice); // copy a 2D array from CPU to GPU
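
Note that the destination pitch is whatever cudaMallocPitch returned, not the logical row width: the allocator may pad each row beyond the requested sizeof(int) * 2 bytes so that every row starts on an alignment boundary suited to coalesced access.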


#define H 2
#define W 3


__global__ void process(float* d_mat, size_t pitch) {
    // Grid-stride loop in y: the stride is the total thread count
    // in y (blockDim.y * gridDim.y), so any grid size covers all H rows
    for (int h = blockIdx.y * blockDim.y + threadIdx.y; h < H; h += blockDim.y * gridDim.y)
    {
        // pitch is in bytes, so step to row h through a char* cast
        float* row_d_mat = (float*)((char*)d_mat + h * pitch);
        // Grid-stride loop in x, stride blockDim.x * gridDim.x
        for (int w = blockIdx.x * blockDim.x + threadIdx.x; w < W; w += blockDim.x * gridDim.x)
        {
            // process row_d_mat[w]
        }
    }
}
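
Because both loops are grid-stride loops, the launch configuration does not have to cover the whole H × W array: a smaller grid still touches every element, with each thread simply processing more than one.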

int main() {
    // Define a host array
    float h_mat[H][W] = {{1, 2, 3}, {11, 12, 13}};

    // Allocate pitched 2D memory on the device and zero it
    float* d_mat;
    size_t pitch;
    cudaMallocPitch(&d_mat, &pitch, sizeof(float) * W, H);
    cudaMemset2D(d_mat, pitch, 0, sizeof(float) * W, H);

    // Copy to device
    cudaMemcpy2D(d_mat, pitch, h_mat, sizeof(float) * W, sizeof(float) * W, H, cudaMemcpyHostToDevice);

    // Launch: one thread per element is enough here; the grid-stride
    // loops tolerate any other configuration as well
    dim3 blocksize(16, 16);
    dim3 gridsize((W + blocksize.x - 1) / blocksize.x, (H + blocksize.y - 1) / blocksize.y);
    process<<<gridsize, blocksize>>>(d_mat, pitch);
    cudaDeviceSynchronize();

    // Copy back to host
    cudaMemcpy2D(h_mat, sizeof(float) * W, d_mat, pitch, sizeof(float) * W, H, cudaMemcpyDeviceToHost);

    cudaFree(d_mat);
    return 0;
}
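
None of the calls above are checked for errors. A minimal defensive sketch that can wrap each of them (the macro name CUDA_CHECK is invented here):

#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// e.g. CUDA_CHECK(cudaMallocPitch(&d_mat, &pitch, sizeof(float) * W, H));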

3D data processing


#define Z 2 // number of slices, matching h_mat[Z][H][W] below

__global__ void process(cudaPitchedPtr d_F1, cudaExtent extent_3d) {
    double* devd_F1 = (double*)d_F1.ptr;
    size_t pitchf1 = d_F1.pitch;                      // row pitch in bytes
    size_t slicePitchf1 = pitchf1 * extent_3d.height; // slice pitch in bytes
    double *slice_F1, *row_F1;
    // Grid-stride loop over slices (z direction)
    for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < Z; i += blockDim.y * gridDim.y) {
        slice_F1 = (double*)((char*)devd_F1 + i * slicePitchf1);
        // Grid-stride loop over rows (y direction)
        for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < H; j += blockDim.x * gridDim.x) {
            row_F1 = (double*)((char*)slice_F1 + j * pitchf1);
            // Each thread walks its whole row (x direction)
            for (int k = 0; k < W; k++) {
                double temp1 = row_F1[k]; // placeholder read of element (k, j, i)
            }
        }
    }
}
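
In other words, the element at logical coordinate (x = k, y = j, z = i) sits at byte offset i * slicePitchf1 + j * pitchf1 + k * sizeof(double) from the base pointer, where slicePitchf1 = pitchf1 * extent_3d.height; the casts to char* are what make the pitch arithmetic operate in bytes.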

int main() {
    // Allocate pitched 3D device memory
    cudaPitchedPtr d_3d;
    cudaExtent extent_3d = make_cudaExtent(sizeof(double) * W, H, Z);
    cudaMalloc3D(&d_3d, extent_3d);

    double h_mat[Z][H][W] = {{{1, 2, 3}, {11, 12, 13}},
                             {{1, 2, 3}, {11, 12, 13}}};
    cudaMemcpy3DParms cpyParm = {0};

    // Copy to device
    cpyParm.kind = cudaMemcpyHostToDevice;
    cpyParm.extent = extent_3d;
    cpyParm.srcPtr = make_cudaPitchedPtr((void*)h_mat, sizeof(double) * W, W, H);
    cpyParm.dstPtr = d_3d;
    cudaMemcpy3D(&cpyParm);

    // Launch (x covers the rows, y covers the slices, matching the kernel)
    dim3 blocksize(16, 16);
    dim3 gridsize((H + blocksize.x - 1) / blocksize.x, (Z + blocksize.y - 1) / blocksize.y);
    process<<<gridsize, blocksize>>>(d_3d, extent_3d);
    cudaDeviceSynchronize();

    // Copy back to host (kind, srcPtr and dstPtr are reassigned and the
    // extent is unchanged, so the same parameter struct can be reused)
    cpyParm.kind = cudaMemcpyDeviceToHost;
    cpyParm.srcPtr = d_3d;
    cpyParm.dstPtr = make_cudaPitchedPtr((void*)h_mat, sizeof(double) * W, W, H);
    cudaMemcpy3D(&cpyParm);

    cudaFree(d_3d.ptr);
    return 0;
}
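
One detail worth remembering: for pitched allocations such as cudaMalloc3D, the extent's width is specified in bytes (hence sizeof(double) * W), while height and depth count rows and slices.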

Original article: blog.csdn.net/long630576366/article/details/125411816