理解二维block和二维thread

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include<stdio.h>
#include<stdlib.h>

// Output grid dimensions: W columns by H rows (passed to the kernel as w and h).
#define W 20
#define H 20
// Thread-block dimensions used to build dim3 blockSize(TX, TY) in main.
#define TX 32 // threads per block along x (one row of a block holds 32 threads)
#define TY 32 // threads per block along y

// Fills a row-major w-by-h grid with the Euclidean distance from each cell's
// (column, row) coordinate to the reference point pos.
//
// Launch layout: 2D grid of 2D blocks; each thread handles at most one cell.
//   d_out - output buffer, indexed as row * w + column (needs w * h floats)
//   w, h  - grid width (columns) and height (rows)
//   pos   - reference point; pos.x is compared against the column index and
//           pos.y against the row index
// Each in-range thread also prints its flat index and distance via device
// printf.
__global__ void distanceKernel(float *d_out, int w, int h, float2 pos)
{
	const int col = threadIdx.x + blockDim.x * blockIdx.x; // global column index
	const int row = threadIdx.y + blockDim.y * blockIdx.y; // global row index

	// The launch may round up past the grid size, so surplus threads must
	// not touch memory outside the w-by-h buffer.
	if (col < w && row < h)
	{
		const int idx = row * w + col; // flatten (row, col) into a 1D offset
		const float dx = col - pos.x;
		const float dy = row - pos.y;
		const float dist = sqrtf(dx * dx + dy * dy); // sqrtf: single-precision sqrt

		d_out[idx] = dist;
		printf("当前的线程好为:%d\t当前距离为%f\n", idx, dist);
	}
}


// Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
	if (status == cudaSuccess) return true;
	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return false;
}

// Computes the W-by-H distance grid on the device with distanceKernel, copies
// it back to the host, and releases all memory.
// Returns EXIT_SUCCESS when every allocation and CUDA call succeeded,
// EXIT_FAILURE otherwise.
int main()
{
	const size_t bytes = sizeof(float) * W * H;

	float *out = (float*)calloc(W * H, sizeof(float));
	if (out == NULL)
	{
		fprintf(stderr, "calloc failed for %zu bytes\n", bytes);
		return EXIT_FAILURE;
	}

	float *d_out = 0;
	if (!cudaSucceeded(cudaMalloc(&d_out, bytes), "cudaMalloc"))
	{
		free(out);
		return EXIT_FAILURE;
	}

	const float2 pos = { 0.0f, 0.0f };
	const dim3 blockSize(TX, TY); // each block is TX threads wide and TY threads tall
	// Ceiling division so the grid covers W x H even when the sizes are not
	// multiples of the block dimensions.
	const int bx = (W + TX - 1) / TX;
	const int by = (H + TY - 1) / TY;
	const dim3 gridSize(bx, by);

	// Launch the kernel. A launch itself returns nothing, so configuration
	// errors have to be fetched with cudaGetLastError().
	distanceKernel<<<gridSize, blockSize>>>(d_out, W, H, pos);
	bool ok = cudaSucceeded(cudaGetLastError(), "distanceKernel launch");

	// Copy the result back to the host. This blocking copy waits for the
	// kernel, so errors raised while the kernel ran are reported here.
	ok = ok && cudaSucceeded(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");

	// Release device memory regardless of whether the steps above succeeded.
	ok = cudaSucceeded(cudaFree(d_out), "cudaFree") && ok;

	// Optional: print the results, one grid row per line.
	//for (int i = 0; i < W*H; i++)
	//{
	//	printf("%f", out[i]);
	//	(i % W != W - 1) ? printf("\t") : printf("\n");
	//}
	free(out);

#ifdef _WIN32
	// Keep the console window open; "pause" is a Windows shell command only.
	system("pause");
#endif
	return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
发布了49 篇原创文章 · 获赞 18 · 访问量 1429

猜你喜欢

转载自blog.csdn.net/qq_44099721/article/details/103571268