并行化处理扫描

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include<stdio.h>

#define ARRAY_SIZE 8 //一个block开8个线程
/**
 * Inclusive prefix sum of d_in into d_out using a Hillis-Steele style
 * doubling scan, performed by a single thread block directly in global memory.
 *
 * Launch layout: exactly one block with one thread per element, i.e.
 * global_scan<<<1, n>>>(d_out, d_in). The element count is taken from
 * blockDim.x, so both buffers must hold at least blockDim.x floats.
 * No shared memory is used.
 *
 * @param d_out device buffer that receives the running sums
 * @param d_in  device buffer holding the input values
 */
__global__ void global_scan(float*d_out, float*d_in)
{
	// Only one block is launched, so the thread index is the element index.
	const int idx = threadIdx.x;
	// Element count comes from the launch configuration. (sizeof on the
	// pointer argument would give the pointer size, not the array length.)
	const int count = blockDim.x;

	// Seed the output with the input so the scan can run in place on d_out.
	d_out[idx] = d_in[idx];
	__syncthreads();

	// After the pass with a given stride, d_out[idx] holds the sum of the
	// last 2*stride inputs ending at idx (or of all inputs up to idx).
	for (int stride = 1; stride < count; stride *= 2)
	{
		// The same predicate must guard both the read and the write phase;
		// threads with idx < stride already hold their final value.
		const bool active = (idx >= stride);
		float sum = 0.0f;
		if (active)
		{
			sum = d_out[idx] + d_out[idx - stride];
		}
		// Every read of this pass must finish before any element is overwritten.
		__syncthreads();
		if (active)
		{
			d_out[idx] = sum;
		}
		// Every write of this pass must be visible before the next pass reads.
		__syncthreads();
	}
}


// Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess.
static bool cuda_call_ok(cudaError_t status, const char *what)
{
	if (status == cudaSuccess)
	{
		return true;
	}
	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return false;
}

// Prints count floats, four per row: tab between values, newline after every fourth.
static void print_values(const float *values, int count)
{
	for (int i = 0; i < count; i++)
	{
		printf("%f", values[i]);
		printf((i % 4 != 3) ? "\t" : "\n");
	}
}

/**
 * Fills a host array with 0, 1, 2, ..., runs global_scan over it on the
 * device with one block of ARRAY_SIZE threads, and prints the input followed
 * by the scanned output.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main()
{
	// Byte size shared by every allocation and copy below.
	const size_t bytes = sizeof(float) * ARRAY_SIZE;

	// Host buffers.
	float h_in[ARRAY_SIZE] = { 0.0f };
	float h_out[ARRAY_SIZE] = { 0.0f };
	for (int i = 0; i < ARRAY_SIZE; i++)
	{
		h_in[i] = float(i);
	}
	print_values(h_in, ARRAY_SIZE);
	printf("\n");

	// Device buffers.
	float *d_out = 0;
	float *d_in = 0;
	bool ok = cuda_call_ok(cudaMalloc(&d_out, bytes), "cudaMalloc(d_out)")
		&& cuda_call_ok(cudaMalloc(&d_in, bytes), "cudaMalloc(d_in)")
		&& cuda_call_ok(cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");
	if (ok)
	{
		global_scan<<<1, ARRAY_SIZE>>>(d_out, d_in);
		// A kernel launch returns nothing, so launch errors must be polled
		// explicitly; the blocking copy back then surfaces execution errors.
		ok = cuda_call_ok(cudaGetLastError(), "global_scan launch")
			&& cuda_call_ok(cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
	}

	// Release device memory on every path; freeing a null pointer is a no-op.
	cudaFree(d_in);
	cudaFree(d_out);
	if (!ok)
	{
		return 1;
	}

	// Formatted output on the host.
	print_values(h_out, ARRAY_SIZE);
	return 0;
}

这个程序还可以进一步节省时间！因为81.8%的时间都用在了读写内存上。
在这里插入图片描述

小宅520

发布了49 篇原创文章 · 获赞 18 · 访问量 1430

私信关注

猜你喜欢