Chapter 3: From Loops to Grids


3.1 Parallelizing dist_v1

#include <stdio.h>
#include "stdlib.h"

#define N 64
#define TPB 32 // each thread block contains 32 threads



__device__ float scale(int i, int n)
{
	return ((float)i / (n - 1));
}

__device__ float distance(float x1, float x2)
{
	return sqrt((x2 - x1)*(x2 - x1));
}

__global__ void distanceKernel(float *d_out, float ref, int len)
{
	// The serial loop index becomes a global thread index computed from block and thread IDs.
	const int i = blockIdx.x*blockDim.x + threadIdx.x;
	const float x = scale(i, len);
	d_out[i] = distance(x, ref);
	printf("i=%2d:dist from %f to %f is %f.\n", i, ref, x, d_out[i]);
}

int main()
{
	
	const float ref = 0.5f;

	float *d_out = 0;
	cudaMalloc(&d_out, N * sizeof(float));

	distanceKernel<<<N / TPB, TPB>>>(d_out, ref, N);
	cudaDeviceSynchronize(); // wait for the kernel to finish so its printf output appears

	cudaFree(d_out);
	system("pause");
	return 0;
}

Running the program prints one distance line per thread. With N = 64 and TPB = 32, the launch creates N / TPB = 2 blocks of 32 threads, so i = blockIdx.x*blockDim.x + threadIdx.x covers indices 0 through 63, one thread per array element.

Note: the indices do not appear in order from 0 to 63 in the output. This is a fundamental difference between serial and parallel execution.

In a serial application, the computations inside a loop execute in a fixed order.

In CUDA, you give up some control over the order of computation in exchange for the speedup that comes from hundreds or thousands of processors computing in parallel.
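
For contrast, a serial version of the same computation (a minimal sketch, not part of the original post) is just an ordered loop on the CPU:

#include <stdio.h>
#include <math.h>

#define N 64

float scale(int i, int n)
{
	return ((float)i / (n - 1));
}

float distance(float x1, float x2)
{
	return sqrt((x2 - x1)*(x2 - x1));
}

int main()
{
	const float ref = 0.5f;
	float out[N];

	// The loop fixes the order: index 0 is computed first, then 1, ..., then 63.
	for (int i = 0; i < N; i++)
	{
		const float x = scale(i, N);
		out[i] = distance(x, ref);
		printf("i=%2d:dist from %f to %f is %f.\n", i, ref, x, out[i]);
	}
	return 0;
}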

3.2 Parallelizing dist_v2

kernel.h

#pragma once

void distanceArray(float *out, float *in, float ref, int len);

kernel.cu

#include "kernel.h"
#include <stdio.h>

#define TPB 32

__device__ float distance(float x1, float x2)
{
	return sqrt((x2 - x1)*(x2 - x1));
}

__global__ void distanceKernel(float *d_out, float *d_in, float ref)
{
	const int i = blockIdx.x*blockDim.x + threadIdx.x;
	const float x = d_in[i];
	d_out[i] = distance(x, ref);
	printf("i=%2d:dist from %f to %f is %f.\n", i, ref, x, d_out[i]);
}

void distanceArray(float *out, float *in, float ref, int len)
{
	float *d_in = 0;
	float *d_out = 0;

	cudaMalloc(&d_in, len * sizeof(float));
	cudaMalloc(&d_out, len * sizeof(float));

	cudaMemcpy(d_in, in, len * sizeof(float), cudaMemcpyHostToDevice);
	
	distanceKernel<<<len / TPB, TPB>>>(d_out, d_in, ref);
	cudaMemcpy(out, d_out, len * sizeof(float), cudaMemcpyDeviceToHost);

	cudaFree(d_in);
	cudaFree(d_out);
}
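
Note that the launch above uses len / TPB blocks, which covers every element only when len is an exact multiple of TPB (as it is here, with N = 64 and TPB = 32). A more general variant (a sketch, not from the original post; distanceKernelBounded is an illustrative name) rounds the block count up and bounds-checks inside the kernel:

// Bounds-checked variant of the kernel above; takes the array length as an extra argument.
__global__ void distanceKernelBounded(float *d_out, float *d_in, float ref, int len)
{
	const int i = blockIdx.x*blockDim.x + threadIdx.x;
	if (i >= len) return; // surplus threads in the last block do nothing
	d_out[i] = distance(d_in[i], ref);
}

// Inside distanceArray, round the grid size up so every element is covered:
//	const int blocks = (len + TPB - 1) / TPB;
//	distanceKernelBounded<<<blocks, TPB>>>(d_out, d_in, ref, len);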

main.cpp

#include "kernel.h"
#include <stdlib.h>
#define N 64

float scale(int i, int n)
{
	return ((float)i / (n - 1));
}

int main()
{
	const float ref = 0.5f;
	float *in = (float*)calloc(N, sizeof(float));
	float *out = (float*)calloc(N, sizeof(float));

	for (int i = 0; i<N; i++)
	{
		in[i] = scale(i, N);
	}
	distanceArray(out, in, ref, N);
	free(in);
	free(out);
	system("pause");
	return 0;
}

Recommended strategy when using CUDA:

  • Copy your data to the device once.
  • Launch a kernel that does a large amount of work, so that the gains from massive parallelism far outweigh the cost of the memory transfers (one way to measure this is sketched below).
  • Copy the results back to the host only once.
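
One way to check that the kernel's work actually outweighs the transfer cost (a sketch using CUDA events, inserted into distanceArray; not part of the original post) is to time the stages separately:

	cudaEvent_t start, stop;
	float ms = 0.0f;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// Time the host-to-device copy.
	cudaEventRecord(start);
	cudaMemcpy(d_in, in, len * sizeof(float), cudaMemcpyHostToDevice);
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&ms, start, stop);
	printf("host-to-device copy: %f ms\n", ms);

	// Time the kernel itself.
	cudaEventRecord(start);
	distanceKernel<<<len / TPB, TPB>>>(d_out, d_in, ref);
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&ms, start, stop);
	printf("kernel: %f ms\n", ms);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);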

Streamlining with unified memory and managed arrays: cudaMallocManaged allocates memory that both the host and the device can access through the same pointer, so the explicit cudaMemcpy calls disappear. Because kernel launches are asynchronous, cudaDeviceSynchronize() is needed before the arrays are freed (it also flushes the kernel's printf output):

#include <stdio.h>
#include <stdlib.h>

#define N 64
#define TPB 32

float scale(int i, int n)
{
	return ((float)i / (n - 1));
}

__device__ float distance(float x1, float x2)
{
	return sqrt((x2 - x1)*(x2 - x1));
}

__global__ void distanceKernel(float *d_out, float *d_in, float ref)
{
	const int i = blockIdx.x*blockDim.x + threadIdx.x;
	const float x = d_in[i];
	d_out[i] = distance(x, ref);
	printf("i=%2d:dist from %2d to %2d is %f.\n", i, ref, x, d_out[i]);
}

int main()
{
	const float ref = 0.5f;
	float *in = 0;
	float *out = 0;

	cudaMallocManaged(&in, N * sizeof(float));
	cudaMallocManaged(&out, N * sizeof(float));

	for (int i = 0; i < N; i++)
	{
		in[i] = scale(i, N);
	}
	distanceKernel<<<N / TPB, TPB>>>(out, in, ref);
	cudaDeviceSynchronize();
	cudaFree(in);
	cudaFree(out);

	system("pause");
	return 0;
}
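
Because the managed arrays are visible to the host as well, the results can be inspected directly once cudaDeviceSynchronize() has returned, for example (a minimal sketch placed before the cudaFree calls; not in the original post):

	// After cudaDeviceSynchronize(), the host can read the managed array directly.
	for (int i = 0; i < N; i++)
	{
		printf("out[%2d] = %f\n", i, out[i]);
	}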
