OpenCL环境配置及测试

由于计算性能的要求，特别需要使用并行计算，这里对TBB和OpenCL都进行了一些测试。

测试了TBB和OpenCL之后，我的感觉是如果真的要做高性能的异构计算，那还是选择CUDA吧，并且应该选用专用计算的GPU。

首先先把自己简单的描述写在前面：

1.TBB（Thread Building Blocks）线程构建模块，是Intel公司开发的并行编程开发工具，我在vs2015上测试了TBB的效果，

测试平台vs2015 release x86 + tbb2018_20180312oss，然后测试的时候是两个数相加的运算，实际测试结果，release模式下的计算耗时还小些，我觉得是vs的优化能力实现的。但是实际上TBB确实调用了多个CPU剂型计算。

使用TBB并行计算时，CPU使用情况：

串行计算时，CPU的使用情况：

2.OpenCL，开始使用这个的时候，是因为这是一个现在在推行的异构计算标准，觉得它的计算能力，应该会有明显提升，然后也差强人意。主要的原因也还是手里没有一块像样的GPU，使用的作为显示的AMD GPU做的测试。

当然换用高性能的GPU时，使用OpenCL也是有性能提升的。

扫描二维码关注公众号，回复： 1216075 查看本文章

同样的计算，CPU耗时<10ms。

3.OpenCL的配置

TBB的配置：https://www.threadingbuildingblocks.org/，下载一个开发包即可，解压后添加路径即可。

OpenCL的配置，也是很简单，我这里使用的AMD的GPU，所以下载一个AMD APP SDK，在这里可以下载：http://hc.csdn.net/resources/classify?id=12

配置的方法：

（1）添加环境变量

（2）添加include目录

（3）添加lib目录

4.OpenCL测试代码

参考：https://blog.csdn.net/cloud_desktop/article/details/19107765

// amd_opencl_test.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"

#include <stdio.h>  
#include <stdlib.h>  
#include <string.h>  
#include <iostream>  
#include <time.h>

#ifdef MAC  
#include <OpenCL/cl.h>  
#else  
#include <CL/cl.h>  
#endif  

char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);

int main()
{
	cl_int errNum;


	/******** 第一部分 选择OpenCL平台，创建一个上下文 ********/
	cl_uint numPlatforms;
	cl_platform_id *platformIds;
	cl_context context = 0;

	// 1. Select an OpenCL platform to run on.  
	errNum = clGetPlatformIDs(0, NULL, &numPlatforms);                      // 1. 获取OpenCL平台数目  
	if (errNum != CL_SUCCESS || numPlatforms <= 0) {
		perror("Failed to find any OpenCL platforms.");
		exit(1);
	}
	printf("Platform Numbers: %d\n", numPlatforms);

	platformIds = (cl_platform_id *)malloc(
		sizeof(cl_platform_id) * numPlatforms);

	errNum = clGetPlatformIDs(numPlatforms, platformIds, NULL);             // 2. 创建所有OpenCL平台    
	if (errNum != CL_SUCCESS) {
		perror("Failed to find any OpenCL platforms.");
		exit(1);
	}

	//------------------ 打印平台信息(Start) ------------------/  
	// Extension data                 
	size_t ext_size = 0;

	//输出生产商的名字  
	errNum = clGetPlatformInfo(platformIds[0],
		CL_PLATFORM_NAME,
		0, NULL, &ext_size);
	if (errNum < 0) {
		perror("Couldn't read CL_PLATFORM_NAME.");
		exit(1);
	}
	char *name = (char*)malloc(ext_size);
	clGetPlatformInfo(platformIds[0], CL_PLATFORM_NAME,
		ext_size, name, NULL);
	printf("Platform Name: %s\n", name);

	//供应商信息  
	errNum = clGetPlatformInfo(platformIds[0],
		CL_PLATFORM_VENDOR,
		0, NULL, &ext_size);
	if (errNum < 0) {
		perror("Couldn't read CL_PLATFORM_VENDOR.");
		exit(1);
	}
	char *vendor = (char*)malloc(ext_size);
	clGetPlatformInfo(platformIds[0], CL_PLATFORM_VENDOR,
		ext_size, vendor, NULL);
	printf("Platform Vendor: %s\n", vendor);

	//最高支持的OpenCL版本  
	errNum = clGetPlatformInfo(platformIds[0],
		CL_PLATFORM_VERSION,
		0, NULL, &ext_size);
	if (errNum < 0) {
		perror("Couldn't read CL_PLATFORM_VERSION.");
		exit(1);
	}
	char *version = (char*)malloc(ext_size);
	clGetPlatformInfo(platformIds[0], CL_PLATFORM_VERSION,
		ext_size, version, NULL);
	printf("Platform Version: %s\n", version);

	//只有两个值：full profile 和 embeded profile  
	errNum = clGetPlatformInfo(platformIds[0],
		CL_PLATFORM_PROFILE,
		0, NULL, &ext_size);
	if (errNum < 0) {
		perror("Couldn't read CL_PLATFORM_PROFILE.");
		exit(1);
	}
	char *profile = (char*)malloc(ext_size);
	clGetPlatformInfo(platformIds[0], CL_PLATFORM_PROFILE,
		ext_size, profile, NULL);
	printf("Platform Full Profile or Embeded Profile?: %s\n", profile);
	//------------------ 打印平台信息(End) ------------------/  

	// 2. Create an OpenCL context on the platform.  
	cl_context_properties contextProperties[] = {
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)platformIds[0],                              // 3. 选择第一个OpenCL平台    
		0
	};
	context = clCreateContextFromType(contextProperties,                    // 4. 尝试为一个GPU设备创建一个上下文   
		CL_DEVICE_TYPE_GPU,
		NULL, NULL, &errNum);
	if (errNum != CL_SUCCESS) {
		perror("Could not create GPU context, trying CPU...");

		context = clCreateContextFromType(contextProperties,                // 5. 尝试为一个CPU设备创建一个上下文    
			CL_DEVICE_TYPE_GPU, NULL, NULL, &errNum);
		if (errNum != CL_SUCCESS) {
			perror("Failed to create an OpenCL GPU or CPU context.");
			exit(1);
		}
	}


	/******** 第二部分 选择设备，创建命令队列 ********/
	cl_device_id *devices;
	cl_device_id device = 0;
	cl_command_queue commandQueue = NULL;
	size_t deviceBufferSize = -1;

	// 3. Get the size of the device buffer.  
	errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL,         // 1. 查询存储上下文所有可用设备ID所需要的缓冲区大小  
		&deviceBufferSize);
	if (errNum != CL_SUCCESS) {
		perror("Failed to get context infomation.");
		exit(1);
	}
	if (deviceBufferSize <= 0) {
		perror("No devices available.");
		exit(1);
	}

	devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
	errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES,                  // 2. 获取上下文中所有可用设备  
		deviceBufferSize, devices, NULL);
	if (errNum != CL_SUCCESS) {
		perror("Failed to get device ID.");
		exit(1);
	}

	// 4. Choose the first device  
	commandQueue = clCreateCommandQueue(context,                            // 3. 选择第一个设备，创建一个命令队列  
		devices[0], 0, NULL);
	if (commandQueue == NULL) {
		perror("Failed to create commandQueue for device 0.");
		exit(1);
	}

	device = devices[0];
	delete[] devices;


	/******** 第三部分 读取OpenCL C语言，创建和构建程序对象 ********/
	cl_program program;
	size_t szKernelLength;          // Byte size of kernel code  
	char* cSourceCL = NULL;         // Buffer to hold source for compilation  

									// 5. Read the OpenCL kernel in from source file  
	cSourceCL = oclLoadProgSource(                                      // 1. 从绝对路径读取HelloWorld.cl的源代码  
		"HelloWorld_Kernel.cl", "",
		&szKernelLength);
	if (cSourceCL == NULL) {
		perror("Error in oclLoadProgSource\n");
		exit(1);
	}

	// 6. Create the program  
	program = clCreateProgramWithSource(context, 1,                     // 2. 使用源代码创建程序对象  
		(const char **)&cSourceCL,
		&szKernelLength, &errNum);
	if (errNum != CL_SUCCESS) {
		perror("Error in clCreateProgramWithSource\n");
		exit(1);
	}

	// 7. Build the program with 'mad' Optimization option  
	char* flags = "-cl-fast-relaxed-math";
	errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);        // 3. 编译内核源代码  
	if (errNum != CL_SUCCESS) {
		perror("Error in clBuildProgram.\n");
		exit(1);
	}


	/******** 第四部分 创建内核和内存对象 ********/
#define ARRAY_SIZE 1280*1024

	cl_kernel kernel = 0;
	cl_mem memObjects[3] = { 0, 0, 0 };

	float *a = new float[ARRAY_SIZE];
	float *b = new float[ARRAY_SIZE];
	float *result = new float[ARRAY_SIZE];

	// 8. Create the kernel  
	kernel = clCreateKernel(program, "helloworld", NULL);          // 1. 创建内核对象  
	if (kernel == NULL) {
		perror("Error in clCreateKernel.\n");
		exit(1);
	}

	// 9. Create memory objects  
	for (int i = 0; i < ARRAY_SIZE; i++) {
		a[i] = (float)i;
		b[i] = (float)i;
	}

	time_t start, end;
	double cost_time;
	
start = clock();
	memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY |      // 2. 创建内存对象  
		CL_MEM_COPY_HOST_PTR,
		sizeof(float) * ARRAY_SIZE,
		a, NULL);
	memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY |
		CL_MEM_COPY_HOST_PTR,
		sizeof(float) * ARRAY_SIZE,
		b, NULL);
	memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE |
		CL_MEM_COPY_HOST_PTR,
		sizeof(float) * ARRAY_SIZE,
		result, NULL);
	if (memObjects[0] == NULL || memObjects[1] == NULL ||
		memObjects[2] == NULL) {
		perror("Error in clCreateBuffer.\n");
		exit(1);
	}


	/******** 第五部分 执行内核 ********/
	size_t globalWorkSize[1] = { ARRAY_SIZE };
	size_t localWorkSize[1] = { 1 };

	// 10. Set the kernel arguments  
	errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);     // 1. 设置内核参数  
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
	errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
	if (errNum != CL_SUCCESS) {
		perror("Error in clSetKernelArg.\n");
		exit(1);
	}

	// 11. Queue the kernel up for execution across the array  
	errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,          // 2. 执行内核排队  
		globalWorkSize, localWorkSize,
		0, NULL, NULL);
	if (errNum != CL_SUCCESS) {
		perror("Error in clEnqueueNDRangeKernel.\n");
		exit(1);
	}

	// 12. Read the output buffer back to the Host  
	errNum = clEnqueueReadBuffer(commandQueue, memObjects[2],               // 3. 读取运算结果到主机  
		CL_TRUE, 0,
		ARRAY_SIZE * sizeof(float), result,
		0, NULL, NULL);
	if (errNum != CL_SUCCESS) {
		perror("Error in clEnqueueReadBuffer.\n");
		exit(1);
	}
	end = clock();
	cost_time = end - start;
	

	/******** 第六部分 测试结果 ********/
	printf("\nTest: a * b = c\n\n");

	//printf("Input numbers:\n");
	//for (int i = 0; i < ARRAY_SIZE; i++)
	//	printf("a[%d] = %f, b[%d] = %f\n", i, a[i], i, b[i]);

	//printf("\nOutput numbers:\n");
	//for (int i = 0; i < ARRAY_SIZE; i++)
	//	printf("a[%d] * b[%d] = %f\n", i, i, result[i]);

	printf("cost time is %.2f \n", cost_time);
	while (1);

	return 0;
}


//////////////////////////////////////////////////////////////////////////////  
//! Loads a Program file and prepends the cPreamble to the code.  
//!  
//! @return the source string if succeeded, 0 otherwise  
//! @param cFilename        program filename  
//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header  
//! @param szFinalLength    returned length of the code string  
//////////////////////////////////////////////////////////////////////////////  
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
	// locals   
	FILE* pFileStream = NULL;
	size_t szSourceLength;

	// open the OpenCL source code file  
#ifdef _WIN32   // Windows version  
	if (fopen_s(&pFileStream, cFilename, "rb") != 0)
	{
		return NULL;
	}
#else           // Linux version  
	pFileStream = fopen(cFilename, "rb");
	if (pFileStream == 0)
	{
		return NULL;
	}
#endif  

	size_t szPreambleLength = strlen(cPreamble);

	// get the length of the source code  
	fseek(pFileStream, 0, SEEK_END);
	szSourceLength = ftell(pFileStream);
	fseek(pFileStream, 0, SEEK_SET);

	// allocate a buffer for the source code string and read it in  
	char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
	memcpy(cSourceString, cPreamble, szPreambleLength);
	if (fread((cSourceString)+szPreambleLength, szSourceLength, 1, pFileStream) != 1)
	{
		fclose(pFileStream);
		free(cSourceString);
		return 0;
	}

	// close the file and return the total length of the combined (preamble + source) string  
	fclose(pFileStream);
	if (szFinalLength != 0)
	{
		*szFinalLength = szSourceLength + szPreambleLength;
	}
	cSourceString[szSourceLength + szPreambleLength] = '\0';

	return cSourceString;
}

然后在新建一个HelloWorld_Kernel.cl文件，放到当前目录下，内容为异构计算的kernel：

 // OpenCL Kernel Function  
__kernel void helloworld(__global const float* a,   
                          __global const float* b,   
                          __global float* result)  
{  
    // get index into global data array  
    int iGID = get_global_id(0);  
      
    // elements operation  
    result[iGID] = a[iGID] * b[iGID];

至此结束！

OpenCL环境配置及测试

猜你喜欢