OpenCL简单入门

一、前言

最近在做三维点云处理方面的项目，对于三维数据方面的处理来说是非常耗时的，为了加快项目算法的处理速度，于是充分发挥计算机的GPU处理性能，在对项目算法中的不同模块采用了Opencl和CUDA加速技术。这篇只在这里记录Opencl部分，后续会更新CUDA部分。一如既往，从简单的入门开始，下面将分小节开始。

二、OpenCL的相关概念

OpenCL是一个为异构平台编写程序的框架，此异构平台可由CPU、GPU或其他类型的处理器组成。OpenCL由一门用于编写kernels （在OpenCL设备上运行的核函数）的语言（基于C99）和一组用于定义并控制平台的API组成。OpenCL提供了两种层面的并行机制：数据并行和任务并行。一个完整的OpenCL加速技术过程涉及到平台（Platform）、设备(Device)、上下文(Context)、OpenCL程序（Program）、指令队列（Command）、核函数（Kernel）、内存对象（Memory Object）、调用设备接口（NDRange），下面将分别进行做简单的介绍，后面也会给出参考资料的相关链接。

平台（Platform）：主机加上OpenCL框架管理下的若干设备构成了这个平台，通过这个平台，应用程序可以与设备共享资源并在设备上执行kernel。
设备(Device)：官方的解释是计算单元（Compute Units）的集合。举例来说，GPU是典型的device。Intel和AMD的多核CPU也提供OpenCL接口，所以也可以作为Device。
上下文(Context)：OpenCL的Platform上共享和使用资源的环境，包括kernel、device、memory objects、command queue等。使用中一般一个Platform对应一个Context。
OpenCL程序（Program）：OpenCL程序，由kernel函数、其他函数和声明等组成。
指令队列（Command Queue）：在指定设备上管理多个指令（Command）。队列里指令执行可以顺序也可以乱序。一个设备可以对应多个指令队列。
核函数（Kernel）：可以从主机端调用，运行在设备端的函数，我们算法需要加速处理的那部分。
内存对象（Memory Object）：在主机和设备之间传递数据的对象，一般映射到OpenCL程序中的global memory。有两种具体的类型：Buffer Object（缓存对象）和Image Object（图像对象）。
调用设备接口（NDRange）：主机端运行设备端kernel函数的主要接口。实际上还有其他的，NDRange是非常常见的，用于分组运算，以后具体用到的时候就知道区别了。

三、OpenCL编程的步骤

Opencl编程的步骤比较繁琐，但是都比较固定，下面集合代码进行介绍，利用理解操作。

1.平台查找和初始化

调用两次clGetPlatformIDs函数，第一次获取可用的平台数量，第二次获取一个可用的平台。代码参考如下：

int getPlatform(cl_platform_id &platform)
{
	platform = NULL;//the chosen platform

	cl_uint numPlatforms;//the NO. of platforms
	cl_int    status = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (status != CL_SUCCESS)
	{
		cout << "Error: Getting platforms!" << endl;
		return -1;
	}

	/**For clarity, choose the first available platform. */
	if (numPlatforms > 0)
	{
		cl_platform_id* platforms =
			(cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
		status = clGetPlatformIDs(numPlatforms, platforms, NULL);
		platform = platforms[0];
		free(platforms);
	}
	else
		return -1;
}

2.设备查找和初始化

调用两次clGetDeviceIDs函数，第一次获取可用的设备数量，第二次获取一个可用的设备。代码参考如下：

cl_device_id *getCl_device_id(cl_platform_id &platform)
{
	cl_uint numDevices = 0;
	cl_device_id *devices = NULL;
	cl_int    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
	if (numDevices > 0) //GPU available.
	{
		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
	}
	return devices;
}

3.创建上下文

调用clCreateContext函数，上下文context可能会管理多个设备device。代码参考如下：

   oclContext=clCreateContext(NULL,1,&oclComputeDeviceID,NULL,NULL,&ret_ocl);

4.创建命令队列

调用clCreateCommandQueue函数，一个设备device对应一个command queue。上下文conetxt将命令发送到设备对应的command queue，设备就可以执行命令队列里的命令。代码参考如下：

 oclCommandQueue=clCreateCommandQueue(oclContext,oclComputeDeviceID,0,&ret_ocl);

5.创建内存对象

调用clCreateBuffer函数，Buffer中保存的是数据对象，就是设备执行程序需要的数据保存在其中。Buffer由上下文conetxt创建，这样上下文管理的多个设备就会共享Buffer中的数据。代码参考如下：

  deviceInput1=clCreateBuffer(oclContext,CL_MEM_READ_ONLY,size,NULL,&ret_ocl);

6.创建程序对象

创建程序对象，程序对象就代表你的程序源文件或者二进制代码数据。这里将要调用Opencl内置函数和一个自己编写的工具函数，工具函数用于读取Kernel核函数。文件后缀为.cl。代码参考如下：

    ret_ocl=clBuildProgram(oclProgram,0,NULL,NULL,NULL,NULL);

char * ReadKernelSourceFile(const char* filename, size_t* length)
{
	FILE *file = NULL;
	size_t sourcesLength;
	char* sourcesString;
	int ret;
	file = fopen(filename, "rb");
	if (file==NULL)
	{
		//printf("%s at %d:Can't open %s\n", _FILE, _LINE_ - 2, filename);
		return NULL;
	}
	fseek(file, 0, SEEK_END);
	sourcesLength = ftell(file);//the file length;
	fseek(file, 0, SEEK_SET);
	sourcesString = (char*)malloc(sourcesLength + 1);
	
	//sourcesString[0] = '\0';

	ret = fread(sourcesString, sourcesLength, 1, file);

	//read file fail
	if (ret==0)
	{
		//printf("%s at %d:Can't open %s\n", _FILE, _LINE_ - 2, filename);
		return NULL;
	}
	fclose(file);
	if (length!=0)
	{
		*length = sourcesLength;
	}

	sourcesString[sourcesLength] = '\0';//最后一位加0表示结束
	return sourcesString;
}

7.创建核函数Kernel

调用clCreateKernel函数，根据你的程序对象，生成kernel对象，表示设备程序的入口。参考代码如下：

 // create OpenCL kernel by passing kernel function name that we used in .cl file
    oclKernel=clCreateKernel(oclProgram,"vecAdd",&ret_ocl);

8.设置Kernel参数

调用clSetKernelArg函数，参考代码如下：

 ret_ocl=clSetKernelArg(oclKernel,0,sizeof(cl_mem),(void *)&deviceInput1); // 'deviceInput1' maps to 'in1' param of kernel function in .cl file

9.设置工作项大小并执行核函数

设置工作项的东西（worksize），可以简单的理解为线程的多少。核函数的执行，调用调用clEnqueueNDRangeKernel函数，参考代码如下：

   ret_ocl=clEnqueueNDRangeKernel(oclCommandQueue,oclKernel,1,NULL,&globalWorkSize,&localWorkSize,0,NULL,NULL);

10.读取结果到主机端

在设备端运行完结果后，需要将结果拷贝到主机端，参与下一步的相应计算，调用调用clEnqueueReadBuffer函数。参考代码如下：

 ret_ocl=clEnqueueReadBuffer(oclCommandQueue,deviceOutput,CL_TRUE,0,size,hostOutput,0,NULL,NULL);

11.资源释放。

当完成所需加速部分得到结果后，需要将设备端的资源进行释放，才能完成完成整个运行过程。资源释opencl有内置的函数，参考代码如下：

void cleanup(void)
{
    // code
    
    // OpenCL cleanup
    if(oclSourceCode)
    {
        free((void *)oclSourceCode);
        oclSourceCode=NULL;
    }
    
    if(oclKernel)
    {
        clReleaseKernel(oclKernel);
        oclKernel=NULL;
    }
    
    if(oclProgram)
    {
        clReleaseProgram(oclProgram);
        oclProgram=NULL;
    }
    
    if(oclCommandQueue)
    {
        clReleaseCommandQueue(oclCommandQueue);
        oclCommandQueue=NULL;
    }
    
    if(oclContext)
    {
        clReleaseContext(oclContext);
        oclContext=NULL;
    }

    // free allocated device-memory
    if(deviceInput1)
    {
        clReleaseMemObject(deviceInput1);
        deviceInput1=NULL;
    }
    
    if(deviceInput2)
    {
        clReleaseMemObject(deviceInput2);
        deviceInput2=NULL;
    }
    
    if(deviceOutput)
    {
        clReleaseMemObject(deviceOutput);
        deviceOutput=NULL;
    }
    
    // free allocated host-memory
    if(hostInput1)
    {
        free(hostInput1);
        hostInput1=NULL;
    }
    
    if(hostInput2)
    {
        free(hostInput2);
        hostInput2=NULL;
    }
    
    if(hostOutput)
    {
        free(hostOutput);
        hostOutput=NULL;
    }

    if(gold)
    {
        free(gold);
        gold=NULL;
    }
}

至此完成opencl整个过程，下面将以向量的相加给出完整的例子，仅供参考。

四、程序实例向量相加

例子包含一个工具文件、函数核函数文件，主程序，工具文件tool.h和tool.cpp，核函数VceAdd.cl ，主程序代码参考如下，另外两个下载链接例子工程文件。

// headers
#include <stdio.h>
#include <stdlib.h> // exit()
#include <string.h> // strlen()
#include <math.h> // fabs()
#include <iostream>


#include "CL/opencl.h"

#include "helper_timer.h"

// global OpenCL variables
cl_int ret_ocl;
cl_platform_id oclPlatformID;
cl_device_id oclComputeDeviceID; // compute device id
cl_context oclContext; // compute context
cl_command_queue oclCommandQueue; // compute command queue
cl_program oclProgram; // compute program
cl_kernel oclKernel; // compute kernel

char *oclSourceCode=NULL;
size_t sizeKernelCodeLength;

// odd number 11444777 is deliberate illustration ( Nvidia OpenCL Samples )
int iNumberOfArrayElements = 11444777;
size_t localWorkSize=256;
size_t globalWorkSize;

float *hostInput1=NULL;
float *hostInput2=NULL;
float *hostOutput=NULL;
float *gold=NULL;

cl_mem deviceInput1=NULL;
cl_mem deviceInput2=NULL;
cl_mem deviceOutput=NULL;

float timeOnCPU;
float timeOnGPU;

int main(void)
{
    // function declarations
    void fillFloatArrayWithRandomNumbers(float *, int);
    size_t roundGlobalSizeToNearestMultipleOfLocalSize(int, unsigned int);
    void vecAddHost(const float *, const float *, float *, int);
    char* loadOclProgramSource(const char *,const char *,size_t *);
    void cleanup(void);
	void FileInit(float *, int );
    
    // code
    // allocate host-memory
    hostInput1=(float *)malloc(sizeof(float) * iNumberOfArrayElements);
    if(hostInput1== NULL)
    {
        printf("CPU Memory Fatal Error = Can Not Allocate Enough Memory For Host Input Array 1.\nExitting ...\n");
        cleanup();
        exit(EXIT_FAILURE);
    }

    hostInput2=(float *)malloc(sizeof(float) * iNumberOfArrayElements);
    if(hostInput2== NULL)
    {
        printf("CPU Memory Fatal Error = Can Not Allocate Enough Memory For Host Input Array 2.\nExitting ...\n");
        cleanup();
        exit(EXIT_FAILURE);
    }

    // allocate host-memory to hold 'float' type host vector hostOutput
    hostOutput=(float *)malloc(sizeof(float) * iNumberOfArrayElements);
    if(hostOutput== NULL)
    {
        printf("CPU Memory Fatal Error = Can Not Allocate Enough Memory For Host Output Array.\nExitting ...\n");
        cleanup();
        exit(EXIT_FAILURE);
    }

    gold=(float *)malloc(sizeof(float) * iNumberOfArrayElements);
    if(gold== NULL)
    {
        printf("CPU Memory Fatal Error = Can Not Allocate Enough Memory For Gold Output Array.\nExitting ...\n");
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // fill above input host vectors with arbitary but hard-coded data
   // fillFloatArrayWithRandomNumbers(hostInput1,iNumberOfArrayElements);
   // fillFloatArrayWithRandomNumbers(hostInput2,iNumberOfArrayElements);
    
	FileInit(hostInput1, iNumberOfArrayElements);
	FileInit(hostInput2, iNumberOfArrayElements);

    // get OpenCL supporting platform's ID
    ret_ocl=clGetPlatformIDs(1,&oclPlatformID,NULL);
    if(ret_ocl != CL_SUCCESS)
    {
        printf("OpenCL Error - clGetDeviceIDs() Failed : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // get OpenCL supporting GPU device's ID
    ret_ocl=clGetDeviceIDs(oclPlatformID,CL_DEVICE_TYPE_GPU,1,&oclComputeDeviceID,NULL);
    if(ret_ocl != CL_SUCCESS)
    {
        printf("OpenCL Error - clGetDeviceIDs() Failed : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    char gpu_name[255];
    clGetDeviceInfo(oclComputeDeviceID,CL_DEVICE_NAME,sizeof(gpu_name),&gpu_name,NULL);
    printf("%s\n",gpu_name);
    
    // create OpenCL compute context
    oclContext=clCreateContext(NULL,1,&oclComputeDeviceID,NULL,NULL,&ret_ocl);
    if(ret_ocl!=CL_SUCCESS)
    {
        printf("OpenCL Error - clCreateContext() Failed : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // create command queue
    oclCommandQueue=clCreateCommandQueue(oclContext,oclComputeDeviceID,0,&ret_ocl);
    if(ret_ocl!=CL_SUCCESS)
    {
        printf("OpenCL Error - clCreateCommandQueue() Failed : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // create OpenCL program from .cl
    oclSourceCode=loadOclProgramSource("VecAddenw.cl","",&sizeKernelCodeLength);
    
    cl_int status=0;
    oclProgram = clCreateProgramWithSource(oclContext, 1, (const char **)&oclSourceCode, &sizeKernelCodeLength, &ret_ocl);
    if(ret_ocl!=CL_SUCCESS)
    {
        printf("OpenCL Error - clCreateProgramWithSource() Failed : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(0);
    }
    
    // build OpenCL program
    ret_ocl=clBuildProgram(oclProgram,0,NULL,NULL,NULL,NULL);
    if(ret_ocl!=CL_SUCCESS)
    {
        printf("OpenCL Error - clBuildProgram() Failed : %d. Exitting Now ...\n",ret_ocl);
        
        size_t len;
        char buffer[2048];
        clGetProgramBuildInfo(oclProgram,oclComputeDeviceID,CL_PROGRAM_BUILD_LOG,sizeof(buffer),buffer,&len);
        printf("OpenCL Program Build Log : %s\n",buffer);
        
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // create OpenCL kernel by passing kernel function name that we used in .cl file
    oclKernel=clCreateKernel(oclProgram,"vecAdd",&ret_ocl);
    if(ret_ocl!=CL_SUCCESS)
    {
        printf("OpenCL Error - clCreateKernel() Failed : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    int size=iNumberOfArrayElements * sizeof(cl_float);
    // allocate device-memory
    deviceInput1=clCreateBuffer(oclContext,CL_MEM_READ_ONLY,size,NULL,&ret_ocl);
    if(ret_ocl!=CL_SUCCESS)
    {
        printf("OpenCL Error - clCreateBuffer() Failed For 1st Input Array : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    deviceInput2=clCreateBuffer(oclContext,CL_MEM_READ_ONLY,size,NULL,&ret_ocl);
    if(ret_ocl!=CL_SUCCESS)
    {
        printf("OpenCL Error - clCreateBuffer() Failed For 2nd Input Array : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    deviceOutput=clCreateBuffer(oclContext,CL_MEM_WRITE_ONLY,size,NULL,&ret_ocl);
    if(ret_ocl!=CL_SUCCESS)
    {
        printf("OpenCL Error - clCreateBuffer() Failed For 2nd Input Array : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // set OpenCL kernel arguments. Our OpenCL kernel has 4 arguments 0,1,2,3
    // set 0 based 0th argument i.e. deviceInput1
    ret_ocl=clSetKernelArg(oclKernel,0,sizeof(cl_mem),(void *)&deviceInput1); // 'deviceInput1' maps to 'in1' param of kernel function in .cl file
    if(ret_ocl != CL_SUCCESS)
    {
        printf("OpenCL Error - clSetKernelArg() Failed For 1st Argument : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // set 0 based 1st argument i.e. deviceInput2
    ret_ocl=clSetKernelArg(oclKernel,1,sizeof(cl_mem),(void *)&deviceInput2); // 'deviceInput2' maps to 'in2' param of kernel function in .cl file
    if(ret_ocl != CL_SUCCESS)
    {
        printf("OpenCL Error - clSetKernelArg() Failed For 2nd Argument : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // set 0 based 2nd argument i.e. deviceOutput
    ret_ocl=clSetKernelArg(oclKernel,2,sizeof(cl_mem),(void *)&deviceOutput); // 'deviceOutput' maps to 'out' param of kernel function in .cl file
    if(ret_ocl != CL_SUCCESS)
    {
        printf("OpenCL Error - clSetKernelArg() Failed For 3rd Argument : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // set 0 based 3rd argument i.e. len
    ret_ocl=clSetKernelArg(oclKernel,3,sizeof(cl_int),(void *)&iNumberOfArrayElements); // 'iNumberOfArrayElements' maps to 'len' param of kernel function in .cl file
    if(ret_ocl != CL_SUCCESS)
    {
        printf("OpenCL Error - clSetKernelArg() Failed For 4th Argument : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // write abve 'input' device buffer to device memory
    ret_ocl=clEnqueueWriteBuffer(oclCommandQueue,deviceInput1,CL_FALSE,0,size,hostInput1,0,NULL,NULL);
    if(ret_ocl != CL_SUCCESS)
    {
        printf("OpenCL Error - clEnqueueWriteBuffer() Failed For 1st Input Device Buffer : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    ret_ocl=clEnqueueWriteBuffer(oclCommandQueue,deviceInput2,CL_FALSE,0,size,hostInput2,0,NULL,NULL);
    if(ret_ocl != CL_SUCCESS)
    {
        printf("OpenCL Error - clEnqueueWriteBuffer() Failed For 2nd Input Device Buffer : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
   
    // run the kernel
    globalWorkSize=roundGlobalSizeToNearestMultipleOfLocalSize(localWorkSize, iNumberOfArrayElements);
    
    // start timer
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    
    ret_ocl=clEnqueueNDRangeKernel(oclCommandQueue,oclKernel,1,NULL,&globalWorkSize,&localWorkSize,0,NULL,NULL);
    if(ret_ocl != CL_SUCCESS)
    {
        printf("OpenCL Error - clEnqueueNDRangeKernel() Failed : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }
    
    // finish OpenCL command queue
    clFinish(oclCommandQueue);
    
    // stop timer
    sdkStopTimer(&timer);
    timeOnGPU = sdkGetTimerValue(&timer);
    sdkDeleteTimer(&timer);
    
    // read back result from the device (i.e from deviceOutput) into cpu variable (i.e hostOutput)
    ret_ocl=clEnqueueReadBuffer(oclCommandQueue,deviceOutput,CL_TRUE,0,size,hostOutput,0,NULL,NULL);
    if(ret_ocl != CL_SUCCESS)
    {
        printf("OpenCL Error - clEnqueueReadBuffer() Failed : %d. Exitting Now ...\n",ret_ocl);
        cleanup();
        exit(EXIT_FAILURE);
    }

    vecAddHost(hostInput1, hostInput2, gold, iNumberOfArrayElements);
    
    // compare results for golden-host
    const float epsilon = 0.000001f;
    bool bAccuracy=true;
    int breakValue=0;
    int i;
    for(i=0;i<iNumberOfArrayElements;i++)
    {
        
		/*
		float val1 = gold[i];
        float val2 = hostOutput[i];
        if(fabs(val1-val2) > epsilon)
        {
            bAccuracy = false;
            breakValue=i;
            break;
        }
		*/

		//std::cout << "HostOutPut:" << hostOutput[i] << std::endl;
    }
    
    if(bAccuracy==false)
    {
        printf("Break Value = %d\n",breakValue);
    }
    
    char str[125];
    if(bAccuracy==true)
    sprintf(str,"%s","Comparison Of Output Arrays On CPU And GPU Are Accurate Within The Limit Of 0.000001");
    else
    sprintf(str,"%s","Not All Comparison Of Output Arrays On CPU And GPU Are Accurate Within The Limit Of 0.000001");
    
    printf("1st Array Is From 0th Element %.6f To %dth Element %.6f\n",hostInput1[0], iNumberOfArrayElements-1, hostInput1[iNumberOfArrayElements-1]);
    printf("2nd Array Is From 0th Element %.6f To %dth Element %.6f\n",hostInput2[0], iNumberOfArrayElements-1, hostInput2[iNumberOfArrayElements-1]);
    printf("Global Work Size = %u And Local Work Size Size = %u\n",(unsigned int)globalWorkSize, (unsigned int)localWorkSize);
    printf("Sum Of Each Element From Above 2 Arrays Creates 3rd Array As :\n");
    printf("3nd Array Is From 0th Element %.6f To %dth Element %.6f\n",hostOutput[0], iNumberOfArrayElements-1, hostOutput[iNumberOfArrayElements-1]);
    printf("The Time Taken To Do Above Addition On CPU = %.6f (ms)\n",timeOnCPU);
    printf("The Time Taken To Do Above Addition On GPU = %.6f (ms)\n",timeOnGPU);
    printf("%s\n",str);

    // total cleanup
    cleanup();

	system("pause");
    return(0);
}

void cleanup(void)
{
    // code
    
    // OpenCL cleanup
    if(oclSourceCode)
    {
        free((void *)oclSourceCode);
        oclSourceCode=NULL;
    }
    
    if(oclKernel)
    {
        clReleaseKernel(oclKernel);
        oclKernel=NULL;
    }
    
    if(oclProgram)
    {
        clReleaseProgram(oclProgram);
        oclProgram=NULL;
    }
    
    if(oclCommandQueue)
    {
        clReleaseCommandQueue(oclCommandQueue);
        oclCommandQueue=NULL;
    }
    
    if(oclContext)
    {
        clReleaseContext(oclContext);
        oclContext=NULL;
    }

    // free allocated device-memory
    if(deviceInput1)
    {
        clReleaseMemObject(deviceInput1);
        deviceInput1=NULL;
    }
    
    if(deviceInput2)
    {
        clReleaseMemObject(deviceInput2);
        deviceInput2=NULL;
    }
    
    if(deviceOutput)
    {
        clReleaseMemObject(deviceOutput);
        deviceOutput=NULL;
    }
    
    // free allocated host-memory
    if(hostInput1)
    {
        free(hostInput1);
        hostInput1=NULL;
    }
    
    if(hostInput2)
    {
        free(hostInput2);
        hostInput2=NULL;
    }
    
    if(hostOutput)
    {
        free(hostOutput);
        hostOutput=NULL;
    }

    if(gold)
    {
        free(gold);
        gold=NULL;
    }
}

void fillFloatArrayWithRandomNumbers(float *pFloatArray, int iSize)
{
    // code
    int i;
    const float fScale = 1.0f / (float)RAND_MAX;
    for (i = 0; i < iSize; ++i)
    {
        pFloatArray[i] = fScale * rand();
    }
}


void FileInit(float *p, int N)
{
	for (int i = 0; i < N; i++)
	{
		p[i] = i;
	}
}

size_t roundGlobalSizeToNearestMultipleOfLocalSize(int local_size, unsigned int global_size)
{
    // code
    unsigned int r = global_size % local_size;
    if(r == 0)
    {
        return(global_size);
    }
    else
    {
        return(global_size + local_size - r);
    }
}

// "Golden" Host processing vector addition function for comparison purposes
void vecAddHost(const float* pFloatData1, const float* pFloatData2, float* pFloatResult, int iNumElements)
{
    int i;
    
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    
    for (i = 0; i < iNumElements; i++)
    {
        pFloatResult[i] = pFloatData1[i] + pFloatData2[i];
    }
    
    sdkStopTimer(&timer);
    timeOnCPU = sdkGetTimerValue(&timer);
    sdkDeleteTimer(&timer);
}

char* loadOclProgramSource(const char *filename, const char *preamble, size_t *sizeFinalLength)
{
    // locals
    FILE *pFile=NULL;
    size_t sizeSourceLength;
    
    pFile=fopen(filename,"rb"); // binary read
    if(pFile==NULL)
        return(NULL);
    
    size_t sizePreambleLength=(size_t)strlen(preamble);
    
    // get the length of the source code
    fseek(pFile,0,SEEK_END);
    sizeSourceLength=ftell(pFile);
    fseek(pFile,0,SEEK_SET); // reset to beginning
    
    // allocate a buffer for the source code string and read it in
    char *sourceString=(char *)malloc(sizeSourceLength+sizePreambleLength+1);
    memcpy(sourceString, preamble, sizePreambleLength);
    if(fread((sourceString)+sizePreambleLength,sizeSourceLength,1,pFile)!=1)
    {
        fclose(pFile);
        free(sourceString);
        return(0);
    }
    
    // close the file and return the total length of the combined (preamble + source) string
    fclose(pFile);
    if(sizeFinalLength != 0)
    {
        *sizeFinalLength = sizeSourceLength + sizePreambleLength;
    }
    sourceString[sizeSourceLength + sizePreambleLength]='\0';
    
    return(sourceString);
}

由于工作任务紧张，写的有点仓促，难免有错误之处，望大家指正，相互学习。

一、前言

二、OpenCL的相关概念

三、OpenCL编程的步骤

四、程序实例向量相加

猜你喜欢