▶ 按书上写的设备队列的代码,需要 OpenCL2.0 的平台和设备,先把代码堆上来
● 程序主要功能:用主机上的数组 Ahost 和 Bhost 创建设备缓冲区 Adevice 和 Bdevice,调用核函数 foo 及其子核函数 fooChild 计算 factor * Adevice .* Bdevice,结果写入 Cdevice,最后拷贝回主机数组 Chost 检查结果。
● 代码
// deviceQueue.cl
// Uses device-side enqueue (queue_t, ndrange_t, blocks), which are OpenCL C 2.0
// features: the program has to be built with the "-cl-std=CL2.0" option.

// Child kernel: C[i] = factor * A[i] * B[i] for every work-item i < nElement.
__kernel void fooChild(const int nElement, const float factor,
                       __global const float *A, __global const float *B, __global float *C)
{
    const uint gid = get_global_id(0);
    if (gid < (uint)nElement)
        C[gid] = factor * A[gid] * B[gid];
}

// Parent kernel: splits the nElement-long vectors into one contiguous chunk per
// parent work-item and enqueues fooChild on the device default queue for each chunk.
//   nElement - total number of elements in A, B and C
//   factor   - scalar multiplier
//   A, B     - input vectors
//   C        - output vector, C = factor * A .* B
__kernel void foo(const int nElement, const float factor,
                  __global const float *A, __global const float *B, __global float *C)
{
    if (nElement <= 0)
        return;

    const uint gid = get_global_id(0), gsize = get_global_size(0);
    const uint total = (uint)nElement;

    // Ceiling division: when nElement is not a multiple of the global size the
    // remainder is spread over the chunks instead of being silently dropped.
    const uint chunk = (total + gsize - 1) / gsize;
    const uint childOffset = gid * chunk;
    if (childOffset >= total)
        return; // nothing left for this work-item
    const uint childGsize = min(chunk, total - childOffset);

    __global const float *Achild = A + childOffset;
    __global const float *Bchild = B + childOffset;
    __global float *Cchild = C + childOffset; // written by the child, so it must not be const

    queue_t defQ = get_default_queue();
    ndrange_t ndrange = ndrange_1D(childGsize);
    void (^fooChildWrapper)(void) = ^{ fooChild((int)childGsize, factor, Achild, Bchild, Cchild); };
    // NOTE(review): the return code of enqueue_kernel is not inspected here, so a
    // full device queue would go unnoticed.
    enqueue_kernel(defQ, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, fooChildWrapper);
}
1 //main.c 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <cl.h> 5 6 const char *sourceCode = "D:/Code/deviceQueue.cl"; 7 8 char* readSource(const char* kernelPath)// 读取文本文件,存储为 char * 9 { 10 FILE *fp; 11 char *source; 12 long int size; 13 //printf("readSource, Program file: %s\n", kernelPath); 14 fopen_s(&fp, kernelPath, "rb"); 15 if (!fp) 16 { 17 printf("Open kernel file failed\n"); 18 exit(-1); 19 } 20 if (fseek(fp, 0, SEEK_END) != 0) 21 { 22 printf("Seek end of file faildd\n"); 23 exit(-1); 24 } 25 if ((size = ftell(fp)) < 0) 26 { 27 printf("Get file position failed\n"); 28 exit(-1); 29 } 30 rewind(fp); 31 if ((source = (char *)malloc(size + 1)) == NULL) 32 { 33 printf("Allocate space failed\n"); 34 exit(-1); 35 } 36 fread(source, 1, size, fp); 37 fclose(fp); 38 source[size] = '\0'; 39 return source; 40 } 41 42 int main() 43 { 44 const int nElement = 8196, nChildElement = 128, dataSize = nElement * sizeof(float); 45 float factor = 2.3f; 46 char info[1024] = { 0 }; 47 int i; 48 49 // 初始化平台 50 cl_int status; 51 cl_platform_id platform; 52 status = clGetPlatformIDs(1, &platform, NULL); 53 cl_device_id device; 54 status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); 55 cl_context_properties contextProp[] = { CL_CONTEXT_PLATFORM,(cl_context_properties)(platform), 0 }; 56 cl_context context = clCreateContext(contextProp, 1, &device, NULL, contextProp, &status); 57 cl_queue_properties queueProp[3] = { CL_QUEUE_PROPERTIES,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT, 0 }; 58 cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, queueProp, &status); 59 cl_event eventProducer, eventConsumer; 60 61 const char* source = readSource(sourceCode); 62 cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &status); 63 status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); 64 if (status) 65 { 66 clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
1024, info, NULL); 67 printf("Build log:\n%s\n", info); 68 } 69 70 cl_kernel foo = clCreateKernel(program, "foo", &status); 71 size_t globalSize = nElement / nChildElement, localSize = 1;// 每个父工作项调度 nChildElement 个子工作项 72 73 float *Ahost = (float *)malloc(dataSize); 74 float *Bhost = (float *)malloc(dataSize); 75 float *Chost = (float *)malloc(dataSize); 76 for (i = 0; i < nElement; Ahost[i] = i, Bhost[i] = i + 1, Chost[i] = 0.f, i++); 77 78 cl_mem Adevice, Bdevice, Cdevice; 79 Adevice = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, Ahost, &status); 80 Bdevice = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, Bhost, &status); 81 Cdevice = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &status); 82 83 clSetKernelArg(foo, 0, sizeof(int), (void*)&nElement); 84 clSetKernelArg(foo, 1, sizeof(float), (void*)&factor); 85 clSetKernelArg(foo, 2, sizeof(cl_mem), Adevice); 86 clSetKernelArg(foo, 3, sizeof(cl_mem), Bdevice); 87 clSetKernelArg(foo, 4, sizeof(cl_mem), Cdevice); 88 89 clEnqueueNDRangeKernel(queue, foo, 1, NULL, &globalSize, &localSize, 0, NULL, &eventProducer); 90 clFinish(queue); 91 92 clEnqueueReadBuffer(queue, Cdevice, CL_TRUE, dataSize, dataSize, Chost, 1, &eventConsumer, NULL); 93 clFinish(queue); 94 95 for (i = 0; i < nElement; i++) 96 { 97 if (Chost[i] != factor*i*(i + 1)) 98 break; 99 } 100 printf("Output is %s.\n", (i == nElement) ? "correct" : "incorrect"); 101 102 free(Ahost); 103 free(Bhost); 104 free(Chost); 105 clReleaseContext(context); 106 clReleaseCommandQueue(queue); 107 clReleaseProgram(program); 108 clReleaseKernel(foo); 109 clReleaseMemObject(Adevice); 110 clReleaseMemObject(Bdevice); 111 clReleaseMemObject(Cdevice); 112 getchar(); 113 return 0; 114 }
● 输出结果
■ 一直卡在函数 clCreateCommandQueueWithProperties 的调用上,返回值 -6(CL_OUT_OF_HOST_MEMORY),原因不明,Stack Overflow 上有人说换了显卡驱动就好了(https://stackoverflow.com/questions/39864947/opencl-cl-out-of-host-memory-on-clcreatecommandqueuewithproperties-with-minima),还有人说是设备位数的问题(https://stackoverflow.com/questions/45231329/opencl-clcreatecommandqueue-cl-out-of-host-memory-error),但是我更新了显卡驱动,工程改成 32 位(才发现显卡是 32 位的)还是不行。
■ 强行忽略上面的问题(clCreateCommandQueueWithProperties 第四参数用 NULL),程序仍然编译失败,返回 -11(CL_BUILD_PROGRAM_FAILURE),原因是不能支持核函数中的 queue_t 和 ndrange_t 数据类型,后面的块语法就更别想了,应该是平台和设备不能完全支持 OpenCL2.0 所致(另一个可能的原因是调用 clBuildProgram 时没有传入编译选项 -cl-std=CL2.0,此时核函数默认按 OpenCL C 1.x 编译,同样无法识别这些类型)。