opencl(十八)----矩阵转置、矩阵乘法

   矩阵转置

//  kernel
__kernel void transpose(__global float4 *g_mat, 
   __local float4 *l_mat, uint size) {

   __global float4 *src, *dst;

   /* Determine row and column location */
   int col = get_global_id(0);
   int row = 0;
   while(col >= size) {
      col -= size--;
      row++;
   }
   col += row;
   size += row;

   /* Read source block into local memory */
   src = g_mat + row * size * 4 + col;
   l_mat += get_local_id(0)*8;
   l_mat[0] = src[0];
   l_mat[1] = src[size];
   l_mat[2] = src[2*size];
   l_mat[3] = src[3*size];

   /* Process block on diagonal */
   if(row == col) {
      src[0] = 
         (float4)(l_mat[0].x, l_mat[1].x, l_mat[2].x, l_mat[3].x);
      src[size] = 
         (float4)(l_mat[0].y, l_mat[1].y, l_mat[2].y, l_mat[3].y);
      src[2*size] = 
         (float4)(l_mat[0].z, l_mat[1].z, l_mat[2].z, l_mat[3].z);
      src[3*size] = 
         (float4)(l_mat[0].w, l_mat[1].w, l_mat[2].w, l_mat[3].w);
   }
   /* Process block off diagonal */
   else {
      /* Read destination block into local memory */
      dst = g_mat + col * size * 4 + row;
      l_mat[4] = dst[0];
      l_mat[5] = dst[size];
      l_mat[6] = dst[2*size];
      l_mat[7] = dst[3*size];

      /* Set elements of destination block */
      dst[0] = 
         (float4)(l_mat[0].x, l_mat[1].x, l_mat[2].x, l_mat[3].x);
      dst[size] = 
         (float4)(l_mat[0].y, l_mat[1].y, l_mat[2].y, l_mat[3].y);
      dst[2*size] = 
         (float4)(l_mat[0].z, l_mat[1].z, l_mat[2].z, l_mat[3].z);
      dst[3*size] = 
         (float4)(l_mat[0].w, l_mat[1].w, l_mat[2].w, l_mat[3].w);

      /* Set elements of source block */
      src[0] = 
         (float4)(l_mat[4].x, l_mat[5].x, l_mat[6].x, l_mat[7].x);
      src[size] = 
         (float4)(l_mat[4].y, l_mat[5].y, l_mat[6].y, l_mat[7].y);
      src[2*size] = 
         (float4)(l_mat[4].z, l_mat[5].z, l_mat[6].z, l_mat[7].z);
      src[3*size] = 
         (float4)(l_mat[4].w, l_mat[5].w, l_mat[6].w, l_mat[7].w);
   }
}

// 主机程序

#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "transpose.cl"
#define KERNEL_FUNC "transpose"

#define MATRIX_DIM 64

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {

   cl_platform_id platform;
   cl_device_id dev;
   int err;

   /* Identify a platform */
   err = clGetPlatformIDs(1, &platform, NULL);
   if(err < 0) {
      perror("Couldn't identify a platform");
      exit(1);
   } 

   /* Access a device */
   err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
   if(err == CL_DEVICE_NOT_FOUND) {
      err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
   }
   if(err < 0) {
      perror("Couldn't access any devices");
      exit(1);   
   }

   return dev;
}

/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

   cl_program program;
   FILE *program_handle;
   char *program_buffer, *program_log;
   size_t program_size, log_size;
   int err;

   /* Read program file and place content into buffer */
   program_handle = fopen(filename, "r");
   if(program_handle == NULL) {
      perror("Couldn't find the program file");
      exit(1);
   }
   fseek(program_handle, 0, SEEK_END);
   program_size = ftell(program_handle);
   rewind(program_handle);
   program_buffer = (char*)malloc(program_size + 1);
   program_buffer[program_size] = '\0';
   fread(program_buffer, sizeof(char), program_size, program_handle);
   fclose(program_handle);

   /* Create program from file */
   program = clCreateProgramWithSource(ctx, 1, 
      (const char**)&program_buffer, &program_size, &err);
   if(err < 0) {
      perror("Couldn't create the program");
      exit(1);
   }
   free(program_buffer);

   /* Build program */
   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
   if(err < 0) {

      /* Find size of log and print to std output */
      clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
            0, NULL, &log_size);
      program_log = (char*) malloc(log_size + 1);
      program_log[log_size] = '\0';
      clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
            log_size + 1, program_log, NULL);
      printf("%s\n", program_log);
      free(program_log);
      exit(1);
   }
   return program;
}

int main() {

   /* Host/device data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel kernel;
   size_t global_size;
   cl_ulong mem_size;
   int i, j, err, check;
 
   /* Data and buffers */
   cl_uint matrix_dim;
   float data[MATRIX_DIM][MATRIX_DIM];
   cl_mem data_buffer;

   /* Initialize data */
   for(i=0; i<MATRIX_DIM; i++) {
      for(j=0; j<MATRIX_DIM; j++) {
         data[i][j] = 1.0f * i * MATRIX_DIM + j;
      }
   }

   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Build the program */
   program = build_program(context, device, PROGRAM_FILE);

   /* Create a kernel */
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);
   };

   /* Create buffer to hold matrix */
   data_buffer = clCreateBuffer(context, 
         CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
         sizeof(data), data, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };

   /* Determine execution parameters */
   global_size = (MATRIX_DIM/4 * (MATRIX_DIM/4 + 1))/2;
   clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,     
         sizeof(mem_size), &mem_size, NULL);

   /* Create kernel arguments */
   matrix_dim = MATRIX_DIM/4;
   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(kernel, 1, (size_t)mem_size, NULL);
   err |= clSetKernelArg(kernel, 2, sizeof(matrix_dim), &matrix_dim);
   if(err < 0) {
      printf("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue kernel */
   err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, 
         NULL, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      printf("Error: %d\n", err);
      exit(1);   
   } 

   /* Read the result */
   err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 
      sizeof(data), data, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   } 

   /* Check data */
   check = 1;
   for(i=0; i<MATRIX_DIM; i++) {
      for(j=0; j<MATRIX_DIM; j++) {
         if(data[i][j] != 1.0*j*MATRIX_DIM+i) {
            check = 0;
            break;
         }
      }
   }
   if(check)
      printf("Transpose check succeeded.\n");
   else
      printf("Transpose check failed.\n");

   /* Deallocate resources */
   clReleaseMemObject(data_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}

猜你喜欢

转载自www.cnblogs.com/feihu-h/p/12107384.html