如何写出像向量点乘或者矩阵乘法这种重要的数学运算的CUDA程序。这些运算很重要,几乎在所有的应用程序中都要用到。会用到我们之前看到的所有概念,并有助于写以后的其他应用。
一、向量点乘
两个向量的点乘是重要的数学运算,也将会解释CUDA编程中的一个重要概念:归约运算。两个向量的点乘运算定义如下:
(X1,X2,X3)·(Y1,Y2,Y3)=X1Y1+X2Y2+X3Y3
真正的向量可能很长,两个向量里面可能有成百上千个元素,而不仅仅只有三个。最终也会将所有的乘法结果累加(归约运算)起来,而不仅仅是3个。现在,你看下这个运算,它和之前的元素两两相加的向量加法操作很类似。不同的是你需要将元素两两相乘。线程需要将它们的所有单个乘法结果连续累加起来,因为所有的一对对的乘法结果需要被累加起来,才能得到点乘的最终结果。最终的点乘的结果将是一个单一值。这种原始输入是两个数组而输出却缩减为一个(单一值)的运算,在CUDA里叫作归约运算。归约运算在很多应用程序里都有用。我们给出进行该种CUDA运算的内核函数如下:
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 1024
#define threadsPerBlock 512
// Per-block partial dot product of the N-element vectors d_a and d_b.
// Each block writes one partial sum to d_c[blockIdx.x]; the host adds
// the per-block results to get the final scalar.
// Launch requirement: threadsPerBlock is a power of two (the halving
// tree reduction below depends on it).
__global__ void gpu_dot(float *d_a, float *d_b, float *d_c) {
// One shared-memory slot per thread for the block-level reduction.
__shared__ float cache[threadsPerBlock];
const int lane = threadIdx.x;

// Grid-stride loop: each thread accumulates the products of every
// element it is responsible for, whatever the grid size is.
float acc = 0.0f;
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < N;
     idx += blockDim.x * gridDim.x) {
acc += d_a[idx] * d_b[idx];
}

// Publish this thread's partial sum to shared memory.
cache[lane] = acc;
// Every slot must be written before any thread starts reducing.
__syncthreads();

// Tree reduction: halve the number of active threads each pass.
for (int stride = blockDim.x / 2; stride != 0; stride /= 2) {
if (lane < stride)
cache[lane] += cache[lane + stride];
// Barrier sits outside the if, so all threads in the block reach it.
__syncthreads();
}

// Thread 0 holds the block's total; store it to global memory.
if (lane == 0)
d_c[blockIdx.x] = cache[0];
}
// Host driver: fills two N-element vectors, launches gpu_dot to get
// per-block partial sums, finishes the reduction on the CPU, and
// validates the result against a closed-form expression.
int main(void) {
// Host arrays (h_c is the final scalar result)
float *h_a, *h_b, h_c, *partial_sum;
// Device arrays
float *d_a, *d_b, *d_partial_sum;
// Cap the grid at 32 blocks; the grid-stride loop in the kernel
// covers any elements beyond blocksPerGrid * threadsPerBlock.
int block_calc = (N + threadsPerBlock - 1) / threadsPerBlock;
int blocksPerGrid = (32 < block_calc ? 32 : block_calc);
// Allocate memory on the host
h_a = (float*)malloc(N * sizeof(float));
h_b = (float*)malloc(N * sizeof(float));
partial_sum = (float*)malloc(blocksPerGrid * sizeof(float));
// Allocate memory on the device
cudaMalloc((void**)&d_a, N * sizeof(float));
cudaMalloc((void**)&d_b, N * sizeof(float));
cudaMalloc((void**)&d_partial_sum, blocksPerGrid * sizeof(float));
// h_a = 0,1,2,... and h_b = 2 everywhere, so the exact dot product
// is 2 * (0 + 1 + ... + (N-1)) = N * (N - 1).
for (int i = 0; i < N; i++) {
h_a[i] = i;
h_b[i] = 2;
}
cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);
// Launch the kernel
gpu_dot << <blocksPerGrid, threadsPerBlock >> >(d_a, d_b, d_partial_sum);
// A kernel launch returns no status; query the launch error explicitly.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
return 1;
}
// Blocking copy: also synchronizes with the kernel before reading results.
cudaMemcpy(partial_sum, d_partial_sum, blocksPerGrid * sizeof(float), cudaMemcpyDeviceToHost);
// Finish the reduction on the host: add the per-block partial sums.
h_c = 0;
for (int i = 0; i < blocksPerGrid; i++) {
h_c += partial_sum[i];
}
printf("The computed dot product is: %f\n", h_c);
// Closed-form reference: x*(x+1) with x = N-1 gives N*(N-1).
// Argument is fully parenthesized so the macro is expansion-safe.
#define cpu_sum(x) ((x) * ((x) + 1))
// Exact float comparison is safe here: every intermediate value is an
// integer well within float's exactly-representable range (< 2^24).
if (h_c == cpu_sum((float)(N - 1)))
{
printf("The dot product computed by GPU is correct\n");
}
else
{
printf("Error in dot product computation");
}
// Free memory on device and host
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_partial_sum);
free(h_a);
free(h_b);
free(partial_sum);
return 0;
}
二、矩阵乘法
除了向量点乘之外,GPU上通过CUDA进行的最重要的数学运算是矩阵乘法。当矩阵非常大的时候,数学运算将非常复杂。需要记住的是,当进行矩阵乘法的时候,乘号前矩阵的列数(即行宽)需要等于乘号后矩阵的行数(即高度)。矩阵乘法不满足交换律。
前一个矩阵的某行将会和后一个矩阵的所有的列进行点乘,然后对前一个矩阵的所有行依次类推,就得到了矩阵乘法。
我们写出两个独立的内核,它们将会分别使用和不使用共享内存。
//Matrix multiplication using shared and non shared kernal
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#define TILE_SIZE 2
// 使用非共享内核的矩阵乘法
// Naive dense matrix multiply: d_c = d_a * d_b for square size x size
// row-major matrices, one output element per thread.
// Launch with a (size/TILE_SIZE, size/TILE_SIZE) grid of
// (TILE_SIZE, TILE_SIZE) blocks; size must be a multiple of TILE_SIZE.
__global__ void gpu_Matrix_Mul_nonshared(float *d_a, float *d_b, float *d_c, const int size)
{
int row, col;
col = TILE_SIZE * blockIdx.x + threadIdx.x;
row = TILE_SIZE * blockIdx.y + threadIdx.y;
// Accumulate in a register and store once. The original version did
// `d_c[...] += ...`, which silently depended on the caller zeroing
// d_c beforehand (cudaMalloc does not initialize memory) and paid a
// global read-modify-write on every iteration.
float sum = 0.0f;
for (int k = 0; k < size; k++)
{
sum += d_a[row * size + k] * d_b[k * size + col];
}
d_c[row * size + col] = sum;
}
// 使用共享内核的矩阵乘法
// Tiled dense matrix multiply using shared memory: d_c = d_a * d_b for
// square size x size row-major matrices, one output element per thread.
// Each block stages TILE_SIZE x TILE_SIZE tiles of both inputs in shared
// memory and marches the tile window along the shared dimension.
// Launch with a (size/TILE_SIZE, size/TILE_SIZE) grid of
// (TILE_SIZE, TILE_SIZE) blocks; size must be a multiple of TILE_SIZE.
__global__ void gpu_Matrix_Mul_shared(float *d_a, float *d_b, float *d_c, const int size)
{
int row, col;
// Shared-memory tiles of the two input matrices.
__shared__ float shared_a[TILE_SIZE][TILE_SIZE];
__shared__ float shared_b[TILE_SIZE][TILE_SIZE];
col = TILE_SIZE * blockIdx.x + threadIdx.x;
row = TILE_SIZE * blockIdx.y + threadIdx.y;
// Register accumulator: the original `d_c[...] += ...` required d_c to
// be zeroed in advance (cudaMalloc does not initialize memory) and did
// a global read-modify-write per inner iteration.
float sum = 0.0f;
for (int i = 0; i < size / TILE_SIZE; i++)
{
// Each thread loads one element of each tile.
shared_a[threadIdx.y][threadIdx.x] = d_a[row * size + (i * TILE_SIZE + threadIdx.x)];
shared_b[threadIdx.y][threadIdx.x] = d_b[(i * TILE_SIZE + threadIdx.y) * size + col];
// Both tiles must be fully loaded before anyone reads them.
__syncthreads();
for (int j = 0; j < TILE_SIZE; j++)
sum += shared_a[threadIdx.y][j] * shared_b[j][threadIdx.x];
// All reads must finish before the next iteration overwrites the tiles.
__syncthreads();
}
d_c[row * size + col] = sum;
}
// main routine
// Host driver: builds two small size x size matrices, multiplies them on
// the GPU (shared-memory kernel), and prints the result.
int main()
{
const int size = 4;
// Host arrays
float h_a[size][size], h_b[size][size], h_result[size][size];
// Device arrays
float *d_a, *d_b, *d_result;
// h_a[i][j] = i, h_b[i][j] = j, so the expected product is
// result[i][j] = size * i * j.
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
{
h_a[i][j] = i;
h_b[i][j] = j;
}
}
// Buffers hold floats, so size with sizeof(float); the original used
// sizeof(int), which only worked because both happen to be 4 bytes.
cudaMalloc((void **)&d_a, size * size * sizeof(float));
cudaMalloc((void **)&d_b, size * size * sizeof(float));
cudaMalloc((void **)&d_result, size * size * sizeof(float));
// Zero the output: cudaMalloc does not initialize memory, and the
// kernels as published accumulate into d_c with `+=`.
cudaMemset(d_result, 0, size * size * sizeof(float));
// Copy host arrays to device arrays
cudaMemcpy(d_a, h_a, size * size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, size * size * sizeof(float), cudaMemcpyHostToDevice);
// One thread per output element, tiled TILE_SIZE x TILE_SIZE.
dim3 dimGrid(size / TILE_SIZE, size / TILE_SIZE, 1);
dim3 dimBlock(TILE_SIZE, TILE_SIZE, 1);
//gpu_Matrix_Mul_nonshared << <dimGrid, dimBlock >> > (d_a, d_b, d_result, size);
gpu_Matrix_Mul_shared << <dimGrid, dimBlock >> > (d_a, d_b, d_result, size);
// A kernel launch returns no status; query the launch error explicitly.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
return 1;
}
// Blocking copy: also synchronizes with the kernel before reading results.
cudaMemcpy(h_result, d_result, size * size * sizeof(float), cudaMemcpyDeviceToHost);
printf("The result of Matrix multiplication is: \n");
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
{
printf("%f ", h_result[i][j]);
}
printf("\n");
}
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_result);
return 0;
}
三、矩阵转置
1、简单矩阵复制
// Baseline tile copy: odata <- idata, element for element.
// Each block of (TILE_DIM x BLOCK_ROWS) threads handles one
// TILE_DIM x TILE_DIM tile, so every thread copies
// TILE_DIM / BLOCK_ROWS rows of its column within the tile.
// TILE_DIM and BLOCK_ROWS are assumed to be defined elsewhere
// (compile-time tile geometry).
__global__ void copy(float *odata, const float *idata)
{
int col = blockIdx.x * TILE_DIM + threadIdx.x;
int rowBase = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int r = 0; r < TILE_DIM; r += BLOCK_ROWS)
{
int row = rowBase + r;
odata[row * width + col] = idata[row * width + col];
}
}
2、朴素矩阵转置
转置内核看起来与复制内核非常相似。唯一的区别是写入 odata 时交换了行列索引:读取 idata 仍按 (y+j)*width + x,写入 odata 时用的是 x*width + (y+j)。这使得读取是合并访问,但写入是跨距访问,性能因此受限。
// Naive square-matrix transpose: odata = transpose(idata).
// Same tiling as the copy kernel; the only change is that the output
// index swaps row and column, so reads from idata are coalesced but
// writes to odata are strided (the performance bottleneck of this
// variant). TILE_DIM and BLOCK_ROWS are defined elsewhere.
__global__ void transposeNaive(float *odata, const float *idata)
{
int col = blockIdx.x * TILE_DIM + threadIdx.x;
int rowBase = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int r = 0; r < TILE_DIM; r += BLOCK_ROWS)
{
int row = rowBase + r;
odata[col * width + row] = idata[row * width + col];
}
}
3、通过共享内存合并转置
// Coalesced square-matrix transpose via a shared-memory tile: the tile
// is read row-wise from idata (coalesced) and written row-wise to odata
// (coalesced); the transposition happens inside shared memory.
// TILE_DIM and BLOCK_ROWS are defined elsewhere; blocks are
// (TILE_DIM x BLOCK_ROWS) threads covering a TILE_DIM x TILE_DIM tile.
__global__ void transposeCoalesced(float *odata, const float *idata)
{
// +1 column of padding: without it, the column read
// tile[threadIdx.x][...] below puts all threads of a warp in the same
// shared-memory bank, serializing the accesses (bank conflicts).
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
// Coalesced load: consecutive threads read consecutive idata elements.
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x];
// Whole tile must be loaded before any thread reads a transposed cell.
__syncthreads();
x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
y = blockIdx.x * TILE_DIM + threadIdx.y;
// Coalesced store: threads write consecutive odata elements while
// reading down a tile column (conflict-free thanks to the padding).
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y + j) * width + x] = tile[threadIdx.x][threadIdx.y + j];
}