CUDA编程模型系列七(利用shared memory优化矩阵转置)

本系列教程将介绍具体的CUDA编程代码的细节
//              matrix transpose
//                                                     t56 t57 t58
//               in        b00 b01 b02 | b03 b04 b05 | b06 b07 b08  B[6][9]
//                         b10 b11 b12 | b13 b14 b15 | b16 b17 b18
//                         b20 b21 b22 | b23 b24 b25 | b26 b27 b28
//                         ------------+-------------+------------
//                         b30 b31 b32 | b33 b34 b35 | b36 b37 b38
//                         b40 b41 b42 | b43 b44 b45 | b46 b47 b48
//                         b50 b51 b52 | b53 b54 b55 | b56 b57 b58   threadIdx.x=1, threadIdx.y=2;
//                                                                   block 1, 2
//
//                         
//               out       b00 b10 b20 | b30 b40 b50
//                         b01 b11 b21 | b31 b41 b51
//                         b02 b12 b22 | b32 b42 b52
//                         ------------+------------
//                         b03 b13 b23 | b33 b43 b53
//                         b04 b14 b24 | b34 b44 b54
//                         b05 b15 b25 | b35 b45 b55
//                         ------------+------------
//                         b06 b16 b26 | b36 b46 b56
//                         b07 b17 b27 | b37 b47 b57
//                         b08 b18 b28 | b38 b48 b58                 block 2, 1
// shared memory 
// t57 reads b57 from global memory into shared memory
// t57 reads b48 from shared memory
// t57 writes b48 to global memory

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Edge length of the square shared-memory tile used by the tiled kernel.
#define BLOCK_SIZE 32
// Number of columns in the input matrix (and rows in the transposed output).
#define M 3000
// Number of rows in the input matrix (and columns in the transposed output).
#define N 1000

// Input matrix, N rows by M columns; __managed__ so host and device code share the same symbol.
__managed__ int matrix[N][M];
// Transposed output (M rows by N columns) written by the GPU kernels.
__managed__ int gpu_result[M][N];
// Transposed output (M rows by N columns) written by the CPU reference implementation.
__managed__ int cpu_result[M][N];

// Straightforward transpose kernel: every thread moves exactly one element,
// reading in[row][col] and storing it at out[col][row].
//
// Launch layout: 2D grid of 2D blocks. The x dimension indexes the M columns
// of `in`, the y dimension indexes its N rows. Threads that fall past either
// edge of the matrix do nothing.
__global__ void gpu_matrix_transpose(int in[N][M], int out[M][N])
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so the last blocks overhang.
    if (col >= M || row >= N)
    {
        return;
    }

    out[col][row] = in[row][col];
}

// Tiled transpose kernel that stages each block's tile in shared memory.
//
// Phase 1: thread (tx, ty) loads in[src_row][src_col] into tile[ty][tx].
// Phase 2: after a block-wide barrier, thread (tx, ty) stores tile[tx][ty]
//          into the mirrored tile of `out`.
// In both phases consecutive threadIdx.x values touch consecutive addresses
// of the global array; the transposition itself happens inside the tile.
//
// Launch layout: 2D grid of square 2D blocks no larger than
// BLOCK_SIZE x BLOCK_SIZE (the tile indices and the swapped block offsets
// both rely on blockDim.x == blockDim.y). Uses a statically sized
// (BLOCK_SIZE+1) x (BLOCK_SIZE+1) int tile of shared memory per block.
__global__ void gpu_shared_matrix_transpose(int in[N][M], int out[M][N])
{
    // The +1 on the inner dimension is the usual padding that keeps the
    // column-wise read tile[tx][ty] from hitting a single shared-memory bank.
    __shared__ int tile[BLOCK_SIZE+1][BLOCK_SIZE+1];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Element of `in` this thread is responsible for loading.
    const int src_col = blockIdx.x * blockDim.x + tx;
    const int src_row = blockIdx.y * blockDim.y + ty;

    const bool inside_source = (src_col < M) && (src_row < N);
    if (inside_source)
    {
        tile[ty][tx] = in[src_row][src_col];
    }

    // Barrier is deliberately outside the branch so every thread reaches it.
    __syncthreads();

    // Element of `out` this thread stores: block offsets are swapped relative
    // to the load, while tx still runs along the row being written.
    const int dst_col = blockIdx.y * blockDim.y + tx;
    const int dst_row = blockIdx.x * blockDim.x + ty;

    // This guard holds exactly for the tile cells that were loaded above.
    const bool inside_dest = (dst_col < N) && (dst_row < M);
    if (inside_dest)
    {
        out[dst_row][dst_col] = tile[tx][ty];
    }
}

// Host reference transpose: walks `in` row by row and writes each element
// to its mirrored position, so out[c][r] == in[r][c] for every r < N, c < M.
void cpu_matrix_transpose(int in[N][M], int out[M][N])
{
    for (int r = 0; r < N; ++r)
    {
        const int *src_row = in[r];
        for (int c = 0; c < M; ++c)
        {
            out[c][r] = src_row[c];
        }
    }
}

// Evaluates a CUDA runtime call and terminates with file/line and the
// runtime's error string if it did not return cudaSuccess. Guarded so an
// existing project-wide definition takes precedence.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

// Benchmark driver.
//   1. Fills `matrix` with pseudo-random values in [0, 1023].
//   2. Runs the shared-memory transpose kernel kTimedRuns times and times
//      the whole batch with CUDA events.
//   3. Runs the CPU reference once, timed between the same kind of events.
//   4. Compares both results element by element and prints Pass/Error plus
//      the CPU time and the mean GPU time per launch (milliseconds, as
//      reported by cudaEventElapsedTime).
// Returns 0 in all cases that reach the end; any CUDA runtime failure
// aborts through CUDA_CHECK.
int main()
{
    // Number of timed kernel launches; also the divisor for the mean.
    const int kTimedRuns = 20;

    for (int y = 0; y < N; y++)
    {
        for (int x = 0; x < M; x++)
        {
            matrix[y][x] = rand() % 1024;
        }
    }

    cudaEvent_t start, stop_gpu, stop_cpu;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop_cpu));
    CUDA_CHECK(cudaEventCreate(&stop_gpu));

    CUDA_CHECK(cudaEventRecord(start));
    CUDA_CHECK(cudaEventSynchronize(start));

    // Ceil-divide so the grid covers matrices that are not a multiple of
    // BLOCK_SIZE; the kernels bounds-check the overhanging threads.
    dim3 dimGrid((M + BLOCK_SIZE - 1) / BLOCK_SIZE, (N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    for (int i = 0; i < kTimedRuns; i++)
    {
        gpu_shared_matrix_transpose<<<dimGrid, dimBlock>>>(matrix, gpu_result);
        // Launch-configuration errors are only visible via cudaGetLastError.
        CUDA_CHECK(cudaGetLastError());
        // Execution errors surface at the next synchronizing call; syncing
        // here also guarantees the GPU is idle before the host touches the
        // managed arrays below.
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    CUDA_CHECK(cudaEventRecord(stop_gpu));
    CUDA_CHECK(cudaEventSynchronize(stop_gpu));

    cpu_matrix_transpose(matrix, cpu_result);

    CUDA_CHECK(cudaEventRecord(stop_cpu));
    CUDA_CHECK(cudaEventSynchronize(stop_cpu));

    float time_cpu, time_gpu;
    CUDA_CHECK(cudaEventElapsedTime(&time_gpu, start, stop_gpu));
    CUDA_CHECK(cudaEventElapsedTime(&time_cpu, stop_gpu, stop_cpu));

    // Both results are integers, so they must match exactly; stop scanning
    // at the first mismatch since only a pass/fail verdict is reported.
    bool errors = false;
    for (int y = 0; y < M && !errors; y++)
    {
        for (int x = 0; x < N; x++)
        {
            if (cpu_result[y][x] != gpu_result[y][x])
            {
                errors = true;
                break;
            }
        }
    }

    printf("Result: %s\n", errors?"Error":"Pass");
    printf("CPU time: %.2f\nGPU time: %.2f\n", time_cpu, time_gpu / (double)kTimedRuns);

    // Release the timing events now that all measurements are read.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop_gpu));
    CUDA_CHECK(cudaEventDestroy(stop_cpu));

    return 0;
}
CUDA编程模型系列七(利用shared memory优化矩阵转置)

CUDA编程模型系列七(利用shared memory优化矩阵转置)

猜你喜欢