2D Transpose算子GPU实现和优化

2D Transpose

一种较好的做法是基于shared mem，合并内存访问读取到shared mem，读取整个warpSize x warpSize大小的矩阵块。然后基于shared mem索引变换读取实现转置效果，最后写回同样可以合并内存访问。可以考虑使用一个warp或者一个thread block读取warpSize x warpSize大小的矩阵，基于shared mem转置后写回。中小尺寸采用后者，也就是一个thread block转置一个warpSize x warpSize大小的矩阵，可以创建更多的线程提高硬件利用率，性能更优。

参考CUDA实现代码

#include <stdio.h>
#include <iostream>
using namespace std;

#include <cuda_runtime.h>
#include "utils/cuda_mem_helper.h"
#include "utils/cuda_stream_helper.h"

#define THREAD_PER_WARP 32
#define WARP_PER_BLOCK 8
#define BLOCK_RD_NUM (THREAD_PER_WARP / WARP_PER_BLOCK)

/*
using a warp to transpose warpSize*warpSize block
*/
template <typename T>
__global__ void transpose_2d_warp(const T* __restrict__ in, T* __restrict__ out,
                                  const int row, const int col,
                                  int warp_row, int warp_col, int total_warp) {
    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // global thread id
    const int warp_bid = threadIdx.x / THREAD_PER_WARP; // warp id in thread block
    const int warp_gid = tid / THREAD_PER_WARP; // warp id in grid

    const int lane = threadIdx.x % THREAD_PER_WARP; // thread id in warp
    const int warp_id_c = warp_gid % warp_col;
    const int warp_id_r = warp_gid / warp_col;

    // add array padding to handle bank-conflict
    __shared__ T block_data[WARP_PER_BLOCK][THREAD_PER_WARP][THREAD_PER_WARP + 1];

    const int row_bias = warp_id_r * THREAD_PER_WARP;
    const int col_bias = warp_id_c * THREAD_PER_WARP;

    // read block from input
    for (int i = 0; i < THREAD_PER_WARP; i++) {
        int addr = (row_bias + i) * col + col_bias + lane;
        block_data[warp_bid][i][lane] = in[addr];
    }
    __syncthreads();

    // write block to output
    for (int i = 0; i < THREAD_PER_WARP; i++) {
        int tgt_c = col_bias + i;
        int tgt_r = row_bias + lane;
        if ((tgt_r < row) && (tgt_c < col)) {
            int addr = tgt_c * row + tgt_r;
            out[addr] = block_data[warp_bid][lane][i];
        }
    }
}

/*
using a thread block to transpose warpSize*warpSize block
*/
template <typename T>
__global__ void transpose_2d_block(const T* __restrict__ in, T* __restrict__ out,
                                   const int row, const int col) {
    int block_id = blockIdx.x;
    int block_num_col = col / warpSize;

    int block_id_row = block_id / block_num_col;
    int block_id_col = block_id % block_num_col;

    int row_offset = block_id_row * warpSize;
    int col_offset = block_id_col * warpSize;

    int row_id = threadIdx.x / warpSize;
    int col_id = threadIdx.x % warpSize;

    // add array padding to handle bank-conflict
    __shared__ T block_data[THREAD_PER_WARP][THREAD_PER_WARP + 1];

#pragma unroll
    for (int i = 0; i < BLOCK_RD_NUM; i++) {
        int row_pos = i * WARP_PER_BLOCK + row_id;
        int in_addr = (row_offset + row_pos) * col + col_offset + col_id;
        block_data[row_pos][col_id] = in[in_addr];
    }
    __syncthreads();


#pragma unroll
    for (int i = 0; i < BLOCK_RD_NUM; i++) {
        int row_pos = i * WARP_PER_BLOCK + row_id;
        int out_addr = (col_offset + row_pos) * row + row_offset + col_id;

        out[out_addr] = block_data[col_id][row_pos];
    }
}

template <typename T>
void Transpose2DWarp(const T* in, T* out, const int row, const int col, cudaStream_t & stream) {
    const int warp_row = (row + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int warp_col = (col + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int total_warp = warp_row * warp_col;

    const int block_size = THREAD_PER_WARP * WARP_PER_BLOCK;
    const int grid_size = (total_warp + WARP_PER_BLOCK - 1) / WARP_PER_BLOCK;

    transpose_2d_warp <<< grid_size, block_size, 0, stream>>>(in, out, row, col, warp_row, warp_col, total_warp);
}


template <typename T>
void Transpose2DBlock(const T* in, T* out, const int row, const int col, cudaStream_t & stream) {
    const int block_row = (row + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int block_col = (col + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int total_block = block_row * block_col;

    const int block_size = THREAD_PER_WARP * WARP_PER_BLOCK;
    const int grid_size = total_block;

    transpose_2d_block <<< grid_size, block_size, 0, stream>>>(in, out, row, col);
}

int main(void) {
    cudaError_t err = cudaSuccess;

    int row = 256;
    int col = 256;

    CudaMemoryHelper<float> data_in({row, col});
    CudaMemoryHelper<float> data_out({row, col});

    data_in.StepInitHostMem(1.0f);
    data_in.CopyMemH2D();
    data_in.PrintElems(4, 512,  col);

    CudaStreamHelper stream_helper;
    auto & stream = stream_helper.stream;

    int eval_num = 20;

    int thread_num = row * col;
    int threadsPerBlock = std::min(128, col);
    int blocksPerGrid = (thread_num + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    for (int i = 0; i < eval_num; i++) {
        Transpose2DWarp(data_in.d_mem, data_out.d_mem,  row, col, stream);
    }
    stream_helper.Sync();

    for (int i = 0; i < eval_num; i++) {
        Transpose2DBlock(data_in.d_mem, data_out.d_mem,  row, col, stream);
    }
    stream_helper.Sync();

    data_out.CopyMemD2H();

    printf("results0:\n");
    // verify results
    data_out.PrintElems(1, 1024, row);

    return 0;
}

可以参考

CUDA学习(二)矩阵转置及优化(合并访问、共享内存、bank conflict) - 知乎

端侧GPU没有shared mem怎么处理？

可以考虑一个线程转置4x4大小矩阵块，连续读取4行，每次读取都能pack4读取，然后基于寄存器转置，写回仍然能pack4写回。

opencl参考代码

#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <chrono>

#include "mem_helper.h"

#define CL_HPP_TARGET_OPENCL_VERSION 300
#include <CL/opencl.hpp>

using TEST_DTYPE = half;

using namespace std;

// must be the same with real implement
#define TH_ROW_SIZE 4
#define TH_COL_SIZE 4

// should be used only for small channel_size (<=768)
std::string kernel_source{R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define TH_ROW_SIZE 4
#define TH_COL_SIZE 4

#define DTYPE       half
#define DTYPE_PACK4 half4

// use a warp to calculate
kernel void transpose_kernel(__global const DTYPE* __restrict__ d_in, __global DTYPE* __restrict__ d_out, int hight,
                             int width) {
  int gid = get_global_id(0);
  int batch_id = get_global_id(1);
  int batch_addr = batch_id * hight * width;

  int col_th_num = (width + TH_COL_SIZE - 1) / TH_COL_SIZE;

  int row_id = gid / col_th_num;
  int col_id = gid % col_th_num;
  int row_pos = row_id * TH_ROW_SIZE;
  int col_pos = col_id * TH_COL_SIZE;

  // read 4 x 4 tile
  DTYPE_PACK4 in_datas[TH_ROW_SIZE];

#pragma unroll
  for (int rcnt = 0; rcnt < TH_ROW_SIZE; rcnt++) {
    int in_addr = batch_addr + (row_pos + rcnt) * width + col_pos;
    const DTYPE_PACK4* d_in_p4 = (const DTYPE_PACK4*)&d_in[in_addr];
    in_datas[rcnt] = d_in_p4[0];
  }

  DTYPE_PACK4 out_datas[TH_COL_SIZE];
  out_datas[0] = (DTYPE_PACK4)(in_datas[0].s0, in_datas[1].s0, in_datas[2].s0, in_datas[3].s0);
  out_datas[1] = (DTYPE_PACK4)(in_datas[0].s1, in_datas[1].s1, in_datas[2].s1, in_datas[3].s1);
  out_datas[2] = (DTYPE_PACK4)(in_datas[0].s2, in_datas[1].s2, in_datas[2].s2, in_datas[3].s2);
  out_datas[3] = (DTYPE_PACK4)(in_datas[0].s3, in_datas[1].s3, in_datas[2].s3, in_datas[3].s3);


#pragma unroll
  for (int ccnt = 0; ccnt < TH_COL_SIZE; ccnt++) {
    int out_addr = batch_addr + (col_pos + ccnt) * hight + row_pos;

    DTYPE_PACK4* d_out_p4 = (const DTYPE_PACK4*)&d_out[out_addr];
    d_out_p4[0] = out_datas[ccnt];
  }
}
)"};

int main() {
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);
  std::cout << "get platform num:" << platforms.size() << std::endl;

  cl::Platform plat;
  for (auto& p : platforms) {
    std::string platver = p.getInfo<CL_PLATFORM_VERSION>();
    if (platver.find("OpenCL 2.") != std::string::npos || platver.find("OpenCL 3.") != std::string::npos) {
      // Note: an OpenCL 3.x platform may not support all required features!
      plat = p;
    }
  }
  if (plat() == 0) {
    std::cout << "No OpenCL 2.0 or newer platform found.\n";
    return -1;
  }

  std::cout << "platform name:" << plat.getInfo<CL_PLATFORM_NAME>() << std::endl;

  cl::Platform newP = cl::Platform::setDefault(plat);
  if (newP != plat) {
    std::cout << "Error setting default platform.\n";
    return -1;
  }

  // get default device (CPUs, GPUs) of the default platform
  std::vector<cl::Device> all_devices;
  newP.getDevices(CL_DEVICE_TYPE_GPU, &all_devices);  // CL_DEVICE_TYPE_ALL
  std::cout << "get all_devices num:" << all_devices.size() << std::endl;

  if (all_devices.size() == 0) {
    std::cout << " No devices found. Check OpenCL installation!\n";
    exit(1);
  }

  // cl::Device default_device = cl::Device::getDefault();
  cl::Device default_device = all_devices[0];
  std::cout << "device name: " << default_device.getInfo<CL_DEVICE_NAME>() << std::endl;

  // a context is like a "runtime link" to the device and platform;
  // i.e. communication is possible
  cl::Context context({default_device});
  cl::CommandQueue queue(context, default_device);

  int batch = 4;
  int hight = 1024;
  int width = 1024;
  vector<int> shape1 = {batch, hight, width};

  MemoryHelper<TEST_DTYPE> mem_in(shape1);
  MemoryHelper<TEST_DTYPE> mem_out(shape1);
  mem_in.StepInit(1.0f);

  // CL_MEM_WRITE_ONLY CL_MEM_READ_ONLY CL_MEM_READ_WRITE
  cl::Buffer d_in = cl::Buffer(context, CL_MEM_READ_WRITE, mem_in.bytes);
  cl::Buffer d_out = cl::Buffer(context, CL_MEM_READ_WRITE, mem_out.bytes);

  memset(mem_out.Mem(), 0, mem_out.bytes);

  // push write commands to queue
  queue.enqueueWriteBuffer(d_in, CL_TRUE, 0, mem_in.bytes, mem_in.Mem());

  std::vector<std::string> programStrings;
  programStrings.push_back(kernel_source);
  cl::Program program(context, programStrings);

  if (program.build({default_device}, "-cl-std=CL3.0") != CL_SUCCESS) {
    std::cout << "Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl;
    exit(1);
  }

  auto cl_kernel = cl::KernelFunctor<cl::Buffer, cl::Buffer, int, int>(program, "transpose_kernel");

  int local_thread_num = 128 * 4;

  int row_th_num = (hight + TH_ROW_SIZE - 1) / TH_ROW_SIZE;
  int col_th_num = (width + TH_COL_SIZE - 1) / TH_COL_SIZE;
  int total_thread_num = row_th_num * col_th_num;

  local_thread_num = std::min(local_thread_num, total_thread_num);

  cout << "total_thread_num: " << total_thread_num << endl;
  cout << "local_thread_num: " << local_thread_num << endl;

  // global, or global, local, or offset, global, local
  cl::EnqueueArgs kernel_args(queue, cl::NDRange(total_thread_num, batch), cl::NDRange(local_thread_num));

  int warmup_num = 50;
  int eval_num = 50;
  for (int i = 0; i < warmup_num; i++) {
    cl_kernel(kernel_args, d_in, d_out, hight, width);
  }
  queue.finish();

  auto t1 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < eval_num; i++) {
    cl_kernel(kernel_args, d_in, d_out, hight, width);
  }
  queue.finish();
  auto t2 = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
  float mean_time_ms = duration / 1000.0f / eval_num;

  printf("exec time us: %lld %d\n", duration, eval_num);
  printf("exec time: %f ms\n", mean_time_ms);
  printf("batch, hight, width: %d %d %d\n", batch, hight, width);

  queue.enqueueReadBuffer(d_out, CL_TRUE, 0, mem_out.bytes, mem_out.Mem());
  mem_in.PrintElems(1, 512, width);
  mem_out.PrintElems(1, 512, hight);
  return 0;
}

2D Transpose算子GPU实现和优化

2D Transpose

猜你喜欢