2D Transpose
一种较好的做法是基于shared mem,合并内存访问读取到shared mem,读取整个warpSize x warpSize大小的矩阵块。然后基于shared mem索引变换读取实现转置效果,最后写回同样可以合并内存访问。可以考虑使用一个warp或者一个thread block读取warpSize x warpSize大小的矩阵,基于shared mem转置后写回。中小尺寸采用后者,也就是一个thread block转置一个warpSize x warpSize大小的矩阵,可以创建更多的线程提高硬件利用率,性能更优。
参考CUDA实现代码
#include <stdio.h>
#include <iostream>
using namespace std;
#include <cuda_runtime.h>
#include "utils/cuda_mem_helper.h"
#include "utils/cuda_stream_helper.h"
#define THREAD_PER_WARP 32
#define WARP_PER_BLOCK 8
#define BLOCK_RD_NUM (THREAD_PER_WARP / WARP_PER_BLOCK)
/*
using a warp to transpose warpSize*warpSize block
*/
template <typename T>
__global__ void transpose_2d_warp(const T* __restrict__ in, T* __restrict__ out,
const int row, const int col,
int warp_row, int warp_col, int total_warp) {
const int tid = blockDim.x * blockIdx.x + threadIdx.x; // global thread id
const int warp_bid = threadIdx.x / THREAD_PER_WARP; // warp id in thread block
const int warp_gid = tid / THREAD_PER_WARP; // warp id in grid
const int lane = threadIdx.x % THREAD_PER_WARP; // thread id in warp
const int warp_id_c = warp_gid % warp_col;
const int warp_id_r = warp_gid / warp_col;
// add array padding to handle bank-conflict
__shared__ T block_data[WARP_PER_BLOCK][THREAD_PER_WARP][THREAD_PER_WARP + 1];
const int row_bias = warp_id_r * THREAD_PER_WARP;
const int col_bias = warp_id_c * THREAD_PER_WARP;
// read block from input
for (int i = 0; i < THREAD_PER_WARP; i++) {
int addr = (row_bias + i) * col + col_bias + lane;
block_data[warp_bid][i][lane] = in[addr];
}
__syncthreads();
// write block to output
for (int i = 0; i < THREAD_PER_WARP; i++) {
int tgt_c = col_bias + i;
int tgt_r = row_bias + lane;
if ((tgt_r < row) && (tgt_c < col)) {
int addr = tgt_c * row + tgt_r;
out[addr] = block_data[warp_bid][lane][i];
}
}
}
/*
using a thread block to transpose warpSize*warpSize block
*/
template <typename T>
__global__ void transpose_2d_block(const T* __restrict__ in, T* __restrict__ out,
const int row, const int col) {
int block_id = blockIdx.x;
int block_num_col = col / warpSize;
int block_id_row = block_id / block_num_col;
int block_id_col = block_id % block_num_col;
int row_offset = block_id_row * warpSize;
int col_offset = block_id_col * warpSize;
int row_id = threadIdx.x / warpSize;
int col_id = threadIdx.x % warpSize;
// add array padding to handle bank-conflict
__shared__ T block_data[THREAD_PER_WARP][THREAD_PER_WARP + 1];
#pragma unroll
for (int i = 0; i < BLOCK_RD_NUM; i++) {
int row_pos = i * WARP_PER_BLOCK + row_id;
int in_addr = (row_offset + row_pos) * col + col_offset + col_id;
block_data[row_pos][col_id] = in[in_addr];
}
__syncthreads();
#pragma unroll
for (int i = 0; i < BLOCK_RD_NUM; i++) {
int row_pos = i * WARP_PER_BLOCK + row_id;
int out_addr = (col_offset + row_pos) * row + row_offset + col_id;
out[out_addr] = block_data[col_id][row_pos];
}
}
template <typename T>
void Transpose2DWarp(const T* in, T* out, const int row, const int col, cudaStream_t & stream) {
const int warp_row = (row + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
const int warp_col = (col + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
const int total_warp = warp_row * warp_col;
const int block_size = THREAD_PER_WARP * WARP_PER_BLOCK;
const int grid_size = (total_warp + WARP_PER_BLOCK - 1) / WARP_PER_BLOCK;
transpose_2d_warp <<< grid_size, block_size, 0, stream>>>(in, out, row, col, warp_row, warp_col, total_warp);
}
template <typename T>
void Transpose2DBlock(const T* in, T* out, const int row, const int col, cudaStream_t & stream) {
const int block_row = (row + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
const int block_col = (col + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
const int total_block = block_row * block_col;
const int block_size = THREAD_PER_WARP * WARP_PER_BLOCK;
const int grid_size = total_block;
transpose_2d_block <<< grid_size, block_size, 0, stream>>>(in, out, row, col);
}
int main(void) {
cudaError_t err = cudaSuccess;
int row = 256;
int col = 256;
CudaMemoryHelper<float> data_in({row, col});
CudaMemoryHelper<float> data_out({row, col});
data_in.StepInitHostMem(1.0f);
data_in.CopyMemH2D();
data_in.PrintElems(4, 512, col);
CudaStreamHelper stream_helper;
auto & stream = stream_helper.stream;
int eval_num = 20;
int thread_num = row * col;
int threadsPerBlock = std::min(128, col);
int blocksPerGrid = (thread_num + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
for (int i = 0; i < eval_num; i++) {
Transpose2DWarp(data_in.d_mem, data_out.d_mem, row, col, stream);
}
stream_helper.Sync();
for (int i = 0; i < eval_num; i++) {
Transpose2DBlock(data_in.d_mem, data_out.d_mem, row, col, stream);
}
stream_helper.Sync();
data_out.CopyMemD2H();
printf("results0:\n");
// verify results
data_out.PrintElems(1, 1024, row);
return 0;
}
可以参考
CUDA学习(二)矩阵转置及优化(合并访问、共享内存、bank conflict) - 知乎
端侧GPU没有shared mem怎么处理?
可以考虑一个线程转置4x4大小矩阵块,连续读取4行,每次读取都能pack4读取,然后基于寄存器转置,写回仍然能pack4写回。
opencl参考代码
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <chrono>
#include "mem_helper.h"
#define CL_HPP_TARGET_OPENCL_VERSION 300
#include <CL/opencl.hpp>
using TEST_DTYPE = half;
using namespace std;
// must be the same with real implement
#define TH_ROW_SIZE 4
#define TH_COL_SIZE 4
// should be used only for small channel_size (<=768)
std::string kernel_source{R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define TH_ROW_SIZE 4
#define TH_COL_SIZE 4
#define DTYPE half
#define DTYPE_PACK4 half4
// use a warp to calculate
kernel void transpose_kernel(__global const DTYPE* __restrict__ d_in, __global DTYPE* __restrict__ d_out, int hight,
int width) {
int gid = get_global_id(0);
int batch_id = get_global_id(1);
int batch_addr = batch_id * hight * width;
int col_th_num = (width + TH_COL_SIZE - 1) / TH_COL_SIZE;
int row_id = gid / col_th_num;
int col_id = gid % col_th_num;
int row_pos = row_id * TH_ROW_SIZE;
int col_pos = col_id * TH_COL_SIZE;
// read 4 x 4 tile
DTYPE_PACK4 in_datas[TH_ROW_SIZE];
#pragma unroll
for (int rcnt = 0; rcnt < TH_ROW_SIZE; rcnt++) {
int in_addr = batch_addr + (row_pos + rcnt) * width + col_pos;
const DTYPE_PACK4* d_in_p4 = (const DTYPE_PACK4*)&d_in[in_addr];
in_datas[rcnt] = d_in_p4[0];
}
DTYPE_PACK4 out_datas[TH_COL_SIZE];
out_datas[0] = (DTYPE_PACK4)(in_datas[0].s0, in_datas[1].s0, in_datas[2].s0, in_datas[3].s0);
out_datas[1] = (DTYPE_PACK4)(in_datas[0].s1, in_datas[1].s1, in_datas[2].s1, in_datas[3].s1);
out_datas[2] = (DTYPE_PACK4)(in_datas[0].s2, in_datas[1].s2, in_datas[2].s2, in_datas[3].s2);
out_datas[3] = (DTYPE_PACK4)(in_datas[0].s3, in_datas[1].s3, in_datas[2].s3, in_datas[3].s3);
#pragma unroll
for (int ccnt = 0; ccnt < TH_COL_SIZE; ccnt++) {
int out_addr = batch_addr + (col_pos + ccnt) * hight + row_pos;
DTYPE_PACK4* d_out_p4 = (const DTYPE_PACK4*)&d_out[out_addr];
d_out_p4[0] = out_datas[ccnt];
}
}
)"};
int main() {
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::cout << "get platform num:" << platforms.size() << std::endl;
cl::Platform plat;
for (auto& p : platforms) {
std::string platver = p.getInfo<CL_PLATFORM_VERSION>();
if (platver.find("OpenCL 2.") != std::string::npos || platver.find("OpenCL 3.") != std::string::npos) {
// Note: an OpenCL 3.x platform may not support all required features!
plat = p;
}
}
if (plat() == 0) {
std::cout << "No OpenCL 2.0 or newer platform found.\n";
return -1;
}
std::cout << "platform name:" << plat.getInfo<CL_PLATFORM_NAME>() << std::endl;
cl::Platform newP = cl::Platform::setDefault(plat);
if (newP != plat) {
std::cout << "Error setting default platform.\n";
return -1;
}
// get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices;
newP.getDevices(CL_DEVICE_TYPE_GPU, &all_devices); // CL_DEVICE_TYPE_ALL
std::cout << "get all_devices num:" << all_devices.size() << std::endl;
if (all_devices.size() == 0) {
std::cout << " No devices found. Check OpenCL installation!\n";
exit(1);
}
// cl::Device default_device = cl::Device::getDefault();
cl::Device default_device = all_devices[0];
std::cout << "device name: " << default_device.getInfo<CL_DEVICE_NAME>() << std::endl;
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({default_device});
cl::CommandQueue queue(context, default_device);
int batch = 4;
int hight = 1024;
int width = 1024;
vector<int> shape1 = {batch, hight, width};
MemoryHelper<TEST_DTYPE> mem_in(shape1);
MemoryHelper<TEST_DTYPE> mem_out(shape1);
mem_in.StepInit(1.0f);
// CL_MEM_WRITE_ONLY CL_MEM_READ_ONLY CL_MEM_READ_WRITE
cl::Buffer d_in = cl::Buffer(context, CL_MEM_READ_WRITE, mem_in.bytes);
cl::Buffer d_out = cl::Buffer(context, CL_MEM_READ_WRITE, mem_out.bytes);
memset(mem_out.Mem(), 0, mem_out.bytes);
// push write commands to queue
queue.enqueueWriteBuffer(d_in, CL_TRUE, 0, mem_in.bytes, mem_in.Mem());
std::vector<std::string> programStrings;
programStrings.push_back(kernel_source);
cl::Program program(context, programStrings);
if (program.build({default_device}, "-cl-std=CL3.0") != CL_SUCCESS) {
std::cout << "Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl;
exit(1);
}
auto cl_kernel = cl::KernelFunctor<cl::Buffer, cl::Buffer, int, int>(program, "transpose_kernel");
int local_thread_num = 128 * 4;
int row_th_num = (hight + TH_ROW_SIZE - 1) / TH_ROW_SIZE;
int col_th_num = (width + TH_COL_SIZE - 1) / TH_COL_SIZE;
int total_thread_num = row_th_num * col_th_num;
local_thread_num = std::min(local_thread_num, total_thread_num);
cout << "total_thread_num: " << total_thread_num << endl;
cout << "local_thread_num: " << local_thread_num << endl;
// global, or global, local, or offset, global, local
cl::EnqueueArgs kernel_args(queue, cl::NDRange(total_thread_num, batch), cl::NDRange(local_thread_num));
int warmup_num = 50;
int eval_num = 50;
for (int i = 0; i < warmup_num; i++) {
cl_kernel(kernel_args, d_in, d_out, hight, width);
}
queue.finish();
auto t1 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < eval_num; i++) {
cl_kernel(kernel_args, d_in, d_out, hight, width);
}
queue.finish();
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
float mean_time_ms = duration / 1000.0f / eval_num;
printf("exec time us: %lld %d\n", duration, eval_num);
printf("exec time: %f ms\n", mean_time_ms);
printf("batch, hight, width: %d %d %d\n", batch, hight, width);
queue.enqueueReadBuffer(d_out, CL_TRUE, 0, mem_out.bytes, mem_out.Mem());
mem_in.PrintElems(1, 512, width);
mem_out.PrintElems(1, 512, hight);
return 0;
}