[CUDA] Format conversion YUVYu12ToRGB

About YU12, NV12 and NV21

YUV is a color space that describes a color image in terms of luminance (Y) and chrominance (U, V). YUV420 is the most commonly used YUV format; the "420" means that every 2x2 block of four Y samples shares one pair of U/V samples. YUV420 is further divided into sub-formats such as YUV420P, YUV420SP and YUV420P10, and YU12, NV12 and NV21 are three such sub-formats of YUV420, as follows:

YU12 format: YU12 is a YUV420P (planar) format. The luminance (Y) samples of the image are stored in one plane, while the chrominance (U, V) samples are stored in two separate planes that follow it: the full U plane first, then the full V plane. Each component occupies one byte.

NV12 format: NV12 is a YUV420SP (semi-planar) format, and also the most commonly used one. The luminance (Y) samples are still stored in one plane, while the chrominance samples are stored interleaved in a second plane, with U first and then V (UVUV...). Every 2x2 block of Y samples shares one U/V pair, and each component occupies one byte.

NV21 format: NV21 is also a YUV420SP (semi-planar) format. It differs from NV12 only in the chroma order: the interleaved plane stores the V component first and then the U component (VUVU...), and each component occupies one byte.

These three formats are widely used in video codecs, image acquisition, and image processing.
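
All three layouts pack a W x H frame into W * H * 3 / 2 bytes and differ only in how the chroma samples after the Y plane are arranged, so the difference is easiest to see in code. The sketch below is purely illustrative (the helper names are made up and are not part of the original post):

#include <stdint.h>

// Chroma addressing for a W x H YUV420 frame (W and H even). The Y sample of
// pixel (x, y) is always buf[y * W + x]; the helpers below fetch the U/V pair
// shared by its 2x2 block. Hypothetical illustration only.
inline void Yu12ChromaAt(const uint8_t* buf, int W, int H, int x, int y,
                         uint8_t* u, uint8_t* v) {
    const uint8_t* U = buf + W * H;               // U plane follows the Y plane
    const uint8_t* V = U + (W / 2) * (H / 2);     // V plane follows the U plane
    *u = U[(y / 2) * (W / 2) + (x / 2)];
    *v = V[(y / 2) * (W / 2) + (x / 2)];
}

inline void Nv12ChromaAt(const uint8_t* buf, int W, int H, int x, int y,
                         uint8_t* u, uint8_t* v) {
    const uint8_t* UV = buf + W * H;              // single interleaved UVUV... plane
    *u = UV[(y / 2) * W + (x / 2) * 2 + 0];       // U comes first in NV12
    *v = UV[(y / 2) * W + (x / 2) * 2 + 1];       // (NV21 simply swaps these two)
}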

YUVYu12ToRGB

For the YUV420P (YU12) input format:

/// @brief Convert a YUV YU12 (also called I420) image to an RGB image.
///        The output pixel order is RGBRGBRGB..., i.e. the result is still in HWC layout.
/// @param in_buf The input buffer, allocated in device memory.
///               Its size is (in_w * in_h * 3 / 2) * sizeof(uint8_t).
/// @param out_buf The output buffer, allocated in device memory.
///               Its size is (in_w * in_h * 3) * sizeof(uint8_t).
/// @param in_w The width of the input image.
/// @param in_h The height of the input image.
/// @param stream The CUDA stream on which the kernel is launched.
void YUVYu12ToRGB(uint8_t* in_buf, uint8_t* out_buf,
                  int in_w, int in_h, cudaStream_t stream);

void YUVYu12ToRGB(uint8_t* in_buf, uint8_t* out_buf,
                  int in_w, int in_h, cudaStream_t stream) {
  if (in_w % 4 == 0) {
    // Width is a multiple of 4: use the vectorized kernel. Each thread loads
    // 4 Y samples per row (one uint32_t) and converts a 4x2 pixel tile.
    dim3 block(32, 4, 1);
    dim3 grid(((in_w >> 2) + block.x - 1) / block.x, ((in_h >> 1) + block.y - 1) / block.y, 1);
    Yuv2rgb24<true><<<grid, block, 0, stream>>>((uint32_t*)in_buf, (void*)out_buf, in_h >> 1, in_w >> 2);
  } else {
    // Arbitrary width: fall back to the general kernel. Each block converts
    // two image rows staged through dynamic shared memory.
    dim3 block(256, 1, 1);
    dim3 grid(in_h / 2, 1, 1);
    int sm_size = in_w * 6 + 16 * 3;  // two Y rows + half-row U + half-row V + RGB staging, plus alignment padding
    Yuv2rgb24_general<true, false, uint8_t, false><<<grid, block, sm_size, stream>>>(in_buf, out_buf, in_h, in_w);
  }
}
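
As a quick illustration of how the wrapper is meant to be called, here is a minimal host-side sketch (hypothetical buffer names, no error checking; it assumes the declaration above is visible):

#include <cuda_runtime.h>
#include <stdint.h>

// Hypothetical usage sketch: convert one in_w x in_h YU12 frame that resides on the host.
void ConvertFrame(const uint8_t* host_yu12, uint8_t* host_rgb, int in_w, int in_h) {
  size_t yuv_bytes = (size_t)in_w * in_h * 3 / 2;   // Y + U + V planes
  size_t rgb_bytes = (size_t)in_w * in_h * 3;       // packed RGBRGB...

  uint8_t *d_yuv = nullptr, *d_rgb = nullptr;
  cudaMalloc(&d_yuv, yuv_bytes);
  cudaMalloc(&d_rgb, rgb_bytes);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  cudaMemcpyAsync(d_yuv, host_yu12, yuv_bytes, cudaMemcpyHostToDevice, stream);
  YUVYu12ToRGB(d_yuv, d_rgb, in_w, in_h, stream);   // launches one of the two kernels
  cudaMemcpyAsync(host_rgb, d_rgb, rgb_bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);

  cudaStreamDestroy(stream);
  cudaFree(d_yuv);
  cudaFree(d_rgb);
}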

Reference template function 1

template<bool IsYU12, bool float_out = false>
__global__ void Yuv2rgb24(uint32_t *__restrict__ in, void *__restrict__ out, int32_t height, int width) {
  // Each thread converts a 4x2 pixel tile: four Y samples from each of two
  // consecutive rows, which share two U/V pairs.
  uint32_t *u8_out = NULL;
  float2 *fp32_out = NULL;
  if (float_out)
    fp32_out = (float2*)out;
  else
    u8_out = (uint32_t*)out;
  uint32_t w_id = blockDim.x * blockIdx.x + threadIdx.x;
  uint32_t h_id = blockDim.y * blockIdx.y + threadIdx.y;
  uint32_t out_idx = h_id * width + w_id;
  uint32_t thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
  __shared__ uint32_t out_sm[768]; // 128 threads * 2 rows * 4 pixels * 3 bytes = 3072 bytes = 768 uint32_t


  using dTy = uint16_t; //or int8_t
  if (w_id < width && h_id < height) {
    dTy u1, v1, u2, v2;
    uint32_t temp_y1, temp_y2;
    uchar4 temp_rgb0[4],temp_rgb1[4];
    uint32_t pos = h_id * 2 * width + w_id;
    temp_y1 = in[pos];
    pos  = (h_id * 2 + 1) * width + w_id;
    temp_y2 = in[pos];
    if (IsYU12) {
      // YU12: read two U bytes and two V bytes from the separate chroma planes.
      pos = height * width * 4 + out_idx;
      uint16_t u12 = ((uint16_t*)in)[pos];
      pos += height * width;
      uint16_t v12 = ((uint16_t*)in)[pos];
      u1 = ((uint8_t*)&u12)[0];
      u2 = ((uint8_t*)&u12)[1];
      v1 = ((uint8_t*)&v12)[0];
      v2 = ((uint8_t*)&v12)[1];
    } else {
      // Semi-planar (NV12): read two interleaved U/V pairs from the single chroma plane.
      pos = height * width * 2 + out_idx;
      uint32_t uv = in[pos];
      u1 = ((uint8_t*)&uv)[0];
      v1 = ((uint8_t*)&uv)[1];
      u2 = ((uint8_t*)&uv)[2];
      v2 = ((uint8_t*)&uv)[3];
    }
    dTy y1 = ((uint8_t*)&temp_y1)[0];
    dTy y2 = ((uint8_t*)&temp_y1)[1];
    dTy y3 = ((uint8_t*)&temp_y1)[2];
    dTy y4 = ((uint8_t*)&temp_y1)[3];
    dTy y5 = ((uint8_t*)&temp_y2)[0];
    dTy y6 = ((uint8_t*)&temp_y2)[1];
    dTy y7 = ((uint8_t*)&temp_y2)[2];
    dTy y8 = ((uint8_t*)&temp_y2)[3];
    uint8_t *out_row1 = (uint8_t*)out_sm;
    uint8_t *out_row2 = out_row1 + 1536;

    temp_rgb0[0] = cvt2rgb<dTy>(y1,u1,v1);
    temp_rgb0[1] = cvt2rgb<dTy>(y2,u1,v1);
    temp_rgb0[2] = cvt2rgb<dTy>(y3,u2,v2);
    temp_rgb0[3] = cvt2rgb<dTy>(y4,u2,v2);
    temp_rgb1[0] = cvt2rgb<dTy>(y5,u1,v1);
    temp_rgb1[1] = cvt2rgb<dTy>(y6,u1,v1);
    temp_rgb1[2] = cvt2rgb<dTy>(y7,u2,v2); 
    temp_rgb1[3] = cvt2rgb<dTy>(y8,u2,v2); 
    for(int i = 0; i < 4; i++)
    {
      // Scatter the converted 4x2 tile into shared memory as packed RGB bytes.
      out_row1[thread_idx * 12 + i*3 + 0] = temp_rgb0[i].x;
      out_row1[thread_idx * 12 + i*3 + 1] = temp_rgb0[i].y;
      out_row1[thread_idx * 12 + i*3 + 2] = temp_rgb0[i].z;
      out_row2[thread_idx * 12 + i*3 + 0] = temp_rgb1[i].x;
      out_row2[thread_idx * 12 + i*3 + 1] = temp_rgb1[i].y;
      out_row2[thread_idx * 12 + i*3 + 2] = temp_rgb1[i].z;
    } 
    __syncthreads();  // make the staged RGB bytes visible to all threads before the coalesced write-back

    thread_idx = threadIdx.x;
    int num_loops = float_out ? 6 : 3;
    int out_offset1 = h_id * 2 * width * num_loops + blockDim.x * blockIdx.x * num_loops;
    int out_offset2 = (h_id * 2 + 1) * width * num_loops + blockDim.x * blockIdx.x * num_loops;
    int sm_offset1 = threadIdx.y * blockDim.x * num_loops;
    int sm_offset2 = sm_offset1 + 128 * num_loops;
    int threads = blockDim.x * (blockIdx.x + 1) <= width ? blockDim.x : (width - blockDim.x * blockIdx.x);
    for (int i=0; i<num_loops; i++) {
      if (float_out) {
        // Float output path: widen each staged byte pair into a float2.
        uchar2 tmp1, tmp2;
        tmp1 = *((uchar2 *)out_sm + sm_offset1 + thread_idx);
        tmp2 = *((uchar2 *)out_sm + sm_offset2 + thread_idx);
        fp32_out[out_offset1 + thread_idx].x = tmp1.x;
        fp32_out[out_offset1 + thread_idx].y = tmp1.y;

        fp32_out[out_offset2 + thread_idx].x = tmp2.x;
        fp32_out[out_offset2 + thread_idx].y = tmp2.y;
      } else {
        // uint8 output path: each uint32_t store writes four packed RGB bytes.
        u8_out[out_offset1 + thread_idx] = out_sm[sm_offset1 + thread_idx];
        u8_out[out_offset2 + thread_idx] = out_sm[sm_offset2 + thread_idx];
      }
      thread_idx += threads;
    }
  }
}
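
Both kernels rely on a cvt2rgb<T> helper that the post does not include. The sketch below is a guess at its shape: it assumes a full-range BT.601 conversion with Q16 fixed-point coefficients, so the exact constants may differ from the original implementation.

// Hypothetical sketch of cvt2rgb<T>: convert one (Y, U, V) triple to packed RGB.
// Assumes full-range BT.601; not taken from the original post.
template <typename T>
__device__ __forceinline__ uchar4 cvt2rgb(T y, T u, T v) {
    int c = (int)y;
    int d = (int)u - 128;
    int e = (int)v - 128;
    // Q16 fixed-point approximation of:
    //   R = Y + 1.402 * (V - 128)
    //   G = Y - 0.344 * (U - 128) - 0.714 * (V - 128)
    //   B = Y + 1.772 * (U - 128)
    int r = c + ((91881 * e) >> 16);
    int g = c - ((22554 * d + 46802 * e) >> 16);
    int b = c + ((116130 * d) >> 16);
    uchar4 rgb;
    rgb.x = (uint8_t)min(max(r, 0), 255);
    rgb.y = (uint8_t)min(max(g, 0), 255);
    rgb.z = (uint8_t)min(max(b, 0), 255);
    rgb.w = 0;  // the kernels only use .x/.y/.z
    return rgb;
}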

Reference template function 2

template<bool IsYU12, bool float_out, typename OUT, bool plane>
__global__ void Yuv2rgb24_general(uint8_t *__restrict__ in, OUT *__restrict__ out, int32_t h, int w) {
    // Each block converts two consecutive image rows (2 * blockIdx.x and
    // 2 * blockIdx.x + 1), staging Y, chroma and the RGB result in dynamic shared memory.
    extern __shared__ uint8_t sm[];
    uint8_t *y = sm;
    uint8_t *out_tmp = NULL;
    int h_idx = blockIdx.x;
    // Stage two Y rows in shared memory; the helper returns the byte offset at which the data begins.
    int offset = global2share_copy(in + h_idx * w * 2, y, w * 2);
    y += offset;
    int16_t y1, y2, u1, v1;
    uint8_t *u, *v, *uv;
    if (IsYU12) {
        // Planar chroma: carve out shared regions for U, V and the RGB staging
        // buffer (each padded by 16 bytes for alignment), then stage half a row
        // of U and half a row of V.
        u = y + w * 2 / 8 * 8 + 16;
        v = u + w / 2 / 8 * 8 + 16;
        out_tmp = v + w / 2 / 8 * 8 + 16;
        offset = global2share_copy(in + w * h + h_idx * w / 2, u, w >> 1);
        u += offset;
        offset = global2share_copy(in + int(w * h * 1.25f) + h_idx * w / 2, v, w >> 1);  // V plane starts at 1.25 * w * h
        v += offset;
    } else {
        // Semi-planar chroma: a single interleaved UV region, then the RGB staging buffer.
        uv = y + w * 2 / 8 * 8 + 16;
        out_tmp = uv + w / 8 * 8 + 16;
        offset = global2share_copy(in + w * h + h_idx * w, uv, w);
        uv += offset;
    }
    __syncthreads();
    OUT *out_ptr = NULL;
    if (plane) {
        // Planar (CHW) output: out_ptr points into the first channel plane of the first row.
        out_ptr = out + (h_idx * 2 + 0) * w;
    } else {
        // Packed (HWC) output: 3 bytes per pixel.
        out_ptr = out + (h_idx * 2 + 0) * w * 3;
    }
    uint8_t front = 0;
    // For packed uint8 output, record how many bytes are needed to bring out_ptr up to
    // 8-byte alignment; the staged row is shifted by (8 - front) to match.
    if (!float_out && !plane) front = (8 - ((uint64_t)out_ptr & 7)) & 7;
    for (int i = threadIdx.x; i < w / 2; i += blockDim.x) {
        // Each iteration converts two horizontally adjacent pixels of the first row.
        if (IsYU12) {
            u1 = u[i];
            v1 = v[i];
        } else {
            u1 = uv[i * 2 + 0];
            v1 = uv[i * 2 + 1];
        }
        y1 = y[i * 2 + 0];
        y2 = y[i * 2 + 1];
        uchar4 a = cvt2rgb<int16_t>(y1,u1,v1);
        uchar4 b = cvt2rgb<int16_t>(y2,u1,v1);
        offset = i * 6;
        if (!float_out && !plane) offset += 8 - front;
        out_tmp[offset + 0] = a.x;
        out_tmp[offset + 1] = a.y;
        out_tmp[offset + 2] = a.z;
        out_tmp[offset + 3] = b.x;
        out_tmp[offset + 4] = b.y;
        out_tmp[offset + 5] = b.z;
    }
    __syncthreads();
    if (float_out) {
        // Float output: write the staged bytes out element by element (implicit widening).
        for (int i = threadIdx.x; i < w * 3; i += blockDim.x) {
            out_ptr[i] = out_tmp[i];
        }
    } else {
        if (plane) {
            // Planar uint8 output: de-interleave the staged RGB into three channel planes.
            for (int i = threadIdx.x; i < w; i += blockDim.x) {
                out_ptr[i] = out_tmp[i * 3 + 0];
                out_ptr[i + h * w] = out_tmp[i * 3 + 1];
                out_ptr[i + h * w * 2] = out_tmp[i * 3 + 2];
            }
        } else {
            // Packed uint8 output: vectorized copy from shared to global memory.
            share2global_copy(out_tmp, (uint8_t*)out_ptr, w * 3, front);
        }
    }
    __syncthreads();
    if (plane) {
        out_ptr = out + (h_idx * 2 + 1) * w;
    } else {
        out_ptr = out + (h_idx * 2 + 1) * w * 3;
    }
    // Recompute the alignment shift for the second output row.
    if (!float_out && !plane) front = (8 - ((uint64_t)out_ptr & 7)) & 7;
    for (int i = threadIdx.x; i < w / 2; i += blockDim.x) {
        // Same conversion for the second row; it reuses the same chroma samples.
        if (IsYU12) {
            u1 = u[i];
            v1 = v[i];
        } else {
            u1 = uv[i * 2 + 0];
            v1 = uv[i * 2 + 1];
        }
        y1 = y[w + i * 2 + 0];
        y2 = y[w + i * 2 + 1];
        uchar4 a = cvt2rgb<int16_t>(y1,u1,v1);
        uchar4 b = cvt2rgb<int16_t>(y2,u1,v1);
        offset = i * 6;
        if (!float_out && !plane) offset += 8 - front;
        out_tmp[offset + 0] = a.x;
        out_tmp[offset + 1] = a.y;
        out_tmp[offset + 2] = a.z;
        out_tmp[offset + 3] = b.x;
        out_tmp[offset + 4] = b.y;
        out_tmp[offset + 5] = b.z;
    }
    __syncthreads();
    if (float_out) {
        for (int i = threadIdx.x; i < w * 3; i += blockDim.x) {
            out_ptr[i] = out_tmp[i];
        }
    } else {
        if (plane) {
            for (int i = threadIdx.x; i < w; i += blockDim.x) {
                out_ptr[i] = out_tmp[i * 3 + 0];
                out_ptr[i + h * w] = out_tmp[i * 3 + 1];
                out_ptr[i + h * w * 2] = out_tmp[i * 3 + 2];
            }
        } else {
            share2global_copy(out_tmp, (uint8_t*)out_ptr, w * 3, front);
        }
    }
}
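
The general kernel also depends on global2share_copy and share2global_copy, which are likewise not shown in the post. The sketch below is hypothetical, with semantics inferred from the call sites: the first stages a row from global to shared memory and returns the byte offset at which the data begins (so 8-byte-aligned chunks of the source stay aligned in shared memory), and the second writes a staged row back to global memory, where front is the number of bytes needed to bring the destination pointer up to 8-byte alignment. A real implementation would use vectorized 64-bit transactions; plain byte loops are used here for clarity.

// Hypothetical sketches of the copy helpers; semantics inferred from how
// Yuv2rgb24_general calls them, not taken from the original post.

// Cooperatively copy n bytes from global src to shared dst. The data is placed
// at an offset equal to the 8-byte misalignment of src, so that aligned chunks
// of src map to aligned shared addresses; the caller advances its pointer by
// the returned offset (the 16-byte padding per region absorbs this shift).
__device__ int global2share_copy(const uint8_t *src, uint8_t *dst, int n) {
    int head = (int)((uint64_t)src & 7);
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        dst[head + i] = src[i];
    return head;
}

// Cooperatively copy n bytes from shared src back to global dst. The kernel
// stored the row starting at src + (8 - front), so that dst + front and
// src + 8 are both 8-byte aligned and the bulk of the copy could use 64-bit
// stores; this sketch performs the equivalent byte-wise copy.
__device__ void share2global_copy(const uint8_t *src, uint8_t *dst, int n, int front) {
    const uint8_t *p = src + (8 - front);
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        dst[i] = p[i];
}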

Origin blog.csdn.net/hh1357102/article/details/130617025