FFmpeg source code analysis: sws_scale image scaling and image conversion

FFmpeg provides image scaling and pixel format conversion in the libswscale module, for example scaling a 1080P image down to 720P, or converting YUV422P to YUV420P. The scaling functions use the SwsContext structure as their context, which was introduced in the previous article: SwsContext image conversion context.
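As a quick taste before diving in, here is a minimal sketch of creating such a context for exactly those two operations; the sizes and the SWS_BILINEAR flag are illustrative choices, not prescribed by libswscale:

#include <libswscale/swscale.h>

// A minimal sketch: create a context that scales 1920x1080 YUV422P
// down to 1280x720 YUV420P in a single pass.
static struct SwsContext *create_downscale_context(void)
{
    return sws_getContext(1920, 1080, AV_PIX_FMT_YUV422P,  // source size and format
                          1280,  720, AV_PIX_FMT_YUV420P,  // destination size and format
                          SWS_BILINEAR,                    // scaling algorithm
                          NULL, NULL, NULL);               // no extra filters or params
}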

1. Pixel format

Let's first look at the common pixel formats, located in libavutil/pixfmt.h, stored in the AVPixelFormat enumeration type:

enum AVPixelFormat {
    AV_PIX_FMT_NONE = -1,
    AV_PIX_FMT_YUV420P,   ///< planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
    AV_PIX_FMT_YUYV422,   ///< packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
    AV_PIX_FMT_RGB24,     ///< packed RGB 8:8:8, 24bpp, RGBRGB...
    AV_PIX_FMT_BGR24,     ///< packed RGB 8:8:8, 24bpp, BGRBGR...
    AV_PIX_FMT_YUV422P,   ///< planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
    AV_PIX_FMT_YUV444P,   ///< planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples)
    AV_PIX_FMT_YUV410P,   ///< planar YUV 4:1:0,  9bpp, (1 Cr & Cb sample per 4x4 Y samples)
    AV_PIX_FMT_YUV411P,   ///< planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples)
    AV_PIX_FMT_GRAY8,     ///<        Y        ,  8bpp
    AV_PIX_FMT_MONOWHITE, ///<        Y        ,  1bpp, 0 is white, 1 is black
    AV_PIX_FMT_MONOBLACK, ///<        Y        ,  1bpp, 0 is black, 1 is white
    AV_PIX_FMT_PAL8,      ///< 8 bits with AV_PIX_FMT_RGB32 palette
    AV_PIX_FMT_YUVJ420P,  ///< planar YUV 4:2:0, 12bpp, full scale (JPEG)
    AV_PIX_FMT_YUVJ422P,  ///< planar YUV 4:2:2, 16bpp, full scale (JPEG)
    AV_PIX_FMT_YUVJ444P,  ///< planar YUV 4:4:4, 24bpp, full scale (JPEG)
    AV_PIX_FMT_NV12,      ///< planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV
    AV_PIX_FMT_NV21,      ///< as above, but U and V bytes are swapped

    AV_PIX_FMT_ARGB,      ///< packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
    AV_PIX_FMT_RGBA,      ///< packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
    AV_PIX_FMT_ABGR,      ///< packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
    AV_PIX_FMT_BGRA,      ///< packed BGRA 8:8:8:8, 32bpp, BGRABGRA...

    AV_PIX_FMT_RGB565BE,  ///< packed RGB 5:6:5, 16bpp, big-endian
    AV_PIX_FMT_RGB565LE,  ///< packed RGB 5:6:5, 16bpp, little-endian
    AV_PIX_FMT_RGB555BE,  ///< packed RGB 5:5:5, 16bpp, big-endian
    AV_PIX_FMT_RGB555LE,  ///< packed RGB 5:5:5, 16bpp, little-endian
   
    AV_PIX_FMT_YUV420P10BE,///< planar YUV 4:2:0, 15bpp, big-endian
    AV_PIX_FMT_YUV420P10LE,///< planar YUV 4:2:0, 15bpp, little-endian
    AV_PIX_FMT_YUV422P10BE,///< planar YUV 4:2:2, 20bpp, big-endian
    AV_PIX_FMT_YUV422P10LE,///< planar YUV 4:2:2, 20bpp, little-endian
    AV_PIX_FMT_YUV444P10BE,///< planar YUV 4:4:4, 30bpp, big-endian
    AV_PIX_FMT_YUV444P10LE,///< planar YUV 4:4:4, 30bpp, little-endian

    AV_PIX_FMT_YUV420P12BE, ///< planar YUV 4:2:0,18bpp, big-endian
    AV_PIX_FMT_YUV420P12LE, ///< planar YUV 4:2:0,18bpp, little-endian
    AV_PIX_FMT_YUV422P12BE, ///< planar YUV 4:2:2,24bpp, big-endian
    AV_PIX_FMT_YUV422P12LE, ///< planar YUV 4:2:2,24bpp, little-endian
    AV_PIX_FMT_YUV444P12BE, ///< planar YUV 4:4:4,36bpp, big-endian
    AV_PIX_FMT_YUV444P12LE, ///< planar YUV 4:4:4,36bpp, little-endian

    AV_PIX_FMT_VIDEOTOOLBOX,///< hardware decoding through Videotoolbox
    AV_PIX_FMT_MEDIACODEC,  ///< hardware decoding through MediaCodec

    AV_PIX_FMT_NB         ///< number of pixel formats
};

As the code above shows, pixel formats fall into two main families: YUV and RGB. The YUV family includes YUV420P, YUV422P, and YUV444P; the RGB family includes RGBA, RGB24, BGR24, and so on, with the grayscale GRAY formats listed alongside them. Subdividing further, the YUV420P sub-family includes YUV420P10LE, YUV420P10BE, YUV420P12LE, YUV420P12BE, etc., and the RGB sub-family includes RGB24, RGB565LE, RGB565BE, RGB555LE, RGB555BE, etc.

Take YUV420P10LE as an example and break the name into its parts. 420P means 4:2:0 planar: one U/V pair is shared by four Y samples. 10 is the bit depth of each component; note that because of the 4:2:0 subsampling the format works out to 15bpp (bits per pixel) overall, as the enum comment above shows. LE stands for Little Endian, least significant byte first, as opposed to BE, Big Endian.
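The difference between component depth and bpp can be verified with the public pixdesc API; a small sketch:

#include <stdio.h>
#include <libavutil/pixdesc.h>

int main(void)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(AV_PIX_FMT_YUV420P10LE);
    // Prints "depth=10 bpp=15": each component holds 10 significant bits,
    // but 4:2:0 subsampling averages out to 15 bits per pixel.
    printf("depth=%d bpp=%d\n", desc->comp[0].depth, av_get_bits_per_pixel(desc));
    return 0;
}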

In addition, NV12 and NV21 are semi-planar 4:2:0 formats commonly seen on mobile platforms. Both store a full Y plane followed by a single interleaved chroma plane, with one U/V pair per four Y samples; the only difference is the byte order of that chroma plane. NV12 interleaves it as UVUV..., while NV21, the Android camera default also known as YUV420SP, interleaves it as VUVU....
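The semi-planar layout is easy to observe by allocating an NV12 image with the libavutil helpers; a minimal sketch, with arbitrary sizes:

#include <libavutil/imgutils.h>

// Sketch: allocate an NV12 image and inspect its two planes. For NV12 the
// chroma plane is UVUV...; for NV21 it would be VUVU..., otherwise identical.
static void nv12_layout_demo(void)
{
    uint8_t *data[4];
    int linesize[4];
    // One Y plane (w x h) plus one interleaved chroma plane (w x h/2): 12bpp total
    if (av_image_alloc(data, linesize, 640, 480, AV_PIX_FMT_NV12, 1) >= 0) {
        // data[0] is the Y plane, data[1] the interleaved UV plane;
        // data[2] and data[3] are NULL for this format
        av_freep(&data[0]);
    }
}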

2. sws_scale image conversion

Call the sws_scale() function for image conversion:

int attribute_align_arg sws_scale(struct SwsContext *c,
                                  const uint8_t * const srcSlice[],
                                  const int srcStride[], int srcSliceY,
                                  int srcSliceH, uint8_t *const dst[],
                                  const int dstStride[])
{
    /******************* parameter initialization ***********************/
    // handle cascaded contexts (set up when gamma correction is enabled)
    if (c->gamma_flag && c->cascaded_context[0]) {
        // run the first stage of the cascade
        ret = sws_scale(c->cascaded_context[0],
                    srcSlice, srcStride, srcSliceY, srcSliceH,
                    c->cascaded_tmp, c->cascaded_tmpStride);

        if (ret < 0)
            return ret;

        if (c->cascaded_context[2]) {
            ret = sws_scale(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp,
                            c->cascaded_tmpStride, srcSliceY, srcSliceH,
                            c->cascaded1_tmp, c->cascaded1_tmpStride);
        } else {
            ret = sws_scale(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp,
                            c->cascaded_tmpStride, srcSliceY, srcSliceH, dst, dstStride);
        }
        if (ret < 0)
            return ret;

        if (c->cascaded_context[2]) {
            ret = sws_scale(c->cascaded_context[2], (const uint8_t * const *)c->cascaded1_tmp,
                            c->cascaded1_tmpStride, c->cascaded_context[1]->dstY - ret,
                            c->cascaded_context[1]->dstY, dst, dstStride);
        }
        }
        return ret;
    }

    if (c->cascaded_context[0] && srcSliceY == 0 && srcSliceH == c->cascaded_context[0]->srcH) {
        ret = sws_scale(c->cascaded_context[0], srcSlice, srcStride, srcSliceY, srcSliceH, c->cascaded_tmp, c->cascaded_tmpStride);
        if (ret < 0)
            return ret;
        ret = sws_scale(c->cascaded_context[1],
                        (const uint8_t * const *)c->cascaded_tmp, c->cascaded_tmpStride, 0,
                        c->cascaded_context[0]->dstH, dst, dstStride);
        return ret;
    }

    memcpy(src2, srcSlice, sizeof(src2));
    memcpy(dst2, dst, sizeof(dst2));

    if (srcSliceH == 0)
        return 0;
    // validate the source and destination image pointers
    if (!check_image_pointers(srcSlice, c->srcFormat, srcStride)) {
        av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
        return 0;
    }
    if (!check_image_pointers((const uint8_t* const*)dst, c->dstFormat, dstStride)) {
        av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
        return 0;
    }

    ......

    // call the internal conversion function via the c->swscale pointer
    ret = c->swscale(c, src2, srcStride2, srcSliceY_internal, srcSliceH, dst2, dstStride2);

    ......
	
    return ret;
}
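In the common case the caller passes the whole frame as a single slice, i.e. srcSliceY = 0 and srcSliceH equal to the source height. A minimal sketch, assuming the context and both AVFrames are already set up to match:

#include <libavutil/frame.h>
#include <libswscale/swscale.h>

// Sketch: convert one whole decoded frame. 'ctx' must have been created for
// src's size/format and dst's size/format; dst's buffers must be allocated.
static int scale_frame(struct SwsContext *ctx, const AVFrame *src, AVFrame *dst)
{
    return sws_scale(ctx,
                     (const uint8_t * const *)src->data, src->linesize,
                     0, src->height,          // one slice covering the whole frame
                     dst->data, dst->linesize);
}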

sws_scale() eventually dispatches through the c->swscale pointer to the internal swscale() function, which performs the actual transformation:

static int swscale(SwsContext *c, const uint8_t *src[],
                   int srcStride[], int srcSliceY,
                   int srcSliceH, uint8_t *dst[], int dstStride[])
{
    /******************* parameter initialization ***********************/

    if (dstStride[0]&15 || dstStride[1]&15 ||
        dstStride[2]&15 || dstStride[3]&15) {
        static int warnedAlready = 0;
        // dstStride is not 16-byte aligned; aligned access is not possible
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            warnedAlready = 1;
        }
    }

    if (   (uintptr_t)dst[0]&15 || (uintptr_t)dst[1]&15 || (uintptr_t)dst[2]&15
        || (uintptr_t)src[0]&15 || (uintptr_t)src[1]&15 || (uintptr_t)src[2]&15
        || dstStride[0]&15 || dstStride[1]&15 || dstStride[2]&15 || dstStride[3]&15
        || srcStride[0]&15 || srcStride[1]&15 || srcStride[2]&15 || srcStride[3]&15
    ) {
        static int warnedAlready=0;
        int cpu_flags = av_get_cpu_flags();
        // data is not aligned, which may reduce CPU access speed
        if (HAVE_MMXEXT && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
            warnedAlready=1;
        }
    }

    // initialize the vertical scaling functions, then the source and destination slices
    ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
                   yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, c->use_mmx_vfilter);

    ff_init_slice_from_src(src_slice, (uint8_t**)src, srcStride, c->srcW,
            srcSliceY, srcSliceH, chrSrcSliceY, chrSrcSliceH, 1);

    ff_init_slice_from_src(vout_slice, (uint8_t**)dst, dstStride, c->dstW,
            dstY, dstH, dstY >> c->chrDstVSubSample,
            AV_CEIL_RSHIFT(dstH, c->chrDstVSubSample), 0);

    for (; dstY < dstH; dstY++) {

        // rotate the slice buffer
        ff_rotate_slice(hout_slice, lastPosY, lastCPosY);

        if (posY < lastLumSrcY + 1) {
            for (i = lumStart; i < lumEnd; ++i)
                desc[i].process(c, &desc[i], firstPosY, lastPosY - firstPosY + 1);
        }

        lastInLumBuf = lastLumSrcY;

        if (cPosY < lastChrSrcY + 1) {
            for (i = chrStart; i < chrEnd; ++i)
                desc[i].process(c, &desc[i], firstCPosY, lastCPosY - firstCPosY + 1);
        }

        lastInChrBuf = lastChrSrcY;

        if (!enough_lines)
            break;
		
        if (should_dither) {
            c->chrDither8 = ff_dither_8x8_128[chrDstY & 7];
            c->lumDither8 = ff_dither_8x8_128[dstY    & 7];
        }
        if (dstY >= dstH - 2) {
            // re-initialize the output functions for the last output lines
            ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
                                     &yuv2packed1, &yuv2packed2, &yuv2packedX, &yuv2anyX);
            use_mmx_vfilter= 0;
            ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
                           yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, use_mmx_vfilter);
        }

        {
            for (i = vStart; i < vEnd; ++i)
                desc[i].process(c, &desc[i], dstY, 1);
        }
    }
    if (isPlanar(dstFormat) && isALPHA(dstFormat) && !needAlpha) {
        int length = dstW;
        int height = dstY - lastDstY;

        if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
            fillPlane16(dst[3], dstStride[3], length, height, lastDstY,
                    1, desc->comp[3].depth,
                    isBE(dstFormat));
        } else if (is32BPS(dstFormat)) {
            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
            fillPlane32(dst[3], dstStride[3], length, height, lastDstY,
                    1, desc->comp[3].depth,
                    isBE(dstFormat), desc->flags & AV_PIX_FMT_FLAG_FLOAT);
        } else
            fillPlane(dst[3], dstStride[3], length, height, lastDstY, 255);
    }

    c->dstY         = dstY;
    c->lastInLumBuf = lastInLumBuf;
    c->lastInChrBuf = lastInChrBuf;

    return dstY - lastDstY;
}
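The &15 checks at the top of swscale() explain why destination buffers should be allocated with explicit alignment: unaligned pointers or strides still work, but the fastest SIMD paths need 16 bytes. A hedged sketch using the public av_frame_get_buffer() API:

#include <libavutil/frame.h>

// Sketch: allocate frame buffers with explicit alignment so the &15 checks
// above never trigger; 32 also covers AVX2 code paths.
static AVFrame *alloc_aligned_frame(int w, int h, enum AVPixelFormat fmt)
{
    AVFrame *frame = av_frame_alloc();
    if (!frame)
        return NULL;
    frame->format = fmt;
    frame->width  = w;
    frame->height = h;
    if (av_frame_get_buffer(frame, 32) < 0)
        av_frame_free(&frame);
    return frame;
}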

3. Image tools

The pixel-format helper predicates live in libswscale/swscale_internal.h, as inline functions that check for 16-bit depth, YUV, planar YUV, RGB, and "any RGB".

1. Determining 16-bit depth

is16BPS() checks whether the first component is stored with a depth of exactly 16 bits:

static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
    av_assert0(desc);
    return desc->comp[0].depth == 16;
}

2. Determining YUV

isYUV() checks that the RGB flag is not set and that nb_components is at least 2:

static av_always_inline int isYUV(enum AVPixelFormat pix_fmt)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
    av_assert0(desc);
    return !(desc->flags & AV_PIX_FMT_FLAG_RGB) && desc->nb_components >= 2;
}

3. Determining planar YUV

isPlanarYUV() checks that the PLANAR flag is set and that the format is YUV:

static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
    av_assert0(desc);
    return ((desc->flags & AV_PIX_FMT_FLAG_PLANAR) && isYUV(pix_fmt));
}

4. Determining RGB

isRGB() checks whether the RGB flag is set:

static av_always_inline int isRGB(enum AVPixelFormat pix_fmt)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
    av_assert0(desc);
    return (desc->flags & AV_PIX_FMT_FLAG_RGB);
}

5. Determining any RGB

isAnyRGB() checks whether the RGB flag is set, or whether pix_fmt is MONOBLACK or MONOWHITE:

static av_always_inline int isAnyRGB(enum AVPixelFormat pix_fmt)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
    av_assert0(desc);
    return (desc->flags & AV_PIX_FMT_FLAG_RGB) ||
            pix_fmt == AV_PIX_FMT_MONOBLACK || pix_fmt == AV_PIX_FMT_MONOWHITE;
}
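These predicates are internal to libswscale and not installed as public headers, but application code can express the same checks with the public pixdesc API. A sketch (is_planar_yuv is an illustrative name, not an FFmpeg function):

#include <libavutil/pixdesc.h>

// Public-API equivalent of isPlanarYUV(): planar flag set, RGB flag clear,
// and at least two components.
static int is_planar_yuv(enum AVPixelFormat fmt)
{
    const AVPixFmtDescriptor *d = av_pix_fmt_desc_get(fmt);
    return d && (d->flags & AV_PIX_FMT_FLAG_PLANAR) &&
           !(d->flags & AV_PIX_FMT_FLAG_RGB) && d->nb_components >= 2;
}

With this definition, AV_PIX_FMT_YUV420P and AV_PIX_FMT_NV12 both qualify as planar YUV, while the packed AV_PIX_FMT_YUYV422 does not.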

4. Image conversion example

The following helper converts a decoded frame to a specified output pixel format; in this sketch the target format and the destination buffer are passed in by the caller. It first checks whether a SwsContext already exists; if not, it fetches one from the cache; then it calls sws_scale() to convert the image:

int convertPixelFormat(AVCodecContext *context, AVFrame *frame,
                       enum AVPixelFormat outputFormat, uint8_t *data) {
    int outputLineSize[4];
    av_image_fill_linesizes(outputLineSize, outputFormat, frame->width);
    if (!context->opaque) {
        // fetch a conversion context from the cache (created on first use)
        struct SwsContext *cacheContext = sws_getCachedContext(NULL,
                frame->width, frame->height, (enum AVPixelFormat) frame->format,
                frame->width, frame->height, outputFormat,
                SWS_BICUBIC, NULL, NULL, NULL);
        context->opaque = cacheContext;
    }
    struct SwsContext *swsContext = context->opaque;
    uint8_t *dst_data[4];
    // derive per-plane destination pointers from the caller's contiguous buffer
    av_image_fill_pointers(dst_data,
                           outputFormat,
                           frame->height,
                           data,
                           outputLineSize);
    // perform the pixel format conversion
    return sws_scale(swsContext,
                     (const uint8_t **) frame->data,
                     frame->linesize,
                     0,
                     frame->height,
                     dst_data,
                     outputLineSize);
}
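A hypothetical usage sketch for the helper above; convert_one_frame, the YUV420P target, and the buffer sizing are illustrative assumptions, not part of the original example:

#include <libavutil/imgutils.h>
#include <libavutil/mem.h>

// Allocate a destination buffer of the right size, convert, then release it.
static void convert_one_frame(AVCodecContext *codecContext, AVFrame *frame)
{
    int size = av_image_get_buffer_size(AV_PIX_FMT_YUV420P,
                                        frame->width, frame->height, 1);
    uint8_t *data = size >= 0 ? av_malloc(size) : NULL;
    if (data) {
        convertPixelFormat(codecContext, frame, AV_PIX_FMT_YUV420P, data);
        // ... consume the converted image in 'data' ...
        av_free(data);
    }
}

// When the decoder is torn down, the cached context must be released as well:
// sws_freeContext((struct SwsContext *) codecContext->opaque);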
