预处理优化——libjpeg-turbo imdecode、GPU resize、GPU subtract

通过 CMakeLists 编译带 CUDA 支持的 OpenCV 以及 libjpeg-turbo 库，编译方法参考：

https://blog.csdn.net/cgt19910923/article/details/86541471

编译完成后直接调用 imdecode，解码速度即可得到提升。GPU 处理涉及与 CPU 之间的数据交互，需要额外的 upload、download 操作，实际应用中必须把上传和下载的时间计算在内；若仅比较 resize、subtract 运算本身，GPU 的处理性能有所提升。

#include <stdio.h>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>

#include <chrono>


using namespace cv;
using namespace std;
// Clock used for every wall-clock measurement in the benchmarks below.
using Time = std::chrono::high_resolution_clock;
// Duration expressed in milliseconds with a double-precision tick count.
using ms = std::chrono::duration<double, std::milli>;
// Duration expressed in seconds with a single-precision tick count.
using fsec = std::chrono::duration<float, std::ratio<1>>;

int main(int argc, char **argv)
{
    //cpu gpu subtract
    #if 0
    Mat img1,img2;
    Mat img1_resize = Mat(Size(416,416), CV_8UC3);
    Mat img2_resize = Mat(Size(416,416), CV_8UC3);
	img1 = cv::imread("motor.jpg", 1);
    img2 = cv::imread("person.jpg", 1);

    resize(img1, img1_resize, Size(416,416), 0, 0, CV_INTER_LINEAR);
    resize(img2, img2_resize, Size(416,416), 0, 0, CV_INTER_LINEAR);

    imshow("resize1",img1_resize);
    waitKey(0);
    imshow("resize2",img2_resize);
    waitKey(0);

    Mat dst = Mat(Size(416,416), CV_8UC3);
    double cpusubtracttotal = 0.0;
    auto cpusubtract_startTime = Time::now();
    for(int i=0;i<1000;i++)
    {
    subtract(img1_resize,img2_resize,dst);
    }
    auto cpusubtract_endTime = Time::now();
    fsec fscpusubtract = cpusubtract_endTime - cpusubtract_startTime;
    ms d3 = std::chrono::duration_cast<ms>(fscpusubtract);
    cpusubtracttotal += d3.count();
    printf("Time cpusubtracttotal: %f\n", cpusubtracttotal);

    imshow("cpusubtract", dst);
    waitKey(0);

    cuda::GpuMat cuda_src1, cuda_src2;
    cuda::GpuMat cuda_dst;
    cuda_src1.upload(img1_resize);
    cuda_src2.upload(img1_resize);
    double gpusubtracttotal = 0.0;
    auto gpusubtract_startTime = Time::now();
    for(int i=0;i<1000;i++)
    {
    cuda::subtract(cuda_src1,cuda_src2,cuda_dst);
    }
    auto gpusubtract_endTime = Time::now();
    fsec fsgpusubtract = gpusubtract_endTime - gpusubtract_startTime;
    ms d4 = std::chrono::duration_cast<ms>(fsgpusubtract);
    gpusubtracttotal += d4.count();
    printf("Time gpusubtract: %f\n", gpusubtracttotal);
    Mat result;
    cuda_dst.download(result);
    imshow("gpusubtract", result);
    waitKey(0);
    return 0;
    #endif

    //cpu gpu encode decode
    #if 0
    const char fname[] = "motor.jpg";
    Mat image=imread(fname,1);
    if (image.empty())
	{
		printf("Can't load image %s\n", fname);
	}
    vector<unsigned char> inImage;

    double encodetotal = 0.0;
    auto encode_startTime = Time::now();
    for(int i=0;i<100;i++)
    {
        imencode(".jpg",image,inImage);
    }
    auto encode_endTime = Time::now();
    fsec fsencode = encode_endTime - encode_startTime;
    ms d2 = std::chrono::duration_cast<ms>(fsencode);
    encodetotal += d2.count();
    printf("Time encode: %f\n", encodetotal);

    size_t datalen=inImage.size();
    unsigned char *msgImage=new unsigned char[datalen];
    for(int i=0;i<datalen;i++)
    {
        msgImage[i]=inImage[i];
        //cout<<msgImage[i]<<endl;
    }

    vector<unsigned char> buff;
    for(int i=0;i<datalen;i++)
    {
        buff.push_back(msgImage[i]);
    }

    double decodetotal = 0.0;
    auto decode_startTime = Time::now();
    Mat show;
    for(int j=0;j<100;j++)
    {
        show=imdecode(buff,CV_LOAD_IMAGE_COLOR);
    }
    auto decode_endTime = Time::now();
    fsec fsdecode = decode_endTime - decode_startTime;
    ms d3 = std::chrono::duration_cast<ms>(fsdecode);
    decodetotal += d3.count();
    printf("Time imcode: %f\n", decodetotal);
    imshow("picture",show);

    cv::waitKey(0);
    #endif

    //cpu gpu resize
    #if 1
    Mat image;
    Mat cpu_resize = Mat(Size(416,416), CV_32FC3);
    const char fname[] = "motor.jpg";
	image = cv::imread(fname, 1);
    image.convertTo(image, CV_32FC3, 1/255.0);

	if (image.empty())
	{
		printf("Can't load image %s\n", fname);
	}
    Mat gpu_image;
    image.copyTo(gpu_image);
    double cputotal = 0.0;
    auto cpu_startTime = Time::now();
    for(int i=0; i<100; i++)
    {
        resize(image, cpu_resize, Size(416,416), 0, 0, CV_INTER_LINEAR);
    }

    auto cpu_endTime = Time::now();
    fsec fscpu = cpu_endTime - cpu_startTime;
    ms d = std::chrono::duration_cast<ms>(fscpu);
    cputotal += d.count();
    printf("Time CPU: %f\n", cputotal);
    imshow("Resize_cpu", cpu_resize);
    waitKey(0);

    cuda::Stream stream;
    cuda::GpuMat d_src =cuda::GpuMat(1920,1080,CV_32FC3);
    cuda::GpuMat gpu_resize=cuda::GpuMat(Size(416,416),CV_32FC3);
    double gputotal = 0.0;
    auto gpu_startTime = Time::now();
        for(int j=0; j<100; j++)
    {
        d_src.upload(gpu_image,stream);
        cuda::resize(d_src, gpu_resize, Size(416,416), 0, 0, CV_INTER_LINEAR,stream);
    }
    auto gpu_endTime = Time::now();
    fsec fsgpu = gpu_endTime - gpu_startTime;
    ms d1 = std::chrono::duration_cast<ms>(fsgpu);
    gputotal += d1.count();
    printf("Time GPU: %f\n", gputotal);
    Mat result;
    gpu_resize.download(result,stream);
    imshow("Resize_gpu", result);
    waitKey(0);
    #endif
    return 0;

}

在 P100 服务器上测试得到：uchar 型 cuda resize 快 1.9 倍，float32 型 cuda resize 快 18.9 倍，cuda subtract 快 3.3 倍；对比 libjpeg 与 libjpeg-turbo 的编解码，libjpeg-turbo 解码快 6.5 倍，但编码慢 2.1 倍。（注：下表 float32 resize 两行的 cpu/gpu 耗时与此处"快 18.9 倍"的结论方向不一致，数据有待核对。）

操作	库	输入尺寸	输出尺寸	处理器	耗时(ms)
resize uchar	/	1080P	416*416	cpu	2.046
resize uchar	cuda	1080P	416*416	gpu	1.101
resize float32	/	1080P	416*416	cpu	0.323
resize float32	cuda	1080P	416*416	gpu	6.094
encode	libjpeg	1080P	1080P	cpu	11.941
encode	libjpeg-turbo	1080P	1080P	cpu	25.525
imdecode	libjpeg	1080P	1080P	cpu	79.643
imdecode	libjpeg-turbo	1080P	1080P	cpu	12.253
subtract	/	416*416	416*416	cpu	0.096
subtract	cuda	416*416	416*416	gpu	0.029

预处理优化——libjpeg-turbo imdecode、GPU resize、GPU subtract

猜你喜欢