预处理优化——libjpeg-turbo imdecode、gpu resize、gpu subtract

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/cgt19910923/article/details/86655347

cmakelist编译opencv cuda、libjpeg-turbo库

https://blog.csdn.net/cgt19910923/article/details/86541471

编译完成后直接调用imdecode,解码速度即有提升。GPU处理涉及与CPU之间的数据交互,需要额外的upload、download操作,实际应用中必须把上传和下载时间计算在内。如果仅比较resize、subtract操作本身,GPU的处理性能有所提升。

#include <stdio.h>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>

#include <chrono>


using namespace cv;
using namespace std;
typedef std::chrono::high_resolution_clock Time;
typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
typedef std::chrono::duration<float> fsec;

int main(int argc, char **argv)
{
    //cpu gpu subtract
    #if 0
    Mat img1,img2;
    Mat img1_resize = Mat(Size(416,416), CV_8UC3);
    Mat img2_resize = Mat(Size(416,416), CV_8UC3);
	img1 = cv::imread("motor.jpg", 1);
    img2 = cv::imread("person.jpg", 1);

    resize(img1, img1_resize, Size(416,416), 0, 0, CV_INTER_LINEAR);
    resize(img2, img2_resize, Size(416,416), 0, 0, CV_INTER_LINEAR);

    imshow("resize1",img1_resize);
    waitKey(0);
    imshow("resize2",img2_resize);
    waitKey(0);

    Mat dst = Mat(Size(416,416), CV_8UC3);
    double cpusubtracttotal = 0.0;
    auto cpusubtract_startTime = Time::now();
    for(int i=0;i<1000;i++)
    {
    subtract(img1_resize,img2_resize,dst);
    }
    auto cpusubtract_endTime = Time::now();
    fsec fscpusubtract = cpusubtract_endTime - cpusubtract_startTime;
    ms d3 = std::chrono::duration_cast<ms>(fscpusubtract);
    cpusubtracttotal += d3.count();
    printf("Time cpusubtracttotal: %f\n", cpusubtracttotal);

    imshow("cpusubtract", dst);
    waitKey(0);

    cuda::GpuMat cuda_src1, cuda_src2;
    cuda::GpuMat cuda_dst;
    cuda_src1.upload(img1_resize);
    cuda_src2.upload(img1_resize);
    double gpusubtracttotal = 0.0;
    auto gpusubtract_startTime = Time::now();
    for(int i=0;i<1000;i++)
    {
    cuda::subtract(cuda_src1,cuda_src2,cuda_dst);
    }
    auto gpusubtract_endTime = Time::now();
    fsec fsgpusubtract = gpusubtract_endTime - gpusubtract_startTime;
    ms d4 = std::chrono::duration_cast<ms>(fsgpusubtract);
    gpusubtracttotal += d4.count();
    printf("Time gpusubtract: %f\n", gpusubtracttotal);
    Mat result;
    cuda_dst.download(result);
    imshow("gpusubtract", result);
    waitKey(0);
    return 0;
    #endif

    //cpu gpu encode decode
    #if 0
    const char fname[] = "motor.jpg";
    Mat image=imread(fname,1);
    if (image.empty())
	{
		printf("Can't load image %s\n", fname);
	}
    vector<unsigned char> inImage;

    double encodetotal = 0.0;
    auto encode_startTime = Time::now();
    for(int i=0;i<100;i++)
    {
        imencode(".jpg",image,inImage);
    }
    auto encode_endTime = Time::now();
    fsec fsencode = encode_endTime - encode_startTime;
    ms d2 = std::chrono::duration_cast<ms>(fsencode);
    encodetotal += d2.count();
    printf("Time encode: %f\n", encodetotal);

    size_t datalen=inImage.size();
    unsigned char *msgImage=new unsigned char[datalen];
    for(int i=0;i<datalen;i++)
    {
        msgImage[i]=inImage[i];
        //cout<<msgImage[i]<<endl;
    }

    vector<unsigned char> buff;
    for(int i=0;i<datalen;i++)
    {
        buff.push_back(msgImage[i]);
    }

    double decodetotal = 0.0;
    auto decode_startTime = Time::now();
    Mat show;
    for(int j=0;j<100;j++)
    {
        show=imdecode(buff,CV_LOAD_IMAGE_COLOR);
    }
    auto decode_endTime = Time::now();
    fsec fsdecode = decode_endTime - decode_startTime;
    ms d3 = std::chrono::duration_cast<ms>(fsdecode);
    decodetotal += d3.count();
    printf("Time imcode: %f\n", decodetotal);
    imshow("picture",show);

    cv::waitKey(0);
    #endif

    //cpu gpu resize
    #if 1
    Mat image;
    Mat cpu_resize = Mat(Size(416,416), CV_32FC3);
    const char fname[] = "motor.jpg";
	image = cv::imread(fname, 1);
    image.convertTo(image, CV_32FC3, 1/255.0);

	if (image.empty())
	{
		printf("Can't load image %s\n", fname);
	}
    Mat gpu_image;
    image.copyTo(gpu_image);
    double cputotal = 0.0;
    auto cpu_startTime = Time::now();
    for(int i=0; i<100; i++)
    {
        resize(image, cpu_resize, Size(416,416), 0, 0, CV_INTER_LINEAR);
    }

    auto cpu_endTime = Time::now();
    fsec fscpu = cpu_endTime - cpu_startTime;
    ms d = std::chrono::duration_cast<ms>(fscpu);
    cputotal += d.count();
    printf("Time CPU: %f\n", cputotal);
    imshow("Resize_cpu", cpu_resize);
    waitKey(0);

    cuda::Stream stream;
    cuda::GpuMat d_src =cuda::GpuMat(1920,1080,CV_32FC3);
    cuda::GpuMat gpu_resize=cuda::GpuMat(Size(416,416),CV_32FC3);
    double gputotal = 0.0;
    auto gpu_startTime = Time::now();
        for(int j=0; j<100; j++)
    {
        d_src.upload(gpu_image,stream);
        cuda::resize(d_src, gpu_resize, Size(416,416), 0, 0, CV_INTER_LINEAR,stream);
    }
    auto gpu_endTime = Time::now();
    fsec fsgpu = gpu_endTime - gpu_startTime;
    ms d1 = std::chrono::duration_cast<ms>(fsgpu);
    gputotal += d1.count();
    printf("Time GPU: %f\n", gputotal);
    Mat result;
    gpu_resize.download(result,stream);
    imshow("Resize_gpu", result);
    waitKey(0);
    #endif
    return 0;

}

P100服务器测试得:uchar型 cuda resize快1.9倍,float32型 cuda resize快18.9倍,cuda subtract快3.3倍;对比libjpeg、libjpeg-turbo 编解码,其中libjpeg-turbo解码快6.5倍,但是编码慢2.1倍。

操作

输入尺寸

输出尺寸

处理器

耗时(ms)

resize uchar

/

1080P

416*416

cpu

2.046

resize uchar

cuda

1080P

416*416

gpu

1.101

resize float32

/

1080P

416*416

cpu

6.094

resize float32

cuda

1080P

416*416

gpu

0.323

encode

libjpeg

1080P

1080P

cpu

11.941

encode

libjpeg-turbo

1080P

1080P

cpu

25.525

imdecode

libjpeg

1080P

1080P

cpu

79.643

imdecode

libjpeg-turbo

1080P

1080P

cpu

12.253

subtract

/

416*416

416*416

cpu

0.096

subtract

cuda

416*416

416*416

gpu

0.029

猜你喜欢

转载自blog.csdn.net/cgt19910923/article/details/86655347
GPU