An Optimized, Accelerated Guided Filter [CUDA] [OpenCV]

For the underlying theory, see 白马负金羁's post 《导向滤波(Guided Filter)的解析与实现》 (an analysis and implementation of the Guided Filter), which explains it very clearly and thoroughly.

My results are shown below.

Original image (894×1080; the screenshot is truncated because my laptop screen is not big enough)

Single-channel guided filtering

1. Runtime, averaged over 100 runs, of the CPU Guided Filter and the Fast Guided Filter (subsampling factor 2: Kaiming He got a new algorithm simply by adding a resize step; that is what a master does). A minimal CPU sketch of the fast variant is given right below:
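To make the comparison concrete, here is a minimal sketch of the CPU Fast Guided Filter for single-channel float images: run the usual box-filter guided filter on inputs subsampled by a factor s, then upsample the smoothed coefficients back to full resolution. This is an illustration of the idea rather than the exact code I benchmarked, and the function and variable names exist only for this sketch.

#include <algorithm>
#include <opencv2/opencv.hpp>

// normalized box filter == local mean over a (2r+1) x (2r+1) window
static cv::Mat boxMean(const cv::Mat& src, int r)
{
    cv::Mat dst;
    cv::boxFilter(src, dst, CV_32F, cv::Size(2 * r + 1, 2 * r + 1));
    return dst;
}

// I: guide image, p: input image (both CV_32FC1 in [0, 1]),
// r: window radius, eps: regularization, s: subsampling factor
cv::Mat fastGuidedFilter(const cv::Mat& I, const cv::Mat& p, int r, float eps, int s)
{
    cv::Mat Isub, psub;
    cv::resize(I, Isub, cv::Size(), 1.0 / s, 1.0 / s, cv::INTER_NEAREST);
    cv::resize(p, psub, cv::Size(), 1.0 / s, 1.0 / s, cv::INTER_NEAREST);
    const int rsub = std::max(1, r / s); // the window shrinks with the image

    cv::Mat meanI  = boxMean(Isub, rsub);
    cv::Mat meanP  = boxMean(psub, rsub);
    cv::Mat meanII = boxMean(Isub.mul(Isub), rsub);
    cv::Mat meanIP = boxMean(Isub.mul(psub), rsub);

    cv::Mat varI  = meanII - meanI.mul(meanI);   // local variance of I
    cv::Mat covIP = meanIP - meanI.mul(meanP);   // local covariance of I and p

    cv::Mat a = covIP / (varI + eps);
    cv::Mat b = meanP - a.mul(meanI);

    // smooth the coefficients, upsample them, and apply at full resolution
    cv::Mat meanA, meanB;
    cv::resize(boxMean(a, rsub), meanA, I.size(), 0, 0, cv::INTER_LINEAR);
    cv::resize(boxMean(b, rsub), meanB, I.size(), 0, 0, cv::INTER_LINEAR);
    return meanA.mul(I) + meanB;
}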

2. Runtime of a Guided Filter built on the OpenCV CUDA modules (with multi-stream overlap):

As the numbers show, OpenCV's stock CUDA functions still leave considerable room for optimization. Because allocating device memory is quite expensive, GPU buffers are normally reused (for example across the frames of a video stream), so the time above is the average of 100 Guided Filter runs. A small sketch of the multi-stream pattern with OpenCV's CUDA API follows.
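For reference, the multi-stream pattern with OpenCV's stock CUDA arithmetic looks like this: the three independent element-wise products are issued on their own cv::cuda::Stream objects and only synchronized once all of them have been launched. This is a sketch of the pattern rather than the exact code behind the timing above; the remaining stages (box means, the coefficients a and b, the final blend) follow the same structure as the full listing further down.

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>

// Launch the three independent element-wise products on separate streams so they
// can overlap, then wait for all of them before the next stage.
void productsMultiStream(const cv::cuda::GpuMat& I, const cv::cuda::GpuMat& P,
                         cv::cuda::GpuMat& II, cv::cuda::GpuMat& PP, cv::cuda::GpuMat& IP)
{
    cv::cuda::Stream s0, s1, s2;                 // each wraps its own cudaStream_t
    cv::cuda::multiply(I, I, II, 1.0, -1, s0);   // I*I on stream 0
    cv::cuda::multiply(P, P, PP, 1.0, -1, s1);   // P*P on stream 1
    cv::cuda::multiply(I, P, IP, 1.0, -1, s2);   // I*P on stream 2
    s0.waitForCompletion();                      // same role as cudaStreamSynchronize
    s1.waitForCompletion();
    s2.waitForCompletion();
}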

3. Runtime of my CUDA-accelerated Guided Filter:

As above, the device-memory allocation time is recorded only once, while the Guided Filter time is the average of 100 runs. Allocating device memory clearly takes a lot of time, so if only a few images need to be processed the GPU has no real advantage and the CPU Fast Guided Filter is the better fit.

If the guide image and the input image are the same image, the computation can be trimmed even further by avoiding the duplicated work, as the formulas below show.
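Concretely, with I = p the guided-filter coefficients collapse to

a = var_I / (var_I + eps)
b = (1 - a) * mean_I

because cov_Ip = var_I and mean_p = mean_I, so the second upload, the P-related products (P*P, I*P) and their integral images and box means can all be dropped from the listing below.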

Part of the code is listed below:

#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/core/cuda.hpp>
#include <cuda_runtime.h>

using namespace std;
using namespace cv;

// The Timer class, checkCudaErrors and the custom CUDA kernels used below
// (mul, rowAdd, colAdd, boxFilter, calcAB, calcAMultiplyIPlusB) are declared
// in my own headers; see the note at the end of the post for the full source.

int main()
{
    if (cuda::getCudaEnabledDeviceCount() == 0)
    {
        cerr << "This OpenCV build was compiled without CUDA support" << endl;
        return -1;
    }

    cv::Mat srcI = cv::imread("./lady.jpg", 0);
    cv::Mat srcP = cv::imread("./lady.jpg", 0);

    // scale the images to a height of 1080 while preserving the aspect ratio
    const int cols = static_cast<int>(1080.f / srcI.rows * srcI.cols);
    const int rows = 1080;

    cv::Mat origI, origP;
    cv::resize(srcI, origI, cv::Size(cols, rows));
    cv::resize(srcP, origP, cv::Size(cols, rows));

    Timer t;

    cudaStream_t stream[5];
    for (int i = 0; i < 5; ++i)
        cudaStreamCreate(&stream[i]);

    // I
    cuda::GpuMat devOrigI(rows, cols, CV_8UC1);
    cuda::GpuMat devIntegralI(rows, cols, CV_32FC1);
    cuda::GpuMat devCountI(rows, cols, CV_32SC1, cv::Scalar(1));
    cuda::GpuMat devMeanI(rows, cols, CV_32FC1);

    // P
    cuda::GpuMat devOrigP(rows, cols, CV_8UC1);
    cuda::GpuMat devIntegralP(rows, cols, CV_32FC1);
    cuda::GpuMat devMeanP(rows, cols, CV_32FC1);

    // I*I
    cuda::GpuMat devIntegralII(rows, cols, CV_32FC1);
    cuda::GpuMat devMeanII(rows, cols, CV_32FC1);

    // P*P
    cuda::GpuMat devIntegralPP(rows, cols, CV_32FC1);
    cuda::GpuMat devMeanPP(rows, cols, CV_32FC1);

    // I*P
    cuda::GpuMat devIntegralIP(rows, cols, CV_32FC1);
    cuda::GpuMat devMeanIP(rows, cols, CV_32FC1);

    cuda::GpuMat devA(rows, cols, CV_32FC1);
    cuda::GpuMat devMeanA(rows, cols, CV_32FC1);

    cuda::GpuMat devB(rows, cols, CV_32FC1);
    cuda::GpuMat devMeanB(rows, cols, CV_32FC1);

    std::cout << "Mem init(ms): " << t.elapsed() << std::endl;

    const int N = 100; // 100
    int count = N;
    const int radius = 2 * 10 + 1; // filter window radius
    const float eps = 1e-2;

    t.restart();
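    // Each pass of the loop below is one full guided filter:
    //   1. upload I and P and convert them to float in [0, 1]
    //   2. element-wise products I*I, P*P, I*P on separate streams
    //   3. integral images via row-wise then column-wise prefix sums (rowAdd / colAdd)
    //   4. local means of I, P, I*I, I*P, P*P from the integral images (boxFilter)
    //   5. the guided-filter coefficients a = cov_IP / (var_I + eps), b = mean_P - a * mean_I (calcAB)
    //   6. local means of a and b, then the output q = mean_a * I + mean_b (calcAMultiplyIPlusB)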

    do
    {
        devOrigI.upload(origI); // could be made asynchronous (see the note after the listing)
        devOrigP.upload(origP);
        devOrigI.convertTo(devIntegralI, CV_32FC1, 1.f / 255);
        devOrigP.convertTo(devIntegralP, CV_32FC1, 1.f / 255);
        devCountI.setTo(cv::Scalar(1)); // reset to 1 each pass (rowAdd/colAdd turn it into an integral image); drop this if there is no loop

        // ----------------------------------------------------------------
        // I*I P*P I*P
        mul(devIntegralI, devIntegralI, devIntegralII, stream[0]);
        mul(devIntegralP, devIntegralP, devIntegralPP, stream[1]);
        mul(devIntegralI, devIntegralP, devIntegralIP, stream[2]);

        //cudaStreamSynchronize()  // synchronize one stream: wait for all commands on that stream to finish
        //cudaDeviceSynchronize()  // synchronize the whole device: wait for the commands on every stream to finish

        checkCudaErrors(cudaStreamSynchronize(stream[0]));
        checkCudaErrors(cudaStreamSynchronize(stream[1]));
        checkCudaErrors(cudaStreamSynchronize(stream[2]));

        // ----------------------------------------------------------------
        // integral images of I, P, I*I, I*P, P*P
        rowAdd(devIntegralI, devCountI, stream[0]);
        rowAdd(devIntegralP, stream[1]);
        rowAdd(devIntegralII, stream[2]);
        rowAdd(devIntegralIP, stream[3]);
        rowAdd(devIntegralPP, stream[4]);

        checkCudaErrors(cudaStreamSynchronize(stream[0]));
        checkCudaErrors(cudaStreamSynchronize(stream[1]));
        checkCudaErrors(cudaStreamSynchronize(stream[2]));
        checkCudaErrors(cudaStreamSynchronize(stream[3]));
        checkCudaErrors(cudaStreamSynchronize(stream[4]));

        colAdd(devIntegralI, devCountI, stream[0]);
        colAdd(devIntegralP, stream[1]);
        colAdd(devIntegralII, stream[2]);
        colAdd(devIntegralIP, stream[3]);
        colAdd(devIntegralPP, stream[4]);

        checkCudaErrors(cudaStreamSynchronize(stream[0]));
        checkCudaErrors(cudaStreamSynchronize(stream[1]));
        checkCudaErrors(cudaStreamSynchronize(stream[2]));
        checkCudaErrors(cudaStreamSynchronize(stream[3]));
        checkCudaErrors(cudaStreamSynchronize(stream[4]));

        // ----------------------------------------------------------------
        // local means of I, P, I*I, I*P, P*P
        boxFilter<float>(devMeanI, devIntegralI, devCountI, radius, stream[0]);
        boxFilter<float>(devMeanP, devIntegralP, devCountI, radius, stream[1]);
        boxFilter<float>(devMeanII, devIntegralII, devCountI, radius, stream[2]);
        boxFilter<float>(devMeanIP, devIntegralIP, devCountI, radius, stream[3]);
        boxFilter<float>(devMeanPP, devIntegralPP, devCountI, radius, stream[4]);

        checkCudaErrors(cudaStreamSynchronize(stream[0]));
        checkCudaErrors(cudaStreamSynchronize(stream[1]));
        checkCudaErrors(cudaStreamSynchronize(stream[2]));
        checkCudaErrors(cudaStreamSynchronize(stream[3]));
        checkCudaErrors(cudaStreamSynchronize(stream[4]));

        // ----------------------------------------------------------------
        // compute the coefficients a and b
        calcAB(devMeanI, devMeanP, devMeanII, devMeanIP, eps, devA, devB);
        checkCudaErrors(cudaDeviceSynchronize());
        // ----------------------------------------------------------------
        // compute the local means of a and b
        rowAdd(devA, stream[0]);
        rowAdd(devB, stream[1]);
        checkCudaErrors(cudaStreamSynchronize(stream[0]));
        checkCudaErrors(cudaStreamSynchronize(stream[1]));

        colAdd(devA, stream[0]);
        colAdd(devB, stream[1]);
        checkCudaErrors(cudaStreamSynchronize(stream[0]));
        checkCudaErrors(cudaStreamSynchronize(stream[1]));

        boxFilter<float>(devMeanA, devA, devCountI, radius, stream[0]);
        boxFilter<float>(devMeanB, devB, devCountI, radius, stream[1]);
        checkCudaErrors(cudaStreamSynchronize(stream[0]));
        checkCudaErrors(cudaStreamSynchronize(stream[1]));

        // ----------------------------------------------------------------
        // a * I + b
        calcAMultiplyIPlusB(devMeanA, devMeanB, devOrigI, devOrigP); // reuse devOrigP as the output buffer
        checkCudaErrors(cudaDeviceSynchronize());
        // ----------------------------------------------------------------
    }
    while (count--);
    // ----------------------------------------------------------------

    std::cout << "Guided Filter(ms): " << t.elapsed() / (N + 1) << std::endl; // the do-while above runs N + 1 iterations

    // download the result from device memory to host memory
    Mat retI;
    devOrigP.download(retI);

    // display
    imshow("ResultI", retI);
    imshow("OrigI", origI);
    waitKey(0);

    return 0;
}
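About the "could be made asynchronous" comment on the uploads: one option is to stage the host images in page-locked (pinned) memory and issue the copies on a CUDA stream so they can overlap with other work. The sketch below is only an illustration of that idea; the AsyncUploader name is mine, and cv::cuda::StreamAccessor::getStream() can expose the raw cudaStream_t if the custom kernels need it.

#include <opencv2/core/cuda.hpp>

// Pinned host staging buffer plus a dedicated upload stream, allocated once and reused.
struct AsyncUploader
{
    cv::cuda::HostMem pinned;   // page-locked host memory, required for truly asynchronous copies
    cv::cuda::Stream  stream;   // copies issued here can overlap with work on other streams

    AsyncUploader(int rows, int cols, int type)
        : pinned(rows, cols, type, cv::cuda::HostMem::PAGE_LOCKED) {}

    void upload(const cv::Mat& src, cv::cuda::GpuMat& dst)
    {
        src.copyTo(pinned.createMatHeader()); // stage the frame in pinned memory
        dst.upload(pinned, stream);           // asynchronous host-to-device copy
    }
};

// Usage, replacing devOrigI.upload(origI) in the loop:
//   AsyncUploader upI(rows, cols, CV_8UC1);
//   upI.upload(origI, devOrigI);
//   ... issue independent work, then call upI.stream.waitForCompletion() before devOrigI is read.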

PS: If you need the full source code, please send me a private message.

Source: blog.csdn.net/panda1234lee/article/details/88568440