创建工程

使用cmake创建工程，CMakeLists.txt如下：

# Builds the image_process executable from main.cu and links it against OpenCV.
cmake_minimum_required(VERSION 2.8)
project(image_process)
find_package(OpenCV REQUIRED)   # looks for FindXXX.cmake or XXXConfig.cmake, which return a set of variables
find_package(CUDA REQUIRED)     # REQUIRED makes configuration fail with an error if the package is not found
cuda_add_executable(image_process main.cu)
target_link_libraries(image_process ${OpenCV_LIBS})

疑点尚未解决：cuda_add_executable是如何指定调用NVCC进行编译的，如何用其他方式指定nvcc编译

编写代码

代码思路很简单，就是用cuda、cpu、cv::cvtColor都运行一遍彩色图转灰度图的算法，对比一下运行时间

cuda 程序

每一个thread处理一个像素，线程网格与线程块设置如下：

// 32 x 32 threads per block; the grid dimensions are rounded up (ceil-division)
// so that the blocks together cover at least imgwidth x imgheight threads.
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
        (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

kernel函数编写如下：

/**
 * Converts a packed 3-channel 8-bit image to 8-bit grayscale, one thread per pixel.
 *
 * Expects a 2D launch that supplies at least imgwidth threads in x (columns)
 * and imgheight threads in y (rows); surplus threads fall outside the bounds
 * check and do nothing. No shared memory is used.
 *
 * @param d_in      device buffer of imgheight * imgwidth pixels, row-major with
 *                  no row padding; .x/.y/.z are weighted as R/G/B
 * @param d_out     device buffer of imgheight * imgwidth bytes receiving the gray values
 * @param imgheight image height in pixels
 * @param imgwidth  image width in pixels
 */
__global__ void rgb2grayincuda(uchar3 * const d_in, unsigned char * const d_out, 
                                uint imgheight, uint imgwidth)
{
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < imgwidth && row < imgheight)
    {
        // Widen before multiplying: row * imgwidth can exceed 32 bits on very large images.
        const size_t offset = static_cast<size_t>(row) * imgwidth + col;
        const uchar3 rgb = d_in[offset];
        // Weighted sum of the three channels; the float -> unsigned char conversion truncates.
        d_out[offset] = static_cast<unsigned char>(0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z);
    }
}

kernel函数比较tricky的一点是，对于不能被线程块整除的情况，有一些线程是全程不工作的

测速时注意，要使用cudaDeviceSynchronize()函数来同步cpu和gpu，否则测出来的速度是cpu启动内核函数的速度

cpu 遍历函数

函数接口同kernel函数，使用指针遍历元素：

/**
 * CPU reference: converts a packed 3-channel 8-bit image to 8-bit grayscale.
 *
 * @param d_in      imgheight * imgwidth * 3 bytes, row-major with no row padding;
 *                  the three bytes of each pixel are weighted as R, G, B
 * @param d_out     imgheight * imgwidth bytes receiving the gray values
 * @param imgheight image height in pixels
 * @param imgwidth  image width in pixels
 */
void rgb2grayincpu(unsigned char * const d_in, unsigned char * const d_out,
                                uint imgheight, uint imgwidth)
{
    // size_t index: avoids the signed/unsigned loop comparison and keeps
    // pixel * 3 from wrapping in 32-bit arithmetic on very large images.
    const size_t pixelCount = static_cast<size_t>(imgheight) * imgwidth;
    for (size_t p = 0; p < pixelCount; ++p)
    {
        const unsigned char * const px = d_in + p * 3;
        // Weighted sum of the three channels; the float -> unsigned char conversion truncates.
        d_out[p] = static_cast<unsigned char>(0.299f * px[0] + 0.587f * px[1] + 0.114f * px[2]);
    }
}

*3那里坑了我不少时间，果然还是太年轻

测速结果

cuda	cpu	cv::cvtColor
0.00077100	0.00244700	0.09298100

发现cuda的耗时约为cpu的1/3，并没有想象中提速快，有可能是因为线程块设置的不合理导致的；
反倒是opencv的cvtColor函数，比cuda和cpu慢了一个数量级。

猜想：如果一个线程处理多个像素，兴许会快
疑点：为什么opencv的cvtColor会这么慢

源代码

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <time.h>
#include "opencv2/highgui.hpp"  // actually located under /usr/include
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;

#define PAUSE printf("Press Enter key to continue..."); fgetc(stdin);

/**
 * Converts a packed 3-channel 8-bit image to 8-bit grayscale, one thread per pixel.
 *
 * Expects a 2D launch that supplies at least imgwidth threads in x (columns)
 * and imgheight threads in y (rows); surplus threads fall outside the bounds
 * check and do nothing. No shared memory is used.
 *
 * @param d_in      device buffer of imgheight * imgwidth pixels, row-major with
 *                  no row padding; .x/.y/.z are weighted as R/G/B
 * @param d_out     device buffer of imgheight * imgwidth bytes receiving the gray values
 * @param imgheight image height in pixels
 * @param imgwidth  image width in pixels
 */
__global__ void rgb2grayincuda(uchar3 * const d_in, unsigned char * const d_out, 
                                uint imgheight, uint imgwidth)
{
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < imgwidth && row < imgheight)
    {
        // Widen before multiplying: row * imgwidth can exceed 32 bits on very large images.
        const size_t offset = static_cast<size_t>(row) * imgwidth + col;
        const uchar3 rgb = d_in[offset];
        // Weighted sum of the three channels; the float -> unsigned char conversion truncates.
        d_out[offset] = static_cast<unsigned char>(0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z);
    }
}

/**
 * CPU reference: converts a packed 3-channel 8-bit image to 8-bit grayscale.
 *
 * @param d_in      imgheight * imgwidth * 3 bytes, row-major with no row padding;
 *                  the three bytes of each pixel are weighted as R, G, B
 * @param d_out     imgheight * imgwidth bytes receiving the gray values
 * @param imgheight image height in pixels
 * @param imgwidth  image width in pixels
 */
void rgb2grayincpu(unsigned char * const d_in, unsigned char * const d_out,
                                uint imgheight, uint imgwidth)
{
    // size_t index: avoids the signed/unsigned loop comparison and keeps
    // pixel * 3 from wrapping in 32-bit arithmetic on very large images.
    const size_t pixelCount = static_cast<size_t>(imgheight) * imgwidth;
    for (size_t p = 0; p < pixelCount; ++p)
    {
        const unsigned char * const px = d_in + p * 3;
        // Weighted sum of the three channels; the float -> unsigned char conversion truncates.
        d_out[p] = static_cast<unsigned char>(0.299f * px[0] + 0.587f * px[1] + 0.114f * px[2]);
    }
}

// Prints the failing location and the CUDA error string, then exits, when a
// CUDA runtime call does not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        const cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/**
 * Loads ./test.jpg, converts it to grayscale three ways (CUDA kernel, plain CPU
 * loop, cv::cvtColor), prints the time each one takes and shows the result.
 *
 * @return 0 on success, EXIT_FAILURE if the image cannot be loaded
 *         (CUDA failures terminate the process through CUDA_CHECK)
 */
int main(void)
{
    Mat srcImage = imread("./test.jpg");
    if (srcImage.empty())
    {
        // imread() returns an empty Mat when the file is missing or unreadable
        fprintf(stderr, "failed to load ./test.jpg\n");
        return EXIT_FAILURE;
    }
    imshow("srcImage", srcImage);
    waitKey(0);

    // imread() delivers pixels in BGR order, while both rgb2gray routines weight
    // channel 0 as red. Convert once so the weights hit the right channels.
    Mat rgbImage;
    cvtColor(srcImage, rgbImage, CV_BGR2RGB);

    const uint imgheight = rgbImage.rows;
    const uint imgwidth = rgbImage.cols;
    // Widen before multiplying so the byte counts cannot wrap in 32-bit arithmetic.
    const size_t numPixels = static_cast<size_t>(imgheight) * imgwidth;
    const size_t inBytes = numPixels * sizeof(uchar3);
    const size_t outBytes = numPixels * sizeof(unsigned char);

    Mat grayImage(imgheight, imgwidth, CV_8UC1, Scalar(0));

    uchar3 *d_in = NULL;
    unsigned char *d_out = NULL;

    CUDA_CHECK(cudaMalloc((void**)&d_in, inBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out, outBytes));

    CUDA_CHECK(cudaMemcpy(d_in, rgbImage.data, inBytes, cudaMemcpyHostToDevice));

    // One thread per pixel; the grid is rounded up so it covers the whole image.
    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
        (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    clock_t start, end;
    start = clock();

    rgb2grayincuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
    // A launch returns no status itself: configuration errors show up here ...
    CUDA_CHECK(cudaGetLastError());
    // ... and execution errors at the synchronization point. The sync is also
    // required so the timer covers the kernel run, not just the launch.
    CUDA_CHECK(cudaDeviceSynchronize());
    end = clock();

    printf("cuda exec time is %.8f\n", (double)(end-start)/CLOCKS_PER_SEC);

    CUDA_CHECK(cudaMemcpy(grayImage.data, d_out, outBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    start = clock();

    rgb2grayincpu(rgbImage.data, grayImage.data, imgheight, imgwidth);

    end = clock();

    printf("cpu exec time is %.8f\n", (double)(end-start)/CLOCKS_PER_SEC);

    start = clock();
    cvtColor(srcImage, grayImage, CV_BGR2GRAY);

    end = clock();

    printf("opencv-cpu exec time is %.8f\n", (double)(end-start)/CLOCKS_PER_SEC);

    imshow("grayImage", grayImage);
    waitKey(0);

    return 0;

}

后记

opencv运行时间问题

经大神提示，cv::cvtColor()之所以慢的原因，可能是动态加载库的问题，即大部分时间花在了将函数加载进内存上面。用下面一段代码来验证猜想：

cvtColor(srcImage, grayImage, CV_BGR2GRAY);		// untimed first call so that cvtColor() is already dynamically loaded

start = clock();
cvtColor(srcImage, grayImage, CV_BGR2GRAY);

end = clock();

结果发现，cv::cvtColor()执行时间降到了0.00132400，比自己写的cpu程序要快，但是要略慢于gpu程序；看来确实是opencv动态加载函数造成的。

静态链接库与动态链接库的区别：静态链接库与动态链接库－－－－C/C++
静态链接库、动态链接库、动态加载库的区别：静态链接库、动态链接库和动态加载库

总结一下，库文件分为静态库文件(.lib/.a)和动态库文件(.dll/.so)两种，静态库文件是在编译可执行文件时，就和.o文件链接在一起了，可单独拷出来直接执行，也因此比较大；动态库文件则是在程序运行时才去加载，因此程序运行时依赖于库的存在。动态库文件有两种加载方式：一种是在程序开始时就全部加载，称为静态加载；另一种是需要用到哪些函数时再去加载，称为动态加载。

后续需要实验来试一下如何在程序里，或者编译时，指定将动态链接库静态加载

另一方面，我们发现我们cpu写的代码，不如opencv-cpu快，为此我看了一下opencv的源码，发现cv::cvtColor()定义在color.cpp下，根据编号调用了cvtColorBGR2Gray()函数，而该函数又调用了hal::cvtBGRtoGray()，一层层翻下去居然发现最后是“NOT_IMPLEMENTED”？后来查了下述博客，才发现这是OpenCV调用HAL库的一种机制——OpenCV默认提供了一批接口，没有写(所以是NOT_IMPLEMENTED)，需要各个硬件厂商写好后替换上去。当OpenCV调用一个函数时，如果有查到替换的函数则直接调用，否则就返回一个CV_HAL_ERROR_NOT_IMPLEMENTED。

OpenCV的HAL实施：OpenCv源码解析：对HAL硬件加速层的支持

所以结论就是，OpenCV的cvtColor是经过硬件加速的！怪不得比我们的cpu代码快

cuda练习（一）：使用cuda将rgb图像转为灰度图像