我的CUDA学习之旅4——Sobel算子图像边缘检测CUDA实现

引言

关于图像边缘检测：记得刚开始接触图像处理时，我自己实现的第一个程序，就是通过笔记本摄像头采集图像，再利用OpenCV自带的算法库进行Canny算子边缘检测。当时看到程序运行后，视频窗口实时显示经Canny算子边缘分割后的图像，觉得十分有科技感。后来我慢慢开始自己编写边缘检测的源代码。本博客以Sobel算子为例，通过CUDA实现边缘检测。

任务要求

输入一张图片，将其转为灰度图后，通过CUDA在GPU中对图片进行Sobel算子边缘检测，最后将结果传回CPU并进行显示，要求输出图与在CPU上实现的结果一致。

实现思路

关于Sobel算子的边缘检测原理，可参考博客《Sobel边缘检测算法》。
由于检测的原理是分别在Gx和Gy两个方向上做卷积，故在CUDA实现时，我们需要正确索引到以目标像素点为中心的3×3邻域内各个元素的位置。由于图像从CPU端传给GPU时是一段一维连续的内存，这增大了索引的难度，故在block和grid的设计上，我把整张图像完整地映射到了grid中，每个thread对应一个像素，通过二维索引的方法准确地映射到一维内存。

实现环境

VS2013 + CUDA7.5 + Opencv2.4.13

实现代码

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <opencv2\opencv.hpp>
#include <iostream>
using namespace std;
using namespace cv;

// Sobel edge-detection kernel.
//
// Each thread handles the pixel at (xIndex, yIndex), derived from a 2D
// block/thread index. The image is addressed as a tightly packed row-major
// buffer: pixel (x, y) lives at y * imgWidth + x.
//
// Parameters:
//   dataIn    - input 8-bit single-channel image, imgHeight * imgWidth bytes.
//   dataOut   - output buffer of the same size.
//   imgHeight - number of rows.
//   imgWidth  - number of columns.
//
// Interior pixels receive (|Gx| + |Gy|) / 2. Border pixels, which have no full
// 3x3 neighbourhood, are explicitly set to 0 so that every in-image element of
// dataOut is defined after the kernel runs (the buffer may come from an
// allocation that was never cleared). Threads mapped outside the image do
// nothing.
//
// NOTE: the result is narrowed to unsigned char by implicit conversion, so
// values above 255 wrap around rather than saturate.
__global__ void sobelInCuda(unsigned char *dataIn, unsigned char *dataOut, int imgHeight, int imgWidth)
{
    int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads that fall outside the image must not touch memory.
    if (xIndex >= imgWidth || yIndex >= imgHeight)
    {
        return;
    }

    // size_t keeps the flat offset from overflowing int on very large images.
    const size_t width = (size_t)imgWidth;
    const size_t index = (size_t)yIndex * width + (size_t)xIndex;

    // Border pixels: no complete 3x3 window, so emit a defined value.
    if (xIndex == 0 || xIndex == imgWidth - 1 || yIndex == 0 || yIndex == imgHeight - 1)
    {
        dataOut[index] = 0;
        return;
    }

    // Pointers to the centre pixel's column in the row above, the current row
    // and the row below.
    const unsigned char *up = dataIn + index - width;
    const unsigned char *mid = dataIn + index;
    const unsigned char *down = dataIn + index + width;

    // Horizontal gradient: right column minus left column, weights 1-2-1.
    int Gx = (up[1] + 2 * mid[1] + down[1])
        - (up[-1] + 2 * mid[-1] + down[-1]);
    // Vertical gradient: top row minus bottom row, weights 1-2-1.
    int Gy = (up[-1] + 2 * up[0] + up[1])
        - (down[-1] + 2 * down[0] + down[1]);

    dataOut[index] = (abs(Gx) + abs(Gy)) / 2;
}

// CPU reference implementation of the Sobel edge detector.
//
// For every interior pixel (rows 1..imgHeight-2, columns 1..imgWidth-2) of
// srcImg, writes (|Gx| + |Gy|) / 2 into the same position of dstImg. Rows and
// columns on the image border are not written. Pixels are accessed as uchar
// through Mat::ptr, one row at a time.
void sobel(Mat srcImg, Mat dstImg, int imgHeight, int imgWidth)
{
    const int lastRow = imgHeight - 1;
    const int lastCol = imgWidth - 1;

    for (int row = 1; row < lastRow; ++row)
    {
        // Three source rows forming the 3x3 window, plus the destination row.
        const uchar *above = srcImg.ptr<uchar>(row - 1);
        const uchar *centre = srcImg.ptr<uchar>(row);
        const uchar *below = srcImg.ptr<uchar>(row + 1);
        uchar *result = dstImg.ptr<uchar>(row);

        for (int col = 1; col < lastCol; ++col)
        {
            const int left = col - 1;
            const int right = col + 1;

            // Horizontal gradient: right column minus left column.
            const int rightSum = above[right] + 2 * centre[right] + below[right];
            const int leftSum = above[left] + 2 * centre[left] + below[left];
            const int gradX = rightSum - leftSum;

            // Vertical gradient: top row minus bottom row.
            const int topSum = above[left] + 2 * above[col] + above[right];
            const int bottomSum = below[left] + 2 * below[col] + below[right];
            const int gradY = topSum - bottomSum;

            // Narrowed to uchar by conversion, same as a plain assignment.
            result[col] = static_cast<uchar>((abs(gradX) + abs(gradY)) / 2);
        }
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
    {
        return true;
    }
    cerr << what << " failed: " << cudaGetErrorString(err) << endl;
    return false;
}

// Loads "1.jpg" as a grayscale image, blurs it with a 3x3 Gaussian, then runs
// the Sobel detector once on the CPU (into dst) and once on the GPU (into
// dstImg). Returns 0 on success and 1 if the image cannot be loaded or any
// CUDA call fails.
int main()
{
    Mat grayImg = imread("1.jpg", 0);
    if (grayImg.empty())
    {
        // Without this check, a missing file would lead to zero-sized
        // allocations and an invalid kernel launch configuration.
        cerr << "Failed to load image 1.jpg" << endl;
        return 1;
    }

    int imgHeight = grayImg.rows;
    int imgWidth = grayImg.cols;

    Mat gaussImg;
    // Gaussian blur
    GaussianBlur(grayImg, gaussImg, Size(3, 3), 0, 0, BORDER_DEFAULT);

    // The device buffers are tightly packed (row stride == imgWidth), so the
    // host data copied with a single cudaMemcpy must be continuous as well.
    if (!gaussImg.isContinuous())
    {
        gaussImg = gaussImg.clone();
    }

    // Sobel on the CPU
    Mat dst(imgHeight, imgWidth, CV_8UC1, Scalar(0));
    sobel(gaussImg, dst, imgHeight, imgWidth);

    // Image that receives the CUDA result
    Mat dstImg(imgHeight, imgWidth, CV_8UC1, Scalar(0));

    // Compute the byte count in size_t to avoid int overflow on large images.
    const size_t imgBytes = (size_t)imgHeight * (size_t)imgWidth * sizeof(unsigned char);

    // Allocate GPU memory
    unsigned char *d_in = NULL;
    unsigned char *d_out = NULL;

    bool ok = cudaOk(cudaMalloc((void**)&d_in, imgBytes), "cudaMalloc(d_in)")
        && cudaOk(cudaMalloc((void**)&d_out, imgBytes), "cudaMalloc(d_out)")
        // Clear the output so pixels the kernel does not write are 0, matching
        // the zero-initialised CPU result instead of holding stale memory.
        && cudaOk(cudaMemset(d_out, 0, imgBytes), "cudaMemset(d_out)")
        // Copy the blurred image from the CPU to the GPU
        && cudaOk(cudaMemcpy(d_in, gaussImg.data, imgBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    if (ok)
    {
        // One thread per pixel; the grid is rounded up to cover the whole image.
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

        // Launch the kernel
        sobelInCuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth);

        // Launch-configuration errors are only visible through cudaGetLastError;
        // the blocking copy back to the host then surfaces execution errors.
        ok = cudaOk(cudaGetLastError(), "sobelInCuda launch")
            // Copy the result from the GPU back to the CPU
            && cudaOk(cudaMemcpy(dstImg.data, d_out, imgBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
    }

    // Release GPU memory (cudaFree accepts a null pointer)
    cudaFree(d_in);
    cudaFree(d_out);

    return ok ? 0 : 1;
}

实现结果

原图

CPU实现后图像

CUDA实现后图像

通过比对发现CUDA输出结果与CPU实现输出结果一致~