版权声明:涉猎过的知识都像是不断汇入大海的涓涓细流,你怎么知道是哪条汇入的溪流让海洋成为海洋呢【转载请注明出处】 https://blog.csdn.net/panda1234lee/article/details/87727523
由于cufftPlan*d 的使用比较简单,这里主要介绍 cufftPlanMany 函数的使用
// Demo of batched 2D FFTs with cufftPlanMany: each colour channel of an
// image is one batch element. Forward R2C stores only nCols/2+1 complex
// columns per row (Hermitian symmetry); the C2R round trip must be scaled
// by 1/(nRows*nCols) because cuFFT transforms are unnormalised.
// Requires gpuErrchk / cufftSafeCall macros defined elsewhere in the file.
int main() {
    // NOTE(review): the original path was "../images./beard.jpg"; the stray
    // '.' after "images" looks like a typo — confirm the directory name.
    cv::Mat src = cv::imread("../images/beard.jpg", 1);
    if (src.empty()) {
        fprintf(stderr, "failed to load input image\n");
        return EXIT_FAILURE;
    }
    // Normalise 8-bit BGR to [0,1] floats.
    src.convertTo(src, CV_32FC3, 1.f / 255);

    // Split into per-channel planes: cuFFT batching wants each 2D signal
    // laid out contiguously, one after another, in a single flat buffer.
    std::vector<cv::Mat> split;
    cv::split(src, split);

    cufftHandle forward_plan, inverse_plan;
    const int batch = src.channels();   // one transform per colour channel
    const int rank = 2;                 // 2D transforms
    const int nRows = src.rows;
    const int nCols = src.cols;
    int n[2] = { nRows, nCols };        // per-transform dimensions, slowest-varying first

    const int idist = nRows * nCols;            // distance between consecutive input signals
    const int odist = nRows * (nCols / 2 + 1);  // distance between consecutive output spectra
    int inembed[] = { nRows, nCols };
    int onembed[] = { nRows, nCols / 2 + 1 };
    const int istride = 1;              // elements within one signal are contiguous
    const int ostride = 1;

    // Pack the channel planes back-to-back in host memory. Planes produced
    // by cv::split on a freshly converted image are continuous, so a flat
    // memcpy per plane is safe. (Original hard-coded exactly 3 copies; this
    // loop works for any channel count.)
    float *h_in = (float*)malloc(sizeof(float) * idist * batch);
    for (int b = 0; b < batch; b++)
        memcpy(h_in + b * idist, (float*)split[b].data, sizeof(float) * idist);

    // Host buffer for the frequency-domain (forward DFT) result.
    float2 *h_freq = (float2*)malloc(sizeof(float2) * odist * batch);

    // Device buffers.
    float *d_in;
    gpuErrchk(cudaMalloc(&d_in, sizeof(float) * idist * batch));
    float2 *d_freq;
    gpuErrchk(cudaMalloc(&d_freq, sizeof(float2) * odist * batch));

    // Upload input: host -> device.
    gpuErrchk(cudaMemcpy(d_in, h_in, sizeof(float) * idist * batch, cudaMemcpyHostToDevice));

    // Plan and execute the batched forward DFT (real to complex).
    cufftSafeCall(cufftPlanMany(&forward_plan, rank, n, inembed, istride, idist,
                                onembed, ostride, odist, CUFFT_R2C, batch));
    cufftSafeCall(cufftExecR2C(forward_plan, d_in, d_freq));

    // Forward result: device -> host (cudaMemcpy blocks until the FFT done).
    gpuErrchk(cudaMemcpy(h_freq, d_freq, sizeof(float2) * odist * batch, cudaMemcpyDeviceToHost));

    // --------------- inspect the DFT result ---------------
    std::vector<cv::Mat> matWatches;
    for (int b = 0; b < batch; b++) {
        cv::Mat tmp(nRows, nCols / 2 + 1, CV_32FC2);
        for (int i = 0; i < nRows; i++) {
            for (int j = 0; j < nCols / 2 + 1; j++) {
                int ind = j + i * (nCols / 2 + 1) + b * odist;
                tmp.at<cv::Vec2f>(i, j)[0] = h_freq[ind].x;  // real part
                tmp.at<cv::Vec2f>(i, j)[1] = h_freq[ind].y;  // imaginary part
            }
        }
        matWatches.push_back(tmp);
    }

    // Plan and execute the batched inverse DFT (complex to real).
    // NOTE: cuFFT C2R transforms may overwrite their input (d_freq); that is
    // fine here because the spectrum was already copied out to h_freq above.
    cufftSafeCall(cufftPlanMany(&inverse_plan, rank, n, onembed, ostride, odist,
                                inembed, istride, idist, CUFFT_C2R, batch));
    cufftSafeCall(cufftExecC2R(inverse_plan, d_freq, d_in));

    // Inverse result: device -> host.
    gpuErrchk(cudaMemcpy(h_in, d_in, sizeof(float) * idist * batch, cudaMemcpyDeviceToHost));

    // --------------- inspect the IDFT result ---------------
    // R2C followed by C2R scales every sample by nRows*nCols (cuFFT does not
    // normalise), so divide it back out to recover the original image.
    matWatches.clear();
    for (int b = 0; b < batch; b++) {
        cv::Mat tmp(nRows, nCols, CV_32FC1);
        for (int i = 0; i < nRows; i++) {
            for (int j = 0; j < nCols; j++) {
                int ind = j + i * nCols + b * idist;
                tmp.at<float>(i, j) = h_in[ind] / (nCols * nRows);
            }
        }
        matWatches.push_back(tmp);
    }
    cv::Mat dst;
    cv::merge(matWatches, dst);

    cv::imshow("src", src);
    cv::imshow("dst", dst);
    cv::waitKey(0);

    // Release plans and memory (the original leaked all of these).
    cufftSafeCall(cufftDestroy(forward_plan));
    cufftSafeCall(cufftDestroy(inverse_plan));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_freq));
    free(h_in);
    free(h_freq);
    return EXIT_SUCCESS;
}
效果图
在我的 gtx970m 上, 640*640 的图像从头到尾(包括从磁盘读取图片,然后 host 和 device 之间的互相拷贝)的执行时间是将近 1s