A cuFFT Usage Example


Prerequisites

Official documentation: https://docs.nvidia.com/cuda/cufft/

Since the cufftPlan*d functions (cufftPlan1d/2d/3d) are straightforward to use, this post focuses on the cufftPlanMany function.
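For reference, cufftPlanMany is declared in cufft.h as follows; the comments summarize what each argument means for the batched 2-D transforms used in this example.

cufftResult cufftPlanMany(cufftHandle *plan,
                          int rank,       // dimensionality of each transform (2 for images)
                          int *n,         // size of each dimension, e.g. { nRows, nCols }
                          int *inembed,   // storage dimensions of the input data
                          int istride,    // stride between successive input elements
                          int idist,      // distance between the first elements of two consecutive input batches
                          int *onembed,   // storage dimensions of the output data
                          int ostride,    // stride between successive output elements
                          int odist,      // distance between the first elements of two consecutive output batches
                          cufftType type, // transform type, e.g. CUFFT_R2C or CUFFT_C2R
                          int batch);     // number of transforms of size n to perform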

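The code below relies on two error-checking wrappers, gpuErrchk and cufftSafeCall, whose definitions are not shown in the original post. A minimal sketch of such helpers, together with the headers the example needs, might look like this:

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <cufft.h>

// Abort with a readable message if a CUDA runtime call fails
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
	if (code != cudaSuccess)
	{
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		exit((int)code);
	}
}

// Abort if a cuFFT call does not return CUFFT_SUCCESS
#define cufftSafeCall(err) { cufftAssert((err), __FILE__, __LINE__); }
inline void cufftAssert(cufftResult err, const char *file, int line)
{
	if (err != CUFFT_SUCCESS)
	{
		fprintf(stderr, "cufftSafeCall failed with error code %d at %s:%d\n", (int)err, file, line);
		exit(EXIT_FAILURE);
	}
}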
int main() {

	cv::Mat src = cv::imread("../images/beard.jpg", 1);   // load as a 3-channel BGR image
	src.convertTo(src, CV_32FC3, 1.f / 255);               // convert to float in [0, 1]
	std::vector<cv::Mat> split;
	cv::split(src, split);                                 // separate the B, G, R planes

	cufftHandle forward_plan, inverse_plan;

	int batch = src.channels();   // one 2-D transform per color channel
	int rank = 2;                 // dimensionality of each transform

	int nRows = src.rows;
	int nCols = src.cols;
	int n[2] = { nRows, nCols };  // size of each transform

	// Distance (in elements) between the first elements of two consecutive batches.
	// For an R2C transform, an nRows x nCols real input produces an
	// nRows x (nCols/2 + 1) complex output (Hermitian symmetry).
	int idist = nRows*nCols;
	int odist = nRows*(nCols / 2 + 1);

	// Storage dimensions of the input and output data in memory
	int inembed[] = { nRows, nCols };
	int onembed[] = { nRows, (nCols / 2 + 1) };

	// Consecutive elements of a row are contiguous
	int istride = 1;
	int ostride = 1;

	// Initialize the host-side input: pack the three channel planes back to back
	float *h_in = (float*)malloc(sizeof(float)*nRows*nCols*batch);

	memcpy(h_in, (float*)split[0].data, sizeof(float)*nRows*nCols);
	memcpy(h_in + nRows*nCols, (float*)split[1].data, sizeof(float)*nRows*nCols);
	memcpy(h_in + 2 * nRows*nCols, (float*)split[2].data, sizeof(float)*nRows*nCols);
		
	// Allocate host memory for the DFT results
	float2* h_freq = (float2*)malloc(sizeof(float2)*nRows*(nCols / 2 + 1)*batch);

	// Allocate device memory
	float* d_in;
	gpuErrchk(cudaMalloc(&d_in, sizeof(float)*nRows*nCols*batch));
	float2* d_freq;
	gpuErrchk(cudaMalloc(&d_freq, sizeof(float2)*nRows*(nCols / 2 + 1)*batch));

	// Copy the input from host to device
	gpuErrchk(cudaMemcpy(d_in, h_in, sizeof(float)*nRows*nCols*batch, cudaMemcpyHostToDevice));

	// Create the forward (R2C) plan
	cufftSafeCall(cufftPlanMany(&forward_plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch));
	// Execute the forward DFT
	cufftSafeCall(cufftExecR2C(forward_plan, d_in, d_freq));

	// Copy the DFT results from device to host
	gpuErrchk(cudaMemcpy(h_freq, d_freq, sizeof(float2)*nRows*(nCols / 2 + 1)*batch, cudaMemcpyDeviceToHost));

	// --------------- Inspect the DFT results --------------- 
	std::vector<cv::Mat> matWatches;
	for (int b = 0; b < batch; b++)
	{
		cv::Mat tmp(nRows, nCols / 2 + 1, CV_32FC2);
		for (int i = 0; i < nRows; i++)
		{
			for (int j = 0; j < (nCols / 2 + 1); j++)
			{
				int ind = j + i*(nCols / 2 + 1) + b * (nRows*(nCols / 2 + 1));
				tmp.at<cv::Vec2f>(i, j)[0] = h_freq[ind].x;
				tmp.at<cv::Vec2f>(i, j)[1] = h_freq[ind].y;
			}
		}
		matWatches.push_back(tmp);
	}

	// Create the inverse (C2R) plan
	cufftSafeCall(cufftPlanMany(&inverse_plan, rank, n, onembed, ostride, odist, inembed, istride, idist, CUFFT_C2R, batch));
	// Execute the inverse DFT
	cufftSafeCall(cufftExecC2R(inverse_plan, d_freq, d_in));

	// Copy the IDFT results from device to host 
	gpuErrchk(cudaMemcpy(h_in, d_in, sizeof(float)*nRows*nCols*batch, cudaMemcpyDeviceToHost));

	// --------------- Inspect the IDFT results --------------- 
	matWatches.clear();
	for (int b = 0; b < batch; b++)
	{
		cv::Mat tmp(nRows, nCols, CV_32FC1);
		for (int i = 0; i < nRows; i++)
		{
			for (int j = 0; j < nCols; j++)
			{
				int ind = j + i * nCols + b * nRows * nCols;
				tmp.at<float>(i, j) = h_in[ind] / (nCols * nRows);  // cuFFT transforms are unnormalized, so scale by 1/(nRows*nCols)
			}
		}
		matWatches.push_back(tmp);
	}

	cv::Mat dst;
	cv::merge(matWatches, dst);
	cv::imshow("src", src);
	cv::imshow("dst", dst);
	cv::waitKey(0);
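	// Cleanup (not in the original post): destroy the cuFFT plans and free all buffers
	cufftSafeCall(cufftDestroy(forward_plan));
	cufftSafeCall(cufftDestroy(inverse_plan));
	gpuErrchk(cudaFree(d_in));
	gpuErrchk(cudaFree(d_freq));
	free(h_in);
	free(h_freq);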

}
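Note that, besides OpenCV and the CUDA runtime, the program has to be linked against the cuFFT library (for example with -lcufft when building with nvcc).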

Result

On my GTX 970M, the end-to-end run for a 640*640 image (including reading the image from disk and the copies between host and device) takes close to 1 s.
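Most of that time is presumably spent on disk I/O, CUDA context and plan creation, and the host/device copies rather than on the transforms themselves. To measure only the GPU execution, one could bracket the cufftExec* calls with CUDA events; a minimal sketch (not part of the original post, reusing the plan and buffers defined above):

cudaEvent_t start, stop;
gpuErrchk(cudaEventCreate(&start));
gpuErrchk(cudaEventCreate(&stop));

gpuErrchk(cudaEventRecord(start));
cufftSafeCall(cufftExecR2C(forward_plan, d_in, d_freq));
gpuErrchk(cudaEventRecord(stop));
gpuErrchk(cudaEventSynchronize(stop));

float ms = 0.f;
gpuErrchk(cudaEventElapsedTime(&ms, start, stop));
printf("forward DFT: %.3f ms\n", ms);

gpuErrchk(cudaEventDestroy(start));
gpuErrchk(cudaEventDestroy(stop));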
