CUDFF 图像傅里叶变换

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u011808673/article/details/81778118

cudff 快速傅里叶变换

利用cuda进行fft变换时,会有一些参数设置的规则,一下举例进行说明:

float *h_Data; //"h_": host,表示CPU内存
float *d_Data; //"d_":device,表示GPU内存
fComplex *d_DataSpectrum, //fComplex:为float复数形式,x为实数,y为复数
cufftHandle  fftPlanFwd,  fftPlanInv;  //Fwd表示正变换,Inv变换反变换
const int  dataH = 1024; //二维图像的高度
const int  dataW = 512;  //二维图像的宽度
int fftH = dataH; //若dataH 不为2的幂次数,需要进行图像扩展
int fftW = dataW; //若dataW 不为2的幂次数,需要进行图像扩展
h_Data = (float *)malloc(dataH   * dataW * sizeof(float)); //CPU端内存分配方式
checkCudaErrors(cudaMalloc((void **)&d_Data,   dataH   * dataW   * sizeof(float))); //GPU端内存开盘方式
checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum,   fftH * (fftW / 2 + 1) * sizeof(fComplex))); //cufft中R2C变换要求输入为H*W,输出为H*(W/2+1),W必须保证为2的倍数

for (int i = 0; i < dataH * dataW; i++)
{
h_Data[i] = getRand();
} //对cpu中的内存块h_Data赋值,这里仅为说明,因此用getRand()函数进行随机数产生
checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C)); //设置fft变换的句柄,这里需要注意,传入的参数先为fftH,fftW,即先传高度值,再传宽度值,CUFFT_R2C表示从real实数变换到complex复数
checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R)); //设置fft反变换的句柄,雷同正变换,传入的参数先为fftH,fftW,即先传高度值,再传宽度值,CUFFT_C2R表示从complex复数变换到real实数

checkCudaErrors(cudaMemcpy(d_Data,   h_Data,   dataH   * dataW *   sizeof(float), cudaMemcpyHostToDevice));//将CPU内存的数据块copy到GPU的显存中,cudaMemcpyHostToDevice表示从host端copy到device端,cudaMemcpyDeviceToHost表示从device端copy到host端

checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_Data, (cufftComplex *)d_DataSpectrum));//按照FFT正变换的句柄,进行图像数据的FFT变换,注意这里的句柄使用的是fftPlanFwd

//此处为其它在频域中操作的函数

checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_Data));//将在频域中处理的结果,再反变换回空间域中,注意这里的句柄使用的是fftPlanInv

//Release
checkCudaErrors(cufftDestroy(fftPlanInv)); //销毁句柄,对应cufftPlan2d
checkCudaErrors(cufftDestroy(fftPlanFwd));//销毁句柄,对应cufftPlan2d
checkCudaErrors(cudaFree(d_DataSpectrum));//GPU中内存释放,对应cudaMalloc
checkCudaErrors(cudaFree(d_Data));//GPU中内存释放,对应cudaMalloc
free(h_Data);//CPU中内存释放,对应malloc

CPU 实现与GPU实现对比

使用小图时CPU更快,使用大图时GPU更快

#include <cuda_runtime.h>
#include <cufft.h>

#include <iostream>
#include <time.h>

#include <opencv2/opencv.hpp>


using namespace std;
using namespace cv;


cv::Mat real(cv::Mat img)
{
	std::vector<cv::Mat> planes;
	cv::split(img, planes);
	return planes[0];
}

int main(int argc, char ** argv)
{
	
	Mat img = imread("01.jpg", CV_LOAD_IMAGE_GRAYSCALE);

	//int NX = 2560;
	//int NY = 2560;
	int NN = 1000;

	//if (argc == 4)
	//{
	//	NX = atoi(argv[1]);
	//	NY = atoi(argv[2]);
	//	NN = atoi(argv[3]);
	//}

	//std::cout << "NX=" << NX << " ; NY=" << NY << " ; NN=" << NN << std::endl;

	resize(img, img, Size(24, 24));
	cout << "img: " << img.channels() << " " << img.rows << "  " << img.cols << endl;
	imshow("img", img);
	waitKey(500);

	int rows = img.rows;
	int cols = img.cols;
	
	//normalize(img, img, 0, 1, CV_MINMAX);
	img.convertTo(img, CV_32FC1, 1.0f / 255);
	//cout << img(Rect(0, 0, 10, 96)) << endl;
	cufftHandle planFwd;
	cufftHandle planInv;
	float *data;
	float *data2;
	cufftComplex*res;
	cudaMalloc((float**)&data, sizeof(float)*rows*cols);
	cudaMalloc((float**)&data2, sizeof(float)*rows*cols);

	cudaMalloc((void**)&res, sizeof(cufftComplex)*rows*(cols/2+1));

	/* Try to do the same thing than cv::randu() */
	float* host_data;
	host_data = (float *)malloc(sizeof(float)*rows*cols);
	
	//host_data = (float *)img.data;
	//srand(time(NULL));
	for (int i = 0; i < rows; i++)
	{
		float *ptrs = img.ptr<float>(i);
		for (size_t j = 0; j < cols; j++)
		{
			host_data[i*cols + j] = ptrs[j];
		}
		//cout << img(Rect(0, 0, 1, 2)) << endl;
		//cout << sizeof(img.data[0])<< " " << (float)img.data[1]<< endl;
		//host_data[i] = make_cuComplex(rand() % 256, rand() % 256);
		////host_data[i].x = rand() % 256;
		////host_data[i].y = rand() % 256;
	}
	//cout << host_data[0]<< endl;
	
	

	/* Warm up ? */
	/* Create a 3D FFT plan. */
	cufftPlan2d(&planFwd, rows, cols, CUFFT_R2C);
	cufftPlan2d(&planInv, rows, cols, CUFFT_C2R);
	//cufftPlan2d(&planInv, rows, cols, CUFFT_C2C);

	cufftComplex* dftRes = (cufftComplex *)malloc(sizeof(cufftComplex)*rows*(cols / 2 + 1));

	
	double t = cv::getTickCount();
	for (size_t i = 0; i < NN; i++)
	{
		cudaMemcpy(data, host_data, sizeof(float)*rows*cols, cudaMemcpyHostToDevice);
		/* Transform the first signal in place. */
		cufftExecR2C(planFwd, (cufftReal*)data, (cufftComplex *)res);

		cudaMemcpy(dftRes, res, sizeof(cufftComplex)*rows*(cols / 2 + 1), cudaMemcpyDeviceToHost);
		//for (size_t i = 0; i < 5; i++)
		//{
		//	cout << "gpu: " << i << "  " << dftRes[i].x << " " << dftRes[i].y << endl;

		//}
		//逆变换
		cufftExecC2R(planInv, (cufftComplex *)res, (cufftReal *)data2);
		float * new_data = (float *)malloc(sizeof(float)*rows*cols);
		cudaMemcpy(new_data, data2, sizeof(float)*rows*cols, cudaMemcpyDeviceToHost);
		//cout << new_data[0] / (rows*cols) << " " << new_data[1] / (rows*cols) << endl;
		//for (size_t i = 0; i < rows*cols; i++)
		//{
		//	new_data[i] = new_data[i] / (rows*cols);
		//}
		//Mat gpu_res = Mat(rows, cols, CV_32FC1, new_data);
		//imshow("gpu:", gpu_res);
		//waitKey(2);
	}
	t = 1000 * ((double)cv::getTickCount() - t) / cv::getTickFrequency() / NN;
	std::cout << "Cuda time=" << t << " ms" << std::endl;


	//use  CPU
	cout << "cpu img:" << img(Rect(0, 0, 1, 2)) << endl;
	Mat img2, newimg2, dst;

	double t1 = cv::getTickCount();
	bool backwards = false;
	for (size_t i = 0; i < NN; i++)
	{
		if (img.channels() == 1)
		{
			cv::Mat planes[] = { cv::Mat_<float>(img), cv::Mat_<float>::zeros(img.size()) };
			//cv::Mat planes[] = {cv::Mat_<double> (img), cv::Mat_<double>::zeros(img.size())};
			cv::merge(planes, 2, img2);
		}
		cv::dft(img2, dst, backwards ? (cv::DFT_INVERSE | cv::DFT_SCALE) : 0); // 0.01ms
		//cout << "cpu dst :" << dst(Rect(0, 0, 5, 1)) << endl;


		backwards = true;
		cv::dft(dst, newimg2, backwards ? (cv::DFT_INVERSE | cv::DFT_SCALE) : 0); // 0.01ms

		//Mat cpu_res = real(newimg2);
		//cout << "cpu newImg2:" << cpu_res(Rect(0, 0, 1, 2)) << endl;
		//imshow("cpu:", cpu_res);
		//waitKey(0);
	}
	t1 = 1000 * ((double)cv::getTickCount() - t1) / cv::getTickFrequency() / NN;
	std::cout << "cpu time=" << t1 << " ms" << std::endl;
	

	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	//double t = cv::getTickCount();
	//for (int i = 0; i < NN; i++)
	//{
	//	/* Create a 2D FFT plan. */
	//	cufftPlan2d(&plan, cols, rows, CUFFT_C2C);

	//	/* Transform the first signal in place. */
	//	cufftExecC2C(plan, data, res, CUFFT_FORWARD);
	//}

	//t = 1000 * ((double)cv::getTickCount() - t) / cv::getTickFrequency() / NN;
	//std::cout << "Cuda time=" << t << " ms" << std::endl;


	/* Destroy the cuFFT plan. */
	cufftDestroy(planFwd);
	cufftDestroy(planInv);
	cudaFree(res);
	cudaFree(data);
	free(host_data);
	getchar();
	std::system("PAUSE");
	return 0;
}

备注

CUFFT库实现了快速傅里叶变换,由于新手加上英文水平有限,被这玩意小小折磨了一下。我把其中需要注意到的几点问题,列在这里,以供后来的读者能够少犯错误。

(1) 在图像正变换时,使用函数R2C,或者D2Z。从图像数据变换到复数数据,图像为N*N时,其实复数数据只有N*(N/2+1), 因为另外的N*(N/2-1)可以根据中心对称和共轭关系算出来,CUFFT为了节省存储空间,所以略去了那些数据。

(2) 在反变换时使用C2R变换,也可以是Z2D变换,在创建cufftPlan句柄时要注意:

cufftPlan2d(&plan_IFFT,nHeight,nWidth,CUFFT_C2R);

上面的语句中参数并不是(nWidth/2+1)而是nWidth。别以为得到的复数数据为N*(N/2+1)就用参数(nWidth/2+1)。

(3)反变换出来的数据是实数,但是被放大了nWidth*nHeight倍,所以你要得到图像数据的话,需要每个像素值除以 nWidth*nHeight。

猜你喜欢

转载自blog.csdn.net/u011808673/article/details/81778118