https://www.cnblogs.com/ahfuzhang/p/11083423.html
opencv-3.4.3\modules\core\include\opencv2\core.hpp:2157
// Public declaration of cv::dft as it appears in core.hpp; `flags` and `nonzeroRows` both default to 0.
CV_EXPORTS_W void dft(InputArray src, OutputArray dst, int flags = 0, int nonzeroRows = 0);
2. Function implementation
opencv-3.4.3\modules\core\src\dxt.cpp:3315
/**
 * Discrete Fourier transform entry point.
 *
 * The CV_OCL_RUN blocks get the first chance to handle the call when the
 * destination is a UMat and the source has at most two dimensions
 * (NOTE(review): CV_OCL_RUN is defined elsewhere; presumably it returns early
 * when the OpenCL implementation succeeds). Otherwise the transform is carried
 * out on Mat data through hal::DFT2D.
 *
 * @param _src0        input array; must be CV_32FC1, CV_32FC2, CV_64FC1 or CV_64FC2
 * @param _dst         output array, (re)created with the size of the input
 * @param flags        combination of DFT_* flags (DFT_INVERSE, DFT_ROWS, DFT_SCALE,
 *                     DFT_COMPLEX_INPUT, DFT_COMPLEX_OUTPUT, DFT_REAL_OUTPUT)
 * @param nonzero_rows forwarded unchanged to hal::DFT2D::create
 */
void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
{
    CV_INSTRUMENT_REGION()

#ifdef HAVE_CLAMDFFT
    CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
               _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
               ocl_dft_amdfft(_src0, _dst, flags))
#endif

#ifdef HAVE_OPENCL
    CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
               ocl_dft(_src0, _dst, flags, nonzero_rows))
#endif

    Mat src0 = _src0.getMat();
    Mat src = src0;
    const bool inverse = (flags & DFT_INVERSE) != 0;
    int type = src.type();
    int depth = src.depth();

    // Only 1- or 2-channel float/double input is accepted.
    CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );

    // Fail if DFT_COMPLEX_INPUT is specified, but src is not 2 channels.
    CV_Assert( !((flags & DFT_COMPLEX_INPUT) && src.channels() != 2) );

    // Pick the destination type: by default it matches the source; a forward
    // transform of real data with DFT_COMPLEX_OUTPUT widens to two channels,
    // and an inverse transform of complex data with DFT_REAL_OUTPUT narrows
    // to a single channel.
    int dstType = type;
    if( !inverse && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )
        dstType = CV_MAKETYPE(depth, 2);
    else if( inverse && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )
        dstType = depth;
    _dst.create( src.size(), dstType );

    Mat dst = _dst.getMat();

    // Translate the public DFT_* flags and the buffer layout into HAL flags.
    const int halFlags =
        ((src.isContinuous() && dst.isContinuous()) ? CV_HAL_DFT_IS_CONTINUOUS : 0) |
        (inverse                                    ? CV_HAL_DFT_INVERSE       : 0) |
        ((flags & DFT_ROWS)                         ? CV_HAL_DFT_ROWS          : 0) |
        ((flags & DFT_SCALE)                        ? CV_HAL_DFT_SCALE         : 0) |
        ((src.data == dst.data)                     ? CV_HAL_DFT_IS_INPLACE    : 0);

    Ptr<hal::DFT2D> transform = hal::DFT2D::create(src.cols, src.rows, depth,
                                                   src.channels(), dst.channels(),
                                                   halFlags, nonzero_rows);
    transform->apply(src.data, src.step, dst.data, dst.step);
}
3. Calling OpenCL
#ifdef HAVE_OPENCL
CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
ocl_dft(_src0, _dst, flags, nonzero_rows))
#endif
The implementation of the OpenCL function ocl_dft():
opencv-3.4.3\modules\core\src\dxt.cpp:2161
// OpenCL implementation of the DFT, built from a row pass (ocl_dft_rows) and,
// for 2D transforms, a column pass (ocl_dft_cols).
//
// Returns false before doing any work when the input is rejected by the checks
// at the top of the function; otherwise returns the combined success of the
// row/column passes.
// NOTE(review): the caller invokes this through CV_OCL_RUN; presumably a false
// return makes it fall back to the non-OpenCL path - confirm against the macro.
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)
{
int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
Size ssize = _src.size();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
// Reject anything other than 1- or 2-channel input, depths other than CV_32F
// (or CV_64F when the default device reports double support), and the
// contradictory request for both real and complex output.
if (!(cn == 1 || cn == 2)
|| !(depth == CV_32F || (depth == CV_64F && doubleSupport))
|| ((flags & DFT_REAL_OUTPUT) && (flags & DFT_COMPLEX_OUTPUT)))
return false;
// Bail out unless the total element count is already an optimal DFT size
// (per the original note: a product of the primes 2, 3 and 5).
if (ssize.area() != getOptimalDFTSize(ssize.area()))
return false;
UMat src = _src.getUMat();
// The trailing "? 1 : 0" is redundant; inv is simply "DFT_INVERSE is set".
bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0;
// An out-of-range nonzero_rows (<= 0 or larger than the input) means "all rows".
if( nonzero_rows <= 0 || nonzero_rows > _src.rows() )
nonzero_rows = _src.rows();
// Only the row pass is run when DFT_ROWS is set or a single row is to be processed.
bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1;
// Arguments, in order: input has 1 channel, input has 2 channels,
// DFT_REAL_OUTPUT set, DFT_COMPLEX_OUTPUT set, inverse transform.
FftType fftType = determineFFTType(cn == 1, cn == 2,
(flags & DFT_REAL_OUTPUT) != 0, (flags & DFT_COMPLEX_OUTPUT) != 0, inv);
// `output` is the destination of the first pass. Depending on the case it is
// either _dst itself or a separate 2-channel intermediate buffer.
UMat output;
if (fftType == C2C || fftType == R2C)
{
// complex output: _dst is 2-channel and the first pass writes straight into it
_dst.create(src.size(), CV_MAKETYPE(depth, 2));
output = _dst.getUMat();
}
else
{
// real output: _dst is single-channel in both sub-cases
if (is1d)
{
// 1D: the single row pass writes directly into _dst
_dst.create(src.size(), CV_MAKETYPE(depth, 1));
output = _dst.getUMat();
}
else
{
// 2D: the first pass goes into a separate 2-channel intermediate buffer
_dst.create(src.size(), CV_MAKETYPE(depth, 1));
output.create(src.size(), CV_MAKETYPE(depth, 2));
}
}
bool result = false;
if (!inv)
{
// Forward transform: rows first, then (for 2D) columns from `output` into _dst.
// For R2R only output.cols/2 + 1 columns are passed to the column pass.
int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols;
result = ocl_dft_rows(src, output, nonzero_rows, flags, fftType);
if (!is1d)
// "result &&" short-circuits: the column pass is skipped if the row pass failed
result = result && ocl_dft_cols(output, _dst, nonzero_cols, flags, fftType);
}
else
{
if (fftType == C2C)
{
// Inverse complex-to-complex: rows first, then (for 2D) columns in place on `output`
result = ocl_dft_rows(src, output, nonzero_rows, flags, fftType);
if (!is1d)
result = result && ocl_dft_cols(output, output, output.cols, flags, fftType);
}
else
{
if (is1d)
{
// Inverse 1D with a non-C2C type: a single row pass
result = ocl_dft_rows(src, output, nonzero_rows, flags, fftType);
}
else
{
// Inverse 2D with a non-C2C type: the order is reversed - columns first
// (src.cols/2 + 1 of them) into `output`, then rows from `output` into _dst.
int nonzero_cols = src.cols/2 + 1;
result = ocl_dft_cols(src, output, nonzero_cols, flags, fftType);
result = result && ocl_dft_rows(output, _dst, nonzero_rows, flags, fftType);
}
}
}
return result;
}
4. The row/column functions called by ocl_dft()
Function prototypes:
static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
static bool ocl_dft_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType)
Look at the source code of one of them:
// Runs the row pass of the OpenCL DFT: looks up the cached FFT plan for the
// source's column count and depth, then enqueues the transform with the
// `rows` argument set to true. Returns whatever enqueueTransform returns.
static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
{
    const int srcDepth = CV_MAT_DEPTH(_src.type());
    const int rowLength = _src.cols();

    Ptr<OCL_FftPlan> rowPlan = OCL_FftPlanCache::getInstance().getFftPlan(rowLength, srcDepth);

    const bool processRows = true;
    return rowPlan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, processRows);
}
5. Object pool (plan cache) for FFT computations
Before an FFT of a given size can be computed, a set of initialization data has to be built; rebuilding that data on every call for the same size would obviously be wasteful.
So an object pool is used: a plan object is cached the first time each new FFT size appears. This trades space for time (but long-running processes should keep an eye on memory consumption).
// Returns the cached FFT plan for the given (dft_size, depth) pair, creating
// and storing a new plan on first use.
//
// @param dft_size  transform length the plan is built for
// @param depth     element depth the plan is built for
// @return          shared pointer to the cached (or freshly created) plan
Ptr<OCL_FftPlan> OCL_FftPlanCache::getFftPlan(int dft_size, int depth)
{
    // Pack the size into the upper bits and the depth into the low 16 bits of
    // the map key. The shift is performed on an unsigned value: left-shifting a
    // signed int past its range (dft_size >= 32768) overflows, which is
    // undefined behaviour under older C++ standards. For smaller sizes the
    // resulting key is bit-for-bit the same as before.
    // NOTE(review): the key is still 32 bits wide, so sizes >= 65536 lose their
    // upper bits and can collide with smaller sizes; widening the key needs a
    // matching change to the planStorage declaration, which is outside this block.
    const unsigned packedKey = (static_cast<unsigned>(dft_size) << 16) |
                               (static_cast<unsigned>(depth) & 0xFFFFu);
    const int key = static_cast<int>(packedKey);

    std::map<int, Ptr<OCL_FftPlan> >::iterator f = planStorage.find(key);
    if (f != planStorage.end())
        return f->second;

    // First request for this size/depth: build the plan once and remember it.
    Ptr<OCL_FftPlan> newPlan = Ptr<OCL_FftPlan>(new OCL_FftPlan(dft_size, depth));
    planStorage[key] = newPlan;
    return newPlan;
}
6. The FFT plan object
opencv-3.4.3\modules\core\src\dxt.cpp:1881
struct OCL_FftPlan
It is initialized in the constructor: OCL_FftPlan(int _size, int _depth)
The computation is performed by this method: bool enqueueTransform(InputArray _src, OutputArray _dst, int num_dfts, int flags, int fftType, bool rows = true)
The main job of the constructor is to build the compilation options for the kernel function.
6.1 Compilation, binding parameters and execution of opencl kernel functions
The core code of the enqueueTransform() method is as follows:
ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options);
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::ReadOnlyNoSize(twiddles), thread_count, num_dfts);
return k.run(2, globalsize, localsize, false);
The ocl::Kernel object is used to compile the OpenCL kernel function.
ocl::KernelArg is used to bind the arguments of the kernel function.
k.run() executes the kernel function.
6.2 Definition of kernel function
The constant object ocl::core::fft_oclsrc holds the source code of the kernel function. I searched every .h, .hpp, and .cpp file in the source tree but found no definition.
That is because this part of the source code is generated during the build.
Declared in:
opencv-3.4.3/build/modules/core/opencl_kernels_core.hpp:21
extern struct cv::ocl::internal::ProgramEntry fft_oclsrc;
Implemented in:
opencv-3.4.3/build/modules/core/opencl_kernels_core.cpp:770
struct cv::ocl::internal::ProgramEntry fft_oclsrc={moduleName, "fft",
"#define SQRT_2 0.707106781188f\n"
It appears that a build script simply converts the OpenCL kernel source code into a C++ string.
6.3 Definition file of kernel function
Finally found the opencl fft kernel function file:
opencv-3.4.3\modules\core\src\opencl\fft.cl
This looks like an obvious problem: the kernel function appears to be compiled every time it is called, and I could not find where the compiled result is cached.
7. Possible optimization points for cv::dft()
- The kernel function appears to be compiled on every call; the ocl::Kernel object should be cached.
- Change the C-function style into an object-oriented one, and enqueue the UMat data upload, the kernel execution, and the UMat data download on an asynchronous queue. This would keep the CPU from waiting for the GPU result when several dft() calls are computed back to back.