Example of using opencl fft.

https://www.cnblogs.com/ahfuzhang/p/11083423.html

opencv-3.4.3\modules\core\include\opencv2\core.hpp:2157

CV_EXPORTS_W void dft(InputArray src, OutputArray dst, int flags = 0, int nonzeroRows = 0);

2. Function implementation

opencv-3.4.3\modules\core\src\dxt.cpp:3315

void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
{
    CV_INSTRUMENT_REGION()

#ifdef HAVE_CLAMDFFT
    CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
            _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
               ocl_dft_amdfft(_src0, _dst, flags))
#endif

#ifdef HAVE_OPENCL
    CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
               ocl_dft(_src0, _dst, flags, nonzero_rows))
#endif

    Mat src0 = _src0.getMat(), src = src0;
    bool inv = (flags & DFT_INVERSE) != 0;
    int type = src.type();
    int depth = src.depth();

    CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );

    // Fail if DFT_COMPLEX_INPUT is specified, but src is not 2 channels.
    CV_Assert( !((flags & DFT_COMPLEX_INPUT) && src.channels() != 2) );

    if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )
        _dst.create( src.size(), CV_MAKETYPE(depth, 2) );
    else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )
        _dst.create( src.size(), depth );
    else
        _dst.create( src.size(), type );

    Mat dst = _dst.getMat();

    int f = 0;
    if (src.isContinuous() && dst.isContinuous())
        f |= CV_HAL_DFT_IS_CONTINUOUS;
    if (inv)
        f |= CV_HAL_DFT_INVERSE;
    if (flags & DFT_ROWS)
        f |= CV_HAL_DFT_ROWS;
    if (flags & DFT_SCALE)
        f |= CV_HAL_DFT_SCALE;
    if (src.data == dst.data)
        f |= CV_HAL_DFT_IS_INPLACE;
    Ptr<hal::DFT2D> c = hal::DFT2D::create(src.cols, src.rows, depth, src.channels(), dst.channels(), f, nonzero_rows);
    c->apply(src.data, src.step, dst.data, dst.step);
}

3. Calling opencl

#ifdef HAVE_OPENCL
    CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
               ocl_dft(_src0, _dst, flags, nonzero_rows))
#endif

The function implementation of ocl:
opencv-3.4.3\modules\core\src\dxt.cpp:2161

static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)
{
    int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
    Size ssize = _src.size();
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    if (!(cn == 1 || cn == 2)
        || !(depth == CV_32F || (depth == CV_64F && doubleSupport))
        || ((flags & DFT_REAL_OUTPUT) && (flags & DFT_COMPLEX_OUTPUT)))
        return false;

    // if is not a multiplication of prime numbers { 2, 3, 5 }
    if (ssize.area() != getOptimalDFTSize(ssize.area()))
        return false;

    UMat src = _src.getUMat();
    bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0;

    if( nonzero_rows <= 0 || nonzero_rows > _src.rows() )
        nonzero_rows = _src.rows();
    bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1;

    FftType fftType = determineFFTType(cn == 1, cn == 2,
        (flags & DFT_REAL_OUTPUT) != 0, (flags & DFT_COMPLEX_OUTPUT) != 0, inv);

    UMat output;
    if (fftType == C2C || fftType == R2C)
    {
        // complex output
        _dst.create(src.size(), CV_MAKETYPE(depth, 2));
        output = _dst.getUMat();
    }
    else
    {
        // real output
        if (is1d)
        {
            _dst.create(src.size(), CV_MAKETYPE(depth, 1));
            output = _dst.getUMat();
        }
        else
        {
            _dst.create(src.size(), CV_MAKETYPE(depth, 1));
            output.create(src.size(), CV_MAKETYPE(depth, 2));
        }
    }

    bool result = false;
    if (!inv)
    {
        int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols;
        result = ocl_dft_rows(src, output, nonzero_rows, flags, fftType);
        if (!is1d)
            result = result && ocl_dft_cols(output, _dst, nonzero_cols, flags, fftType);
    }
    else
    {
        if (fftType == C2C)
        {
            // complex output
            result = ocl_dft_rows(src, output, nonzero_rows, flags, fftType);
            if (!is1d)
                result = result && ocl_dft_cols(output, output, output.cols, flags, fftType);
        }
        else
        {
            if (is1d)
            {
                result = ocl_dft_rows(src, output, nonzero_rows, flags, fftType);
            }
            else
            {
                int nonzero_cols = src.cols/2 + 1;
                result = ocl_dft_cols(src, output, nonzero_cols, flags, fftType);
                result = result && ocl_dft_rows(output, _dst, nonzero_rows, flags, fftType);
            }
        }
    }
    return result;
}

4. The call function of row/col in ocl_dft()

Function prototype:

static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
static bool ocl_dft_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType)

Look at the source code of one of them:

static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type);
    Ptr<OCL_FftPlan> plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols(), depth);
    return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true);
}

5. Object pool calculated by fft

Before each fft calculation of a certain size, a series of initialization data needs to be established; if these initialization data are established for the same size each time, it is obviously wasteful.
So an object pool is established, and an object is cached every time a new size calculated by fft appears. Space for time (but long-term running scenarios should pay attention to memory consumption).

    Ptr<OCL_FftPlan> OCL_FftPlanCache::getFftPlan(int dft_size, int depth)
    {
        int key = (dft_size << 16) | (depth & 0xFFFF);
        std::map<int, Ptr<OCL_FftPlan> >::iterator f = planStorage.find(key);
        if (f != planStorage.end())
        {
            return f->second;
        }
        else
        {
            Ptr<OCL_FftPlan> newPlan = Ptr<OCL_FftPlan>(new OCL_FftPlan(dft_size, depth));
            planStorage[key] = newPlan;
            return newPlan;
        }
    }

6. fft object

opencv-3.4.3\modules\core\src\dxt.cpp:1881
struct OCL_FftPlan is
initialized in the constructor: OCL_FftPlan(int _size, int _depth)
calculation using this method: bool enqueueTransform(InputArray _src, OutputArray _dst, int num_dfts, int flags, int fftType, bool rows = true)
The main code of the const method is to construct the compilation parameters of the kernel function.

6.1 Compilation, binding parameters and execution of opencl kernel functions

The core code of the enqueueTransform() method is as follows:

        ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options);
        if (k.empty())
            return false;

        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::ReadOnlyNoSize(twiddles), thread_count, num_dfts);
        return k.run(2, globalsize, localsize, false);

The ocl::Kernel object is used to compile opencl kernel functions.
ocl::KernelArg is used to bind the execution parameters of the kernel function.
k.run() executes the kernel function.

6.2 Definition of kernel function

ocl::core::fft_oclsrc This constant object defines the source code of the kernel function. I searched all .h, .hpp, and .cpp but no definition was found.
This part of the source code is generated during the compilation process.
Defined in:
opencv-3.4.3/build/modules/core/opencl_kernels_core.hpp:21

extern struct cv::ocl::internal::ProgramEntry fft_oclsrc;

Realized in:
opencv-3.4.3/build/modules/core/opencl_kernels_core.cpp:770

struct cv::ocl::internal::ProgramEntry fft_oclsrc={moduleName, "fft",
"#define SQRT_2 0.707106781188f\n"

It seems that only a script is used to convert the opencl kernel function code into a C++ string.

6.3 Definition file of kernel function

Finally found the opencl fft kernel function file:
opencv-3.4.3\modules\core\src\opencl\fft.cl

There is an obvious problem here. The kernel function must be compiled every time it is called. I don't see where the compiled results are cached.

7.cv::dft() possible optimization points

Compilation is required every time the kernel function is called, and the ocl::Kernel object should be cached
Modify the style of the C function to an object-oriented style, and add the UMat data upload/core function operation/UMat data download to the asynchronous queue. This makes it possible to avoid the CPU waiting for the result of the GPU when calculating multiple dft() continuously.