一个基于OpenCL的三维有限差分kernel核函数

https://rtoax.blog.csdn.net/article/details/104266071
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

__kernel void FiniteDifferences(__global float * const output,
                                __global const float * const input,
                                __constant float * const coeff,
                                const int dimx,
                                const int dimy,
                                const int dimz,
                                const int padding)
{
    bool valid = true;
    const int gtidx = get_global_id(0);
    const int gtidy = get_global_id(1);
    const int ltidx = get_local_id(0);
    const int ltidy = get_local_id(1);
    const int workx = get_local_size(0);
    const int worky = get_local_size(1);
    __local float tile[MAXWORKY + 2 * RADIUS][MAXWORKX + 2 * RADIUS];
    
    const int stride_y = dimx + 2 * RADIUS;
    const int stride_z = stride_y * (dimy + 2 * RADIUS);

    int inputIndex  = 0;
    int outputIndex = 0;

    // Advance inputIndex to start of inner volume
    inputIndex += RADIUS * stride_y + RADIUS + padding;
    
    // Advance inputIndex to target element
    inputIndex += gtidy * stride_y + gtidx;

    float infront[RADIUS];
    float behind[RADIUS];
    float current;

	const int tx = ltidx + RADIUS;
	const int ty = ltidy + RADIUS;

    if (gtidx >= dimx)
        valid = false;
    if (gtidy >= dimy)
        valid = false;

    // For simplicity we assume that the global size is equal to the actual
    // problem size; since the global size must be a multiple of the local size
    // this means the problem size must be a multiple of the local size (or
    // padded to meet this constraint).
    // Preload the "infront" and "behind" data
    for (int i = RADIUS - 2 ; i >= 0 ; i--)
    {
        behind[i] = input[inputIndex];
        inputIndex += stride_z;
    }

    current = input[inputIndex];
    outputIndex = inputIndex;
    inputIndex += stride_z;

    for (int i = 0 ; i < RADIUS ; i++)
    {
        infront[i] = input[inputIndex];
        inputIndex += stride_z;
    }

    // Step through the xy-planes
    for (int iz = 0 ; iz < dimz ; iz++)
    {
        // Advance the slice (move the thread-front)
        for (int i = RADIUS - 1 ; i > 0 ; i--)
            behind[i] = behind[i - 1];
        behind[0] = current;
        current = infront[0];
        for (int i = 0 ; i < RADIUS - 1 ; i++)
            infront[i] = infront[i + 1];
        infront[RADIUS - 1] = input[inputIndex];

        inputIndex  += stride_z;
        outputIndex += stride_z;
        barrier(CLK_LOCAL_MEM_FENCE);

        // Note that for the work items on the boundary of the problem, the
        // supplied index when reading the halo (below) may wrap to the
        // previous/next row or even the previous/next xy-plane. This is
        // acceptable since a) we disable the output write for these work
        // items and b) there is at least one xy-plane before/after the
        // current plane, so the access will be within bounds.

        // Update the data slice in the local tile
        // Halo above & below
        if (ltidy < RADIUS)
        {
            tile[ltidy][tx]                  = input[outputIndex - RADIUS * stride_y];
            tile[ltidy + worky + RADIUS][tx] = input[outputIndex + worky * stride_y];
        }
        // Halo left & right
        if (ltidx < RADIUS)
        {
            tile[ty][ltidx]                  = input[outputIndex - RADIUS];
            tile[ty][ltidx + workx + RADIUS] = input[outputIndex + workx];
        }
        tile[ty][tx] = current;
        barrier(CLK_LOCAL_MEM_FENCE);

        // Compute the output value
        float value = coeff[0] * current;
#pragma unroll RADIUS
        for (int i = 1 ; i <= RADIUS ; i++)
        {
            value += coeff[i] * (infront[i-1] + behind[i-1] + tile[ty - i][tx] + tile[ty + i][tx] + tile[ty][tx - i] + tile[ty][tx + i]);
        }

        // Store the output value
        if (valid)
            output[outputIndex] = value;
    }
}
Koma_Wong
发布了591 篇原创文章 · 获赞 285 · 访问量 49万+
他的留言板关注
一个基于OpenCL的三维有限差分kernel核函数

猜你喜欢