Lesson 3 - Fundamental GPU Algorithms (Reduce, Scan, Histogram)

Copyright notice: this is the author's original article; reproduction without the author's permission is prohibited. https://blog.csdn.net/qq_24990189/article/details/89847271

Contents

1.Welcome to Unit#3

2.Fundamental GPU Algorithms

3.Digging Holes Again

4.Steps and Work

5.Exercise: A Quiz on Step and Work

6.Reduce Part 1

7.Reduce Part 2

8.Exercise: Binary and Associative Operators

9.Exercise: Serial Implementation of Reduce

10.Exercise: Parallel Reduce

11.Exercise: Step Complexity of Parallel Reduce

12.Exercise: Reduction Using Global and Shared Memory

13.Scan

14.Balancing Checkbook With Scan

15.Exercise: Inputs to Scan

16.Exercise: What Scan Actually Does

17.Why Do We Care About Parallel Scan

18.Inclusive Vs. Exclusive Scan

19.Exercise: Serial Implementation of Scan

20.Exercise: Inclusive Scan Revisited

21.Hillis Steele vs Blelloch Scan

22.Exercise: Hillis Steele Scan

23.Exercise: Blelloch Scan

24.Analyzing The Blelloch Downsweep

25.Exercise: Which Scan Should You Use

26.Exercise: Histogram

27.Exercise: Serial Implementation of Histogram

28.Parallel Implementation of Histogram

29.Exercise: Implementing Histogram Using Atomics

30.Implementing Histogram Using Local Memory

31.Calculating Global Histogram Using Reduction

32.Sort Then Reduce By key

33.Exercise: What Operation Should We Use

34.Exercise: Final Thoughts on Histogram

35.Description of Problem Set #3

36.Congratulations


1.Welcome to Unit#3

2.Fundamental GPU Algorithms

A review of the communication patterns from Lesson 2.

3.Digging Holes Again

4.Steps and Work

5.Exercise: A Quiz on Step and Work

6.Reduce Part 1

7.Reduce Part 2

8.Exercise: Binary and Associative Operators

9.Exercise: Serial Implementation of Reduce
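
For comparison with the parallel versions later in this lesson, here is a minimal serial reduce in C (my own sketch, not the course's solution code): one pass over the array, one application of the binary operator per element, so O(n) work and O(n) steps.

#include <stdio.h>

// serial reduce: fold a binary associative operator (+) over the array
float reduce_serial(const float *in, int n)
{
    float acc = 0.0f;                      // identity element for +
    for (int i = 0; i < n; i++)
        acc += in[i];
    return acc;
}

int main(void)
{
    float a[] = {1.0f, 2.0f, 3.0f, 4.0f};
    printf("%f\n", reduce_serial(a, 4));   // prints 10.000000
    return 0;
}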

10.Exercise: Parallel Reduce

11.Exercise: Step Complexity of Parallel Reduce

12.Exercise: Reduction Using Global and Shared Memory

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    // do reduction in global mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}

__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
    extern __shared__ float sdata[];

    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    // load shared mem from global mem
    sdata[tid] = d_in[myId];
    __syncthreads();            // make sure entire block is loaded!

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}

void reduce(float * d_out, float * d_intermediate, float * d_in, 
            int size, bool usesSharedMemory)
{
    // assumes that size is not greater than maxThreadsPerBlock^2
    // and that size is a multiple of maxThreadsPerBlock
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = size / maxThreadsPerBlock;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_intermediate, d_in);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_intermediate, d_in);
    }
    // now we're down to one block left, so reduce it
    threads = blocks; // launch one thread for each block in prev step
    blocks = 1;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_out, d_intermediate);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_out, d_intermediate);
    }
}

int main(int argc, char **argv)
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    cudaSetDevice(dev);

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == cudaSuccess)
    {
        printf("Using device %d:\n", dev);
        printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
               devProps.name, (int)devProps.totalGlobalMem, 
               (int)devProps.major, (int)devProps.minor, 
               (int)devProps.clockRate);
    }

    const int ARRAY_SIZE = 1 << 20;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    float sum = 0.0f;
    for(int i = 0; i < ARRAY_SIZE; i++) {
        // generate random float in [-1.0f, 1.0f]
        h_in[i] = -1.0f + (float)random()/((float)RAND_MAX/2.0f);
        sum += h_in[i];
    }

    // declare GPU memory pointers
    float * d_in, * d_intermediate, * d_out;

    // allocate GPU memory
    cudaMalloc((void **) &d_in, ARRAY_BYTES);
    cudaMalloc((void **) &d_intermediate, ARRAY_BYTES); // overallocated
    cudaMalloc((void **) &d_out, sizeof(float));

    // transfer the input array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 

    int whichKernel = 0;
    if (argc == 2) {
        whichKernel = atoi(argv[1]);
    }
        
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // launch the kernel
    switch(whichKernel) {
    case 0:
        printf("Running global reduce\n");
        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; i++)
        {
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
        }
        cudaEventRecord(stop, 0);
        break;
    case 1:
        printf("Running reduce with shared mem\n");
        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; i++)
        {
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
        }
        cudaEventRecord(stop, 0);
        break;
    default:
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);    
    elapsedTime /= 100.0f;      // 100 trials

    // copy back the sum from GPU
    float h_out;
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    printf("average time elapsed: %f\n", elapsedTime);

    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_intermediate);
    cudaFree(d_out);
        
    return 0;
}
wlsh@wlsh-ThinkStation:~/Desktop/cs344-master/Lesson Code Snippets/Lesson 3 Code Snippets$ ./a.out 0
Using device 0:
GeForce GTX 980 Ti; global mem: 2083127296B; compute v5.2; clock: 1228000 kHz
Running global reduce
average time elapsed: 0.138348
wlsh@wlsh-ThinkStation:~/Desktop/cs344-master/Lesson Code Snippets/Lesson 3 Code Snippets$ ./a.out 1
Using device 0:
GeForce GTX 980 Ti; global mem: 2083127296B; compute v5.2; clock: 1228000 kHz
Running reduce with shared mem
average time elapsed: 0.117061

13.Scan

14.Balancing Checkbook With Scan

15.Exercise: Inputs to Scan

16.Exercise: What Scan Actually Does

17.Why Do We Care About Parallel Scan

18.Inclusive Vs. Exclusive Scan
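
As a concrete example with the + operator: for the input [1, 2, 3, 4], an inclusive scan returns [1, 3, 6, 10] (output i includes input i), while an exclusive scan returns [0, 1, 3, 6] (output i is the sum of everything strictly before input i, starting from the identity element 0).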

19.Exercise: Serial Implementation of Scan

#include <stdio.h>

int main(int argc, char **argv)
{
    const int ARRAY_SIZE = 10;

    int acc = 0;                 // running sum (identity element for +)
    int out[ARRAY_SIZE];
    int elements[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

    // inclusive scan: out[i] = elements[0] + elements[1] + ... + elements[i]
    for (int i = 0; i < ARRAY_SIZE; i++) {
        acc = acc + elements[i];
        out[i] = acc;
    }

    for (int i = 0; i < ARRAY_SIZE; i++) {
        printf("%i ", out[i]);
    }
    printf("\n");

    return 0;
}

20.Exercise: Inclusive Scan Revisited

If each output is computed independently, the n-th output needs n-1 additions, so the total work is 0 + 1 + 2 + ... + (n-1) = n(n-1)/2 = O(n²).

A serial scan, by contrast, reuses the running sum and needs only O(n) steps (one addition per output, O(n) work).

21.Hillis Steele vs Blelloch Scan

22.Exercise: Hillis Steele Scan
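
A single-block sketch of the Hillis Steele inclusive scan (my own illustration, not the course's solution code): it assumes the whole input fits in one block and double-buffers the data in 2*n floats of shared memory. Each of the log2(n) steps adds in the neighbor "offset" positions to the left, so the algorithm takes O(log n) steps but does O(n log n) work.

__global__ void hillis_steele_scan(float *d_out, const float *d_in, int n)
{
    extern __shared__ float temp[];        // 2*n floats: double buffer
    int tid  = threadIdx.x;
    int pout = 0, pin = 1;

    temp[pout * n + tid] = d_in[tid];      // inclusive scan: load the input as-is
    __syncthreads();

    for (int offset = 1; offset < n; offset <<= 1)
    {
        pout = 1 - pout;                   // swap which buffer is read and which is written
        pin  = 1 - pout;
        if (tid >= offset)
            temp[pout * n + tid] = temp[pin * n + tid] + temp[pin * n + tid - offset];
        else
            temp[pout * n + tid] = temp[pin * n + tid];
        __syncthreads();                   // finish this step before starting the next
    }
    d_out[tid] = temp[pout * n + tid];
}

// launch with one block of n threads and 2*n*sizeof(float) of shared memory:
// hillis_steele_scan<<<1, n, 2 * n * sizeof(float)>>>(d_out, d_in, n);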

23.Exercise: Blelloch Scan
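
A single-block sketch of the Blelloch exclusive scan, again my own illustration rather than the course's code: it assumes n is a power of two, launches n/2 threads, and uses n floats of shared memory. The upsweep phase builds a reduction tree of partial sums; the downsweep phase clears the root to the identity element and pushes the partial sums back down, giving O(n) work in roughly 2*log2(n) steps.

__global__ void blelloch_scan(float *d_out, const float *d_in, int n)
{
    extern __shared__ float temp[];        // n floats
    int tid    = threadIdx.x;              // n/2 threads per block
    int offset = 1;

    // each thread loads two elements
    temp[2 * tid]     = d_in[2 * tid];
    temp[2 * tid + 1] = d_in[2 * tid + 1];

    // upsweep (reduce) phase: build partial sums up the tree
    for (int d = n >> 1; d > 0; d >>= 1)
    {
        __syncthreads();
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset <<= 1;
    }

    // clear the root: the identity element for +
    if (tid == 0) temp[n - 1] = 0.0f;

    // downsweep phase: push partial sums back down the tree
    for (int d = 1; d < n; d <<= 1)
    {
        offset >>= 1;
        __syncthreads();
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            float t  = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();

    // write out the exclusive scan result
    d_out[2 * tid]     = temp[2 * tid];
    d_out[2 * tid + 1] = temp[2 * tid + 1];
}

// launch with one block of n/2 threads and n*sizeof(float) of shared memory:
// blelloch_scan<<<1, n / 2, n * sizeof(float)>>>(d_out, d_in, n);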

24.Analyzing The Blelloch Downsweep

25.Exercise: Which Scan Should You Use

26.Exercise: Histogram

27.Exercise: Serial Implementation of Histogram
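
A minimal serial histogram sketch in C (the input values and the bin function value % BIN_COUNT are placeholders of mine): one pass over the data, compute each item's bin, increment that bin's counter.

#include <stdio.h>

#define BIN_COUNT 16

int main(void)
{
    int in[] = {3, 19, 4, 35, 3, 16, 7, 7};
    int n = sizeof(in) / sizeof(in[0]);
    unsigned int bins[BIN_COUNT] = {0};

    for (int i = 0; i < n; i++) {
        int bin = in[i] % BIN_COUNT;      // toy binning function
        bins[bin]++;
    }

    for (int b = 0; b < BIN_COUNT; b++)
        printf("bin %d: %u\n", b, bins[b]);

    return 0;
}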

28.Parallel Implementation of Histogram

29.Exercise: Implementing Histogram Using Atomics
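
A sketch of the atomic approach (the kernel name, bin function, and bounds check are my own choices): every thread computes its item's bin and increments the shared global counter with atomicAdd, so colliding updates are serialized by the hardware instead of being lost to a read-modify-write race.

__global__ void atomic_histo(unsigned int *d_bins, const int *d_in,
                             int bin_count, int n)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    if (myId >= n)
        return;
    int myBin = d_in[myId] % bin_count;   // toy binning function
    atomicAdd(&d_bins[myBin], 1);         // safe concurrent increment
}

// launch with enough threads to cover the input, e.g.:
// atomic_histo<<<(n + 255) / 256, 256>>>(d_bins, d_in, bin_count, n);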

30.Implementing Histogram Using Local Memory
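
A sketch of the local-histogram idea under my own assumptions (MAX_BINS, items_per_thread, and the output layout are illustrative): each thread accumulates a private histogram with no atomics at all, then writes it out so the per-thread histograms can be combined afterwards, e.g. by the per-bin reduction in the next section.

#define MAX_BINS 16                       // illustrative upper bound on bin_count

__global__ void local_histo(unsigned int *d_per_thread_bins, const int *d_in,
                            int bin_count, int items_per_thread)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;

    unsigned int my_bins[MAX_BINS] = {0}; // private per-thread histogram
    for (int i = 0; i < items_per_thread; i++)
    {
        int item = d_in[myId * items_per_thread + i];
        my_bins[item % bin_count]++;      // no atomics: only this thread touches my_bins
    }

    // write this thread's histogram to global memory so a later per-bin
    // reduce can combine all per-thread histograms into the global histogram
    for (int b = 0; b < bin_count; b++)
        d_per_thread_bins[myId * bin_count + b] = my_bins[b];
}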

31.Calculating Global Histogram Using Reduction

32.Sort Then Reduce By key

33.Exercise: What Operation Should We Use

34.Exercise: Final Thoughts on Histogram

35.Description of Problem Set #3

36.Congratulations
