Lesson 3 - Fundamental GPU Algorithms (Reduce, Scan, Histogram)

Copyright notice: this is the author's original article; reproduction without the author's permission is prohibited. https://blog.csdn.net/qq_24990189/article/details/89847271

Contents

1.Welcome to Unit#3

2.Fundamental GPU Algorithms

3.Digging Holes Again

4.Steps and Work

5.Exercise: A Quiz on Step and Work

6.Reduce Part 1

7.Reduce Part 2

8.Exercise: Binary and Associative Operators

9.Exercise: Serial Implementation of Reduce

10.Exercise: Parallel Reduce

11.Exercise: Step Complexity of Parallel Reduce

12.Exercise: Reduction Using Global and Shared Memory

13.Scan

14.Balancing Checkbook With Scan

15.Exercise: Inputs to Scan

16.Exercise: What Scan Actually Does

17.Why Do We Care About Parallel Scan

18.Inclusive Vs. Exclusive Scan

19.Exercise: Serial Implementation of Scan

20.Exercise: Inclusive Scan Revisited

21.Hillis Steele vs Blelloch Scan

22.Exercise: Hillis Steele Scan

23.Exercise: Blelloch Scan

24.Analyzing The Blelloch Downsweep

25.Exercise: Which Scan Should You Use

26.Exercise: Histogram

27.Exercise: Serial Implementation of Histogram

28.Parallel Implementation of Histogram

29.Exercise: Implementing Histogram Using Atomics

30.Implementing Histogram Using Local Memory

31.Calculating Global Histogram Using Reduction

32.Sort Then Reduce By key

33.Exercise: What Operation Should We Use

34.Exercise: Final Thoughts on Histogram

35.Description of Problem Set #3

36.Congratulations


1.Welcome to Unit#3

2.Fundamental GPU Algorithms

A review of the communication patterns from Lesson 2.

3.Digging Holes Again

4.Steps and Work

5.Exercise: A Quiz on Step and Work

6.Reduce Part 1

7.Reduce Part 2

8.Exercise: Binary and Associative Operators

9.Exercise: Serial Implementation of Reduce
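
For comparison with the parallel versions later in this lesson, here is a minimal serial reduce in C (my own sketch, not the course's solution code): one pass over the array, one application of the binary operator per element, so O(n) work and O(n) steps.

#include <stdio.h>

// serial reduce: fold a binary associative operator (+) over the array
float reduce_serial(const float *in, int n)
{
    float acc = 0.0f;                      // identity element for +
    for (int i = 0; i < n; i++)
        acc += in[i];
    return acc;
}

int main(void)
{
    float a[] = {1.0f, 2.0f, 3.0f, 4.0f};
    printf("%f\n", reduce_serial(a, 4));   // prints 10.000000
    return 0;
}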

10.Exercise: Parallel Reduce

11.Exercise: Step Complexity of Parallel Reduce

12.Exercise: Reduction Using Global and Shared Memory

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    // do reduction in global mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}

__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
    extern __shared__ float sdata[];

    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    // load shared mem from global mem
    sdata[tid] = d_in[myId];
    __syncthreads();            // make sure entire block is loaded!

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}

void reduce(float * d_out, float * d_intermediate, float * d_in, 
            int size, bool usesSharedMemory)
{
    // assumes that size is not greater than maxThreadsPerBlock^2
    // and that size is a multiple of maxThreadsPerBlock
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = size / maxThreadsPerBlock;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_intermediate, d_in);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_intermediate, d_in);
    }
    // now we're down to one block left, so reduce it
    threads = blocks; // launch one thread for each block in prev step
    blocks = 1;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_out, d_intermediate);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_out, d_intermediate);
    }
}

int main(int argc, char **argv)
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    cudaSetDevice(dev);

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == cudaSuccess)
    {
        printf("Using device %d:\n", dev);
        printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
               devProps.name, (int)devProps.totalGlobalMem, 
               (int)devProps.major, (int)devProps.minor, 
               (int)devProps.clockRate);
    }

    const int ARRAY_SIZE = 1 << 20;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    float sum = 0.0f;
    for(int i = 0; i < ARRAY_SIZE; i++) {
        // generate random float in [-1.0f, 1.0f]
        h_in[i] = -1.0f + (float)random()/((float)RAND_MAX/2.0f);
        sum += h_in[i];
    }

    // declare GPU memory pointers
    float * d_in, * d_intermediate, * d_out;

    // allocate GPU memory
    cudaMalloc((void **) &d_in, ARRAY_BYTES);
    cudaMalloc((void **) &d_intermediate, ARRAY_BYTES); // overallocated
    cudaMalloc((void **) &d_out, sizeof(float));

    // transfer the input array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 

    int whichKernel = 0;
    if (argc == 2) {
        whichKernel = atoi(argv[1]);
    }
        
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // launch the kernel
    switch(whichKernel) {
    case 0:
        printf("Running global reduce\n");
        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; i++)
        {
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
        }
        cudaEventRecord(stop, 0);
        break;
    case 1:
        printf("Running reduce with shared mem\n");
        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; i++)
        {
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
        }
        cudaEventRecord(stop, 0);
        break;
    default:
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);    
    elapsedTime /= 100.0f;      // 100 trials

    // copy back the sum from GPU
    float h_out;
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    printf("average time elapsed: %f\n", elapsedTime);

    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_intermediate);
    cudaFree(d_out);
        
    return 0;
}
wlsh@wlsh-ThinkStation:~/Desktop/cs344-master/Lesson Code Snippets/Lesson 3 Code Snippets$ ./a.out 0
Using device 0:
GeForce GTX 980 Ti; global mem: 2083127296B; compute v5.2; clock: 1228000 kHz
Running global reduce
average time elapsed: 0.138348
wlsh@wlsh-ThinkStation:~/Desktop/cs344-master/Lesson Code Snippets/Lesson 3 Code Snippets$ ./a.out 1
Using device 0:
GeForce GTX 980 Ti; global mem: 2083127296B; compute v5.2; clock: 1228000 kHz
Running reduce with shared mem
average time elapsed: 0.117061

13.Scan

14.Balancing Checkbook With Scan

15.Exercise: Inputs to Scan

16.Exercise: What Scan Actually Does

17.Why Do We Care About Parallel Scan

18.Inclusive Vs. Exclusive Scan
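
As a concrete example with the + operator: for the input [1, 2, 3, 4], an inclusive scan returns [1, 3, 6, 10] (output i includes input i), while an exclusive scan returns [0, 1, 3, 6] (output i is the sum of everything strictly before input i, starting from the identity element 0).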

19.Exercise: Serial Implementation of Scan

#include <stdio.h>

int main(int argc, char **argv)
{
    const int ARRAY_SIZE = 10;

    int acc = 0;                 // running sum (identity element for +)
    int out[ARRAY_SIZE];
    int elements[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

    // inclusive scan: out[i] = elements[0] + elements[1] + ... + elements[i]
    for (int i = 0; i < ARRAY_SIZE; i++) {
        acc = acc + elements[i];
        out[i] = acc;
    }

    for (int i = 0; i < ARRAY_SIZE; i++) {
        printf("%i ", out[i]);
    }
    printf("\n");

    return 0;
}

20.Exercise: Inclusive Scan Revisited

If each output is computed independently, the n-th output needs n-1 additions, so the total work is 0 + 1 + 2 + ... + (n-1) = n(n-1)/2 = O(n²).

A serial scan, by contrast, reuses the running sum and needs only O(n) steps (one addition per output, O(n) work).

21.Hillis Steele vs Blelloch Scan

22.Exercise: Hillis Steele Scan
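
A single-block sketch of the Hillis Steele inclusive scan (my own illustration, not the course's solution code): it assumes the whole input fits in one block and double-buffers the data in 2*n floats of shared memory. Each of the log2(n) steps adds in the neighbor "offset" positions to the left, so the algorithm takes O(log n) steps but does O(n log n) work.

__global__ void hillis_steele_scan(float *d_out, const float *d_in, int n)
{
    extern __shared__ float temp[];        // 2*n floats: double buffer
    int tid  = threadIdx.x;
    int pout = 0, pin = 1;

    temp[pout * n + tid] = d_in[tid];      // inclusive scan: load the input as-is
    __syncthreads();

    for (int offset = 1; offset < n; offset <<= 1)
    {
        pout = 1 - pout;                   // swap which buffer is read and which is written
        pin  = 1 - pout;
        if (tid >= offset)
            temp[pout * n + tid] = temp[pin * n + tid] + temp[pin * n + tid - offset];
        else
            temp[pout * n + tid] = temp[pin * n + tid];
        __syncthreads();                   // finish this step before starting the next
    }
    d_out[tid] = temp[pout * n + tid];
}

// launch with one block of n threads and 2*n*sizeof(float) of shared memory:
// hillis_steele_scan<<<1, n, 2 * n * sizeof(float)>>>(d_out, d_in, n);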

23.Exercise: Blelloch Scan
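
A single-block sketch of the Blelloch exclusive scan, again my own illustration rather than the course's code: it assumes n is a power of two, launches n/2 threads, and uses n floats of shared memory. The upsweep phase builds a reduction tree of partial sums; the downsweep phase clears the root to the identity element and pushes the partial sums back down, giving O(n) work in roughly 2*log2(n) steps.

__global__ void blelloch_scan(float *d_out, const float *d_in, int n)
{
    extern __shared__ float temp[];        // n floats
    int tid    = threadIdx.x;              // n/2 threads per block
    int offset = 1;

    // each thread loads two elements
    temp[2 * tid]     = d_in[2 * tid];
    temp[2 * tid + 1] = d_in[2 * tid + 1];

    // upsweep (reduce) phase: build partial sums up the tree
    for (int d = n >> 1; d > 0; d >>= 1)
    {
        __syncthreads();
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset <<= 1;
    }

    // clear the root: the identity element for +
    if (tid == 0) temp[n - 1] = 0.0f;

    // downsweep phase: push partial sums back down the tree
    for (int d = 1; d < n; d <<= 1)
    {
        offset >>= 1;
        __syncthreads();
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            float t  = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();

    // write out the exclusive scan result
    d_out[2 * tid]     = temp[2 * tid];
    d_out[2 * tid + 1] = temp[2 * tid + 1];
}

// launch with one block of n/2 threads and n*sizeof(float) of shared memory:
// blelloch_scan<<<1, n / 2, n * sizeof(float)>>>(d_out, d_in, n);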

24.Analyzing The Blelloch Downsweep

25.Exercise: Which Scan Should You Use

26.Exercise: Histogram

27.Exercise: Serial Implementation of Histogram
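
A minimal serial histogram sketch in C (the input values and the bin function value % BIN_COUNT are placeholders of mine): one pass over the data, compute each item's bin, increment that bin's counter.

#include <stdio.h>

#define BIN_COUNT 16

int main(void)
{
    int in[] = {3, 19, 4, 35, 3, 16, 7, 7};
    int n = sizeof(in) / sizeof(in[0]);
    unsigned int bins[BIN_COUNT] = {0};

    for (int i = 0; i < n; i++) {
        int bin = in[i] % BIN_COUNT;      // toy binning function
        bins[bin]++;
    }

    for (int b = 0; b < BIN_COUNT; b++)
        printf("bin %d: %u\n", b, bins[b]);

    return 0;
}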

28.Parallel Implementation of Histogram

29.Exercise: Implementing Histogram Using Atomics
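
A sketch of the atomic approach (the kernel name, bin function, and bounds check are my own choices): every thread computes its item's bin and increments the shared global counter with atomicAdd, so colliding updates are serialized by the hardware instead of being lost to a read-modify-write race.

__global__ void atomic_histo(unsigned int *d_bins, const int *d_in,
                             int bin_count, int n)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    if (myId >= n)
        return;
    int myBin = d_in[myId] % bin_count;   // toy binning function
    atomicAdd(&d_bins[myBin], 1);         // safe concurrent increment
}

// launch with enough threads to cover the input, e.g.:
// atomic_histo<<<(n + 255) / 256, 256>>>(d_bins, d_in, bin_count, n);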

30.Implementing Histogram Using Local Memory
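
A sketch of the local-histogram idea under my own assumptions (MAX_BINS, items_per_thread, and the output layout are illustrative): each thread accumulates a private histogram with no atomics at all, then writes it out so the per-thread histograms can be combined afterwards, e.g. by the per-bin reduction in the next section.

#define MAX_BINS 16                       // illustrative upper bound on bin_count

__global__ void local_histo(unsigned int *d_per_thread_bins, const int *d_in,
                            int bin_count, int items_per_thread)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;

    unsigned int my_bins[MAX_BINS] = {0}; // private per-thread histogram
    for (int i = 0; i < items_per_thread; i++)
    {
        int item = d_in[myId * items_per_thread + i];
        my_bins[item % bin_count]++;      // no atomics: only this thread touches my_bins
    }

    // write this thread's histogram to global memory so a later per-bin
    // reduce can combine all per-thread histograms into the global histogram
    for (int b = 0; b < bin_count; b++)
        d_per_thread_bins[myId * bin_count + b] = my_bins[b];
}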

31.Calculating Global Histogram Using Reduction

32.Sort Then Reduce By key

33.Exercise: What Operation Should We Use

34.Exercise: Final Thoughts on Histogram

35.Description of Problem Set #3

36.Congratulations
