NVIDIA CUDA Highly Parallel Processor Programming (4): Performance Optimization Exercises

1. The two reduction kernels in the performance-optimization chapter use threads wastefully: half of the threads in each block never perform any work. Modify the kernels to eliminate this waste. Give the configuration parameters used to launch the modified kernel. Are any additional arithmetic operations introduced? What type of resource limitation does this modification remove?

Answer: modified version of the first kernel:

unsigned int t = threadIdx.x * 2;
for(int stride = 1; stride <= blockDim.x; stride <<= 1){
    if(t % (2 * stride) == 0){
        partialSum[t] += partialSum[t + stride];
    }
    __syncthreads();
}

Modified version of the second kernel:

__shared__ double partialSum[2 * THREAD_LENGTH];
unsigned int t = threadIdx.x;
for(unsigned int stride = blockDim.x; stride > 0; stride >>= 1){
    __syncthreads();
    if(t < stride)
        partialSum[t] += partialSum[t + stride];
}

Launch configuration:

reduction_sum<<<ceil((double)n/THREAD_LENGTH/2), THREAD_LENGTH>>>(d_A, n);

In the first kernel, t becomes twice the thread index (an extra multiplication per thread) and the loop's end condition changes from blockDim.x/2 to blockDim.x, so roughly two additional operations are introduced per thread.

In the second kernel, only the initial value of stride in the loop is doubled (from blockDim.x/2 to blockDim.x), so at most one additional operation is introduced per thread, and it lies outside the loop.

Because each block now launches only half as many threads while reducing the same number of elements, the modification relieves the limit on the number of thread slots per SM: every block occupies half as many thread slots, so more blocks can be resident at the same time.
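For reference, a sketch (from memory, so variable names may differ slightly) of the two unmodified kernels from the text, in which half of each block's threads never perform an addition:

// original kernel 1: only threads whose index is a multiple of 2*stride add
unsigned int t = threadIdx.x;
for(unsigned int stride = 1; stride <= blockDim.x/2; stride <<= 1){
    __syncthreads();
    if(t % (2*stride) == 0)
        partialSum[t] += partialSum[t + stride];
}

// original kernel 2: the upper half of the threads (t >= blockDim.x/2) never adds
unsigned int t = threadIdx.x;
for(unsigned int stride = blockDim.x/2; stride > 0; stride >>= 1){
    __syncthreads();
    if(t < stride)
        partialSum[t] += partialSum[t + stride];
}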

2. Comparing the two modified kernels from the previous exercise, which modification introduces fewer operations?

The second: the first modification adds a multiplication of the thread ID that every thread must execute, while the second only changes the starting value of stride, so the second introduces fewer operations.

3. Starting from Exercise 1, write a complete kernel: (1) add statements that load a section of the input array from global memory into shared memory; (2) use the variable blockIdx.x so that multiple blocks work on different sections of the array; (3) write the reduced value of each section to a location determined by blockIdx.x.

kernel:

#define THREAD_LENGTH 1024  // threads per block (also sets the shared-memory tile size)
__global__ void reduction_sum(double *X, size_t input_size){
    __shared__ double partialSum[2 * THREAD_LENGTH];
    // each block loads a 2 * blockDim.x section of the input into shared memory
    int i = 2 * blockIdx.x * blockDim.x + threadIdx.x;
    if(i < input_size) partialSum[threadIdx.x] = X[i];
    else partialSum[threadIdx.x] = 0.0;
    if(i + blockDim.x < input_size) partialSum[threadIdx.x + blockDim.x] = X[i + blockDim.x];
    else partialSum[threadIdx.x + blockDim.x] = 0.0;
    __syncthreads();
    // reduction without control divergence
    unsigned int t = threadIdx.x;
    for(int stride = blockDim.x; stride >= 1; stride /= 2){
        if(t < stride)
            partialSum[t] += partialSum[t + stride];
        __syncthreads();
    }
    // alternative with control divergence:
    // unsigned int t = threadIdx.x * 2;
    // for(int stride = 1; stride <= blockDim.x; stride <<= 1){
    //     if(t % (2 * stride) == 0){
    //         partialSum[t] += partialSum[t + stride];
    //     }
    //     __syncthreads();
    // }
    // each block writes its partial result to the location given by blockIdx.x
    if(t == 0){
        X[blockIdx.x] = partialSum[t];
    }
}
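For a single pass over an array of n elements, this kernel is launched with one block per 2 * THREAD_LENGTH elements, e.g. using the same configuration as in Exercise 1; afterwards X[b] holds the sum of block b's section, which is what the host loop in Exercise 4 relies on:

reduction_sum<<<ceil((double)n/THREAD_LENGTH/2), THREAD_LENGTH>>>(d_A, n);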

4. Design a reduction program based on the kernel from Exercise 3. The host code should: (1) transfer the large input array into global memory; (2) call the kernel from Exercise 3 repeatedly in a loop, adjusting the configuration parameters on each pass, so that the reduction result of the entire input array is produced.

#include<cuda.h>
#include<stdlib.h>
#include<stdio.h>
#define THREAD_LENGTH 1024
// the same kernel as in Exercise 3
__global__ void reduction_sum(double *X, size_t input_size){
    __shared__ double partialSum[2 * THREAD_LENGTH];
    int i = 2 * blockIdx.x * blockDim.x + threadIdx.x;
    if(i < input_size) partialSum[threadIdx.x] = X[i];
    else partialSum[threadIdx.x] = 0.0;
    if(i + blockDim.x < input_size) partialSum[threadIdx.x + blockDim.x] = X[i + blockDim.x];
    else partialSum[threadIdx.x + blockDim.x] = 0.0;
    __syncthreads();
    // reduction without control divergence
    unsigned int t = threadIdx.x;
    for(int stride = blockDim.x; stride >= 1; stride /= 2){
        if(t < stride)
            partialSum[t] += partialSum[t + stride];
        __syncthreads();
    }
    // alternative with control divergence:
    // unsigned int t = threadIdx.x * 2;
    // for(int stride = 1; stride <= blockDim.x; stride <<= 1){
    //     if(t % (2 * stride) == 0){
    //         partialSum[t] += partialSum[t + stride];
    //     }
    //     __syncthreads();
    // }
    if(t == 0){
        X[blockIdx.x] = partialSum[t];
    }
}
// host code
double reduceArray(double* array, unsigned int length){
    double *d_A;
    int size = length * sizeof(double);
    cudaMalloc(&d_A, size);
    cudaMemcpy(d_A, array, size, cudaMemcpyHostToDevice);
    // each block reduces 2 * THREAD_LENGTH elements
    int num_blocks = (length - 1)/THREAD_LENGTH/2 + 1;
    while(num_blocks >= 1){
        reduction_sum<<<num_blocks, THREAD_LENGTH>>>(d_A, length);
        if(num_blocks == 1)
            break;
        // the per-block partial sums become the input of the next pass
        length = num_blocks;
        num_blocks = (num_blocks - 1)/THREAD_LENGTH/2 + 1;
    }
    double result = 0.0;
    cudaMemcpy(&result, d_A, sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(d_A);
    return result;
}
// test
int main(int argc, char **argv){
    int n = atoi(argv[1]);
    double *A = (double *)malloc(n * sizeof(double));
    for(int i = 0; i < n; ++i){
        A[i] = 1.0;   // the sum of n ones should equal n
    }
    double result = reduceArray(A, n);
    printf("%lf\n", result);
    free(A);
    return 0;
}
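To try the program (a sketch; the file name reduce.cu and the input size are arbitrary), compile it with nvcc -o reduce reduce.cu and run ./reduce 1000000. Since every element is initialized to 1.0, the printed sum should equal the element count, here 1000000.000000.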

Run result: (screenshot of the program output omitted)

5. For the matrix multiplication kernel in the performance-optimization chapter, using a small 16 × 16 matrix as the example, draw the access pattern of lines 9 and 10 for all threads of one warp. Work out the values of tx and ty for each thread in the warp, and use them to evaluate the index expressions of d_M and d_N in lines 9 and 10. Show that the threads do access consecutive d_M and d_N locations in global memory during each iteration.

Assume the thread block is 16 × 16; the first warp then consists of the two rows of threads with threadIdx.y = 0 and threadIdx.y = 1.
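The two statements being analyzed (lines 9 and 10 of the tiled kernel in the text) look roughly like the following, where Mds and Nds are the shared-memory tiles and tx, ty, bx, by stand for threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y:

Mds[ty][tx] = d_M[(by * TILE_WIDTH + ty) * Width + m * TILE_WIDTH + tx];  // line 9
Nds[ty][tx] = d_N[(m * TILE_WIDTH + ty) * Width + bx * TILE_WIDTH + tx];  // line 10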
Index of d_M:
(blockIdx.y × TILE_WIDTH + threadIdx.y) × Width + m × TILE_WIDTH + threadIdx.x
= (blockIdx.y × Width + m) × TILE_WIDTH + threadIdx.y × Width + threadIdx.x

Index of d_N:
(m × TILE_WIDTH + threadIdx.y) × Width + blockIdx.x × TILE_WIDTH + threadIdx.x
= (m × Width + blockIdx.x) × TILE_WIDTH + threadIdx.y × Width + threadIdx.x

In a given iteration, the parenthesized part has the same value for every thread of a block, so it only remains to show that the remaining term, threadIdx.y × Width + threadIdx.x (with Width = 16 here, i.e. threadIdx.y × 16 + threadIdx.x), takes consecutive values for the threads of one warp:

tx   ty   threadIdx.y × Width + threadIdx.x
0 0 0
1 0 1
2 0 2
3 0 3
4 0 4
5 0 5
6 0 6
7 0 7
8 0 8
9 0 9
10 0 10
11 0 11
12 0 12
13 0 13
14 0 14
15 0 15
0 1 16
1 1 17
2 1 18
3 1 19
4 1 20
5 1 21
6 1 22
7 1 23
8 1 24
9 1 25
10 1 26
11 1 27
12 1 28
13 1 29
14 1 30
15 1 31

Proven.

However, when the thread block (and tile) is this small relative to the warp, one warp spans two rows of threads. Only the accesses within the same row are contiguous; in general (when Width > TILE_WIDTH) the two rows' segments are separated in memory and cannot be coalesced into a single transaction. For example, with Width = 64 the first warp's d_M accesses fall into two disjoint 16-element segments, at offsets 0..15 and 64..79 from the tile's base.

10. For the design in the figure below, write the matrix multiplication kernel function.

(figure from the exercise, omitted)

#define BLOCK_SIZE 16
__global__ void
matrixMul(float *Pd, float *Md, float *Nd, int widthM, int widthN)
{
    unsigned int bx = blockIdx.x;
    unsigned int by = blockIdx.y;
    unsigned int tx = threadIdx.x;
    unsigned int ty = threadIdx.y;
    __shared__ float Ms[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Ns[BLOCK_SIZE][BLOCK_SIZE * 2];
    // index of the top-left element of the first sub-matrix of M processed by this block
    int mBegin = widthM * BLOCK_SIZE * by;
    // index of the top-right element of the last sub-matrix of M, used to end the loop
    int mEnd = mBegin + widthM - 1;
    // step added to the M sub-matrix index on each iteration
    int mStep = BLOCK_SIZE;
    // index of the top-left element of the first sub-matrix of N processed by this block
    // (each block covers a 2 * BLOCK_SIZE wide strip of N and P)
    int nBegin = 2 * BLOCK_SIZE * bx;
    // step added to the N sub-matrix index on each iteration
    int nStep = BLOCK_SIZE * widthN;
    // Psub1/Psub2 hold the partial sums of the two output elements computed by this thread
    float Psub1 = 0.0f;
    float Psub2 = 0.0f;
    // iterate over all sub-matrices
    for (int m = mBegin, n = nBegin; m <= mEnd; m += mStep, n += nStep)
    {
        // load the tiles into shared memory; each thread loads 3 elements
        Ms[ty][tx] = Md[m + widthM * ty + tx];
        Ns[ty][tx] = Nd[n + widthN * ty + tx];
        Ns[ty][tx + blockDim.x] = Nd[n + widthN * ty + tx + blockDim.x];

        __syncthreads();
        // compute two output elements per thread
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            Psub1 += Ms[ty][k] * Ns[k][tx];
            Psub2 += Ms[ty][k] * Ns[k][tx + blockDim.x];
        }

        __syncthreads();
    }
    // write back the results; each thread writes two elements
    int p = widthN * BLOCK_SIZE * by + 2 * BLOCK_SIZE * bx;
    p += widthN * ty + tx;
    Pd[p] = Psub1;
    Pd[p + blockDim.x] = Psub2;
}
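A minimal sketch of a matching launch, assuming heightM (the number of rows of M and Pd, a name introduced here) is a multiple of BLOCK_SIZE and widthN is a multiple of 2 * BLOCK_SIZE:

dim3 block(BLOCK_SIZE, BLOCK_SIZE);
// each block produces one BLOCK_SIZE x (2 * BLOCK_SIZE) tile of Pd
dim3 grid(widthN / (2 * BLOCK_SIZE), heightM / BLOCK_SIZE);
matrixMul<<<grid, block>>>(Pd, Md, Nd, widthM, widthN);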

12. In order to improve performance, a young engineer uses the following kernel for the reduction. (A) Do you think performance will improve? (B) Should the engineer be rewarded or punished? Why?

extern __shared__ float partialSum[];
unsigned int tid = threadIdx.x;
for (unsigned int stride = n >> 1; stride >= 32; stride >>= 1)
{
    __syncthreads();
    if (tid < stride)
        partialSum[tid] += partialSum[tid + stride];
}
__syncthreads();
if (tid < 32)
{
    // unroll last 5 predicated steps
    partialSum[tid] += partialSum[tid + 16];
    partialSum[tid] += partialSum[tid + 8];
    partialSum[tid] += partialSum[tid + 4];
    partialSum[tid] += partialSum[tid + 2];
    partialSum[tid] += partialSum[tid + 1];
}

Performance will improve, and the engineer should be rewarded.

Once stride drops below 32, only the threads of a single warp still perform useful work in each iteration; the remaining warps contribute nothing, which wastes execution resources. Unrolling the last five steps for that single warp removes the loop overhead and the __syncthreads() calls for those iterations (the 32 threads of one warp execute together), so fewer instructions are executed and performance improves.
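One caveat when reusing this pattern on current GPUs (not part of the original exercise): the unrolled steps rely on the warp's 32 threads staying in lockstep, so the shared array is normally accessed through a volatile pointer (or __syncwarp() is placed between the steps) to stop the compiler from caching the partial sums in registers. A sketch of the tail written that way:

if (tid < 32)
{
    // volatile forces each partial sum to be re-read from shared memory
    volatile float *vsum = partialSum;
    vsum[tid] += vsum[tid + 16];
    vsum[tid] += vsum[tid + 8];
    vsum[tid] += vsum[tid + 4];
    vsum[tid] += vsum[tid + 2];
    vsum[tid] += vsum[tid + 1];
}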

Source: blog.csdn.net/weixin_45773137/article/details/125790114