【Participate in CUDA online training camp】--CUDA programming model thread organization

When the GPU manages threads, they are scheduled onto the SMs in units of blocks. Within each block, execution proceeds in units of warps, and each warp contains 32 threads.

1. Thread level

Thread
A thread is the most basic unit; 32 threads form a warp, and each warp executes a single instruction stream.
Thread Block:
Threads within the same thread block can share a storage unit (shared memory). The SM is the hardware level: one SM can execute multiple blocks, but a block executes on only one SM.
Thread Grid:
a collection of thread blocks. The thread grid is composed of multiple thread blocks, and each thread block contains several threads.
insert image description here

2. Multi-threaded kernel function. Thread index

Use threadIdx to get the index of the current thread within its thread block, and blockIdx to get the index of that thread block within the grid. threadIdx.x is the x-direction index of the thread executing the current kernel function; blockIdx.x is the x-direction index of the block that thread belongs to. For example, with 8 threads per group, threadIdx.x is the index within the group, blockIdx.x is the group index, and blockDim.x is the number of threads in each group.

insert image description here
.

experiment

Vector addition:

CPU execution

#include <math.h>
#include <stdlib.h>
#include <stdio.h>

/*
 * Element-wise vector addition on the host: z[i] = x[i] + y[i].
 *
 * x, y : input vectors of length N (read-only)
 * z    : output vector of length N
 * N    : number of elements; N == 0 is a no-op
 */
void add(const double *x, const double *y, double *z, const int N)
{
    int i = 0;
    while (i < N)
    {
        z[i] = x[i] + y[i];
        ++i;
    }
}

/*
 * Verify that every element of z equals 3 within an absolute tolerance
 * of 1e-10, then print "Pass" or "Errors" to stdout.
 *
 * z : vector of length N to validate
 * N : number of elements (N == 0 prints "Pass")
 */
void check(const double *z, const int N)
{
    bool ok = true;
    for (int i = 0; i < N; ++i)
    {
        if (fabs(z[i] - 3) > (1.0e-10))
        {
            ok = false;
        }
    }
    printf("%s\n", ok ? "Pass" : "Errors");
}


/*
 * Host-only vector addition demo: fill x with 1 and y with 2,
 * compute z = x + y, and verify every z[i] == 3.
 *
 * Fix: the original never checked the malloc results — each array is
 * ~800 MB, so allocation can realistically fail and the fill loop would
 * then dereference NULL.
 */
int main(void)
{
    const int N = 100000000;
    const size_t M = sizeof(double) * (size_t) N; /* bytes per array */
    double *x = (double *) malloc(M);
    double *y = (double *) malloc(M);
    double *z = (double *) malloc(M);
    if (x == NULL || y == NULL || z == NULL)
    {
        fprintf(stderr, "malloc failed\n");
        free(x); /* free(NULL) is a no-op, so this is safe */
        free(y);
        free(z);
        return 1;
    }

    for (int n = 0; n < N; ++n)
    {
        x[n] = 1;
        y[n] = 2;
    }

    add(x, y, z, N);
    check(z, N);

    free(x);
    free(y);
    free(z);
    return 0;
}

To change to GPU execution,
first transfer the data to the GPU, and when the GPU completes the calculation, transfer the data from the GPU to the CPU memory. At this time, we need to consider how to apply for the GPU storage unit, as well as the data transmission between the memory and the video memory.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Element-wise vector addition kernel: z[i] = x[i] + y[i] for i in [0, count).
//
// Improvement: a grid-stride loop instead of a single guarded index, so the
// kernel is correct for ANY 1-D grid/block configuration — including grids
// smaller than `count` — and still behaves identically when the launch
// provides one thread per element (each thread then runs one iteration).
void __global__ add(const double *x, const double *y, double *z, int count)
{
    const int stride = gridDim.x * blockDim.x;
    for (int n = blockDim.x * blockIdx.x + threadIdx.x; n < count; n += stride)
    {
        z[n] = x[n] + y[n];
    }
}
/*
 * Scan z for elements that differ from 3 by more than 1e-10 and print
 * "Errors" if any were found, otherwise "Pass".
 *
 * z : host vector of length N (result copied back from the device)
 * N : number of elements (N == 0 prints "Pass")
 */
void check(const double *z, const int N)
{
    int mismatches = 0;
    for (int i = 0; i < N; ++i)
    {
        if (fabs(z[i] - 3) > (1.0e-10))
        {
            ++mismatches;
        }
    }
    printf("%s\n", mismatches ? "Errors" : "Pass");
}


// Abort with a diagnostic when a CUDA runtime call reports an error.
// `what` names the failing call for the error message.
static void cudaCheck(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * GPU vector addition demo: fill h_x with 1 and h_y with 2 on the host,
 * add them on the device, copy the result back, and verify h_z == 3.
 *
 * Fixes: the original ignored every CUDA return code and never called
 * cudaGetLastError() after the launch, so any failure (bad config, OOM,
 * illegal address) would silently produce garbage; host mallocs were
 * also unchecked.
 */
int main(void)
{
    const int N = 1000;
    const int M = sizeof(double) * N; /* bytes per array */
    double *h_x = (double *) malloc(M);
    double *h_y = (double *) malloc(M);
    double *h_z = (double *) malloc(M);
    if (h_x == NULL || h_y == NULL || h_z == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        return EXIT_FAILURE;
    }

    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 1;
        h_y[n] = 2;
    }

    double *d_x, *d_y, *d_z;
    cudaCheck(cudaMalloc((void **)&d_x, M), "cudaMalloc d_x");
    cudaCheck(cudaMalloc((void **)&d_y, M), "cudaMalloc d_y");
    cudaCheck(cudaMalloc((void **)&d_z, M), "cudaMalloc d_z");
    cudaCheck(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice), "copy x H2D");
    cudaCheck(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice), "copy y H2D");

    /* One thread per element; ceil-divide so the last partial block is covered. */
    const int block_size = 128;
    const int grid_size = (N + block_size - 1) / block_size;
    add<<<grid_size, block_size>>>(d_x, d_y, d_z, N);
    cudaCheck(cudaGetLastError(), "kernel launch");

    /* Blocking D2H memcpy also synchronizes with the kernel above. */
    cudaCheck(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost), "copy z D2H");
    check(h_z, N);

    free(h_x);
    free(h_y);
    free(h_z);
    cudaCheck(cudaFree(d_x), "cudaFree d_x");
    cudaCheck(cudaFree(d_y), "cudaFree d_y");
    cudaCheck(cudaFree(d_z), "cudaFree d_z");
    return 0;
}

Compile and view the result:
insert image description here

Supongo que te gusta

Origin blog.csdn.net/weixin_47665864/article/details/128920245
Recomendado
Clasificación