cuda programming learning - running error detection (4)

foreword

References:

Gao Sheng's blog
"CUDA C programming authoritative guide"
and CUDA official document
CUDA programming: basics and practice Fan Zheyong

Articles and explanatory videos are simultaneously updated to the public "AI Knowledge Story", station B: go out to eat three bowls of rice

1: Write the header file error.cuh

Write a header file (error.cuh) containing a macro function for detecting CUDA runtime errors; its contents are described below.
(1) #pragma once is a preprocessing directive; it ensures that the current file is not included more than once within a single compilation unit.
(2) The name of the macro function is CHECK, and the parameter call is a CUDA runtime API function.
(3) When defining a macro, if a line cannot fit, write \ at the end of the line to indicate a continuation line.
(4) Line 7 defines a variable error_code of cudaError_t type, and initializes it to the return value of the function call.
(5) Line 8 judges whether the value of the variable is cudaSuccess. If not, report the relevant file, line number, error code and error text description on lines 9-16 and exit the program. The cudaGetErrorString() in line 14 is obviously also a CUDA runtime API function, and its function is to convert the error code into a text description of the error.

#pragma once
#include <stdio.h>
#include <stdlib.h>  // exit()

// CHECK(call): wrap a CUDA runtime API call; if it does not return
// cudaSuccess, report the file, line, numeric error code and the text
// description of the error (via cudaGetErrorString), then terminate.
// The do { ... } while (0) wrapper makes the macro expand to a single
// statement, so it is safe after an unbraced `if`.
// NOTE: every continued line of a macro must end with a backslash; the
// original listing's bare `{` lines silently truncated the definition.
#define CHECK(call)                                       \
do                                                        \
{                                                         \
    const cudaError_t error_code = call;                  \
    if (error_code != cudaSuccess)                        \
    {                                                     \
        printf("CUDA Error:\n");                          \
        printf("    File:       %s\n", __FILE__);         \
        printf("    Line:       %d\n", __LINE__);         \
        printf("    Error code: %d\n", error_code);       \
        printf("    Error text: %s\n",                    \
            cudaGetErrorString(error_code));              \
        exit(1);                                          \
    }                                                     \
} while (0)

2: Write a test program

#include<stdint.h>
#include<cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>

// Error-checking macro: wraps a CUDA runtime API call; on any result
// other than cudaSuccess it prints the file, line, numeric code and text
// description, then terminates. do { ... } while (0) keeps the expansion
// a single statement. Every continued line must end in a backslash — the
// bare `{` lines in the original listing broke the macro definition.
#define CHECK(call)                                       \
do                                                        \
{                                                         \
    const cudaError_t error_code = call;                  \
    if (error_code != cudaSuccess)                        \
    {                                                     \
        printf("CUDA Error:\n");                          \
        printf("    File:       %s\n", __FILE__);         \
        printf("    Line:       %d\n", __LINE__);         \
        printf("    Error code: %d\n", error_code);       \
        printf("    Error text: %s\n",                    \
            cudaGetErrorString(error_code));              \
        exit(1);                                          \
    }                                                     \
} while (0)


// Tolerance for comparing double results in check().
const double EPSILON = 1.0e-15;
// Every element of x is set to a, every element of y to b, so each
// output element should equal c = a + b.
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__ add(const double* x, const double* y, double* z, const int N);
void check(const double* z, const int N);

int main(void)
{
    // Host setup: N doubles per vector (~0.8 GB each, so the device needs
    // roughly 2.4 GB free for d_x/d_y/d_z — TODO confirm the target GPU
    // has that much memory).
    const int N = 100000000;
    const int M = sizeof(double) * N;
    double* h_x = (double*)malloc(M);
    double* h_y = (double*)malloc(M);
    double* h_z = (double*)malloc(M);

    for (int n = 0; n < N; ++n)
    {
        // Fill inputs with the constants a and b; every output element is
        // then expected to equal c = a + b.
        h_x[n] = a;
        h_y[n] = b;
    }

    double* d_x, * d_y, * d_z;
    CHECK(cudaMalloc((void**)&d_x, M));
    CHECK(cudaMalloc((void**)&d_y, M));
    CHECK(cudaMalloc((void**)&d_z, M));
    // Deliberate error for this demo: the direction should be
    // cudaMemcpyHostToDevice. CHECK catches and reports the bad parameter.
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyDeviceToHost));//Set Error
    CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyDeviceToHost));

    const int block_size = 128;
    const int grid_size = (N + block_size - 1) / block_size;
    // Kernel launch is asynchronous and returns no status; section 3 of
    // the article adds cudaGetLastError()/cudaDeviceSynchronize() here.
    add << <grid_size, block_size >> > (d_x, d_y, d_z, N);

    CHECK(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost));
    check(h_z, N);

    free(h_x);
    free(h_y);
    free(h_z);
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_z));
    return 0;
}

// Element-wise vector addition: z[i] = x[i] + y[i] for every i < N.
// One thread per element; the guard clause skips the surplus threads of
// the final block when N is not a multiple of the block size.
void __global__ add(const double* x, const double* y, double* z, const int N)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N)
    {
        return;
    }
    z[idx] = x[idx] + y[idx];
}

// Verifies that every element of z equals the expected sum c within
// EPSILON and prints a one-line verdict. Stops scanning at the first
// mismatch — one bad element already decides the result, so there is no
// need to walk the remaining (up to 10^8) elements.
void check(const double* z, const int N)
{
    bool has_error = false;
    for (int n = 0; n < N; ++n)
    {
        if (fabs(z[n] - c) > EPSILON)
        {
            has_error = true;
            break;  // verdict settled; printed output is unchanged
        }
    }
    printf("%s\n", has_error ? "Has errors" : "No errors");
}

insert image description here

It can be seen that the macro function correctly captures the error at runtime, telling us that an illegal parameter appears in the 50th line of code in the file checkerror.cu. Illegal parameters refer to a problem with the parameters of the cudaMemcpy function, because we deliberately wrote cudaMemcpyHostToDevice as cudaMemcpyDeviceToHost. It can be seen that after using the error-checking macro function, we can get more useful error information, not just a wrong running result. From here on, we'll stick with this macro function wrapping most of the CUDA runtime API functions. One exception is the cudaEventQuery function, because it is likely to return cudaErrorNotReady, but it does not mean that the program has gone wrong.

3: Detection kernel function

The above method cannot catch errors related to calling the kernel function, because a kernel function does not return any value (recall that a kernel function must be declared void). There is, however, a way to catch errors that may occur when calling the kernel function: add the following two statements after the kernel launch:
CHECK(cudaGetLastError());
CHECK(cudaDeviceSynchronize());

The role of the first statement is to catch the last error before the second statement, and the role of the second statement is to synchronize the host and device. The reason why the host and device need to be synchronized is because the call of the kernel function is asynchronous, that is, the host will execute the following statement immediately after issuing the command to call the kernel function, and will not wait for the execution of the kernel function to complete.

Write a program to test the kernel function error

The maximum thread block size is 1024 (this is true for all architectures from Kepler to Turing). If we accidentally write the thread block size in the kernel function execution configuration as 1280, the kernel function will not be successfully called. The code on line 57 successfully catches the error, telling us that the execution configuration parameters of the kernel function in the program are wrong:

insert image description here

#include <math.h>
#include <stdio.h>
#include<stdint.h>
#include<cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>


// Error-checking macro: wraps a CUDA runtime API call; on any result
// other than cudaSuccess it prints the file, line, numeric code and text
// description, then terminates. do { ... } while (0) keeps the expansion
// a single statement. Every continued line must end in a backslash — the
// bare `{` lines in the original listing broke the macro definition.
#define CHECK(call)                                       \
do                                                        \
{                                                         \
    const cudaError_t error_code = call;                  \
    if (error_code != cudaSuccess)                        \
    {                                                     \
        printf("CUDA Error:\n");                          \
        printf("    File:       %s\n", __FILE__);         \
        printf("    Line:       %d\n", __LINE__);         \
        printf("    Error code: %d\n", error_code);       \
        printf("    Error text: %s\n",                    \
            cudaGetErrorString(error_code));              \
        exit(1);                                          \
    }                                                     \
} while (0)

// Tolerance for comparing double results in check().
const double EPSILON = 1.0e-15;
// Every element of x is set to a, every element of y to b, so each
// output element should equal c = a + b.
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__  add(const double* x, const double* y, double* z, const int N);
void check(const double* z, const int N);

int main(void)
{
    // Host setup: N doubles per vector (~0.8 GB each, so the device needs
    // roughly 2.4 GB free for d_x/d_y/d_z — TODO confirm the target GPU
    // has that much memory).
    const int N = 100000000;
    const int M = sizeof(double) * N;
    double* h_x = (double*)malloc(M);
    double* h_y = (double*)malloc(M);
    double* h_z = (double*)malloc(M);

    for (int n = 0; n < N; ++n)
    {
        // Fill inputs with the constants a and b; every output element is
        // then expected to equal c = a + b.
        h_x[n] = a;
        h_y[n] = b;
    }

    double* d_x, * d_y, * d_z;
    CHECK(cudaMalloc((void**)&d_x, M));
    CHECK(cudaMalloc((void**)&d_y, M));
    CHECK(cudaMalloc((void**)&d_z, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice));

    // Deliberate error for this demo: 1280 exceeds the 1024-threads-per-
    // block hardware limit, so the launch fails with an invalid
    // configuration argument.
    const int block_size = 1280;
    const int grid_size = (N + block_size - 1) / block_size;
    add << <grid_size, block_size >> > (d_x, d_y, d_z, N);
    // A launch returns no status: cudaGetLastError() retrieves the launch-
    // configuration error, and cudaDeviceSynchronize() surfaces any error
    // produced during the (asynchronous) kernel execution itself.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost));
    check(h_z, N);

    free(h_x);
    free(h_y);
    free(h_z);
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_z));
    return 0;
}

// Element-wise vector addition kernel: z[n] = x[n] + y[n] for n < N.
// Launched with ceil(N / block_size) blocks; the bounds check guards the
// surplus threads in the last block when N is not a multiple of the
// block size.
void __global__ add(const double* x, const double* y, double* z, const int N)
{
    // Flat global thread index.
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n < N)
    {
        z[n] = x[n] + y[n];
    }
}

// Verifies that every element of z equals the expected sum c within
// EPSILON and prints a one-line verdict. Stops scanning at the first
// mismatch — one bad element already decides the result, so there is no
// need to walk the remaining (up to 10^8) elements.
void check(const double* z, const int N)
{
    bool has_error = false;
    for (int n = 0; n < N; ++n)
    {
        if (fabs(z[n] - c) > EPSILON)
        {
            has_error = true;
            break;  // verdict settled; printed output is unchanged
        }
    }
    printf("%s\n", has_error ? "Has errors" : "No errors");
}

Guess you like

Origin blog.csdn.net/qq_40514113/article/details/130900969