CUDA programming model series three (matrix multiplication)

This installment of the tutorial series walks through the concrete CUDA code for multiplying two square matrices, comparing a GPU kernel against a CPU reference implementation.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 32
// Example: row-major square matrix multiplication, GPU vs. CPU reference
// a[][] * b[][] = c[][]
// 

//                         b00 b01 b02 b03
//                         b10 b11 b12 b13
//                         b20 b21 b22 b23
//                         b30 b31 b32 b33
//
// a00 a01 a02 a03         c00 c01 c02 c03
// a10 a11 a12 a13         c10 c11 c12 c13    
// a20 a21 a22 a23         c20 c21 c22 c23
// a30 a31 a32 a33         c30 c31 c32 c33
//
// c21 = a20 * b01 + a21 * b11 + a22 * b21 + a23 * b31
// a00 a01 a02 a03 a10 a11 a12 a13 a20 a21 a22 a23 a30 a31 a32 a33
// 0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
// b00 b01 b02 b03 b10 b11 b12 b13 b20 b21 b22 b23 b30 b31 b32 b33
//
// index = y * size + x
// step 0 -> 3
// a_index = y * size + step;
// b_index = step * size + x;

// Naive O(size^3) CPU reference for square matrix multiplication.
// Computes c = a * b, where a, b, c are size x size row-major int
// matrices stored in flat arrays of length size*size.
//
// Parameters:
//   a, b - input matrices (read-only)
//   c    - output matrix; every element is overwritten
//   size - matrix dimension (rows == cols)
void cpu_matrix_mult(const int *a, const int *b, int *c, const int size)
{
    for (int y = 0; y < size; ++y)
    {
        const int row = y * size;  // hoist loop-invariant row offset
        for (int x = 0; x < size; ++x)
        {
            int tmp = 0;
            for (int step = 0; step < size; ++step)
            {
                tmp += a[row + step] * b[step * size + x];
            }
            c[row + x] = tmp;
        }
    }
}

// CUDA kernel: c = a * b for size x size row-major int matrices.
// One thread computes one element of c.
//
// Expected launch: 2D grid of 2D blocks covering at least size x size
// threads (e.g. ceil(size/BLOCK_SIZE) blocks per axis); threads outside
// the matrix are guarded out, so the grid need not divide size evenly.
// Read-only inputs are const __restrict__ so loads can go through the
// read-only data cache.
__global__ void gpu_matrix_mult(const int * __restrict__ a,
                                const int * __restrict__ b,
                                int * __restrict__ c,
                                const int size)
{
    const int y = blockDim.y * blockIdx.y + threadIdx.y;  // row of c
    const int x = blockDim.x * blockIdx.x + threadIdx.x;  // column of c
    if (x < size && y < size)
    {
        int tmp = 0;
        for (int step = 0; step < size; ++step)
        {
            tmp += a[y * size + step] * b[step * size + x];
        }
        c[y * size + x] = tmp;
    }
}



// Abort with a readable message if a CUDA runtime call failed.
// Kernel launches report nothing directly, so the launch is checked via
// cudaGetLastError() and execution via cudaDeviceSynchronize().
static void cuda_check(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Host driver: fills two random matrices, multiplies them on the GPU and
// on the CPU, and compares the results element by element.
int main()
{
    const int matrix_size = 1000;
    const size_t memsize = sizeof(int) * matrix_size * matrix_size;

    // Pinned host buffers: faster host<->device transfers than pageable memory.
    int *h_a, *h_b, *h_c, *h_cc;
    cuda_check(cudaMallocHost((void**)&h_a, memsize), "cudaMallocHost h_a");
    cuda_check(cudaMallocHost((void**)&h_b, memsize), "cudaMallocHost h_b");
    cuda_check(cudaMallocHost((void**)&h_c, memsize), "cudaMallocHost h_c");
    cuda_check(cudaMallocHost((void**)&h_cc, memsize), "cudaMallocHost h_cc");

    // Random inputs in [0, 1023]; 1023*1023*1000 still fits in a 32-bit int.
    for (int y = 0; y < matrix_size; ++y)
        for (int x = 0; x < matrix_size; ++x)
            h_a[y * matrix_size + x] = rand() % 1024;

    for (int y = 0; y < matrix_size; ++y)
        for (int x = 0; x < matrix_size; ++x)
            h_b[y * matrix_size + x] = rand() % 1024;

    int *d_a, *d_b, *d_c;
    cuda_check(cudaMalloc((void**)&d_a, memsize), "cudaMalloc d_a");
    cuda_check(cudaMalloc((void**)&d_b, memsize), "cudaMalloc d_b");
    cuda_check(cudaMalloc((void**)&d_c, memsize), "cudaMalloc d_c");

    cuda_check(cudaMemcpy(d_a, h_a, memsize, cudaMemcpyHostToDevice), "copy h_a->d_a");
    cuda_check(cudaMemcpy(d_b, h_b, memsize, cudaMemcpyHostToDevice), "copy h_b->d_b");

    // Ceil-division so the grid covers the matrix even when BLOCK_SIZE
    // does not divide matrix_size evenly (the kernel guards the tail).
    const unsigned int grid_rows = (matrix_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const unsigned int grid_cols = (matrix_size + BLOCK_SIZE - 1) / BLOCK_SIZE;

    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);  // 32x32 = 1024 threads, the per-block maximum

    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, matrix_size);
    cuda_check(cudaGetLastError(), "kernel launch");          // bad launch config
    cuda_check(cudaDeviceSynchronize(), "kernel execution");  // in-kernel faults

    cuda_check(cudaMemcpy(h_c, d_c, memsize, cudaMemcpyDeviceToHost), "copy d_c->h_c");

    cpu_matrix_mult(h_a, h_b, h_cc, matrix_size);

    // Integer arithmetic is exact on both host and device, so the two
    // results must match bit for bit -- no floating-point tolerance needed.
    bool errors = false;
    for (int i = 0; i < matrix_size * matrix_size; ++i)
    {
        if (h_c[i] != h_cc[i])
        {
            errors = true;
            break;
        }
    }
    printf("Result: %s\n", errors ? "Errors" : "Passed");

    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
    cudaFreeHost(h_cc);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}


You may also like

Origin blog.csdn.net/kunhe0512/article/details/131369624