CUDA中的cudaMemcpy2D和cudaMallocPitch使用详解

#include <stdio.h>  
#include <stdlib.h>  
#include <cuda_runtime.h> 
#define N 3 // number of rows in the matrix (height passed to cudaMallocPitch / cudaMemcpy2D)
#define M 5 // number of columns in the matrix (elements per row)
#define GridSize 16 // grid dimension used for the kernel launch in main
#define BlockSize 16 // block dimension used for the kernel launch in main
#include<iostream>
using namespace std;

// Fills an N x M pitched float matrix using 2D grid-stride loops.
//
// d_matrix : base pointer of the pitched allocation.
// pitch    : row pitch in BYTES; row r starts at (char*)d_matrix + r * pitch.
//
// Each thread keeps its own private counter and stores 1, 2, 3, ... into the
// successive elements it visits, so the value written to a given element is
// "how many elements this thread has written so far" and therefore depends on
// the launch configuration. Any grid/block shape is valid because both loops
// are bounds-checked against N and M.
__global__ void kernel(float * d_matrix, size_t pitch) {
    const int firstRow  = blockIdx.y * blockDim.y + threadIdx.y;
    const int rowStride = blockDim.y * gridDim.y;
    const int firstCol  = blockIdx.x * blockDim.x + threadIdx.x;
    const int colStride = blockDim.x * gridDim.x;

    char* const base = (char*)d_matrix;
    int written = 0;  // number of elements this thread has stored so far

    for (int row = firstRow; row < N; row += rowStride)
    {
        // Rows are 'pitch' bytes apart, so do the row offset in byte units.
        float* rowPtr = (float*)(base + row*pitch);

        int col = firstCol;
        while (col < M)
        {
            rowPtr[col] = ++written;
            col += colStride;
        }
    }
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
// 'what' names the operation being checked so the message is actionable.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Demonstrates cudaMallocPitch / cudaMemcpy2D:
//   1. allocates an N x M pitched float matrix on the device,
//   2. fills a densely packed host buffer with 0..M*N-1 and prints it,
//   3. copies it to the device, runs 'kernel' on it, copies it back,
//   4. prints the result.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
    const size_t rowBytes = M * sizeof(float);  // width of one host row in bytes

    float *d_matrix = NULL;
    size_t pitch = 0;  // device row pitch in bytes, chosen by cudaMallocPitch
    float *dc_matrix = new float[M*N];

    checkCuda(cudaMallocPitch(&d_matrix, &pitch, rowBytes, N), "cudaMallocPitch");

    for (int i = 0; i < M*N; i++)
        dc_matrix[i] = i;
    for (int i = 0; i < M*N; i++)
        printf("%.2f ", dc_matrix[i]);
    printf("\n");

    // The host buffer is tightly packed (pitch == rowBytes); the device buffer
    // uses the padded pitch returned by cudaMallocPitch.
    checkCuda(cudaMemcpy2D(d_matrix, pitch, dc_matrix, rowBytes, rowBytes, N,
                           cudaMemcpyHostToDevice),
              "cudaMemcpy2D (host to device)");

    kernel<<<GridSize, BlockSize>>>(d_matrix, pitch);
    // A launch does not return a status; launch-configuration errors are
    // reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "kernel launch");

    // cudaMemcpy2D is blocking here, so it also waits for the kernel and
    // surfaces any error raised while it was executing.
    checkCuda(cudaMemcpy2D(dc_matrix, rowBytes, d_matrix, pitch, rowBytes, N,
                           cudaMemcpyDeviceToHost),
              "cudaMemcpy2D (device to host)");

    for (int i = 0; i < M*N; i++)
        printf("%.2f ", dc_matrix[i]);
    printf("\n");

    checkCuda(cudaFree(d_matrix), "cudaFree");
    delete[] dc_matrix;  // allocated with new[], so it must not be released with free()
    return 0;
}

猜你喜欢

转载自blog.csdn.net/yujuan110/article/details/79072201