CUDA:在 GPU 和 CPU 之间复制数组

/*
 * CUDACHECK(cmd): evaluate a CUDA runtime call exactly once and terminate the
 * process if it did not return cudaSuccess.
 *
 * On failure the source file, line number and cudaGetErrorString() text are
 * written to stderr and the process exits with a failure status.
 *
 * - `cmd` is parenthesized so that any expression can be passed safely.
 * - The temporary uses a reserved-looking name so that it cannot capture a
 *   variable named `e` that appears inside `cmd`.
 * - The do { ... } while(0) wrapper makes the macro behave as one statement,
 *   e.g. inside an unbraced if/else.
 */
#define CUDACHECK(cmd) do {                                   \
  cudaError_t cuda_check_err_ = (cmd);                        \
  if (cuda_check_err_ != cudaSuccess) {                       \
    fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n",        \
            __FILE__, __LINE__,                               \
            cudaGetErrorString(cuda_check_err_));             \
    exit(EXIT_FAILURE);                                       \
  }                                                           \
} while(0)
// Setup for nDev GPUs: allocate one send and one receive buffer per device,
// upload a small host array into every send buffer, create one stream per
// device, then copy each send buffer back to the host and print it.
// NOTE(review): localRank and myRank are not defined in this fragment; they
// are expected to come from the surrounding code.
int nDev = 2;
float** sendbuff = (float**)malloc(nDev * sizeof(float*));
float** recvbuff = (float**)malloc(nDev * sizeof(float*));
cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nDev);
int size = 4;  // number of floats in each device buffer

if (sendbuff == NULL || recvbuff == NULL || s == NULL) {
  fprintf(stderr, "Failed: host malloc %s:%d\n", __FILE__, __LINE__);
  exit(1);
}

{
  // Build the host source array once; its contents are the same for every
  // device, so there is no need to reallocate it per iteration.
  float* h_arr = (float*)malloc(size * sizeof(float));
  if (h_arr == NULL) {
    fprintf(stderr, "Failed: host malloc %s:%d\n", __FILE__, __LINE__);
    exit(1);
  }
  for (int j = 0; j < size; ++j)
    h_arr[j] = (float)j;  // Or other values

  for (int i = 0; i < nDev; ++i) {
    CUDACHECK(cudaSetDevice(localRank * nDev + i));
    CUDACHECK(cudaMalloc(sendbuff + i, size * sizeof(float)));
    CUDACHECK(cudaMalloc(recvbuff + i, size * sizeof(float)));
    // No cudaMemset on sendbuff[i]: cudaMemset fills bytes (a value of 1
    // would not yield 1.0f), and the copy below overwrites the whole buffer.
    CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));

    // Transfer the data from the CPU to the GPU.
    CUDACHECK(cudaMemcpy(sendbuff[i], h_arr, size * sizeof(float),
                         cudaMemcpyHostToDevice));

    CUDACHECK(cudaStreamCreate(s + i));
  }

  free(h_arr);
}

{
  // One host-side receive buffer reused for every device and released at the
  // end, instead of leaking a fresh allocation per iteration.
  float* recvCPU = (float*)malloc(size * sizeof(float));
  if (recvCPU == NULL) {
    fprintf(stderr, "Failed: host malloc %s:%d\n", __FILE__, __LINE__);
    exit(1);
  }

  for (int i = 0; i < nDev; ++i) {
    CUDACHECK(cudaSetDevice(localRank * nDev + i));
    // Copy the data from the GPU back to the CPU.
    CUDACHECK(cudaMemcpy(recvCPU, sendbuff[i], sizeof(float) * size,
                         cudaMemcpyDeviceToHost));
    // The format string prints exactly four elements, matching size == 4.
    printf("Begin Reduce Dev is %d of process myRank is %d, RecvBUf is %f,%f,%f,%f\n",
           i, myRank, recvCPU[0], recvCPU[1], recvCPU[2], recvCPU[3]);
  }

  free(recvCPU);
}
// sendbuff, recvbuff and s (and the device memory/streams they refer to) are
// intentionally left alive here; release them with cudaFree /
// cudaStreamDestroy / free once the code that follows no longer needs them.

猜你喜欢

转载自blog.csdn.net/TH_NUM/article/details/81098296