CUDA--Unified Memory

在编写CUDA程序时，我们经常要分别在CPU和GPU上分配和管理内存，这就增加了编写程序的复杂度。CUDA 6引入了统一寻址（Unified Memory）技术，该技术使得CPU和GPU可以使用同一个指针对同一块内存进行处理，省去了原先分别在CPU和GPU上分配内存、然后来回拷贝的过程，简化了程序的编写。

统一寻址创建了一块托管内存（managed memory），这是CPU和GPU共享的内存，它在CPU和GPU之间架起了桥梁。CPU和GPU都能通过同一个指针访问这块托管内存。最关键的一点是，系统会自动地在host端和device端之间迁移分配在Unified Memory中的数据，这样对运行在CPU上的代码来说，它看起来就像CPU内存；对运行在GPU上的代码来说，它看起来就像GPU内存。

下面写了一个简单的向量相加程序来比较使用统一寻址和不使用统一寻址程序的复杂度和计算时间。

1.传统的向量相加程序：

#include <cuda_runtime.h>
#include <stdio.h>
#define NUM 1024*2
// Element-wise integer vector addition: c[k] = a[k] + b[k].
// Each thread handles the single element selected by its global 1D index;
// threads whose index is not below NUM exit without touching memory.
__global__ void add(int *a,int *b,int *c){
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= NUM) {
        return;  // surplus thread past the end of the vectors
    }
    c[idx] = b[idx] + a[idx];
}
// Evaluate a CUDA runtime call; on failure print the location and the
// runtime's error string to stderr and make main() return 1.
// Only usable inside a function returning int (it contains a `return`).
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Traditional (explicit-copy) vector addition benchmark.
//
// Allocates separate host and device buffers, copies the inputs to the
// device, runs the add kernel, copies the result back and prints the elapsed
// time measured with CUDA events. The timed region is the same as before:
// it starts ahead of the allocations and ends after the device-to-host copy.
//
// Returns 0 on success, 1 if an allocation or CUDA call fails or if the
// computed result does not match a[i] + b[i].
int main(){
    const int n = NUM;  // copy the macro into a typed constant before doing arithmetic on it
    const size_t bytes = (size_t)n * sizeof(int);
    const int threadsPerBlock = 1024;
    // Ceiling division: just enough blocks to cover n elements.
    const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;

    int i;
    int status = 0;
    int *a, *b, *c;
    int *a_device, *b_device, *c_device;
    cudaEvent_t start, end;
    float elapsedTime;

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&end));
    CUDA_CHECK(cudaEventRecord(start, 0));

    a = (int *)malloc(bytes);
    b = (int *)malloc(bytes);
    c = (int *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host malloc of %lu bytes failed\n", (unsigned long)bytes);
        free(a);  // free(NULL) is a no-op, so this is safe for partial failures
        free(b);
        free(c);
        return 1;
    }

    CUDA_CHECK(cudaMalloc((void **)&a_device, bytes));
    CUDA_CHECK(cudaMalloc((void **)&b_device, bytes));
    CUDA_CHECK(cudaMalloc((void **)&c_device, bytes));

    for (i = 0; i < n; i++) {
        a[i] = i;
        b[i] = i;
    }

    CUDA_CHECK(cudaMemcpy(a_device, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_device, b, bytes, cudaMemcpyHostToDevice));

    add<<<blocks, threadsPerBlock>>>(a_device, b_device, c_device);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(c, c_device, bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(end, 0));
    CUDA_CHECK(cudaEventSynchronize(end));

    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, end));
    printf("tie===%3.1f ms\n", elapsedTime);

    // Validate the result on the host (outside the timed region).
    for (i = 0; i < n; i++) {
        if (c[i] != a[i] + b[i]) {
            fprintf(stderr, "mismatch at index %d: got %d, expected %d\n",
                    i, c[i], a[i] + b[i]);
            status = 1;
            break;
        }
    }

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(end));
    CUDA_CHECK(cudaFree(a_device));
    CUDA_CHECK(cudaFree(b_device));
    CUDA_CHECK(cudaFree(c_device));
    free(a);
    free(b);
    free(c);
    return status;
}

运行结果：tie===0.3 ms

2.使用统一寻址方式：

#include <cuda_runtime.h>
#include <stdio.h>
#define NUM 1024*2
// Adds two integer vectors element by element, writing the sums into c.
// One thread per element: the thread's global 1D index picks the element,
// and only indices below NUM perform the load/add/store.
__global__ void add(int *a,int *b,int *c){
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inRange = gid < NUM;
    if (inRange) {
        const int lhs = a[gid];
        const int rhs = b[gid];
        c[gid] = lhs + rhs;
    }
}
// Evaluate a CUDA runtime call; on failure print the location and the
// runtime's error string to stderr and make main() return 1.
// Only usable inside a function returning int (it contains a `return`).
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Unified Memory (managed allocation) vector addition benchmark.
//
// Allocates the three vectors with cudaMallocManaged so the same pointers are
// used by the host initialisation loop and by the add kernel, then prints the
// elapsed time measured with CUDA events. The timed region is the same as
// before: it starts ahead of the managed allocations and ends after the
// kernel has been synchronised.
//
// NOTE(review): the timed region therefore includes the cudaMallocManaged
// calls and any on-demand migration of the managed data; presumably that is
// what dominates the measurement. cudaMemPrefetchAsync / cudaMemAdvise are the
// usual tools for performance-sensitive paths. Confirm with a profiler.
//
// Returns 0 on success, 1 if a CUDA call fails or if the computed result does
// not match a[i] + b[i].
int main(){
    const int n = NUM;  // copy the macro into a typed constant before doing arithmetic on it
    const size_t bytes = (size_t)n * sizeof(int);
    const int threadsPerBlock = 1024;
    // Ceiling division: just enough blocks to cover n elements.
    const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;

    int i;
    int status = 0;
    int *a, *b, *c;
    cudaEvent_t start, end;
    float elapsedTime;

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&end));
    CUDA_CHECK(cudaEventRecord(start, 0));

    CUDA_CHECK(cudaMallocManaged(&a, bytes));
    CUDA_CHECK(cudaMallocManaged(&b, bytes));
    CUDA_CHECK(cudaMallocManaged(&c, bytes));

    // Host-side initialisation through the managed pointers.
    for (i = 0; i < n; i++) {
        a[i] = i;
        b[i] = i;
    }

    add<<<blocks, threadsPerBlock>>>(a, b, c);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    // The host must not read c until the kernel has finished.
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaEventRecord(end, 0));
    CUDA_CHECK(cudaEventSynchronize(end));

    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, end));
    printf("tie===%3.1f ms\n", elapsedTime);

    // Validate the result on the host (outside the timed region).
    for (i = 0; i < n; i++) {
        if (c[i] != a[i] + b[i]) {
            fprintf(stderr, "mismatch at index %d: got %d, expected %d\n",
                    i, c[i], a[i] + b[i]);
            status = 1;
            break;
        }
    }

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(end));
    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    CUDA_CHECK(cudaFree(c));
    return status;
}

运行结果：tie===10.2 ms

从以上代码的比较可以看出，使用统一寻址方式可以降低代码编写的复杂度，但并不会缩短程序的运行时间，反而使运行时间明显变长（0.3 ms对10.2 ms）。因此在以后编写代码时，应根据自己的需求决定是否使用统一寻址方式。为什么运行时间会增加这么多？目前还不太清楚，一个可能的原因是：托管内存中的数据在被CPU或GPU首次访问时需要按需进行页面迁移，而且计时区间还包含了cudaMallocManaged本身的分配开销。希望有理解的朋友能赐教一下。

参考：http://www.ouccloud.com/356.html

猜你喜欢