版权声明:本文为博主原创文章,转载请附上原文链接,谢谢。 https://blog.csdn.net/shawncheer/article/details/77931172
- 一、简述:
-
1、本文参考 nvidia official tutorial
- 2、本文重点在于测量带宽的问题。
- 3、使用 CUDA 事件(cudaEventCreate() / cudaEventRecord())计时,可以不必让 CPU 与 GPU 全程同步,从而减少 GPU 流水线的停顿。
二、代码:
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// SAXPY kernel: y[i] = a * x[i] + y[i] for every i in [0, n).
//
// Launch layout: 1-D grid of 1-D blocks. Each thread handles the single
// element at its flat global index (blockIdx.x * blockDim.x + threadIdx.x),
// so the launch must supply at least n threads in total. Threads whose index
// is >= n return without touching memory, which lets the grid be rounded up.
//
// n : number of elements to process
// a : scalar multiplier
// x : input vector (read only by this kernel)
// y : input/output vector, updated in place
__global__
void saxpy(int n,float a,float *x,float *y)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // Tail guard: the last block may extend past the end of the data.
    if (idx >= n) {
        return;
    }
    const float xi = x[idx];
    y[idx] = a * xi + y[idx];
}
// Abort with a readable message if a CUDA runtime call did not succeed.
// Every runtime API call returns cudaError_t; ignoring it hides failures
// until some later, unrelated call.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Runs SAXPY (y = 2*x + y) on 1M floats, times the kernel with CUDA events,
// verifies the result on the host and prints the effective memory bandwidth.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main(void){
    const int N = 1 << 20;  // 1M elements
    const size_t bytes = N * sizeof(float);

    // Separate host (malloc) and device (cudaMalloc) buffers; data is moved
    // explicitly with cudaMemcpy. This is NOT unified memory.
    float *x = (float*)malloc(bytes);
    float *y = (float*)malloc(bytes);
    if (x == NULL || y == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(x);
        free(y);
        return EXIT_FAILURE;
    }

    float *d_x = NULL;
    float *d_y = NULL;
    CUDA_CHECK(cudaMalloc(&d_x, bytes));
    CUDA_CHECK(cudaMalloc(&d_y, bytes));

    // Initialize x and y on the host.
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice));

    // The events bracket only the kernel, so the host<->device copies are
    // excluded from the measured time.
    const int threadsPerBlock = 256;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div
    CUDA_CHECK(cudaEventRecord(start));
    saxpy<<<blocks, threadsPerBlock>>>(N, 2.0f, d_x, d_y);
    // A launch does not return an error itself; bad configurations are
    // reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));

    // Blocking copy issued after the kernel on the same (default) stream.
    CUDA_CHECK(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost));

    // Make sure the stop event has completed before reading the elapsed time.
    CUDA_CHECK(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    // Check for errors: every element should be 2*1 + 2 = 4.0f.
    float maxError = 0.0f;
    for (int i = 0; i < N; i++) {
        maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));
    }
    printf("Max error: %f . \n", maxError);

    // The kernel reads x, reads y and writes y: 3 accesses of 4 bytes per
    // element. bytes / ms / 1e6 == GB/s.
    printf("Effective Bandwidth (GB/s): %f .\n", N*4*3/milliseconds/1e6);

    // Host buffers came from malloc, so they are released with free();
    // cudaFree is only valid for the device allocations.
    free(x);
    free(y);
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    return 0;
}