// CUDA single-block, multi-thread implementation (vector addition)
#include <iostream>
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
// Element-wise vector addition on the device: d_c[i] = d_a[i] + d_b[i].
// Intended launch: <<<1, n>>> (one block), so threadIdx.x alone indexes the data.
// d_a, d_b, d_c: device arrays of at least n floats; n: element count.
__global__ void gpu_add(float*d_a,float*d_b,float*d_c,int n)
{
    int idx = threadIdx.x;
    // Bounds guard: keeps the kernel safe if blockDim.x exceeds n
    // (the original wrote out-of-bounds whenever launch size != n).
    if (idx < n)
        d_c[idx] = d_a[idx] + d_b[idx];
    // Removed dead local IDX: it was written twice and never read.
}
// Check a CUDA runtime call's status; print the error and exit on failure.
// (Kernel launches return no status directly — use cudaGetLastError() after them.)
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                            \
        }                                                                   \
    } while (0)

// Host driver: fill two random float vectors, add them on the GPU using a
// single block of n threads, then print inputs and the result.
int main()
{
    float *h_a,*h_b,*h_c,*d_a,*d_b,*d_c;
    int n = 1024;              // NOTE: 1024 is the max threads/block on most GPUs;
                               // a larger n would need a multi-block launch.
    size_t nBytes = n * sizeof(float);
    time_t t;

    h_a = (float*)malloc(nBytes);
    h_b = (float*)malloc(nBytes);
    h_c = (float*)malloc(nBytes);
    // malloc can return NULL; the original dereferenced these unchecked.
    if (!h_a || !h_b || !h_c) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    srand((unsigned int)time(&t));
    for(int i = 0;i < n;++i)
    {
        // Random values in [0.0, 25.5] with one decimal step.
        h_a[i] = (float)(rand()&0xff)/10.0f;
        h_b[i] = (float)(rand()&0xff)/10.0f;
        std::cout << "h_a[" << i << "]=" << h_a[i] << "\t";
        std::cout << "h_b[" << i << "]=" << h_b[i] << "\n";
    }

    CUDA_CHECK(cudaMalloc((void**)&d_a,nBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b,nBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_c,nBytes));
    CUDA_CHECK(cudaMemcpy(d_a,h_a,nBytes,cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b,h_b,nBytes,cudaMemcpyHostToDevice));

    gpu_add<<<1,n>>>(d_a,d_b,d_c,n);
    CUDA_CHECK(cudaGetLastError());   // catch launch-config errors (silent in original)
    // cudaMemcpy is a synchronizing call, so it also surfaces async kernel faults.
    CUDA_CHECK(cudaMemcpy(h_c,d_c,nBytes,cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    for(int i = 0;i < n;++i)
    {
        std::cout << "c[" << i<<"]=" <<h_c[i]<<"\n";
    }

    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}