网上的资料杂乱无章,编程思路也五花八门(历经辛酸,有以下体会)
废话不多说,先贴上最简洁的代码
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<stdlib.h>
#define threadsperblock 128
#define Matrix_size 256
/*
 * Naive dense matrix multiply: c = a * b for square n x n matrices stored
 * row-major in flat arrays (element (row, col) lives at index row*n + col).
 *
 * Launch layout: any 1D grid of 1D blocks. Each thread walks the n*n output
 * elements with a grid-stride loop, so the result is complete and in-bounds
 * regardless of how many blocks/threads are launched (including <<<1, 1>>>).
 * No shared memory is used.
 *
 * a, b : read-only input matrices, n*n floats each (device pointers).
 * c    : output matrix, n*n floats (device pointer); every element is written.
 * n    : matrix dimension; the kernel does nothing when n <= 0.
 */
__global__ static void Mulitkernel(const float* a, const float* b, float* c,int n)
{
    if (n <= 0)
        return;

    // 64-bit index math so n*n and blockIdx.x*blockDim.x cannot overflow int.
    const size_t dim = (size_t)n;
    const size_t total = dim * dim;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    // Use blockDim.x (the real launch size) rather than a compile-time
    // constant, so the flat index is right for any block size.
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total;
         idx += stride)
    {
        // Map the flat 1D index back to 2D matrix coordinates.
        const size_t row = idx / dim;
        const size_t col = idx % dim;

        // Dot product of row `row` of a with column `col` of b.
        float Cvalue = 0.0f;
        for (size_t i = 0; i < dim; i++)
        {
            Cvalue += a[row * dim + i] * b[i * dim + col];
        }
        c[row * dim + col] = Cvalue;
    }
}
/*
 * Fill the n x n row-major matrix `a` with pseudo-random floats.
 *
 * Each element is rand()/RAND_MAX plus a second, much smaller term
 * rand()/RAND_MAX^2, so values lie in [0, 1 + 1/RAND_MAX].
 * Uses the C library generator; call srand() beforehand for reproducibility.
 *
 * a : host buffer with room for n*n floats.
 * n : matrix dimension; nothing is written when n <= 0.
 */
void matgen(float* a, int n)
{
    // Compute RAND_MAX^2 in floating point: as an int product it overflows
    // (undefined behaviour) on platforms where RAND_MAX is 2^31 - 1.
    const float rand_max = (float)RAND_MAX;
    const float rand_max_sq = rand_max * rand_max;

    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            // Draw the two samples in separate statements so the order of
            // the rand() calls is well defined (coarse term first).
            const float coarse = (float)rand() / rand_max;
            const float fine = (float)rand() / rand_max_sq;
            a[(size_t)i * n + j] = coarse + fine;
        }
    }
}
/* Abort with a readable message if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: builds two random Matrix_size x Matrix_size matrices,
 * multiplies them on the GPU with Mulitkernel and copies the product back
 * into the host buffer `c`.
 *
 * Returns 0 on success; exits with a non-zero status if a host allocation
 * or any CUDA call fails.
 */
int main()
{
    const int n = Matrix_size;
    const size_t bytes = sizeof(float) * n * n;

    // Host buffers: inputs a, b and the GPU result c.
    float* a = (float*)malloc(bytes);
    float* b = (float*)malloc(bytes);
    float* c = (float*)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Fixed seed so every run multiplies the same matrices.
    srand(0);
    matgen(a, n);
    matgen(b, n);

    float* d_a = NULL;
    float* d_b = NULL;
    float* d_c = NULL;
    checkCuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc d_b");
    checkCuda(cudaMalloc((void**)&d_c, bytes), "cudaMalloc d_c");

    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

    // One thread per output element: ceil-divide n*n by the block size so the
    // grid covers the whole matrix instead of only the first block's worth.
    const int blocks = (n * n + threadsperblock - 1) / threadsperblock;
    Mulitkernel<<<blocks, threadsperblock>>>(d_a, d_b, d_c, n);
    // A launch does not return an error itself; query it explicitly.
    checkCuda(cudaGetLastError(), "Mulitkernel launch");

    // Blocking copy: waits for the kernel and reports any execution error.
    checkCuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "copy c to host");

    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");
    free(a);
    free(b);
    free(c);
    return 0;
}
来解释一下它的特别之处:它使用一维的线程编号来标记矩阵元素的二维下标(行号和列号),并且用一维数组来存储矩阵的数值;