1. 求和 (parallel sum reduction)
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#define BLOCK_DIM 1024
#define REDUCE_SCALE 4
// Block-wide integer sum reduction.
// Each thread folds a grid-strided slice of arr[0..n) into a register, the
// block tree-reduces those partials in shared memory, and thread 0 writes one
// partial sum per block to sum[blockIdx.x]; the host adds the gridDim.x
// partials together.
// Launch contract: blockDim.x == BLOCK_DIM (power of two, >= 64); sum must
// have room for gridDim.x ints.
__global__ void sum_gpu(int* const arr, int* sum, int n) {
    __shared__ volatile int local_sum[BLOCK_DIM];
    // Grid-stride loop: correct for any grid size, coalesced across the warp.
    int temp_sum = 0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
        temp_sum += arr[i];
    }
    local_sum[threadIdx.x] = temp_sum;
    __syncthreads();
    // Tree reduction down to 64 live elements; the barrier sits outside the
    // divergent if so every thread in the block reaches it.
    for (int j = blockDim.x >> 1; j > 32; j >>= 1) {
        if (threadIdx.x < j) local_sum[threadIdx.x] += local_sum[threadIdx.x + j];
        __syncthreads();
    }
    // Final warp: volatile alone no longer guarantees lockstep on Volta+
    // (independent thread scheduling), so sync the warp between each
    // dependent read-modify-write step.
    if (threadIdx.x < 32) {
        local_sum[threadIdx.x] += local_sum[threadIdx.x + 32]; __syncwarp();
        local_sum[threadIdx.x] += local_sum[threadIdx.x + 16]; __syncwarp();
        local_sum[threadIdx.x] += local_sum[threadIdx.x + 8];  __syncwarp();
        local_sum[threadIdx.x] += local_sum[threadIdx.x + 4];  __syncwarp();
        local_sum[threadIdx.x] += local_sum[threadIdx.x + 2];  __syncwarp();
        local_sum[threadIdx.x] += local_sum[threadIdx.x + 1];
    }
    // Bug fix: previously every thread stored local_sum[0], so threads outside
    // warp 0 could read it before the warp reduction finished (data race).
    // Only thread 0 publishes the block's result.
    if (threadIdx.x == 0) sum[blockIdx.x] = local_sum[0];
}
// Driver: fills an array with random ints in [0, 3], reduces it on the GPU,
// finishes the reduction of the per-block partials on the host, and reports
// the kernel time and the total.
int main() {
    int n = 1 << 24;
    int* arr_host = (int*)malloc(sizeof(int) * n);
    for (int i = 0; i < n; i++) {
        arr_host[i] = std::rand() % 4;
    }
    int* arr_dev;
    cudaMalloc(&arr_dev, sizeof(int) * n);
    cudaMemcpy(arr_dev, arr_host, sizeof(int) * n, cudaMemcpyHostToDevice);
    // One partial per block; REDUCE_SCALE elements are folded per thread by
    // the kernel's grid-stride loop.
    int gridDim = n / BLOCK_DIM / REDUCE_SCALE;
    int* sum_dev;
    cudaMalloc(&sum_dev, sizeof(int) * gridDim);
    auto start = std::chrono::steady_clock::now();
    sum_gpu<<<gridDim, BLOCK_DIM>>>(arr_dev, sum_dev, n);
    // Bug fix: kernel launches are asynchronous — without this sync the timer
    // only measured launch overhead, not kernel execution.
    cudaDeviceSynchronize();
    auto end = std::chrono::steady_clock::now();
    // Surface launch-configuration or execution errors instead of silently
    // printing garbage partials.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    auto dt = end - start;
    std::cout << "time: " << (double)dt.count() / (1000000) << "ms" << std::endl;
    int* sum_host = (int*)malloc(sizeof(int) * gridDim);
    cudaMemcpy(sum_host, sum_dev, sizeof(int) * gridDim, cudaMemcpyDeviceToHost);
    // Host-side final pass over the gridDim partial sums.
    int final_sum = 0;
    for (int i = 0; i < gridDim; i++) {
        final_sum += sum_host[i];
    }
    printf("sum: %d\n", final_sum);
    // Bug fix: the original leaked every allocation; release them like the
    // rest of the file does.
    free(arr_host);
    free(sum_host);
    cudaFree(arr_dev);
    cudaFree(sum_dev);
    return 0;
}
2. 求最大值 (parallel max reduction)
#include <cstdio>
#include <iostream>
#include <chrono>
#define BLOCK_DIM 1024
#define REDUCE_SCALE 4
// Block-wide integer max reduction.
// NOTE(review): kept the name sum_gpu (and the sum[] parameter name) so the
// caller below still compiles, but this kernel computes a maximum, not a sum.
// Each thread scans a grid-strided slice, the block tree-reduces with max in
// shared memory, and thread 0 writes one per-block maximum to sum[blockIdx.x].
// Launch contract: blockDim.x == BLOCK_DIM (power of two, >= 64); n >= 1
// (the seed below reads arr[0]).
__global__ void sum_gpu(int* const arr, int* sum, int n) {
    __shared__ volatile int local_sum[BLOCK_DIM];
    // Seed with a real element so the running max is always valid; assumes
    // n >= 1, otherwise this read is out of bounds.
    int temp_sum = arr[0];
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
        if (temp_sum < arr[i]) temp_sum = arr[i];
    }
    local_sum[threadIdx.x] = temp_sum;
    __syncthreads();
    // Tree reduction down to 64 live elements; barrier kept outside the
    // divergent if so all threads reach it.
    for (int j = blockDim.x >> 1; j > 32; j >>= 1) {
        if (threadIdx.x < j) {
            if (local_sum[threadIdx.x] < local_sum[threadIdx.x + j]) local_sum[threadIdx.x] = local_sum[threadIdx.x + j];
        }
        __syncthreads();
    }
    // Final warp: volatile alone no longer guarantees lockstep on Volta+
    // (independent thread scheduling), so sync the warp between steps.
    if (threadIdx.x < 32) {
        if (local_sum[threadIdx.x] < local_sum[threadIdx.x + 32]) local_sum[threadIdx.x] = local_sum[threadIdx.x + 32];
        __syncwarp();
        if (local_sum[threadIdx.x] < local_sum[threadIdx.x + 16]) local_sum[threadIdx.x] = local_sum[threadIdx.x + 16];
        __syncwarp();
        if (local_sum[threadIdx.x] < local_sum[threadIdx.x + 8]) local_sum[threadIdx.x] = local_sum[threadIdx.x + 8];
        __syncwarp();
        if (local_sum[threadIdx.x] < local_sum[threadIdx.x + 4]) local_sum[threadIdx.x] = local_sum[threadIdx.x + 4];
        __syncwarp();
        if (local_sum[threadIdx.x] < local_sum[threadIdx.x + 2]) local_sum[threadIdx.x] = local_sum[threadIdx.x + 2];
        __syncwarp();
        if (local_sum[threadIdx.x] < local_sum[threadIdx.x + 1]) local_sum[threadIdx.x] = local_sum[threadIdx.x + 1];
    }
    // Bug fix: previously all threads stored local_sum[0]; threads outside
    // warp 0 could read it before warp 0 finished reducing (data race).
    if (threadIdx.x == 0) sum[blockIdx.x] = local_sum[0];
}
// Driver: fills an array with 0..n-1, finds the maximum on the GPU (per-block
// partial maxima), finishes the reduction on the host, and reports the kernel
// time and the result (expected: n - 1).
int main() {
    // Build the input array.
    int n = 1 << 24;
    int* arr_host = (int*)malloc(sizeof(int) * n);
    for (int i = 0; i < n; i++) {
        arr_host[i] = i;
    }
    int* arr_dev;
    int* sum_dev;
    // One partial per block; REDUCE_SCALE elements folded per thread by the
    // kernel's grid-stride loop.
    int gridDim = n / BLOCK_DIM / REDUCE_SCALE;
    cudaMalloc(&arr_dev, sizeof(int) * n);
    cudaMalloc(&sum_dev, sizeof(int) * gridDim);
    cudaMemcpy(arr_dev, arr_host, sizeof(int) * n, cudaMemcpyHostToDevice);
    auto start = std::chrono::steady_clock::now();
    sum_gpu<<<gridDim, BLOCK_DIM>>>(arr_dev, sum_dev, n);
    // Bug fix: kernel launches are asynchronous — without this sync the timer
    // only measured launch overhead, not kernel execution.
    cudaDeviceSynchronize();
    auto end = std::chrono::steady_clock::now();
    // Surface launch-configuration or execution errors.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    auto dt = end - start;
    std::cout << "time: " << (double)dt.count() / (1000000) << "ms" << std::endl;
    int* sum_host = (int*)malloc(sizeof(int) * gridDim);
    cudaMemcpy(sum_host, sum_dev, sizeof(int) * gridDim, cudaMemcpyDeviceToHost);
    // Host-side final pass: max over the gridDim per-block maxima.
    int final_sum = sum_host[0];
    for (int i = 1; i < gridDim; i++) {
        if (final_sum < sum_host[i]) final_sum = sum_host[i];
    }
    // Bug fix: the label said "sum" but the value printed is the maximum.
    printf("max: %d\n", final_sum);
    free(arr_host);
    cudaFree(arr_dev);
    cudaFree(sum_dev);
    free(sum_host);
    return 0;
}