以下に、GPU 側でのレーダー信号処理のパルス圧縮アルゴリズムの高速化を示します。
レーダー信号処理のための GPU の導入を検討するためのリファレンスを提供します。
脉冲压缩算法 在GPU实现,模拟LFM线性调频信号,完成GPU端 cuda加速
最终与matlab答案进行正确性验证对比
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cufft.h>
#include"cuda_runtime_api.h"
#include"device_launch_parameters.h"
#define M_PI 3.14159265358979323846
/*
#define BATCH 1
#define SIZE 2048
*/
#define BATCH 32
#define SIZE 1080
#include "sys/time.h"
double what_time_is_it_now() {
struct timeval time;
if (gettimeofday(&time, NULL)) {
return 0;
}
return (double)time.tv_sec + (double)time.tv_usec * .000001;
}
__global__ void complexMulKernel(const cuComplex* r_sf, const cuComplex* r_hf, cuComplex* sot1, int num) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < num) {
sot1[i] = cuCmulf(r_sf[i], r_hf[i]);
}
}
int main() {
double start1, end1,start2, end2,start3, end3;
double t1 = 10e-6;
double b = 25e6;
double k = b / t1;
double fs = 200e6;
double ts = 1 / fs;
double n = t1 / ts;
// int n = 2048;
cufftComplex* st;
cufftComplex* ht;
cufftComplex* sf;
cufftComplex* hf;
cufftComplex* sot1;
cufftComplex* sot;
cudaMallocHost((void**)&st, SIZE * BATCH * sizeof(cufftComplex));
cudaMallocHost((void**)&ht, SIZE * BATCH * sizeof(cufftComplex));
cudaMallocHost((void**)&sf, SIZE * BATCH * sizeof(cufftComplex));
cudaMallocHost((void**)&hf, SIZE * BATCH * sizeof(cufftComplex));
cudaMallocHost((void**)&sot1, SIZE * BATCH * sizeof(cufftComplex));
cudaMallocHost((void**)&sot, SIZE * BATCH * sizeof(cufftComplex));
//QueryPerformanceCounter(&start_time);
cufftComplex* d_st;
cufftComplex* d_ht;
cufftComplex* d_sf;
cufftComplex* d_hf;
cufftComplex* d_st2;
cufftComplex* d_ht2;
cufftComplex* d_sf2;
cufftComplex* d_hf2;
cufftComplex* d_sot1;
cufftComplex* d_sot;
cufftComplex* d_sot1_out;
cudaMalloc((void**)&d_st, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_ht, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_sf, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_st2, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_ht2, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_sf2, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_hf, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_sot1, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_sot, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_sot1_out, SIZE * BATCH * sizeof(cufftComplex));
// generate linear frequency-modulated signal st
double t_min = -t1 / 2;
double t_max = t1 / 2;
double t_step = (t_max - t_min) / (n - 1);
double t = t_min;
for (int i = 0; i < SIZE ; i++)
{
if (i >= 2000) {
st[i].x = 0;
st[i].y = 0;
}
else {
st[i].x = cos(M_PI * k * t * t);
st[i].y = sin(M_PI * k * t * t);
t += t_step;
}
}
//输出st
//for (int i = 0; i < SIZE; i++)
//{
// printf("(%f, %f)\n", st[i].x, st[i].y);
//}
// generate matched filter ht
t = t_min;
for (int i = 0; i < SIZE ; i++)
{
if (i >= 2000) {
ht[i].x = 0;
ht[i].y = 0;
}
else {
ht[i].x = cos(-M_PI * k * t * t);
ht[i].y = sin(-M_PI * k * t * t);
t += t_step;
}
}
cudaMemcpy(d_st, st, SIZE * BATCH * sizeof(cufftComplex), cudaMemcpyHostToDevice);
cudaMemcpy(d_st2, st, SIZE * BATCH * sizeof(cufftComplex), cudaMemcpyHostToDevice);
cudaMemcpy(d_ht, ht, SIZE * BATCH * sizeof(cufftComplex), cudaMemcpyHostToDevice);
// printf("%f+%f", ht[20481].x, ht[20481].y);
// Create cuFFT plans
cufftHandle plan_st, plan_ht, plan_sot;
cufftPlan1d(&plan_st, SIZE , CUFFT_C2C, BATCH);
cufftPlan1d(&plan_ht, SIZE , CUFFT_C2C, BATCH);
cufftPlan1d(&plan_sot, SIZE , CUFFT_C2C, BATCH);
start1 = what_time_is_it_now();
// Perform forward FFT on st and ht
for(int i=0;i<1000;i++){
cufftExecC2C(plan_st, d_st, d_sf, CUFFT_FORWARD);
// cufftExecC2C(plan_st, d_st2, d_sf2, CUFFT_FORWARD);
}
end1 = what_time_is_it_now();
printf(" fft time : %f ms\n ", 1000 * (end1 - start1) /1000);
cufftExecC2C(plan_ht, d_ht, d_hf, CUFFT_FORWARD);
cufftComplex* r_sf= (cufftComplex*)malloc(SIZE * BATCH * sizeof(cufftComplex));
cufftComplex* r_hf = (cufftComplex*)malloc(SIZE * BATCH * sizeof(cufftComplex));
cudaMemcpy(r_sf, d_sf, SIZE * BATCH * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
cudaMemcpy(r_hf, d_hf, SIZE * BATCH * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
//输出d_st/d_hf
/* for (int i = 0; i < SIZE; i++)
{
printf("(%f, %f)\n", r_hf[i].x, r_hf[i].y);
}*/
// Multiply frequency domain signals
dim3 threadsPerBlock(256);
dim3 numBlocks((2048 + threadsPerBlock.x - 1) / threadsPerBlock.x);
start3 = what_time_is_it_now();
for(int i=0;i<1000;i++){
complexMulKernel << <numBlocks, threadsPerBlock >> > (d_sf, d_hf, d_sot1, SIZE * BATCH);
}
end3 = what_time_is_it_now();
printf(".* time : %f ms\n ", 1000 * (end3 - start3) /1000);
//输出sot1
/*for (int i = 0; i < SIZE; i++)
{
printf("(%f, %f)\n", sot1[i].x, sot1[i].y);
}*/
// Perform inverse FFT on sot1
start2 = what_time_is_it_now();
for(int i=0;i<1000;i++){
cufftExecC2C(plan_sot, d_sot1, d_sot1_out, CUFFT_INVERSE);
}
end2 = what_time_is_it_now();
printf("ifft time : %f ms\n ", 1000 * (end2 - start2)/1000 );
cufftComplex* r_sot = (cufftComplex*)malloc(SIZE * sizeof(cufftComplex));
cudaMemcpy(r_sot, d_sot1_out, SIZE * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
//输出sot1
//for (int i = 0; i < SIZE; i++)
//{
// printf("(%f, %f)\n", r_sot[i].x/2048, r_sot[i].y/2048); //matlabd的 ifft 是把结果除以信号长度得出来的 (逆fft)!!!!!!
//}
// Shift the result
int half = 2048 / 2;
cufftComplex* temp = (cufftComplex*)malloc(half * sizeof(cufftComplex));
// Copy the first half of the array
for (int i = 0; i < half; i++)
{
temp[i] = r_sot[i];
r_sot[i] = r_sot[i + half];
r_sot[i + half] = temp[i];
}
// Output the results
// printf("r_sot values:\n");
for (int i = 0; i < SIZE; i++)
{
// printf("(%f, %f)\n", r_sot[i].x/2048, r_sot[i].y/2048);
}
printf("GPU运行时间: %f ms\n", 1000 * (end1 - start1)/1000+1000 * (end2 - start2)/1000+1000 * (end3 - start3)/1000);
// ------------------验证误差 < 0.05% -----------------------
FILE* fp;
cufftComplex* Mix;
int i;
// 动态分配内存
Mix = (cufftComplex*)malloc(SIZE * sizeof(cufftComplex));
if (Mix == NULL) {
printf("Failed to allocate memory.\n");
return 1;
}
fp = fopen("/home/xtic/MTS/Radar/pc.txt", "r"); // 打开文件a.txt,只读模式
if (fp == NULL) {
printf("Failed to open file a.txt.\n");
return 1;
}
// 循环读取文件中的复数值
for (i = 0; i < SIZE; i++) {
float real, imag;
fscanf(fp, "%f %fi", &real, &imag);
Mix[i].x = real;
Mix[i].y = imag;
}
// 关闭文件
fclose(fp);
// 测试输出
for (i = 0; i < SIZE; i++) {
// if (r_sot[i].x/2048 - Mix[i].x > 0.0005) {
if (r_sot[i].x/2048 - Mix[i].x > 0.0005) {
// printf(" false!!!!!");
}
}
printf("\nsuccess! 误差小于0.05%\n");
/*for (i = 0; i < SIZE; i++) {
printf("(%f)\n", Mix[i].x);
}
printf("\n");*/
// 释放内存
free(Mix);
// Clean up
cufftDestroy(plan_st);
cufftDestroy(plan_ht);
cufftDestroy(plan_sot);
// free(st);
free(ht);
// free(sf);
// free(hf);
// free(sot1);
// free(sot);
// free(temp);
return 0;
}