レーダー信号処理パルス圧縮アルゴリズム GPU の実装とアクセラレーション (完全なコードを含む)

以下に、レーダー信号処理におけるパルス圧縮アルゴリズムを GPU 上で高速化する実装を示します。
レーダー信号処理への GPU 導入を検討する際の参考としてください。

脉冲压缩算法的 GPU 实现:模拟 LFM(线性调频)信号,在 GPU 端使用 CUDA 进行加速,
最终与 MATLAB 的计算结果进行对比,以验证正确性。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cufft.h>
#include"cuda_runtime_api.h"
#include"device_launch_parameters.h"


#define M_PI 3.14159265358979323846
/*
#define BATCH 1
#define SIZE 2048 
*/


#define BATCH 32
#define SIZE 1080 

#include "sys/time.h"
/*
 * Wall-clock timestamp in seconds, built from gettimeofday()'s
 * seconds and microseconds fields.
 *
 * Returns 0 when gettimeofday() reports an error.
 */
double what_time_is_it_now() {
    struct timeval now;
    const int rc = gettimeofday(&now, NULL);

    if (rc != 0) {
        return 0;
    }

    const double whole_seconds = (double)now.tv_sec;
    const double fractional = (double)now.tv_usec * .000001;
    return whole_seconds + fractional;
}


/*
 * Element-wise complex multiply: sot1[i] = r_sf[i] * r_hf[i] for i in [0, num).
 *
 * Launch layout: 1-D grid of 1-D blocks. The loop below strides by the total
 * number of launched threads, so every element is processed no matter how
 * many blocks the caller launches (previously, a grid smaller than `num`
 * silently left the tail of `sot1` unwritten).
 *
 * r_sf, r_hf : input arrays, at least `num` elements each.
 * sot1       : output array, at least `num` elements.
 * num        : element count; values <= 0 result in no work.
 *
 * No shared memory is used.
 */
__global__ void complexMulKernel(const cuComplex* r_sf, const cuComplex* r_hf, cuComplex* sot1, int num) {
    // Index in size_t so blockIdx.x * blockDim.x cannot overflow a 32-bit int.
    const size_t count = (num > 0) ? (size_t)num : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
        sot1[i] = cuCmulf(r_sf[i], r_hf[i]);
    }
}

/* Abort with a diagnostic if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_err_ = (call);                                     \
        if (cuda_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cuda_err_));                         \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/* Abort with a diagnostic if a cuFFT call fails. */
#define CUFFT_CHECK(call)                                                   \
    do {                                                                    \
        cufftResult fft_err_ = (call);                                      \
        if (fft_err_ != CUFFT_SUCCESS) {                                    \
            fprintf(stderr, "cuFFT error %s:%d: code %d\n", __FILE__,       \
                    __LINE__, (int)fft_err_);                               \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Compare `count` samples of `result`, each divided by `scale`, against the
 * complex values stored in the text file `path` (one "<real> <imag>i" pair
 * per sample, as parsed by the "%f %fi" scanf format).
 *
 * Returns the number of samples whose real or imaginary part differs from
 * the reference by more than `tol` (absolute), or -1 if the file cannot be
 * opened or does not contain `count` parseable samples.
 */
static int countReferenceMismatches(const cufftComplex* result, int count,
                                    float scale, const char* path, float tol) {
    FILE* fp = fopen(path, "r");
    if (fp == NULL) {
        printf("Failed to open file %s.\n", path);
        return -1;
    }

    int mismatches = 0;
    for (int i = 0; i < count; i++) {
        float real, imag;
        if (fscanf(fp, "%f %fi", &real, &imag) != 2) {
            printf("Failed to read sample %d from %s.\n", i, path);
            fclose(fp);
            return -1;
        }
        // Use the absolute difference: a signed difference would let large
        // negative errors pass the tolerance check.
        if (fabs(result[i].x / scale - real) > tol ||
            fabs(result[i].y / scale - imag) > tol) {
            mismatches++;
        }
    }

    fclose(fp);
    return mismatches;
}

/*
 * Pulse-compression benchmark for a simulated LFM (chirp) signal.
 *
 * Steps:
 *   1. Generate the chirp `st` and its conjugate `ht` (the matched filter)
 *      on the host, replicated across BATCH rows of SIZE samples.
 *   2. Forward-FFT both on the GPU, multiply the spectra element-wise,
 *      inverse-FFT the product. Each stage is timed over kIters runs.
 *   3. Copy the first row back, swap its two halves (FFT shift), and compare
 *      it against the reference values stored in a text file.
 *
 * Returns 0 when the result matches the reference, 1 otherwise.
 */
int main() {
    const int kIters = 1000;                       // repetitions per timed stage
    const int kThreadsPerBlock = 256;
    const float kTolerance = 0.0005f;              // absolute per-component tolerance
    const char* kReferencePath = "/home/xtic/MTS/Radar/pc.txt";

    const size_t total = (size_t)SIZE * BATCH;     // samples across all batches
    const size_t bytes = total * sizeof(cufftComplex);

    double start1, end1, start2, end2, start3, end3;

    // LFM parameters.
    double t1 = 10e-6;        // pulse width
    double b = 25e6;          // bandwidth
    double k = b / t1;        // chirp rate
    double fs = 200e6;        // sampling rate
    double ts = 1 / fs;       // sampling interval
    double n = t1 / ts;       // samples per pulse (as a real number)
    // Round rather than truncate: t1 / ts may land a hair below the integer.
    const int pulseSamples = (int)(n + 0.5);

    // Pinned host buffers for the time-domain signal and matched filter.
    cufftComplex* st = NULL;
    cufftComplex* ht = NULL;
    CUDA_CHECK(cudaMallocHost((void**)&st, bytes));
    CUDA_CHECK(cudaMallocHost((void**)&ht, bytes));

    // Device buffers.
    cufftComplex* d_st = NULL;        // time-domain signal
    cufftComplex* d_ht = NULL;        // time-domain matched filter
    cufftComplex* d_sf = NULL;        // spectrum of st
    cufftComplex* d_hf = NULL;        // spectrum of ht
    cufftComplex* d_sot1 = NULL;      // product of the two spectra
    cufftComplex* d_sot1_out = NULL;  // inverse FFT of the product
    CUDA_CHECK(cudaMalloc((void**)&d_st, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_ht, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_sf, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_hf, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_sot1, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_sot1_out, bytes));

    // Generate the chirp st and the matched filter ht (= conj(st)) for the
    // first batch row; samples beyond the pulse length are zero padding.
    double t_min = -t1 / 2;
    double t_max = t1 / 2;
    double t_step = (t_max - t_min) / (n - 1);
    double t = t_min;
    for (int i = 0; i < SIZE; i++) {
        if (i >= pulseSamples) {
            st[i].x = 0;
            st[i].y = 0;
            ht[i].x = 0;
            ht[i].y = 0;
        } else {
            const double phase = M_PI * k * t * t;
            st[i].x = cos(phase);
            st[i].y = sin(phase);
            ht[i].x = cos(-phase);
            ht[i].y = sin(-phase);
            t += t_step;
        }
    }

    // Replicate row 0 into the remaining rows so the whole SIZE * BATCH
    // buffer that gets uploaded is initialised.
    for (int row = 1; row < BATCH; row++) {
        for (int i = 0; i < SIZE; i++) {
            st[(size_t)row * SIZE + i] = st[i];
            ht[(size_t)row * SIZE + i] = ht[i];
        }
    }

    CUDA_CHECK(cudaMemcpy(d_st, st, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_ht, ht, bytes, cudaMemcpyHostToDevice));

    // One batched 1-D C2C plan serves all three transforms (same geometry).
    cufftHandle plan;
    CUFFT_CHECK(cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCH));

    // --- Forward FFT of st (timed) ---
    // GPU work is asynchronous: synchronise before reading the clock so the
    // measurement covers execution, not just launch overhead.
    CUDA_CHECK(cudaDeviceSynchronize());
    start1 = what_time_is_it_now();
    for (int i = 0; i < kIters; i++) {
        CUFFT_CHECK(cufftExecC2C(plan, d_st, d_sf, CUFFT_FORWARD));
    }
    CUDA_CHECK(cudaDeviceSynchronize());
    end1 = what_time_is_it_now();
    printf(" fft time : %f   ms\n ", 1000 * (end1 - start1) / kIters);

    // Forward FFT of the matched filter (not timed).
    CUFFT_CHECK(cufftExecC2C(plan, d_ht, d_hf, CUFFT_FORWARD));

    // --- Frequency-domain multiply (timed) ---
    // Size the grid from the real element count so every sample is covered.
    dim3 threadsPerBlock(kThreadsPerBlock);
    dim3 numBlocks((unsigned int)((total + kThreadsPerBlock - 1) / kThreadsPerBlock));
    CUDA_CHECK(cudaDeviceSynchronize());
    start3 = what_time_is_it_now();
    for (int i = 0; i < kIters; i++) {
        complexMulKernel<<<numBlocks, threadsPerBlock>>>(d_sf, d_hf, d_sot1, (int)total);
    }
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    end3 = what_time_is_it_now();
    printf(".* time : %f   ms\n ", 1000 * (end3 - start3) / kIters);

    // --- Inverse FFT of the product (timed) ---
    start2 = what_time_is_it_now();
    for (int i = 0; i < kIters; i++) {
        CUFFT_CHECK(cufftExecC2C(plan, d_sot1, d_sot1_out, CUFFT_INVERSE));
    }
    CUDA_CHECK(cudaDeviceSynchronize());
    end2 = what_time_is_it_now();
    printf("ifft time : %f   ms\n ", 1000 * (end2 - start2) / kIters);

    // Fetch the first batch row of the compressed pulse.
    cufftComplex* r_sot = (cufftComplex*)malloc(SIZE * sizeof(cufftComplex));
    if (r_sot == NULL) {
        printf("Failed to allocate memory.\n");
        exit(EXIT_FAILURE);
    }
    CUDA_CHECK(cudaMemcpy(r_sot, d_sot1_out, SIZE * sizeof(cufftComplex), cudaMemcpyDeviceToHost));

    // Swap the two halves of the row (FFT shift). `half` is derived from SIZE
    // so indices stay inside the SIZE-element buffer. For an odd SIZE the
    // last element would be left in place; SIZE is even here.
    const int half = SIZE / 2;
    for (int i = 0; i < half; i++) {
        const cufftComplex tmp = r_sot[i];
        r_sot[i] = r_sot[i + half];
        r_sot[i + half] = tmp;
    }

    printf("GPU运行时间:  %f ms\n",
           1000 * (end1 - start1) / kIters + 1000 * (end2 - start2) / kIters +
               1000 * (end3 - start3) / kIters);

    // --- Verification against the reference file ---
    // cuFFT's inverse transform is unnormalised, so divide by the transform
    // length (SIZE) before comparing.
    int status = 0;
    const int mismatches =
        countReferenceMismatches(r_sot, SIZE, (float)SIZE, kReferencePath, kTolerance);
    if (mismatches < 0) {
        status = 1;
    } else if (mismatches == 0) {
        printf("\nsuccess! 误差小于0.05%%\n");
    } else {
        printf("\nfailed: %d of %d samples exceed the tolerance\n", mismatches, SIZE);
        status = 1;
    }

    // Clean up: each allocation is released with its matching deallocator
    // (pinned host memory must go through cudaFreeHost, not free).
    free(r_sot);
    CUFFT_CHECK(cufftDestroy(plan));
    CUDA_CHECK(cudaFree(d_st));
    CUDA_CHECK(cudaFree(d_ht));
    CUDA_CHECK(cudaFree(d_sf));
    CUDA_CHECK(cudaFree(d_hf));
    CUDA_CHECK(cudaFree(d_sot1));
    CUDA_CHECK(cudaFree(d_sot1_out));
    CUDA_CHECK(cudaFreeHost(st));
    CUDA_CHECK(cudaFreeHost(ht));

    return status;
}


おすすめ

転載: blog.csdn.net/weixin_45206081/article/details/131186377