Algoritmo de compresión de pulsos de procesamiento de señales de radar Implementación y aceleración de GPU (incluido el código completo)

A continuación se muestra cómo acelerar en la GPU el algoritmo de compresión de pulsos utilizado en el procesamiento de señales de radar.
Sirve como referencia para explorar implementaciones en GPU del procesamiento de señales de radar.

脉冲压缩算法 在GPU实现，模拟LFM线性调频信号，完成GPU端 cuda加速
最终与matlab答案进行正确性验证对比

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cufft.h>
#include"cuda_runtime_api.h"
#include"device_launch_parameters.h"


// Value of pi used when computing the chirp phase on the host.
#define M_PI 3.14159265358979323846
// Alternative configuration (disabled): a single 2048-point transform.
/*
#define BATCH 1
#define SIZE 2048 
*/


// BATCH: number of transforms per cuFFT plan (passed as the batch count to
// cufftPlan1d); every host/device buffer holds SIZE * BATCH elements.
#define BATCH 32
// SIZE: length, in complex samples, of each individual transform.
#define SIZE 1080 

#include "sys/time.h"
/*
 * Wall-clock timestamp helper.
 *
 * Returns the current time reported by gettimeofday() as seconds with a
 * microsecond fractional part, or 0 when gettimeofday() reports failure.
 */
double what_time_is_it_now() {
    struct timeval now;
    const int failed = gettimeofday(&now, NULL);
    if (failed != 0) {
        return 0;
    }

    const double wholeSeconds = (double)now.tv_sec;
    const double fractionalSeconds = (double)now.tv_usec * .000001;
    return wholeSeconds + fractionalSeconds;
}


/*
 * Element-wise complex product: sot1[i] = r_sf[i] * r_hf[i] for 0 <= i < num.
 *
 * Launch layout: 1-D grid of 1-D blocks. The kernel walks the data with a
 * stride equal to the total number of launched threads, so every element is
 * processed even when the grid is smaller than `num`; a grid larger than
 * `num` is also safe because surplus threads never enter the loop.
 *
 * Parameters:
 *   r_sf, r_hf  input arrays, at least `num` elements each
 *   sot1        output array, at least `num` elements
 *   num         number of elements to multiply (num <= 0 does nothing)
 *
 * No shared memory is used.
 */
__global__ void complexMulKernel(const cuComplex* r_sf, const cuComplex* r_hf, cuComplex* sot1, int num) {
    // 64-bit index arithmetic keeps blockIdx.x * blockDim.x and the running
    // index from overflowing when num is close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < num; i += stride) {
        sot1[i] = cuCmulf(r_sf[i], r_hf[i]);
    }
}

/* Abort with a readable message when a CUDA runtime call fails. */
#define PC_CUDA_CHECK(call)                                                   \
    do {                                                                      \
        cudaError_t pcErr_ = (call);                                          \
        if (pcErr_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(pcErr_));                              \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/* Abort with a readable message when a cuFFT call fails. */
#define PC_CUFFT_CHECK(call)                                                  \
    do {                                                                      \
        cufftResult pcRes_ = (call);                                          \
        if (pcRes_ != CUFFT_SUCCESS) {                                        \
            fprintf(stderr, "cuFFT error %s:%d: code %d\n", __FILE__,         \
                    __LINE__, (int)pcRes_);                                   \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/*
 * Pulse-compression benchmark and check.
 *
 * Steps:
 *   1. Build an LFM chirp `st` and its conjugate `ht` (the matched filter) on
 *      the host and replicate them into every one of the BATCH slots.
 *   2. Forward-FFT both on the GPU, multiply the spectra element-wise and
 *      inverse-FFT the product. Each of the three stages is repeated kIters
 *      times and timed; the device is synchronised before every stop
 *      timestamp so the figures include execution, not just launch overhead.
 *   3. Copy the first batch back, scale by 1/SIZE (cuFFT's inverse transform
 *      is unnormalised, MATLAB's ifft is normalised), apply an fftshift and
 *      compare the real part against the reference values in pc.txt.
 *
 * Returns 0 when the result matches the reference within 0.0005, 1 on a
 * mismatch or when host memory / the reference file is unavailable. CUDA and
 * cuFFT failures terminate the process via the PC_*_CHECK macros.
 */
int main() {
    const int kIters = 1000;  // repetitions of each timed stage
    const size_t totalCount = (size_t)SIZE * BATCH;
    const size_t totalBytes = totalCount * sizeof(cufftComplex);
    const size_t pulseBytes = (size_t)SIZE * sizeof(cufftComplex);

    double start1, end1, start2, end2, start3, end3;

    // Chirp parameters (presumably seconds and Hz -- confirm against the
    // MATLAB script that produced the reference file).
    double t1 = 10e-6;   // pulse duration
    double b = 25e6;     // swept bandwidth
    double k = b / t1;   // chirp rate used in the phase pi * k * t^2
    double fs = 200e6;   // sampling rate
    double ts = 1 / fs;  // sampling interval
    double n = t1 / ts;  // number of samples spanned by the pulse

    // Pinned host buffers for the time-domain signal and matched filter.
    cufftComplex* st = NULL;
    cufftComplex* ht = NULL;
    PC_CUDA_CHECK(cudaMallocHost((void**)&st, totalBytes));
    PC_CUDA_CHECK(cudaMallocHost((void**)&ht, totalBytes));

    // Device buffers: inputs, their spectra, the spectral product and the
    // compressed output.
    cufftComplex* d_st = NULL;
    cufftComplex* d_ht = NULL;
    cufftComplex* d_sf = NULL;
    cufftComplex* d_hf = NULL;
    cufftComplex* d_sot1 = NULL;
    cufftComplex* d_sot1_out = NULL;
    PC_CUDA_CHECK(cudaMalloc((void**)&d_st, totalBytes));
    PC_CUDA_CHECK(cudaMalloc((void**)&d_ht, totalBytes));
    PC_CUDA_CHECK(cudaMalloc((void**)&d_sf, totalBytes));
    PC_CUDA_CHECK(cudaMalloc((void**)&d_hf, totalBytes));
    PC_CUDA_CHECK(cudaMalloc((void**)&d_sot1, totalBytes));
    PC_CUDA_CHECK(cudaMalloc((void**)&d_sot1_out, totalBytes));

    // Generate the chirp st and the matched filter ht = conj(st) for one
    // pulse. Samples past the end of the pulse (if SIZE exceeds it) are zero.
    const int chirpSamples = (int)(n + 0.5);
    double t_min = -t1 / 2;
    double t_max = t1 / 2;
    double t_step = (t_max - t_min) / (n - 1);
    double t = t_min;
    for (int i = 0; i < SIZE; i++) {
        if (i >= chirpSamples) {
            st[i].x = 0;
            st[i].y = 0;
            ht[i].x = 0;
            ht[i].y = 0;
        } else {
            const double phase = M_PI * k * t * t;
            st[i].x = cos(phase);
            st[i].y = sin(phase);
            ht[i].x = cos(-phase);
            ht[i].y = sin(-phase);
            t += t_step;
        }
    }

    // Every batch slot is uploaded and transformed, so every slot must hold
    // defined data: replicate the pulse into the remaining slots.
    for (int batch = 1; batch < BATCH; batch++) {
        for (int i = 0; i < SIZE; i++) {
            st[(size_t)batch * SIZE + i] = st[i];
            ht[(size_t)batch * SIZE + i] = ht[i];
        }
    }

    PC_CUDA_CHECK(cudaMemcpy(d_st, st, totalBytes, cudaMemcpyHostToDevice));
    PC_CUDA_CHECK(cudaMemcpy(d_ht, ht, totalBytes, cudaMemcpyHostToDevice));

    // One batched C2C plan serves all three transforms (same size and batch).
    cufftHandle plan;
    PC_CUFFT_CHECK(cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCH));

    // --- Stage 1: forward FFT of the signal (timed) ---
    start1 = what_time_is_it_now();
    for (int iter = 0; iter < kIters; iter++) {
        PC_CUFFT_CHECK(cufftExecC2C(plan, d_st, d_sf, CUFFT_FORWARD));
    }
    PC_CUDA_CHECK(cudaDeviceSynchronize());
    end1 = what_time_is_it_now();
    printf(" fft time : %f   ms\n ", 1000 * (end1 - start1) / kIters);

    // Forward FFT of the matched filter (not timed).
    PC_CUFFT_CHECK(cufftExecC2C(plan, d_ht, d_hf, CUFFT_FORWARD));
    PC_CUDA_CHECK(cudaDeviceSynchronize());

    // --- Stage 2: element-wise spectral product (timed) ---
    // The grid is sized for all SIZE * BATCH elements.
    dim3 threadsPerBlock(256);
    dim3 numBlocks((unsigned int)((totalCount + threadsPerBlock.x - 1) / threadsPerBlock.x));
    start3 = what_time_is_it_now();
    for (int iter = 0; iter < kIters; iter++) {
        complexMulKernel<<<numBlocks, threadsPerBlock>>>(d_sf, d_hf, d_sot1, (int)totalCount);
    }
    PC_CUDA_CHECK(cudaGetLastError());
    PC_CUDA_CHECK(cudaDeviceSynchronize());
    end3 = what_time_is_it_now();
    printf(".* time : %f   ms\n ", 1000 * (end3 - start3) / kIters);

    // --- Stage 3: inverse FFT of the product (timed) ---
    start2 = what_time_is_it_now();
    for (int iter = 0; iter < kIters; iter++) {
        PC_CUFFT_CHECK(cufftExecC2C(plan, d_sot1, d_sot1_out, CUFFT_INVERSE));
    }
    PC_CUDA_CHECK(cudaDeviceSynchronize());
    end2 = what_time_is_it_now();
    printf("ifft time : %f   ms\n ", 1000 * (end2 - start2) / kIters);

    // Host buffers for post-processing and verification of the first batch.
    int status = 0;
    cufftComplex* r_sot = (cufftComplex*)malloc(pulseBytes);
    cufftComplex* shifted = (cufftComplex*)malloc(pulseBytes);
    cufftComplex* Mix = (cufftComplex*)malloc(pulseBytes);
    if (r_sot == NULL || shifted == NULL || Mix == NULL) {
        printf("Failed to allocate memory.\n");
        status = 1;
    }

    if (status == 0) {
        PC_CUDA_CHECK(cudaMemcpy(r_sot, d_sot1_out, pulseBytes, cudaMemcpyDeviceToHost));

        // fftshift with normalisation: rotate by SIZE / 2 so the zero-lag
        // sample moves to the centre, and divide by SIZE because cuFFT's
        // inverse transform is unnormalised while MATLAB's ifft is not.
        const int half = SIZE / 2;
        for (int i = 0; i < SIZE; i++) {
            const int dst = (i + half) % SIZE;
            shifted[dst].x = r_sot[i].x / (float)SIZE;
            shifted[dst].y = r_sot[i].y / (float)SIZE;
        }
    }

    printf("GPU运行时间：  %f ms\n",
           1000 * (end1 - start1) / kIters + 1000 * (end2 - start2) / kIters +
               1000 * (end3 - start3) / kIters);

    // ------------------ verify: absolute error below 0.0005 ------------------
    if (status == 0) {
        const char* refPath = "/home/xtic/MTS/Radar/pc.txt";
        FILE* fp = fopen(refPath, "r");
        if (fp == NULL) {
            printf("Failed to open file %s.\n", refPath);
            status = 1;
        } else {
            // Read SIZE complex reference values written as "<re> <im>i".
            int loaded = 0;
            for (; loaded < SIZE; loaded++) {
                float real, imag;
                if (fscanf(fp, "%f %fi", &real, &imag) != 2) {
                    break;
                }
                Mix[loaded].x = real;
                Mix[loaded].y = imag;
            }
            fclose(fp);

            if (loaded < SIZE) {
                printf("Reference file %s holds only %d of %d samples.\n", refPath, loaded, SIZE);
                status = 1;
            } else {
                // Only the real part is compared against the reference.
                int mismatches = 0;
                for (int i = 0; i < SIZE; i++) {
                    if (fabs(shifted[i].x - Mix[i].x) > 0.0005) {
                        mismatches++;
                    }
                }
                if (mismatches == 0) {
                    printf("\nsuccess! 误差小于0.05%%\n");
                } else {
                    printf("\nverification failed: %d of %d samples differ by more than 0.0005\n",
                           mismatches, SIZE);
                    status = 1;
                }
            }
        }
    }

    // Clean up: heap buffers with free, pinned buffers with cudaFreeHost,
    // device buffers with cudaFree.
    free(Mix);
    free(shifted);
    free(r_sot);

    PC_CUFFT_CHECK(cufftDestroy(plan));
    PC_CUDA_CHECK(cudaFree(d_st));
    PC_CUDA_CHECK(cudaFree(d_ht));
    PC_CUDA_CHECK(cudaFree(d_sf));
    PC_CUDA_CHECK(cudaFree(d_hf));
    PC_CUDA_CHECK(cudaFree(d_sot1));
    PC_CUDA_CHECK(cudaFree(d_sot1_out));
    PC_CUDA_CHECK(cudaFreeHost(st));
    PC_CUDA_CHECK(cudaFreeHost(ht));

    return status;
}

Algoritmo de compresión de pulsos de procesamiento de señales de radar Implementación y aceleración de GPU (incluido el código completo)

Supongo que te gusta