Implementing YOLO object-detection image pre-processing and post-processing in C++

【Background】

Why write the image post-processing for YOLO object detection yourself? Because we use a self-developed FPGA board on which a YOLO object-detection accelerator circuit has been developed and deployed. As the PS-side developer, I need to write the input image to a specified DDR address, start the FPGA accelerator through its control registers, wait for the computation to finish and raise an interrupt, read back the raw detection results, restore them with post-processing, and draw the results on the original image.

【The good stuff】

Without further ado, let’s get straight to the code!

The code below should teach you at least the following operations:

1. How to read dataset images from a folder

2. How to dequantize and decode the FPGA's raw results and apply non-maximum suppression (NMS)

3. How to draw the recognition results on the image and save them

4. How to control the FPGA by reading and writing its registers

static char text[256];
static const int length = 5+5;   // per-anchor output: x, y, w, h, objectness + 5 class scores
static const int kind_nums = 5;  // number of classes
static float anchors_0[6] = {2.53125f, 2.5625f, 4.21875f, 5.28125f, 10.75f, 9.96875f};  // anchors for the 11 x 15 layer, in grid-cell units
static float anchors_1[6] = {1.4375f, 1.6875f, 2.3125f, 3.625f, 5.0625f, 5.125f};       // anchors for the 22 x 30 layer, in grid-cell units
static float conf_thresh_ = 0.5f;  // confidence threshold for keeping a candidate box
static float nms_thresh_ = 0.4f;   // IoU threshold for non-maximum suppression

static std::string label_name[] = {"person", "aeroplane", "bus", "car", "truck"};
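
The snippets below also use a couple of types that the post never shows (de_result, rawImgFile and its acc member). Here is a minimal sketch of what they might look like, reconstructed purely from how the fields are used; the real definitions in the original project may differ (assumes the usual OpenCV, <string> and <cstdint> includes):

struct de_result                // one decoded detection box
{
  float x, y, w, h;             // box center and size, in pixels of the drawn image
  float conf;                   // objectness score
  float prob;                   // best class probability
  float complex_prob;           // conf * prob, used for sorting, NMS and labels
  int index;                    // class index into label_name
};

struct AccBuffer                // hypothetical wrapper name; only acc_out is used below
{
  int8_t* acc_out;              // raw int8 output read back from the FPGA
};

struct rawImgFile               // one frame moving through the pipeline
{
  cv::Mat raw_img;              // original frame, used for drawing and publishing
  cv::Mat pad_img;              // resized, RGB, 1-pixel-padded input sent to the FPGA
  std::string file_name;        // image / robot identifier
  AccBuffer acc;
};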

bool PedestrianDetection::get_files_list()
{
  struct dirent *ptr;
  DIR *dir;
  std::string PATH = dataset_dir_;
  dir = opendir(PATH.c_str());
  if(dir == nullptr)
  {
    printf("[PedestrianDetection] : failed to open dataset dir %s \n", PATH.c_str());
    return false;
  }
  while((ptr = readdir(dir)) != nullptr)
  {
    if(ptr->d_name[0] == '.')   // skip ".", ".." and hidden entries
      continue;
    files_list.push_back(ptr->d_name);
  }
  closedir(dir);
  for(auto &iter : files_list)
  {
    printf("%s\n", iter.c_str());
  }
  if(files_list.size() == dataset_size)
  {
    return true;
  }
  else {
    printf("[PedestrianDetection] : unexpected dataset size %ld \n", files_list.size());
    return false;
  }
}

bool PedestrianDetection::init_net_info(std::string pkg_path)
{
  yolov4tn = 2;                        // number of YOLO output layers
  outputchannel = 30;                  // class number change: 3 anchors * (5 box values + 5 class scores)
  yolov4tnsize = {11, 15, 22, 30};     // class number change: output grids are 11 x 15 and 22 x 30
  yolov4tnsize_sum = yolov4tnsize[0] * yolov4tnsize[1] + yolov4tnsize[2] * yolov4tnsize[3];
  accData_size = yolov4tnsize_sum * outputchannel;        // float count after stripping channel padding
  accRaw_size = yolov4tnsize_sum * (outputchannel + 2);   // raw int8 count, 2 padded channels per cell

  return true;
}

void PedestrianDetection::init_fpga()
{
  // Input geometry expected by the accelerator (before the 1-pixel top/left padding).
  raw_img_w = 480;
  raw_img_h = 352;
  raw_img_c = 3;
  // Physical DDR addresses and element counts of the two output feature maps written by the FPGA.
  memset(&yolo_v4, 0, sizeof(struct Model_Output));
  yolo_v4.addr[0] = 0x4491F800;
  yolo_v4.length[0] = 11 * 15 * (30+2);
  yolo_v4.addr[1] = 0x44921000;
  yolo_v4.length[1] = 22 * 30 * (30+2);
  if(init_model_output(&yolo_v4, fd_mem) == -1)
  {
    printf("[yolov4] failed to init model output \n");
    exit(-1);
  }
}
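
init_model_output and get_model_output (used here and in test_fpga), as well as image_to_mem, come from the original project and are not reproduced in this post. Below is a rough sketch of how such /dev/mem helpers could look, assuming they simply mmap the physical addresses and memcpy data in and out; the map member and all implementation details are assumptions, not the project's actual code (assumes <sys/mman.h>, <unistd.h>, <cstring>, <cstdint> are included):

// Assumed layout: two output layers at fixed physical addresses.
struct Model_Output
{
  uint64_t addr[2];    // physical DDR addresses of the two YOLO output layers
  int      length[2];  // bytes per layer (cells * padded channels, int8 data)
  void*    map[2];     // virtual addresses after mmap (assumed field)
};

// Map both output regions once so they can be read back after each inference.
static int init_model_output(struct Model_Output* out, int fd_mem)
{
  long page = sysconf(_SC_PAGESIZE);
  for(int i = 0; i < 2; i++)
  {
    off_t base = out->addr[i] & ~(page - 1);   // mmap offsets must be page aligned
    void* m = mmap(nullptr, out->length[i] + (out->addr[i] - base),
                   PROT_READ, MAP_SHARED, fd_mem, base);
    if(m == MAP_FAILED)
      return -1;
    out->map[i] = (char*)m + (out->addr[i] - base);
  }
  return 0;
}

// Copy both layers back to back into the caller's buffer.
static void get_model_output(struct Model_Output* out, int8_t* dst)
{
  memcpy(dst, out->map[0], out->length[0]);
  memcpy(dst + out->length[0], out->map[1], out->length[1]);
}

// Write the padded input image to the physical address the accelerator reads from.
static void image_to_mem(const uint8_t* img, uint64_t phys_addr, int fd_mem, int size)
{
  void* dst = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_mem, phys_addr);
  memcpy(dst, img, size);
  munmap(dst, size);
}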

void PedestrianDetection::read_dataset()
{
  while(nh_private.ok())
  {
    while(ready_buf_.size() > 50)
    {
      std::this_thread::sleep_for(std::chrono::milliseconds(5));
    }

#ifdef WITH_NETCAM
    cv::Mat origin;
    struct timeval timestamp;
    if(netcam.getCameraImage(origin, timestamp, cv::Size(1920, 1080)))
    {
      std::vector<cv::Mat> pics;
      cut_img(origin, 2, 2, pics);
      for(int i = 0; i < pics.size(); i++)
      {
        if(i != (pics.size()-1))
        {
          std::vector<uint8_t> vec_data;
          cv::imencode(".jpg", pics[i], vec_data);
          std::string str_data;
          str_data.assign(vec_data.begin(), vec_data.end());
          zmqpp::message message;
          message << "robot" + std::to_string(i+1) +"pic" << str_data;
          socket_pubraw.send(message);
        }
        else {
          cv::Mat resize;
          cv::resize(pics[i], resize, cv::Size(raw_img_w, raw_img_h), 0, 0, cv::INTER_CUBIC);
          cv::cvtColor(resize, resize, cv::COLOR_BGR2RGB);

          cv::Mat padding;
          cv::copyMakeBorder(resize, padding, 1, 0, 1, 0, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
          rawImgFile tmp;
          tmp.raw_img = pics[i];
          tmp.pad_img = padding;
          tmp.file_name = robot_id;
          ready_buf_mutex_.lock();
          ready_buf_.push(tmp);
          ready_buf_mutex_.unlock();
        }
      }
    }
    else {
      printf("camera no data\n");
      usleep(30*1000);
      continue;
    }
#else
    zmqpp::message message;
    socket_sub.receive(message);
    std::string topic;
    std::string data_str;
    message >> topic >> data_str;
    std::vector<uint8_t> data_vec;
    data_vec.assign(data_str.begin(), data_str.end());

    cv::Mat img_decode;
    img_decode = cv::imdecode(data_vec, CV_LOAD_IMAGE_COLOR);
    cv::Mat resize;
    cv::resize(img_decode, resize, cv::Size(raw_img_w, raw_img_h), 0, 0, cv::INTER_CUBIC);
    cv::cvtColor(resize, resize, cv::COLOR_BGR2RGB);

    cv::Mat padding;
    cv::copyMakeBorder(resize, padding, 1, 0, 1, 0, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
    rawImgFile tmp;
    tmp.raw_img = img_decode;
    tmp.pad_img = padding;
    tmp.file_name = robot_id;
    ready_buf_mutex_.lock();
    ready_buf_.push(tmp);
    ready_buf_mutex_.unlock();
#endif

  }
}

void PedestrianDetection::test_fpga()
{
  while(nh_private.ok())
  {
    while(nh_private.ok())
    {
      if(ready_buf_.empty())
      {
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
      }
      else
      {
        break;
      }
    }

    rawImgFile tmp_input;
    ready_buf_mutex_.lock();
    tmp_input = ready_buf_.front();
    ready_buf_.pop();
    ready_buf_mutex_.unlock();

    tmp_input.acc.acc_out = new (std::nothrow) int8_t[accRaw_size];  // nothrow so the null check below is meaningful
    if(!tmp_input.acc.acc_out)
    {
      ROS_ERROR("Allocation of memory Failed \n");
      return;
    }

    // The padded input image is 481 x 353 x 3 bytes; copy it to the DDR address the accelerator reads from.
    int image_size = 481 * 353 * 3;
    image_to_mem(tmp_input.pad_img.data, 0x46000000, fd_mem, image_size);

    // Map the accelerator's control/status register block and start the computation.
    void* mem_ctrl = mmap(nullptr, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd_mem, 0x0400000000L);

    // Poll the word at offset 124 until it reads 1 (accelerator ready to accept a job).
    int state = 0;
    while(state != 1)
    {
      memcpy(&state, (char*)mem_ctrl+124, 4);
    }
    // Pulse the start register at offset 0: write 1, read it back, then clear it to 0.
    state = 1;
    memcpy((char*)mem_ctrl, &state, 4);
    int res = -1;
    memcpy(&res, (char*)mem_ctrl, 4);

    state = 0;
    memcpy((char*)mem_ctrl, &state, 4);
    memcpy(&res, (char*)mem_ctrl, 4);

    // Poll the busy register at offset 8 until it reads 0, i.e. the computation has finished.
    int busy = 0;
    memcpy(&busy, (char*)mem_ctrl+8, 4);
    int busy_prev = busy;
    while(busy_prev != 0)
    {
      memcpy(&busy, (char*)mem_ctrl+8, 4);
      if(busy != busy_prev)
      {
        busy_prev = busy;
      }
    }
    munmap(mem_ctrl, 4096);

    get_model_output(&yolo_v4, tmp_input.acc.acc_out);

    result_buf_mutex_.lock();
    result_buf_.push(tmp_input);
    result_buf_mutex_.unlock();
  }
}


void PedestrianDetection::write_predict()
{
  while(nh_private.ok())
  {
    while(nh_private.ok())
    {
      if(result_buf_.empty())
      {
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
      }
      else
      {
        break;
      }
    }

    rawImgFile tmp_output;
    result_buf_mutex_.lock();
    tmp_output = result_buf_.front();
    result_buf_.pop();
    result_buf_mutex_.unlock();

    decode(tmp_output);

    delete [] tmp_output.acc.acc_out;
    tmp_output.acc.acc_out = nullptr;
  }
}

static bool outbin_write = true;
void PedestrianDetection::decode(rawImgFile &msg)
{
  cv::Mat frame = msg.raw_img;

  if(msg.acc.acc_out != nullptr)
  {
    int8_t* data;
    data = msg.acc.acc_out;

    if(outbin_write)
    {
      // Dump the first raw FPGA output once for offline inspection.
      FILE* fp = fopen((pkg_path + "/unit_test/result_img/out.bin").c_str(), "wb");
      if(fp)
      {
        fwrite(data, sizeof(int8_t), accRaw_size, fp);
        fclose(fp);
      }
      outbin_write = false;
    }

    float *out = (float*)calloc(accData_size, sizeof(float));
    unpadding(data, out, 32, 2);
    std::vector<de_result> res;
    process_decode(out, res, frame);
    draw_objects(frame, res);
    
#if 1

    // frame shares its pixel buffer with msg.raw_img, so the boxes drawn above already appear on raw_img.
    std::vector<uint8_t> vec_data;
    cv::imencode(".jpg", msg.raw_img, vec_data);
    std::string str_data;
    str_data.assign(vec_data.begin(), vec_data.end());
    zmqpp::message message;
    message << "pedestrian_detect" << str_data;
    socket_pub.send(message);
//    cv::imwrite((pkg_path + "/unit_test/result_img/" + img_name +"jpg"), msg.raw_img);

/*
    std::ofstream outfile;
    outfile.open((pkg_path + "/unit_test/result_label/" + img_name +"txt"));
    for(size_t i = 0; i < res.size(); i++)
    {
      std::string str_conf = std::to_string(res[i].complex_prob);
      std::string conf_4(str_conf.begin(), str_conf.begin()+5);
      outfile << label_name[res[i].index] << " " << conf_4 << " " << static_cast<int>(res[i].x - res[i].w/2) << " " << static_cast<int>(res[i].y - res[i].h/2) << " " << static_cast<int>(res[i].x + res[i].w/2) << " " << static_cast<int>(res[i].y + res[i].h/2);
      if(i != (res.size()-1))
        outfile << std::endl;
    }
    outfile.close();*/
/*
    FILE* fp = fopen((pkg_path + "/unit_test/result_bin/" + img_name +"bin").c_str(), "wb");
    fwrite(data, sizeof(int8_t), 11 * 15 * 96 + 22 * 30 * 96, fp);
    fclose(fp);
    */
#endif

    free(out);
  }
  else
  {
    printf("acc is empty \n");
  }

}


// Strip the `padding` trailing channels from each `ic`-channel cell vector and dequantize
// the int8 values back to float: value = (q - zero_point) * scale. The two output layers
// were quantized with different zero points and scales.
void PedestrianDetection::unpadding(int8_t* input, float* output, int ic ,int padding)
{
  int offset = yolov4tnsize[0] * yolov4tnsize[1];  // number of cells in the first output layer
  for(int i = 0; i < yolov4tnsize_sum; i++)
  {
    if(i < offset)
    {
      for(int j = 0; j < ic - padding; j++)
      {
        int tmp = *(input + j);
        *(output++) = (tmp - 78) * 0.1175554f;   // layer 0: zero point 78, scale 0.1175554
      }
    }
    else
    {
      for(int j = 0; j < ic - padding; j++)
      {
        int tmp = *(input + j);
        *(output++) = (tmp - 88) * 0.1006139f;   // layer 1: zero point 88, scale 0.1006139
      }
    }

    input += ic;
  }
}
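
To make the dequantization concrete: with the layer-0 constants above, a raw int8 value of, say, 100 is restored as (100 − 78) × 0.1175554 ≈ 2.59, while the same raw value in the layer-1 region becomes (100 − 88) × 0.1006139 ≈ 1.21.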

inline float PedestrianDetection::sigmoid(const float &x)
{
  return (1 / (1 + expf(-x)));
}

inline int PedestrianDetection::find_max_index(float *buf, int len)
{
  int k = 0;
  float max = *buf;
  for (int i = 1; i < len; ++i)
  {
    if (buf[i] > max)
    {
      k = i;
      max = buf[i];
    }
  }
  return k;
}

bool comp_prob(const de_result &a, const de_result &b)
{
  return a.complex_prob > b.complex_prob;
}

bool comp(const de_result &a, const de_result &b)
{
  return a.index > b.index;
}

// IoU of two boxes given in (center x, center y, width, height) form.
inline float PedestrianDetection::clac_iou(const de_result &A, const de_result &B)
{
  float aminx = A.x - A.w / 2.f;
  float amaxx = A.x + A.w / 2.f;
  float aminy = A.y - A.h / 2.f;
  float amaxy = A.y + A.h / 2.f;

  float bminx = B.x - B.w / 2.f;
  float bmaxx = B.x + B.w / 2.f;
  float bminy = B.y - B.h / 2.f;
  float bmaxy = B.y + B.h / 2.f;

  float w = std::min(amaxx, bmaxx) - std::max(aminx, bminx);
  float h = std::min(amaxy, bmaxy) - std::max(aminy, bminy);

  if (w <= 0 || h <= 0)
    return 0;

  return (w * h) / ((A.w * A.h) + (B.w * B.h) - (w * h));
}

// Class-wise greedy NMS: boxes arrive sorted by complex_prob; a box is dropped when a
// higher-scoring box of the same class overlaps it by more than iou_thres.
void PedestrianDetection::nms(const std::vector<de_result> &vpbox, std::vector<de_result> &voutbox, float iou_thres)
{
  std::vector<float> flag(vpbox.size(), -1.0);
  int n = vpbox.size();
  for (int i = 0; i < n; i++)
  {
    flag[i] = vpbox[i].index;
  }

  for (int i = 0; i < n; i++)
  {
    if (flag[i] == -1.0)
      continue;
    for (int j = i + 1; j < n; j++)
    {
      if(flag[j] == -1.0 || vpbox[i].index != vpbox[j].index)
        continue;
      float iou = clac_iou(vpbox[i], vpbox[j]);

      if (iou > iou_thres)
      {
        flag[j] = -1.0;
      }
    }
  }
  for (int i = 0; i < n; i++)
  {
    if (flag[i] != -1.0)
    {
      voutbox.push_back(vpbox[i]);
    }
  }
}

void PedestrianDetection::decode_one_with_thre(int layer, float *anchors, void *data, const int xno, const int yno, const int anchor_no, std::vector<de_result>& boxes, cv::Mat &img)
{
  const float *ori_ptr = (const float *)data;
  float decode_data[length], *data_ptr;
  data_ptr = decode_data;

  // Compute objectness and class scores first so low-confidence candidates can be
  // rejected before the box geometry is decoded.
  data_ptr[4] = sigmoid(ori_ptr[4]);
  for(int k = 5; k < length; k++)
  {
    data_ptr[k] = sigmoid(ori_ptr[k]);
  }
  int max_index = find_max_index(data_ptr+5, kind_nums);
  if(data_ptr[max_index+5] * data_ptr[4] < conf_thresh_)
    return;

  // Standard YOLO decoding: cell offset plus sigmoid of the raw x/y, and the anchor
  // size scaled by exp of the raw w/h, normalized by the grid size and mapped back
  // to pixel coordinates of the image being drawn on.
  data_ptr[0] = sigmoid(ori_ptr[0]);
  data_ptr[1] = sigmoid(ori_ptr[1]);
  data_ptr[2] = expf(ori_ptr[2]);
  data_ptr[3] = expf(ori_ptr[3]);

  de_result tmp;
  tmp.x = (data_ptr[0] + xno) / yolov4tnsize[2 * layer + 1] * img.cols;
  tmp.y = (data_ptr[1] + yno) / yolov4tnsize[2 * layer] * img.rows;
  tmp.w = (((data_ptr[2]) * anchors[2 * anchor_no])) / yolov4tnsize[2 * layer + 1] * img.cols;
  tmp.h = (((data_ptr[3]) * anchors[2 * anchor_no + 1])) / yolov4tnsize[2 * layer] * img.rows;
  tmp.conf = data_ptr[4];
  tmp.index = max_index;
  tmp.prob = data_ptr[max_index+5];
  tmp.complex_prob = data_ptr[max_index+5] * data_ptr[4];
//  printf("## layer[%d] prob : %f , x : %f , y : %f , w : %f , h : %f \n", layer, tmp.prob, tmp.x, tmp.y, tmp.w, tmp.h);
  boxes.push_back(tmp);
}

void PedestrianDetection::process_decode(float* input, std::vector<de_result> &res, cv::Mat &img)
{
  std::vector<de_result> boxes;
  float* data_ptr = input;

  // Layer 0: 11 x 15 grid, 3 anchors per cell, `length` values per anchor.
  for(int i = 0; i < yolov4tnsize[0]; i++){
    for(int j = 0; j < yolov4tnsize[1]; j++){
      for(int k = 0; k < 3; k++)
      {
        decode_one_with_thre(0, anchors_0, data_ptr, j, i, k, boxes, img);
        data_ptr += length;
      }
    }
  }

  // Layer 1: 22 x 30 grid, same per-anchor layout.
  for(int i = 0; i < yolov4tnsize[2]; i++){
    for(int j = 0; j < yolov4tnsize[3]; j++){
      for(int k = 0; k < 3; k++)
      {
        decode_one_with_thre(1, anchors_1, data_ptr, j, i, k, boxes, img);
        data_ptr += length;
      }
    }
  }
  sort(boxes.begin(), boxes.end(), comp_prob);
  nms(boxes, res, nms_thresh_);
  boxes.clear();
}


void PedestrianDetection::draw_objects(cv::Mat &image, const std::vector<de_result> &objects)
{
  double font_size = 0.6;
  int font_bold = 1;
  int baseLine = 0;
  for(size_t i = 0; i < objects.size(); i++)
  {
    const de_result &obj = objects[i];

    sprintf(text, "%s %.1f%%", label_name[obj.index].c_str(), obj.complex_prob * 100);
    int x = std::max(static_cast<int>(obj.x - obj.w / 2), 0);
    int y = std::max(static_cast<int>(obj.y - obj.h / 2), 0);
    int x1 = std::min(static_cast<int>(obj.x + obj.w / 2), image.cols);
    int y1 = std::min(static_cast<int>(obj.y + obj.h / 2), image.rows);
    int w = x1 - x - 1;
    int h = y1 - y - 1;

    cv::Rect obj_rect(x, y, w, h);
    cv::rectangle(image, obj_rect, cv::Scalar(0, 255, 0), font_bold);
    cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, font_size, font_bold, &baseLine);

    int tx = obj_rect.x;
    int ty = obj_rect.y - label_size.height - baseLine;
    if (ty < 0)
      ty = 0;

    cv::Point point;
    point = cv::Point(tx, ty + label_size.height);
    cv::rectangle(image, cv::Rect(cv::Point(obj_rect.x, ty), cv::Size(obj_rect.width, label_size.height + baseLine)), cv::Scalar(128, 128, 0), CV_FILLED);
    cv::putText(image, text, point, cv::FONT_HERSHEY_SIMPLEX, font_size, cv::Scalar(0, 255, 0), font_bold);
  }
}

void PedestrianDetection::cut_img(cv::Mat &src, int m, int n, std::vector<cv::Mat> &ceil)
{
  int height = src.rows;
  int width = src.cols;
  int ceil_h = height / m;
  int ceil_w = width / n;
  cv::Mat roi_img;
  for(int i = 0; i < m; i++)
  {
    for(int j = 0; j < n; j++)
    {
      cv::Rect rect(j*ceil_w, i*ceil_h, ceil_w, ceil_h);
      roi_img = cv::Mat(src, rect);
      ceil.push_back(roi_img);
    }
  }
}
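
read_dataset, test_fpga and write_predict form a producer/consumer pipeline around ready_buf_ and result_buf_. The post does not show how they are launched; here is a hypothetical start-up routine (the run method name and the thread wiring are my assumptions, not the project's actual code), assuming fd_mem is a class member opened from /dev/mem and that <thread>, <fcntl.h> and <unistd.h> are included:

void PedestrianDetection::run(std::string pkg_path)
{
  fd_mem = open("/dev/mem", O_RDWR | O_SYNC);   // physical-memory access for mmap and image_to_mem
  if(fd_mem < 0)
  {
    printf("[PedestrianDetection] : failed to open /dev/mem \n");
    return;
  }

  init_net_info(pkg_path);   // output-tensor geometry
  init_fpga();               // map the accelerator's output buffers

  // One thread per pipeline stage: grab and pre-process, run the FPGA, decode and publish.
  std::thread t_read(&PedestrianDetection::read_dataset, this);
  std::thread t_fpga(&PedestrianDetection::test_fpga, this);
  std::thread t_post(&PedestrianDetection::write_predict, this);

  t_read.join();
  t_fpga.join();
  t_post.join();
  close(fd_mem);
}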

Original article: blog.csdn.net/weixin_39538031/article/details/131001772