SSD: Single Shot MultiBox Detector (Part 3)


These are personal notes made while studying SSD, kept only as a memo for myself.
SSD: Single Shot MultiBox Detector (Part 1): http://blog.csdn.net/u011956147/article/details/73028773
SSD: Single Shot MultiBox Detector (Part 2): http://blog.csdn.net/u011956147/article/details/73030116
SSD: Single Shot MultiBox Detector (Part 3): http://blog.csdn.net/u011956147/article/details/73032867
SSD: Single Shot MultiBox Detector (Part 4): http://blog.csdn.net/u011956147/article/details/73033170
SSD: Single Shot MultiBox Detector (Part 5): http://blog.csdn.net/u011956147/article/details/73033282


This post covers prior_box_layer.
This layer generates prior boxes on a given set of feature maps. SSD's approach is interesting: for an input feature map of size W×H, it generates W×H prior-box centers, spread uniformly over the whole image, as the figures below illustrate. At each center, several prior boxes with different aspect ratios can be generated, e.g. [1/3, 1/2, 1, 2, 3], so the total number of prior boxes on one feature map is W×H×length_of_aspect_ratio. For a relatively large feature map, such as VGG's conv4_3, this can reach several thousand boxes. Boxes on the border still need some handling to keep them within the image, but that is a detail.
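
To make this layout concrete, here is a minimal standalone sketch (my own, not part of the layer; the feature-map size, input size, and number of priors per center are illustrative assumptions for conv4_3 of SSD300) that enumerates the centers and counts the boxes:

#include <cstdio>

int main() {
  const int layer_w = 38, layer_h = 38;    // assumed conv4_3 feature map of SSD300
  const float img_size = 300.f;            // assumed input size
  const float step = img_size / layer_w;   // pixels per feature-map cell, ~7.89
  const float offset = 0.5f;               // the layer's default offset
  const int priors_per_center = 4;         // illustrative; set by the prototxt
  int total = 0;
  for (int h = 0; h < layer_h; ++h) {
    for (int w = 0; w < layer_w; ++w) {
      float cx = (w + offset) * step;      // center x in input-image pixels
      float cy = (h + offset) * step;      // center y in input-image pixels
      (void)cx; (void)cy;                  // the real layer emits boxes at each center
      total += priors_per_center;
    }
  }
  std::printf("priors on this map: %d\n", total);  // 38*38*4 = 5776
  return 0;
}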

Note that although the prior-box centers sit on the W×H grid, the prior-box sizes are not tied to the grid-cell size; they are specified by hand. In the original paper, the prior-box scale varies uniformly between 0.2 and 0.9 as the feature maps go from lower to higher layers.
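
Concretely, the paper sets the scale of the k-th prediction feature map as

$$s_k = s_{\min} + \frac{s_{\max} - s_{\min}}{m - 1}\,(k - 1), \qquad k \in [1, m]$$

with $s_{\min} = 0.2$, $s_{\max} = 0.9$, and $m$ the number of feature maps used for prediction; in the layer below, each layer's min_size/max_size encode this s_k in input-image pixels.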

One thing that puzzled me when I first read SSD was the shape-matching problem: SSD fits the bounding boxes with convolutional layers, but shouldn't a conv layer output a feature map? How can it output exactly 4 coordinates? The approach is somewhat brute-force: to output W×H×length_of_aspect_ratio×4 coordinates, simply use a conv layer with length_of_aspect_ratio×4 output channels. This yields length_of_aspect_ratio×4 feature maps of size W×H, which are then flattened into a vector of length W×H×length_of_aspect_ratio×4 and fitted with a loss such as SmoothL1. It works surprisingly well...
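
A worked example of the bookkeeping (assuming the conv4_3 branch of SSD300, with a 38×38 feature map and 4 priors per location): the localization conv layer gets 4×4 = 16 output channels, producing 16 maps of size 38×38, which flatten into a vector of 38×38×16 = 23104 values, i.e. 5776 boxes × 4 coordinates.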

The approach is in fact similar to Faster RCNN's; if interested, see my earlier post, Faster RCNN代码理解(Python) (a Faster RCNN code walkthrough in Python).
Illustrations:
[Figures omitted: diagrams of prior-box generation on the feature map]
The code, with annotations, follows:

#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

#include "caffe/layers/prior_box_layer.hpp"

namespace caffe {

template <typename Dtype>
void PriorBoxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,    // parse the layer parameters
      const vector<Blob<Dtype>*>& top) {
  const PriorBoxParameter& prior_box_param =
      this->layer_param_.prior_box_param();
  CHECK_GT(prior_box_param.min_size_size(), 0) << "must provide min_size."; 
  for (int i = 0; i < prior_box_param.min_size_size(); ++i) {   // min_size_size() = 1 here
    min_sizes_.push_back(prior_box_param.min_size(i));
    CHECK_GT(min_sizes_.back(), 0) << "min_size must be positive.";
  }
  aspect_ratios_.clear();
  aspect_ratios_.push_back(1.);   // push 1 first; the prototxt only sets 2,3 (or just 2)
  flip_ = prior_box_param.flip();  // defaults to true
  for (int i = 0; i < prior_box_param.aspect_ratio_size(); ++i) {    // aspect_ratio_size=2
    float ar = prior_box_param.aspect_ratio(i);
    bool already_exist = false;
    for (int j = 0; j < aspect_ratios_.size(); ++j) {  // check whether this ratio was already added, so each ratio and its reciprocal appear only once
      if (fabs(ar - aspect_ratios_[j]) < 1e-6) {    // initially aspect_ratios_ contains only 1
        already_exist = true;
        break;   // break out of the inner loop
      }
    }
    if (!already_exist) {
      aspect_ratios_.push_back(ar);
      if (flip_) {     // flip: also add the reciprocal aspect ratio
        aspect_ratios_.push_back(1./ar);  // adds 1/2 and 1/3
      }
    }    // at this point there are 5 ratios: 1, 2, 1/2, 3, 1/3
  }
  num_priors_ = aspect_ratios_.size() * min_sizes_.size();  // min_sizes_.size() = 1, so 5 * 1 = 5
  if (prior_box_param.max_size_size() > 0) {
    CHECK_EQ(prior_box_param.min_size_size(), prior_box_param.max_size_size());  // there must be exactly as many max_sizes as min_sizes
    for (int i = 0; i < prior_box_param.max_size_size(); ++i) {  // max_size_size=1
      max_sizes_.push_back(prior_box_param.max_size(i));
      CHECK_GT(max_sizes_[i], min_sizes_[i])
          << "max_size must be greater than min_size.";
      num_priors_ += 1;    // num_priors_ = 6; this matters, otherwise there would be only 5 priors instead of the 6 in the paper
    }
  }
  clip_ = prior_box_param.clip();           // set to true here; the default is false
  if (prior_box_param.variance_size() > 1) {   // variance_size = 4
    // Must and only provide 4 variance.
    CHECK_EQ(prior_box_param.variance_size(), 4);   // exactly 4 variances must be provided
    for (int i = 0; i < prior_box_param.variance_size(); ++i) {   // variance:0.1 0.1 0.2 0.2
      CHECK_GT(prior_box_param.variance(i), 0);
      variance_.push_back(prior_box_param.variance(i));
    }
  } else if (prior_box_param.variance_size() == 1) {   // or just one is set, e.g. 0.1
    CHECK_GT(prior_box_param.variance(0), 0);
    variance_.push_back(prior_box_param.variance(0));
  } else {
    // Set default to 0.1.
    variance_.push_back(0.1);
  }

  if (prior_box_param.has_img_h() || prior_box_param.has_img_w()) {   // image height/width settings
    CHECK(!prior_box_param.has_img_size())
        << "Either img_size or img_h/img_w should be specified; not both.";
    img_h_ = prior_box_param.img_h();
    CHECK_GT(img_h_, 0) << "img_h should be larger than 0.";
    img_w_ = prior_box_param.img_w();
    CHECK_GT(img_w_, 0) << "img_w should be larger than 0.";
  } else if (prior_box_param.has_img_size()) {
    const int img_size = prior_box_param.img_size();
    CHECK_GT(img_size, 0) << "img_size should be larger than 0.";
    img_h_ = img_size;
    img_w_ = img_size;
  } else {
    img_h_ = 0;
    img_w_ = 0;
  }

  if (prior_box_param.has_step_h() || prior_box_param.has_step_w()) {  // step, step_h, step_w parameter handling
    CHECK(!prior_box_param.has_step())
        << "Either step or step_h/step_w should be specified; not both.";
    step_h_ = prior_box_param.step_h();
    CHECK_GT(step_h_, 0.) << "step_h should be larger than 0.";
    step_w_ = prior_box_param.step_w();
    CHECK_GT(step_w_, 0.) << "step_w should be larger than 0.";
  } else if (prior_box_param.has_step()) {
    const float step = prior_box_param.step();
    CHECK_GT(step, 0) << "step should be larger than 0.";
    step_h_ = step;
    step_w_ = step;
  } else {
    step_h_ = 0;
    step_w_ = 0;
  }

  offset_ = prior_box_param.offset();   // offset, defaults to 0.5
}  // end of LayerSetUp

template <typename Dtype>
void PriorBoxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const int layer_width = bottom[0]->width();    // size of the input feature map
  const int layer_height = bottom[0]->height();
  vector<int> top_shape(3, 1);
  // Since all images in a batch have the same height and width, we only need
  // to generate one set of priors which can be shared across all images.
  top_shape[0] = 1;
  // 2 channels. First channel stores the mean of each prior coordinate.
  // Second channel stores the variance of each prior coordinate.
  top_shape[1] = 2;
  top_shape[2] = layer_width * layer_height * num_priors_ * 4;
  // output coordinates: this many values are needed, similar to Faster RCNN.
  // Note: if max_size is not set in the prototxt, num_priors_ is one smaller.
  CHECK_GT(top_shape[2], 0);
  top[0]->Reshape(top_shape);
  // In the mbox_priorbox layer, the Concat layer uses axis: 2, i.e. it concatenates along this (map) dimension.
}
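// Worked shape example (my own numbers, assuming the fc7 branch of SSD300):
// the feature map is 19x19 and num_priors_ = 6, so
// top_shape = [1, 2, 19*19*6*4] = [1, 2, 8664].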

template <typename Dtype>
void PriorBoxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const int layer_width = bottom[0]->width();    // feature map from the previous layer
  const int layer_height = bottom[0]->height();
  int img_width, img_height;
  if (img_h_ == 0 || img_w_ == 0) {
    img_width = bottom[1]->width();             // output of the data layer, i.e. the original image
    img_height = bottom[1]->height();
  } else {
    img_width = img_w_;    // the image size can be overridden via parameters
    img_height = img_h_;
  }
  float step_w, step_h;
  if (step_w_ == 0 || step_h_ == 0) {   // compute the scaling factor, analogous to feat_stride in Faster RCNN, but handled a bit better here: width and height each get their own value
    step_w = static_cast<float>(img_width) / layer_width;  // float throughout, unlike Faster RCNN's blunt int arithmetic
    step_h = static_cast<float>(img_height) / layer_height;
  } else {
    step_w = step_w_;
    step_h = step_h_;
  }
  Dtype* top_data = top[0]->mutable_cpu_data();
  int dim = layer_height * layer_width * num_priors_ * 4;  // normally w*h*6*4 (conv4_3 is an exception; see the architecture diagram in my notes)
  int idx = 0;
  for (int h = 0; h < layer_height; ++h) {   // map each feature-map location back to the image one by one
    for (int w = 0; w < layer_width; ++w) {
      // Same as Faster RCNN: map the feature-map point back to the original image; the 0.5 offset places the center in the middle of each cell, similar to the Faster RCNN Python code
      float center_x = (w + offset_) * step_w;   
      float center_y = (h + offset_) * step_h;
      float box_width, box_height;
      for (int s = 0; s < min_sizes_.size(); ++s) {  // min_sizes_.size()=1
        float min_size_ = min_sizes_[s];  // float: min_size comes from a float proto field
        // min_size goes from 60 (fc7_mbox_priorbox) up to 276 at the last layer, i.e. s_k ranging from 0.2 to 0.92
        // first prior: aspect_ratio = 1, size = min_size
        box_width = box_height = min_size_;  
        // xmin
        top_data[idx++] = (center_x - box_width / 2.) / img_width;    // 
        // ymin
        top_data[idx++] = (center_y - box_height / 2.) / img_height;
        // xmax
        top_data[idx++] = (center_x + box_width / 2.) / img_width;
        // ymax
        top_data[idx++] = (center_y + box_height / 2.) / img_height;
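
        // Worked example (my own numbers, assuming conv4_3 of SSD300:
        // 38x38 map, step = 300/38 ~ 7.89, min_size = 30): the first center
        // is at (3.95, 3.95), so xmin = ymin = (3.95 - 15) / 300 ~ -0.037
        // and xmax = ymax = (3.95 + 15) / 300 ~ 0.063 (clipped later if clip_ is set).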

        if (max_sizes_.size() > 0) {
          CHECK_EQ(min_sizes_.size(), max_sizes_.size());
          float max_size_ = max_sizes_[s];
          // second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)  // matches the paper: s'_k = sqrt(s_k * s_{k+1}), different for each layer
          box_width = box_height = sqrt(min_size_ * max_size_);
          // xmin
          top_data[idx++] = (center_x - box_width / 2.) / img_width;
          // ymin
          top_data[idx++] = (center_y - box_height / 2.) / img_height;
          // xmax
          top_data[idx++] = (center_x + box_width / 2.) / img_width;
          // ymax
          top_data[idx++] = (center_y + box_height / 2.) / img_height;
        }

        // rest of priors
        for (int r = 0; r < aspect_ratios_.size(); ++r) {  // compute the remaining aspect ratios
          float ar = aspect_ratios_[r];
          if (fabs(ar - 1.) < 1e-6) {
            continue;
          }
          box_width = min_size_ * sqrt(ar);
          box_height = min_size_ / sqrt(ar);
          // xmin
          top_data[idx++] = (center_x - box_width / 2.) / img_width;
          // ymin
          top_data[idx++] = (center_y - box_height / 2.) / img_height;
          // xmax
          top_data[idx++] = (center_x + box_width / 2.) / img_width;
          // ymax
          top_data[idx++] = (center_y + box_height / 2.) / img_height;
        }
      }  // end for min_size=1
    }  // end for w
  }  // end for h
  // At this point all prior boxes have been generated: 6 shapes per location, matching the paper; each layer computes its own s_k, i.e. each layer sets its own min_size
  // clip the prior's coordinates such that they are within [0, 1]
  if (clip_) {                        // clip to [0, 1]
    for (int d = 0; d < dim; ++d) {
      top_data[d] = std::min<Dtype>(std::max<Dtype>(top_data[d], 0.), 1.);
    }
  }
  // set the variance.
  // Explanation: https://github.com/weiliu89/caffe/issues/75
  // Dividing by the variance amplifies the error between the predicted and
  // ground-truth boxes, which enlarges the loss and the gradient and speeds up convergence.
  // Also, top_data += top[0]->offset(0, 1); moves the pointer forward, so the
  // variances do not overwrite the results written above.
  // offset(n, c, h, w) generally takes four indices; setting them moves the
  // pointer to the corresponding position (for a 4-D tensor).
  top_data += top[0]->offset(0, 1); // advance to the second channel
  if (variance_.size() == 1) {
    caffe_set<Dtype>(dim, Dtype(variance_[0]), top_data);  // fill top_data with the constant variance_[0]
  } else {
    int count = 0;
    for (int h = 0; h < layer_height; ++h) {
      for (int w = 0; w < layer_width; ++w) {
        for (int i = 0; i < num_priors_; ++i) {
          for (int j = 0; j < 4; ++j) {
            top_data[count] = variance_[j];
            ++count;
          }
        }
      }
    }
  }
}

INSTANTIATE_CLASS(PriorBoxLayer);
REGISTER_LAYER_CLASS(PriorBox);

}  // namespace caffe
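
As a follow-up on the variance comment above: the stored variances are consumed at decode time. Below is a minimal sketch of that use (my own simplification of DecodeBBox in bbox_util.cpp, assuming the common CENTER_SIZE setting where the variance is not encoded in the targets; the struct and function names here are mine):

#include <cmath>

struct Box { float xmin, ymin, xmax, ymax; };

// Hedged sketch: apply raw localization outputs loc = (tx, ty, tw, th)
// to a prior box, scaling them by the stored variances first.
Box DecodeWithVariance(const Box& prior, const float var[4], const float loc[4]) {
  const float pw  = prior.xmax - prior.xmin;        // prior width
  const float ph  = prior.ymax - prior.ymin;        // prior height
  const float pcx = (prior.xmin + prior.xmax) / 2.f;
  const float pcy = (prior.ymin + prior.ymax) / 2.f;
  // Multiplying by the variance here mirrors dividing the regression
  // targets by it during encoding (the loss-amplification trick above).
  const float cx = var[0] * loc[0] * pw + pcx;
  const float cy = var[1] * loc[1] * ph + pcy;
  const float w  = std::exp(var[2] * loc[2]) * pw;
  const float h  = std::exp(var[3] * loc[3]) * ph;
  return { cx - w / 2.f, cy - h / 2.f, cx + w / 2.f, cy + h / 2.f };
}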
