ssd源码detection_output_layer解析

概述

由于CPU版本速度太慢，实际应用环境中只能通过CUDA或者OpenCL实现，所以本文仅介绍CUDA版本。

detection_output_layer层的输入可以参考Caffe框架下SSD算法源码综述。它通过hpp，cpp和cu实现。
参看DetectionOutputLayer::Forward_gpu()，前向传播通过DecodeBBoxesGPU函数将预测得到的检测框进行解码操作
通过PermuteDataGPU函数重新排列类别预测值的维度。从deploy.prototxt可以看出，conf在作为detection_output_layer的输入之前已经做了softmax，所以我们不需要在detection_output_layer中再进行softmax
然后对上述两个结果进行处理：对不同的类别分别应用非极大值抑制（NMS）算法（各类别之间的非极大值抑制相互独立）
最终将处理后的数据放入输出层中
源码还有存储结果操作，不是必要项，所以不解析

源码解析

和常规的layer层一样，detection_output_layer主要由Forward和Backward组成，但没有实现Backward。

前向传播使用到的函数有:

DecodeBBoxesGPU函数
PermuteDataGPU函数
ApplyNMSFast函数

Forward_gpu

// Forward pass of the detection output layer (GPU entry point).
//
// Steps, as implemented below:
//   1. DecodeBBoxesGPU combines the location predictions (bottom[0]) with the
//      prior data (bottom[2]) and stores the decoded boxes in bbox_preds_.
//   2. PermuteDataGPU reorders the confidences (bottom[1]) so that, per image,
//      all priors belonging to one class are contiguous.
//   3. On the host, ApplyNMSFast runs independently for every non-background
//      class of every image; if keep_top_k_ > -1, only the keep_top_k_
//      highest-scoring survivors per image are retained.
//   4. Survivors are written to top[0] as rows of 7 values:
//      [image index, label, score, 4 box coordinates].
// No softmax is applied in this function; confidences are used as given.
template <typename Dtype>
void DetectionOutputLayer<Dtype>::Forward_gpu(
        const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
    // bottom[0]: predicted location values, fed to the box decoder.
  
    const Dtype* loc_data = bottom[0]->gpu_data();
    // bottom[2]: prior data handed to the decoder together with loc_data.
    const Dtype* prior_data = bottom[2]->gpu_data();
    // num: number of images in the batch (outer loop bound below).
    const int num = bottom[0]->num();

    // The mutable_* accessors are used for buffers this function writes to;
    // the plain accessors are used for read-only access.
    Dtype* bbox_data = bbox_preds_.mutable_gpu_data();
    // loc_count: total number of scalars in bbox_preds_ (whole batch).
    const int loc_count = bbox_preds_.count();
    // Whether decoded coordinates are clamped to [0, 1]; hard-coded off here.
    const bool clip_bbox = false;
    /*
     * Arguments passed to the decoder (notes from the original author; the
     * defaults are not verifiable from this function - confirm against the
     * layer setup / proto definition):
     *   code_type_                  - box encoding, said to default to
     *                                 CENTER_SIZE.
     *   variance_encoded_in_target_ - said to default to false, i.e. the prior
     *                                 variances are applied while decoding.
     *   num_priors_                 - number of prior boxes per image.
     *   share_location_             - said to default to true: one set of
     *                                 boxes is shared by all classes.
     *   num_loc_classes_            - share_location_ ? 1 : num_classes_
     *                                 (per the original note).
     *   background_label_id_        - label id of the background class.
     *   clip_bbox                   - clamp decoded coordinates to [0, 1].
    */
    DecodeBBoxesGPU<Dtype>(loc_count, loc_data, prior_data, code_type_,
        variance_encoded_in_target_, num_priors_, share_location_,
        num_loc_classes_, background_label_id_, clip_bbox, bbox_data);
    // Retrieve all decoded location predictions through a host pointer.
    // With per-class locations the boxes are first permuted into
    // bbox_permute_ (class-major, matching the permuted confidences).
    const Dtype* bbox_cpu_data;
    if (!share_location_) {
      Dtype* bbox_permute_data = bbox_permute_.mutable_gpu_data();
      PermuteDataGPU<Dtype>(loc_count, bbox_data, num_loc_classes_, num_priors_,
          4, bbox_permute_data);
      bbox_cpu_data = bbox_permute_.cpu_data();
    } else {
      bbox_cpu_data = bbox_preds_.cpu_data();
    }

    // Retrieve all confidences.
    Dtype* conf_permute_data = conf_permute_.mutable_gpu_data();
    // bottom[1] holds the class confidences. PermuteDataGPU rearranges them
    // from [batch, prior, class, 1] to [batch, class, prior, 1], so that the
    // scores of one class are contiguous for each image.
    PermuteDataGPU<Dtype>(bottom[1]->count(), bottom[1]->gpu_data(),
        num_classes_, num_priors_, 1, conf_permute_data);
    const Dtype* conf_cpu_data = conf_permute_.cpu_data();

    // num_kept: total detections kept over the whole batch.
    int num_kept = 0;
    // all_indices[i]: for image i, map label -> prior indices that survived.
    vector<map<int, vector<int> > > all_indices;
    // i: index of the image within the batch.
    for (int i = 0; i < num; ++i) {
      map<int, vector<int> > indices;
      int num_det = 0;
      // Offset of image i inside the permuted confidence buffer.
      const int conf_idx = i * num_classes_ * num_priors_;
      int bbox_idx;
      if (share_location_) {
        // Offset of image i inside the box buffer (4 values per prior).
        bbox_idx = i * num_priors_ * 4;
      } else {
        bbox_idx = conf_idx * 4;
      }
      // The permuted confidences are laid out image-major, then class-major,
      // hence the class loop nested inside the image loop. indices[c] receives
      // the prior indices of class c that survive NMS.
      for (int c = 0; c < num_classes_; ++c) {
        // The background class produces no detections.
        if (c == background_label_id_) {
          // Ignore background class.
          continue;
        }
        /* Start of the scores of class c for image i: conf_idx already points
         * at image i, so only c * num_priors_ has to be added.
        */
        const Dtype* cur_conf_data = conf_cpu_data + conf_idx + c * num_priors_;
        // Start of the boxes for image i.
        const Dtype* cur_bbox_data = bbox_cpu_data + bbox_idx;
        // Per-class boxes only: advance to the boxes of class c.
        if (!share_location_) {
          cur_bbox_data += c * num_priors_ * 4;
        }
        /*
         * Non-maximum suppression for this class.
         *   confidence_threshold_ - minimum score for a candidate.
         *   nms_threshold_        - overlap threshold for suppression.
         *   eta_                  - factor for the adaptive threshold.
         *   top_k_                - maximum number of candidates considered.
        */
        ApplyNMSFast(cur_bbox_data, cur_conf_data, num_priors_,
            confidence_threshold_, nms_threshold_, eta_, top_k_, &(indices[c]));
        // Accumulate the detections of all classes for this image.
        num_det += indices[c].size();
      }

      // If the image has more than keep_top_k_ detections, keep only the
      // keep_top_k_ highest-scoring ones across all classes (a plain sort by
      // score; no additional NMS is performed here).
      if (keep_top_k_ > -1 && num_det > keep_top_k_) {
        vector<pair<float, pair<int, int> > > score_index_pairs;
        for (map<int, vector<int> >::iterator it = indices.begin();
             it != indices.end(); ++it) {
          int label = it->first;
          const vector<int>& label_indices = it->second;
          for (int j = 0; j < label_indices.size(); ++j) {
            int idx = label_indices[j];
            float score = conf_cpu_data[conf_idx + label * num_priors_ + idx];
            score_index_pairs.push_back(std::make_pair(
                    score, std::make_pair(label, idx)));
          }
        }
        // Keep top k results per image.
        std::sort(score_index_pairs.begin(), score_index_pairs.end(),
                  SortScorePairDescend<pair<int, int> >);
        score_index_pairs.resize(keep_top_k_);
        // Store the new indices.
        map<int, vector<int> > new_indices;
        for (int j = 0; j < score_index_pairs.size(); ++j) {
          int label = score_index_pairs[j].second.first;
          int idx = score_index_pairs[j].second.second;
          new_indices[label].push_back(idx);
        }
        all_indices.push_back(new_indices);
        num_kept += keep_top_k_;
      } else {
        all_indices.push_back(indices);
        num_kept += num_det;
      }
    }

    // Write the results to top[0]. The shape starts as [1, 1] ...
    vector<int> top_shape(2, 1);
    // ... the third dimension is the number of kept detections ...
    top_shape.push_back(num_kept);
    // ... and the fourth dimension is the 7 values stored per detection.
    top_shape.push_back(7);
    Dtype* top_data;
    if (num_kept == 0) {
      // No detections at all: emit one placeholder row per image, filled with
      // -1 except for the image index in the first column.
      LOG(INFO) << "Couldn't find any detections";
      top_shape[2] = num;
      top[0]->Reshape(top_shape);
      top_data = top[0]->mutable_cpu_data();
      caffe_set<Dtype>(top[0]->count(), -1, top_data);
      // Generate fake results per image.
      // top_data is advanced past the buffer here; the loop further down
      // writes nothing in this case because every index list is empty.
      for (int i = 0; i < num; ++i) {
        top_data[0] = i;
        top_data += 7;
      }
    } else {
      top[0]->Reshape(top_shape);
      top_data = top[0]->mutable_cpu_data();
    }

    // count: running row index into top_data.
    int count = 0;
    for (int i = 0; i < num; ++i) {
      // Same per-image offsets as in the NMS loop above.
      const int conf_idx = i * num_classes_ * num_priors_;
      int bbox_idx;
      if (share_location_) {
        bbox_idx = i * num_priors_ * 4;
      } else {
        bbox_idx = conf_idx * 4;
      }
      for (map<int, vector<int> >::iterator it = all_indices[i].begin();
           it != all_indices[i].end(); ++it) {
        int label = it->first;
        vector<int>& indices = it->second;
        const Dtype* cur_conf_data =
          conf_cpu_data + conf_idx + label * num_priors_;
        const Dtype* cur_bbox_data = bbox_cpu_data + bbox_idx;
        if (!share_location_) {
          cur_bbox_data += label * num_priors_ * 4;
        }
        for (int j = 0; j < indices.size(); ++j) {
          /*
           * Layout of one output row (7 values):
           *   0          - index of the image within the batch
           *   1          - class label
           *   2          - confidence score
           *   3, 4, 5, 6 - the four decoded box values, in the order written
           *                by the decoder (xmin, ymin, xmax, ymax)
          */
          int idx = indices[j];
          top_data[count * 7] = i;
          top_data[count * 7 + 1] = label;
          top_data[count * 7 + 2] = cur_conf_data[idx];
          for (int k = 0; k < 4; ++k) {
            top_data[count * 7 + 3 + k] = cur_bbox_data[idx * 4 + k];
          }
          ++count;
        }
      }
    }
}

DecodeBBoxesGPU实现：

DecodeBBoxesGPU使用到了DecodeBBoxesKernel核函数

// Host-side launcher for DecodeBBoxesKernel.
//
// Forwards every argument unchanged to the kernel, which combines the
// predicted locations (loc_data) with the prior data (prior_data) and writes
// the decoded boxes to bbox_data. nthreads is the number of scalars to decode
// and also determines the launch size.
template <typename Dtype>
void DecodeBBoxesGPU(const int nthreads,
          const Dtype* loc_data, const Dtype* prior_data,
          const CodeType code_type, const bool variance_encoded_in_target,
          const int num_priors, const bool share_location,
          const int num_loc_classes, const int background_label_id,
          const bool clip_bbox, Dtype* bbox_data) {
  // Launch configuration derived from the element count.
  const int grid_size = CAFFE_GET_BLOCKS(nthreads);
  const int block_size = CAFFE_CUDA_NUM_THREADS;

  // NOLINT_NEXT_LINE(whitespace/operators)
  DecodeBBoxesKernel<Dtype><<<grid_size, block_size>>>(
      nthreads, loc_data, prior_data, code_type, variance_encoded_in_target,
      num_priors, share_location, num_loc_classes, background_label_id,
      clip_bbox, bbox_data);

  // Report any error raised by the launch.
  CUDA_POST_KERNEL_CHECK;
}

DecodeBBoxesKernel核函数实现：

// CUDA kernel that decodes location predictions into box coordinates.
//
// Every loop iteration produces ONE scalar of the output; `index` is the flat
// position in loc_data / bbox_data, whose innermost axes are
// [prior, loc_class, 4].
//
//   nthreads   - number of scalars to decode.
//   loc_data   - encoded location predictions.
//   prior_data - prior boxes: 4 coordinates per prior starting at offset 0,
//                followed by 4 variances per prior starting at
//                num_priors * 4.
//   code_type  - encoding scheme (CORNER, CENTER_SIZE or CORNER_SIZE).
//   variance_encoded_in_target - if true the variances are NOT applied here.
//   num_priors - number of prior boxes per image.
//   share_location / num_loc_classes / background_label_id - when locations
//                are per class, elements of the background class are skipped.
//   clip_bbox  - clamp every decoded value to [0, 1].
//   bbox_data  - output buffer, same flat indexing as loc_data.
template <typename Dtype>
__global__ void DecodeBBoxesKernel(const int nthreads,
          const Dtype* loc_data, const Dtype* prior_data,
          const CodeType code_type, const bool variance_encoded_in_target,
          const int num_priors, const bool share_location,
          const int num_loc_classes, const int background_label_id,
          const bool clip_bbox, Dtype* bbox_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // i: which of the 4 box components this element is (0..3).
    const int i = index % 4;
    // c: location-class slot; always 0 when num_loc_classes is 1.
    const int c = (index / 4) % num_loc_classes;
    // d: index of the prior box within the image.
    const int d = (index / 4 / num_loc_classes) % num_priors;
    if (!share_location && c == background_label_id) {
      // Ignore background class if not share_location.
      // `continue` rather than `return`: a `return` would end this thread's
      // whole loop and silently drop every later element assigned to it.
      // NOTE(review): assumes CUDA_KERNEL_LOOP expands to a `for` loop, as in
      // Caffe's common CUDA header - confirm.
      continue;
    }
    /*
     * pi: offset of prior d's 4 coordinates in prior_data.
     * vi: offset of prior d's 4 variances in prior_data.
     * prior_data is addressed through d only, so the same priors are reused
     * for every image in the batch.
    */
    const int pi = d * 4;
    const int vi = pi + num_priors * 4;
    if (code_type == PriorBoxParameter_CodeType_CORNER) {
      // CORNER: the prediction is an offset added to the prior coordinate.
      if (variance_encoded_in_target) {
        // variance is encoded in target, we simply need to add the offset
        // predictions.
        bbox_data[index] = prior_data[pi + i] + loc_data[index];
      } else {
        // variance is encoded in bbox, we need to scale the offset accordingly.
        bbox_data[index] =
          prior_data[pi + i] + loc_data[index] * prior_data[vi + i];
      }
    } else if (code_type == PriorBoxParameter_CodeType_CENTER_SIZE) {
      // Corner coordinates of the prior box.
      const Dtype p_xmin = prior_data[pi];
      const Dtype p_ymin = prior_data[pi + 1];
      const Dtype p_xmax = prior_data[pi + 2];
      const Dtype p_ymax = prior_data[pi + 3];
      // Width, height and center of the prior box.
      const Dtype prior_width = p_xmax - p_xmin;
      const Dtype prior_height = p_ymax - p_ymin;
      const Dtype prior_center_x = (p_xmin + p_xmax) / 2.;
      const Dtype prior_center_y = (p_ymin + p_ymax) / 2.;

      // The 4 predicted values of this box (index - i is its first element).
      // Despite the names they are used below as encoded offsets for
      // center x, center y, width and height respectively.
      const Dtype xmin = loc_data[index - i];
      const Dtype ymin = loc_data[index - i + 1];
      const Dtype xmax = loc_data[index - i + 2];
      const Dtype ymax = loc_data[index - i + 3];

      Dtype decode_bbox_center_x, decode_bbox_center_y;
      Dtype decode_bbox_width, decode_bbox_height;
      if (variance_encoded_in_target) {
        // variance is encoded in target, we simply need to restore the offset
        // predictions.
        decode_bbox_center_x = xmin * prior_width + prior_center_x;
        decode_bbox_center_y = ymin * prior_height + prior_center_y;
        decode_bbox_width = exp(xmax) * prior_width;
        decode_bbox_height = exp(ymax) * prior_height;
      } else {
        // variance is encoded in bbox, we need to scale the offset accordingly.
        decode_bbox_center_x =
          prior_data[vi] * xmin * prior_width + prior_center_x;
        decode_bbox_center_y =
          prior_data[vi + 1] * ymin * prior_height + prior_center_y;
        decode_bbox_width =
          exp(prior_data[vi + 2] * xmax) * prior_width;
        decode_bbox_height =
          exp(prior_data[vi + 3] * ymax) * prior_height;
      }

      // Convert center/size back to the corner component this element holds.
      switch (i) {
        // xmin
        case 0:
          bbox_data[index] = decode_bbox_center_x - decode_bbox_width / 2.;
          break;
        // ymin
        case 1:
          bbox_data[index] = decode_bbox_center_y - decode_bbox_height / 2.;
          break;
        // xmax
        case 2:
          bbox_data[index] = decode_bbox_center_x + decode_bbox_width / 2.;
          break;
        // ymax
        case 3:
          bbox_data[index] = decode_bbox_center_y + decode_bbox_height / 2.;
          break;
      }
    } else if (code_type == PriorBoxParameter_CodeType_CORNER_SIZE) {
      // CORNER_SIZE: like CORNER, but the offset is scaled by the prior's
      // width (x components) or height (y components).
      const Dtype p_xmin = prior_data[pi];
      const Dtype p_ymin = prior_data[pi + 1];
      const Dtype p_xmax = prior_data[pi + 2];
      const Dtype p_ymax = prior_data[pi + 3];
      const Dtype prior_width = p_xmax - p_xmin;
      const Dtype prior_height = p_ymax - p_ymin;
      Dtype p_size;
      if (i == 0 || i == 2) {
        p_size = prior_width;
      } else {
        p_size = prior_height;
      }
      if (variance_encoded_in_target) {
        // variance is encoded in target, we simply need to add the offset
        // predictions.
        bbox_data[index] = prior_data[pi + i] + loc_data[index] * p_size;
      } else {
        // variance is encoded in bbox, we need to scale the offset accordingly.
        bbox_data[index] =
          prior_data[pi + i] + loc_data[index] * prior_data[vi + i] * p_size;
      }
    } else {
      // Unknown code type: bbox_data[index] is left as it was.
    }
    // Optionally clamp the decoded value to [0, 1].
    if (clip_bbox) {
      bbox_data[index] = max(min(bbox_data[index], Dtype(1.)), Dtype(0.));
    }
  }
}

PermuteDataGPU实现

PermuteDataGPU使用到了PermuteDataKernel核函数

// Host-side launcher for PermuteDataKernel.
//
// Copies `data` into `new_data` with the class axis and the data (prior) axis
// swapped; nthreads is the number of scalars to move. For the confidence
// blob the caller passes (num_classes_, num_priors_, 1) as
// (num_classes, num_data, num_dim).
template <typename Dtype>
void PermuteDataGPU(const int nthreads,
          const Dtype* data, const int num_classes, const int num_data,
          const int num_dim, Dtype* new_data) {
  // Launch configuration derived from the element count.
  const int grid_size = CAFFE_GET_BLOCKS(nthreads);
  const int block_size = CAFFE_CUDA_NUM_THREADS;

  // NOLINT_NEXT_LINE(whitespace/operators)
  PermuteDataKernel<Dtype><<<grid_size, block_size>>>(
      nthreads, data, num_classes, num_data, num_dim, new_data);

  // Report any error raised by the launch.
  CUDA_POST_KERNEL_CHECK;
}

PermuteDataKernel实现

// Reorders a buffer from [batch, data, class, dim] to
// [batch, class, data, dim]: element `index` of `data` is copied to the
// position in `new_data` that has the class and data axes swapped.
template <typename Dtype>
__global__ void PermuteDataKernel(const int nthreads,
          const Dtype* data, const int num_classes, const int num_data,
          const int num_dim, Dtype* new_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // Peel the source coordinates off the flat index, innermost axis first.
    int remaining = index;
    const int dim_pos = remaining % num_dim;      // position inside one item
    remaining /= num_dim;
    const int cls = remaining % num_classes;      // class
    remaining /= num_classes;
    const int item = remaining % num_data;        // data (prior) position
    const int batch = remaining / num_data;       // image within the batch

    // Re-linearize with the class axis placed before the data axis.
    const int dst =
        ((batch * num_classes + cls) * num_data + item) * num_dim + dim_pos;
    new_data[dst] = data[index];
  }
}

ApplyNMSFast实现

ApplyNMSFast使用到了GetMaxScoreIndex函数

template <typename Dtype>
void ApplyNMSFast(const Dtype* bboxes, const Dtype* scores, const int num,
      const float score_threshold, const float nms_threshold,
      const float eta, const int top_k, vector<int>* indices) {
  // Get top_k scores (with corresponding indices).
  // std::pair 存储的是成对变量第一个索引为模板类定义的变量为存储的分类得分，而第二个值对int型变量
  vector<pair<Dtype, int> > score_index_vec;
  // score_index_vec 内部存储的是分数以及位置，以分数的降序排序，大小小于或者等于top_k_
  GetMaxScoreIndex(scores, num, score_threshold, top_k, &score_index_vec);

  // Do nms.
  float adaptive_threshold = nms_threshold;
  indices->clear();
  while (score_index_vec.size() != 0) {
    const int idx = score_index_vec.front().second;
    bool keep = true;
    // 与indices所存储的数据进行比对，如果公共比例小于adaptive_threshold阈值，
    // 则将keep设置为true,否则为false
    for (int k = 0; k < indices->size(); ++k) {
      if (keep) {
        const int kept_idx = (*indices)[k];
        float overlap = JaccardOverlap(bboxes + idx * 4, bboxes + kept_idx * 4);
        keep = overlap <= adaptive_threshold;
      } else {
        break;
      }
    }
    // 将其添加进indices中
    if (keep) {
      indices->push_back(idx);
    }
    // 删除score_index_vec中的第一个数据
    score_index_vec.erase(score_index_vec.begin());
    if (keep && eta < 1 && adaptive_threshold > 0.5) {
      adaptive_threshold *= eta;
    }
  }
}

GetMaxScoreIndex实现

// Collects the candidates for NMS.
//
// Appends a (score, index) pair to *score_index_vec for every score strictly
// greater than `threshold`, sorts the vector with SortScorePairDescend, and,
// when top_k is not -1, truncates it to at most top_k entries.
template <typename Dtype>
void GetMaxScoreIndex(const Dtype* scores, const int num, const float threshold,
      const int top_k, vector<pair<Dtype, int> >* score_index_vec) {
  // Pair every sufficiently high score with the position it came from.
  for (int pos = 0; pos < num; ++pos) {
    const Dtype score = scores[pos];
    if (!(score > threshold)) {
      continue;
    }
    score_index_vec->push_back(std::make_pair(score, pos));
  }

  // Order the pairs by score using the shared comparator.
  std::sort(score_index_vec->begin(), score_index_vec->end(),
            SortScorePairDescend<int>);

  // Truncate to top_k entries when a limit is set and actually exceeded.
  const bool has_limit = top_k > -1;
  if (has_limit && score_index_vec->size() > top_k) {
    score_index_vec->resize(top_k);
  }
}

后记

竟然使用了一下午的时间注释源码。