论文阅读笔记：Center Loss: A Discriminative Feature Learning Approach for Deep Face Recognition

本文主要包含如下内容：

论文阅读笔记：Center Loss: A Discriminative Feature Learning Approach for Deep Face Recognition

本篇论文针对图像分类添加一个新损失 center loss, 进一步缩小类内距离,获得更高的性能. ECCV2016

主要思想

center loss 主要在softmax的基础上,通过对训练集的每一个类别在特征空间分别维护一个类中心,在训练过程中,增加样本经过网络映射后在特征空间与类中心的距离约束,从而兼顾了类内聚合与类间分离.

网络结构

center loss 作为训练阶段的辅助loss,只需要在特征输出层中引入即可:

这里损失被更改为 softmax 和 center loss,其中,center loss作为辅助损失约束前一层特征的内类距离,softmax loss 约束类间距离.在损失函数的定义中:权重W代表最后一层的权重.

center loss:cyi表示第yi个类别的特征中心，xi表示全连接层之前的特征。其中,m表示mini-batch的大小。因此这个公式就是希望一个batch中的每个样本的feature离feature 的中心的距离的平方和要越小越好，也就是类内距离要越小越好。

示意图为算法的大致流程:首先更新权重 W,其权值更新只与 softmax 损失有关;随后,更新类中心 C,类中心的计算为当前类中心的值与类中心的平均偏移量之和,这里有一个超参数a(为了防止mini-bath类中的扰动);最后,正常情况下更新网络的权重

实验结果

只采用损失 softmax loss 结果:

softmax loss + center loss 结果:主要变化在于类内距离 intra-class

代码实现

源文件主要由四个部分组成:caffe.proto/center_loss_layer.cpp/center_loss_layer.cu/center_loss_layer.hpp. 只需要导入到对应的caffe中编译即可,这里我们就再进行相应的讲解.
接下来,我们看看源码center_loss_layer.hpp:

#ifndef CAFFE_CENTER_LOSS_LAYER_HPP_
#define CAFFE_CENTER_LOSS_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/loss_layer.hpp"

namespace caffe {

template <typename Dtype>
class CenterLossLayer : public LossLayer<Dtype> {
 public:
  explicit CenterLossLayer(const LayerParameter& param)
      : LossLayer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "CenterLoss"; }
  virtual inline int ExactNumBottomBlobs() const { return 2; }
  virtual inline int ExactNumTopBlobs() const { return -1; }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  int M_;       // mini_batch
  int K_;       // 特征输入的长度
  int N_;       // 输出神经元的个数

  Blob<Dtype> distance_;        // 存放前向传播距离差
  Blob<Dtype> variation_sum_;       // 记录样本距离对应类中心的距离
};

}  // namespace caffe

#endif  // CAFFE_CENTER_LOSS_LAYER_HPP_

接下来,我们看看源码center_loss_layer.cpp:

#include <vector>

#include "caffe/filler.hpp"
#include "caffe/layers/center_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
void CenterLossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const int num_output = this->layer_param_.center_loss_param().num_output();  
  N_ = num_output;      // 输出神经元的个数
  const int axis = bottom[0]->CanonicalAxisIndex(   // 将特征C*H*W拉直为一个特征
      this->layer_param_.center_loss_param().axis());
  // Dimensions starting from "axis" are "flattened" into a single
  // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W),
  // and axis == 1, N inner products with dimension CHW are performed.
  K_ = bottom[0]->count(axis);      // 特征输入的长度
  // Check if we need to set up the weights
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Skipping parameter initialization";
  } else {
    this->blobs_.resize(1);
    // Intialize the weight
    vector<int> center_shape(2);
    center_shape[0] = N_;   // 类似输出维度
    center_shape[1] = K_;   // 类似输入维度
    this->blobs_[0].reset(new Blob<Dtype>(center_shape));   // 尺寸设置为center_shape的大小
    // fill the weights
    shared_ptr<Filler<Dtype> > center_filler(GetFiller<Dtype>(
        this->layer_param_.center_loss_param().center_filler()));
    center_filler->Fill(this->blobs_[0].get());

  }  // parameter initialization
  this->param_propagate_down_.resize(this->blobs_.size(), true);
}

template <typename Dtype>
void CenterLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  CHECK_EQ(bottom[1]->channels(), 1);
  CHECK_EQ(bottom[1]->height(), 1);
  CHECK_EQ(bottom[1]->width(), 1);
  M_ = bottom[0]->num();    // mini_batch
  // The top shape will be the bottom shape with the flattened axes dropped,
  // and replaced by a single axis with dimension num_output (N_).
  LossLayer<Dtype>::Reshape(bottom, top);
  distance_.ReshapeLike(*bottom[0]);
  variation_sum_.ReshapeLike(*this->blobs_[0]);
}

template <typename Dtype>
void CenterLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data(); // 输入倒数第二层的feature
  const Dtype* label = bottom[1]->cpu_data();       // 输入label
  const Dtype* center = this->blobs_[0]->cpu_data();    // 指向类中心的指针
  Dtype* distance_data = distance_.mutable_cpu_data();  // 记录样本距离对应类中心的距离

  // the i-th distance_data
  for (int i = 0; i < M_; i++) {
    const int label_value = static_cast<int>(label[i]);     // 第i个样本对应的label标签
    // D(i,:) = X(i,:) - C(y(i),:)
    caffe_sub(K_, bottom_data + i * K_, center + label_value * K_, distance_data + i * K_); // 计算样本到对应类中心的距离,存放在distance_中
  }
  Dtype dot = caffe_cpu_dot(M_ * K_, distance_.cpu_data(), distance_.cpu_data());   // L2范数,定义loss
  Dtype loss = dot / M_ / Dtype(2);
  top[0]->mutable_cpu_data()[0] = loss;     // 存放loss
}

template <typename Dtype>
void CenterLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  // Gradient with respect to centers
  if (this->param_propagate_down_[0]) {
    const Dtype* label = bottom[1]->cpu_data();     // 输入label
    Dtype* center_diff = this->blobs_[0]->mutable_cpu_diff();       // 存取center的更新值
    Dtype* variation_sum_data = variation_sum_.mutable_cpu_data();
    const Dtype* distance_data = distance_.cpu_data();

    // \sum_{y_i==j}
    caffe_set(N_ * K_, (Dtype)0., variation_sum_.mutable_cpu_data());       // variation_sum_初始化
    for (int n = 0; n < N_; n++) {
      int count = 0;
      for (int m = 0; m < M_; m++) {
        const int label_value = static_cast<int>(label[m]); // 第m个样本对应的label标签
        if (label_value == n) {
          count++;
          caffe_sub(K_, variation_sum_data + n * K_, distance_data + m * K_, variation_sum_data + n * K_);  // distance_data为前向传播时存的D(i,:) = X(i,:) - C(y(i),:) // variation_sum_data为每一类的总偏移量
        }   
      }
      caffe_axpy(K_, (Dtype)1./(count + (Dtype)1.), variation_sum_data + n * K_, center_diff + n * K_);     // 计算类中心的偏移量(对每一类而言),这里记录了this->blobs_[0]->mutable_cpu_diff(),更新后获得新的this->blobs_[0]->cpu_data()
    }
  }
  // Gradient with respect to bottom data 
  if (propagate_down[0]) {
    caffe_copy(M_ * K_, distance_.cpu_data(), bottom[0]->mutable_cpu_diff());       // 返回每一个样本的偏移量(二范数的梯度就是偏移量)
    caffe_scal(M_ * K_, top[0]->cpu_diff()[0] / M_, bottom[0]->mutable_cpu_diff()); // l2范数的导数 ,bottom[0]->mutable_cpu_diff()就是D(i,:) = X(i,:) - C(y(i),:)。
  } 
  if (propagate_down[1]) {
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
}

#ifdef CPU_ONLY
STUB_GPU(CenterLossLayer);
#endif

INSTANTIATE_CLASS(CenterLossLayer);
REGISTER_LAYER_CLASS(CenterLoss);
}  // namespace caffe