SSD网络解析之Permute层

Permute层是SSD（Single Shot MultiBox Detector）中用于置换索引轴顺序的，与matlab中的permute()函数实现类似的功能，首先我们看一下caffe.proto中关于该层参数的说明：

optional PermuteParameter permute_param = 202;

message PermuteParameter {
  // The new order of the axes of the data. Every value must be a valid axis
  // index of the input blob, counting from 0.
  // Do not list the same axis twice.
  // Axes that are not listed are appended in their natural order by
  // PermuteLayer::LayerSetUp.
  repeated uint32 order = 1;
}

从上述PermuteParameter中可以看出，需要设置的参数为数组order，即置换后的索引轴顺序，可以指定输入blob中所有索引轴（维度）的顺序，例如输入blob为num(0)×channel(1)×height(2)×width(3)，如果想要置换前两轴，则可设置

permute_param {
    # Swap axis 0 (num) and axis 1 (channel); height and width stay in place.
    order: 1
    order: 0
    order: 2
    order: 3
}

在上述情况下，由于高度和宽度轴不变，故也可以忽略后两轴的设置（未指定的轴会按原有顺序自动补在后面），直接设置为

permute_param {
    # Only the leading axes are listed; the remaining axes (2 and 3) are
    # appended in their natural order by PermuteLayer::LayerSetUp.
    order: 1
    order: 0
}

其次，我们需要看一下该层的头文件，即permute_layer.hpp

#ifndef CAFFE_PERMUTE_LAYER_HPP_
#define CAFFE_PERMUTE_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
 * @brief Permute the input blob by changing the memory order of the data.
 *
 * TODO(weiliu89): thorough documentation for Forward, Backward, and proto params.
 */

// The main function which does the permute.
// Copies elements between bottom_data and top_data according to the axis
// permutation; this is the routine that actually performs the reordering.
//   count         number of elements to process (indices 0 .. count-1 of
//                 top_data)
//   bottom_data   buffer in the original (un-permuted) layout
//   forward       true:  top_data[i] = bottom_data[old_idx]
//                 false: bottom_data[old_idx] = top_data[i]
//   permute_order permute_order[j] is the bottom axis placed at top axis j
//   old_steps     per-axis step sizes of the bottom layout
//   new_steps     per-axis step sizes of the top layout
//   num_axes      number of axes, i.e. length of the three arrays above
//   top_data      buffer in the permuted layout
template <typename Dtype>
void Permute(const int count, Dtype* bottom_data, const bool forward,
    const int* permute_order, const int* old_steps, const int* new_steps,
    const int num_axes, Dtype* top_data);

// PermuteLayer: a layer, derived from Layer<Dtype>, that reorders the axes of
// its single bottom blob and writes the result to its single top blob.
template <typename Dtype>
class PermuteLayer : public Layer<Dtype> {
 public:
  explicit PermuteLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  // Reads permute_param, validates and completes the axis order, and sizes
  // the helper blobs below.
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  // Fills old_steps_ / new_steps_ and reshapes the top blob to the permuted
  // shape.
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "Permute"; }
  virtual inline int ExactNumBottomBlobs() const { return 1; } // exactly one bottom (input) blob
  virtual inline int ExactNumTopBlobs() const { return 1; }   // exactly one top (output) blob

 protected:
  // Forward passes copy bottom data into the top blob in permuted order;
  // backward passes copy the top diff back into the bottom diff. When no
  // permutation is needed the blobs share data / diff instead.
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  int num_axes_; // number of axes (dimensions) of the bottom blob
  bool need_permute_; // true when the final axis order differs from 0, 1, ..., num_axes_ - 1

  // Use Blob because it is convenient to be accessible in .cu file.
  Blob<int> permute_order_; // permute_order_[i]: bottom axis that becomes top axis i
  Blob<int> old_steps_;  // old_steps_[i]: element count of all bottom axes after axis i (1 for the last axis)
  Blob<int> new_steps_;  // new_steps_[i]: element count of all top axes after axis i (1 for the last axis)
};

}  // namespace caffe

#endif  // CAFFE_PERMUTE_LAYER_HPP_

在此基础上，我们移步看一下头文件中各函数在cpp文件中的实现，permute_layer.cpp(CPU实现)和permute_layer.cu(GPU实现)。

（1）permute_layer.cpp

#include <algorithm>
#include <vector>

#include "caffe/layers/permute_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

// Core routine that performs the permutation.
//
// For every linear index of the permuted (top) layout, the index is
// decomposed axis by axis into coordinates using new_steps, and the matching
// linear index of the original (bottom) layout is rebuilt from those
// coordinates using old_steps. With forward == true the element is copied
// bottom -> top; otherwise it is copied top -> bottom.
template <typename Dtype>
void Permute(const int count, Dtype* bottom_data, const bool forward,
    const int* permute_order, const int* old_steps, const int* new_steps,
    const int num_axes, Dtype* top_data) {
  for (int top_idx = 0; top_idx < count; ++top_idx) {
    int bottom_idx = 0;       // linear index in the original layout
    int remaining = top_idx;  // part of top_idx not yet decomposed
    for (int axis = 0; axis < num_axes; ++axis) {
      // Coordinate along top axis `axis`; that axis is bottom axis
      // permute_order[axis], so it advances by that axis' old step.
      const int coord = remaining / new_steps[axis];
      bottom_idx += coord * old_steps[permute_order[axis]];
      remaining %= new_steps[axis];
    }
    if (!forward) {
      bottom_data[bottom_idx] = top_data[top_idx];
    } else {
      top_data[top_idx] = bottom_data[bottom_idx];
    }
  }
}

// Layer setup: validates the configured axis order, completes it with any
// unspecified axes, decides whether a permutation is needed at all, and sizes
// the helper blobs and the top blob.
//
// bottom: must hold exactly one blob (checked below).
// top:    top[0] is reshaped to the permuted shape.
// Aborts via CHECK / LOG(FATAL) when an order value is out of range or
// repeated.
template <typename Dtype>
void PermuteLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  // The parameter message is only read here, so bind it by reference instead
  // of copying it.
  const PermuteParameter& permute_param = this->layer_param_.permute_param();
  CHECK_EQ(bottom.size(), 1);
  num_axes_ = bottom[0]->num_axes();  // number of axes of the input blob
  vector<int> orders;
  // Push the specified new orders.
  for (int i = 0; i < permute_param.order_size(); ++i) {
    // `order` is a uint32 field. Range-check it while it is still unsigned:
    // converting first would turn values >= 2^31 into negative ints that
    // slip past a signed "less than" comparison.
    const unsigned int raw_order = permute_param.order(i);
    CHECK_LT(raw_order, static_cast<unsigned int>(num_axes_))
        << "order should be less than the input dimension.";
    const int order = static_cast<int>(raw_order);
    // std::find returns orders.end() when the value has not been seen yet.
    if (std::find(orders.begin(), orders.end(), order) != orders.end()) {
      LOG(FATAL) << "there are duplicate orders";
    }
    orders.push_back(order);
  }
  // Push the rest orders. And save original step sizes for each axis.
  // The configured list may be shorter than num_axes_: e.g. to swap the first
  // two of four axes it is enough to give order: 1, order: 0. Every axis that
  // was not listed is appended here in its natural order (giving 1,0,2,3).
  for (int i = 0; i < num_axes_; ++i) {
    if (std::find(orders.begin(), orders.end(), i) == orders.end()) {
      orders.push_back(i);
    }
  }
  // Cast keeps the comparison between two ints (size() is unsigned).
  CHECK_EQ(num_axes_, static_cast<int>(orders.size()));
  // Check if we need to reorder the data or keep it.
  need_permute_ = false;
  for (int i = 0; i < num_axes_; ++i) {
    if (orders[i] != i) {
      // As long as there is one order which is different from the natural order
      // of the data, we need to permute. Otherwise, we share the data and diff.
      need_permute_ = true;
      break;
    }
  }

  vector<int> top_shape(num_axes_, 1);  // shape of the top blob after permuting
  // The three helpers are Blobs so that the .cu implementation can read them
  // through gpu_data().
  permute_order_.Reshape(num_axes_, 1, 1, 1);
  old_steps_.Reshape(num_axes_, 1, 1, 1);
  new_steps_.Reshape(num_axes_, 1, 1, 1);
  for (int i = 0; i < num_axes_; ++i) {
    permute_order_.mutable_cpu_data()[i] = orders[i];  // record the final axis order
    top_shape[i] = bottom[0]->shape(orders[i]);  // top axis i takes the extent of bottom axis orders[i]
  }
  top[0]->Reshape(top_shape);  // give the top blob the permuted shape
}


// Recomputes the per-axis step sizes of the bottom and top layouts and gives
// the top blob the permuted shape.
//
// The step of an axis is the element count of all axes after it
// (Blob::count(axis + 1)); the last axis always has step 1.
template <typename Dtype>
void PermuteLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const int last_axis = num_axes_ - 1;

  vector<int> permuted_shape;
  for (int axis = 0; axis < num_axes_; ++axis) {
    if (axis != last_axis) {
      old_steps_.mutable_cpu_data()[axis] = bottom[0]->count(axis + 1);
    } else {
      old_steps_.mutable_cpu_data()[axis] = 1;
    }
    permuted_shape.push_back(
        bottom[0]->shape(permute_order_.cpu_data()[axis]));
  }
  // LayerSetUp performs the same reshape once; doing it here as well derives
  // the top shape from the bottom blob passed to this call, and the new steps
  // below are read from the reshaped top blob.
  top[0]->Reshape(permuted_shape);

  for (int axis = 0; axis < num_axes_; ++axis) {
    if (axis != last_axis) {
      new_steps_.mutable_cpu_data()[axis] = top[0]->count(axis + 1);
    } else {
      new_steps_.mutable_cpu_data()[axis] = 1;
    }
  }
}

// Forward pass on the CPU: fills the top blob with the bottom data rearranged
// into the permuted axis order, or shares the data when the order is
// unchanged.
template <typename Dtype>
void PermuteLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  if (!need_permute_) {
    // If there is no need to permute, we share data to save memory.
    top[0]->ShareData(*bottom[0]);
    return;
  }
  // Permute() takes a non-const bottom pointer (it writes to it in the
  // backward direction), hence the mutable accessor.
  Dtype* const src = bottom[0]->mutable_cpu_data();
  Dtype* const dst = top[0]->mutable_cpu_data();
  const int total = top[0]->count();
  const int* const axis_order = permute_order_.cpu_data();
  const int* const src_steps = old_steps_.cpu_data();
  const int* const dst_steps = new_steps_.cpu_data();
  // forward == true: Permute() writes dst[i] = src[old index of i].
  Permute(total, src, true, axis_order, src_steps, dst_steps, num_axes_,
          dst);
}

// Backward pass on the CPU: copies the top diff back into the bottom diff in
// the bottom blob's original axis order, or shares the diff when the order is
// unchanged. propagate_down is not consulted.
template <typename Dtype>
void PermuteLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!need_permute_) {
    // If there is no need to permute, we share diff to save memory.
    bottom[0]->ShareDiff(*top[0]);
    return;
  }
  Dtype* const out_grad = top[0]->mutable_cpu_diff();
  Dtype* const in_grad = bottom[0]->mutable_cpu_diff();
  const int total = top[0]->count();
  const int* const axis_order = permute_order_.cpu_data();
  const int* const src_steps = old_steps_.cpu_data();
  const int* const dst_steps = new_steps_.cpu_data();
  // forward == false: Permute() writes in_grad[old index of i] = out_grad[i].
  Permute(total, in_grad, false, axis_order, src_steps, dst_steps, num_axes_,
          out_grad);
}

// In CPU_ONLY builds STUB_GPU is expanded for PermuteLayer. The macro is
// defined outside this file; presumably it supplies placeholder
// Forward_gpu / Backward_gpu definitions -- confirm against its definition.
#ifdef CPU_ONLY
STUB_GPU(PermuteLayer);
#endif

// Caffe macros defined outside this file: presumably they instantiate the
// class template for the supported Dtypes and register the layer under the
// type name "Permute" -- confirm against their definitions.
INSTANTIATE_CLASS(PermuteLayer);
REGISTER_LAYER_CLASS(Permute);

}  // namespace caffe

其中的Permute()函数是实现置换的关键函数，它利用置换前后各轴的步长（即该轴之后所有维度的元素总数）来完成置换，也就是借助保存的old_steps_和new_steps_进行索引换算，具体举个例子：

如果输入blob的各维数为num=2,channel=2,height=3,width=2，置换后的索引轴顺序为num×channel×width×height，则按照上述Reshape()函数可知：

old_steps_[0] = channel×height×width = 12

old_steps_[1] = height×width = 6

old_steps_[2] = width = 2

old_steps_[3] = 1（无论输入为什么，均为1）

new_steps_[0] = channel×width×height= 12

new_steps_[1] = width×height = 6

new_steps_[2] = height = 3

new_steps_[3] = 1（无论输入为什么，均为1）

在此基础上，只要正确找到置换后某一位置对应的原数据中该元素的索引就可实现置换。由于caffe中blob的数据在内存中是按行优先（最后一维变化最快）连续存放的一维数组，例如上述输入数据假设为：

input[0][0][0][0]=0

input[0][0][0][1]=1

input[0][0][1][0]=2

input[0][0][1][1]=3

input[0][0][2][0]=4

input[1][1][2][1]=23

则在调用input_ = input.mutable_cpu_data()或input_ = input.mutable_cpu_diff()得到的是展平后的一维数据（下标按0000到1121依次递增的顺序排列），即：

input_ [0]=0

input_ [1]=1

input_ [23]=23

由此明白了数据的存放顺序，便能更好理解Permute()函数的置换过程。

Permute()函数实现的本质就是由置换后的数据索引找到该元素在原始数据中的索引，通过嵌套for循环实现：

// For each linear index i of the permuted data, compute old_idx, the linear
// index of the same element in the original data.
for (int i = 0; i < count; ++i) {
      int old_idx = 0;
      int idx = i;
      for (int j = 0; j < num_axes; ++j) {
        int order = permute_order[j];
        // Quotient = coordinate along permuted axis j; scale it by the step
        // of the corresponding original axis.
        old_idx += (idx / new_steps[j]) * old_steps[order]; // old_idx: index in the original data that corresponds to the current i
        // Remainder carries over to the next axis.
        idx %= new_steps[j];
      }
}

外层for循环依次取出置换后各元素在一维数组中的索引i（并赋给idx）；内层for循环计算该元素在原数据中的索引old_idx，做法是逐轴对idx取商和取余。

假设idx=1，则old_idx = (1 / 12)*12 + ((1 % 12) / 6)*6 + (((1 % 12) % 6) / 3)*1 + ((((1 % 12) % 6) % 3) / 1)*2 = 2

注：C++中的除号/对于int型来说是不保留小数的（取商），即1/12=0

故new[0][0][0][1]=new.mutable_cpu_data()[1]=input.mutable_cpu_data()[2]=input[0][0][1][0]=2，大家自行脑补一下，转换height和width后该位置是不是对应原始数据的2。

实际上，上述嵌套for循环的核心在于：把置换后的线性索引依次除以new_steps[j]（即第j维之后所有维度的元素总数，不包括第j维本身），商就是第j维上的坐标，余数则带入下一维继续计算；而原始数据的索引，则由各维坐标乘以原始数据中对应维（permute_order[j]）的步长old_steps后累加得到。

大家还可以自行试一下其余元素的索引。

（2）permute_layer.cu

#include <algorithm>
#include <cfloat>
#include <vector>

#include "caffe/layers/permute_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

// CUDA kernel counterpart of Permute(): for each index of the permuted (top)
// layout, rebuild the matching index of the original (bottom) layout and copy
// one element, bottom -> top when forward is true and top -> bottom otherwise.
template <typename Dtype>
__global__ void PermuteKernel(const int nthreads,
    Dtype* const bottom_data, const bool forward, const int* permute_order,
    const int* old_steps, const int* new_steps, const int num_axes,
    Dtype* const top_data) {
  // CUDA_KERNEL_LOOP is a Caffe macro defined outside this file; presumably
  // it makes `index` range over [0, nthreads) across the launched threads --
  // confirm against its definition.
  CUDA_KERNEL_LOOP(index, nthreads) {
    int remaining = index;  // part of the top index not yet decomposed
    int src_offset = 0;     // linear index in the bottom layout
    for (int axis = 0; axis < num_axes; ++axis) {
      const int coord = remaining / new_steps[axis];
      src_offset += coord * old_steps[permute_order[axis]];
      remaining %= new_steps[axis];
    }
    if (!forward) {
      bottom_data[src_offset] = top_data[index];
    } else {
      top_data[index] = bottom_data[src_offset];
    }
  }
}

// Forward_gpu and Backward_gpu follow the CPU versions; the only difference
// is that the element copy is done by launching PermuteKernel instead of
// calling Permute().
template <typename Dtype>
void PermuteLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  if (!need_permute_) {
    // If there is no need to permute, we share data to save memory.
    top[0]->ShareData(*bottom[0]);
    return;
  }
  Dtype* const src = bottom[0]->mutable_gpu_data();
  Dtype* const dst = top[0]->mutable_gpu_data();
  const int total = top[0]->count();
  const int* const axis_order = permute_order_.gpu_data();
  const int* const dst_steps = new_steps_.gpu_data();
  const int* const src_steps = old_steps_.gpu_data();
  // NOLINT_NEXT_LINE(whitespace/operators)
  PermuteKernel<Dtype><<<CAFFE_GET_BLOCKS(total), CAFFE_CUDA_NUM_THREADS>>>(
      total, src, true, axis_order, src_steps, dst_steps,
      num_axes_, dst);
  CUDA_POST_KERNEL_CHECK;
}


// Backward pass on the GPU: copies the top diff back into the bottom diff in
// the bottom blob's original axis order, or shares the diff when the order is
// unchanged. propagate_down is not consulted.
template <typename Dtype>
void PermuteLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!need_permute_) {
    // If there is no need to permute, we share diff to save memory.
    bottom[0]->ShareDiff(*top[0]);
    return;
  }
  Dtype* const out_grad = top[0]->mutable_gpu_diff();
  Dtype* const in_grad = bottom[0]->mutable_gpu_diff();
  // Taken from the bottom blob here (the CPU version reads top[0]->count());
  // the top shape is a reordering of the bottom shape, so both hold the same
  // number of elements.
  const int total = bottom[0]->count();
  const int* const axis_order = permute_order_.gpu_data();
  const int* const dst_steps = new_steps_.gpu_data();
  const int* const src_steps = old_steps_.gpu_data();
  // NOLINT_NEXT_LINE(whitespace/operators)
  PermuteKernel<Dtype><<<CAFFE_GET_BLOCKS(total), CAFFE_CUDA_NUM_THREADS>>>(
      total, in_grad, false, axis_order, src_steps, dst_steps,
      num_axes_, out_grad);
  CUDA_POST_KERNEL_CHECK;
}

// Caffe macro defined outside this file; presumably it explicitly
// instantiates Forward_gpu / Backward_gpu of PermuteLayer for the supported
// Dtypes -- confirm against its definition.
INSTANTIATE_LAYER_GPU_FUNCS(PermuteLayer);

}  // namespace caffe

SSD网络解析之Permute层

猜你喜欢