参考链接:https://github.com/BVLC/caffe/pull/5665/files
convolution depthwise layer是在Mobilenet v1中提出的概念,相比group convolution具有更少的参数与更快的计算速度。在caffe中可以参考conv_layer来实现。
(1)首先增加在$CAFFE_ROOT/include/caffe/layers中添加conv_dw_layer.hpp头文件
#ifndef CAFFE_CONV_DW_LAYER_HPP_
#define CAFFE_CONV_DW_LAYER_HPP_
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
namespace caffe
{
template <typename Dtype>
class ConvolutionDepthwiseLayer :public Layer<Dtype>
{
public:
explicit ConvolutionDepthwiseLayer(const LayerParameter& param):Layer<Dtype>(param){}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
virtual inline int ExactNumBottomBlobs() const { return 1; }
virtual inline int ExactNumTopBlobs() const { return 1; }
virtual inline const char* type() const { return "ConvolutionDepthwise"; }
protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
unsigned int kernel_h_;
unsigned int kernel_w_;
unsigned int stride_h_;
unsigned int stride_w_;
unsigned int pad_h_;
unsigned int pad_w_;
unsigned int dilation_h_;
unsigned int dilation_w_;
Blob<Dtype> weight_buffer_;
Blob<Dtype> weight_multiplier_;
Blob<Dtype> bias_buffer_;
Blob<Dtype> bias_multiplier_;
};
}// namespace caffe
#endif // CAFFE_CONV_DW_LAYER_HPP_
(2)在$CAFFE_ROOT/src/caffe/layers中添加conv_dw_layer.cpp
#include <algorithm>
#include <vector>
#include "caffe/filler.hpp"
#include "caffe/layers/conv_dw_layer.hpp"
namespace caffe
{
template <typename Dtype>
void ConvolutionDepthwiseLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
{
ConvolutionParameter conv_param = this->layer_param_.convolution_param();
if (conv_param.has_kernel_h() && conv_param.has_kernel_w())
{
kernel_h_ = conv_param.kernel_h();
kernel_w_ = conv_param.kernel_w();
}
else
{
if (conv_param.kernel_size_size() == 1)
{
kernel_h_ = conv_param.kernel_size(0);
kernel_w_ = conv_param.kernel_size(0);
}
else
{
kernel_h_ = conv_param.kernel_size(0);
kernel_w_ = conv_param.kernel_size(1);
}
}
if (conv_param.has_stride_h() && conv_param.has_stride_w())
{
stride_h_ = conv_param.stride_h();
stride_w_ = conv_param.stride_w();
}
else
{
if (conv_param.stride_size() == 1)
{
stride_h_ = conv_param.stride(0);
stride_w_ = conv_param.stride(0);
}
else
{
stride_h_ = conv_param.stride(0);
stride_w_ = conv_param.stride(1);
}
}
if (conv_param.has_pad_h() && conv_param.has_pad_w())
{
pad_h_ = conv_param.pad_h();
pad_w_ = conv_param.pad_w();
}
else
{
if (conv_param.pad_size() == 1)
{
pad_h_ = conv_param.pad(0);
pad_w_ = conv_param.pad(0);
}
else
{
pad_h_ = conv_param.pad(0);
pad_w_ = conv_param.pad(1);
}
}
if (conv_param.dilation_size() > 0)
{
if (conv_param.dilation_size() == 1)
{
dilation_h_ = conv_param.dilation(0);
dilation_w_ = conv_param.dilation(0);
}
else
{
dilation_h_ = conv_param.dilation(0);
dilation_w_ = conv_param.dilation(1);
}
}
else
{
dilation_h_ = 1;
dilation_w_ = 1;
}
vector<int> weight_shape(4);
weight_shape[0] = bottom[0]->channels();
weight_shape[1] = 1;
weight_shape[2] = kernel_h_;
weight_shape[3] = kernel_w_;
vector<int> bias_shape;
if (conv_param.bias_term())
{
bias_shape.push_back(bottom[0]->channels());
}
if (this->blobs_.size() == 0)
{
if (conv_param.bias_term())
{
this->blobs_.resize(2);
}
else
{
this->blobs_.resize(1);
}
this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(conv_param.weight_filler()));
weight_filler->Fill(this->blobs_[0].get());
if (conv_param.bias_term())
{
this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(conv_param.bias_filler()));
bias_filler->Fill(this->blobs_[1].get());
}
}
this->param_propagate_down_.resize(this->blobs_.size(), true);
}
template <typename Dtype>
void ConvolutionDepthwiseLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
{
vector<int> top_shape;
top_shape.push_back(bottom[0]->num());
top_shape.push_back(bottom[0]->channels());
top_shape.push_back((bottom[0]->height() + 2 * pad_h_ - (dilation_h_ * (kernel_h_ - 1) + 1)) / stride_h_ + 1);
top_shape.push_back((bottom[0]->width() + 2 * pad_w_ - (dilation_w_ * (kernel_w_ - 1) + 1)) / stride_w_ + 1);
top[0]->Reshape(top_shape);
vector<int> weight_buffer_shape;
weight_buffer_shape.push_back(bottom[0]->channels());
weight_buffer_shape.push_back(kernel_h_);
weight_buffer_shape.push_back(kernel_w_);
weight_buffer_shape.push_back(bottom[0]->num());
weight_buffer_shape.push_back(top[0]->height());
weight_buffer_shape.push_back(top[0]->width());
weight_buffer_.Reshape(weight_buffer_shape);
vector<int> weight_multiplier_shape;
weight_multiplier_shape.push_back(bottom[0]->num());
weight_multiplier_shape.push_back(top[0]->height());
weight_multiplier_shape.push_back(top[0]->width());
weight_multiplier_.Reshape(weight_multiplier_shape);
caffe_gpu_set(weight_multiplier_.count(), Dtype(1), weight_multiplier_.mutable_gpu_data());
if (this->layer_param_.convolution_param().bias_term())
{
vector<int> bias_buffer_shape;
bias_buffer_shape.push_back(bottom[0]->channels());
bias_buffer_shape.push_back(bottom[0]->num());
bias_buffer_shape.push_back(top[0]->height());
bias_buffer_shape.push_back(top[0]->width());
bias_buffer_.Reshape(bias_buffer_shape);
vector<int> bias_multiplier_shape;
bias_multiplier_shape.push_back(bottom[0]->num());
bias_multiplier_shape.push_back(top[0]->height());
bias_multiplier_shape.push_back(top[0]->width());
bias_multiplier_.Reshape(bias_multiplier_shape);
caffe_gpu_set(bias_multiplier_.count(), Dtype(1), bias_multiplier_.mutable_gpu_data());
}
}
template <typename Dtype>
void ConvolutionDepthwiseLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
{
const int num = top[0]->num();
const int channels = top[0]->channels();
const int top_height = top[0]->height();
const int top_width = top[0]->width();
const int bottom_height = bottom[0]->height();
const int bottom_width = bottom[0]->width();
const Dtype* bottom_data = bottom[0]->cpu_data();
const Dtype* weight_data_base = this->blobs_[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
for (int n = 0; n < num; ++n)
{
for (int c = 0; c < channels; ++c)
{
for (int h = 0; h < top_height; ++h)
{
for (int w = 0; w < top_width; ++w)
{
const Dtype* weight_data = weight_data_base + c * kernel_h_ * kernel_w_;
Dtype value = 0;
for (int kh = 0; kh < kernel_h_; ++kh)
{
for (int kw = 0; kw < kernel_w_; ++kw)
{
int h_in = -pad_h_ + h * stride_h_ + kh * dilation_h_;
int w_in = -pad_w_ + w * stride_w_ + kw * dilation_w_;
if ((h_in >= 0) && (h_in < bottom_height) && (w_in >= 0) && (w_in < bottom_width))
{
int offset = ((n * channels + c) * bottom_height + h_in) * bottom_width + w_in;
value += (*weight_data) * bottom_data[offset];
}
++weight_data;
}
}
*top_data++ = value;
}
}
}
}
if (this->layer_param_.convolution_param().bias_term())
{
top_data = top[0]->mutable_cpu_data();
for (int n = 0; n < num; ++n)
{
const Dtype* bias_data = this->blobs_[1]->cpu_data();
for (int c = 0; c < channels; ++c)
{
for (int h = 0; h < top_height; ++h)
{
for (int w = 0; w < top_width; ++w)
{
*top_data += *bias_data;
++top_data;
}
}
++bias_data;
}
}
}
}
template <typename Dtype>
void ConvolutionDepthwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom)
{
const int num = top[0]->num();
const int channels = top[0]->channels();
const int top_height = top[0]->height();
const int top_width = top[0]->width();
const int bottom_height = bottom[0]->height();
const int bottom_width = bottom[0]->width();
caffe_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_cpu_diff());
if (this->layer_param_.convolution_param().bias_term() && this->param_propagate_down_[1])
{
const Dtype* top_diff = top[0]->cpu_diff();
for (int n = 0; n < num; ++n)
{
Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
for (int c = 0; c < channels; ++c)
{
for (int h = 0; h < top_height; ++h)
{
for (int w = 0; w < top_width; ++w)
{
*bias_diff += *top_diff;
++top_diff;
}
}
++bias_diff;
}
}
}
if (this->param_propagate_down_[0])
{
const Dtype* top_diff = top[0]->cpu_diff();
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* weight_diff_base = this->blobs_[0]->mutable_cpu_diff();
for (int n = 0; n < num; ++n)
{
for (int c = 0; c < channels; ++c)
{
for (int h = 0; h < top_height; ++h)
{
for (int w = 0; w < top_width; ++w)
{
Dtype* weight_diff = weight_diff_base + c * kernel_h_ * kernel_w_;
for (int kh = 0; kh < kernel_h_; ++kh)
{
for (int kw = 0; kw < kernel_w_; ++kw)
{
int h_in = -pad_h_ + h * stride_h_ + kh * dilation_h_;
int w_in = -pad_w_ + w * stride_w_ + kw * dilation_w_;
if ((h_in >= 0) && (h_in < bottom_height) && (w_in >= 0) && (w_in < bottom_width))
{
int offset = ((n * channels + c) * bottom_height + h_in) * bottom_width + w_in;
*weight_diff += bottom_data[offset] * (*top_diff);
}
++weight_diff;
}
}
++top_diff;
}
}
}
}
}
if (propagate_down[0])
{
const Dtype* top_diff = top[0]->cpu_diff();
const Dtype* weight_data_base = this->blobs_[0]->cpu_data();
Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
for (int n = 0; n < num; ++n)
{
for (int c = 0; c < channels; ++c)
{
for (int h = 0; h < top_height; ++h)
{
for (int w = 0; w < top_width; ++w)
{
const Dtype* weight_data = weight_data_base + c * kernel_h_ * kernel_w_;
for (int kh = 0; kh < kernel_h_; ++kh)
{
for (int kw = 0; kw < kernel_w_; ++kw)
{
int h_in = -pad_h_ + h * stride_h_ + kh * dilation_h_;
int w_in = -pad_w_ + w * stride_w_ + kw * dilation_w_;
if ((h_in >= 0) && (h_in < bottom_height) && (w_in >= 0) && (w_in < bottom_width))
{
int offset = ((n * channels + c) * bottom_height + h_in) * bottom_width + w_in;
bottom_diff[offset] += (*weight_data) * (*top_diff);
}
++weight_data;
}
}
++top_diff;
}
}
}
}
}
}
#ifdef CPU_ONLY
STUB_GPU(ConvolutionDepthwiseLayer);
#endif
INSTANTIATE_CLASS(ConvolutionDepthwiseLayer);
REGISTER_LAYER_CLASS(ConvolutionDepthwise);
} // namespace caffe
(3)在$CAFFE_ROOT/src/caffe/layers中添加conv_dw_layer.cu
#include <vector>
#include "caffe/layers/conv_dw_layer.hpp"
#include "caffe/util/gpu_util.cuh"
namespace caffe
{
template <typename Dtype>
__global__ void ConvolutionDepthwiseWeightForward(const int nthreads,
const Dtype* const bottom_data, const Dtype* const weight_data, const int num, const int channels,
const int top_height, const int top_width, const int bottom_height, const int bottom_width,
const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h, const int dilation_w,Dtype* const top_data)
{
CUDA_KERNEL_LOOP(index, nthreads)
{
const int n = index / channels / top_height / top_width;
const int c = (index / top_height / top_width) % channels;
const int h = (index / top_width) % top_height;
const int w = index % top_width;
const Dtype* weight = weight_data + c * kernel_h * kernel_w;
Dtype value = 0;
for (int kh = 0; kh < kernel_h; ++kh)
{
for (int kw = 0; kw < kernel_w; ++kw)
{
const int h_in = -pad_h + h * stride_h + kh * dilation_h;
const int w_in = -pad_w + w * stride_w + kw * dilation_w;
if ((h_in >= 0) && (h_in < bottom_height) && (w_in >= 0) && (w_in < bottom_width))
{
const int offset = ((n * channels + c) * bottom_height + h_in) * bottom_width + w_in;
value += (*weight) * bottom_data[offset];
}
++weight;
}
}
top_data[index] = value;
}
}
template <typename Dtype>
__global__ void ConvolutionDepthwiseBiasForward(const int nthreads,
const Dtype* const bias_data, const int num, const int channels,
const int top_height, const int top_width, Dtype* const top_data)
{
CUDA_KERNEL_LOOP(index, nthreads)
{
const int c = (index / top_height / top_width) % channels;
top_data[index] += bias_data[c];
}
}
template <typename Dtype>
void ConvolutionDepthwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
{
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
const Dtype* weight_data = this->blobs_[0]->gpu_data();
const int count = top[0]->count();
const int num = top[0]->num();
const int channels = top[0]->channels();
const int top_height = top[0]->height();
const int top_width = top[0]->width();
const int bottom_height = bottom[0]->height();
const int bottom_width = bottom[0]->width();
ConvolutionDepthwiseWeightForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
count, bottom_data, weight_data, num, channels,
top_height, top_width, bottom_height, bottom_width,
kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_h_, pad_w_, dilation_h_, dilation_w_, top_data);
if (this->layer_param_.convolution_param().bias_term())
{
const Dtype* bias_data = this->blobs_[1]->gpu_data();
ConvolutionDepthwiseBiasForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
count, bias_data, num, channels,
top_height, top_width, top_data);
}
}
template <typename Dtype>
__global__ void ConvolutionDepthwiseWeightBackward(const int nthreads,
const Dtype* const top_diff, const Dtype* const bottom_data, const int num, const int channels,
const int top_height, const int top_width, const int bottom_height, const int bottom_width,
const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h, const int dilation_w,
Dtype* const buffer_data)
{
CUDA_KERNEL_LOOP(index, nthreads)
{
const int h = (index / top_width) % top_height;
const int w = index % top_width;
const int kh = (index / kernel_w / num / top_height / top_width) % kernel_h;
const int kw = (index / num / top_height / top_width) % kernel_w;
const int h_in = -pad_h + h * stride_h + kh * dilation_h;
const int w_in = -pad_w + w * stride_w + kw * dilation_w;
if ((h_in >= 0) && (h_in < bottom_height) && (w_in >= 0) && (w_in < bottom_width))
{
const int c = index / kernel_h / kernel_w / num / top_height / top_width;
const int n = (index / top_height / top_width) % num;
const int top_offset = ((n * channels + c) * top_height + h) * top_width + w;
const int bottom_offset = ((n * channels + c) * bottom_height + h_in) * bottom_width + w_in;
buffer_data[index] = top_diff[top_offset] * bottom_data[bottom_offset];
}
else
{
buffer_data[index] = 0;
}
}
}
template <typename Dtype>
__global__ void ConvolutionDepthwiseBottomBackward(const int nthreads,
const Dtype* const top_diff, const Dtype* const weight_data, const int num, const int channels,
const int top_height, const int top_width, const int bottom_height, const int bottom_width,
const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h, const int dilation_w,
Dtype* const bottom_diff)
{
CUDA_KERNEL_LOOP(index, nthreads)
{
const int n = index / channels / bottom_height / bottom_width;
const int c = (index / bottom_height / bottom_width) % channels;
const int h = (index / bottom_width) % bottom_height;
const int w = index % bottom_width;
const Dtype* weight = weight_data + c * kernel_h * kernel_w;
Dtype value = 0;
for (int kh = 0; kh < kernel_h; ++kh)
{
for (int kw = 0; kw < kernel_w; ++kw)
{
const int h_out_s = h + pad_h - kh * dilation_h;
const int w_out_s = w + pad_w - kw * dilation_w;
if (((h_out_s % stride_h) == 0) && ((w_out_s % stride_w) == 0))
{
const int h_out = h_out_s / stride_h;
const int w_out = w_out_s / stride_w;
if ((h_out >= 0) && (h_out < top_height) && (w_out >= 0) && (w_out < top_width))
{
const int offset = ((n * channels + c) * top_height + h_out) * top_width + w_out;
value += (*weight) * top_diff[offset];
}
}
++weight;
}
}
bottom_diff[index] += value;
}
}
template <typename Dtype>
__global__ void ConvolutionDepthwiseBiasBackward(const int nthreads,
const Dtype* const top_diff, const int num, const int channels,
const int top_height, const int top_width, Dtype* const buffer_data)
{
CUDA_KERNEL_LOOP(index, nthreads)
{
const int c = index / num / top_height / top_width;
const int n = (index / top_height / top_width) % num;
const int h = (index / top_width) % top_height;
const int w = index % top_width;
const int offset = ((n * channels + c) * top_height + h) * top_width + w;
buffer_data[index] = top_diff[offset];
}
}
template <typename Dtype>
void ConvolutionDepthwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom)
{
const Dtype* top_diff = top[0]->gpu_diff();
const int bottom_count = bottom[0]->count();
const int num = top[0]->num();
const int channels = top[0]->channels();
const int top_height = top[0]->height();
const int top_width = top[0]->width();
const int bottom_height = bottom[0]->height();
const int bottom_width = bottom[0]->width();
const int length = num * top_height * top_width;
caffe_gpu_set(bottom_count, Dtype(0), bottom[0]->mutable_gpu_diff());
if (this->layer_param_.convolution_param().bias_term() && this->param_propagate_down_[1])
{
const int bias_buffer_count = bias_buffer_.count();
Dtype* bias_buffer_mutable_data = bias_buffer_.mutable_gpu_data();
ConvolutionDepthwiseBiasBackward<Dtype><<<CAFFE_GET_BLOCKS(bias_buffer_count), CAFFE_CUDA_NUM_THREADS>>>(
bias_buffer_count, top_diff, num, channels,
top_height, top_width, bias_buffer_mutable_data);
const int bias_count = this->blobs_[1]->count();
const Dtype* bias_buffer_data = bias_buffer_.gpu_data();
Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
const Dtype* bias_multiplier_data = bias_multiplier_.gpu_data();
caffe_gpu_gemv(CblasNoTrans, bias_count, length, Dtype(1), bias_buffer_data, bias_multiplier_data, Dtype(1), bias_diff);
}
if (this->param_propagate_down_[0])
{
const int weight_buffer_count = weight_buffer_.count();
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* weight_buffer_mutable_data = weight_buffer_.mutable_gpu_data();
ConvolutionDepthwiseWeightBackward<Dtype><<<CAFFE_GET_BLOCKS(weight_buffer_count), CAFFE_CUDA_NUM_THREADS>>>(
weight_buffer_count, top_diff, bottom_data, num, channels,
top_height, top_width, bottom_height, bottom_width,
kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_h_, pad_w_, dilation_h_, dilation_w_, weight_buffer_mutable_data);
const int weight_count = this->blobs_[0]->count();
const Dtype* weight_buffer_data = weight_buffer_.gpu_data();
Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
const Dtype* weight_multiplier_data = weight_multiplier_.gpu_data();
caffe_gpu_gemv(CblasNoTrans, weight_count, length, Dtype(1), weight_buffer_data, weight_multiplier_data, Dtype(1), weight_diff);
}
if (propagate_down[0])
{
const Dtype* weight_data = this->blobs_[0]->gpu_data();
Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
ConvolutionDepthwiseBottomBackward<Dtype><<<CAFFE_GET_BLOCKS(bottom_count), CAFFE_CUDA_NUM_THREADS>>>(
bottom_count, top_diff, weight_data, num, channels,
top_height, top_width, bottom_height, bottom_width,
kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_h_, pad_w_, dilation_h_, dilation_w_, bottom_diff);
}
}
INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionDepthwiseLayer);
} // namespace caffe
(4)重新编译caffe就ok了