Convolution Depthwise layer的caffe实现

参考链接：https://github.com/BVLC/caffe/pull/5665/files

convolution depthwise layer是在Mobilenet v1中提出的概念，相比group convolution具有更少的参数与更快的计算速度。在caffe中可以参考conv_layer来实现。

（1）首先在$CAFFE_ROOT/include/caffe/layers中添加conv_dw_layer.hpp头文件

#ifndef CAFFE_CONV_DW_LAYER_HPP_
#define CAFFE_CONV_DW_LAYER_HPP_

#include <vector>
#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe
{
	/**
	 * @brief Depthwise convolution layer: each input channel is convolved with
	 *        its own single-channel kernel, so the top blob has the same number
	 *        of channels as the bottom blob.
	 *
	 * The kernel/stride/pad/dilation geometry is read from the layer's
	 * convolution_param in LayerSetUp(). The layer takes exactly one bottom
	 * blob and produces exactly one top blob, and is registered under the
	 * type name "ConvolutionDepthwise".
	 */
	template <typename Dtype>
	class ConvolutionDepthwiseLayer :public Layer<Dtype>
	{
		public:
			explicit ConvolutionDepthwiseLayer(const LayerParameter& param):Layer<Dtype>(param){}
			// Reads the convolution geometry and creates/fills the weight and bias blobs.
			virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
			// Shapes the top blob and the scratch buffers used by the GPU backward pass.
			virtual void Reshape(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
			virtual inline int ExactNumBottomBlobs() const { return 1; }
			virtual inline int ExactNumTopBlobs() const { return 1; }
			virtual inline const char* type() const { return "ConvolutionDepthwise"; }
		protected:
			virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
			virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
			virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
			virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
			// Convolution geometry (height / width components), set in LayerSetUp().
			unsigned int kernel_h_;
			unsigned int kernel_w_;
			unsigned int stride_h_;
			unsigned int stride_w_;
			unsigned int pad_h_;
			unsigned int pad_w_;
			unsigned int dilation_h_;
			unsigned int dilation_w_;
			// Scratch for the GPU weight gradient, shaped
			// [channels, kernel_h, kernel_w, num, top_h, top_w] in Reshape().
			Blob<Dtype> weight_buffer_;
			// Vector of ones of length num * top_h * top_w; Backward_gpu() multiplies
			// weight_buffer_ by it (gemv) to sum over the trailing three axes.
			Blob<Dtype> weight_multiplier_;
			// Scratch for the GPU bias gradient, shaped [channels, num, top_h, top_w].
			Blob<Dtype> bias_buffer_;
			// Vector of ones of length num * top_h * top_w used to reduce bias_buffer_.
			Blob<Dtype> bias_multiplier_;
	};
}// namespace caffe

#endif  // CAFFE_CONV_DW_LAYER_HPP_

（2）在$CAFFE_ROOT/src/caffe/layers中添加conv_dw_layer.cpp

#include <algorithm>
#include <vector>
#include "caffe/filler.hpp"
#include "caffe/layers/conv_dw_layer.hpp"

namespace caffe 
{
	/**
	 * @brief One-time setup: reads the kernel/stride/pad/dilation geometry from
	 *        convolution_param and creates the learnable weight (and bias) blobs.
	 *
	 * For each of kernel, stride and pad, the explicit *_h / *_w fields are used
	 * when both are present; otherwise the repeated field is consulted, where a
	 * single value applies to both dimensions and two values mean (h, w).
	 * An omitted stride defaults to 1, an omitted pad to 0 and an omitted
	 * dilation to 1. The kernel size has no default and must be given.
	 *
	 * @param bottom bottom[0] supplies the channel count used to size the blobs.
	 * @param top    unused here; shaped later in Reshape().
	 */
	template <typename Dtype>
	void ConvolutionDepthwiseLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
	{
		const ConvolutionParameter& conv_param = this->layer_param_.convolution_param();

		// Kernel size: mandatory, so fail loudly instead of indexing an empty field.
		if (conv_param.has_kernel_h() && conv_param.has_kernel_w())
		{
			kernel_h_ = conv_param.kernel_h();
			kernel_w_ = conv_param.kernel_w();
		}
		else
		{
			CHECK_GT(conv_param.kernel_size_size(), 0)
				<< "kernel_size (or kernel_h and kernel_w) must be specified.";
			kernel_h_ = conv_param.kernel_size(0);
			kernel_w_ = conv_param.kernel_size(conv_param.kernel_size_size() == 1 ? 0 : 1);
		}
		CHECK_GT(kernel_h_, 0u) << "Kernel height must be nonzero.";
		CHECK_GT(kernel_w_, 0u) << "Kernel width must be nonzero.";

		// Stride: optional, defaults to 1.
		if (conv_param.has_stride_h() && conv_param.has_stride_w())
		{
			stride_h_ = conv_param.stride_h();
			stride_w_ = conv_param.stride_w();
		}
		else if (conv_param.stride_size() == 0)
		{
			stride_h_ = 1;
			stride_w_ = 1;
		}
		else
		{
			stride_h_ = conv_param.stride(0);
			stride_w_ = conv_param.stride(conv_param.stride_size() == 1 ? 0 : 1);
		}
		// Reshape() divides by the stride, so zero must be rejected here.
		CHECK_GT(stride_h_, 0u) << "Stride height must be nonzero.";
		CHECK_GT(stride_w_, 0u) << "Stride width must be nonzero.";

		// Padding: optional, defaults to 0.
		if (conv_param.has_pad_h() && conv_param.has_pad_w())
		{
			pad_h_ = conv_param.pad_h();
			pad_w_ = conv_param.pad_w();
		}
		else if (conv_param.pad_size() == 0)
		{
			pad_h_ = 0;
			pad_w_ = 0;
		}
		else
		{
			pad_h_ = conv_param.pad(0);
			pad_w_ = conv_param.pad(conv_param.pad_size() == 1 ? 0 : 1);
		}

		// Dilation: optional, defaults to 1.
		if (conv_param.dilation_size() == 0)
		{
			dilation_h_ = 1;
			dilation_w_ = 1;
		}
		else
		{
			dilation_h_ = conv_param.dilation(0);
			dilation_w_ = conv_param.dilation(conv_param.dilation_size() == 1 ? 0 : 1);
		}
		CHECK_GT(dilation_h_, 0u) << "Dilation height must be nonzero.";
		CHECK_GT(dilation_w_, 0u) << "Dilation width must be nonzero.";

		// One kernel_h x kernel_w filter per input channel.
		const int channels = bottom[0]->channels();
		vector<int> weight_shape(4);
		weight_shape[0] = channels;
		weight_shape[1] = 1;
		weight_shape[2] = kernel_h_;
		weight_shape[3] = kernel_w_;
		const bool has_bias = conv_param.bias_term();
		vector<int> bias_shape;
		if (has_bias)
		{
			bias_shape.push_back(channels);
		}

		// Only create and fill the blobs when none exist yet, so blobs that are
		// already present are left untouched.
		if (this->blobs_.size() == 0)
		{
			this->blobs_.resize(has_bias ? 2 : 1);
			this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
			shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(conv_param.weight_filler()));
			weight_filler->Fill(this->blobs_[0].get());
			if (has_bias)
			{
				this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
				shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(conv_param.bias_filler()));
				bias_filler->Fill(this->blobs_[1].get());
			}
		}
		// By default gradients are computed for every parameter blob.
		this->param_propagate_down_.resize(this->blobs_.size(), true);
	}

	/**
	 * @brief Shapes the top blob and the scratch blobs used by Backward_gpu().
	 *
	 * The top blob keeps the bottom's num and channels; each spatial size is
	 *   (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1.
	 *
	 * The all-ones multiplier vectors are filled through their CPU pointers so
	 * that this function needs no GPU call: it therefore builds under CPU_ONLY
	 * and does not touch the device in CPU mode. The GPU backward pass reads
	 * them via gpu_data() on the same blobs.
	 *
	 * @param bottom bottom[0] is the input blob (N x C x H x W accessors are used).
	 * @param top    top[0] is reshaped to the convolution output size.
	 */
	template <typename Dtype>
	void ConvolutionDepthwiseLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
	{
		const int num = bottom[0]->num();
		const int channels = bottom[0]->channels();

		// Work in signed ints: the geometry members are unsigned, and an input
		// smaller than the kernel extent would otherwise wrap to a huge size.
		const int kernel_extent_h = static_cast<int>(dilation_h_ * (kernel_h_ - 1) + 1);
		const int kernel_extent_w = static_cast<int>(dilation_w_ * (kernel_w_ - 1) + 1);
		const int padded_height = bottom[0]->height() + 2 * static_cast<int>(pad_h_);
		const int padded_width = bottom[0]->width() + 2 * static_cast<int>(pad_w_);
		CHECK_GE(padded_height, kernel_extent_h)
			<< "Padded input height is smaller than the (dilated) kernel height.";
		CHECK_GE(padded_width, kernel_extent_w)
			<< "Padded input width is smaller than the (dilated) kernel width.";
		const int top_height = (padded_height - kernel_extent_h) / static_cast<int>(stride_h_) + 1;
		const int top_width = (padded_width - kernel_extent_w) / static_cast<int>(stride_w_) + 1;

		vector<int> top_shape;
		top_shape.push_back(num);
		top_shape.push_back(channels);
		top_shape.push_back(top_height);
		top_shape.push_back(top_width);
		top[0]->Reshape(top_shape);

		// Per-sample, per-position weight gradient terms:
		// [channels, kernel_h, kernel_w, num, top_h, top_w].
		vector<int> weight_buffer_shape;
		weight_buffer_shape.push_back(channels);
		weight_buffer_shape.push_back(kernel_h_);
		weight_buffer_shape.push_back(kernel_w_);
		weight_buffer_shape.push_back(num);
		weight_buffer_shape.push_back(top_height);
		weight_buffer_shape.push_back(top_width);
		weight_buffer_.Reshape(weight_buffer_shape);

		// Vector of ones used to sum the buffer over (num, top_h, top_w).
		vector<int> multiplier_shape;
		multiplier_shape.push_back(num);
		multiplier_shape.push_back(top_height);
		multiplier_shape.push_back(top_width);
		weight_multiplier_.Reshape(multiplier_shape);
		caffe_set(weight_multiplier_.count(), Dtype(1), weight_multiplier_.mutable_cpu_data());

		if (this->layer_param_.convolution_param().bias_term())
		{
			// Per-sample, per-position bias gradient terms: [channels, num, top_h, top_w].
			vector<int> bias_buffer_shape;
			bias_buffer_shape.push_back(channels);
			bias_buffer_shape.push_back(num);
			bias_buffer_shape.push_back(top_height);
			bias_buffer_shape.push_back(top_width);
			bias_buffer_.Reshape(bias_buffer_shape);
			bias_multiplier_.Reshape(multiplier_shape);
			caffe_set(bias_multiplier_.count(), Dtype(1), bias_multiplier_.mutable_cpu_data());
		}
	}

	/**
	 * @brief CPU forward pass of the depthwise convolution.
	 *
	 * For every sample n and channel c, the c-th kernel is slid over the c-th
	 * input plane only; taps that fall into the padding contribute nothing.
	 * When bias_term is set, the channel's bias is added to each output value.
	 *
	 * @param bottom bottom[0] is the input blob.
	 * @param top    top[0] receives the output (already shaped by Reshape()).
	 */
	template <typename Dtype>
	void ConvolutionDepthwiseLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
	{
		const int num = top[0]->num();
		const int channels = top[0]->channels();
		const int top_height = top[0]->height();
		const int top_width = top[0]->width();
		const int bottom_height = bottom[0]->height();
		const int bottom_width = bottom[0]->width();
		// Signed copies of the unsigned geometry members, so that the input
		// coordinates below can go negative without unsigned wraparound.
		const int kernel_h = static_cast<int>(kernel_h_);
		const int kernel_w = static_cast<int>(kernel_w_);
		const int stride_h = static_cast<int>(stride_h_);
		const int stride_w = static_cast<int>(stride_w_);
		const int pad_h = static_cast<int>(pad_h_);
		const int pad_w = static_cast<int>(pad_w_);
		const int dilation_h = static_cast<int>(dilation_h_);
		const int dilation_w = static_cast<int>(dilation_w_);
		const int kernel_dim = kernel_h * kernel_w;
		const int bottom_dim = bottom_height * bottom_width;
		const bool has_bias = this->layer_param_.convolution_param().bias_term();
		const Dtype* bottom_data = bottom[0]->cpu_data();
		const Dtype* weight_data_base = this->blobs_[0]->cpu_data();
		const Dtype* bias_data = has_bias ? this->blobs_[1]->cpu_data() : NULL;
		Dtype* top_data = top[0]->mutable_cpu_data();
		for (int n = 0; n < num; ++n)
		{
			for (int c = 0; c < channels; ++c)
			{
				// Kernel and input plane belonging to this (sample, channel) pair.
				const Dtype* kernel = weight_data_base + c * kernel_dim;
				const Dtype* bottom_plane = bottom_data + (n * channels + c) * bottom_dim;
				for (int h = 0; h < top_height; ++h)
				{
					for (int w = 0; w < top_width; ++w)
					{
						Dtype value = 0;
						for (int kh = 0; kh < kernel_h; ++kh)
						{
							const int h_in = h * stride_h - pad_h + kh * dilation_h;
							if (h_in < 0 || h_in >= bottom_height)
							{
								continue;  // whole kernel row lies in the padding
							}
							for (int kw = 0; kw < kernel_w; ++kw)
							{
								const int w_in = w * stride_w - pad_w + kw * dilation_w;
								if (w_in < 0 || w_in >= bottom_width)
								{
									continue;
								}
								value += kernel[kh * kernel_w + kw] * bottom_plane[h_in * bottom_width + w_in];
							}
						}
						if (has_bias)
						{
							value += bias_data[c];
						}
						// top_data is walked linearly in (n, c, h, w) order.
						*top_data++ = value;
					}
				}
			}
		}
	}

	/**
	 * @brief CPU backward pass of the depthwise convolution.
	 *
	 * The bottom diff is first cleared. Then, as enabled:
	 *  - bias gradient   (bias_term && param_propagate_down_[1]): top diff
	 *    summed over n, h, w per channel, accumulated into the bias diff;
	 *  - weight gradient (param_propagate_down_[0]): input value times top diff,
	 *    accumulated into the weight diff;
	 *  - input gradient  (propagate_down[0]): weight times top diff, scattered
	 *    into the bottom diff.
	 * Parameter diffs are accumulated with +=, not overwritten.
	 *
	 * @param top            top[0] carries the incoming gradient in its diff.
	 * @param propagate_down propagate_down[0] enables the input gradient.
	 * @param bottom         bottom[0] supplies the input data and receives the diff.
	 */
	template <typename Dtype>
	void ConvolutionDepthwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom)
	{
		const int num = top[0]->num();
		const int channels = top[0]->channels();
		const int top_height = top[0]->height();
		const int top_width = top[0]->width();
		const int bottom_height = bottom[0]->height();
		const int bottom_width = bottom[0]->width();
		// Signed copies of the unsigned geometry members, so that the input
		// coordinates below can go negative without unsigned wraparound.
		const int kernel_h = static_cast<int>(kernel_h_);
		const int kernel_w = static_cast<int>(kernel_w_);
		const int stride_h = static_cast<int>(stride_h_);
		const int stride_w = static_cast<int>(stride_w_);
		const int pad_h = static_cast<int>(pad_h_);
		const int pad_w = static_cast<int>(pad_w_);
		const int dilation_h = static_cast<int>(dilation_h_);
		const int dilation_w = static_cast<int>(dilation_w_);
		const int kernel_dim = kernel_h * kernel_w;
		const int top_dim = top_height * top_width;
		const int bottom_dim = bottom_height * bottom_width;

		caffe_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_cpu_diff());

		// Bias gradient.
		if (this->layer_param_.convolution_param().bias_term() && this->param_propagate_down_[1])
		{
			const Dtype* top_diff = top[0]->cpu_diff();
			Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
			for (int n = 0; n < num; ++n)
			{
				for (int c = 0; c < channels; ++c)
				{
					for (int i = 0; i < top_dim; ++i)
					{
						bias_diff[c] += *top_diff++;
					}
				}
			}
		}

		// Weight gradient.
		if (this->param_propagate_down_[0])
		{
			const Dtype* top_diff = top[0]->cpu_diff();
			const Dtype* bottom_data = bottom[0]->cpu_data();
			Dtype* weight_diff_base = this->blobs_[0]->mutable_cpu_diff();
			for (int n = 0; n < num; ++n)
			{
				for (int c = 0; c < channels; ++c)
				{
					Dtype* kernel_diff = weight_diff_base + c * kernel_dim;
					const Dtype* bottom_plane = bottom_data + (n * channels + c) * bottom_dim;
					for (int h = 0; h < top_height; ++h)
					{
						for (int w = 0; w < top_width; ++w)
						{
							// top_diff is walked linearly in (n, c, h, w) order.
							const Dtype out_grad = *top_diff++;
							for (int kh = 0; kh < kernel_h; ++kh)
							{
								const int h_in = h * stride_h - pad_h + kh * dilation_h;
								if (h_in < 0 || h_in >= bottom_height)
								{
									continue;
								}
								for (int kw = 0; kw < kernel_w; ++kw)
								{
									const int w_in = w * stride_w - pad_w + kw * dilation_w;
									if (w_in < 0 || w_in >= bottom_width)
									{
										continue;
									}
									kernel_diff[kh * kernel_w + kw] += bottom_plane[h_in * bottom_width + w_in] * out_grad;
								}
							}
						}
					}
				}
			}
		}

		// Input gradient.
		if (propagate_down[0])
		{
			const Dtype* top_diff = top[0]->cpu_diff();
			const Dtype* weight_data_base = this->blobs_[0]->cpu_data();
			Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
			for (int n = 0; n < num; ++n)
			{
				for (int c = 0; c < channels; ++c)
				{
					const Dtype* kernel = weight_data_base + c * kernel_dim;
					Dtype* bottom_diff_plane = bottom_diff + (n * channels + c) * bottom_dim;
					for (int h = 0; h < top_height; ++h)
					{
						for (int w = 0; w < top_width; ++w)
						{
							const Dtype out_grad = *top_diff++;
							for (int kh = 0; kh < kernel_h; ++kh)
							{
								const int h_in = h * stride_h - pad_h + kh * dilation_h;
								if (h_in < 0 || h_in >= bottom_height)
								{
									continue;
								}
								for (int kw = 0; kw < kernel_w; ++kw)
								{
									const int w_in = w * stride_w - pad_w + kw * dilation_w;
									if (w_in < 0 || w_in >= bottom_width)
									{
										continue;
									}
									bottom_diff_plane[h_in * bottom_width + w_in] += kernel[kh * kernel_w + kw] * out_grad;
								}
							}
						}
					}
				}
			}
		}
	}
	
	// In CPU-only builds the .cu file with Forward_gpu/Backward_gpu is not part
	// of the build; STUB_GPU presumably supplies placeholder definitions for
	// them (see Caffe's device_alternate.hpp -- confirm against your Caffe version).
	#ifdef CPU_ONLY
	STUB_GPU(ConvolutionDepthwiseLayer);
	#endif
	
	// Caffe macros: instantiate the class template for the supported Dtypes and
	// register the layer under the type name "ConvolutionDepthwise"
	// (the same string that type() returns).
	INSTANTIATE_CLASS(ConvolutionDepthwiseLayer);
	REGISTER_LAYER_CLASS(ConvolutionDepthwise);
}  // namespace caffe

（3）在$CAFFE_ROOT/src/caffe/layers中添加conv_dw_layer.cu

#include <vector>
#include "caffe/layers/conv_dw_layer.hpp"
#include "caffe/util/gpu_util.cuh"

namespace caffe 
{

	/**
	 * @brief CUDA kernel: depthwise convolution output without the bias term.
	 *
	 * Each index in [0, nthreads) is one element of the top blob, interpreted
	 * in (n, c, h, w) order. The value is the sum over the kernel window of
	 * weight[c][kh][kw] * bottom[n][c][h_in][w_in]; taps whose input
	 * coordinate falls outside the bottom plane are skipped.
	 */
	template <typename Dtype>
	__global__ void ConvolutionDepthwiseWeightForward(const int nthreads,
		const Dtype* const bottom_data, const Dtype* const weight_data, const int num, const int channels,
		const int top_height, const int top_width, const int bottom_height, const int bottom_width,
		const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,
		const int pad_h, const int pad_w, const int dilation_h, const int dilation_w,Dtype* const top_data)
	{
		CUDA_KERNEL_LOOP(index, nthreads)
		{
			// Peel the output coordinates off the linear index, innermost first.
			int rest = index;
			const int out_w = rest % top_width;
			rest /= top_width;
			const int out_h = rest % top_height;
			rest /= top_height;
			// rest is now n * channels + c, i.e. the index of the (sample, channel) plane.
			const int plane = rest;
			const int ch = plane % channels;

			const Dtype* const kernel = weight_data + ch * kernel_h * kernel_w;
			const int in_h0 = out_h * stride_h - pad_h;
			const int in_w0 = out_w * stride_w - pad_w;
			Dtype acc = 0;
			for (int kh = 0; kh < kernel_h; ++kh)
			{
				const int in_h = in_h0 + kh * dilation_h;
				if (in_h < 0 || in_h >= bottom_height)
				{
					continue;
				}
				for (int kw = 0; kw < kernel_w; ++kw)
				{
					const int in_w = in_w0 + kw * dilation_w;
					if (in_w < 0 || in_w >= bottom_width)
					{
						continue;
					}
					const int bottom_idx = (plane * bottom_height + in_h) * bottom_width + in_w;
					acc += kernel[kh * kernel_w + kw] * bottom_data[bottom_idx];
				}
			}
			top_data[index] = acc;
		}
	}

	/**
	 * @brief CUDA kernel: adds the per-channel bias to every top element.
	 *
	 * Each index in [0, nthreads) is one element of the top blob in
	 * (n, c, h, w) order; its channel selects the bias entry that is added.
	 */
	template <typename Dtype>
	__global__ void ConvolutionDepthwiseBiasForward(const int nthreads,
		const Dtype* const bias_data, const int num, const int channels,
		const int top_height, const int top_width, Dtype* const top_data)
	{
		CUDA_KERNEL_LOOP(index, nthreads)
		{
			// Dropping the two spatial axes leaves n * channels + c.
			const int plane = index / top_width / top_height;
			const int ch = plane % channels;
			top_data[index] += bias_data[ch];
		}
	}

	/**
	 * @brief GPU forward pass: launches the convolution kernel over every top
	 *        element and, when bias_term is set, the bias-add kernel.
	 *
	 * Each launch is followed by CUDA_POST_KERNEL_CHECK so that a failed
	 * launch is reported at this layer instead of going unnoticed.
	 *
	 * @param bottom bottom[0] is the input blob.
	 * @param top    top[0] receives the output (already shaped by Reshape()).
	 */
	template <typename Dtype>
	void ConvolutionDepthwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
	{
		const Dtype* bottom_data = bottom[0]->gpu_data();
		Dtype* top_data = top[0]->mutable_gpu_data();
		const Dtype* weight_data = this->blobs_[0]->gpu_data();
		// One kernel-loop iteration per output element.
		const int count = top[0]->count();
		const int num = top[0]->num();
		const int channels = top[0]->channels();
		const int top_height = top[0]->height();
		const int top_width = top[0]->width();
		const int bottom_height = bottom[0]->height();
		const int bottom_width = bottom[0]->width();
		ConvolutionDepthwiseWeightForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
			count, bottom_data, weight_data, num, channels,
			top_height, top_width, bottom_height, bottom_width,
			kernel_h_, kernel_w_, stride_h_, stride_w_,
			pad_h_, pad_w_, dilation_h_, dilation_w_, top_data);
		CUDA_POST_KERNEL_CHECK;
		if (this->layer_param_.convolution_param().bias_term())
		{
			const Dtype* bias_data = this->blobs_[1]->gpu_data();
			ConvolutionDepthwiseBiasForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
				count, bias_data, num, channels,
				top_height, top_width, top_data);
			CUDA_POST_KERNEL_CHECK;
		}
	}

	/**
	 * @brief CUDA kernel: writes the un-reduced weight gradient terms.
	 *
	 * Each index in [0, nthreads) is one element of buffer_data, interpreted
	 * in (c, kh, kw, n, h, w) order. The element is
	 * top_diff[n][c][h][w] * bottom[n][c][h_in][w_in], or 0 when the input
	 * coordinate for that kernel tap lies outside the bottom plane. The caller
	 * then sums the buffer over (n, h, w).
	 */
	template <typename Dtype>
	__global__ void ConvolutionDepthwiseWeightBackward(const int nthreads,
		const Dtype* const top_diff, const Dtype* const bottom_data, const int num, const int channels,
		const int top_height, const int top_width, const int bottom_height, const int bottom_width,
		const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,
		const int pad_h, const int pad_w, const int dilation_h, const int dilation_w,
		Dtype* const buffer_data)
	{
		CUDA_KERNEL_LOOP(index, nthreads)
		{
			// Peel the six coordinates off the linear index, innermost first.
			int rest = index;
			const int out_w = rest % top_width;
			rest /= top_width;
			const int out_h = rest % top_height;
			rest /= top_height;
			const int sample = rest % num;
			rest /= num;
			const int kw = rest % kernel_w;
			rest /= kernel_w;
			const int kh = rest % kernel_h;
			const int ch = rest / kernel_h;

			const int in_h = out_h * stride_h - pad_h + kh * dilation_h;
			const int in_w = out_w * stride_w - pad_w + kw * dilation_w;
			const bool inside = (in_h >= 0) && (in_h < bottom_height) && (in_w >= 0) && (in_w < bottom_width);

			Dtype term = 0;
			if (inside)
			{
				const int plane = sample * channels + ch;
				const int top_idx = (plane * top_height + out_h) * top_width + out_w;
				const int bottom_idx = (plane * bottom_height + in_h) * bottom_width + in_w;
				term = top_diff[top_idx] * bottom_data[bottom_idx];
			}
			buffer_data[index] = term;
		}
	}

	/**
	 * @brief CUDA kernel: gradient with respect to the input.
	 *
	 * Each index in [0, nthreads) is one element of the bottom blob in
	 * (n, c, h, w) order. For every kernel tap, the output position that read
	 * this input element through that tap is recovered; it only exists when
	 * the offset is an exact multiple of the stride and lies inside the top
	 * plane. The matching weight * top_diff products are summed and added to
	 * bottom_diff[index].
	 */
	template <typename Dtype>
	__global__ void ConvolutionDepthwiseBottomBackward(const int nthreads,
		const Dtype* const top_diff, const Dtype* const weight_data, const int num, const int channels,
		const int top_height, const int top_width, const int bottom_height, const int bottom_width,
		const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,
		const int pad_h, const int pad_w, const int dilation_h, const int dilation_w,
		Dtype* const bottom_diff)
	{
		CUDA_KERNEL_LOOP(index, nthreads)
		{
			// Peel the input coordinates off the linear index, innermost first.
			int rest = index;
			const int in_w = rest % bottom_width;
			rest /= bottom_width;
			const int in_h = rest % bottom_height;
			rest /= bottom_height;
			// rest is now n * channels + c, i.e. the index of the (sample, channel) plane.
			const int plane = rest;
			const int ch = plane % channels;

			const Dtype* const kernel = weight_data + ch * kernel_h * kernel_w;
			Dtype acc = 0;
			for (int kh = 0; kh < kernel_h; ++kh)
			{
				const int scaled_h = in_h + pad_h - kh * dilation_h;
				for (int kw = 0; kw < kernel_w; ++kw)
				{
					const int scaled_w = in_w + pad_w - kw * dilation_w;
					// No output position maps here unless both offsets are stride-aligned.
					if ((scaled_h % stride_h) != 0 || (scaled_w % stride_w) != 0)
					{
						continue;
					}
					const int out_h = scaled_h / stride_h;
					const int out_w = scaled_w / stride_w;
					if (out_h < 0 || out_h >= top_height || out_w < 0 || out_w >= top_width)
					{
						continue;
					}
					const int top_idx = (plane * top_height + out_h) * top_width + out_w;
					acc += kernel[kh * kernel_w + kw] * top_diff[top_idx];
				}
			}
			bottom_diff[index] += acc;
		}
	}

	/**
	 * @brief CUDA kernel: copies top_diff into buffer_data with the channel
	 *        axis moved to the front.
	 *
	 * Each index in [0, nthreads) is one element of buffer_data in
	 * (c, n, h, w) order; it receives top_diff[n][c][h][w]. The caller then
	 * sums each channel's row to obtain the bias gradient.
	 */
	template <typename Dtype>
	__global__ void ConvolutionDepthwiseBiasBackward(const int nthreads,
		const Dtype* const top_diff, const int num, const int channels,
		const int top_height, const int top_width, Dtype* const buffer_data)
	{
		CUDA_KERNEL_LOOP(index, nthreads)
		{
			// Peel the coordinates off the linear index, innermost first.
			int rest = index;
			const int out_w = rest % top_width;
			rest /= top_width;
			const int out_h = rest % top_height;
			rest /= top_height;
			const int sample = rest % num;
			const int ch = rest / num;
			const int top_idx = ((sample * channels + ch) * top_height + out_h) * top_width + out_w;
			buffer_data[index] = top_diff[top_idx];
		}
	}

	/**
	 * @brief GPU backward pass of the depthwise convolution.
	 *
	 * The bottom diff is first cleared. Then, as enabled:
	 *  - bias gradient   (bias_term && param_propagate_down_[1]): top_diff is
	 *    rearranged into bias_buffer_ ([channels, num*top_h*top_w]) and reduced
	 *    with a gemv against the all-ones bias_multiplier_;
	 *  - weight gradient (param_propagate_down_[0]): per-position products are
	 *    written to weight_buffer_ and reduced the same way;
	 *  - input gradient  (propagate_down[0]): computed directly per bottom element.
	 * Both gemv calls use beta = 1, so parameter diffs are accumulated.
	 *
	 * Each kernel launch is followed by CUDA_POST_KERNEL_CHECK so that a failed
	 * launch is reported at this layer instead of going unnoticed.
	 *
	 * @param top            top[0] carries the incoming gradient in its diff.
	 * @param propagate_down propagate_down[0] enables the input gradient.
	 * @param bottom         bottom[0] supplies the input data and receives the diff.
	 */
	template <typename Dtype>
	void ConvolutionDepthwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
		  const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom)
	{
		const Dtype* top_diff = top[0]->gpu_diff();
		const int bottom_count = bottom[0]->count();
		const int num = top[0]->num();
		const int channels = top[0]->channels();
		const int top_height = top[0]->height();
		const int top_width = top[0]->width();
		const int bottom_height = bottom[0]->height();
		const int bottom_width = bottom[0]->width();
		// Number of terms summed per parameter element (columns of the gemv matrix).
		const int length = num * top_height * top_width;
		caffe_gpu_set(bottom_count, Dtype(0), bottom[0]->mutable_gpu_diff());
		if (this->layer_param_.convolution_param().bias_term() && this->param_propagate_down_[1])
		{
			const int bias_buffer_count = bias_buffer_.count();
			Dtype* bias_buffer_mutable_data = bias_buffer_.mutable_gpu_data();
			ConvolutionDepthwiseBiasBackward<Dtype><<<CAFFE_GET_BLOCKS(bias_buffer_count), CAFFE_CUDA_NUM_THREADS>>>(
				bias_buffer_count, top_diff, num, channels,
				top_height, top_width, bias_buffer_mutable_data);
			CUDA_POST_KERNEL_CHECK;
			const int bias_count = this->blobs_[1]->count();
			const Dtype* bias_buffer_data = bias_buffer_.gpu_data();
			Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
			const Dtype* bias_multiplier_data = bias_multiplier_.gpu_data();
			// bias_diff += bias_buffer (bias_count x length) * ones (length)
			caffe_gpu_gemv(CblasNoTrans, bias_count, length, Dtype(1), bias_buffer_data, bias_multiplier_data, Dtype(1), bias_diff);
		}
		if (this->param_propagate_down_[0])
		{
			const int weight_buffer_count = weight_buffer_.count();
			const Dtype* bottom_data = bottom[0]->gpu_data();
			Dtype* weight_buffer_mutable_data = weight_buffer_.mutable_gpu_data();
			ConvolutionDepthwiseWeightBackward<Dtype><<<CAFFE_GET_BLOCKS(weight_buffer_count), CAFFE_CUDA_NUM_THREADS>>>(
				weight_buffer_count, top_diff, bottom_data, num, channels,
				top_height, top_width, bottom_height, bottom_width,
				kernel_h_, kernel_w_, stride_h_, stride_w_,
				pad_h_, pad_w_, dilation_h_, dilation_w_, weight_buffer_mutable_data);
			CUDA_POST_KERNEL_CHECK;
			const int weight_count = this->blobs_[0]->count();
			const Dtype* weight_buffer_data = weight_buffer_.gpu_data();
			Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
			const Dtype* weight_multiplier_data = weight_multiplier_.gpu_data();
			// weight_diff += weight_buffer (weight_count x length) * ones (length)
			caffe_gpu_gemv(CblasNoTrans, weight_count, length, Dtype(1), weight_buffer_data, weight_multiplier_data, Dtype(1), weight_diff);
		}
		if (propagate_down[0])
		{
			const Dtype* weight_data = this->blobs_[0]->gpu_data();
			Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
			ConvolutionDepthwiseBottomBackward<Dtype><<<CAFFE_GET_BLOCKS(bottom_count), CAFFE_CUDA_NUM_THREADS>>>(
				bottom_count, top_diff, weight_data, num, channels,
				top_height, top_width, bottom_height, bottom_width,
				kernel_h_, kernel_w_, stride_h_, stride_w_,
				pad_h_, pad_w_, dilation_h_, dilation_w_, bottom_diff);
			CUDA_POST_KERNEL_CHECK;
		}
	}

// Caffe macro; presumably emits the explicit instantiations of Forward_gpu and
// Backward_gpu defined in this .cu file for the supported Dtypes (confirm in caffe/common.hpp).
INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionDepthwiseLayer);

}  // namespace caffe

（4）重新编译caffe即可，之后在prototxt中将层的type设为"ConvolutionDepthwise"就可以使用该层了

Convolution Depthwise layer的caffe实现

猜你喜欢