Implementing MobileNet DepthwiseConvolution, ShuffleNet channel shuffle, and CenterLoss in Caffe


This post explains how to implement a few custom operations in Caffe: the depthwise separable convolution used by MobileNet, the channel shuffle operation from ShuffleNet, and the CenterLoss loss function.


System: Linux (Ubuntu)


Implementing MobileNet DepthwiseConvolution in Caffe

I used shicai's code from GitHub, which can be downloaded here: DepthwiseConvolution implementation source on GitHub

Implementing the depthwise convolution operation (DepthwiseConvolution) does not require modifying /src/caffe/proto/caffe.proto in the Caffe source tree.
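For intuition, a depthwise convolution is just a group convolution in which group equals the number of input channels: every channel is filtered by its own single kernel and there is no summation across channels. The following minimal numpy sketch shows the computation; the shapes and random values are made up for illustration, and this is not the Caffe implementation below.

import numpy as np

def depthwise_conv2d(x, w, stride=1, pad=1):
    """Depthwise convolution: one kernel per input channel, no summation
    across channels (equivalent to group == num_channels in Caffe).
    x: (C, H, W) input feature map; w: (C, kh, kw) per-channel kernels."""
    C, H, W = x.shape
    _, kh, kw = w.shape
    x_pad = np.pad(x, ((0, 0), (pad, pad), (pad, pad)), mode='constant')
    out_h = (H + 2 * pad - kh) // stride + 1
    out_w = (W + 2 * pad - kw) // stride + 1
    out = np.zeros((C, out_h, out_w), dtype=x.dtype)
    for c in range(C):
        for i in range(out_h):
            for j in range(out_w):
                patch = x_pad[c, i * stride:i * stride + kh, j * stride:j * stride + kw]
                out[c, i, j] = np.sum(patch * w[c])
    return out

# e.g. a 32-channel 3x3 depthwise convolution like the conv2_1/dw layer shown later
x = np.random.randn(32, 56, 56).astype(np.float32)
w = np.random.randn(32, 3, 3).astype(np.float32)
print(depthwise_conv2d(x, w).shape)  # (32, 56, 56)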

After downloading the code, the caffe directory contains two folders: include and src.

Each folder contains the source files we need:

  • include: depthwise_conv_layer.hpp
  • src: depthwise_conv_layer.cpp and depthwise_conv_layer.cu

File name                  Purpose
depthwise_conv_layer.hpp   Header file
depthwise_conv_layer.cpp   CPU implementation of DepthwiseConvolution
depthwise_conv_layer.cu    GPU implementation of DepthwiseConvolution

Implementation steps

  • What we need to do:

    Put depthwise_conv_layer.hpp from include into the /caffeMS/include/caffe/layers/ directory (caffeMS is the root of my Caffe source tree; substitute your own path)

    Put depthwise_conv_layer.cpp and depthwise_conv_layer.cu from src into the /caffeMS/src/caffe/layers/ directory.


    Then rebuild Caffe:

    make all -j8
    make test -j8
    make runtest -j8
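If pycaffe was rebuilt as well, a quick way to confirm the new layer got registered is to query the layer registry from Python; this assumes layer_type_list() is available in your Caffe's Python bindings (it is in mainline BVLC Caffe), and the same check applies to the ShuffleChannel and CenterLoss layers later in this post.

import caffe

# the layer registry should now contain the newly added type
print('DepthwiseConvolution' in caffe.layer_type_list())  # expect True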

Usage:
For the dw (depthwise) layers, i.e. the convolution layers whose group parameter is greater than 1, change the layer type from "Convolution" to "DepthwiseConvolution":

layer {
  name: "conv2_1/dw"
  type: "DepthwiseConvolution"
  bottom: "conv1"
  top: "conv2_1/dw"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  convolution_param {
    num_output: 32
    bias_term: false
    pad: 1
    kernel_size: 3
    group: 32
    stride: 1
    weight_filler {
      type: "msra"
    }
    engine: CAFFE
  }
}
An existing prototxt can be converted in one go with a small conversion script (transferTypeToDepthwiseConvolution.py below), which rewrites the type of every Convolution layer whose group is not 1:

python2 transferTypeToDepthwiseConvolution.py mobilenet_train.prototxt mobilenet_train_dw.prototxt

The script itself:
import caffe.proto.caffe_pb2 as caffe_pb2
from google.protobuf.text_format import Merge
import argparse
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('source_prototxt')
    parser.add_argument('target_prototxt')

    args = parser.parse_args()
    net = caffe_pb2.NetParameter()
    Merge(open(args.source_prototxt, 'r').read(), net)
    for layer in net.layer:
        if layer.type == "Convolution":
            if layer.convolution_param.group !=1:
                layer.type = "DepthwiseConvolution"
    with open(args.target_prototxt, 'w') as tf:
        tf.write(str(net))

Source code:

depthwise_conv_layer.hpp

/*
 * depthwise_conv_layer.hpp
 *
 *  Created on: May 23, 2017
 *      Author: liuhao
 */

#ifndef CAFFE_DEPTHWISE_CONV_LAYER_HPP_
#define CAFFE_DEPTHWISE_CONV_LAYER_HPP_



#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/base_conv_layer.hpp"

namespace caffe {

/**
 * @brief Convolves the input image with a bank of learned filters,
 *        and (optionally) adds biases.
 *
 *   Caffe convolves by reduction to matrix multiplication. This achieves
 *   high-throughput and generality of input and filter dimensions but comes at
 *   the cost of memory for matrices. This makes use of efficiency in BLAS.
 *
 *   The input is "im2col" transformed to a channel K' x H x W data matrix
 *   for multiplication with the N x K' x H x W filter matrix to yield a
 *   N' x H x W output matrix that is then "col2im" restored. K' is the
 *   input channel * kernel height * kernel width dimension of the unrolled
 *   inputs so that the im2col matrix has a column for each input region to
 *   be filtered. col2im restores the output spatial structure by rolling up
 *   the output channel N' columns of the output matrix.
 */
template <typename Dtype>
class DepthwiseConvolutionLayer : public BaseConvolutionLayer<Dtype> {
 public:
  /**
   * @param param provides ConvolutionParameter convolution_param,
   *    with ConvolutionLayer options:
   *  - num_output. The number of filters.
   *  - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
   *  kernel_size for square filters or kernel_h and kernel_w for rectangular
   *  filters.
   *  - stride / stride_h / stride_w (\b optional, default 1). The filter
   *  stride, given by stride_size for equal dimensions or stride_h and stride_w
   *  for different strides. By default the convolution is dense with stride 1.
   *  - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
   *  convolution, given by pad for equal dimensions or pad_h and pad_w for
   *  different padding. Input padding is computed implicitly instead of
   *  actually padding.
   *  - dilation (\b optional, default 1). The filter
   *  dilation, given by dilation_size for equal dimensions for different
   *  dilation. By default the convolution has dilation 1.
   *  - group (\b optional, default 1). The number of filter groups. Group
   *  convolution is a method for reducing parameterization by selectively
   *  connecting input and output channels. The input and output channel dimensions must be divisible
   *  by the number of groups. For group @f$ \geq 1 @f$, the
   *  convolutional filters' input and output channels are separated s.t. each
   *  group takes 1 / group of the input channels and makes 1 / group of the
   *  output channels. Concretely 4 input channels, 8 output channels, and
   *  2 groups separate input channels 1-2 and output channels 1-4 into the
   *  first group and input channels 3-4 and output channels 5-8 into the second
   *  group.
   *  - bias_term (\b optional, default true). Whether to have a bias.
   *  - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library
   *    kernels + stream parallelism) engines.
   */
  explicit DepthwiseConvolutionLayer(const LayerParameter& param)
      : BaseConvolutionLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "DepthwiseConvolution"; }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual inline bool reverse_dimensions() { return false; }
  virtual void compute_output_shape();
};

}  // namespace caffe



#endif /* INCLUDE_CAFFE_LAYERS_DEPTHWISE_CONV_LAYER_HPP_ */

depthwise_conv_layer.cpp

#include <vector>
#include "caffe/layers/depthwise_conv_layer.hpp"

namespace caffe {

template <typename Dtype>
void DepthwiseConvolutionLayer<Dtype>::compute_output_shape() {
  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
  const int* stride_data = this->stride_.cpu_data();
  const int* pad_data = this->pad_.cpu_data();
  const int* dilation_data = this->dilation_.cpu_data();
  this->output_shape_.clear();
  for (int i = 0; i < this->num_spatial_axes_; ++i) {
    // i + 1 to skip channel axis
    const int input_dim = this->input_shape(i + 1);
    const int kernel_extent = dilation_data[i] * (kernel_shape_data[i] - 1) + 1;
    const int output_dim = (input_dim + 2 * pad_data[i] - kernel_extent)
        / stride_data[i] + 1;
    this->output_shape_.push_back(output_dim);
  }
}

template <typename Dtype>
void DepthwiseConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
	const Dtype* weight = this->blobs_[0]->cpu_data();
  for (int i = 0; i < bottom.size(); ++i) {
    const Dtype* bottom_data = bottom[i]->cpu_data();
    Dtype* top_data = top[i]->mutable_cpu_data();
    for (int n = 0; n < this->num_; ++n) {
      this->forward_cpu_gemm(bottom_data + n * this->bottom_dim_, weight,
          top_data + n * this->top_dim_);
      if (this->bias_term_) {
        const Dtype* bias = this->blobs_[1]->cpu_data();
        this->forward_cpu_bias(top_data + n * this->top_dim_, bias);
      }
    }
  }
}

template <typename Dtype>
void DepthwiseConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* weight = this->blobs_[0]->cpu_data();
  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
  for (int i = 0; i < top.size(); ++i) {
    const Dtype* top_diff = top[i]->cpu_diff();
    const Dtype* bottom_data = bottom[i]->cpu_data();
    Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
    // Bias gradient, if necessary.
    if (this->bias_term_ && this->param_propagate_down_[1]) {
      Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
      for (int n = 0; n < this->num_; ++n) {
        this->backward_cpu_bias(bias_diff, top_diff + n * this->top_dim_);
      }
    }
    if (this->param_propagate_down_[0] || propagate_down[i]) {
      for (int n = 0; n < this->num_; ++n) {
        // gradient w.r.t. weight. Note that we will accumulate diffs.
        if (this->param_propagate_down_[0]) {
          this->weight_cpu_gemm(bottom_data + n * this->bottom_dim_,
              top_diff + n * this->top_dim_, weight_diff);
        }
        // gradient w.r.t. bottom data, if necessary.
        if (propagate_down[i]) {
          this->backward_cpu_gemm(top_diff + n * this->top_dim_, weight,
              bottom_diff + n * this->bottom_dim_);
        }
      }
    }
  }
}

#ifdef CPU_ONLY
STUB_GPU(DepthwiseConvolutionLayer);
#endif

INSTANTIATE_CLASS(DepthwiseConvolutionLayer);
REGISTER_LAYER_CLASS(DepthwiseConvolution);
}  // namespace caffe

depthwise_conv_layer.cu

#include <vector>
#include <algorithm>
#include <cfloat>
#include "caffe/layers/depthwise_conv_layer.hpp"
#include "caffe/util/math_functions.hpp"


/*
 * The depthwise layer for mobilenet.   only for stride 1
 */

namespace caffe {

template <typename Dtype>
__global__ void ConvForward(const int nthreads,
		const Dtype* const bottom_data, const int num, const int channels,
		const int height, const int width,const int conved_height,
		const int conved_width,const int kernel_h, const int kernel_w,
		const int stride_h, const int stride_w, const int pad_h, const int pad_w,
		Dtype* const top_data,const Dtype* const weight,const Dtype* const bias,const bool bias_term_) {
	CUDA_KERNEL_LOOP(index, nthreads) {

		const int pw = index % conved_width;
		const int ph = (index / conved_width) % conved_height;
		const int c = (index / conved_width / conved_height) % channels;
		const int n = index / conved_width / conved_height / channels;
		int hstart = ph * stride_h - pad_h;
		int wstart = pw * stride_w - pad_w;
		int hend = min(hstart + kernel_h, height + pad_h);
		int wend = min(wstart + kernel_w, width + pad_w);
//		const int pool_size = (hend - hstart) * (wend - wstart);
		hstart = max(hstart, 0);
		wstart = max(wstart, 0);
		hend = min(hend, height);
		wend = min(wend, width);
		Dtype aveval = 0;
		const Dtype* const bottom_slice =
		bottom_data + (n * channels + c) * height * width;
		const Dtype* const weight_slice =
		weight + c * kernel_h * kernel_w;
//		if (index==1) {
//			printf("pw%d ph%d c%d n%d \n",pw,ph,c,n);
//			printf("hstart%d wstart%d hend%d wend%d \n",hstart,wstart,hend,wend);
//		}

		int khstart=hend<kernel_h?kernel_h-hend:0;
		int kwstart=wend<kernel_w?kernel_w-wend:0;
		for (int h = hstart; h < hend; ++h) {
			for (int w = wstart; w < wend; ++w) {

				aveval += bottom_slice[h * width + w]*weight_slice[(khstart+h-hstart) * kernel_w + (kwstart+w-wstart)];
//				if (index==1) {
//					printf("pos:h%d w%d\n",h,w);
//					printf("cal:bottom%f weight%f\n",bottom_slice[h * width + w],weight_slice[(h-hstart) * kernel_w + (w-wstart)]);
//				}
			}
		}
		if(bias_term_) {
			aveval+=bias[c];
		}
		top_data[index] = aveval;
	}
}

template<typename Dtype>
void DepthwiseConvolutionLayer<Dtype>::Forward_gpu(
		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
//	std::cout << "fp" << std::endl;
	const Dtype* weight = this->blobs_[0]->gpu_data();
	int* kernel_shape_data = this->kernel_shape_.mutable_cpu_data();
	int* stride_data = this->stride_.mutable_cpu_data();
	int* pad_data = this->pad_.mutable_cpu_data();

	for (int i = 0; i < bottom.size(); ++i) {
		const Dtype* bottom_data = bottom[i]->gpu_data();
		Dtype* top_data = top[i]->mutable_gpu_data();
		const int count = top[i]->count();
		vector<int> shape_ = bottom[i]->shape();
		const int channels_ = shape_[1];
		const int height_ = shape_[2];
		const int width_ = shape_[3];

		const int kernel_h_ = kernel_shape_data[0];
		const int kernel_w_ = kernel_shape_data[1];
		const int stride_h_ = stride_data[0];
		const int stride_w_ = stride_data[1];
		const int pad_h_ = pad_data[0];
		const int pad_w_ = pad_data[1];

		const int conved_height = this->output_shape_[0];
		const int conved_weight = this->output_shape_[1];

		const bool bias_term_ = this->bias_term_;

		if (bias_term_) {
			const Dtype* const bias = this->blobs_[1]->gpu_data();
			ConvForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
					count, bottom_data, bottom[i]->num(), channels_,
					height_, width_,conved_height,conved_weight,kernel_h_,
					kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,weight,bias,bias_term_);
		} else {
			ConvForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
					count, bottom_data, bottom[i]->num(), channels_,
					height_, width_,conved_height,conved_weight,kernel_h_,
					kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,weight,0,bias_term_);
		}
	}
}

template <typename Dtype>
__global__ void ConvBackward(const int nthreads,
const Dtype* const top_diff,
const int num, const int channels, const int height,
const int width, const int conved_height, const int conved_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_h, const int pad_w,
Dtype* const bottom_diff,
const Dtype* const weight) {

	CUDA_KERNEL_LOOP(index, nthreads) {
		const int w = index % width + pad_w;
		const int h = (index / width) % height + pad_h;
		const int c = (index / width / height) % channels;
		const int n = index / width / height / channels;
		
		const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
		const int phend = min(h / stride_h + 1, conved_height);
		const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
		const int pwend = min(w / stride_w + 1, conved_width);
		
		const int khstart=(h >= kernel_h) ? ((h-kernel_h)%stride_h)+(kernel_h-stride_h): h;
		const int kwstart=(w >= kernel_w) ? ((w-kernel_w)%stride_w)+(kernel_w-stride_w) : w;
		
		Dtype gradient = 0;
		const Dtype* const top_diff_slice =
		top_diff + (n * channels + c) * conved_height * conved_width;
		
		const Dtype* const weight_slice =weight + c * kernel_h * kernel_w;
		
//		if (index==2) {
//			printf("w%d h%d c%d n%d \n",w,h,c,n);
//			printf("phstart%d phend%d pwstart%d pwend%d \n",phstart,phend,pwstart,pwend);
//		}
		
		for (int ph = phstart; ph < phend; ++ph) {
			for (int pw = pwstart; pw < pwend; ++pw) {
				int kh=khstart-(ph-phstart)*stride_h;
				int kw=kwstart-(pw-pwstart)*stride_w;
				gradient += top_diff_slice[ph * conved_width + pw] *weight_slice[kh*kernel_w+kw];
				
//						if (index==2) {
//							printf("pos:ph%d pw%d kh%d kw%d\n",ph,pw,kh,kw);
//							printf("cal:top_diff%f weight%f\n",top_diff_slice[ph * conved_width + pw],weight_slice[kh*kernel_w+kw]);
//				//			printf("cal:top_diff%f weight%f\n",top_diff_slice[ph * conved_width + pw],weight_slice[kh*kernel_w+kw]);
//						}
			}
		}
		bottom_diff[index] = gradient;
	}
}

__device__ float atomicAddme(float* address, float val)
{
    return atomicAdd(address,val);
}

__device__ double atomicAddme(double* address, double val)
{
    unsigned long long int* address_as_ull =
                                          (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, 
                        __double_as_longlong(val + 
                        __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}



#define DIVIDE_CEIL(a,b) a/b+((a/b*b)<a)


template <typename Dtype>
__global__ void ConvBackwardWeight(const int nthreads,
const Dtype* const top_diff,
const int num, const int channels, const int height,
const int width, const int conved_height, const int conved_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_h, const int pad_w,
Dtype* const weight_diff,
const Dtype* const bottom_data) {

	CUDA_KERNEL_LOOP(index, nthreads) {
		const int kw=index % kernel_w;
		const int kh= (index /kernel_w)%kernel_h;
		const int c=index /kernel_w/kernel_h;
		
//		if (index==5) {
//			printf("kh%d kw%d kc%d\n",kh,kw,c);
//		}
		Dtype gradient = 0;
		for( int n=0;n<num;n++) {
			
			const Dtype* const top_diff_slice = top_diff + (n * channels + c) * conved_height * conved_width;
			const Dtype* const bottom_data_slice = bottom_data + (n * channels + c) * height * width;
		
			
			const int phstart=max(DIVIDE_CEIL((pad_h-kh),stride_h),0);
			const int phend=min(DIVIDE_CEIL((height+pad_h-kh),stride_h),conved_height);
		
			const int pwstart=max(DIVIDE_CEIL((pad_w-kw),stride_w),0);
			
			const int pwend=min(DIVIDE_CEIL((width+pad_w-kw),stride_w),conved_width);
//			if (index==5) {
//				printf("phstart%d phend%d pwstart%d pwend%d \n",phstart,phend,pwstart,pwend);
//			}
//			
			for(int ph=phstart;ph<phend;ph++){
				for (int pw=pwstart;pw<pwend;pw++){
					const int h=ph*stride_h+kh-pad_h;
					const int w=pw*stride_w+kw-pad_w;
					gradient+=top_diff_slice[ph * conved_width + pw]*bottom_data_slice[h*width+w];
//					if (index==5) {
//						printf("n%d h%d w%d ph%d pw%d topdiff%f bottomdata%f\n",n,h,w,ph,pw,top_diff_slice[ph * conved_width + pw],bottom_data_slice[h*width+w]);
//			//			printf("phstart%d phend%d pwstart%d pwend%d \n",phstart,phend,pwstart,pwend);
//					}
				}
			}
		}
		weight_diff[c * kernel_h * kernel_w+kh*kernel_w+kw]+=gradient;
	}
}

template <typename Dtype>
__global__ void ConvBackwardBias(const int nthreads,
const Dtype* const top_diff,
const int num, const int channels, const int height,
const int width, const int conved_height, const int conved_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_h, const int pad_w,
Dtype* const bias_diff) {
	CUDA_KERNEL_LOOP(index, nthreads) {
		const int c = index;
		Dtype gradient=0;
		for( int n=0;n<num;n++) {
			const Dtype* const top_diff_slice =
			top_diff + (n * channels + c) * conved_height * conved_width;
			for(int ph=0;ph<conved_height;ph++) {
				for (int pw=0;pw<conved_width;pw++) {
					gradient+=top_diff_slice[ph * conved_width + pw];
				}
			}
		}
		bias_diff[c]+=gradient;
	}
}
template<typename Dtype>
void DepthwiseConvolutionLayer<Dtype>::Backward_gpu(
const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
const vector<Blob<Dtype>*>& bottom) {


	int* kernel_shape_data = this->kernel_shape_.mutable_cpu_data();
	int* stride_data = this->stride_.mutable_cpu_data();
	int* pad_data = this->pad_.mutable_cpu_data();

	const Dtype* weight = this->blobs_[0]->gpu_data();
	Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();

	const bool bias_term_ = this->bias_term_;
	Dtype* bias_diff = bias_term_ ? this->blobs_[1]->mutable_gpu_diff() : 0;
	const bool bias_propagate_down_ = this->param_propagate_down_[1];
	const bool weight_propagate_down_ = this->param_propagate_down_[0];


	const int kernel_h_ = kernel_shape_data[0];
	const int kernel_w_ = kernel_shape_data[1];
	const int stride_h_ = stride_data[0];
	const int stride_w_ = stride_data[1];
	const int pad_h_ = pad_data[0];
	const int pad_w_ = pad_data[1];

	const int conved_height = this->output_shape_[0];
	const int conved_weight = this->output_shape_[1];

//	CHECK_EQ(stride_h_, 1)
//	        << "The backward of the net whose stride is bigger than 1 is not implemented now. ";
//	CHECK_EQ(stride_w_, 1)
//	        << "The backward of the net whose stride is bigger than 1 is not implemented now. ";


	for (int i = 0; i < top.size(); ++i) {

		const Dtype* top_diff = top[i]->gpu_diff();
		const Dtype* bottom_data = bottom[i]->gpu_data();
		Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();

		vector<int> shape_ = bottom[i]->shape();
		const int channels_ = shape_[1];
		const int height_ = shape_[2];
		const int width_ = shape_[3];

		// Bias gradient, if necessary.
		if (bias_term_ && bias_propagate_down_) {
			const int count_bias = channels_;
			ConvBackwardBias<Dtype><<<CAFFE_GET_BLOCKS(count_bias), CAFFE_CUDA_NUM_THREADS>>>(
				count_bias, top_diff, bottom[i]->num(), channels_,
				height_, width_,conved_height,conved_weight,kernel_h_,
				kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
				bias_diff);
		}
		// gradient w.r.t. weight. Note that we will accumulate diffs.
		if (weight_propagate_down_) {
			const int count_weight = channels_ * kernel_h_ * kernel_w_;
			ConvBackwardWeight<Dtype><<<CAFFE_GET_BLOCKS(count_weight), CAFFE_CUDA_NUM_THREADS>>>(
					count_weight, top_diff, bottom[i]->num(), channels_,
				height_, width_,conved_height,conved_weight,kernel_h_,
				kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
				weight_diff,
				bottom_data);
		}
		// gradient w.r.t. bottom data, if necessary.
		if (propagate_down[i]) {
			const int count_bottom=bottom[i]->count();
			ConvBackward<Dtype><<<CAFFE_GET_BLOCKS(count_bottom), CAFFE_CUDA_NUM_THREADS>>>(
				count_bottom, top_diff, bottom[i]->num(), channels_,
				height_, width_,conved_height,conved_weight,kernel_h_,
				kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, 
				bottom_diff,
				weight);
		}
	}

}

INSTANTIATE_LAYER_GPU_FUNCS (DepthwiseConvolutionLayer);

}  // namespace caffe


Implementing the ShuffleNet channel shuffle operation (shuffle channel) in Caffe

I used farmingyard's code from GitHub, which can be downloaded here: shuffle channel implementation source on GitHub

The procedure is almost the same as for DepthwiseConvolution; the only difference is that the /src/caffe/proto/caffe.proto file in the Caffe tree must be modified.
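Conceptually, the channel shuffle operation interleaves the channel groups produced by a grouped convolution, and is equivalent to a reshape plus transpose. A small numpy sketch of what the layer computes (toy shapes, not the Caffe code below):

import numpy as np

def shuffle_channels(x, group):
    """ShuffleNet channel shuffle: split the C channels into `group` groups
    and interleave them, so channel (g, c) moves to position (c, g).
    x: (N, C, H, W)."""
    n, c, h, w = x.shape
    assert c % group == 0, "channel count must be divisible by group"
    return (x.reshape(n, group, c // group, h, w)  # (N, g, C/g, H, W)
             .transpose(0, 2, 1, 3, 4)             # swap the two group axes
             .reshape(n, c, h, w))

x = np.arange(6).reshape(1, 6, 1, 1)
print(shuffle_channels(x, 3).reshape(-1))  # [0 2 4 1 3 5]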

  • Again three files:

File name                   Purpose
shuffle_channel_layer.hpp   Header file
shuffle_channel_layer.cpp   CPU implementation of shuffle channel
shuffle_channel_layer.cu    GPU implementation of shuffle channel

Implementation steps

  • What we need to do:

    Put shuffle_channel_layer.hpp into the /caffeMS/include/caffe/layers/ directory

    Put shuffle_channel_layer.cpp and shuffle_channel_layer.cu into the /caffeMS/src/caffe/layers/ directory.

    Modify the /caffeMS/src/caffe/proto/caffe.proto file:

    At around line 420, add the following line inside message LayerParameter. Note that the field number 164 can be chosen freely, as long as it does not clash with a field number already used by another parameter:

message LayerParameter {
...
optional ShuffleChannelParameter shuffle_channel_param = 164;
...
}

At the end of the file, add:

message ShuffleChannelParameter {
  optional uint32 group = 1[default = 1]; // The number of group
}

Then rebuild Caffe:

make all -j8
make test -j8
make runtest -j8
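After the proto change and the rebuild, the regenerated caffe_pb2 module should accept the new parameter. A small sanity check, assuming the field was added exactly as shown above (the layer name "shuffle1" is just an example):

import caffe.proto.caffe_pb2 as caffe_pb2
from google.protobuf.text_format import Merge

layer = caffe_pb2.LayerParameter()
Merge('name: "shuffle1" type: "ShuffleChannel" shuffle_channel_param { group: 3 }', layer)
print(layer.shuffle_channel_param.group)  # 3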

Source code

shuffle_channel_layer.hpp

#ifndef CAFFE_SHUFFLE_CHANNEL_LAYER_HPP_
#define CAFFE_SHUFFLE_CHANNEL_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

template <typename Dtype>
class ShuffleChannelLayer : public Layer<Dtype> {
public:
    explicit ShuffleChannelLayer(const LayerParameter& param)
        : Layer<Dtype>(param) {}
    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top);
    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top);
    virtual inline const char* type() const { return "ShuffleChannel"; }

protected:
    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
                             const vector<Blob<Dtype>*>& top);
    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
                             const vector<Blob<Dtype>*>& top);

    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
                              const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
                              const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

private:
    void Resize_cpu(Dtype *output, const Dtype *input, int group_row, int group_column, int len);
    void Resize_gpu(Dtype *output, const Dtype *input, int group_row, int group_column, int len);

    //Blob<Dtype> temp_blob_;
    int group_;
};

}  // namespace caffe

#endif  // CAFFE_SHUFFLE_CHANNEL_LAYER_HPP_

shuffle_channel_layer.cpp

#include <algorithm>
#include <vector>

#include "caffe/layers/shuffle_channel_layer.hpp"

namespace caffe {

template <typename Dtype>
void ShuffleChannelLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype> *> &bottom, const vector<Blob<Dtype> *> &top)
{
    group_ = this->layer_param_.shuffle_channel_param().group();
    CHECK_GT(group_, 0) << "group must be greater than 0";
    //temp_blob_.ReshapeLike(*bottom[0]);
	top[0]->ReshapeLike(*bottom[0]);
}

template <typename Dtype>
void ShuffleChannelLayer<Dtype>::Resize_cpu(Dtype *output, const Dtype *input, int group_row, int group_column, int len)
{
    for (int i = 0; i < group_row; ++i) // 2
    {
        for(int j = 0; j < group_column ; ++j) // 3
        {
            const Dtype* p_i = input + (i * group_column + j ) * len;
            Dtype* p_o = output + (j * group_row + i ) * len;

            caffe_copy(len, p_i, p_o);
        }
    }
}

template <typename Dtype>
void ShuffleChannelLayer<Dtype>::Reshape(const vector<Blob<Dtype> *> &bottom, const vector<Blob<Dtype> *> &top)
{
  int channels_ = bottom[0]->channels();
  int height_ = bottom[0]->height();
  int width_ = bottom[0]->width();

  top[0]->Reshape(bottom[0]->num(), channels_, height_, width_);

}

template <typename Dtype>
void ShuffleChannelLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
                                             const vector<Blob<Dtype>*>& top) {
    const Dtype* bottom_data = bottom[0]->cpu_data();
    Dtype* top_data = top[0]->mutable_cpu_data();

    const int num = bottom[0]->shape(0);
    const int feature_map_size = bottom[0]->count(1);
    const int sp_sz = bottom[0]->count(2);
    const int chs = bottom[0]->shape(1);

    int group_row = group_;
    int group_column = int(chs / group_row);
    CHECK_EQ(chs, (group_column * group_row)) << "Wrong group size.";

    //Dtype* temp_data = temp_blob_.mutable_cpu_data();
    for(int n = 0; n < num; ++n)
    {
		Resize_cpu(top_data + n*feature_map_size, bottom_data + n*feature_map_size, group_row, group_column, sp_sz);
    }
    //caffe_copy(bottom[0]->count(), temp_blob_.cpu_data(), top_data);
}

template <typename Dtype>
void ShuffleChannelLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
                                              const vector<bool>& propagate_down,
                                              const vector<Blob<Dtype>*>& bottom) {
    if (propagate_down[0]) {
        const Dtype* top_diff = top[0]->cpu_diff();
        Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();

        const int num = bottom[0]->shape(0);
        const int feature_map_size = bottom[0]->count(1);
        const int sp_sz = bottom[0]->count(2);
        const int chs = bottom[0]->shape(1);

        int group_row = int(chs / group_);
        int group_column = group_;

        //Dtype* temp_diff = temp_blob_.mutable_cpu_diff();
        for(int n = 0; n < num; ++n)
        {
			Resize_cpu(bottom_diff + n * feature_map_size, top_diff + n*feature_map_size, group_row, group_column, sp_sz);
        }
        //caffe_copy(top[0]->count(), temp_blob_.cpu_diff(), bottom_diff);
    }
}


#ifdef CPU_ONLY
STUB_GPU(ShuffleChannelLayer);
#endif

INSTANTIATE_CLASS(ShuffleChannelLayer);
REGISTER_LAYER_CLASS(ShuffleChannel);
}  // namespace caffe

shuffle_channel_layer.cu

#include <algorithm>
#include <vector>

#include "caffe/layers/shuffle_channel_layer.hpp"

namespace caffe {

template <typename Dtype>
__global__ void ShuffleChannelKernel(const int nthreads, const int feature_map_size,
	Dtype *output, const Dtype *input, int group_row, int group_column, int len) {
	CUDA_KERNEL_LOOP(index, nthreads) {
		const int n = index / group_row / group_column / len;
		const int i = (index / group_column / len) % group_row;
		const int j = index / len % group_column;
		const int k = index - (n * feature_map_size + (i * group_column + j) * len);
		Dtype* p_o = output + n * feature_map_size + (j * group_row + i) * len;
		p_o[k] = input[index];
	}
}

template <typename Dtype>
void ShuffleChannelLayer<Dtype>::Resize_gpu(Dtype *output, const Dtype *input, int group_row, int group_column, int len)
{
    for (int i = 0; i < group_row; ++i) // 2
    {
        for(int j = 0; j < group_column ; ++j) // 3
        {
            const Dtype* p_i = input + (i * group_column + j ) * len;
            Dtype* p_o = output + (j * group_row + i ) * len;

            caffe_copy(len, p_i, p_o);
        }
    }
}

template <typename Dtype>
void ShuffleChannelLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
    const Dtype* bottom_data = bottom[0]->gpu_data();
    Dtype* top_data = top[0]->mutable_gpu_data();

    const int num = bottom[0]->num();
    const int feature_map_size = bottom[0]->count(1);
    const int sp_sz = bottom[0]->count(2);
    const int chs = bottom[0]->channels();

    int group_row = group_;
    int group_column = int(chs / group_row);
    CHECK_EQ(chs, (group_column * group_row)) << "Wrong group size.";
	int count = num * group_column * group_row * sp_sz;
	ShuffleChannelKernel<Dtype> << <CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS >> >(
		count, feature_map_size, top_data, bottom_data, group_row, group_column, sp_sz);
    //Dtype* temp_data = temp_blob_.mutable_gpu_data();
    //for(int n = 0; n < num; ++n)
    //{
    //    Resize_gpu(top_data + n*feature_map_size, bottom_data + n*feature_map_size, group_row, group_column, sp_sz);
    //}
    //caffe_copy(bottom[0]->count(), temp_blob_.gpu_data(), top_data);
}

template <typename Dtype>
void ShuffleChannelLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
      const Dtype* top_diff = top[0]->gpu_diff();
      Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();

      const int num = bottom[0]->num();
      const int feature_map_size = bottom[0]->count(1);
      const int sp_sz = bottom[0]->count(2);
      const int chs = bottom[0]->channels();

      int group_row = int(chs / group_);
      int group_column = group_;
	  int count = num * group_column * group_row * sp_sz;
	  ShuffleChannelKernel<Dtype> << <CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS >> >(
		  count, feature_map_size, bottom_diff, top_diff, group_row, group_column, sp_sz);
      //Dtype* temp_diff = temp_blob_.mutable_gpu_diff();
    //  for(int n = 0; n < num; ++n)
    //  {
		  //Resize_gpu(bottom_diff + n * feature_map_size, top_diff + n*feature_map_size, group_row, group_column, sp_sz);
    //  }
      //caffe_copy(top[0]->count(), temp_blob_.gpu_diff(), bottom_diff);
  }
}

INSTANTIATE_LAYER_GPU_FUNCS(ShuffleChannelLayer);

}  // namespace caffe


Implementing the CenterLoss loss function in Caffe

CenterLoss is mainly used for face recognition; a network I was working with needed it, so I looked into how to use it. I used ydwen's official code from GitHub, which can be downloaded here: CenterLoss implementation source on GitHub
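Center loss keeps one learnable center per class and penalizes the distance between each sample's feature and the center of its class. The forward pass and the center gradient used by the layer below can be sketched in numpy like this (toy shapes, not the Caffe implementation):

import numpy as np

def center_loss_forward(x, labels, centers):
    """L = 1/(2M) * sum_i ||x_i - c_{y_i}||^2
    x: (M, K) features, labels: (M,) int class ids, centers: (N, K)."""
    distance = x - centers[labels]            # D(i,:) = X(i,:) - C(y_i,:)
    loss = np.sum(distance ** 2) / (2 * len(x))
    return loss, distance

def center_gradient(distance, labels, num_classes):
    """Gradient w.r.t. the centers, matching the layer's backward pass:
    dL/dc_j = -sum_{i: y_i == j} (x_i - c_j) / (count_j + 1)."""
    grad = np.zeros((num_classes, distance.shape[1]))
    for j in range(num_classes):
        mask = labels == j
        grad[j] = -distance[mask].sum(axis=0) / (mask.sum() + 1.0)
    return grad

x = np.random.randn(8, 4)                 # M=8 samples, K=4 features
labels = np.random.randint(0, 3, size=8)  # N=3 classes
centers = np.random.randn(3, 4)
loss, distance = center_loss_forward(x, labels, centers)
print(loss, center_gradient(distance, labels, 3).shape)  # scalar, (3, 4)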

As with shuffle channel, the /caffeMS/src/caffe/proto/caffe.proto file must also be modified.

  • Three files:

File name               Purpose
center_loss_layer.hpp   Header file
center_loss_layer.cpp   CPU implementation of CenterLoss
center_loss_layer.cu    GPU implementation of CenterLoss

Implementation steps

  • What we need to do:

    Put center_loss_layer.hpp into the /caffeMS/include/caffe/layers/ directory

    Put center_loss_layer.cpp and center_loss_layer.cu into the /caffeMS/src/caffe/layers/ directory.

    Modify the /caffeMS/src/caffe/proto/caffe.proto file:

    At around line 420, add the following line inside message LayerParameter. Note that the field number 147 can be chosen freely, as long as it does not clash with a field number already used by another parameter:

message LayerParameter {
...
optional CenterLossParameter center_loss_param = 147; 
...
}

At the end of the file, add:

message CenterLossParameter {  
  optional uint32 num_output = 1; // The number of outputs for the layer  
  optional FillerParameter center_filler = 2; // The filler for the centers  
  // The first axis to be lumped into a single inner product computation;  
  // all preceding axes are retained in the output.  
  // May be negative to index from the end (e.g., -1 for the last axis).  
  optional int32 axis = 3 [default = 1];  
}  

Then rebuild Caffe:

make all -j8
make test -j8
make runtest -j8

Source code

center_loss_layer.hpp

#ifndef CAFFE_CENTER_LOSS_LAYER_HPP_
#define CAFFE_CENTER_LOSS_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/loss_layer.hpp"

namespace caffe {

template <typename Dtype>
class CenterLossLayer : public LossLayer<Dtype> {
 public:
  explicit CenterLossLayer(const LayerParameter& param)
      : LossLayer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "CenterLoss"; }
  virtual inline int ExactNumBottomBlobs() const { return 2; }
  virtual inline int ExactNumTopBlobs() const { return -1; }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  int M_;
  int K_;
  int N_;
  
  Blob<Dtype> distance_;
  Blob<Dtype> variation_sum_;
};

}  // namespace caffe

#endif  // CAFFE_CENTER_LOSS_LAYER_HPP_

center_loss_layer.cpp

#include <vector>

#include "caffe/filler.hpp"
#include "caffe/layers/center_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
void CenterLossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const int num_output = this->layer_param_.center_loss_param().num_output();  
  N_ = num_output;
  const int axis = bottom[0]->CanonicalAxisIndex(
      this->layer_param_.center_loss_param().axis());
  // Dimensions starting from "axis" are "flattened" into a single
  // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W),
  // and axis == 1, N inner products with dimension CHW are performed.
  K_ = bottom[0]->count(axis);
  // Check if we need to set up the weights
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Skipping parameter initialization";
  } else {
    this->blobs_.resize(1);
    // Intialize the weight
    vector<int> center_shape(2);
    center_shape[0] = N_;
    center_shape[1] = K_;
    this->blobs_[0].reset(new Blob<Dtype>(center_shape));
    // fill the weights
    shared_ptr<Filler<Dtype> > center_filler(GetFiller<Dtype>(
        this->layer_param_.center_loss_param().center_filler()));
    center_filler->Fill(this->blobs_[0].get());

  }  // parameter initialization
  this->param_propagate_down_.resize(this->blobs_.size(), true);
}

template <typename Dtype>
void CenterLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  CHECK_EQ(bottom[1]->channels(), 1);
  CHECK_EQ(bottom[1]->height(), 1);
  CHECK_EQ(bottom[1]->width(), 1);
  M_ = bottom[0]->num();
  // The top shape will be the bottom shape with the flattened axes dropped,
  // and replaced by a single axis with dimension num_output (N_).
  LossLayer<Dtype>::Reshape(bottom, top);
  distance_.ReshapeLike(*bottom[0]);
  variation_sum_.ReshapeLike(*this->blobs_[0]);
}

template <typename Dtype>
void CenterLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* label = bottom[1]->cpu_data();
  const Dtype* center = this->blobs_[0]->cpu_data();
  Dtype* distance_data = distance_.mutable_cpu_data();
  
  // the i-th distance_data
  for (int i = 0; i < M_; i++) {
    const int label_value = static_cast<int>(label[i]);
    // D(i,:) = X(i,:) - C(y(i),:)
    caffe_sub(K_, bottom_data + i * K_, center + label_value * K_, distance_data + i * K_);
  }
  Dtype dot = caffe_cpu_dot(M_ * K_, distance_.cpu_data(), distance_.cpu_data());
  Dtype loss = dot / M_ / Dtype(2);
  top[0]->mutable_cpu_data()[0] = loss;
}

template <typename Dtype>
void CenterLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  // Gradient with respect to centers
  if (this->param_propagate_down_[0]) {
    const Dtype* label = bottom[1]->cpu_data();
    Dtype* center_diff = this->blobs_[0]->mutable_cpu_diff();
    Dtype* variation_sum_data = variation_sum_.mutable_cpu_data();
    const Dtype* distance_data = distance_.cpu_data();

    // \sum_{y_i==j}
    caffe_set(N_ * K_, (Dtype)0., variation_sum_.mutable_cpu_data());
    for (int n = 0; n < N_; n++) {
      int count = 0;
      for (int m = 0; m < M_; m++) {
        const int label_value = static_cast<int>(label[m]);
        if (label_value == n) {
          count++;
          caffe_sub(K_, variation_sum_data + n * K_, distance_data + m * K_, variation_sum_data + n * K_);
        }
      }
      caffe_axpy(K_, (Dtype)1./(count + (Dtype)1.), variation_sum_data + n * K_, center_diff + n * K_);
    }
  }
  // Gradient with respect to bottom data 
  if (propagate_down[0]) {
    caffe_copy(M_ * K_, distance_.cpu_data(), bottom[0]->mutable_cpu_diff());
    caffe_scal(M_ * K_, top[0]->cpu_diff()[0] / M_, bottom[0]->mutable_cpu_diff());
  }
  if (propagate_down[1]) {
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
}

#ifdef CPU_ONLY
STUB_GPU(CenterLossLayer);
#endif

INSTANTIATE_CLASS(CenterLossLayer);
REGISTER_LAYER_CLASS(CenterLoss);

}  // namespace caffe


center_loss_layer.cu

#include <vector>

#include "caffe/filler.hpp"
#include "caffe/layers/center_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
__global__ void Compute_distance_data_gpu(int nthreads, const int K, const Dtype* bottom,
	      const Dtype* label, const Dtype* center, Dtype* distance) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    int m = index / K;
    int k = index % K;
    const int label_value = static_cast<int>(label[m]);
    // distance(i) = x(i) - c_{y(i)}
    distance[index] = bottom[index] - center[label_value * K + k];
  }
}

template <typename Dtype>
__global__ void Compute_center_diff_gpu(int nthreads, const int M, const int K, 
        const Dtype* label, const Dtype* distance, Dtype* variation_sum, 
        Dtype* center_diff) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    int count = 0;
    for (int m = 0; m < M; m++) {
      const int label_value = static_cast<int>(label[m]);
      if (label_value == index) {
        count++;
        for (int k = 0; k < K; k++) {
          variation_sum[index * K + k] -= distance[m * K + k];
        }
      }
    }
    for (int k = 0; k < K; k++) {
      center_diff[index * K + k] = variation_sum[index * K + k] /(count + (Dtype)1.);
    }
  }
}


template <typename Dtype>
void CenterLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  int nthreads = M_ * K_;
  Compute_distance_data_gpu<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
      CAFFE_CUDA_NUM_THREADS>>>(nthreads, K_, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
                                this->blobs_[0]->gpu_data(), distance_.mutable_gpu_data());
  Dtype dot;
  caffe_gpu_dot(M_ * K_, distance_.gpu_data(), distance_.gpu_data(), &dot);
  Dtype loss = dot / M_ / Dtype(2);
  top[0]->mutable_cpu_data()[0] = loss;
}

template <typename Dtype>
void CenterLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  int nthreads = N_;
  // variation_sum_ is written by the CUDA kernel, so it must be a GPU pointer
  caffe_gpu_set(N_ * K_, (Dtype)0., variation_sum_.mutable_gpu_data());
  Compute_center_diff_gpu<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
      CAFFE_CUDA_NUM_THREADS>>>(nthreads, M_, K_, bottom[1]->gpu_data(), distance_.gpu_data(), 
                                variation_sum_.mutable_gpu_data(), this->blobs_[0]->mutable_gpu_diff());

  if (propagate_down[0]) {
    caffe_gpu_scale(M_ * K_, top[0]->cpu_diff()[0] / M_, 
                             distance_.gpu_data(), bottom[0]->mutable_gpu_diff());
  }
  if (propagate_down[1]) {
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
}

INSTANTIATE_LAYER_GPU_FUNCS(CenterLossLayer);

}  // namespace caffe

This concludes the walkthrough of implementing DepthwiseConvolution, shuffle channel, and CenterLoss in Caffe.


I hope this helps. Thanks.
2019.7.12

Reposted from blog.csdn.net/jsk_learner/article/details/95661552