Deep Learning Notes (2): Adding an Attention LSTM Layer to Caffe

The previous post walked through the LSTM layer's source code and flow diagram in detail. In this post we add a new layer to Caffe: the Attention LSTM (ALSTM) layer.

Before getting to the code, let's look at the attention-model formulas and diagrams from a few papers.

(1) Recurrent Models of Visual Attention



A. Glimpse Sensor: at time t, extracts patches of several sizes around the current location and combines them into the glimpse data ρ.
B. Glimpse Network: fuses the local image information with the location information.
C. Model Architecture: the previous hidden state h_{t-1} is combined with the glimpse feature g_t to produce the new hidden state h_t, which in turn determines the attention, i.e. the region of interest.

For the detailed derivation see the paper and its code; a minimal sketch of the update equations follows.
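A rough sketch of the per-step update, loosely following the paper's notation (f_g, f_h, f_l, f_a are small networks and ρ is the glimpse sensor; this is my paraphrase, not the paper's exact equations):

    g_t = f_g(\rho(x_t, l_{t-1}), l_{t-1})        % glimpse feature from image x_t at location l_{t-1}
    h_t = f_h(h_{t-1}, g_t)                       % core (recurrent) update
    l_t = f_l(h_t), \quad a_t = f_a(h_t)          % next location (attention) and action/prediction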


(2) Action Recognition Using Visual Attention


The general idea is to split the extracted features: each frame's feature map is divided into 49 regions (7x7), and the model learns which regions of each frame to attend to, roughly as sketched below. Figure (b) in the paper is problematic, and the authors' released code reflects this as well. The main goal of this post is to implement such an Attention LSTM layer in Caffe.
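A rough sketch of the soft attention used there (my notation; the feature cube is K x K x D with K = 7 in the paper, while the code below hard-codes D = 512): given the K^2 feature slices X_{t,i} of frame t and the previous hidden state h_{t-1},

    l_{t,i} = \frac{\exp(w_i^\top h_{t-1})}{\sum_{j=1}^{K^2} \exp(w_j^\top h_{t-1})}, \quad i = 1, \dots, K^2
    x_t = \sum_{i=1}^{K^2} l_{t,i} \, X_{t,i}

and the attended feature x_t is what actually feeds the LSTM at step t.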


(3) Show, Attend and Tell: Neural Image Caption Generation with Visual Attention





The idea is the same; see the paper for the full derivation. The soft-attention variant is sketched below, since it is the closest match to the layer implemented here.
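A brief sketch of the soft attention in Show, Attend and Tell, with a_i the L annotation vectors taken from a CNN feature map and f_att a small MLP:

    e_{t,i} = f_{att}(a_i, h_{t-1})
    \alpha_{t,i} = \frac{\exp(e_{t,i})}{\sum_{k=1}^{L} \exp(e_{t,k})}
    \hat{z}_t = \sum_{i=1}^{L} \alpha_{t,i} \, a_i

In the unrolled net below, f_att corresponds (roughly) to the att_x_* and att_m_* InnerProduct layers plus the Bias add and the 256-to-1 projection, the softmax_m_* layer produces α, and the Scale plus SUM-pooling layers compute the weighted sum.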



Part 2: ALSTM Layer Code

Alstm.cpp

#include <string>
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/sequence_layers.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
void ALSTMLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const {
  names->resize(2);
  (*names)[0] = "h_0";
  (*names)[1] = "c_0";
}

template <typename Dtype>
void ALSTMLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const {
  names->resize(2);
  (*names)[0] = "h_" + this->int_to_str(this->T_);
  (*names)[1] = "c_T";
}

template <typename Dtype>
void ALSTMLayer<Dtype>::OutputBlobNames(vector<string>* names) const {
  names->resize(2);
  (*names)[0] = "h";
  (*names)[1] = "mask";
}

template <typename Dtype>
void ALSTMLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const {
  const int num_output = this->layer_param_.recurrent_param().num_output();
  CHECK_GT(num_output, 0) << "num_output must be positive";
  const FillerParameter& weight_filler =
      this->layer_param_.recurrent_param().weight_filler();
  const FillerParameter& bias_filler =
      this->layer_param_.recurrent_param().bias_filler();

  // Add generic LayerParameter's (without bottoms/tops) of layer types we'll
  // use to save redundant code.
  LayerParameter hidden_param;
  hidden_param.set_type("InnerProduct");
  hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4);
  hidden_param.mutable_inner_product_param()->set_bias_term(false);
  hidden_param.mutable_inner_product_param()->set_axis(1);
  hidden_param.mutable_inner_product_param()->
      mutable_weight_filler()->CopyFrom(weight_filler);

  LayerParameter biased_hidden_param(hidden_param);
  biased_hidden_param.mutable_inner_product_param()->set_bias_term(true);
  biased_hidden_param.mutable_inner_product_param()->
      mutable_bias_filler()->CopyFrom(bias_filler);

  LayerParameter attention_param;
  attention_param.set_type("InnerProduct");
  attention_param.mutable_inner_product_param()->set_num_output(256);
  attention_param.mutable_inner_product_param()->set_bias_term(false);
  attention_param.mutable_inner_product_param()->set_axis(2);
  attention_param.mutable_inner_product_param()->
      mutable_weight_filler()->CopyFrom(weight_filler);

  LayerParameter biased_attention_param(attention_param);
  biased_attention_param.mutable_inner_product_param()->set_bias_term(true);
  biased_attention_param.mutable_inner_product_param()->
      mutable_bias_filler()->CopyFrom(bias_filler); // weight + bias

  LayerParameter sum_param;
  sum_param.set_type("Eltwise");
  sum_param.mutable_eltwise_param()->set_operation(
      EltwiseParameter_EltwiseOp_SUM);

  LayerParameter slice_param;
  slice_param.set_type("Slice");
  slice_param.mutable_slice_param()->set_axis(0);

  LayerParameter softmax_param;
  softmax_param.set_type("Softmax");
  softmax_param.mutable_softmax_param()->set_axis(-1);

  LayerParameter split_param;
  split_param.set_type("Split");

  LayerParameter scale_param;
  scale_param.set_type("Scale");

  LayerParameter permute_param;
  permute_param.set_type("Permute");

  LayerParameter reshape_param;
  reshape_param.set_type("Reshape");

  LayerParameter bias_layer_param;
  bias_layer_param.set_type("Bias");

  LayerParameter pool_param;
  pool_param.set_type("Pooling");

  LayerParameter reshape_layer_param;
  reshape_layer_param.set_type("Reshape");

  BlobShape input_shape;
  input_shape.add_dim(1);  // c_0 and h_0 are a single timestep
  input_shape.add_dim(this->N_);
  input_shape.add_dim(num_output);

  net_param->add_input("c_0");
  net_param->add_input_shape()->CopyFrom(input_shape);

  net_param->add_input("h_0");
  net_param->add_input_shape()->CopyFrom(input_shape);

  LayerParameter* cont_slice_param = net_param->add_layer();
  cont_slice_param->CopyFrom(slice_param);
  cont_slice_param->set_name("cont_slice");
  cont_slice_param->add_bottom("cont");
  cont_slice_param->mutable_slice_param()->set_axis(1);

  LayerParameter* x_slice_param = net_param->add_layer();
  x_slice_param->CopyFrom(slice_param);
  x_slice_param->set_name("x_slice");
  x_slice_param->add_bottom("x");

  // Add layer to transform all timesteps of x to the hidden state dimension.
  //     W_xc_x = W_xc * x + b_c
/*
  {
    LayerParameter* x_transform_param = net_param->add_layer();
    x_transform_param->CopyFrom(biased_hidden_param);
    x_transform_param->set_name("x_transform");
    x_transform_param->add_param()->set_name("W_xc");
    x_transform_param->add_param()->set_name("b_c");
    x_transform_param->add_bottom("x");
    x_transform_param->add_top("W_xc_x");
  }

  if (this->static_input_) {
    // Add layer to transform x_static to the gate dimension.
    //     W_xc_x_static = W_xc_static * x_static
    LayerParameter* x_static_transform_param = net_param->add_layer();
    x_static_transform_param->CopyFrom(hidden_param);
    x_static_transform_param->mutable_inner_product_param()->set_axis(1);
    x_static_transform_param->set_name("W_xc_x_static");
    x_static_transform_param->add_param()->set_name("W_xc_static");
    x_static_transform_param->add_bottom("x_static");
    x_static_transform_param->add_top("W_xc_x_static");

    LayerParameter* reshape_param = net_param->add_layer();
    reshape_param->set_type("Reshape");
    BlobShape* new_shape =
         reshape_param->mutable_reshape_param()->mutable_shape();
    new_shape->add_dim(1);  // One timestep.
    new_shape->add_dim(this->N_);
    new_shape->add_dim(
        x_static_transform_param->inner_product_param().num_output());
    reshape_param->add_bottom("W_xc_x_static");
    reshape_param->add_top("W_xc_x_static");
  }


  LayerParameter* x_slice_param = net_param->add_layer();
  x_slice_param->CopyFrom(slice_param);
  x_slice_param->add_bottom("W_xc_x");
  x_slice_param->set_name("W_xc_x_slice");
*/

  LayerParameter output_concat_layer;
  output_concat_layer.set_name("h_concat");
  output_concat_layer.set_type("Concat");
  output_concat_layer.add_top("h");
  output_concat_layer.mutable_concat_param()->set_axis(0);

  LayerParameter output_m_layer;
  output_m_layer.set_name("m_concat");
  output_m_layer.set_type("Concat");
  output_m_layer.add_top("mask");
  output_m_layer.mutable_concat_param()->set_axis(0);  // second output (attention mask)

  for (int t = 1; t <= this->T_; ++t) {
    string tm1s = this->int_to_str(t - 1);
    string ts = this->int_to_str(t);

    cont_slice_param->add_top("cont_" + ts);
    x_slice_param->add_top("x_" + ts);

    // Add a layer to permute x
    {
      LayerParameter* permute_x_param = net_param->add_layer();
      permute_x_param->CopyFrom(permute_param);
      permute_x_param->set_name("permute_x_" + ts);
      permute_x_param->mutable_permute_param()->add_order(2);
      permute_x_param->mutable_permute_param()->add_order(0);
      permute_x_param->mutable_permute_param()->add_order(1);
      permute_x_param->mutable_permute_param()->add_order(3);
      permute_x_param->add_bottom("x_" + ts);
      permute_x_param->add_top("x_p_" + ts);
    }
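    // A rough sketch of the attention pipeline built per timestep below
    // (the Permute/Reshape layers are only axis bookkeeping):
    //   m_{t-1}    = W_m * h_{t-1} + b_m     (att_m_*:      InnerProduct, 256 outputs)
    //   m_x        = W_a * x_t + b_a         (att_x_*:      InnerProduct on axis 3, 256 outputs)
    //   m_input    = m_x + m_{t-1}           (mask_input_*: Bias layer, broadcast add)
    //   score      = w^T * m_input           (att_x_ap_*:   InnerProduct, 256 -> 1)
    //   mask_{t-1} = softmax(score)          over the spatial locations (7x7 = 49 here)
    //   x_mask_t   = mask_{t-1} .* x_t       (scale_x_*:    Scale layer)
    //   x_pool_t   = sum over the 7x7 grid   (pool_x_*:     SUM pooling, kernel 7)
    //   W_xc_x_t   = W_xc * x_pool_t + b_c   (x_transform_*), which feeds the gate input.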
    //
     

    // Add a layer to generate attention weights
    {
      LayerParameter* att_m_param = net_param->add_layer();
      att_m_param->CopyFrom(biased_attention_param);
      att_m_param->set_name("att_m_" + tm1s);
      att_m_param->add_bottom("h_" + tm1s);
      att_m_param->add_top("m_" + tm1s);
    }
   {
      LayerParameter* permute_x_a_param = net_param->add_layer();
      permute_x_a_param->CopyFrom(permute_param);
      permute_x_a_param->set_name("permute_x_a_" + ts);
      permute_x_a_param->mutable_permute_param()->add_order(0);
      permute_x_a_param->mutable_permute_param()->add_order(1);
      permute_x_a_param->mutable_permute_param()->add_order(3);
      permute_x_a_param->mutable_permute_param()->add_order(2);
      permute_x_a_param->add_bottom("x_" + ts);
      permute_x_a_param->add_top("x_p_a_" + ts);
    }  // here is to change!
    {
      LayerParameter* att_x_param = net_param->add_layer();
      att_x_param->CopyFrom(biased_attention_param);
      att_x_param->set_name("att_x_" + tm1s);
      att_x_param->mutable_inner_product_param()->set_axis(3);
      att_x_param->add_bottom("x_p_a_" + ts);
      att_x_param->add_top("m_x_" + tm1s);
    }    // FC layer: transform x to the attention dimension
   {
      LayerParameter* permute_x_a_p_param = net_param->add_layer();
      permute_x_a_p_param->CopyFrom(permute_param);
      permute_x_a_p_param->set_name("permute_x_a_p_" + ts);
      permute_x_a_p_param->mutable_permute_param()->add_order(2);
      permute_x_a_p_param->mutable_permute_param()->add_order(0);
      permute_x_a_p_param->mutable_permute_param()->add_order(1);
      permute_x_a_p_param->mutable_permute_param()->add_order(3);
      permute_x_a_p_param->add_bottom("m_x_" + tm1s);
      permute_x_a_p_param->add_top("m_x_a_" + tm1s);
    }
    {
      LayerParameter* m_sum_layer = net_param->add_layer();
      m_sum_layer->CopyFrom(bias_layer_param);
      m_sum_layer->set_name("mask_input_" + ts);
      m_sum_layer->add_bottom("m_x_a_" + tm1s);
      m_sum_layer->add_bottom("m_" + tm1s);
      m_sum_layer->add_top("m_input_" + tm1s);
    }
   {
      LayerParameter* att_x_ap_param = net_param->add_layer();
      att_x_ap_param->CopyFrom(biased_attention_param);
      att_x_ap_param->set_name("att_x_ap_" + tm1s);
      att_x_ap_param->mutable_inner_product_param()->set_axis(3);
      att_x_ap_param->mutable_inner_product_param()->set_num_output(1);
      att_x_ap_param->add_bottom("m_input_" + tm1s);
      att_x_ap_param->add_top("m_x_ap_" + tm1s);  //256---->1
    }
    {
      LayerParameter* permute_m_param = net_param->add_layer();
      permute_m_param->CopyFrom(permute_param);
      permute_m_param->set_name("permute_m_" + ts);
      permute_m_param->mutable_permute_param()->add_order(1);
      permute_m_param->mutable_permute_param()->add_order(2);
      permute_m_param->mutable_permute_param()->add_order(0);
      permute_m_param->mutable_permute_param()->add_order(3);
      permute_m_param->add_bottom("m_x_ap_" + tm1s);
      permute_m_param->add_top("m_f_" + tm1s);  //10*8*30*1
    }
    // Add a softmax layer to generate the attention mask
    {
      LayerParameter* softmax_m_param = net_param->add_layer();
      softmax_m_param->CopyFrom(softmax_param);
      softmax_m_param->mutable_softmax_param()->set_axis(2);
      softmax_m_param->set_name("softmax_m_" + tm1s);
      softmax_m_param->add_bottom("m_f_" + tm1s);
      softmax_m_param->add_top("mask_" + tm1s);
    }
   
     {
      LayerParameter* reshape_m_param = net_param->add_layer();
      reshape_m_param->CopyFrom(reshape_layer_param);
      BlobShape* shape = reshape_m_param->mutable_reshape_param()->mutable_shape();
      shape->Clear();
      shape->add_dim(0);
      shape->add_dim(0);
      shape->add_dim(0);
      reshape_m_param->set_name("reshape_m_" + tm1s);
      reshape_m_param->add_bottom("mask_" + tm1s);
      reshape_m_param->add_top("mask_reshape_" + tm1s);
    }
    //Reshape mask from 1*6*36 to 1*6*6*6
    /*
    {
      LayerParameter* reshape_param = net_param->add_layer();
      reshape_param->set_type("Reshape");
      BlobShape* new_shape =
         reshape_param->mutable_reshape_param()->mutable_shape();
      new_shape->add_dim(1);  // One timestep.
      new_shape->add_dim(6);
      new_shape->add_dim(6);
      new_shape->add_dim(6);
      reshape_param->add_bottom("mask_" +tm1s);
      reshape_param->add_top("mask_reshape_" +tm1s);
    }*/
    // Combine the mask with the input features
    {
      LayerParameter* scale_x_param = net_param->add_layer();
      scale_x_param->CopyFrom(scale_param);
      scale_x_param->set_name("scale_x_" + tm1s);
      scale_x_param->add_bottom("x_p_" + ts);
      scale_x_param->add_bottom("mask_reshape_" + tm1s);
      scale_x_param->add_top("x_mask_" + ts);
    }

    {
      LayerParameter* permute_x_mask_param = net_param->add_layer();
      permute_x_mask_param->CopyFrom(permute_param);
      permute_x_mask_param->set_name("permute_x_mask_" + ts);
      permute_x_mask_param->mutable_permute_param()->add_order(1);
      permute_x_mask_param->mutable_permute_param()->add_order(2);
      permute_x_mask_param->mutable_permute_param()->add_order(0);
      permute_x_mask_param->mutable_permute_param()->add_order(3);
      permute_x_mask_param->add_bottom("x_mask_" + ts);
      permute_x_mask_param->add_top("x_mask_p_" + ts);
    }

    {
      LayerParameter* reshape_x_param = net_param->add_layer();
      reshape_x_param->CopyFrom(reshape_param);
      reshape_x_param->set_name("reshape_x_" +ts);
      BlobShape* new_shape =
         reshape_x_param->mutable_reshape_param()->mutable_shape();
      new_shape->add_dim(this->N_);
      new_shape->add_dim(512);//512//384
      new_shape->add_dim(7);//7//6
      new_shape->add_dim(7);//7//6
      reshape_x_param->add_bottom("x_mask_p_" + ts);
      reshape_x_param->add_top("x_mask_reshape_"+ts);
    }

    {
      LayerParameter* pool_x_param = net_param->add_layer();
      pool_x_param->CopyFrom(pool_param);
      pool_x_param->set_name("pool_x_"+ts);
      pool_x_param->mutable_pooling_param()->set_pool(PoolingParameter_PoolMethod_SUM);
      pool_x_param->mutable_pooling_param()->set_kernel_size(7);//7//6
      pool_x_param->add_bottom("x_mask_reshape_"+ts);
      pool_x_param->add_top("x_pool_"+ts);
    }

    {
      LayerParameter* x_transform_param = net_param->add_layer();
      x_transform_param->CopyFrom(biased_hidden_param);
      x_transform_param->set_name("x_transform_" + ts);
      x_transform_param->add_param()->set_name("W_xc_" + ts);
      x_transform_param->add_param()->set_name("b_c" + ts);
      x_transform_param->add_bottom("x_pool_" +ts );
      x_transform_param->add_top("W_xc_x_"+ts);
    }

    {
      LayerParameter* x_transform_reshape_param = net_param->add_layer();
      x_transform_reshape_param->CopyFrom(reshape_param);
      x_transform_reshape_param->set_name("x_transform_reshape_" +ts);
      BlobShape* new_shape_r =
         x_transform_reshape_param->mutable_reshape_param()->mutable_shape();
      new_shape_r->add_dim(1);
      new_shape_r->add_dim(this->N_);
      new_shape_r->add_dim(num_output * 4);
      x_transform_reshape_param->add_bottom("W_xc_x_" + ts);
      x_transform_reshape_param->add_top("W_xc_x_r_"+ts);
    }
    // Add layers to flush the hidden state when beginning a new
    // sequence, as indicated by cont_t.
    //     h_conted_{t-1} := cont_t * h_{t-1}
    //
    // Normally, cont_t is binary (i.e., 0 or 1), so:
    //     h_conted_{t-1} := h_{t-1} if cont_t == 1
    //                       0   otherwise
    {
      LayerParameter* cont_h_param = net_param->add_layer();
      cont_h_param->CopyFrom(sum_param);
      cont_h_param->mutable_eltwise_param()->set_coeff_blob(true);
      cont_h_param->set_name("h_conted_" + tm1s);
      cont_h_param->add_bottom("h_" + tm1s);
      cont_h_param->add_bottom("cont_" + ts);
      cont_h_param->add_top("h_conted_" + tm1s);
    }

    // Add layer to compute
    //     W_hc_h_{t-1} := W_hc * h_conted_{t-1}
    {
      LayerParameter* w_param = net_param->add_layer();
      w_param->CopyFrom(hidden_param);
      w_param->set_name("transform_" + ts);
      w_param->add_param()->set_name("W_hc");
      w_param->add_bottom("h_conted_" + tm1s);
      w_param->add_top("W_hc_h_" + tm1s);
      w_param->mutable_inner_product_param()->set_axis(2);
    }

    // Add the outputs of the linear transformations to compute the gate input.
    //     gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c
    //                   = W_hc_h_{t-1} + W_xc_x_t + b_c
    {
      LayerParameter* input_sum_layer = net_param->add_layer();
      input_sum_layer->CopyFrom(sum_param);
      input_sum_layer->set_name("gate_input_" + ts);
      input_sum_layer->add_bottom("W_hc_h_" + tm1s);
      input_sum_layer->add_bottom("W_xc_x_r_" + ts);
      if (this->static_input_) {
        input_sum_layer->add_bottom("W_xc_x_static");
      }
      input_sum_layer->add_top("gate_input_" + ts);
    }

    // Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t.
    // Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t
    // Outputs: c_t, h_t
    //     [ i_t' ]
    //     [ f_t' ] := gate_input_t
    //     [ o_t' ]
    //     [ g_t' ]
    //         i_t := \sigmoid[i_t']
    //         f_t := \sigmoid[f_t']
    //         o_t := \sigmoid[o_t']
    //         g_t := \tanh[g_t']
    //         c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
    //         h_t := o_t .* \tanh[c_t]
    {
      LayerParameter* lstm_unit_param = net_param->add_layer();
      lstm_unit_param->set_type("LSTMUnit");
      lstm_unit_param->add_bottom("c_" + tm1s);
      lstm_unit_param->add_bottom("gate_input_" + ts);
      lstm_unit_param->add_bottom("cont_" + ts);
      lstm_unit_param->add_top("c_" + ts);
      lstm_unit_param->add_top("h_" + ts);
      lstm_unit_param->set_name("unit_" + ts);
    }
    output_concat_layer.add_bottom("h_" + ts);
    output_m_layer.add_bottom("mask_" + tm1s);
  }  // for (int t = 1; t <= this->T_; ++t)

  {
    LayerParameter* c_T_copy_param = net_param->add_layer();
    c_T_copy_param->CopyFrom(split_param);
    c_T_copy_param->add_bottom("c_" + this->int_to_str(this->T_));
    c_T_copy_param->add_top("c_T");
  }
  net_param->add_layer()->CopyFrom(output_concat_layer);
  net_param->add_layer()->CopyFrom(output_m_layer);
}

INSTANTIATE_CLASS(ALSTMLayer);
REGISTER_LAYER_CLASS(ALSTM);

}  // namespace caffe
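
For completeness, here is a hypothetical prototxt snippet showing how the layer could be wired up; the blob names and shapes are assumptions of mine (the 512x7x7 feature cube matches the constants hard-coded in the reshape and pooling layers above), and num_output is just an example value:

layer {
  name: "alstm1"
  type: "ALSTM"
  bottom: "x"       # T x N x 512 x 7 x 7 feature cube (e.g. per-frame conv features)
  bottom: "cont"    # T x N sequence continuation indicators (0 at sequence starts, else 1)
  top: "h"          # T x N x num_output hidden states, concatenated over time
  top: "mask"       # per-timestep attention masks over the 7x7 grid
  recurrent_param {
    num_output: 256
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}

The two tops match OutputBlobNames(): h concatenates the per-timestep hidden states and mask concatenates the per-timestep attention maps. Note that the unrolled net also relies on Permute and SUM pooling, which may not exist in stock BVLC Caffe, so your branch needs to provide them.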


sequence_layers.hpp

#ifndef CAFFE_SEQUENCE_LAYERS_HPP_
#define CAFFE_SEQUENCE_LAYERS_HPP_

#include <string>
#include <utility>
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/layer.hpp"
#include "caffe/net.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

template <typename Dtype> class RecurrentLayer;

/**
 * @brief An abstract class for implementing recurrent behavior inside of an
 *        unrolled network.  This Layer type cannot be instantiated -- instead,
 *        you should use one of its implementations which defines the recurrent
 *        architecture, such as RNNLayer or LSTMLayer.
 */
template <typename Dtype>
class RecurrentLayer : public Layer<Dtype> {
 public:
  explicit RecurrentLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reset();

  virtual inline const char* type() const { return "Recurrent"; }
  virtual inline int MinBottomBlobs() const { return 2; }
  virtual inline int MaxBottomBlobs() const { return 3; }
  //virtual inline int ExactNumTopBlobs() const { return 2; }
  virtual inline int MinTopBlobs() const { return 1; }
  virtual inline int MaxTopBlobs() const { return 2; }


  virtual inline bool AllowForceBackward(const int bottom_index) const {
    // Can't propagate to sequence continuation indicators.
    return bottom_index != 1;
  }

 protected:
  /**
   * @brief Fills net_param with the recurrent network architecture.  Subclasses
   *        should define this -- see RNNLayer and LSTMLayer for examples.
   */
  virtual void FillUnrolledNet(NetParameter* net_param) const = 0;

  /**
   * @brief Fills names with the names of the 0th timestep recurrent input
   *        Blobs.  Subclasses should define this -- see RNNLayer and LSTMLayer
   *        for examples.
   */
  virtual void RecurrentInputBlobNames(vector<string>* names) const = 0;

  /**
   * @brief Fills names with the names of the Tth timestep recurrent output
   *        Blobs.  Subclasses should define this -- see RNNLayer and LSTMLayer
   *        for examples.
   */
  virtual void RecurrentOutputBlobNames(vector<string>* names) const = 0;

  /**
   * @brief Fills names with the names of the output blobs, concatenated across
   *        all timesteps.  Should return a name for each top Blob.
   *        Subclasses should define this -- see RNNLayer and LSTMLayer for
   *        examples.
   */
  virtual void OutputBlobNames(vector<string>* names) const = 0;

  /**
   * @param bottom input Blob vector (length 2-3)
   *
   *   -# @f$ (T \times N \times ...) @f$
   *      the time-varying input @f$ x @f$.  After the first two axes, whose
   *      dimensions must correspond to the number of timesteps @f$ T @f$ and
   *      the number of independent streams @f$ N @f$, respectively, its
   *      dimensions may be arbitrary.  Note that the ordering of dimensions --
   *      @f$ (T \times N \times ...) @f$, rather than
   *      @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$
   *      independent input streams must be "interleaved".
   *
   *   -# @f$ (T \times N) @f$
   *      the sequence continuation indicators @f$ \delta @f$.
   *      These inputs should be binary (0 or 1) indicators, where
   *      @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream
   *      @f$ n @f$ is the beginning of a new sequence, and hence the previous
   *      hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$
   *      and has no effect on the cell's output at timestep @f$ t @f$, and
   *      a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of
   *      stream @f$ n @f$ is a continuation from the previous timestep
   *      @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the
   *      updated hidden state and output.
   *
   *   -# @f$ (N \times ...) @f$ (optional)
   *      the static (non-time-varying) input @f$ x_{static} @f$.
   *      After the first axis, whose dimension must be the number of
   *      independent streams, its dimensions may be arbitrary.
   *      This is mathematically equivalent to using a time-varying input of
   *      @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input
   *      across the @f$ T @f$ timesteps and concatenating with the time-varying
   *      input.  Note that if this input is used, all timesteps in a single
   *      batch within a particular one of the @f$ N @f$ streams must share the
   *      same static input, even if the sequence continuation indicators
   *      suggest that different sequences are ending and beginning within a
   *      single batch.  This may require padding and/or truncation for uniform
   *      length.
   *
   * @param top output Blob vector (length 1)
   *   -# @f$ (T \times N \times D) @f$
   *      the time-varying output @f$ y @f$, where @f$ D @f$ is
   *      <code>recurrent_param.num_output()</code>.
   *      Refer to documentation for particular RecurrentLayer implementations
   *      (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$.
   */
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  /// @brief A helper function, useful for stringifying timestep indices.
  virtual string int_to_str(const int t) const;

  /// @brief A Net to implement the Recurrent functionality.
  shared_ptr<Net<Dtype> > unrolled_net_;

  /// @brief The number of independent streams to process simultaneously.
  int N_;

  /**
   * @brief The number of timesteps in the layer's input, and the number of
   *        timesteps over which to backpropagate through time.
   */
  int T_;

  /// @brief Whether the layer has a "static" input copied across all timesteps.
  bool static_input_;

  vector<Blob<Dtype>* > recur_input_blobs_;
  vector<Blob<Dtype>* > recur_output_blobs_;
  vector<Blob<Dtype>* > output_blobs_;
  Blob<Dtype>* x_input_blob_;
  Blob<Dtype>* x_static_input_blob_;
  Blob<Dtype>* cont_input_blob_;
};

/**
 * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM)
 *        [1] style recurrent neural network (RNN), implemented as a network
 *        that unrolls the LSTM computation in time.
 *
 *
 * The specific architecture used in this implementation is as described in
 * "Learning to Execute" [2], reproduced below:
 *     i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ]
 *     f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ]
 *     o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ]
 *     g_t :=    \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ]
 *     c_t := (f_t .* c_{t-1}) + (i_t .* g_t)
 *     h_t := o_t .* \tanh[c_t]
 * In the implementation, the i, f, o, and g computations are performed as a
 * single inner product.
 *
 * Notably, this implementation lacks the "diagonal" gates, as used in the
 * LSTM architectures described by Alex Graves [3] and others.
 *
 * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory."
 *     Neural Computation 9, no. 8 (1997): 1735-1780.
 *
 * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute."
 *     arXiv preprint arXiv:1410.4615 (2014).
 *
 * [3] Graves, Alex. "Generating sequences with recurrent neural networks."
 *     arXiv preprint arXiv:1308.0850 (2013).
 */
template <typename Dtype>
class LSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit LSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "LSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

template <typename Dtype>
class LSTMStaticLayer : public RecurrentLayer<Dtype> {
 public:
  explicit LSTMStaticLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "LSTMStatic"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

template <typename Dtype>
class LSTMStaticNewLayer : public RecurrentLayer<Dtype> {
 public:
  explicit LSTMStaticNewLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "LSTMStaticNew"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

template <typename Dtype>
class ASLSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit ASLSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "ASLSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

template <typename Dtype>
class ADLSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit ADLSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "ADLSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

template <typename Dtype>
class ALSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit ALSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "ALSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

//coupled LSTM layer
template <typename Dtype>
class CLSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit CLSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "CLSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

//coupled LSTM layer
template <typename Dtype>
class ACLSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit ACLSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "ACLSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

template <typename Dtype>
class ACTLSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit ACTLSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "ACTLSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

template <typename Dtype>
class ACSLSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit ACSLSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "ACSLSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

template <typename Dtype>
class ACSSLSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit ACSSLSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "ACSSLSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

template <typename Dtype>
class ACSSLSTMStaticLayer : public RecurrentLayer<Dtype> {
 public:
  explicit ACSSLSTMStaticLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "ACSSLSTMStatic"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

template <typename Dtype>
class ATLSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit ATLSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "ATLSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};

  379. /** 
  380.  * @brief A helper for LSTMLayer: computes a single timestep of the 
  381.  *        non-linearity of the LSTM, producing the updated cell and hidden 
  382.  *        states. 
  383.  */  
  384. template <typename Dtype>  
  385. class LSTMUnitLayer : public Layer<Dtype> {  
  386.  public:  
  387.   explicit LSTMUnitLayer(const LayerParameter& param)  
  388.       : Layer<Dtype>(param) {}  
  389.   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,  
  390.       const vector<Blob<Dtype>*>& top);  
  391.   
  392.   virtual inline const char* type() const { return "LSTMUnit"; }  
  393.   virtual inline int ExactNumBottomBlobs() const { return 3; }  
  394.   virtual inline int ExactNumTopBlobs() const { return 2; }  
  395.   
  396.   virtual inline bool AllowForceBackward(const int bottom_index) const {  
  397.     // Can't propagate to sequence continuation indicators.  
  398.     return bottom_index != 2;  
  399.   }  
  400.   
  401.  protected:  
  402.   /** 
  403.    * @param bottom input Blob vector (length 3) 
  404.    *   -# @f$ (1 \times N \times D) @f$ 
  405.    *      the previous timestep cell state @f$ c_{t-1} @f$ 
  406.    *   -# @f$ (1 \times N \times 4D) @f$ 
  407.    *      the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$ 
  408.    *   -# @f$ (1 \times 1 \times N) @f$ 
  409.    *      the sequence continuation indicators  @f$ \delta_t @f$ 
  410.    * @param top output Blob vector (length 2) 
  411.    *   -# @f$ (1 \times N \times D) @f$ 
  412.    *      the updated cell state @f$ c_t @f$, computed as: 
  413.    *          i_t := \sigmoid[i_t'] 
  414.    *          f_t := \sigmoid[f_t'] 
  415.    *          o_t := \sigmoid[o_t'] 
  416.    *          g_t := \tanh[g_t'] 
  417.    *          c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) 
  418.    *   -# @f$ (1 \times N \times D) @f$ 
  419.    *      the updated hidden state @f$ h_t @f$, computed as: 
  420.    *          h_t := o_t .* \tanh[c_t] 
  421.    */  
  422.   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,  
  423.       const vector<Blob<Dtype>*>& top);  
  424.   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,  
  425.       const vector<Blob<Dtype>*>& top);  
  426.   
  427.   /** 
  428.    * @brief Computes the error gradient w.r.t. the LSTMUnit inputs. 
  429.    * 
  430.    * @param top output Blob vector (length 2), providing the error gradient with 
  431.    *        respect to the outputs 
  432.    *   -# @f$ (1 \times N \times D) @f$: 
  433.    *      containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$ 
  434.    *      with respect to the updated cell state @f$ c_t @f$ 
  435.    *   -# @f$ (1 \times N \times D) @f$: 
  436.    *      containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$ 
  437.    *      with respect to the updated hidden state @f$ h_t @f$  
  438.    * @param propagate_down see Layer::Backward. 
  439.    * @param bottom input Blob vector (length 3), into which the error gradients 
  440.    *        with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate 
  441.    *        inputs are computed.  Computation of the error gradients w.r.t.  
  442.    *        the sequence indicators is not implemented. 
  443.    *   -# @f$ (1 \times N \times D) @f$ 
  444.    *      the error gradient w.r.t. the previous timestep cell state 
  445.    *      @f$ c_{t-1} @f$ 
  446.    *   -# @f$ (1 \times N \times 4D) @f$ 
  447.    *      the error gradient w.r.t. the "gate inputs" 
  448.    *      @f$ [ 
  449.    *          \frac{\partial E}{\partial i_t} 
  450.    *          \frac{\partial E}{\partial f_t} 
  451.    *          \frac{\partial E}{\partial o_t} 
  452.    *          \frac{\partial E}{\partial g_t} 
  453.    *          ] @f$ 
  454.    *   -# @f$ (1 \times 1 \times N) @f$ 
  455.    *      the gradient w.r.t. the sequence continuation indicators 
  456.    *      @f$ \delta_t @f$ is currently not computed. 
  457.    */  
  458.   virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,  
  459.       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);  
  460.   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,  
  461.       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);  
  462.   
  463.   /// @brief The hidden and output dimension.  
  464.   int hidden_dim_;  
  465.   Blob<Dtype> X_acts_;  
  466. };  
  467.   
  468. /** 
  469.  * @brief Processes time-varying inputs using a simple recurrent neural network 
  470.  *        (RNN). Implemented as a network unrolling the RNN computation in time. 
  471.  * 
  472.  * Given time-varying inputs @f$ x_t @f$, computes hidden state @f$ 
  473.  *     h_t := \tanh[ W_{hh} h_{t-1} + W_{xh} x_t + b_h ]  
  474.  * @f$, and outputs @f$ 
  475.  *     o_t := \tanh[ W_{ho} h_t + b_o ] 
  476.  * @f$. 
  477.  */  
  478. template <typename Dtype>  
  479. class RNNLayer : public RecurrentLayer<Dtype> {  
  480.  public:  
  481.   explicit RNNLayer(const LayerParameter& param)  
  482.       : RecurrentLayer<Dtype>(param) {}  
  483.   
  484.   virtual inline const char* type() const { return "RNN"; }  
  485.   
  486.  protected:  
  487.   virtual void FillUnrolledNet(NetParameter* net_param) const;  
  488.   virtual void RecurrentInputBlobNames(vector<string>* names) const;  
  489.   virtual void RecurrentOutputBlobNames(vector<string>* names) const;  
  490.   virtual void OutputBlobNames(vector<string>* names) const;  
  491. };  
  492.   
  493. }  // namespace caffe  
  494.   
  495. #endif  // CAFFE_SEQUENCE_LAYERS_HPP_  
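
One practical note, not part of the listing above: for each new layer class declared in sequence_layers.hpp, the matching .cpp normally ends by instantiating the template and registering the type string with the layer factory. A minimal sketch for the ALSTM layer, assuming the stock Caffe registration macros:

// At the end of the layer's .cpp (e.g. the alstm.cpp shown earlier):
INSTANTIATE_CLASS(ALSTMLayer);   // instantiate the float and double templates
REGISTER_LAYER_CLASS(ALSTM);     // lets the layer factory resolve type: "ALSTM"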


I won't annotate the code line by line; feel free to leave a comment if anything is unclear. The main change is some extra processing before the LSTM Unit that computes the Mask (i.e., the attention weights). The attention computation is given here to make the code easier to follow.
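
The original formula image is not reproduced here; as a rough sketch in my own notation (W_{xa}, W_{ha}, w_a, b_a are illustrative parameters, X_{t,i} is the feature of region i at time t, with 49 regions per frame), the per-region score takes the usual soft-attention form:

S_{t,i} = w_a^\top \tanh\left( W_{xa} X_{t,i} + W_{ha} h_{t-1} + b_a \right), \quad i = 1, \dots, 49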


S is then fed into a softmax, which squashes the scores into [0, 1] (so they sum to 1 over the regions) and yields the attention mask. The tanh here can also be replaced with other nonlinearities.
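
As a rough illustration only (blob names such as "score_1" and "mask_1" are hypothetical, not the ones used in the listing above), this normalization can be expressed inside FillUnrolledNet with Caffe's Softmax layer applied along the region axis:

// Sketch: turn the per-region scores S (a 1 x N x 49 blob per timestep) into an
// attention mask that lies in [0, 1] and sums to 1 over the 49 regions.
LayerParameter softmax_param;
softmax_param.set_type("Softmax");
softmax_param.mutable_softmax_param()->set_axis(2);   // normalize over the regions

LayerParameter* mask_layer = net_param->add_layer();
mask_layer->CopyFrom(softmax_param);
mask_layer->set_name("mask_1");
mask_layer->add_bottom("score_1");   // S for timestep 1
mask_layer->add_top("mask_1");       // attention weights used to reweight X_t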




Addendum: after working through the dimensions for quite a while, using the LSTM layer as a reference, I got this layer to pass its tests. However, when I followed an AAAI paper and modified the code to apply attention to joint coordinates instead, the tests failed; I emailed the authors and got no reply, so I seriously doubt their results. Why don't coordinates work? Because the code above attends over image regions, whereas a joint coordinate is only three values, so the dimensionality is too low.


Reprinted from blog.csdn.net/sinat_22510827/article/details/80299484