TensorFlow - RNN-LSTM Recurrent Neural Networks (III)

network.py:

# -*- coding: utf-8 -*-

"" " 
neural network model related 
RNN-LSTM recurrent neural network 

then you can add all kinds of name_scope (namespace) 
with TensorBoard to visualize 

==== terms of concepts ==== 
# batch size:. batch number (sample) iteration (Forword computation (for obtaining the loss function) and BackPropagation operation (for updating the neural network parameter)) the number of samples used .Batch larger size, the need more memory the 
# iteration: iteration each iteration of the updated weights (network parameters), and each time the weight update requires Batch size data were Forward operation, then BP operations. 
# Epoch:. age / Times all training samples complete one iteration 

# if: a training set of 1000 samples, 10 BATCH_SIZE = 
# then: a complete sample set of training required: 100 iteration, 1 one Epoch 
# but in general we have more than a training Epoch 

==== super parameters (Hyper the parameter) ==== 
init_scale: weight parameters (weights) of the initial value of the span, a small start taking some of the more conducive to training 
learning_rate: learning rate, Training initially 1.0 
num_layers: LSTM layer number (the default is 2)
num_steps: LSTM Expand step (step) number, equal to the number of words per input batch (default 35) 
hidden_size: Dimension LSTM layer neuron number, the word is the vector (default is 650) 
max_lr_epoch: initial learning with the number of training Epoch (default 10) 
Dropout: Dropout in retention layer (default is 0.5) 
lr_decay: after a max_lr_epoch the attenuation rate of each Epoch learning rate, initial training 0.93. Let the learning rate gradually decay is an effective way to improve training efficiency of 
batch_size: batch number (sample). Iteration (Forword computation (for obtaining the loss function) and BackPropagation operation (for updating the neural network parameter)) the number of samples used 
(the default is 20. batch_size take relatively small batch_size more conducive Stochastic Gradient Descent (stochastic gradient descent) prevent trapped in a local minimum) 
"" " 

import tensorflow as tf


# The neural-network model
class Model(object):
    # Constructor
    def __init__(self, input_obj, is_training, hidden_size, vocab_size, num_layers,
                 dropout=0.5, init_scale=0.05):
        self.is_training = is_training
        self.input_obj = input_obj
        self.batch_size = input_obj.batch_size
        self.num_steps = input_obj.num_steps
        self.hidden_size = hidden_size

        # Pin the variables and operations below to the CPU, because
        # embedding_lookup is (apparently) not yet implemented for the GPU
        with tf.device("/cpu:0"):
            # Create the word embeddings; "embedding" denotes a dense vector.
            # Word vectors are essentially a way of clustering words.
            embedding = tf.Variable(tf.random_uniform([vocab_size, self.hidden_size], -init_scale, init_scale))
            # embedding_lookup returns the word vectors for the input word ids
            inputs = tf.nn.embedding_lookup(embedding, self.input_obj.input_data)
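
            # Shape illustration (using this post's defaults, batch_size=20,
            # num_steps=35, hidden_size=650): input_data holds word ids of
            # shape [20, 35]; embedding_lookup maps each id to its row of
            # `embedding`, so `inputs` has shape [20, 35, 650],
            # one 650-dimensional vector per word.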

        # If we are training and dropout is less than 1, pass the inputs
        # through a dropout layer. Dropout prevents overfitting.
        if is_training and dropout < 1:
            inputs = tf.nn.dropout(inputs, dropout)

        # Storage and retrieval of the state.
        # The second dimension is 2 because each LSTM unit receives two
        # inputs from the unit at the previous time step:
        # the output of the previous time step, h(t-1),
        # and the cell state of the previous time step, c(t-1).
        # These c and h are used below to build tf.contrib.rnn.LSTMStateTuple
        self.init_state = tf.placeholder(tf.float32, [num_layers, 2, self.batch_size, self.hidden_size])
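
        # A sketch (an assumption about the training loop, which is not part
        # of this file) of how a caller would feed an all-zero initial state
        # for this placeholder:
        #   import numpy as np
        #   zeros = np.zeros((num_layers, 2, batch_size, hidden_size), np.float32)
        #   session.run(model.train_op, feed_dict={model.init_state: zeros, ...})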

        # The state of each layer
        state_per_layer_list = tf.unstack(self.init_state, axis=0)

        # The initial state (containing each layer's previous-time-step
        # output h(t-1) and cell state c(t-1)), for dynamic_rnn below
        rnn_tuple_state = tuple(
            [tf.contrib.rnn.LSTMStateTuple(state_per_layer_list[idx][0], state_per_layer_list[idx][1])
             for idx in range(num_layers)]
        )
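
        # For the default num_layers=2 this tuple is:
        #   (LSTMStateTuple(c=init_state[0][0], h=init_state[0][1]),
        #    LSTMStateTuple(c=init_state[1][0], h=init_state[1][1]))
        # i.e. one (c, h) pair per layer.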

        # Create an LSTM layer whose number of neurons is hidden_size (default 650)
        cell = tf.contrib.rnn.LSTMCell(hidden_size)

        # If we are training and dropout is less than 1, add a dropout
        # operation to the LSTM layer.
        # Here dropout is applied only to the output, with a keep probability
        # (output_keep_prob) of 0.5.
        # The input keep probability defaults to 1, so no dropout is applied
        # to the inputs.
        if is_training and dropout < 1:
            cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)

        # If the number of LSTM layers is greater than 1, create num_layers
        # LSTM layers in total and wrap them all in a MultiRNNCell, which
        # serializes them into a layered model.
        # state_is_tuple=True means the input state is accepted in
        # LSTMStateTuple form
        if num_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell([cell for _ in range(num_layers)], state_is_tuple=True)
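
        # Note: some TensorFlow 1.x releases complain when the same cell
        # object is reused across layers; a common variant (a substitute,
        # not this post's code) builds one instance per layer:
        #   cells = [tf.contrib.rnn.LSTMCell(hidden_size) for _ in range(num_layers)]
        #   cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)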

        # dynamic_rnn (dynamic RNN) lets batches passed in at different
        # iterations have different lengths,
        # but all the data within one batch still has the same fixed length.
        # dynamic_rnn handles padding (zero padding) better and saves
        # computing resources.
        # It returns two variables:
        # The first is the outputs of all LSTM units unrolled over the time
        # dimension of one batch (default 35); its default shape is
        # [20, 35, 650], and it is flattened afterwards.
        # The second is the final state, containing the current time step's
        # output h(t) and the current time step's cell state c(t).
        output, self.state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32, initial_state=rnn_tuple_state)

        # Flatten: change the shape of the output to
        # (batch_size * num_steps, hidden_size), default shape [700, 650]
        output = tf.reshape(output, [-1, hidden_size])  # -1 means the dimension size is derived automatically
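
        # Shape check: [20, 35, 650] flattens to [20 * 35, 650] = [700, 650],
        # one 650-dimensional LSTM output per (batch entry, time step) pair.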

        # The softmax weights (W)
        softmax_w = tf.Variable(tf.random_uniform([hidden_size, vocab_size], -init_scale, init_scale))
        # The softmax bias (b)
        softmax_b = tf.Variable(tf.random_uniform([vocab_size], -init_scale, init_scale))

        # logits is the result (the scores) of the linear equation of a
        # logistic-regression (classification) model: y = W * x + b.
        # These logits (scores) are then converted into probabilities by a
        # softmax.
        # output is the input (x), softmax_w is the weight (W), softmax_b is
        # the bias (b).
        # Returns the result of W * x + b
        logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
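
        # Shape check: [700, 650] x [650, 10000] + [10000] -> [700, 10000],
        # one score per vocabulary word for each of the 700 positions.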

        # Reshape the logits tensor back to three dimensions in order to
        # compute the sequence loss;
        # default shape [20, 35, 10000]
        logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size])

        # Compute the cross-entropy loss of the sequence of logits
        loss = tf.contrib.seq2seq.sequence_loss(
            logits,  # default shape [20, 35, 10000]
            self.input_obj.targets,  # the desired output, default shape [20, 35]
            tf.ones([self.batch_size, self.num_steps], dtype=tf.float32),
            average_across_timesteps=False,
            average_across_batch=True)
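
        # With average_across_timesteps=False and average_across_batch=True,
        # sequence_loss averages over the batch but not over time, so `loss`
        # has shape [num_steps] (default [35]): one value per time step.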

        # The cost to minimize: sum the loss over all time steps
        self.cost = tf.reduce_sum(loss)

        # The probabilities computed by the softmax
        self.softmax_out = tf.nn.softmax(tf.reshape(logits, [-1, vocab_size]))

        # Take the index of the maximum probability as the prediction
        self.predict = tf.cast(tf.argmax(self.softmax_out, axis=1), tf.int32)

        # Compare the predicted values with the true values (the targets)
        correct_prediction = tf.equal(self.predict, tf.reshape(self.input_obj.targets, [-1]))

        # Compute the prediction accuracy
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
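
        # Shape check: softmax_out is [700, 10000], so predict and the
        # reshaped targets are both [700]; accuracy is the fraction of the
        # 700 positions where the predicted word id matches the target.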

        # If this is not training (i.e. testing), return here
        if not is_training:
            return

        # The learning rate. trainable=False means "cannot be trained"
        self.learning_rate = tf.Variable(0.0, trainable=False)

        # Return all variables that can be trained (trainable=True; unless
        # you set trainable=False, a Variable is trainable by default),
        # i.e. all variables except the untrainable learning rate
        tvars = tf.trainable_variables()

        # tf.clip_by_global_norm implements gradient clipping, to prevent
        # exploding gradients.
        # tf.gradients computes the gradients (derivatives) of self.cost
        # with respect to tvars and returns a list of gradients
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), 5)
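
        # clip_by_global_norm rescales every gradient g_i by
        # 5 / max(global_norm, 5), where global_norm = sqrt(sum_i ||g_i||^2),
        # so the combined gradient norm never exceeds 5.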

        # The optimizer uses GradientDescentOptimizer (gradient descent)
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)

        # apply_gradients applies the gradients trimmed above (by gradient
        # clipping) to the trainable variables to perform gradient descent.
        # apply_gradients is actually the second step inside the minimize
        # method; the first step is computing the gradients.
        self.train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.train.get_or_create_global_step())

        # For updating the learning rate
        self.new_lr = tf.placeholder(tf.float32, shape=[])
        self.lr_update = tf.assign(self.learning_rate, self.new_lr)

    # Update the learning rate
    def assign_lr(self, session, lr_value):
        session.run(self.lr_update, feed_dict={self.new_lr: lr_value})
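
For reference, here is a minimal sketch of how a training loop might drive
assign_lr with the decay schedule described in the docstring (the training
script is not part of this post, so the Model constructor arguments, the
session setup, and num_epochs here are assumptions):

    # train.py (hypothetical usage sketch)
    learning_rate = 1.0   # initial learning rate
    max_lr_epoch = 10     # epochs trained at the initial rate
    lr_decay = 0.93       # per-epoch decay afterwards

    # m = Model(input_obj, is_training=True, hidden_size=650,
    #           vocab_size=10000, num_layers=2)
    # with tf.Session() as session:
    #     for epoch in range(num_epochs):
    #         decay = lr_decay ** max(epoch + 1 - max_lr_epoch, 0.0)
    #         m.assign_lr(session, learning_rate * decay)
    #         # ...run one epoch of batches, feeding init_state...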


Source: www.cnblogs.com/SCCQ/p/12346352.html