# python深度学习--RNN

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylab
from pandas import DataFrame, Series
from keras import models, layers, optimizers, losses, metrics
from keras.utils.np_utils import to_categorical
'''
RNN伪代码
state_t = 0
for input_t in input_sequence:
    output_t = activation(dot(W, input_t) + dot(U, state_t) + b) 
    state_t = output_t
'''
#绘制结果
def acc_loss_plot(history):
    """Plot training/validation accuracy and loss curves in two stacked subplots.

    Args:
        history: Object exposing a ``history`` mapping (as returned by
            ``model.fit``) with per-epoch lists under ``'loss'`` and
            ``'val_loss'``, plus either ``'acc'``/``'val_acc'`` or
            ``'accuracy'``/``'val_accuracy'``.

    Raises:
        KeyError: If a required metric is missing from ``history.history``.
    """
    logs = history.history
    # The accuracy key follows the metric name given to compile(); accept the
    # long spelling as well so the helper is not tied to metrics=['acc'].
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    acc = logs[acc_key]
    val_acc = logs['val_' + acc_key]
    loss = logs['loss']
    val_loss = logs['val_loss']
    epochs = range(1, len(acc) + 1)

    fig = plt.figure()

    ax1 = fig.add_subplot(2, 1, 1)
    ax1.plot(epochs, acc, 'bo', label='Training acc')
    ax1.plot(epochs, val_acc, 'b', label='Validation acc')
    ax1.set_title('Training and validation accuracy')
    # plt.legend() only affects the current (last created) axes, so each
    # subplot needs its own legend call.
    ax1.legend()

    ax2 = fig.add_subplot(2, 1, 2)
    ax2.plot(epochs, loss, 'bo', label='Training loss')
    ax2.plot(epochs, val_loss, 'b', label='Validation loss')
    ax2.set_title('Training and validation loss')
    ax2.legend()

    plt.tight_layout()
    plt.show()

#简单的RNN的Numpy实现
def rnn_example(timesteps=100, input_features=32, output_features=64):
    """Run a minimal NumPy forward pass of a simple RNN over random inputs.

    At each timestep the output is
    ``tanh(dot(W, input_t) + dot(U, state_t) + b)`` and that output becomes
    the state fed into the next timestep. Inputs, weights and bias are drawn
    from ``np.random.random``; the initial state is all zeros.

    Args:
        timesteps: Number of timesteps in the input sequence.
        input_features: Dimensionality of the input feature space.
        output_features: Dimensionality of the output feature space.

    Returns:
        2D array of shape ``(timesteps, output_features)`` holding the output
        of every timestep. The array and its shape are also printed.
    """
    inputs = np.random.random((timesteps, input_features))  # random noise input
    state_t = np.zeros((output_features,))  # initial state: all zeros
    W = np.random.random((output_features, input_features))  # input weights
    U = np.random.random((output_features, output_features))  # recurrent weights
    b = np.random.random((output_features,))  # bias

    # Preallocate instead of stacking a list: np.stack raises on an empty
    # list, whereas this yields an empty (0, output_features) array when
    # timesteps is 0.
    final_output_sequence = np.empty((timesteps, output_features))
    for step, input_t in enumerate(inputs):
        # The new output doubles as the state for the next timestep.
        state_t = np.tanh(np.dot(W, input_t) + np.dot(U, state_t) + b)
        final_output_sequence[step] = state_t

    print(final_output_sequence)
    print(final_output_sequence.shape)
    return final_output_sequence
#RNN类似一个 for 循环，重复使用循环前一次迭代的计算结果
#RNN的特征在于其时间步函数

#Keras SimpleRNN
from keras.layers import SimpleRNN,Embedding
'''
    它接收形状为 (batch_size, timesteps, input_features) 的输入
    两种不同的模式下运行：
    一种是返回每个时间步连续输出的完整序列，即形状为 (batch_size, timesteps, output_features) 的三维张量；
    另一种是只返回每个输入序列的最终输出，即形状为 (batch_size, output_features) 的二维张量
    这两种模式由 return_sequences 这个构造函数参数来控制
'''
# Stack of SimpleRNN layers on top of an embedding, built to inspect the
# resulting architecture.
model = models.Sequential()
# Embedding(number of tokens, embedding dimension); returns a 3D tensor of
# shape (samples, sequence_length, embedding_dimensionality).
model.add(Embedding(10000, 32))
# return_sequences=True returns the full sequence of outputs; the default is
# False.
model.add(SimpleRNN(32, return_sequences=True))

# Stacking several recurrent layers can increase the representational power
# of the network; in that case every intermediate layer must return its full
# output sequence.
model.add(SimpleRNN(32, return_sequences=True))
model.add(SimpleRNN(32, return_sequences=True))
model.add(SimpleRNN(32))  # last layer keeps the default return_sequences

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
model.summary()

#将模型应用于IMDB电影评论分类
from keras.datasets import imdb
from keras.preprocessing import sequence

# Settings for the IMDB experiment.
max_features = 10000  # passed to imdb.load_data as num_words
maxlen = 500  # passed to pad_sequences as maxlen
batch_size = 32  # not used by the fit() call visible in this file (it passes 128)

print('Loading data...')
(input_train, y_train), (input_test, y_test) = imdb.load_data(
    num_words=max_features
)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

# Pad both splits to maxlen, train first and then test.
input_train, input_test = (
    sequence.pad_sequences(reviews, maxlen=maxlen)
    for reviews in (input_train, input_test)
)
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

#用Embedding层和SimpleRNN层训练模型
'''
from keras.layers import Dense
model = models.Sequential()
model.add(Embedding(max_features, 32))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(input_train, y_train, epochs=10,
batch_size=128, validation_split=0.2)

acc_loss_plot(history)#验证精度85%左右，并没有之前的简单网络表现好
'''
#部分原因：
# 输入只考虑了每条评论中的 500 个单词（pad_sequences 默认 truncating='pre'，保留的是末尾 500 个），而不是整个序列，因此，RNN获得的信息比前面的基准模型更少。 另一部分原因在于，SimpleRNN 不擅长处理长序列，比如文本
'''
#SimpleRNN 通常过于简化，没有实用价值
SimpleRNN的最大问题是，在时刻t，理论上来说它应该能够记住许多时间步之前见过的信息，但实际上它是不可能学到这种长期依赖的。原因在于梯度消失问题，
这一效应类似于在层数较多的非循环网络（即前馈网络）中观察到的效应：随着层数的增加，网络最终变得无法训练。

LSTM（long short-term memory）层和GRU层:都是为了解决这个问题而设计的
'''
#LSTM（long short-term memory）
'''
SimpleRNN的一种变体，它增加了一种携带信息跨越多个时间步的方法。假设有一条传送带，其运行方向平行于你所处理的序列。序列中的信息可以在任意位置跳上传送带， 然后被传送到更晚的时间步，并在需要时原封不动地跳回来。
这实际上就是 LSTM 的原理：它保存信息以便后面使用，从而防止较早期的信号在处理过程中逐渐消失。
#LSTM架构的详细伪代码
------------------------------------------------------------------------
output_t = activation(dot(state_t, Uo) + dot(input_t, Wo) + dot(c_t, Vo) + bo)
i_t = activation(dot(state_t, Ui) + dot(input_t, Wi) + bi) 
f_t = activation(dot(state_t, Uf) + dot(input_t, Wf) + bf) 
k_t = activation(dot(state_t, Uk) + dot(input_t, Wk) + bk)
对 i_t、f_t 和 k_t 进行组合，可以得到新的携带状态（下一个 c_t）
c_t+1 = i_t * k_t + c_t * f_t
-------------------------------------------------------------------------
#你不需要理解关于 LSTM 单元具体架构的任何内容。作为人类，理解它不应该是你要做的。你只需要记住 LSTM 单元的作用：允许过去的信息稍后重新进入，从而解决梯度消失问题
'''
#使用Keras中的LSTM层
from keras.layers import LSTM,Dense
# Sentiment classifier: embedding -> bidirectional LSTM -> sigmoid output.
model = models.Sequential([
    Embedding(max_features, 32),
    # A plain LSTM(32) layer is the unidirectional alternative here.
    layers.Bidirectional(layers.LSTM(32)),  # bidirectional RNN
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
# Hold out the fraction given by validation_split for validation.
history = model.fit(
    input_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
)
acc_loss_plot(history)  # original author's note: validation accuracy around 89%
'''
为什么 LSTM 不能表现得更好？
    一个原因是你没有花力气来调节超参数，比如嵌入维度或 LSTM 输出维度。另一个原因可能是缺少正则化。
    主要原因在于，适用于评论分析全局的长期性结构（这正是LSTM所擅长的），对情感分析问题帮助不大。对于这样的基本问题，观察每条评论中出现了哪些词及其出现频率就可以很好地解决。这也正是第一个全连接方法的做法。但还有更加困难的自然语言处理问题，特别是问答和机器翻译，这时LSTM的优势就明显了
    
'''
# python深度学习--RNN

# 猜你喜欢