【AI】_02_RNN (Recurrent Neural Network)


 


【一】 RNN (Recurrent Neural Network)
  • 3D schematic diagram
  • Unrolled (2D) schematic diagram (W is unchanged across steps: weight sharing)
  • Hidden layer

$h_0 = f(w_{hx} \cdot x_0)$

$h_1 = f(w_{hx} \cdot x_1 + w_{hh} \cdot h_0)$

$h_2 = f(w_{hx} \cdot x_2 + w_{hh} \cdot h_1)$

$\dots$

$h_n = f(w_{hx} \cdot x_n + w_{hh} \cdot h_{n-1})$
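
The recurrence above can be sketched in a few lines of NumPy to make the weight sharing explicit: the same $w_{hx}$ and $w_{hh}$ are reused at every time step. The sizes below are made up for illustration; the full forward pass appears in 【五】.

import numpy as np

T, input_dim, hidden_dim = 5, 3, 4             # illustrative sizes
w_hx = np.random.randn(hidden_dim, input_dim)  # shared across all time steps
w_hh = np.random.randn(hidden_dim, hidden_dim) # shared across all time steps
xs = np.random.randn(T, input_dim)

h = np.zeros(hidden_dim)                       # h_{-1} = 0, so h_0 = f(w_hx x_0)
for t in range(T):                             # the same weights are applied at every step
    h = np.tanh(w_hx @ xs[t] + w_hh @ h)
    print(t, h)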


【二】 Chain Rule Derivatives
  • Derivative of a hidden layer with respect to the previous hidden layer, where $\mathrm{diag}$ denotes a diagonal matrix (a numerical check of this Jacobian follows the list)
     
    $\frac{\partial h_t}{\partial h_{t-1}} = \mathrm{diag}\left(\sigma'(w_{hx} \cdot x_t + w_{hh} \cdot h_{t-1} + b_1)\right) \cdot w_{hh}$
  • Derivative of the loss with respect to a hidden layer
     
    $\frac{\partial J^{(i)}(\theta)}{\partial h_j} = \frac{\partial J^{(i)}(\theta)}{\partial h_i} \cdot \prod_{j < t \leq i} \frac{\partial h_t}{\partial h_{t-1}}$
    $= \frac{\partial J^{(i)}(\theta)}{\partial h_i} \cdot \prod_{j < t \leq i} \mathrm{diag}\left(\sigma'(w_{hx} \cdot x_t + w_{hh} \cdot h_{t-1} + b_1)\right) \cdot w_{hh}$
    $= \frac{\partial J^{(i)}(\theta)}{\partial h_i} \cdot w_{hh}^{(i-j)} \prod_{j < t \leq i} \mathrm{diag}\left(\sigma'(w_{hx} \cdot x_t + w_{hh} \cdot h_{t-1} + b_1)\right)$
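
With $f = \tanh$ (so $\sigma'(z) = 1 - \tanh^2(z)$), the per-step Jacobian $\frac{\partial h_t}{\partial h_{t-1}} = \mathrm{diag}(\sigma'(\cdot)) \cdot w_{hh}$ can be verified numerically. The sketch below uses made-up sizes and is not part of the original post; it only checks the formula against centered finite differences.

import numpy as np

np.random.seed(0)
H, V = 4, 3
w_hh = np.random.randn(H, H) * 0.5
w_hx = np.random.randn(H, V) * 0.5
b1 = np.random.randn(H) * 0.1
x_t = np.random.randn(V)
h_prev = np.random.randn(H)

def step(h):
    # one RNN step: h_t = tanh(w_hx x_t + w_hh h_{t-1} + b1)
    return np.tanh(w_hx @ x_t + w_hh @ h + b1)

h_t = step(h_prev)
J_analytic = np.diag(1 - h_t ** 2) @ w_hh      # diag(sigma'(...)) . w_hh

# numerical Jacobian via centered finite differences, one column at a time
eps = 1e-6
J_numeric = np.zeros((H, H))
for j in range(H):
    e = np.zeros(H); e[j] = eps
    J_numeric[:, j] = (step(h_prev + e) - step(h_prev - e)) / (2 * eps)

print(np.max(np.abs(J_analytic - J_numeric)))  # should be tiny, i.e. the two agree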

【三】 Vanishing and Exploding Gradients
  • Caused by the long chain of multiplications in the Jacobian product below (together with the repeated $w_{hh}$ factor pulled out above): repeatedly multiplying by small factors makes the gradient vanish, and by large factors makes it explode
     
    $\prod_{j < t \leq i} \mathrm{diag}\left(\sigma'(w_{hx} \cdot x_t + w_{hh} \cdot h_{t-1} + b_1)\right)$
  • Gradient clipping for exploding gradients (a code sketch appears after this list)
     
    $\theta^{(t+1)} = \theta^{(t)} - \eta_t \nabla_\theta f(\theta^{(t)})$
    $\text{s.t.}\quad \text{if } \left\|\nabla_\theta f(\theta^{(t)})\right\| \geq threshold, \text{ then:}$
    $\nabla_\theta f(\theta^{(t)}) \leftarrow \nabla_\theta f(\theta^{(t)}) \cdot \frac{threshold}{\left\|\nabla_\theta f(\theta^{(t)})\right\|}$
  • Gradient vanishing: use gated architectures such as LSTM or GRU
  • Regularization term: encourage $\partial Loss / \partial h_t \approx \partial Loss / \partial h_{t-1}$, so that the gradient magnitude is preserved across time steps

$\Omega = \sum_t \left( \frac{\left\| \nabla_{h_t} L \cdot \frac{\partial h_t}{\partial h_{t-1}} \right\|}{\left\| \nabla_{h_t} L \right\|} - 1 \right)^2$
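
A minimal NumPy sketch of clipping by global norm, following the formula in the gradient-clipping bullet above. The function name clip_by_global_norm and the default threshold are illustrative, not from the original post; in PyTorch the same operation is provided by torch.nn.utils.clip_grad_norm_.

import numpy as np

def clip_by_global_norm(grads, threshold=5.0):
    # grads: list of gradient arrays, one per parameter
    total_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    if total_norm >= threshold:
        scale = threshold / total_norm   # rescale so the global norm equals threshold
        grads = [g * scale for g in grads]
    return grads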


【四】 Code (Forward Propagation) — see the NumPy implementation in 【五】 and the PyTorch implementation in 【六】 below

【五】 RNN - NumPy Implementation
  • Encoding and decoding natural language (a usage example follows this code block)
import numpy as np

word = 'hello'
chars = list(set(word)) # ['e','h','l','o'] (the order may vary between runs)

# build two lookup dictionaries
char_to_ix = { ch:i for i,ch in enumerate(chars) } # encoding: character -> index
ix_to_char = { i:ch for i,ch in enumerate(chars) } # decoding: index -> character

def encoding(char_to_ix, data, targets):
    inputs = [char_to_ix[ch] for ch in data] # convert characters to indices, e.g. 2, 1, 3, 4
    targets = [char_to_ix[ch] for ch in targets] # convert target characters to indices
    n_values = len(char_to_ix)
    # convert the inputs to one-hot vectors, since the letters carry no ordinal meaning;
    # the targets can stay as class indices because prediction is a classification problem
    inputs = np.eye(n_values)[inputs]
    # targets = np.eye(n_values)[targets]
    return inputs, targets

def decoding(ix_to_char, data):
    outputs = [ix_to_char[ch] for ch in data]
    return outputs
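A quick sanity check of the two helpers (the exact index values depend on the set ordering above, so the printed numbers may differ between runs):
inputs, targets = encoding(char_to_ix, 'hell', 'ello')
print(inputs.shape)                   # (4, 4): one one-hot row per input character
print(targets)                        # class indices for 'e', 'l', 'l', 'o'
print(decoding(ix_to_char, targets))  # ['e', 'l', 'l', 'o']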
  • RNN forward propagation
# hyperparameters
hidden_size = 10 # size of hidden layer of neurons
seq_length = 4 # number of steps to unroll the RNN for
learning_rate = 1e-1
vocab_size = 4

# model parameters
Wxh = np.random.randn(vocab_size, hidden_size)*0.1 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.1 # hidden to hidden
Why = np.random.randn(hidden_size, vocab_size)*0.1 # hidden to output
bh = np.zeros((1, hidden_size)) # hidden bias
by = np.zeros((1, vocab_size)) # output bias

# forward propagation function
def rnn_forward(inputs, targets, Wxh, Whh, Why, bh, by):
    loss = 0
    hs, ys, ps = np.zeros((seq_length, hidden_size)), np.zeros((seq_length, vocab_size)), np.zeros((seq_length, vocab_size))
    for t in range(len(inputs)):
        if t == 0:
            hprev = np.zeros((1, hidden_size))
        else:
            hprev = hs[t-1]
        hs[t] = np.tanh(np.dot(inputs[t], Wxh) + np.dot(hprev, Whh) + bh) # hidden state
        ys[t] = np.dot(hs[t], Why) + by # unnormalized log probabilities for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # softmax probabilities
        loss += -np.log(ps[t][int(targets[t])])
    loss = loss/len(inputs)
    return ps, hs, loss

# inputs are one-hot encoded, targets are plain class indices
inputs, targets = encoding(char_to_ix, 'hell', 'ello')

# run the forward pass
ps, hs, loss = rnn_forward(inputs, targets, Wxh, Whh, Why, bh, by)
print ('probability is', ps)
ts = [p.argmax() for p in ps]
print ('label is' , ts)
print ('letter is' , decoding(ix_to_char, ts))
  • RNN backpropagation (a numerical gradient check follows this code block)
def rnn_backward(inputs, ps, hs, targets, Wxh, Whh, Why, bh, by):
    # np.zeros_like() creates zero arrays with the same shape as the given parameter
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])  # gradient flowing back from h_{t+1} into h_t
    for t in reversed(range(len(inputs))):
        # gradient of the loss with respect to the unnormalized scores y
        dy = np.copy(ps[t])
        dy[int(targets[t])] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(hs[t].reshape(hidden_size, 1), dy.reshape(1, vocab_size)) # update readout matrix
        dby += dy  # update readout bias
        # key step of BPTT: dh is composed of two parts, one through y and one from the next time step
        dh = np.dot(Why, dy) + dhnext
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through the tanh nonlinearity
        dbh += dhraw  # update hidden bias
        dWxh += np.dot(inputs[t].reshape(vocab_size, 1), dhraw.reshape(1, hidden_size))  # update Wxh through dhraw
        hprev = hs[t-1] if t > 0 else np.zeros_like(hs[0])  # the first step was fed a zero hidden state
        dWhh += np.dot(hprev.reshape(hidden_size, 1), dhraw.reshape(1, hidden_size))  # update Whh via the previous hidden state
        dhnext = np.dot(dhraw, Whh.T)  # pass the gradient back to the previous time step
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        # keep every gradient entry inside the range (-5, 5)
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    return dWxh, dWhh, dWhy, dbh, dby

dWxh, dWhh, dWhy, dbh, dby = rnn_backward(inputs, ps, hs, targets, Wxh, Whh, Why, bh, by)
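
As a sanity check, the analytic gradients from rnn_backward can be compared against centered finite differences. This sketch is not part of the original post; the helper name grad_check and the number of spot checks are illustrative. The two printed columns should match closely (rnn_forward averages the loss over time steps, so the analytic gradient is divided by len(inputs) to make the two comparable).

def grad_check(param, dparam, name, eps=1e-5, num_checks=5):
    # spot-check a few randomly chosen entries of one parameter array
    for _ in range(num_checks):
        idx = np.random.randint(param.size)
        old_val = param.flat[idx]
        param.flat[idx] = old_val + eps
        _, _, loss_plus = rnn_forward(inputs, targets, Wxh, Whh, Why, bh, by)
        param.flat[idx] = old_val - eps
        _, _, loss_minus = rnn_forward(inputs, targets, Wxh, Whh, Why, bh, by)
        param.flat[idx] = old_val  # restore the original value
        grad_numeric = (loss_plus - loss_minus) / (2 * eps)
        grad_analytic = dparam.flat[idx] / len(inputs)  # rnn_backward works with the summed loss
        print(name, grad_numeric, grad_analytic)

for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                               [dWxh, dWhh, dWhy, dbh, dby],
                               ['Wxh', 'Whh', 'Why', 'bh', 'by']):
    grad_check(param, dparam, name)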
  • Training and testing
def train(inputs, targets):
    ps, hs, loss = rnn_forward(inputs, targets, Wxh, Whh, Why, bh, by)
#     print (ps.argmax())
    dWxh, dWhh, dWhy, dbh, dby = rnn_backward(inputs, ps, hs, targets, Wxh, Whh, Why, bh, by)
    # perform a plain gradient-descent parameter update
    learning_rate = 1e-2
    for param, dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
        param += -learning_rate * dparam  # in-place update, so the global parameters are modified
    return loss
    
for i in range(10000):       
    loss = train(inputs, targets)
    if i%1000 == 0:
        print (' Training loss: %f' % loss)# print progress

# use ps for prediction; the gap between the predictions and the true labels measures how well training went
ps,_,_ = rnn_forward(inputs, targets, Wxh, Whh, Why, bh, by)
ps[0].argmax(), ps[1].argmax(), ps[2].argmax(), ps[3].argmax()
decoding(ix_to_char, [1,2,2,0])  # hard-coded indices; the actual values depend on the set ordering above

# the cleaner way to write it
out = [p.argmax() for p in ps]
decoding(ix_to_char, out)

【六】 RNN - PyTorch Implementation
  • Define the RNN structure
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):  
        super(RNN, self).__init__()
        self.hidden_size = hidden_size 
        self.i2h = nn.Parameter(torch.randn(input_size, hidden_size))
        self.h2h = nn.Parameter(torch.randn(hidden_size, hidden_size))
        self.h2o = nn.Parameter(torch.randn(hidden_size, output_size))
        self.bh = nn.Parameter(torch.randn(1, hidden_size))
        self.bo = nn.Parameter(torch.randn(1, output_size))
    
    def forward(self, inputs, hidden): 
        hidden = torch.tanh(inputs.matmul(self.i2h) + hidden.matmul(self.h2h) + self.bh)
        output = hidden.matmul(self.h2o)+ self.bo 
        return output, hidden 
    
    def initHidden(self): 
        return Variable(torch.zeros(1, self.hidden_size))
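
For comparison, the same recurrence can be written with PyTorch's built-in nn.RNNCell plus a linear readout. This is only a sketch of the equivalence; the class name BuiltinRNN is illustrative and not part of the original post.

class BuiltinRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BuiltinRNN, self).__init__()
        self.hidden_size = hidden_size
        self.cell = nn.RNNCell(input_size, hidden_size, nonlinearity='tanh')  # h' = tanh(W_ih x + W_hh h + b)
        self.readout = nn.Linear(hidden_size, output_size)

    def forward(self, inputs, hidden):
        hidden = self.cell(inputs, hidden)  # one recurrent step
        output = self.readout(hidden)       # unnormalized scores over the vocabulary
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)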
  • Test the network
# encode: inputs are one-hot, targets are plain class indices
inputs, targets = encoding(char_to_ix, 'hell', 'ello')
# convert inputs to a Tensor
inputs = Variable(torch.Tensor(inputs))
# initialize the hidden state (assumes rnn has already been instantiated, e.g. rnn = RNN(4, 2, 4) as in the training section below)
hidden = rnn.initHidden()
# feed one one-hot vector at a time
for letter in inputs:
    letter = letter.view(1, -1)
    out, hidden = rnn(letter, hidden)
    print(out)
    _, indice = out.view(-1).max(0)  # index of the predicted next letter
  • Training
rnn = RNN(input_size=4, hidden_size=2, output_size= 4)
learning_rate = 0.02 # If you set this too high, it might explode. If too low, it might not learn

# training function
def train(inputs, targets, rnn, epoch):
    # at the start of each sequence we reinitialize the hidden state to clear the history
    criterion = nn.CrossEntropyLoss()
    # initialize the hidden state
    hidden = rnn.initHidden()
    # dim 0 of inputs is the sequence length; -1 lets PyTorch infer the feature dimension
    inputs = inputs.view(inputs.size()[0], -1)
    # step through the sequence
    loss = 0
    # convert targets to a LongTensor
    targets = Variable(torch.LongTensor(targets))
    # inputs.size()[0] is the number of time steps
    for i in range(inputs.size()[0]):
        output, hidden = rnn(inputs[i], hidden)
        # torch.unsqueeze adds a batch dimension along dim=0
        loss += criterion(output, torch.unsqueeze(targets[i], dim=0))
        if epoch%100 == 0:
            # max(0) searches along dim 0; [1] returns the index of the maximum
            out = output.view(-1).max(0)[1]  # index of the max value along dim 0 of the output tensor
            inp = inputs[i].view(-1).max(0)[1]
            letter_i = ix_to_char[int(inp.data)]
            letter_o = ix_to_char[int(out.data)]
            print (letter_i, letter_o)
    # backpropagate the loss; retain_graph keeps the computation graph alive
    loss.backward(retain_graph = True)
    # parameter update: p.data = p.data - learning_rate * p.grad.data, assigned through .data
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
    # reset the gradients to zero
    rnn.zero_grad()
    return output, loss.data

# start training
for i in range(1500):
    output, loss = train(inputs, targets, rnn, i)
    if i%100 == 0:
        print ('loss', loss)
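
The manual parameter update above can equivalently be written with a standard PyTorch optimizer. This is a sketch of the usual pattern; the function name train_with_optimizer and the choice of plain SGD are illustrative, not from the original post.

import torch.optim as optim

optimizer = optim.SGD(rnn.parameters(), lr=learning_rate)

def train_with_optimizer(inputs, targets, rnn):
    criterion = nn.CrossEntropyLoss()
    hidden = rnn.initHidden()
    targets_t = Variable(torch.LongTensor(targets))
    loss = 0
    for i in range(inputs.size()[0]):
        output, hidden = rnn(inputs[i].view(1, -1), hidden)
        loss += criterion(output, torch.unsqueeze(targets_t[i], dim=0))
    optimizer.zero_grad()  # clear the gradients from the previous step
    loss.backward()        # backpropagate through time
    optimizer.step()       # apply the SGD update
    return loss.data

# usage: loss = train_with_optimizer(inputs, targets, rnn)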

【七】 RNN - Generation Task
  • Automatically generate the next letter from the previous one
# generate letters automatically (rnn is the network, seed_ix is the index of the seed letter, n is the desired sequence length)
def sample(rnn, seed_ix, n):
    x = torch.zeros(1, 4)
    hidden = rnn.initHidden()
    x[0][seed_ix] = 1
    x = Variable(x)
    ixes = []
    for t in range(n):
        out, hidden = rnn(x, hidden)
        prob = torch.softmax(out, dim=1)
        # sample from {0,1,2,3} according to the predicted probabilities instead of always taking the argmax
        m = np.random.choice(range(4), p = prob.data.numpy().ravel())
        x = torch.zeros(1, 4)
        x[0][m] = 1
        x = Variable(x)
        ixes.append(m)
        print(x)
    return ixes
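A quick way to try the sampler after training; the seed letter 'h' and the length 4 are illustrative.
seed_ix = char_to_ix['h']
ixes = sample(rnn, seed_ix, 4)
print(decoding(ix_to_char, ixes))  # letters sampled from the model's predicted distribution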


Source: blog.csdn.net/qq_34330456/article/details/98207637