视频链接: 29、PyTorch RNN的原理及其手写复现_哔哩哔哩_bilibili
PyTorch RNN API:https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
单向 RNN API
首先实例化一些参数:
import torch
import torch.nn as nn
import torch.nn.functional as F
# Batch size and input sequence length.
batch_size = 2
seq_len = 3
# Per-step input feature size and hidden-state size.
input_size = 2
hidden_size = 3
# Randomly initialised feature sequence, shape [batch_size, seq_len, input_size].
input = torch.randn((batch_size, seq_len, input_size))
# Initial hidden state (all zeros), shape [batch_size, hidden_size].
h_prev = torch.zeros((batch_size, hidden_size))
调用PyTorch中的 RNN API:
# Single-layer RNN (num_layers defaults to 1) operating on batch-first input.
rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)
# The API takes the initial state as [num_layers, b, hidden_size], so add a leading axis.
output, h_n = rnn(input, h_prev[None])
看一下返回的结果的形状:
# output: [batch_size, seq_len, hidden_size] -> [2, 3, 3]
print(output.size())
# h_n: [num_layers, batch_size, hidden_size] -> [1, 2, 3]
print(h_n.size())
这里输出一下rnn中的参数名称及其形状:
# List every learnable parameter of the RNN together with its shape.
for name, para in rnn.named_parameters():
    print(name, para.shape)
输出结果如下:
weight_ih_l0 torch.Size([3, 2]) # [hidden_size, input_size]
weight_hh_l0 torch.Size([3, 3]) # [hidden_size, hidden_size]
bias_ih_l0 torch.Size([3]) # [hidden_size]
bias_hh_l0 torch.Size([3]) # [hidden_size]
手写 rnn_forward 函数
手写一个rnn_forward函数,实现RNN的计算原理。视频中的PyTorch官网公式与目前的不太一样,于是采用当前官网上的计算公式,如下:
$$h_t = \tanh\left(x_t W_{ih}^{T} + b_{ih} + h_{t-1} W_{hh}^{T} + b_{hh}\right)$$
这里先将rnn_forward函数中的每个参数的维度写出来:
:param input: [batch_size, seq_len, input_size] input_size就是feature_size
:param weight_ih: [hidden_size, input_size]
:param weight_hh: [hidden_size, hidden_size]
:param bias_ih: [hidden_size]
:param bias_hh: [hidden_size]
:param h_prev: [batch_size, hidden_size]
:return: output: [b, seq_len, D*hidden_size] h_n: [D*num_layers, b, hidden_size]
def rnn_forward(input, weight_ih, weight_hh, bias_ih, bias_hh, h_prev):
    """Run a single-layer, unidirectional tanh RNN over a batch-first sequence.

    Every time step applies
    ``h_t = tanh(x_t @ W_ih^T + b_ih + h_{t-1} @ W_hh^T + b_hh)``.

    :param input: [batch_size, seq_len, input_size] feature sequence
    :param weight_ih: [hidden_size, input_size]
    :param weight_hh: [hidden_size, hidden_size]
    :param bias_ih: [hidden_size]
    :param bias_hh: [hidden_size]
    :param h_prev: [batch_size, hidden_size] initial hidden state
    :return: ``(output, h_n)``; ``output`` is [batch_size, seq_len, hidden_size]
        and holds the hidden state of every step, ``h_n`` is
        [1, batch_size, hidden_size] and holds the state after the last step
        (or the initial state when ``seq_len`` is 0)
    """
    batch_size, seq_len, _ = input.shape
    hidden_size = h_prev.shape[-1]
    # Allocate on the input's dtype/device so float64 or GPU inputs are not
    # silently cast to float32 / moved to the CPU.
    output = torch.zeros(batch_size, seq_len, hidden_size,
                         dtype=input.dtype, device=input.device)
    # The transposed weights do not depend on t, so build them once. A plain
    # matmul broadcasts over the batch; no per-sample weight copies are needed.
    w_ih_t = weight_ih.transpose(0, 1)  # [input_size, hidden_size]
    w_hh_t = weight_hh.transpose(0, 1)  # [hidden_size, hidden_size]
    for t in range(seq_len):  # cost grows linearly with the sequence length
        x_t = input[:, t, :]  # features of the current step, [b, input_size]
        # [b, input_size] @ [input_size, hidden] and [b, hidden] @ [hidden, hidden]
        # both give [b, hidden]; the biases are broadcast over the batch.
        h_prev = torch.tanh(x_t @ w_ih_t + bias_ih + h_prev @ w_hh_t + bias_hh)
        output[:, t, :] = h_prev
    # The built-in API returns a 3-D final state, so add a leading axis.
    return output, h_prev.unsqueeze(0)
验证一下 rnn_forward 的准确性:
# Reuse the parameters of the built-in rnn module so both implementations
# compute with identical weights; the "_me" suffix marks the hand-written results.
output_me, h_n_me = rnn_forward(
    input,
    weight_ih=rnn.weight_ih_l0,
    weight_hh=rnn.weight_hh_l0,
    bias_ih=rnn.bias_ih_l0,
    bias_hh=rnn.bias_hh_l0,
    h_prev=h_prev,
)
打印一下,看两个的计算结果是否相同:
# Built-in API results: output is [batch_size, seq_len, hidden_size] = [2, 3, 3],
# h_n is [num_layers, batch_size, hidden_size] = [1, 2, 3].
print("PyTorch API output:")
print(output, h_n, sep="\n")
# Hand-written results, with the same shapes: [2, 3, 3] and [1, 2, 3].
print("\nrnn_forward function output:")
print(output_me, h_n_me, sep="\n")
结果如下,完全一致,说明手写的是对的:
PyTorch API output:
tensor([[[ 0.5930, 0.8005, 0.6494],
[-0.8536, 0.0442, 0.5911],
[-0.6422, -0.5200, 0.0830]],
[[ 0.4581, 0.4190, 0.3732],
[-0.5581, -0.1656, 0.3779],
[-0.4699, 0.5400, 0.7099]]], grad_fn=<TransposeBackward1>)
tensor([[[-0.6422, -0.5200, 0.0830],
[-0.4699, 0.5400, 0.7099]]], grad_fn=<StackBackward0>)
rnn_forward function output:
tensor([[[ 0.5930, 0.8005, 0.6494],
[-0.8536, 0.0442, 0.5911],
[-0.6422, -0.5200, 0.0830]],
[[ 0.4581, 0.4190, 0.3732],
[-0.5581, -0.1656, 0.3779],
[-0.4699, 0.5400, 0.7099]]], grad_fn=<CopySlices>)
tensor([[[-0.6422, -0.5200, 0.0830],
[-0.4699, 0.5400, 0.7099]]], grad_fn=<UnsqueezeBackward0>)
双向 RNN API
# Bidirectional single-layer RNN (num_layers defaults to 1).
bi_rnn = nn.RNN(input_size, hidden_size, batch_first=True, bidirectional=True)
# Initial hidden state: one [b, hidden_size] slice per direction.
h_prev = torch.zeros(2, batch_size, hidden_size)
bi_output, bi_h_n = bi_rnn(input, h_prev)  # h_prev: [D*num_layers, b, hidden_size]
# List every learnable parameter (forward and reverse direction) with its shape.
for name, para in bi_rnn.named_parameters():
    print(name, "\t", para.shape)
输出结果如下:
weight_ih_l0 torch.Size([3, 2]) # [hidden_size, input_size]
weight_hh_l0 torch.Size([3, 3]) # [hidden_size, hidden_size]
bias_ih_l0 torch.Size([3]) # [hidden]
bias_hh_l0 torch.Size([3]) # [hidden]
weight_ih_l0_reverse torch.Size([3, 2])
weight_hh_l0_reverse torch.Size([3, 3])
bias_ih_l0_reverse torch.Size([3])
bias_hh_l0_reverse torch.Size([3])
手写 bidirectional_rnn_forward 函数
手写一个bidirectional_rnn_forward函数,实现双向RNN计算原理:
def bidirectional_rnn_forward(input, weight_ih, weight_hh, bias_ih, bias_hh, h_prev,
                              weight_ih_reverse, weight_hh_reverse, bias_ih_reverse, bias_hh_reverse, h_prev_reverse):
    """Run a single-layer bidirectional tanh RNN built from two rnn_forward passes.

    The forward direction reads ``input`` as given; the backward direction
    reads it flipped along the sequence axis and its outputs are flipped back
    so that both halves of ``output`` are aligned step by step.

    :param input: [batch_size, seq_len, input_size]
    :param weight_ih, weight_hh, bias_ih, bias_hh: forward-direction parameters
        ([hidden_size, input_size], [hidden_size, hidden_size], [hidden_size],
        [hidden_size])
    :param h_prev: [batch_size, hidden_size] forward initial hidden state
    :param weight_ih_reverse, weight_hh_reverse, bias_ih_reverse, bias_hh_reverse:
        backward-direction parameters, same shapes as the forward ones
    :param h_prev_reverse: [batch_size, hidden_size] backward initial hidden state
    :return: ``(output, h_n)``; ``output`` is [batch_size, seq_len, 2*hidden_size]
        (forward states in the first half of the last axis, backward states in
        the second half) and ``h_n`` is [2, batch_size, hidden_size]
        (index 0 = forward final state, index 1 = backward final state)
    """
    batch_size, seq_len, _ = input.shape
    hidden_size = h_prev.shape[-1]
    # A bidirectional layer emits twice the hidden size per step. Allocate on
    # the input's dtype/device instead of the float32/CPU defaults.
    output = torch.zeros(batch_size, seq_len, hidden_size * 2,
                         dtype=input.dtype, device=input.device)
    # Forward layer.
    forward_out, forward_h_n = rnn_forward(input, weight_ih, weight_hh, bias_ih, bias_hh, h_prev)
    # Backward layer: feed the sequence reversed along the seq_len axis.
    backward_out, backward_h_n = rnn_forward(torch.flip(input, [1]), weight_ih_reverse, weight_hh_reverse,
                                             bias_ih_reverse, bias_hh_reverse, h_prev_reverse)
    output[:, :, :hidden_size] = forward_out
    # Undo the flip so position t holds the backward state for input step t.
    output[:, :, hidden_size:] = torch.flip(backward_out, [1])
    # Use the final states rnn_forward already returns ([1, b, hidden_size] each)
    # rather than re-deriving them by indexing the last step of each output.
    h_n = torch.zeros(batch_size, 2, hidden_size, dtype=input.dtype, device=input.device)
    h_n[:, 0, :] = forward_h_n[0]  # state after the last forward step
    h_n[:, 1, :] = backward_h_n[0]  # state after the last backward step
    return output, h_n.transpose(0, 1)  # -> [D*num_layers, b, hidden_size]
验证一下 bidirectional_rnn_forward 的准确性:
# h_prev is [2, b, hidden_size]; h_prev[0] and h_prev[1] are each [b, hidden_size],
# which is the initial-state shape rnn_forward expects for one direction.
bi_output_me, bi_h_n_me = bidirectional_rnn_forward(
    input,
    # forward-direction parameters and initial state
    bi_rnn.weight_ih_l0, bi_rnn.weight_hh_l0,
    bi_rnn.bias_ih_l0, bi_rnn.bias_hh_l0,
    h_prev[0],
    # reverse-direction parameters and initial state
    bi_rnn.weight_ih_l0_reverse, bi_rnn.weight_hh_l0_reverse,
    bi_rnn.bias_ih_l0_reverse, bi_rnn.bias_hh_l0_reverse,
    h_prev[1],
)
# Built-in API results: bi_output is [b, seq_len, D*hidden_size] = [2, 3, 6],
# bi_h_n is [D*num_layers, b, hidden_size] = [2, 2, 3].
print("PyTorch API output:")
print(bi_output, bi_h_n, sep="\n")
# Hand-written results, with the same shapes: [2, 3, 6] and [2, 2, 3].
print("\nbidirectional_rnn_forward function output:")
print(bi_output_me, bi_h_n_me, sep="\n")
输出的结果如下,完全一致,说明手写的是对的:
PyTorch API output:
tensor([[[-0.1550, -0.6962, -0.1232, -0.4596, -0.5607, 0.4513],
[-0.0293, -0.7499, -0.2036, -0.3674, -0.4376, 0.2990],
[-0.2439, 0.1274, -0.7016, 0.3596, 0.3512, -0.6899]],
[[-0.1954, -0.1316, -0.8758, 0.5571, 0.1206, -0.5847],
[-0.3007, -0.1827, -0.7897, 0.3305, 0.0416, -0.5789],
[-0.3381, -0.2915, -0.3005, -0.0482, -0.0247, -0.2440]]],
grad_fn=<TransposeBackward1>)
tensor([[[-0.2439, 0.1274, -0.7016],
[-0.3381, -0.2915, -0.3005]],
[[-0.4596, -0.5607, 0.4513],
[ 0.5571, 0.1206, -0.5847]]], grad_fn=<StackBackward0>)
bidirectional_rnn_forward function output:
tensor([[[-0.1550, -0.6962, -0.1232, -0.4596, -0.5607, 0.4513],
[-0.0293, -0.7499, -0.2036, -0.3674, -0.4376, 0.2990],
[-0.2439, 0.1274, -0.7016, 0.3596, 0.3512, -0.6899]],
[[-0.1954, -0.1316, -0.8758, 0.5571, 0.1206, -0.5847],
[-0.3007, -0.1827, -0.7897, 0.3305, 0.0416, -0.5789],
[-0.3381, -0.2915, -0.3005, -0.0482, -0.0247, -0.2440]]],
grad_fn=<CopySlices>)
tensor([[[-0.2439, 0.1274, -0.7016],
[-0.3381, -0.2915, -0.3005]],
[[-0.4596, -0.5607, 0.4513],
[ 0.5571, 0.1206, -0.5847]]], grad_fn=<TransposeBackward0>)