Simple RNN: a PyTorch implementation
Before getting into the code, one caveat: many simple-RNN implementations found on the Internet are wrong, and the one in this post is not fully correct either. Why?
Because the gradient-descent code for a simple RNN has to be written by hand: a simple RNN cannot rely on PyTorch's default gradient mechanism, otherwise the gradient vanishes outright. I ran a lot of experiments and at first kept assuming that my code simply had a bug. My conclusion is that a simple RNN cannot be trained with the ordinary gradient-descent procedure; it has to be trained with gradient descent through time (backpropagation through time). In other words, to reproduce a simple RNN you need to write the gradient-descent code yourself, and you cannot just build a model and train it directly. Using only two or three layers of simple RNN is fine, however, because the vanishing gradient is not yet severe. Here is the code:
#coding=gbk
import torch
from torch.autograd import Variable
import os
from torch.utils import data
import matplotlib.pyplot as plt
import torch.nn.functional as F
import numpy as np
import torch.nn as nn
# --- Hyperparameters -------------------------------------------------------
sample_num = 1000       # number of generated sequences
sequense_num = 10       # time steps per sequence
input_length = 10       # features per time step
train_de_test = 0.8
hidden_size = 10        # width of the RNN hidden state
num_epochs = 100
batch = 32
learning_rate = 0.01
# Uncomment to make the run reproducible:
# torch.manual_seed(10)

# --- Synthetic two-class data ----------------------------------------------
# Even-numbered samples hold integers drawn from [0, 5) and get label 1;
# odd-numbered samples hold integers drawn from [6, 10) and get label 0.
generated_sequences = []
generated_labels = []
for sample_idx in range(sample_num):
    is_low_valued = sample_idx % 2 == 0
    low, high = (0, 5) if is_low_valued else (6, 10)
    generated_sequences.append(
        torch.randint(low, high, (sequense_num, input_length))
    )
    generated_labels.append(1 if is_low_valued else 0)

# Stack into one (sample_num, sequense_num, input_length) float32 tensor.
x_data = torch.stack(generated_sequences, 0).type(dtype=torch.float32)
y_data = torch.as_tensor(generated_labels)
print(x_data, y_data)
print(x_data.size())
# Network definition
class sRNN(nn.Module):
    """Single-layer simple RNN with a sigmoid read-out of the last hidden state.

    For every time step ``t`` the hidden state is updated as::

        h_t = relu(h_{t-1} @ U + x_t @ W + b)

    and the module returns ``sigmoid(h_T @ V)`` for the final step ``T``.

    Args:
        sequense_num: Number of time steps read from each input sequence.
        input_length: Number of features per time step.
        hidden_size: Width of the hidden state.
    """

    def __init__(self, sequense_num, input_length, hidden_size):
        super().__init__()
        self.sequense_num = sequense_num
        self.input_length = input_length
        self.hidden_size = hidden_size
        # Weight matrices are scaled by 1/sqrt(fan_in).  With unit-variance
        # weights the ReLU recurrence grows the hidden state at every step,
        # which saturates the final sigmoid and drives the gradients to zero.
        # Input-to-hidden weights: (input_length, hidden_size).
        self.W = nn.Parameter(
            torch.randn(input_length, hidden_size) / input_length ** 0.5
        )
        # Hidden-to-hidden weights.  They multiply the hidden state, so the
        # matrix must be (hidden_size, hidden_size), not
        # (input_length, hidden_size).
        self.U = nn.Parameter(
            torch.randn(hidden_size, hidden_size) / hidden_size ** 0.5
        )
        self.b = nn.Parameter(torch.randn(1, hidden_size))
        # Hidden-to-output weights: one output unit.
        self.V = nn.Parameter(torch.randn(hidden_size, 1) / hidden_size ** 0.5)
        self.f = nn.Sigmoid()

    def forward(self, input):
        """Run the recurrence over the sequence and return the read-out.

        Args:
            input: Tensor indexed as ``input[:, t, :]``, i.e. laid out as
                (batch, time step, feature) with ``input_length`` features.

        Returns:
            Tensor of shape (batch, 1) with values in [0, 1].
        """
        # Use the instance's own sizes rather than module-level globals so
        # the model works for any configuration it was constructed with.
        hidden = torch.zeros(
            1, self.hidden_size, dtype=self.U.dtype, device=self.U.device
        )
        for step in range(self.sequense_num):
            pre_activation = (
                torch.matmul(hidden, self.U)
                + torch.matmul(input[:, step, :], self.W)
                + self.b
            )
            hidden = F.relu(pre_activation)
        return self.f(torch.matmul(hidden, self.V))

    def backward(self):
        """No-op placeholder kept for compatibility; it computes nothing."""
        pass
# Binary cross-entropy loss, applied to the model output and the 0/1 labels.
loss_fn = nn.BCELoss()
def sampling(sample_num):
    """Return a random permutation of the indices ``0 .. sample_num - 1``.

    Args:
        sample_num: Number of indices to permute.

    Returns:
        1-D integer tensor of length ``sample_num``.
    """
    return torch.randperm(sample_num)
def get_batch(index_sequense, X_data, Y_data, index, bacth):
    """Return one mini-batch of samples and labels in shuffled order.

    Args:
        index_sequense: Permutation of sample indices (e.g. from
            ``torch.randperm``) that defines the visiting order.
        X_data: Tensor of samples, indexed along its first dimension.
        Y_data: Tensor of labels, indexed along its first dimension.
        index: Start position of the batch inside ``index_sequense``.
        bacth: Maximum number of samples in the batch.

    Returns:
        Tuple ``(x_batch, y_batch)`` holding the selected rows of ``X_data``
        and ``Y_data``.  Fewer than ``bacth`` rows are returned when the
        slice runs past the end of ``index_sequense``.
    """
    # Select rows through the permutation.  Slicing X_data/Y_data directly
    # would ignore the shuffle and always walk the data in storage order.
    batch_indices = index_sequense[index:index + bacth]
    return X_data[batch_indices], Y_data[batch_indices]
# --- Model, loss and optimiser ---------------------------------------------
srnn = sRNN(sequense_num, input_length, hidden_size)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(srnn.parameters(), lr=0.1)
co = 0

# Dump the initial parameters.  Iterating a state_dict yields its keys, so
# .items() is used to get each parameter name together with its tensor.
for param_name, param_value in srnn.state_dict().items():
    print(param_name, '\t', param_value.size())
    print(param_value)

# --- Training loop ---------------------------------------------------------
acc_list = []
loss_list = []
index = 0
index_sequense = sampling(sample_num)
# NOTE: every iteration processes one mini-batch, so num_epochs actually
# counts optimisation steps rather than full passes over the data.
for k in range(num_epochs):
    # Start a new pass (with a fresh shuffle) only when the next batch would
    # run past the end of the data; a batch ending exactly at sample_num is
    # still valid.
    if index + batch > sample_num:
        index = 0
        index_sequense = sampling(sample_num)
    x_batch, y_batch = get_batch(index_sequense, x_data, y_data, index, batch)
    # BCELoss needs float targets with the same (batch, 1) shape as the
    # model output.
    y_batch = y_batch.type(dtype=torch.float32).reshape(-1, 1)
    index = index + batch

    predict = srnn(x_batch)
    loss = loss_fn(predict, y_batch)
    # A prediction counts as correct when it is on the label's side of 0.5.
    co = torch.sum(torch.abs(y_batch - predict) < 0.5)
    accuracy = co / y_batch.size(0)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    for p in srnn.parameters():
        print("p.grad.data", p.grad.data)
    print("loss :", loss)
    print("accuracy :", accuracy)
    # Store plain Python floats: appending the loss tensor itself would keep
    # every iteration's autograd graph alive and hand tensors that require
    # grad to matplotlib.
    loss_list.append(loss.item())
    acc_list.append(accuracy.item())

# --- Quick look at the trained model on the first batch --------------------
x_batch, y_batch = get_batch(index_sequense, x_data, y_data, 0, batch)
print(x_batch.size())
print("#")
print(srnn(x_batch))

# --- Training curves -------------------------------------------------------
epoch_list = list(range(num_epochs))
plt.plot(epoch_list, acc_list, label='adam')
plt.title("accuracy")
plt.legend()
plt.show()
plt.plot(epoch_list, loss_list, label='adam')
plt.title("loss")
plt.legend()
plt.show()

# "pause" is a Windows shell built-in that keeps the console window open;
# it does not exist on other platforms.
if os.name == "nt":
    os.system("pause")
Result of running the code:
The code above is not a completely correct simple RNN: the gradient descent algorithm still needs to be improved. Computing the gradients backwards through time is enough to fix it.