[Deep Learning Model 3] Understanding RNNs, step by step...

Practice code is attached below. Source: lecture 13 of Mr. Liu Er's PyTorch course on Bilibili.

'''
Processing natural language:
characters/words --> one-hot vectors (far too sparse) --> mapped by an Embedding layer to low-dimensional dense vectors --> RNN --> mapped by a Linear layer to the desired output dimension
'''
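# A minimal sketch (my illustration, not from the lecture) of the index -> dense-vector step:
# an Embedding layer replaces each 128-wide one-hot vector with a small dense vector.
import torch

emb = torch.nn.Embedding(128, 8)        # 128 possible ASCII codes -> 8-dimensional dense vectors
codes = torch.LongTensor([[72, 105]])   # "Hi" as ASCII codes, shape (batchSize=1, seqLen=2)
print(emb(codes).shape)                 # torch.Size([1, 2, 8])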

'''
e.g. given a person's name, classify which language (country) it belongs to.
    Architecture: Embedding Layer - GRU Layer - Linear Layer
'''
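'''
Data flow through the model (with the sizes used below): a padded batch of ASCII codes,
shape (batchSize, seqLen), goes through the Embedding layer (128 -> 100), a 2-layer
bidirectional GRU (hidden size 100), and a Linear layer (100*2 -> N_COUNTRY),
producing one score per country.
'''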

'''

1. Processing the name:
sequence --> list;
build a dictionary;
list --(via the dictionary)--> list of numbers (each number stands for a one-hot vector);
the numeric lists have different lengths, so they cannot form a matrix or a tensor; apply padding (pad to the longest sequence in the batch). A worked example follows below.

2. Processing the language name (country) --> index label (used as the classification target):
build a dictionary.

'''
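'''
Worked example (my illustration) for a batch of two names, ["Adams", "Li"]:
"Adams" --> [65, 100, 97, 109, 115]   (length 5)
"Li"    --> [76, 105]                 (length 2)
after padding to the longest sequence:
[[65, 100, 97, 109, 115],
 [76, 105,   0,   0,   0]]
'''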

import torch
import time
import math
import csv
import gzip
from torch.utils.data import Dataset, DataLoader
import datetime
import matplotlib.pyplot as plt
import numpy as np

HIDDEN_SIZE = 100   # dimension of the GRU hidden state (also used as the embedding size)
BATCH_SIZE = 256
N_LAYER = 2         # number of stacked GRU layers
N_EPOCHS = 200
N_CHARS = 128       # vocabulary size: one entry per ASCII code
USE_GPU = False

class NameDataset(Dataset):
    def __init__(self, is_train_set=True):
        filename = 'names_train.csv.gz' if is_train_set else 'names_test.csv.gz'  # the dataset is small, so read it all at once
        with gzip.open(filename, 'rt') as f:    # open the gzip-compressed file as f
            reader = csv.reader(f)              # parse the csv file
            rows = list(reader)
        self.names = [row[0] for row in rows]
        self.len = len(self.names)
        self.countries = [row[1] for row in rows]
        self.country_list = list(sorted(set(self.countries)))  # set removes duplicates; sorted gives a stable order

        self.country_dict = self.getCountryDict()  # country name -> index dictionary
        self.country_num = len(self.country_list)

    def __getitem__(self,index):
        return self.names[index],self.country_dict[self.countries[index]] 

    def __len__(self):
        return self.len
    
    def getCountryDict(self):
        country_dict=dict()
        for idx, country_name in enumerate(self.country_list,0):    
            country_dict[country_name] = idx                       
        return country_dict
 
    def idx2country(self, index):      # get a country name from its index
        return self.country_list[index]

    def getCountrysNum(self):    # return the number of countries
        return self.country_num

trainset = NameDataset(is_train_set=True)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE,shuffle=True)
testset = NameDataset(is_train_set=False)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

N_COUNTRY = trainset.getCountrysNum()  # N_COUNTRY determines the size of the final output layer
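# e.g. trainset[0] yields a (name, country_index) pair such as ('Adam', 4)
# (hypothetical values; the exact pair depends on the contents of the csv).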

def create_tensor(tensor):  # move the tensor to the GPU if enabled
    if USE_GPU:
        device = torch.device("cuda:0")
        tensor = tensor.to(device)
    return tensor
 
class RNNClassifier(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1, bidirectional=True):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size                  
        self.n_layers = n_layers
        self.n_directions = 2 if bidirectional else 1  # bidirectional: one- or two-directional RNN

        self.embedding = torch.nn.Embedding(input_size, hidden_size)  # input shape (seqLen, batch) -> output shape (seqLen, batch, hiddenSize)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=bidirectional)
                                # input dim    hidden dim   layers   one- or two-directional
        self.fc = torch.nn.Linear(hidden_size * self.n_directions, output_size)
        # A bidirectional RNN runs over the sequence forward and backward and concatenates
        # the hidden states from the two directions, hence the "*2" here when bidirectional.
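        # Shape summary (batch size N, sequence length L), for reference:
        #   embedding output: (L, N, hidden_size)
        #   GRU final hidden: (n_layers * n_directions, N, hidden_size)
        #   fc output:        (N, output_size)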
 
    def forward(self, input, seq_lengths):
        input = input.t()    # .t() transposes: (batchSize, seqLen) -> (seqLen, batchSize), the layout the embedding layer expects
        batch_size = input.size(1)
        hidden = self._init_hidden(batch_size)
        embedding = self.embedding(input)
 
        seq_lengths = seq_lengths.cpu()
        # For efficiency (so the padded zeros do not take part in the computation), the GRU layer
        # accepts a pack_padded_sequence(), which handles variable-length sequences. But the
        # lengths must be in descending order -- that is why make_tensors sorts each batch by length.
        gru_input = torch.nn.utils.rnn.pack_padded_sequence(embedding, seq_lengths)
        
        output, hidden = self.gru(gru_input, hidden)
        if self.n_directions == 2:
            hidden_cat = torch.cat([hidden[-1], hidden[-2]], dim=1)  # torch.cat(tensors, dim): concatenates along dim, returns a tensor
        else:
            hidden_cat = hidden[-1]
        fc_output = self.fc(hidden_cat)
        return fc_output

    def _init_hidden(self, batch_size):  # create the all-zero initial hidden state h0
        hidden = torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size)
        return create_tensor(hidden)
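
# A standalone sketch (my illustration, not from the lecture) of what pack_padded_sequence
# does with a zero-padded batch that is already sorted by descending length:
_padded = torch.LongTensor([[1, 4, 6],
                            [2, 5, 0],
                            [3, 0, 0]])   # shape (seqLen=3, batchSize=3): lengths 3, 2, 1
_packed = torch.nn.utils.rnn.pack_padded_sequence(_padded, [3, 2, 1])
print(_packed.data)          # tensor([1, 4, 6, 2, 5, 3]) -- the padding zeros are gone
print(_packed.batch_sizes)   # tensor([3, 2, 1]) -- sequences still active at each time step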

def name2list(name):  # turn a name (string) into a list of the ASCII codes of its characters
    arr = [ord(c) for c in name]
    return arr, len(arr)  # returns a tuple: (list, length)
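# e.g. name2list("Li") returns ([76, 105], 2)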
 
def make_tensors(names, countries):     
    sequences_and_lengths= [name2list(name) for name in names]                  
    name_sequences = [sl[0] for sl in sequences_and_lengths]                    
    seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths])     
    countries = countries.long()
 
    # How to pad with zeros: make an all-zero tensor, then copy each sequence onto its row.
    seq_tensor = torch.zeros(len(name_sequences), seq_lengths.max()).long()     
    for idx, (seq, seq_len) in enumerate(zip(name_sequences, seq_lengths), 0):  
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)                       
 
    # sort by length to use pack_padded_sequence
    seq_lengths, perm_idx = seq_lengths.sort(dim=0, descending=True)  # .sort returns the sorted values and their original indices
    seq_tensor = seq_tensor[perm_idx]                           
    countries = countries[perm_idx]                               
 
    # returns: padded ASCII tensor (sorted by length), lengths in descending order, matching country labels
    return create_tensor(seq_tensor), create_tensor(seq_lengths), create_tensor(countries)
 
def time_since(since):  # format elapsed time as minutes and seconds
    s=time.time()-since
    m=math.floor(s/60)
    s-=m*60
    return '%dm %ds' % (m,s)

'''
Training loop:
- forward: compute the model output
- forward: compute the loss
- zero the gradients
- backward
- update the parameters
'''
def trainModel():
    total_loss = 0
    for i, (names, countries) in enumerate(trainloader, 1):
        inputs, seq_lengths, target = make_tensors(names, countries)
        output = classifier(inputs, seq_lengths)   
        loss = criterion(output, target)            
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if i % 10 == 0:
            print(f'[{time_since(start)}] Epoch {epoch} ', end='')
            print(f'[{i * len(inputs)}/{len(trainset)}] ', end='')
            print(f'loss={total_loss / (i * len(inputs))}')
    return total_loss
 
def testModel():
    correct = 0
    total = len(testset)
    with torch.no_grad():
        for i, (names, countries) in enumerate(testloader, 1):
            inputs, seq_lengths, target = make_tensors(names, countries)   
            output = classifier(inputs, seq_lengths)                       
            pred = output.max(dim=1, keepdim=True)[1]   # index of the highest score = predicted class
            correct += pred.eq(target.view_as(pred)).sum().item()   # count how many predictions match the target
 
        percent = '%.2f' % (100 * correct / total)
        print(f'Test set: Accuracy {correct}/{total} {percent}%')
    return correct / total
 

if __name__ == '__main__':
    classifier = RNNClassifier(N_CHARS, HIDDEN_SIZE, N_COUNTRY, N_LAYER)
    if USE_GPU:
        device = torch.device('cuda:0')
        classifier.to(device)
 
    criterion = torch.nn.CrossEntropyLoss()    
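    # CrossEntropyLoss applies LogSoftmax + NLLLoss internally, so it takes the raw scores
    # from the fc layer and the integer country labels directly -- no softmax in the model.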
    optimizer = torch.optim.Adam(classifier.parameters(), lr = 0.001)  
 
    start = time.time()  # record the start time so elapsed training time can be printed
    print("Train for %d epochs..." % N_EPOCHS)
    acc_list= []
    for epoch in range(1, N_EPOCHS+1):
        trainModel()  # training and testing are each wrapped in a function
        acc = testModel()
        acc_list.append(acc)
    end = time.time()
    print(datetime.timedelta(seconds=(end - start) // 1)) 
 
 
    epochs = np.arange(1, len(acc_list) + 1, 1)
    acc_list = np.array(acc_list)
    plt.plot(epochs, acc_list)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.grid()
    plt.show()

 

Origin: blog.csdn.net/weixin_43594181/article/details/127339211