LSTM from scratch

Preface

The official PyTorch website has a tutorial for the name-generation task.
That tutorial builds a hand-written RNN; here I hand-write the simplest possible LSTM instead.
The LSTM model follows the formulation in Understanding LSTM Networks.

Complete experiment

# __future__ imports must come before any other import or statement
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import string
import unicodedata

import torch
import torch.nn as nn

all_letters = string.ascii_letters + " .,;'-"
n_letters = len(all_letters) + 1 # Plus EOS marker

def findFiles(path): return glob.glob(path)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
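As a quick sanity check of the normalization, one can try a name with accented characters (the example name is the one used in the official tutorial; the expected result is in the comment):

print(unicodeToAscii('Ślusàrski'))  # Slusarski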

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Build the category_lines dictionary, a list of lines per category
category_lines = {}
all_categories = []
for filename in findFiles('data/name_data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

if n_categories == 0:
    raise RuntimeError('Data not found. Make sure that you downloaded data '
        'from https://download.pytorch.org/tutorial/data.zip and extract it to '
        'the current directory.')

print('# categories:', n_categories, all_categories)
# categories: 18 ['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese']
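To confirm the data loaded correctly, one can also peek at a few lines of any category (the exact names printed depend on the downloaded data files):

print(category_lines['Russian'][:3])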
# import torch
# import torch.nn as nn

# class RNN(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size):
#         super(RNN, self).__init__()
#         self.hidden_size = hidden_size

#         self.i2h = nn.Linear(n_categories + input_size + hidden_size, hidden_size)
#         self.i2o = nn.Linear(n_categories + input_size + hidden_size, output_size)
#         self.o2o = nn.Linear(hidden_size + output_size, output_size)
#         self.dropout = nn.Dropout(0.1)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, category, input, hidden):
#         input_combined = torch.cat((category, input, hidden), 1)
#         hidden = self.i2h(input_combined)
#         output = self.i2o(input_combined)
#         output_combined = torch.cat((hidden, output), 1)
#         output = self.o2o(output_combined)
#         output = self.dropout(output)
#         output = self.softmax(output)
#         return output, hidden

#     def initHidden(self):
#         return torch.zeros(1, self.hidden_size)
import random

# Random item from a list
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]
#     return l[0]

# Get a random category and random line from that category
def randomTrainingPair():
    category = randomChoice(all_categories)
    
    line = randomChoice(category_lines[category])
    return category, line
# One-hot vector for category
def categoryTensor(category):
    li = all_categories.index(category)
    tensor = torch.zeros(1, n_categories)
    tensor[0][li] = 1
    return tensor

# One-hot matrix of first to last letters (not including EOS) for input
def inputTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li in range(len(line)):
        letter = line[li]
        tensor[li][0][all_letters.find(letter)] = 1
    return tensor

# LongTensor of second letter to end (EOS) for target
def targetTensor(line):
    letter_indexes = [all_letters.find(line[li]) for li in range(1, len(line))]
    letter_indexes.append(n_letters - 1) # EOS
    return torch.LongTensor(letter_indexes)
def randomTrainingExample():
    category, line = randomTrainingPair()
    category_tensor = categoryTensor(category)
    input_line_tensor = inputTensor(line)
    target_line_tensor = targetTensor(line)
    return category_tensor, input_line_tensor, target_line_tensor
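A quick shape check on these helpers (the name 'Albert' and category 'English' are just arbitrary examples):

cat_t = categoryTensor('English')   # (1, n_categories), one-hot over categories
inp_t = inputTensor('Albert')       # (6, 1, n_letters), one one-hot letter per step
tgt_t = targetTensor('Albert')      # (6,), indices of 'lbert' plus the EOS index
print(cat_t.shape, inp_t.shape, tgt_t.shape)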
class LSTm(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTm, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size

        # One linear layer per gate; input_size is the full concatenated size
        # n_letters + n_categories + hidden_size (see the model construction below).
        self.w_f = torch.nn.Linear(self.input_size, hidden_size)  # forget gate
        nn.init.xavier_normal_(self.w_f.weight, gain=1)
        self.w_i = torch.nn.Linear(self.input_size, hidden_size)  # input gate
        nn.init.xavier_normal_(self.w_i.weight, gain=1)
        self.w_c = torch.nn.Linear(self.input_size, hidden_size)  # candidate cell state
        nn.init.xavier_normal_(self.w_c.weight, gain=1)
        self.w_o = torch.nn.Linear(self.input_size, hidden_size)  # output gate
        nn.init.xavier_normal_(self.w_o.weight, gain=1)

        self.sigmoid = torch.nn.Sigmoid()
        self.tanh = torch.nn.Tanh()
        # Defined but never applied in this first version of forward(); the
        # corrected version discussed below projects the hidden state through outlayer.
        self.outlayer = torch.nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, category, x, h, c):
        # Note: train() calls this as LStm(category_tensor, hidden, letter, cell),
        # so x receives the previous hidden state and h the current letter one-hot.
        x = torch.cat([category, x], dim=1)

        combined = torch.cat([h, x], dim=1)  # (1, n_letters + n_categories + hidden_size)

        ft = self.sigmoid(self.w_f(combined))    # forget gate
        it = self.sigmoid(self.w_i(combined))    # input gate
        ct_hat = self.tanh(self.w_c(combined))   # candidate cell state
        ct = torch.mul(ft, c) + torch.mul(it, ct_hat)
        ot = self.sigmoid(self.w_o(combined))    # output gate

        # A standard LSTM would use ht = ot * tanh(ct); the tanh is omitted here.
        ht = torch.mul(ot, ct)

        # Unpacked in train() as (hidden, cell, output): ht is carried forward as
        # both hidden and cell, and the raw ct (hidden_size-dim) is used as the
        # output logits; this is the output-dimension issue discussed after the loss curve.
        return ht, ht, ct

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

    def initCell(self):
        return torch.zeros(1, self.hidden_size)
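For reference, these are the gate equations from Understanding LSTM Networks that the forward pass above implements; note that the code forms h_t as o_t * c_t, omitting the tanh of the last equation.

\begin{aligned}
f_t &= \sigma\left(W_f \cdot [h_{t-1}, x_t] + b_f\right) \\
i_t &= \sigma\left(W_i \cdot [h_{t-1}, x_t] + b_i\right) \\
\tilde{C}_t &= \tanh\left(W_C \cdot [h_{t-1}, x_t] + b_C\right) \\
C_t &= f_t \odot C_{t-1} + i_t \odot \tilde{C}_t \\
o_t &= \sigma\left(W_o \cdot [h_{t-1}, x_t] + b_o\right) \\
h_t &= o_t \odot \tanh(C_t)
\end{aligned}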
hidden_size = 128
out_size = n_letters
criterion = nn.CrossEntropyLoss()
LStm = LSTm(n_letters+n_categories+hidden_size, hidden_size, out_size)
optimizer = torch.optim.Adam(LStm.parameters(), lr = 1e-3)

def train(category_tensor, input_line_tensor, target_line_tensor):
    target_line_tensor.unsqueeze_(-1)  # (seq_len,) -> (seq_len, 1)
    hidden = LStm.initHidden()
    cell = LStm.initCell()
    optimizer.zero_grad()

    loss = 0

    for i in range(input_line_tensor.size(0)):
        # forward signature is (category, x, h, c): the previous hidden state is
        # passed as x and the current letter one-hot as h (see the note above)
        hidden, cell, output = LStm(category_tensor, hidden, input_line_tensor[i], cell)
        l = criterion(output, target_line_tensor[i])
        loss += l
    loss.backward()
    optimizer.step()

    return output, loss.item() / input_line_tensor.size(0)
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)
# rnn = RNN(n_letters, 128, n_letters)
n_iters = 100000
print_every = 5000
plot_every = 500
all_losses = []
total_loss = 0 # Reset every plot_every iters

start = time.time()

for iter in range(1, n_iters + 1):
    output, loss = train(*randomTrainingExample())
    total_loss += loss

    if iter % print_every == 0:
#         print(output)
        print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss))


    if iter % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0
0m 25s (5000 5%) 2.5861
0m 51s (10000 10%) 3.0035
1m 20s (15000 15%) 2.2847
1m 48s (20000 20%) 2.0237
2m 15s (25000 25%) 2.9981
2m 39s (30000 30%) 2.7145
3m 4s (35000 35%) 2.7130
3m 28s (40000 40%) 2.2639
3m 52s (45000 45%) 2.0530
4m 17s (50000 50%) 2.8611
4m 41s (55000 55%) 3.0305
5m 7s (60000 60%) 1.9455
5m 32s (65000 65%) 2.3996
5m 57s (70000 70%) 2.2090
6m 31s (75000 75%) 1.8699
7m 7s (80000 80%) 2.2690
7m 35s (85000 85%) 2.3699
8m 1s (90000 90%) 1.6409
8m 26s (95000 95%) 2.3424
8m 50s (100000 100%) 2.1979
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
plt.figure("train loss")
plt.plot(all_losses)
plt.title("Lstm train loss")
plt.ylabel("loss")
plt.xlabel("iter")

[Figure: LSTM training loss curve]
hidden_size differs from out_size, so a linear layer is added to project the hidden state to the output size (in the run above this is missing, so the output dimension is configured incorrectly):
[Figure: loss curve with the output dimension corrected]
This output dimension is correct; increasing the number of training iterations helps further. A minimal sketch of a corrected forward pass is shown below.
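The exact corrected code is not reproduced in the post, so what follows is only a minimal sketch of how forward() could be fixed: apply the already-defined self.outlayer so the logits have n_letters dimensions (LogSoftmax is still left out because CrossEntropyLoss is used). The (output, hidden, cell) return order is my assumption, and train()/sample() would have to unpack in that order.

# Sketch of a corrected forward(): same gates as above, plus the output projection.
def forward(self, category, x, h, c):
    x = torch.cat([category, x], dim=1)
    combined = torch.cat([h, x], dim=1)

    ft = self.sigmoid(self.w_f(combined))
    it = self.sigmoid(self.w_i(combined))
    ct_hat = self.tanh(self.w_c(combined))
    ct = torch.mul(ft, c) + torch.mul(it, ct_hat)
    ot = self.sigmoid(self.w_o(combined))
    ht = torch.mul(ot, self.tanh(ct))   # standard LSTM applies tanh here

    output = self.outlayer(ht)          # (1, hidden_size) -> (1, n_letters)
    return output, ht, ct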

max_length = 20

# Sample from a category and starting letter
def sample(category, start_letter='A'):
    with torch.no_grad():  # no need to track history in sampling
        category_tensor = categoryTensor(category)
        input = inputTensor(start_letter)
        hidden = LStm.initHidden()
        cell = LStm.initCell()

        output_name = start_letter
        # category_tensor, hidden, input_line_tensor[i], cell
        for i in range(max_length):
            # forward returns (ht, ht, ct), unpacked here as (hidden, cell, output);
            # with the uncorrected model the output is the raw hidden_size-dim cell
            # state, so topk runs over 128 values rather than n_letters
            hidden, cell, output = LStm(category_tensor, hidden, input[0], cell)
            topv, topi = output.topk(1)
            topi = topi[0][0]
            if topi == n_letters - 1:
                break
            else:
                letter = all_letters[topi]
                output_name += letter
            input = inputTensor(letter)

        return output_name

# Get multiple samples from one category and multiple starting letters
def samples(category, start_letters='ABC'):
    for start_letter in start_letters:
        print(sample(category, start_letter))
samples('Russian', 'abcdefghijklmn')
anera
balev
chalon
daranov
erinon
falin
galov
halanov
intono
jareva
kolovak
loukha
malov
nolouki

Conclusions and thoughts

  • Looking at the hand-written RNN from the official website, the results are not great; below is its loss curve:
    [Figure: loss curve of the official tutorial's RNN]
  • After initializing the linear layers' parameters with nn.init.xavier_normal_, the loss drops faster.
  • The choices of input_size, hidden_size and output_size still need care.
  • Do not apply LogSoftmax if you use CrossEntropyLoss: CrossEntropyLoss already combines LogSoftmax and NLLLoss (a quick check is shown below).
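A quick numerical check of that last point (standalone snippet; the values are random, only the equality matters):

import torch
import torch.nn as nn

logits = torch.randn(3, 59)         # raw scores for 3 steps, 59 classes (n_letters)
targets = torch.tensor([0, 5, 58])  # arbitrary target letter indices

ce = nn.CrossEntropyLoss()(logits, targets)
nll = nn.NLLLoss()(nn.LogSoftmax(dim=1)(logits), targets)
print(torch.allclose(ce, nll))      # True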

References

PyTorch's NLLLoss and CrossEntropyLoss explained
Understanding LSTM Networks
Official PyTorch website

Originally published at blog.csdn.net/jokerxsy/article/details/108996302