RNN code explanation

8.5. Implementation of Recurrent Neural Networks from Scratch — Dive into Deep Learning 2.0.0 documentation

personal notes

code for debugging (uses train_iter and vocab defined just below)

# [vocab.idx_to_token[i] for i in train_iter.corpus]
# [i for i in train_iter.corpus]
# ''.join([vocab.idx_to_token[i] for i in train_iter.corpus])

import math
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

F.one_hot(torch.tensor([0, 2]), len(vocab))
# two length-28 vectors in which index 0 and index 2, respectively, are 1 and everything else is 0

X = torch.arange(10).reshape((2, 5))
X1 = F.one_hot(X.T, 28).shape   # 28 = 26 letters + space + <unk>

# X has shape (2, 5); after transposing and one-hot encoding it gains a dimension and becomes (5, 2, 28).
# Each value maps to one one-hot class: here there are 2 "sentences" of 5 characters each,
# and every character belongs to one of the 28 classes.
# For example the value 0 at X[0][0] corresponds to [1, 0, 0, 0, ..., 0].
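
To make that concrete, here is a small check of one time step (my own sketch, reusing the toy X above):

# The first row of X.T is (0, 5): the first character of each of the two "sentences".
step0 = F.one_hot(X.T, 28)[0]
print(step0.shape)     # torch.Size([2, 28])
print(step0[0][:6])    # tensor([1, 0, 0, 0, 0, 0]) -- the value 0 maps to index 0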

batch_size and num_steps are not needed for the moment.

num_steps is how many characters of text are taken at a time, here 35; each of the 32 sequences in a minibatch is 35 characters long.

The following example, however, only uses vocab.
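
A quick sanity check of those shapes (my own sketch; the exact text in each batch depends on the random sampling in the data loader):

# Each minibatch is a pair (X, Y), both of shape (batch_size, num_steps) = (32, 35).
for X_batch, Y_batch in train_iter:
    print(X_batch.shape, Y_batch.shape)   # torch.Size([32, 35]) torch.Size([32, 35])
    break
print(len(vocab))                         # 28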


def get_params(vocab_size, num_hiddens, device):
    # Given the vocabulary size and the hidden-layer size, build and initialize
    # the weights W and biases b.

    num_inputs = num_outputs = vocab_size  # the input is one character, one-hot encoded into 28 classes,
                                           # and the output is a score over the same 28 vocab classes

    def normal(shape):  # initialize a tensor of the given shape with small random values
        return torch.randn(size=shape, device=device) * 0.01

    # Hidden-layer parameters
    # H carries the influence of the characters seen so far onto the next character
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = torch.zeros(num_hiddens, device=device)
    # Output-layer parameters
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    # Attach gradients
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:  # gradients are needed so the parameters can be learned to predict better
        param.requires_grad_(True)
    return params
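
A quick way to look at the parameter shapes this produces (my own check, assuming the 28-class vocab and the 512 hidden units used later in these notes):

params = get_params(vocab_size=28, num_hiddens=512, device=d2l.try_gpu())
print([tuple(p.shape) for p in params])
# [(28, 512), (512, 512), (512,), (512, 28), (28,)]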

# Initialize the initial state. state is just the value of H: its first dimension is the same as
# the first dimension of X (the batch dimension) and its second dimension is the hidden size.
# At the very first character we predict y1 from x1 alone (in the known text, y1 is x2); there is
# no earlier character acting as an influence yet, so H is initialized to zeros. From the second
# character onwards, H already carries the influence of the earlier characters through the H @ W_hh term.

def init_rnn_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device), )  # the state is returned wrapped in a tuple
    # the tuple form is kept for convenience when writing the LSTM later (which has two state tensors),
    # hence `H, = state` below, with nothing after the comma

def rnn(inputs, state, params):
    # shape of inputs: (num_steps, batch_size, vocab_size)
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    # shape of X: (batch_size, vocab_size)
    for X in inputs:  # see the book page or the video for the formula; tanh acts as the activation function here
        H = torch.tanh(torch.mm(X, W_xh) + torch.mm(H, W_hh) + b_h)
        Y = torch.mm(H, W_hq) + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H,)

class RNNModelScratch: 
    """从零开始实现的循环神经网络模型"""
    def __init__(self, vocab_size, num_hiddens, device,
                 get_params, init_state, forward_fn):
        # store the hyperparameters and create the model parameters
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)


num_hiddens = 512
net = RNNModelScratch(len(vocab), num_hiddens, d2l.try_gpu(), get_params,
                      init_rnn_state, rnn)
state = net.begin_state(X.shape[0], d2l.try_gpu())
# the first dimension of the state H is the first dimension of X, i.e. the batch dimension; the second is the hidden size
Y, new_state = net(X.to(d2l.try_gpu()), state)
# forward_fn is the rnn() function defined above
Y.shape, len(new_state), new_state[0].shape

The way to think about the shapes:

X is the input, of shape (batch_size, num_steps), here (2, 5); after transposing and one-hot encoding it becomes (5, 2, 28).

If you draw the tensor out as matrices, the last two dimensions come from walking over the transposed X: each time step gives the one-hot class vectors of the current character in both sentences, with shape (2, 28).

To map into a hidden layer of size 512, W_xh must be (28, 512), so one time step of X times W_xh becomes (2, 512).

H's first dimension is the first dimension of the untransposed X (the batch) and its second dimension is the hidden size, so H is (2, 512); W_hh is (512, 512), and H @ W_hh is again (2, 512).

W_hq is (512, 28), so Y becomes (2, 28): from each x we read out a prediction for the next character y, one row of 28 class scores for each of the 2 sentences.
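
A minimal sketch verifying those shapes with random weights (biases dropped for brevity; this is my own check, not the trained model):

X = torch.arange(10).reshape((2, 5))
inputs = F.one_hot(X.T, 28).type(torch.float32)   # (5, 2, 28)
W_xh = torch.randn(28, 512) * 0.01
W_hh = torch.randn(512, 512) * 0.01
W_hq = torch.randn(512, 28) * 0.01
H = torch.zeros(2, 512)                           # initial state, (batch_size, num_hiddens)
for X_t in inputs:                                # X_t: (2, 28), one time step
    H = torch.tanh(X_t @ W_xh + H @ W_hh)         # (2, 512)
    Y = H @ W_hq                                  # (2, 28)
print(inputs.shape, H.shape, Y.shape)
# torch.Size([5, 2, 28]) torch.Size([2, 512]) torch.Size([2, 28])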

The real training code starts here.

# the prediction part is covered at the end

def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter):
    """训练网络一个迭代周期(定义见第8章)"""
    state, timer = None, d2l.Timer()
    metric = d2l.Accumulator(2)  # sum of training loss, number of tokens
    for X, Y in train_iter:  # X is a random slice of the text; Y is the same text shifted one position to the right,
        # so x1 predicts y1 (which is x2 in the known text), x1 and x2 predict y2, x1..x3 predict y3, and so on
        # (checked in the sketch after this function); for example:
        # ''.join([vocab.idx_to_token[i] for i in X[0]]), ''.join([vocab.idx_to_token[i] for i in Y[0]]) -> ('llsithe time traveller for so it wi', 'lsithe time traveller for so it wil')
        # ''.join([vocab.idx_to_token[i] for i in X[0]]), len(''.join([vocab.idx_to_token[i] for i in X[0]])) -> ('llsithe time traveller for so it wi', 35)
        if state is None or use_random_iter:
            # initialize state on the first iteration or when using random sampling
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                # state is a tensor for nn.GRU
                state.detach_()
            else:
                # state is a tuple of tensors for nn.LSTM and for our from-scratch model
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)
        l = loss(y_hat, y.long()).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(net, 1)
            updater.step()
        else:
            l.backward()
            grad_clipping(net, 1)
            # mean() has already been called on the loss, so pass batch_size=1
            updater(batch_size=1)
        metric.add(l * y.numel(), y.numel())
    return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()
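
The comment at the top of the loop about the X/Y shift can be checked directly; a minimal sketch (the exact slice of text depends on the data loader's random offset):

# Y is X shifted one character to the right.
for X_batch, Y_batch in train_iter:
    print(''.join(vocab.idx_to_token[i] for i in X_batch[0]))
    print(''.join(vocab.idx_to_token[i] for i in Y_batch[0]))
    break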



def train_ch8(net, train_iter, vocab, lr, num_epochs, device,
              use_random_iter=False):
    """训练模型(定义见第8章)"""
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', ylabel='perplexity',
                            legend=['train'], xlim=[10, num_epochs])
    # Initialization
    if isinstance(net, nn.Module):
        updater = torch.optim.SGD(net.parameters(), lr)
    else:
        updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
    predict = lambda prefix: predict_ch8(prefix, 50, net, vocab, device)
    # Train and predict
    for epoch in range(num_epochs):
        ppl, speed = train_epoch_ch8(
            net, train_iter, loss, updater, device, use_random_iter)
        if (epoch + 1) % 10 == 0:
            print(predict('time traveller'))
            animator.add(epoch + 1, [ppl])
    print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}')
    print(predict('time traveller'))
    print(predict('traveller'))

#num_epochs, lr = 500, 1
num_epochs, lr = 5, 1
train_ch8(net, train_iter, vocab, lr, num_epochs, d2l.try_gpu())

net = RNNModelScratch(len(vocab), num_hiddens, d2l.try_gpu(), get_params,
                      init_rnn_state, rnn)
#train_ch8(net, train_iter, vocab, lr, num_epochs, d2l.try_gpu(),use_random_iter=True)
d2l.plt.show()

A single Y is (32, 28), and with 35 characters there are 35 such Ys in the outputs list, one per character position of the sentences.

In the forward function these are concatenated along dimension 0, giving (35 * 32 = 1120, 28).

Read top to bottom, each block of 32 rows corresponds to one character position across the 32 sentences. This concatenation is y_hat, and it lines up row for row with y = Y.T.reshape(-1), so the cross-entropy loss pairs every prediction with the target at the same position.
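
A small sketch of how y_hat and y line up, using dummy tensors rather than real model output (Y_labels here is a stand-in for the minibatch labels Y):

batch_size, num_steps, vocab_size = 32, 35, 28
Y_labels = torch.randint(0, vocab_size, (batch_size, num_steps))   # labels, shape (32, 35)
y = Y_labels.T.reshape(-1)                                         # (35 * 32,) = (1120,)
y_hat = torch.randn(num_steps * batch_size, vocab_size)            # (1120, 28), like torch.cat(outputs, dim=0)
# Row k of y_hat is the prediction for sentence k % batch_size at time step k // batch_size,
# and y[k] is exactly that position's target index, so cross-entropy pairs them correctly.
l = F.cross_entropy(y_hat, y)
print(y.shape, y_hat.shape, l.item())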

# I won't go into the clipping part in detail; I understand it roughly but can't explain it clearly
def grad_clipping(net, theta):
    """Clip gradients."""
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm
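
For reference (my own note; the notes above skip the explanation): grad_clipping rescales all gradients jointly so that their global L2 norm never exceeds theta, i.e.

\mathbf{g} \leftarrow \min\left(1, \frac{\theta}{\lVert \mathbf{g} \rVert}\right) \mathbf{g}

where g stacks the gradients of all parameters into one vector; this is what the norm computation and the in-place scaling above implement.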


# the prediction part, saved for last

def predict_ch8(prefix, num_preds, net, vocab, device):  
    """在prefix后面生成新字符"""
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]  # e.g. [3]: look up the index of the first character in the vocab
    get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1, 1))  # the most recently generated index becomes the next input; batch size 1, sequence length 1
    for y in prefix[1:]:  # warm-up period
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # predict num_preds steps
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])
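
A quick way to call the prediction function on its own (the continuation is essentially random before training):

print(predict_ch8('time traveller ', 10, net, vocab, d2l.try_gpu()))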



Reprinted from: blog.csdn.net/qq_36632604/article/details/129782969