深度学习-自然语言处理(NLP)-Pytorch：Transformer模型源码分析【自定义构建Transformer模型（Copy任务）】

在这里插入图片描述

任务描述: 针对数字序列进行学习, 学习的最终目标是使输出与输入的序列相同. 如输入[1, 5, 8, 9, 3], 输出也是[1, 5, 8, 9, 3].

copy任务意义： copy任务在模型基础测试中具有重要意义，因为copy操作对于模型来讲是一条明显规律, 因此模型能否在短时间内，小数据集中学会它，可以帮助我们断定模型所有过程是否正常，是否已具备基本学习能力.

import numpy as np
import torch  # 导入必备的工具包
import torch.nn as nn  # 预定义的网络层torch.nn, 工具开发者已经帮助我们开发好的一些常用层【比如，卷积层, lstm层, embedding层等, 不需要我们再重新造轮子】
import math  # 数学计算工具包
import torch.nn.functional as F  # 工具包装载了网络层中那些只进行计算, 而没有参数的层
from torch.autograd import Variable  # torch中变量封装函数Variable.
import matplotlib.pyplot as plt
import copy  # 用于深度拷贝的copy工具包
import time

# Hyper-parameters shared by the layer definitions and demos in this file.
EMBEDDING_DIM = 6  # Width of each word-embedding vector; MultiHeadedAttention asserts it is divisible by HEAD_SIZE.
DROPOUT = 0.1  # Dropout rate: the fraction of activations randomly zeroed (equivalently, the fraction of units suppressed).
HEAD_SIZE = 3  # Number of attention heads in the multi-head attention layer.
VOCAB_SIZE = 1000  # Vocabulary size passed to MyEmbedding in the commented-out demos; other uses are not visible here.
MAX_LEN = 64  # Presumably the maximum sentence length given to PositionalEncoding — TODO confirm against later code.
FF_MIDDLE_DIM = 4  # Hidden width of FeedForward in the commented-out demos.
N = 3  # Presumably the number of stacked encoder/decoder layers (the commented-out demos use a local N this way) — confirm.

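# 1. Text embedding layer: implemented below by the MyEmbedding class. (The reference tutorial names this class "Embeddings"; the plural there refers to two identical embedding layers, source and target, that share parameters.) The class subclasses nn.Module so that it behaves like a standard layer; every custom layer in this file follows the same pattern.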
# 1.1 构建文本嵌入层
class MyEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(embedding_dim)``.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Length of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        # Trainable lookup table: one row of length ``embedding_dim`` per id.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, word_tensor_input):
        """Look up the ids and scale the resulting vectors.

        Args:
            word_tensor_input: Integer tensor of token ids; every id must be
                a valid index into the table (``0 <= id < vocab_size``).

        Returns:
            Tensor with one extra trailing dimension of size
            ``embedding_dim``, multiplied by ``sqrt(embedding_dim)``.
        """
        scale = math.sqrt(self.embedding_dim)
        vectors = self.embedding(word_tensor_input)
        return vectors * scale


# # 1.2 测试文本嵌入层
# print("=" * 100, "MyEmbedding文本嵌入层测试", "=" * 100)
# embedding01 = MyEmbedding(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM)
# # 输入 word_tensor_input01 形状是 torch.Size([2, 4])，word_tensor_input01 中的每一个数字代表一个单词，该数字必须处于0~10, 通过embedding将每一个数字从一维转为三维
# word_tensor_input01 = Variable(torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]))  # 其中所有元素的数值必须在0~10之间，1、2、4、5、4、3、2、9 代表在词汇表(该词汇表中的单词总数为10)中的序号分别为1、2、4、5、4、3、2、9的单词
# print("MyEmbedding---->word_tensor_input01.shape = {0}----第一维度表示句子数量，第二维度表示句子长度".format(word_tensor_input01.shape))
# # print("PositionalEncoding---->word_tensor_input01 ={0}".format(word_tensor_input01))
# word_embedded01 = embedding01(word_tensor_input01)
# print("MyEmbedding---->word_embedded01.shape = {0}----第一维度表示句子数量，第二维度表示句子长度，第三维度表示每个单词的词向量维度".format(word_embedded01.shape))
# print("PositionalEncoding---->word_embedded01 ={0}".format(word_embedded01))
# print("-" * 200)
# # 输入 word_tensor_input02 形状是 torch.Size([2])，word_tensor_input02 中的每一个数字代表一个单词，该数字必须处于0~125, 通过embedding将每一个数字从一维转为四维
# embedding02 = nn.Embedding(125, 5, padding_idx=0)
# word_tensor_input02 = Variable(torch.LongTensor([99, 20]))  # 其中所有元素的数值必须在0~125之间，99、20代表在词汇表(该词汇表中的单词总数为125)中的序号分别为99、20的单词
# word_embedded02 = embedding02(word_tensor_input02)
# print("word_tensor_input02.shape = {0}\nword_tensor_input02 = {1}\nword_embedded02.shape = {2}\nword_embedded02 = {3}".format(word_tensor_input02.shape, word_tensor_input02, word_embedded02.shape, word_embedded02))
# print("-" * 200)
# # 输入 word_tensor_input03 维度为 torch.Size([1])，word_tensor_input03 中的每一个数字代表一个单词，该数字必须处于0~21356, 通过embedding将每一个数字从一维转为七维
# embedding03 = nn.Embedding(21356, 7, padding_idx=0)
# word_tensor_input03 = Variable(torch.LongTensor([12929]))  # 其中所有元素的数值必须在0~21356之间，12929代表在词汇表(该词汇表中的单词总数为21356)中的序号为12929的单词
# word_embedded03 = embedding03(word_tensor_input03)
# print("word_tensor_input03.shape = {0}\nword_tensor_input03 = {1}\nword_embedded03.shape = {2}\nword_embedded03 = {3}".format(word_tensor_input03.shape, word_tensor_input03, word_embedded03.shape, word_embedded03))
# Module-level side effect: prints a 200-character "=" separator line whenever this file is run or imported.
print("=" * 200)


# 二、位置编码器：我们同样把它看做一个层, 因此会继承nn.Module
# 2.1 构建位置编码器
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then dropout.

    A table ``pe`` of shape ``(1, max_len, embedding_dim)`` is precomputed
    and registered as a buffer, so it is saved and loaded with the module but
    is not a trainable parameter. Even columns hold ``sin`` values and odd
    columns hold ``cos`` values of ``position * div_term``.

    Args:
        embedding_dim: Size of the last dimension of the embeddings. Both
            even and odd values are supported.
        max_len: Number of positions to precompute.
        dropout: Dropout rate applied to the sum of embedding and encoding.
    """

    def __init__(self, embedding_dim, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, embedding_dim)
        # Explicit float dtype so the products below never depend on implicit
        # integer-to-float promotion.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(
            torch.arange(0, embedding_dim, 2, dtype=torch.float)
            * -(math.log(10000.0) / embedding_dim)
        )  # (ceil(embedding_dim / 2),)
        angles = position * div_term  # (max_len, ceil(embedding_dim / 2))

        pe[:, 0::2] = torch.sin(angles)
        # For an odd embedding_dim there is one fewer odd column than even
        # columns, so drop the surplus frequency instead of raising a shape
        # mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, word_embedded):
        """Return ``dropout(word_embedded + pe[:, :seq_len])``.

        Args:
            word_embedded: Embedded input; dimension 1 is used as the
                sequence length and the result must broadcast against a
                ``(1, seq_len, embedding_dim)`` slice of the table.

        Returns:
            Tensor of the same shape as ``word_embedded``.
        """
        # Keep only as many positions as the input actually has.
        pe = self.pe[:, :word_embedded.size(1)]
        # Buffers never require grad, so no Variable wrapper is needed.
        return self.dropout(word_embedded + pe)


# # 2.2 测试位置编码器
# print("=" * 100, "PositionalEncoding位置编码器测试", "=" * 100)
# dropout = DROPOUT  # 置0比率为0.1
# max_len = 60  # 句子最大长度
# word_embedded = word_embedded01  # 文本嵌入层的输出
# # 实例化PositionalEncoding层
# pe = PositionalEncoding(embedding_dim=word_embedded.size(2), max_len=max_len, dropout=dropout)
# pe_result = pe(word_embedded)
# print("PositionalEncoding---->pe_result.shape = {0}".format(pe_result.shape))
# # print("PositionalEncoding---->pe_result ={0}".format(pe_result))
# # 绘制词汇向量中特征的分布曲线
# plt.figure(figsize=(15, 5))  # 创建一张15 x 5大小的画布
# pe = PositionalEncoding(embedding_dim=20, dropout=0)  # 实例化PositionalEncoding类得到pe对象, 输入参数是20和0
# y = pe(Variable(torch.zeros(1, 100, 20)))  # 然后向pe传入被Variable封装的tensor, 这样pe会直接执行forward函数,且这个tensor里的数值都是0, 被处理后相当于位置编码张量
# plt.plot(np.arange(100), y[0, :, 4:8].data.numpy())  # 然后定义画布的横纵坐标, 横坐标到100的长度, 纵坐标是某一个词汇中的某维特征在不同长度下对应的值【因为总共有20维之多, 我们这里只查看4，5，6，7维的值.】
# plt.legend(["dim %d" % p for p in [4, 5, 6, 7]])  # 在画布上填写维度提示信息
# plt.savefig("./transformer_pe.png")  # 保存图像
# print("=" * 200)


# 三、注意力机制的实现【输入分别是query, key, value, mask(掩码张量), dropout是nn.Dropout层的实例化对象, 默认为None】
# 1 构建注意力机制
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        query: Tensor whose last dimension ``d_k`` is the feature size.
        key: Tensor whose last two dimensions are transposed and multiplied
            with ``query``.
        value: Tensor multiplied by the attention weights.
        mask: Optional tensor broadcastable to the score tensor; positions
            where ``mask == 0`` get the score ``-1e9`` before the softmax.
        dropout: Optional callable (e.g. an ``nn.Dropout`` instance) applied
            to the attention weights.

    Returns:
        Tuple ``(attention_weight, attention_result)``: the (possibly
        dropped-out) softmax weights and their product with ``value``.
    """
    scale = math.sqrt(query.size(-1))
    # Similarity of every query row with every key row, scaled by sqrt(d_k).
    logits = torch.matmul(query, key.transpose(-1, -2)) / scale

    if mask is not None:
        # A very negative score becomes (numerically) zero weight after softmax.
        logits = logits.masked_fill(mask == 0, -1e9)

    weights = torch.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return weights, torch.matmul(weights, value)


# # 2 测试注意力机制
# query = key = value = pe_result  # 我们令输入的query, key, value都相同【自注意机制】, 都等于位置编码的输出
# attention_weight01, attention_result01 = attention(query=query, key=key, value=value)
# print("query.shape = key.shape = value.shape {0}".format(query.shape))
# print("attention_weight01.shape = {0}".format(attention_weight01.shape))
# print("attention_weight01 = {0}".format(attention_weight01))
# print("attention_result01.shape = {0}".format(attention_result01.shape))
# print("attention_result01 = {0}".format(attention_result01))
# print("-" * 200)
# # 令mask为一个4x4的零张量
# mask = Variable(torch.zeros(4, 4))
# attention_weight02, attention_result02 = attention(query=query, key=key, value=value, mask=mask)
# print("attention_weight02.shape = {0}".format(attention_weight02.shape))
# print("attention_weight02 = {0}".format(attention_weight02))
# print("attention_result02.shape = {0}".format(attention_result02.shape))
# print("attention_result02 = {0}".format(attention_result02))
# print("=" * 200)




# 四、多头注意力类
# 1 构建多头注意力类
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built on the module-level ``attention`` function.

    ``self.my_linears`` holds four ``nn.Linear(embedding_dim, embedding_dim)``
    layers: index 0 projects the query, index 1 the key, index 2 the value,
    and index 3 projects the concatenated head outputs.

    Args:
        head_size: Number of attention heads (a count, not a per-head width).
        embedding_dim: Size of the last dimension of the inputs; must be
            divisible by ``head_size``.
        dropout: Dropout rate applied to the attention weights.

    Attributes refreshed on every ``forward`` call (kept for inspection):
        attention_weight: Per-head attention weights returned by ``attention``.
        attention_result: Per-head outputs after swapping dimensions 1 and 2.
        multi_headed_attention_result: Head outputs merged back to a last
            dimension of ``head_size * d_k``, before the output projection.
    """

    def __init__(self, head_size, embedding_dim, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        self.head_size = head_size
        # Every head receives an equal slice of the embedding, so the
        # embedding width has to be a multiple of the head count.
        assert embedding_dim % head_size == 0, "embedding_dim must be divisible by head_size"
        self.d_k = embedding_dim // head_size  # features handled by each head
        # Four square projections: query, key, value and final output.
        self.my_linears = clones(nn.Linear(embedding_dim, embedding_dim), 4)
        # Populated by forward(); None until the first call.
        self.attention_weight = None
        self.attention_result = None
        self.multi_headed_attention_result = None
        self.dropout = nn.Dropout(p=dropout)

    def _split_heads(self, projected, batch_size):
        """Reshape ``(batch, seq, head_size * d_k)`` to ``(batch, head_size, seq, d_k)``."""
        return projected.view(batch_size, -1, self.head_size, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention.

        Args:
            query, key, value: Tensors whose first dimension is the batch and
                whose last dimension is ``embedding_dim`` (assumed to be
                ``(batch, seq_len, embedding_dim)``).
            mask: Optional mask. A leading axis is added with
                ``unsqueeze(0)``; the result must broadcast against the
                ``(batch, head_size, query_len, key_len)`` score tensor.

        Returns:
            Tensor with last dimension ``head_size * d_k`` after the output
            projection.
        """
        if mask is not None:
            mask = mask.unsqueeze(0)
        batch_size = query.size(0)

        # Each input gets its OWN projection. Routing all three through
        # my_linears[0] would tie Q, K and V to one weight matrix and leave
        # my_linears[1] and my_linears[2] unused and untrained.
        query = self._split_heads(self.my_linears[0](query), batch_size)
        key = self._split_heads(self.my_linears[1](key), batch_size)
        value = self._split_heads(self.my_linears[2](value), batch_size)

        # Attention is computed independently per head over the last two dims.
        self.attention_weight, self.attention_result = attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # Undo the head split: (batch, head, seq, d_k) -> (batch, seq, head, d_k).
        self.attention_result = self.attention_result.transpose(1, 2)
        # contiguous() is required before view() on a transposed tensor.
        self.multi_headed_attention_result = self.attention_result.contiguous().view(
            batch_size, -1, self.head_size * self.d_k
        )

        # Final output projection.
        return self.my_linears[-1](self.multi_headed_attention_result)


# # 2 测试多头注意力类
# head_size = 3  # 头数head_size
# embedding_dim = EMBEDDING_DIM  # 词嵌入维度embedding_dim,要与上一个模块位置编码器的输出pe_result的最后一维保持一致
# dropout = DROPOUT  # 置零比率dropout
# query = value = key = pe_result  # 假设输入的Q，K，V仍然相等【torch.Size([2, 4, 6])】
# mask = Variable(torch.zeros(3, 4, 4))  # 输入的掩码张量mask
# self_attention_layer = MultiHeadedAttention(head_size=head_size, embedding_dim=embedding_dim, dropout=dropout)  # 实例化多头注意力类
# print("=" * 100, "MultiHeadedAttention多头注意力层测试", "=" * 100)
# multi_headed_attention_result = self_attention_layer(query=query, key=key, value=value, mask=mask)
# print("multi_headed_attention_result = {0}".format(multi_headed_attention_result))
# print("=" * 200)


# 5. Position-wise feed-forward layer, implemented by the FeedForward class below (the reference tutorial calls it PositionwiseFeedForward).
# 1 构建前馈全连接层类
class FeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        embedding_dim: Input width of the first linear layer and output width
            of the second, so the block preserves the last dimension.
        ff_middle_dim: Width between the two linear layers.
        dropout: Dropout rate applied after the ReLU.
    """

    def __init__(self, embedding_dim, ff_middle_dim, dropout=0.1):
        super().__init__()
        self.linear01 = nn.Linear(in_features=embedding_dim, out_features=ff_middle_dim)
        self.linear02 = nn.Linear(in_features=ff_middle_dim, out_features=embedding_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Apply the block to ``x`` (last dimension must be ``embedding_dim``)."""
        hidden = self.linear01(x)
        activated = F.relu(hidden)
        kept = self.dropout(activated)
        return self.linear02(kept)


# # 2 测试前馈全连接层类
# embedding_dim = EMBEDDING_DIM  # 要与输入数据最后一维的维度保持一致
# ff_middle_dim = FF_MIDDLE_DIM
# dropout = DROPOUT
# x = multi_headed_attention_result  # 多头注意力层的输出
# feed_forward_layer = FeedForward(embedding_dim, ff_middle_dim, dropout)
# print("=" * 100, "PositionwiseFeedForward前馈全连接层测试", "=" * 100)
# feed_forward_result = feed_forward_layer(x)
# print("feed_forward_result.shape = {0}".format(feed_forward_result.shape))
# print("feed_forward_result = {0}".format(feed_forward_result))
# print("=" * 200)


# 六、规范化层【通过MyLayerNorm实现规范化层的类】
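# 1. Build the normalization layer class [normalized value = (x - mean over the last dimension) / (standard deviation over the last dimension + eps), then multiplied by a and shifted by b]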
class MyLayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Computes ``a * (x - mean) / (std + eps) + b``, where ``mean`` and ``std``
    are taken over the last dimension of ``x`` (``std`` uses ``Tensor.std``
    with its default settings).

    Args:
        embedding_dim: Size of the last dimension; length of ``a`` and ``b``.
        eps: Small constant added to the standard deviation so the division
            never hits zero.
    """

    def __init__(self, embedding_dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        # Learnable scale (starts at 1) and shift (starts at 0): they let the
        # model adjust the normalised values instead of being locked to them.
        self.a = nn.Parameter(torch.ones(embedding_dim))
        self.b = nn.Parameter(torch.zeros(embedding_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension."""
        # keepdim=True keeps a trailing axis of size 1 so the statistics
        # broadcast against x.
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        centered = x - mean
        denominator = std + self.eps
        return self.a * (centered / denominator) + self.b


# # 2 测试规范化层
# embedding_dim = EMBEDDING_DIM
# eps = 1e-6
# x = feed_forward_result  # 前馈全连接层的输出
# myLayerNorm = MyLayerNorm(embedding_dim, eps)
# print("=" * 100, "MyLayerNorm规范化层测试", "=" * 100)
# layer_norm_result = myLayerNorm(x)
# print("layer_norm_result = {0}".format(layer_norm_result))
# print("=" * 200)


# 七、子层连接结构【使用SublayerConnection来实现子层连接结构的类】
# 1 子层连接结构类
class SublayerConnection(nn.Module):
    """Residual wrapper: ``x + dropout(sublayer_fn(layer_norm(x)))``.

    Args:
        embedding_dim: Size passed to the internal ``MyLayerNorm``.
        dropout: Dropout rate applied to the sub-layer output.
    """

    def __init__(self, embedding_dim, dropout=0.1):
        super().__init__()
        self.myLayerNorm = MyLayerNorm(embedding_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer_fn):
        """Apply ``sublayer_fn`` to the normalised input and add the skip path.

        Args:
            x: Input tensor (output of the previous layer or sub-layer).
            sublayer_fn: Callable taking the normalised tensor and returning
                a tensor that can be added to ``x``.

        Returns:
            ``x`` plus the dropped-out sub-layer output.
        """
        normalised = self.myLayerNorm(x)
        transformed = sublayer_fn(normalised)
        # Skip connection: the untouched input is added back.
        return x + self.dropout(transformed)


# # 2 测试子层连接结构
# embedding_dim = EMBEDDING_DIM
# dropout = DROPOUT
# head_size = HEAD_SIZE
# x = pe_result  # 令x为位置编码器的输出
# mask = Variable(torch.zeros(3, 4, 4))
# self_attention_layer = MultiHeadedAttention(head_size, embedding_dim)  # 假设子层中装的是多头注意力层, 实例化这个类
# sublayer_fn = lambda x: self_attention_layer(query=x, value=x, key=x, mask=mask)  # 使用lambda获得一个 “函数类型” 的子层【sublayer_fn是一个函数，输入为x，输出为mha(query=x, value=x, key=x, mask=mask)】
# sc = SublayerConnection(embedding_dim, dropout)  # 实例化子层连接结构
# print("=" * 100, "SublayerConnection测试", "=" * 100)
# sc_result = sc(x, sublayer_fn)  # 调用
# print("SublayerConnection---->sc_result.shape = {0}".format(sc_result.shape))
# print("SublayerConnection---->子层连接结构最终输出结果：sc_result = {0}".format(sc_result))


# 八、编码器层【使用EncoderLayer类实现编码器层】
# 1 构建编码器层
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention then feed-forward, each wrapped in a
    ``SublayerConnection``.

    Args:
        embedding_dim: Embedding width; stored so an enclosing ``Encoder`` can
            size its final normalisation layer.
        self_attention_layer: Callable invoked as
            ``layer(query=x, key=x, value=x, mask=mask)``.
        feed_forward_layer: Callable invoked as ``layer(x)``.
        dropout: Dropout rate for the two ``SublayerConnection`` wrappers.
    """

    def __init__(self, embedding_dim, self_attention_layer, feed_forward_layer, dropout):
        super().__init__()
        self.self_attention_layer = self_attention_layer
        self.feed_forward_layer = feed_forward_layer
        self.embedding_dim = embedding_dim
        # Two wrappers: index 0 for attention, index 1 for feed-forward.
        self.sublayer_connection = clones(SublayerConnection(embedding_dim, dropout), 2)

    def forward(self, x, mask):
        """Run ``x`` through both sub-layers.

        Args:
            x: Output of the previous layer.
            mask: Mask forwarded to the self-attention layer.
        """
        attention_wrapper = self.sublayer_connection[0]
        feed_forward_wrapper = self.sublayer_connection[1]

        # Self-attention: the same tensor acts as query, key and value.
        attended = attention_wrapper(
            x,
            lambda normed: self.self_attention_layer(query=normed, key=normed, value=normed, mask=mask),
        )
        return feed_forward_wrapper(attended, lambda normed: self.feed_forward_layer(normed))


# # 2 测试编码器层
# embedding_dim = EMBEDDING_DIM
# head_size = HEAD_SIZE
# ff_middle_dim = FF_MIDDLE_DIM
# dropout = DROPOUT
# x = pe_result
# mask = Variable(torch.zeros(3, 4, 4))
# print("=" * 100, "EncoderLayer测试", "=" * 100)
# self_attention_layer = MultiHeadedAttention(head_size, embedding_dim)  # 实例化多头注意力层
# feed_forward_layer = FeedForward(embedding_dim, ff_middle_dim, dropout)  # 实例化前馈全连接层
# encoder_layer = EncoderLayer(embedding_dim, self_attention_layer, feed_forward_layer, dropout)  # 实例化编码器层
# encoder_layer_result = encoder_layer(x, mask)
# print("EncoderLayer---->sc_result.shape = {0}".format(encoder_layer_result.shape))
# print("EncoderLayer---->sc_result = {0}".format(encoder_layer_result))


# 九、编码器【使用Encoder类来实现编码器】
# 1 构建编码器
class Encoder(nn.Module):
    """Stack of ``N`` cloned encoder layers followed by a ``MyLayerNorm``.

    Args:
        encoderLayer: Prototype layer; cloned ``N`` times. Its
            ``embedding_dim`` attribute sizes the final normalisation layer.
        N: Number of encoder layers.
    """

    def __init__(self, encoderLayer, N):
        super().__init__()
        self.encoderLayers = clones(encoderLayer, N)
        # Final normalisation applied after the last encoder layer.
        self.myLayerNorm = MyLayerNorm(encoderLayer.embedding_dim)

    def forward(self, source_embedded_x, mask):
        """Feed the input through every layer in order, then normalise.

        Args:
            source_embedded_x: Embedded source input.
            mask: Mask passed unchanged to every encoder layer.
        """
        hidden = source_embedded_x
        for layer in self.encoderLayers:
            # Each layer consumes the previous layer's output.
            hidden = layer(hidden, mask)
        return self.myLayerNorm(hidden)


# # 2 测试编码器
# embedding_dim = EMBEDDING_DIM
# head_size = HEAD_SIZE
# ff_middle_dim = FF_MIDDLE_DIM
# dropout = DROPOUT
# N = 3
# mask = Variable(torch.zeros(3, 4, 4))
# my_deep_copy = copy.deepcopy
# print("=" * 100, "Encoder测试", "=" * 100)
# # 实例化各个组件
# self_attention_layer = MultiHeadedAttention(head_size, embedding_dim)  # 实例化多头注意力层
# feed_forward_layer = FeedForward(embedding_dim, ff_middle_dim, dropout)  # 实例化前馈全连接层
# encoder_layer = EncoderLayer(embedding_dim, my_deep_copy(self_attention_layer), my_deep_copy(feed_forward_layer), dropout)  # 实例化编码器层【每一编码器层里的多头注意力层、前馈全连接层都是不同对象，得用深拷贝】
# encoder = Encoder(encoder_layer, N)  # 实例化编码器
# # 调用编码器进行编码
# x = pe_result
# encoder_result = encoder(source_embedded_x=x, mask=mask)
# print("Encoder---->encoder_result.shape = {0}".format(encoder_result.shape))
# print("Encoder---->Transformer模型编码器最终输出：encoder_result = {0}".format(encoder_result))


# 十、解码器层【使用DecoderLayer的类实现解码器层：作为解码器的组成单元, 每个解码器层根据给定的输入向目标方向进行特征提取操作，即解码过程.】
# 1 构建解码器层【最终输出由”编码器层的最终输出“、”目标语言数据张量“一同作为解码器层的输入的特征提取结果】
class DecoderLayer(nn.Module):
    """One decoder layer: masked self-attention, attention over the encoder
    output, then feed-forward — each wrapped in a ``SublayerConnection``.

    Args:
        embedding_dim: Embedding width; stored so an enclosing ``Decoder`` can
            size its final normalisation layer.
        self_attention_layer: Callable used with query = key = value = x.
        src_attention_layer: Callable used with query = x and
            key = value = ``memory``.
        feed_forward_layer: Callable invoked as ``layer(x)``.
        dropout: Dropout rate for the three ``SublayerConnection`` wrappers.
    """

    def __init__(self, embedding_dim, self_attention_layer, src_attention_layer, feed_forward_layer, dropout=0.1):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.self_attention_layer = self_attention_layer
        self.src_attention_layer = src_attention_layer
        self.feed_forward_layer = feed_forward_layer
        # Three wrappers, one per sub-layer, in the order they are applied.
        self.sublayer_connection = clones(SublayerConnection(embedding_dim, dropout), 3)

    def forward(self, x, memory, source_mask, target_mask):
        """Run ``x`` through the three sub-layers.

        Args:
            x: Output of the previous layer (embedded target input).
            memory: Final encoder output, used as key and value in the second
                sub-layer.
            source_mask: Mask passed to the encoder-decoder attention.
            target_mask: Mask passed to the self-attention; it is meant to
                hide target positions the model must not look at yet.
        """
        self_wrapper = self.sublayer_connection[0]
        source_wrapper = self.sublayer_connection[1]
        feed_forward_wrapper = self.sublayer_connection[2]

        # 1) Self-attention over the target side.
        after_self = self_wrapper(
            x,
            lambda normed: self.self_attention_layer(query=normed, key=normed, value=normed, mask=target_mask),
        )
        # 2) Attention from the target side onto the encoder output.
        after_source = source_wrapper(
            after_self,
            lambda normed: self.src_attention_layer(query=normed, key=memory, value=memory, mask=source_mask),
        )
        # 3) Feed-forward.
        return feed_forward_wrapper(after_source, lambda normed: self.feed_forward_layer(normed))


# # 2 测试解码器层
# embedding_dim = EMBEDDING_DIM
# head_size = HEAD_SIZE
# ff_middle_dim = FF_MIDDLE_DIM
# dropout = DROPOUT
# mask = Variable(torch.zeros(3, 4, 4))
# my_deep_copy = copy.deepcopy
# print("=" * 100, "DecoderLayer测试", "=" * 100)
# # 实例化各个组件
# self_attention_layer = MultiHeadedAttention(head_size, embedding_dim)  # 实例化多头自注意力层
# src_attention_layer = MultiHeadedAttention(head_size, embedding_dim)  # 实例化多头常规注意力层
# feed_forward_layer = FeedForward(embedding_dim, ff_middle_dim, dropout)  # 实例化前馈全连接层
# decoder_layer = DecoderLayer(embedding_dim=embedding_dim, self_attention_layer=self_attention_layer, src_attention_layer=src_attention_layer, feed_forward_layer=feed_forward_layer, dropout=dropout)
# # 初始化解码器层的各个输入
# x = pe_result  # x是来自目标语言数据的词嵌入表示, 但形式和源语言数据的词嵌入表示相同, 这里使用pe_result充当.
# memory = encoder_result  # memory是来自编码器的输出
# source_mask = target_mask = mask  # 实际中source_mask和target_mask并不相同, 这里为了方便计算使他们都为mask
# # 调用解码器层进行解码
# decoder_layer_result = decoder_layer(x, memory, source_mask, target_mask)
# print("DecoderLayer---->decoder_layer_result.shape = {0}".format(decoder_layer_result.shape))
# print("DecoderLayer---->decoder_layer_result = {0}".format(decoder_layer_result))


# 十一、解码器【根据”编码器层的最终输出“&“解码器上一次预测的结果” 对下一次可能出现的'值'进行特征表示】
# 1 构建解码器
class Decoder(nn.Module):
    """Stack of ``N`` cloned decoder layers followed by a ``MyLayerNorm``.

    Args:
        decoderLayers: Prototype decoder layer; cloned ``N`` times. Its
            ``embedding_dim`` attribute sizes the final normalisation layer.
        N: Number of decoder layers.
    """

    def __init__(self, decoderLayers, N):
        super().__init__()
        self.decoderLayers = clones(decoderLayers, N)
        # Final normalisation applied after the last decoder layer.
        self.myLayerNorm = MyLayerNorm(decoderLayers.embedding_dim)

    def forward(self, target_embedded_x, memory, source_mask, target_mask):
        """Feed the input through every decoder layer in order, then normalise.

        Args:
            target_embedded_x: Embedded target input.
            memory: Final encoder output, passed to every layer.
            source_mask: Source-side mask, passed to every layer.
            target_mask: Target-side mask, passed to every layer.
        """
        hidden = target_embedded_x
        for layer in self.decoderLayers:
            # Each layer consumes the previous layer's output.
            hidden = layer(hidden, memory, source_mask, target_mask)
        return self.myLayerNorm(hidden)


# # 2 测试解码器
# embedding_dim = EMBEDDING_DIM
# head_size = HEAD_SIZE
# ff_middle_dim = FF_MIDDLE_DIM
# dropout = DROPOUT
# N = 3
# mask = Variable(torch.zeros(3, 4, 4))
# my_deep_copy = copy.deepcopy
# print("=" * 100, "Decoder测试", "=" * 100)
# # 实例化各个组件
# self_attention_layer = MultiHeadedAttention(head_size, embedding_dim)  # 实例化多头自注意力层
# src_attention_layer = MultiHeadedAttention(head_size, embedding_dim)  # 实例化多头常规注意力层
# feed_forward_layer = FeedForward(embedding_dim, ff_middle_dim, dropout)  # 实例化前馈全连接层
# decoder_layer = DecoderLayer(embedding_dim=embedding_dim, self_attention_layer=my_deep_copy(self_attention_layer), src_attention_layer=my_deep_copy(src_attention_layer), feed_forward_layer=my_deep_copy(feed_forward_layer), dropout=dropout)
# decoder = Decoder(decoder_layer, N)
# # 初始化解码器层的各个输入
# x = pe_result  # x是来自目标语言数据的词嵌入表示, 但形式和源语言数据的词嵌入表示相同, 这里使用pe_result充当.
# memory = encoder_result  # memory是来自编码器的输出
# source_mask = target_mask = mask  # 实际中source_mask和target_mask并不相同, 这里为了方便计算使他们都为mask
# # 调用
# decoder_result = decoder(target_embedded_x=x, memory=memory, source_mask=source_mask, target_mask=target_mask)
# print("Decoder---->decoder_result.shape = {0}".format(decoder_result.shape))
# print("Decoder---->Transformer模型解码器最终输出：decoder_result = {0}".format(decoder_result))


# 十二、输出部分
# 1 构建输出部分
class Generator(nn.Module):
    """Output head: linear projection to the vocabulary, then log-softmax.

    Args:
        embedding_dim: Size of the last dimension of the input.
        target_vocab_size: Number of output classes (target vocabulary size).
    """

    def __init__(self, embedding_dim, target_vocab_size):
        super().__init__()
        self.project = nn.Linear(in_features=embedding_dim, out_features=target_vocab_size)

    def forward(self, x):
        """Return log-probabilities over the target vocabulary.

        Args:
            x: Tensor whose last dimension is ``embedding_dim``.

        Returns:
            Tensor with last dimension ``target_vocab_size``; exponentiating
            it gives values that sum to 1 along that dimension.
        """
        logits = self.project(x)
        # log is monotonic, so the arg-max is the same as for plain softmax.
        return F.log_softmax(logits, dim=-1)


# # 2 测试输出部分
# embedding_dim = EMBEDDING_DIM
# target_vocab_size = 1000  # 目标词表大小是1000
# x = decoder_result  # 输入x是上一层网络的输出, 我们使用来自解码器层的输出
# generator = Generator(embedding_dim, target_vocab_size)
# print("=" * 100, "Generator测试", "=" * 100)
# generator_result = generator(x)
# print("Generator---->generator_result.shape = {0}".format(generator_result.shape))
# print("Generator---->Transformer模型输出部分的最终输出：generator_result = {0}".format(generator_result))


# 十三、编码器-解码器结构
# 1 构建编码器-解码器结构
class EncoderDecoder(nn.Module):
    """Container that wires the embedding, encoder and decoder sub-modules together.

    The generator is stored on the instance but is NOT applied inside ``forward``;
    callers apply ``model.generator`` to the decoder output themselves.
    """

    def __init__(self, encoder, decoder, source_embedding_fn, target_embedding_fn, generator):
        """Store the five components.

        Args:
            encoder: callable invoked as ``encoder(source_embedded, source_mask)``.
            decoder: callable invoked as ``decoder(target_embedded, memory, source_mask, target_mask)``.
            source_embedding_fn: callable mapping source token ids to embeddings.
            target_embedding_fn: callable mapping target token ids to embeddings.
            generator: output head; kept as an attribute for callers.
        """
        super().__init__()
        # Assignment order is kept: it defines sub-module registration (and parameter) order.
        self.encoder = encoder
        self.decoder = decoder
        self.source_embedding_fn = source_embedding_fn
        self.target_embedding_fn = target_embedding_fn
        self.generator = generator

    def forward(self, source_tensor, target_tensor, source_mask, target_mask):
        """Encode the source, then decode the target against it; return the decoder output."""
        memory = self.encode(source_tensor=source_tensor, source_mask=source_mask)
        return self.decode(
            target_tensor=target_tensor,
            memory=memory,
            source_mask=source_mask,
            target_mask=target_mask,
        )

    def encode(self, source_tensor, source_mask):
        """Embed the source ids and pass them, with the source mask, through the encoder."""
        return self.encoder(self.source_embedding_fn(source_tensor), source_mask)

    def decode(self, target_tensor, memory, source_mask, target_mask):
        """Embed the target ids and run the decoder against the encoder output ``memory``."""
        return self.decoder(self.target_embedding_fn(target_tensor), memory, source_mask, target_mask)


# # 2 测试编码器-解码器结构
# embedding_dim = EMBEDDING_DIM
# max_len = MAX_LEN
# dropout = DROPOUT
# head_size = HEAD_SIZE
# ff_middle_dim = FF_MIDDLE_DIM
# source_vocab_size = VOCAB_SIZE
# target_vocab_size = VOCAB_SIZE
# N = 3
# mask = Variable(torch.zeros(3, 4, 4))
# my_deep_copy = copy.deepcopy
# print("=" * 100, "EncoderDecoder测试", "=" * 100)
# # 实例化各个组件
# source_embedding_fn = MyEmbedding(source_vocab_size, embedding_dim)  # 实例化编码器的Embedding对象
# target_embedding_fn = MyEmbedding(target_vocab_size, embedding_dim)  # 实例化解码器的Embedding对象
# position_encoder = PositionalEncoding(embedding_dim=embedding_dim, max_len=max_len, dropout=dropout)
# self_attention_layer = MultiHeadedAttention(head_size, embedding_dim)  # 实例化多头自注意力层
# src_attention_layer = MultiHeadedAttention(head_size, embedding_dim)  # 实例化多头常规注意力层
# feed_forward_layer = FeedForward(embedding_dim, ff_middle_dim, dropout)  # 实例化前馈全连接层
# encoder_layer = EncoderLayer(embedding_dim, my_deep_copy(self_attention_layer), my_deep_copy(feed_forward_layer), dropout)  # 实例化编码器层【每一编码器层里的多头注意力层、前馈全连接层都是不同对象，得用深拷贝】
# decoder_layer = DecoderLayer(embedding_dim=embedding_dim, self_attention_layer=my_deep_copy(self_attention_layer), src_attention_layer=my_deep_copy(src_attention_layer), feed_forward_layer=my_deep_copy(feed_forward_layer), dropout=dropout)
# encoder = Encoder(encoder_layer, N)  # 实例化编码器
# decoder = Decoder(decoder_layer, N)  # 实例化解码器
# generator = Generator(embedding_dim, target_vocab_size)  # 实例化输出部分
# encoder_decoder = EncoderDecoder(encoder, decoder, source_embedding_fn, target_embedding_fn, position_encoder, generator)  # 实例化编码器-解码器结构
# # 初始化各个输入
# source_tensor = target_tensor = Variable(torch.LongTensor([[100, 2, 421, 508], [491, 998, 1, 221]]))  # 假设源数据与目标数据相同, 实际中并不相同
# source_mask = target_mask = mask  # 实际中source_mask和target_mask并不相同, 这里为了方便计算使他们都为mask
# # 调用
# encoder_decoder_result = encoder_decoder(source_tensor=source_tensor, target_tensor=target_tensor, source_mask=source_mask, target_mask=target_mask)
# print("EncoderDecoder---->encoder_decoder_result.shape = {0}".format(encoder_decoder_result.shape))
# print("EncoderDecoder---->Transformer模型整体最终输出：encoder_decoder_result = {0}".format(encoder_decoder_result))


# 十四、Transformer模型构建函数
# 1 该函数用来构建模型, 有8个参数，【source_vocab_size: 源数据特征(词汇)总数; target_vocab_size: 目标数据特征(词汇)总数; N: 编码器和解码器堆叠数; max_len: 传给位置编码器PositionalEncoding的max_len参数; embedding_dim: 词向量映射维度; head_size: 多头注意力结构中的多头数; ff_middle_dim: 前馈全连接层中变换矩阵的维度; dropout: 置零比率】
def build_model(source_vocab_size, target_vocab_size, N=2, max_len=64, embedding_dim=512, head_size=8, ff_middle_dim=2048, dropout=0.1):
    """Assemble an ``EncoderDecoder`` Transformer and initialise its weight matrices.

    Args:
        source_vocab_size: vocabulary size handed to the source ``MyEmbedding``.
        target_vocab_size: vocabulary size handed to the target ``MyEmbedding`` and the ``Generator``.
        N: stack depth handed to ``Encoder`` and ``Decoder``.
        max_len: forwarded to ``PositionalEncoding`` as ``max_len``.
        embedding_dim: model width shared by every component.
        head_size: number of attention heads handed to ``MultiHeadedAttention``.
        ff_middle_dim: hidden width handed to ``FeedForward``.
        dropout: dropout rate handed to the layers that accept one.

    Returns:
        The constructed model. Every parameter with more than one dimension has been
        re-initialised with Xavier/Glorot uniform; 1-D parameters (e.g. biases) keep
        whatever initialisation their own layer gave them.
    """
    # Prototype layers are deep-copied at every use so no two sub-layers share parameters.
    my_deep_copy = copy.deepcopy
    attention_layer = MultiHeadedAttention(head_size=head_size, embedding_dim=embedding_dim)
    feed_forward_layer = FeedForward(embedding_dim=embedding_dim, ff_middle_dim=ff_middle_dim, dropout=dropout)
    position_encoder = PositionalEncoding(embedding_dim=embedding_dim, max_len=max_len, dropout=dropout)
    # EncoderDecoder takes, in order: encoder, decoder, source embedding pipeline,
    # target embedding pipeline, generator.
    model = EncoderDecoder(
        # Encoder layer: self-attention + feed-forward.
        Encoder(EncoderLayer(embedding_dim, my_deep_copy(attention_layer), my_deep_copy(feed_forward_layer), dropout), N),
        # Decoder layer: self-attention + source attention + feed-forward.
        Decoder(DecoderLayer(embedding_dim, my_deep_copy(attention_layer), my_deep_copy(attention_layer), my_deep_copy(feed_forward_layer), dropout), N),
        # Token embedding followed by positional encoding, one pipeline per side.
        nn.Sequential(MyEmbedding(source_vocab_size, embedding_dim), my_deep_copy(position_encoder)),
        nn.Sequential(MyEmbedding(target_vocab_size, embedding_dim), my_deep_copy(position_encoder)),
        Generator(embedding_dim, target_vocab_size))

    # Use the in-place initialiser xavier_uniform_; the un-suffixed xavier_uniform is a
    # deprecated alias that only emits a warning before doing the same thing.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model


# # 2 测试Transfomer模型构建函数
# source_vocab_size = 12000
# target_vocab_size = 15000
# # 其他参数都使用默认值
# transformer_model = build_model(source_vocab_size, target_vocab_size)
# print("transformer_model = \n{0}".format(transformer_model))


# 十五、工具函数
# 1 克隆函数, 因为在多头注意力机制的实现中, 用到多个结构相同的线性层. 我们将使用clone函数将他们一同初始化在一个网络层列表对象中. 之后的结构中也会用到该函数.
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy gives each replica its own parameters instead of sharing the original's.
        replicas.append(copy.deepcopy(module))
    return replicas

# 2 构建掩码张量
def subsequent_mask(size):
    """Build a ``(1, size, size)`` mask that is truthy on and below the main diagonal.

    Row ``i`` is truthy for columns ``0..i`` only, so position ``i`` cannot see later positions.
    """
    # Ones strictly above the diagonal mark the "future" positions ...
    future_positions = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    # ... and comparing with 0 flips that into "allowed" positions.
    return torch.from_numpy(future_positions) == 0


# 测试：生成20×20的掩码张量
# size = 20  # 设置生成的掩码张量的最后两维的大小
# sm = subsequent_mask(size)
# print("sm = \n{0}".format(sm))
# plt.figure(figsize=(5, 5))
# plt.imshow(sm[0])
# print("=" * 200)

# 3 Batch【Object for holding a batch of data with mask during training. 它能够对原始样本数据生成对应批次的掩码张量】
class Batch:
    """Hold one batch of source/target ids together with the masks derived from them.

    Attributes set in every case: ``src`` and ``src_mask``.
    Attributes set only when ``trg`` is given: ``trg`` (all but the last column),
    ``trg_y`` (all but the first column), ``trg_mask`` and ``ntokens``
    (count of non-pad entries in ``trg_y``).
    """

    def __init__(self, src, trg=None, pad=0):
        print("使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：\nsrc={0}\ntrg={1}\npad={2}".format(src, trg, pad))
        self.src = src
        # Truthy where the source is not padding; the extra axis is inserted before the last one.
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is None:
            return

        # Shift by one: the decoder is fed columns [0, n-1) and is scored against columns [1, n).
        decoder_input = trg[:, :-1]
        decoder_target = trg[:, 1:]
        self.trg = decoder_input
        self.trg_y = decoder_target
        self.trg_mask = self.make_std_mask(decoder_input, pad)
        self.ntokens = (decoder_target != pad).data.sum()
        print("Batch---->self.trg.shape={0}".format(self.trg.shape))
        print("Batch---->self.trg_y.shape={0}".format(self.trg_y.shape))
        print("Batch---->self.trg_mask.shape={0}".format(self.trg_mask.shape))
        print("Batch---->self.ntokens={0}".format(self.ntokens))

    @staticmethod
    def make_std_mask(tgt, pad):
        """Combine the padding mask of ``tgt`` with the no-look-ahead mask."""
        not_padding = (tgt != pad).unsqueeze(-2)
        no_lookahead = Variable(subsequent_mask(tgt.size(-1)).type_as(not_padding.data))
        return not_padding & no_lookahead


# 4 优化器生成函数
class NoamOpt:
    """Optimizer wrapper that sets the learning rate from a warm-up / decay schedule.

    For step ``s`` the rate is
    ``factor * model_size ** -0.5 * min(s ** -0.5, s * warmup ** -1.5)``.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        """Store the schedule constants and the wrapped optimizer.

        Args:
            model_size: model width used in the ``model_size ** -0.5`` scale term.
            factor: constant multiplier applied to the whole schedule.
            warmup: step count used in the ``step * warmup ** -1.5`` term.
            optimizer: object exposing ``param_groups`` (a list of dicts) and ``step()``.
        """
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        """Advance the step counter, write the new rate into every param group, then step."""
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the scheduled learning rate for ``step`` (defaults to the current step).

        A non-positive step returns 0.0. Evaluating the formula at step 0 would compute
        ``0 ** -0.5`` and raise ZeroDivisionError, which made ``rate()`` unusable before
        the first ``step()``. 0.0 is also the value of the ``step * warmup ** -1.5`` term
        at step 0.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * (self.model_size ** (-0.5) * min(step ** (-0.5), step * self.warmup ** (-1.5)))


# 优化器工具包 get_std_optimizer, 该工具用于获得标准的针对Transformer标准化的模型优化器 【该标准优化器基于Adam优化器, 使其对序列到序列的任务更有效.】
def get_std_optimizer(model):
    """Wrap an Adam optimizer over ``model.parameters()`` in a ``NoamOpt`` schedule.

    The schedule's model size is read from the first element of
    ``model.source_embedding_fn``; factor and warm-up are fixed at 2 and 4000.
    """
    width = model.source_embedding_fn[0].embedding_dim
    # lr starts at 0 because NoamOpt overwrites it on every step.
    adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    return NoamOpt(model_size=width, factor=2, warmup=4000, optimizer=adam)


# 5 标签平滑损失函数, 该工具用于标签平滑, 标签平滑的作用就是小幅度的改变原有标签值的值域【因为在理论上即使是人工的标注数据也可能并非完全正确, 会受到一些外界因素的影响而产生一些微小的偏差。因此使用标签平滑来弥补这种偏差, 减少模型对某一条规律的绝对认知, 以防止过拟合】
class CriterionWithLabelSmoothing(nn.Module):
    """Summed KL-divergence loss against a label-smoothed target distribution.

    For each row the true class receives ``1 - smoothing`` and every other class
    receives ``smoothing / (size - 2)``. The ``padding_idx`` column is then set to 0,
    and rows whose target equals ``padding_idx`` are zeroed entirely so they add
    nothing to the loss.
    """

    def __init__(self, size, padding_idx, smoothing=0.0):
        """Configure the criterion.

        Args:
            size: number of classes; must equal ``predict.size(1)`` in ``forward``.
                ``forward`` divides by ``size - 2``, so ``size == 2`` raises ZeroDivisionError.
            padding_idx: class index treated as padding.
            smoothing: probability mass taken from the true class and spread over the others.
        """
        super(CriterionWithLabelSmoothing, self).__init__()
        # reduction='sum' is the supported spelling of the deprecated size_average=False.
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None  # last smoothed target distribution, kept for inspection

    def forward(self, predict, target):
        """Return the summed KL divergence between ``predict`` and the smoothed targets.

        Args:
            predict: 2-D tensor of log-probabilities with ``size`` columns.
            target: 1-D tensor of class indices, one per row of ``predict``.

        Raises:
            AssertionError: if ``predict.size(1)`` differs from ``size``.
        """
        assert predict.size(1) == self.size
        # Built from predict.data, so the target distribution carries no gradient.
        true_dist = predict.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        # squeeze(-1) keeps the row index 1-D even when exactly one row is padding;
        # a bare squeeze() would collapse that case to a 0-dim tensor.
        padded_rows = torch.nonzero(target.data == self.padding_idx).squeeze(-1)
        if padded_rows.numel() > 0:
            true_dist.index_fill_(0, padded_rows, 0.0)
        self.true_dist = true_dist
        criterion_result = self.criterion(predict, true_dist)
        print("CriterionWithLabelSmoothing---->predict.shape = {0}----true_dist.shape = {1}----criterion_result = {2}".format(predict.shape, true_dist.shape, criterion_result))
        return criterion_result


# # 测试标签平滑损失函数
# crit = CriterionWithLabelSmoothing(size=5, padding_idx=0, smoothing=0.5)
# predict = Variable(torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0], [0, 0.2, 0.7, 0.1, 0], [0, 0.2, 0.7, 0.1, 0]]))    # 假定一个任意的模型最后Softmax输出预测结果
# target = Variable(torch.LongTensor([2, 1, 0]))  # 标签的表示值是0，1，2
# crit(predict, target)  # 将predict, target传入到对象中
# plt.imshow(crit.true_dist)


# 6  损失计算工具包, 该工具能够进行损失的计算, 损失的计算方法可以认为是交叉熵损失函数.
class SimpleLossCompute:
    """Callable that applies the generator, computes the loss, backpropagates and optionally steps."""

    def __init__(self, generator, criterion, opt=None):
        """Store the output head, the criterion and an optional optimizer wrapper.

        ``opt``, when given, must expose ``step()`` and ``optimizer.zero_grad()``.
        """
        self.generator, self.criterion, self.opt = generator, criterion, opt

    def __call__(self, x, y, norm):
        """Return the un-normalised loss for one batch.

        ``x`` is passed through the generator, flattened to 2-D and scored against the
        flattened ``y``. The loss divided by ``norm`` is backpropagated in every case;
        the optimizer is stepped and its gradients cleared only when ``opt`` is set.
        """
        scores = self.generator(x)
        flat_scores = scores.contiguous().view(-1, scores.size(-1))
        flat_labels = y.contiguous().view(-1)
        normalised = self.criterion(flat_scores, flat_labels) / norm
        normalised.backward()
        if self.opt is None:
            return normalised.data * norm
        self.opt.step()
        self.opt.optimizer.zero_grad()
        return normalised.data * norm


# 7 模型单轮训练工具包run_epoch, 该工具将对模型使用给定的损失函数计算方法进行单轮参数更新.并打印每轮参数更新的损失结果.
def run_epoch(data_iter, model, loss_compute):
    """Run one pass over ``data_iter`` and return the average loss per token.

    Args:
        data_iter: iterable of batch objects exposing ``src``, ``trg``, ``src_mask``,
            ``trg_mask``, ``trg_y`` and ``ntokens``.
        model: callable invoked as ``model(src, trg, src_mask, trg_mask)``.
        loss_compute: callable invoked as ``loss_compute(out, trg_y, ntokens)``; it
            returns this batch's loss and performs any parameter update itself.

    Returns:
        Total loss divided by total token count over the whole pass.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields nothing, since both totals stay at the int 0.
    """
    epoch_start_time = time.time()
    total_tokens = 0
    total_loss = 0
    for batch_index, batch in enumerate(data_iter):
        batch_start_time = time.time()
        print("run_epoch---->batch.ntokens = {0}".format(batch.ntokens))
        # Call the model object rather than model.forward so nn.Module hooks are run.
        out = model(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        batch_end_time = time.time()
        timeused_of_this_batch = batch_end_time - batch_start_time
        print("run_epoch---->batch_index = %d----Loss = %f----batch.ntokens = %f----batch average Loss = %f----当前batch所用时间 = %f 秒\n\n" % (batch_index, loss, int(batch.ntokens), loss / batch.ntokens, timeused_of_this_batch))
    avg_loss_of_this_epoch = total_loss / total_tokens
    epoch_end_time = time.time()
    timeused_of_this_epoch = epoch_end_time - epoch_start_time
    print("run_epoch---->avg_loss_of_this_epoch = %f----当前epoch所用时间 = %f 秒" % (avg_loss_of_this_epoch, timeused_of_this_epoch))
    return avg_loss_of_this_epoch


# 8 贪婪解码(贪心算法)的方式是每次预测都选择概率最大的结果作为输出, 它不一定能获得全局最优性, 但却拥有最高的执行效率.
def greedy_decode(model, source_tensor, source_mask, max_len, start_symbol):
    """Decode step by step, always appending the highest-scoring next token.

    The source is encoded once. The output starts as a 1x1 tensor holding
    ``start_symbol`` and grows by one column per iteration, for ``max_len - 1``
    iterations. Only the first row's prediction is read at each step, so this
    handles a single sequence at a time.
    """
    encoded_source = model.encode(source_tensor, source_mask)
    print("greedy_decode---->start_symbol = {0}".format(start_symbol))
    decoded = torch.ones(1, 1).fill_(start_symbol).type_as(source_tensor.data)
    print("greedy_decode---->target_tensor = {0}".format(decoded))
    for _ in range(max_len - 1):
        # The mask is rebuilt every step because the decoded prefix grows by one column.
        prefix_mask = Variable(subsequent_mask(decoded.size(1)).type_as(source_tensor.data))
        hidden = model.decode(
            target_tensor=Variable(decoded),
            memory=encoded_source,
            source_mask=source_mask,
            target_mask=prefix_mask,
        )
        # Score only the last position; it predicts the token that comes next.
        scores = model.generator(hidden[:, -1])
        best_index = torch.max(scores, dim=1)[1].data[0]
        new_column = torch.ones(1, 1).type_as(source_tensor.data).fill_(best_index)
        decoded = torch.cat([decoded, new_column], dim=1)
        print("greedy_decode---->target_tensor = {0}".format(decoded))
    return decoded


# 十六、Transformer模型基本测试【copy任务】
# copy任务：任务描述: 针对数字序列进行学习, 学习的最终目标是使输出与输入的序列相同. 如输入[1, 5, 8, 9, 3], 输出也是[1, 5, 8, 9, 3].
# 任务意义: copy任务在模型基础测试中具有重要意义，因为copy操作对于模型来讲是一条明显规律, 因此模型能否在短时间内，小数据集中学会它，可以帮助我们断定模型所有过程是否正常，是否已具备基本学习能力.

# 1 第一步: 数据集生成器
# 1.1 构建数据集生成器
def data_generator(vocab_size, batch_size, max_len, batch_amount_per_epoch):
    """Yield ``batch_amount_per_epoch`` random ``Batch`` objects for the copy task.

    Args:
        vocab_size: exclusive upper bound of the random token ids (ids are drawn from ``[1, vocab_size)``).
        batch_size: number of rows (sequences) per batch.
        max_len: number of columns per row, including the leading start marker.
        batch_amount_per_epoch: how many batches to yield before the generator is exhausted.

    Source and target are built from the same data because the task is to copy the input.
    """
    batch_shape = (batch_size, max_len)
    for _ in range(batch_amount_per_epoch):
        token_ids = torch.from_numpy(np.random.randint(1, vocab_size, size=batch_shape))
        # Column 0 is forced to 1: it is the start marker fed to the decoder on its first step.
        token_ids[:, 0] = 1
        # Sample data needs no gradient, hence requires_grad=False.
        source_tensor = Variable(token_ids, requires_grad=False).long()
        target_tensor = Variable(token_ids, requires_grad=False).long()
        # Batch derives the masks; yielding keeps only one batch alive at a time.
        yield Batch(source_tensor, target_tensor)


# 1.2 数据集生成器
vocab_size = 11  # exclusive upper bound of the token ids: data_generator draws from [1, vocab_size), i.e. 1..10
batch_size = 20  # rows (sequences) per batch for the generator created just below
max_len = 10  # columns per row, including the leading start marker
batch_amount_per_epoch = 30  # number of batches the generator created just below will yield
# Calling data_generator only creates a generator object; nothing is produced until it is iterated.
# NOTE(review): this generator is not consumed anywhere in the visible part of the file (run() builds its own).
batch_result = data_generator(vocab_size=vocab_size, batch_size=batch_size, max_len=max_len, batch_amount_per_epoch=batch_amount_per_epoch)

# 2 Step two: obtain the Transformer model, its optimizer and its loss function
model = build_model(source_vocab_size=vocab_size, target_vocab_size=vocab_size)  # copy task: source and target share one vocabulary size; all other arguments use build_model's defaults
print("model = {0}".format(model))
model_optimizer = get_std_optimizer(model)  # NoamOpt-wrapped Adam over the model's parameters
total_params = sum(p.numel() for p in model.parameters())  # total number of parameter elements
print("模型参数总数量 = {0}".format(total_params ))
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)  # elements of parameters that require gradients
print("模型可训练参数总数量 = {0}".format(total_trainable_params ))  # alternative formatting: print(f'{total_trainable_params:,} training parameters.')
criterion = CriterionWithLabelSmoothing(size=vocab_size, padding_idx=0, smoothing=0.0)  # smoothing=0.0, so the true class keeps confidence 1.0
loss = SimpleLossCompute(model.generator, criterion, model_optimizer)  # loss callable that also backpropagates and steps model_optimizer


# 3 第三步: 运行模型进行训练和评估
def run(model, loss, epochs=2):
    """Train and evaluate ``model`` on freshly generated copy-task data for ``epochs`` rounds.

    Args:
        model: the model to train; switched between ``train()`` and ``eval()`` mode each epoch.
        loss: the training ``SimpleLossCompute`` (it carries the optimizer). Its
            ``generator`` and ``criterion`` are reused to build the evaluation loss.
        epochs: number of train + evaluate rounds.

    The evaluation phase used to receive the same optimizer-carrying ``loss`` object,
    so every evaluation batch also updated the weights. It now uses a loss computer
    with no optimizer, and the model is no longer trained on its evaluation data.
    """
    # Same generator and criterion, but no optimizer: evaluation must not step the weights.
    eval_loss = SimpleLossCompute(loss.generator, loss.criterion, opt=None)
    for epoch in range(epochs):
        print("=" * 100, "epoch={0}".format(epoch), "=" * 100)
        print("-" * 50, "epoch={0}：开始训练".format(epoch), "-" * 50)
        model.train()  # training mode (nn.Module built-in)
        # Training: 3 batches per epoch, 8 rows per batch, 10 columns per row (start marker + 9 tokens).
        run_epoch(data_generator(vocab_size=vocab_size, batch_size=8, max_len=10, batch_amount_per_epoch=3), model, loss)
        print("-" * 50, "epoch={0}：开始测试".format(epoch), "-" * 50)
        model.eval()  # evaluation mode (nn.Module built-in)
        # Evaluation: 2 batches per epoch, same batch shape as training.
        run_epoch(data_generator(vocab_size=vocab_size, batch_size=8, max_len=10, batch_amount_per_epoch=2), model, eval_loss)
        # SimpleLossCompute still calls backward() during evaluation; clear those gradients
        # so they cannot leak into the next training step.
        model.zero_grad()


# Train and evaluate with run()'s default number of epochs.
run(model, loss)

# 4 Step four: greedy decoding with the trained model
print("=" * 100, "贪婪解码", "=" * 100)
model.eval()  # switch the model to evaluation mode before decoding
source_tensor = Variable(torch.LongTensor([[1, 3, 2, 5, 4, 6, 7, 8, 9, 10]]))  # one hand-written input sequence starting with the start marker 1
source_mask = Variable(torch.ones(1, 1, 10))  # all-ones mask of shape (1, 1, 10); per the original author's note, 1 means "not masked"
# Greedy decoding appends the highest-scoring token at every step: fast, but not guaranteed to be globally optimal.
predict_tensor = greedy_decode(model, source_tensor, source_mask, max_len=10, start_symbol=1)
print("\n贪婪解码最终结果：predict_tensor = {0}".format(predict_tensor))

打印结果：

model = EncoderDecoder(
  (encoder): Encoder(
    (encoderLayers): ModuleList(
      (0): EncoderLayer(
        (self_attention_layer): MultiHeadedAttention(
          (my_linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_layer): FeedForward(
          (linear01): Linear(in_features=512, out_features=2048, bias=True)
          (linear02): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer_connection): ModuleList(
          (0): SublayerConnection(
            (myLayerNorm): MyLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): SublayerConnection(
            (myLayerNorm): MyLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (1): EncoderLayer(
        (self_attention_layer): MultiHeadedAttention(
          (my_linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_layer): FeedForward(
          (linear01): Linear(in_features=512, out_features=2048, bias=True)
          (linear02): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer_connection): ModuleList(
          (0): SublayerConnection(
            (myLayerNorm): MyLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): SublayerConnection(
            (myLayerNorm): MyLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (myLayerNorm): MyLayerNorm()
  )
  (decoder): Decoder(
    (decoderLayers): ModuleList(
      (0): DecoderLayer(
        (self_attention_layer): MultiHeadedAttention(
          (my_linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (src_attention_layer): MultiHeadedAttention(
          (my_linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_layer): FeedForward(
          (linear01): Linear(in_features=512, out_features=2048, bias=True)
          (linear02): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer_connection): ModuleList(
          (0): SublayerConnection(
            (myLayerNorm): MyLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): SublayerConnection(
            (myLayerNorm): MyLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (2): SublayerConnection(
            (myLayerNorm): MyLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (1): DecoderLayer(
        (self_attention_layer): MultiHeadedAttention(
          (my_linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (src_attention_layer): MultiHeadedAttention(
          (my_linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_layer): FeedForward(
          (linear01): Linear(in_features=512, out_features=2048, bias=True)
          (linear02): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer_connection): ModuleList(
          (0): SublayerConnection(
            (myLayerNorm): MyLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): SublayerConnection(
            (myLayerNorm): MyLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (2): SublayerConnection(
            (myLayerNorm): MyLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (myLayerNorm): MyLayerNorm()
  )
  (source_embedding_fn): Sequential(
    (0): MyEmbedding(
      (embedding): Embedding(11, 512)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (target_embedding_fn): Sequential(
    (0): MyEmbedding(
      (embedding): Embedding(11, 512)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (generator): Generator(
    (project): Linear(in_features=512, out_features=11, bias=True)
  )
)
模型参数总数量 = 14731787
模型可训练参数总数量 = 14731787
==================================================================================================== epoch=0 ====================================================================================================
-------------------------------------------------- epoch=0：开始训练 --------------------------------------------------
使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：
src=tensor([[ 1,  5,  1, 10,  5,  3,  6,  6, 10, 10],
        [ 1,  4,  4,  9,  6,  8,  2,  3,  5, 10],
        [ 1,  2,  4,  2,  9,  4,  6,  3,  5,  2],
        [ 1,  5,  5,  6,  8,  1,  2,  3, 10, 10],
        [ 1,  9,  6,  6,  5,  2,  9,  1,  1,  4],
        [ 1,  4,  6,  5,  1,  2,  9,  6,  5,  4],
        [ 1,  9,  3,  3,  2,  6,  1,  2,  4,  9],
        [ 1,  5,  1,  4,  9,  4,  2,  4,  7, 10]])
trg=tensor([[ 1,  5,  1, 10,  5,  3,  6,  6, 10, 10],
        [ 1,  4,  4,  9,  6,  8,  2,  3,  5, 10],
        [ 1,  2,  4,  2,  9,  4,  6,  3,  5,  2],
        [ 1,  5,  5,  6,  8,  1,  2,  3, 10, 10],
        [ 1,  9,  6,  6,  5,  2,  9,  1,  1,  4],
        [ 1,  4,  6,  5,  1,  2,  9,  6,  5,  4],
        [ 1,  9,  3,  3,  2,  6,  1,  2,  4,  9],
        [ 1,  5,  1,  4,  9,  4,  2,  4,  7, 10]])
pad=0
Batch---->self.trg.shape=torch.Size([8, 9])
Batch---->self.trg_y.shape=torch.Size([8, 9])
Batch---->self.trg_mask.shape=torch.Size([8, 9, 9])
Batch---->self.ntokens=72
run_epoch---->batch.ntokens = 72
CriterionWithLabelSmoothing---->predict.shape = torch.Size([72, 11])----true_dist.shape = torch.Size([72, 11])----criterion_result = 229.94061279296875
run_epoch---->batch_index = 0----Loss = 229.940613----batch.ntokens = 72.000000----batch average Loss = 3.193620----当前batch所用时间 = 0.197447 秒
使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：
src=tensor([[ 1,  3,  9,  2,  6,  3,  8,  9,  5,  2],
        [ 1,  9,  2,  7,  2,  4,  4,  4,  3,  3],
        [ 1,  5,  8,  2,  4,  3,  6, 10,  1,  3],
        [ 1,  7, 10,  8,  1,  1,  5,  4,  5,  7],
        [ 1,  3,  2,  7, 10,  2,  8,  1,  4,  2],
        [ 1,  5,  6,  5, 10,  9,  4, 10,  8,  2],
        [ 1,  8, 10,  5,  3,  4,  1,  2,  1, 10],
        [ 1,  7,  8, 10, 10,  4,  7,  6,  8,  8]])
trg=tensor([[ 1,  3,  9,  2,  6,  3,  8,  9,  5,  2],
        [ 1,  9,  2,  7,  2,  4,  4,  4,  3,  3],
        [ 1,  5,  8,  2,  4,  3,  6, 10,  1,  3],
        [ 1,  7, 10,  8,  1,  1,  5,  4,  5,  7],
        [ 1,  3,  2,  7, 10,  2,  8,  1,  4,  2],
        [ 1,  5,  6,  5, 10,  9,  4, 10,  8,  2],
        [ 1,  8, 10,  5,  3,  4,  1,  2,  1, 10],
        [ 1,  7,  8, 10, 10,  4,  7,  6,  8,  8]])
pad=0
Batch---->self.trg.shape=torch.Size([8, 9])
Batch---->self.trg_y.shape=torch.Size([8, 9])
Batch---->self.trg_mask.shape=torch.Size([8, 9, 9])
Batch---->self.ntokens=72
run_epoch---->batch.ntokens = 72
CriterionWithLabelSmoothing---->predict.shape = torch.Size([72, 11])----true_dist.shape = torch.Size([72, 11])----criterion_result = 241.04693603515625
run_epoch---->batch_index = 1----Loss = 241.046936----batch.ntokens = 72.000000----batch average Loss = 3.347874----当前batch所用时间 = 0.159575 秒
使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：
src=tensor([[ 1,  8,  1,  8,  8,  6,  3,  6,  5,  1],
        [ 1,  7,  3,  9,  2,  8,  7,  8,  7,  1],
        [ 1, 10,  1,  4,  6,  3,  9,  2,  7, 10],
        [ 1,  4,  3,  1,  9, 10, 10,  3,  4,  4],
        [ 1,  3, 10,  4,  4,  4,  7,  4,  5,  7],
        [ 1,  5,  1,  4,  3,  4,  6,  7,  2,  7],
        [ 1,  5,  3,  8,  8,  5,  1,  4, 10,  8],
        [ 1,  6,  3,  1,  2,  8,  5,  6, 10,  7]])
trg=tensor([[ 1,  8,  1,  8,  8,  6,  3,  6,  5,  1],
        [ 1,  7,  3,  9,  2,  8,  7,  8,  7,  1],
        [ 1, 10,  1,  4,  6,  3,  9,  2,  7, 10],
        [ 1,  4,  3,  1,  9, 10, 10,  3,  4,  4],
        [ 1,  3, 10,  4,  4,  4,  7,  4,  5,  7],
        [ 1,  5,  1,  4,  3,  4,  6,  7,  2,  7],
        [ 1,  5,  3,  8,  8,  5,  1,  4, 10,  8],
        [ 1,  6,  3,  1,  2,  8,  5,  6, 10,  7]])
pad=0
Batch---->self.trg.shape=torch.Size([8, 9])
Batch---->self.trg_y.shape=torch.Size([8, 9])
Batch---->self.trg_mask.shape=torch.Size([8, 9, 9])
Batch---->self.ntokens=72
run_epoch---->batch.ntokens = 72
CriterionWithLabelSmoothing---->predict.shape = torch.Size([72, 11])----true_dist.shape = torch.Size([72, 11])----criterion_result = 217.42254638671875
run_epoch---->batch_index = 2----Loss = 217.422546----batch.ntokens = 72.000000----batch average Loss = 3.019758----当前batch所用时间 = 0.194479 秒
run_epoch---->avg_loss_of_this_epoch = 3.187084----当前epoch所用时间 = 0.558478 秒
-------------------------------------------------- epoch=0：开始测试 --------------------------------------------------
使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：
src=tensor([[ 1,  5,  6,  1,  9,  8,  7, 10,  1,  9],
        [ 1,  9,  9,  3, 10,  5,  8,  7,  6,  8],
        [ 1,  8, 10,  8,  2,  6, 10,  4,  9,  6],
        [ 1, 10,  7,  6,  8,  6,  1,  4,  5,  1],
        [ 1,  1,  9,  4,  5,  2,  7,  8,  9,  1],
        [ 1, 10,  1,  2,  9,  6,  1, 10,  7,  3],
        [ 1,  4,  6,  4,  8,  2,  2, 10, 10,  8],
        [ 1,  3,  3,  3,  3,  8,  5,  6, 10,  5]])
trg=tensor([[ 1,  5,  6,  1,  9,  8,  7, 10,  1,  9],
        [ 1,  9,  9,  3, 10,  5,  8,  7,  6,  8],
        [ 1,  8, 10,  8,  2,  6, 10,  4,  9,  6],
        [ 1, 10,  7,  6,  8,  6,  1,  4,  5,  1],
        [ 1,  1,  9,  4,  5,  2,  7,  8,  9,  1],
        [ 1, 10,  1,  2,  9,  6,  1, 10,  7,  3],
        [ 1,  4,  6,  4,  8,  2,  2, 10, 10,  8],
        [ 1,  3,  3,  3,  3,  8,  5,  6, 10,  5]])
pad=0
Batch---->self.trg.shape=torch.Size([8, 9])
Batch---->self.trg_y.shape=torch.Size([8, 9])
Batch---->self.trg_mask.shape=torch.Size([8, 9, 9])
Batch---->self.ntokens=72
run_epoch---->batch.ntokens = 72
CriterionWithLabelSmoothing---->predict.shape = torch.Size([72, 11])----true_dist.shape = torch.Size([72, 11])----criterion_result = 216.03460693359375
run_epoch---->batch_index = 0----Loss = 216.034607----batch.ntokens = 72.000000----batch average Loss = 3.000481----当前batch所用时间 = 0.149601 秒
使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：
src=tensor([[ 1,  2,  4,  7,  8,  7,  8,  3,  5,  1],
        [ 1,  7,  4,  5,  4,  7,  9,  4,  1,  5],
        [ 1,  6,  2,  8,  9, 10,  9, 10,  8,  2],
        [ 1, 10,  3,  3,  5,  4,  3,  1,  5,  7],
        [ 1,  5,  1,  7,  3, 10,  7,  3,  1,  7],
        [ 1,  2,  2, 10,  1,  1,  5,  4, 10,  7],
        [ 1,  5,  2, 10, 10,  3,  6,  4,  9,  8],
        [ 1,  6,  3,  1, 10,  3,  5,  4,  8,  2]])
trg=tensor([[ 1,  2,  4,  7,  8,  7,  8,  3,  5,  1],
        [ 1,  7,  4,  5,  4,  7,  9,  4,  1,  5],
        [ 1,  6,  2,  8,  9, 10,  9, 10,  8,  2],
        [ 1, 10,  3,  3,  5,  4,  3,  1,  5,  7],
        [ 1,  5,  1,  7,  3, 10,  7,  3,  1,  7],
        [ 1,  2,  2, 10,  1,  1,  5,  4, 10,  7],
        [ 1,  5,  2, 10, 10,  3,  6,  4,  9,  8],
        [ 1,  6,  3,  1, 10,  3,  5,  4,  8,  2]])
pad=0
Batch---->self.trg.shape=torch.Size([8, 9])
Batch---->self.trg_y.shape=torch.Size([8, 9])
Batch---->self.trg_mask.shape=torch.Size([8, 9, 9])
Batch---->self.ntokens=72
run_epoch---->batch.ntokens = 72
CriterionWithLabelSmoothing---->predict.shape = torch.Size([72, 11])----true_dist.shape = torch.Size([72, 11])----criterion_result = 222.1366424560547
run_epoch---->batch_index = 1----Loss = 222.136642----batch.ntokens = 72.000000----batch average Loss = 3.085231----当前batch所用时间 = 0.182512 秒
run_epoch---->avg_loss_of_this_epoch = 3.042856----当前epoch所用时间 = 0.337099 秒
==================================================================================================== epoch=1 ====================================================================================================
-------------------------------------------------- epoch=1：开始训练 --------------------------------------------------
使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：
src=tensor([[ 1,  9,  4,  9,  1,  4,  7,  6, 10,  5],
        [ 1,  6,  2,  7,  9,  9,  4, 10,  3,  7],
        [ 1, 10, 10,  6,  3,  6,  2, 10,  9, 10],
        [ 1,  8,  1, 10, 10,  1,  1,  4,  2,  5],
        [ 1, 10,  9,  3,  3,  2,  3,  1,  1,  2],
        [ 1,  5,  7,  7,  2,  5,  8,  2,  4,  9],
        [ 1,  5,  5,  1,  6, 10,  9,  7,  8,  3],
        [ 1,  3,  9, 10,  8,  1,  2,  3,  3,  6]])
trg=tensor([[ 1,  9,  4,  9,  1,  4,  7,  6, 10,  5],
        [ 1,  6,  2,  7,  9,  9,  4, 10,  3,  7],
        [ 1, 10, 10,  6,  3,  6,  2, 10,  9, 10],
        [ 1,  8,  1, 10, 10,  1,  1,  4,  2,  5],
        [ 1, 10,  9,  3,  3,  2,  3,  1,  1,  2],
        [ 1,  5,  7,  7,  2,  5,  8,  2,  4,  9],
        [ 1,  5,  5,  1,  6, 10,  9,  7,  8,  3],
        [ 1,  3,  9, 10,  8,  1,  2,  3,  3,  6]])
pad=0
Batch---->self.trg.shape=torch.Size([8, 9])
Batch---->self.trg_y.shape=torch.Size([8, 9])
Batch---->self.trg_mask.shape=torch.Size([8, 9, 9])
Batch---->self.ntokens=72
run_epoch---->batch.ntokens = 72
CriterionWithLabelSmoothing---->predict.shape = torch.Size([72, 11])----true_dist.shape = torch.Size([72, 11])----criterion_result = 229.82168579101562
run_epoch---->batch_index = 0----Loss = 229.821686----batch.ntokens = 72.000000----batch average Loss = 3.191968----当前batch所用时间 = 0.151594 秒
使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：
src=tensor([[ 1,  3,  4,  6,  5,  9,  4,  3,  4,  5],
        [ 1,  1,  3,  1,  2,  7,  7,  9,  1,  3],
        [ 1,  9,  6,  7,  3,  8,  3,  2,  7,  3],
        [ 1,  4,  2,  1, 10,  6,  7,  7, 10,  7],
        [ 1,  2,  2, 10,  3,  8,  9, 10,  6,  1],
        [ 1, 10,  3,  1,  4,  9,  2,  4,  4,  7],
        [ 1, 10,  6,  4,  1,  7,  2,  9,  8,  9],
        [ 1,  6,  5,  9, 10,  8,  6, 10,  6,  8]])
trg=tensor([[ 1,  3,  4,  6,  5,  9,  4,  3,  4,  5],
        [ 1,  1,  3,  1,  2,  7,  7,  9,  1,  3],
        [ 1,  9,  6,  7,  3,  8,  3,  2,  7,  3],
        [ 1,  4,  2,  1, 10,  6,  7,  7, 10,  7],
        [ 1,  2,  2, 10,  3,  8,  9, 10,  6,  1],
        [ 1, 10,  3,  1,  4,  9,  2,  4,  4,  7],
        [ 1, 10,  6,  4,  1,  7,  2,  9,  8,  9],
        [ 1,  6,  5,  9, 10,  8,  6, 10,  6,  8]])
pad=0
Batch---->self.trg.shape=torch.Size([8, 9])
Batch---->self.trg_y.shape=torch.Size([8, 9])
Batch---->self.trg_mask.shape=torch.Size([8, 9, 9])
Batch---->self.ntokens=72
run_epoch---->batch.ntokens = 72
CriterionWithLabelSmoothing---->predict.shape = torch.Size([72, 11])----true_dist.shape = torch.Size([72, 11])----criterion_result = 226.7887420654297
run_epoch---->batch_index = 1----Loss = 226.788742----batch.ntokens = 72.000000----batch average Loss = 3.149844----当前batch所用时间 = 0.228657 秒
使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：
src=tensor([[ 1,  8,  1, 10,  5, 10,  4,  4,  8,  4],
        [ 1, 10, 10,  7,  3,  9,  7,  2,  3,  3],
        [ 1,  6,  7,  3,  5,  5,  5,  6,  7,  7],
        [ 1,  4,  5,  5,  9,  4,  1,  3,  7,  8],
        [ 1,  8,  7,  3,  8,  6,  8,  1,  5,  6],
        [ 1,  8,  1,  6,  8, 10, 10,  9,  3,  8],
        [ 1,  8,  4,  8,  1,  7,  7, 10, 10,  1],
        [ 1,  3,  4,  2,  9,  9,  8,  1,  7,  9]])
trg=tensor([[ 1,  8,  1, 10,  5, 10,  4,  4,  8,  4],
        [ 1, 10, 10,  7,  3,  9,  7,  2,  3,  3],
        [ 1,  6,  7,  3,  5,  5,  5,  6,  7,  7],
        [ 1,  4,  5,  5,  9,  4,  1,  3,  7,  8],
        [ 1,  8,  7,  3,  8,  6,  8,  1,  5,  6],
        [ 1,  8,  1,  6,  8, 10, 10,  9,  3,  8],
        [ 1,  8,  4,  8,  1,  7,  7, 10, 10,  1],
        [ 1,  3,  4,  2,  9,  9,  8,  1,  7,  9]])
pad=0
Batch---->self.trg.shape=torch.Size([8, 9])
Batch---->self.trg_y.shape=torch.Size([8, 9])
Batch---->self.trg_mask.shape=torch.Size([8, 9, 9])
Batch---->self.ntokens=72
run_epoch---->batch.ntokens = 72
CriterionWithLabelSmoothing---->predict.shape = torch.Size([72, 11])----true_dist.shape = torch.Size([72, 11])----criterion_result = 202.9710235595703
run_epoch---->batch_index = 2----Loss = 202.971024----batch.ntokens = 72.000000----batch average Loss = 2.819042----当前batch所用时间 = 0.162577 秒
run_epoch---->avg_loss_of_this_epoch = 3.053618----当前epoch所用时间 = 0.547813 秒
-------------------------------------------------- epoch=1：开始测试 --------------------------------------------------
使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：
src=tensor([[ 1,  8,  5,  8,  8,  3,  6,  1, 10, 10],
        [ 1,  6,  9,  8,  6,  3,  9,  6,  4,  6],
        [ 1,  3,  5,  4,  2, 10,  2,  3,  3,  8],
        [ 1,  4,  6,  3,  5,  7,  2,  8,  5,  7],
        [ 1,  4,  3, 10,  5,  9,  5,  3,  3,  5],
        [ 1, 10,  7,  7,  3,  4,  9,  4, 10,  7],
        [ 1,  1,  3,  6,  9,  2, 10,  6,  2,  4],
        [ 1,  2,  7,  1,  7,  7,  2,  5, 10,  6]])
trg=tensor([[ 1,  8,  5,  8,  8,  3,  6,  1, 10, 10],
        [ 1,  6,  9,  8,  6,  3,  9,  6,  4,  6],
        [ 1,  3,  5,  4,  2, 10,  2,  3,  3,  8],
        [ 1,  4,  6,  3,  5,  7,  2,  8,  5,  7],
        [ 1,  4,  3, 10,  5,  9,  5,  3,  3,  5],
        [ 1, 10,  7,  7,  3,  4,  9,  4, 10,  7],
        [ 1,  1,  3,  6,  9,  2, 10,  6,  2,  4],
        [ 1,  2,  7,  1,  7,  7,  2,  5, 10,  6]])
pad=0
Batch---->self.trg.shape=torch.Size([8, 9])
Batch---->self.trg_y.shape=torch.Size([8, 9])
Batch---->self.trg_mask.shape=torch.Size([8, 9, 9])
Batch---->self.ntokens=72
run_epoch---->batch.ntokens = 72
CriterionWithLabelSmoothing---->predict.shape = torch.Size([72, 11])----true_dist.shape = torch.Size([72, 11])----criterion_result = 216.8260498046875
run_epoch---->batch_index = 0----Loss = 216.826050----batch.ntokens = 72.000000----batch average Loss = 3.011473----当前batch所用时间 = 0.194037 秒
使用Batch工具类组装data_generator()函数生成的source_tensor、target_tensor：
src=tensor([[ 1,  4,  3, 10,  9,  6, 10,  2,  7,  5],
        [ 1,  7,  1, 10,  9,  3,  1,  4,  4,  1],
        [ 1,  6,  6,  2, 10, 10,  3,  5,  7, 10],
        [ 1,  6,  9,  4,  3,  2,  9, 10,  8,  6],
        [ 1,  5,  7,  4,  2,  6,  8,  3,  5,  1],
        [ 1,  1,  1,  5,  7, 10,  6,  3,  3,  6],
        [ 1,  2,  2,  9, 10,  1,  8,  1, 10,  8],
        [ 1,  9,  6,  9,  1,  3,  8,  9,  4,  8]])
trg=tensor([[ 1,  4,  3, 10,  9,  6, 10,  2,  7,  5],
        [ 1,  7,  1, 10,  9,  3,  1,  4,  4,  1],
        [ 1,  6,  6,  2, 10, 10,  3,  5,  7, 10],
        [ 1,  6,  9,  4,  3,  2,  9, 10,  8,  6],
        [ 1,  5,  7,  4,  2,  6,  8,  3,  5,  1],
        [ 1,  1,  1,  5,  7, 10,  6,  3,  3,  6],
        [ 1,  2,  2,  9, 10,  1,  8,  1, 10,  8],
        [ 1,  9,  6,  9,  1,  3,  8,  9,  4,  8]])
pad=0
Batch---->self.trg.shape=torch.Size([8, 9])
Batch---->self.trg_y.shape=torch.Size([8, 9])
Batch---->self.trg_mask.shape=torch.Size([8, 9, 9])
Batch---->self.ntokens=72
run_epoch---->batch.ntokens = 72
CriterionWithLabelSmoothing---->predict.shape = torch.Size([72, 11])----true_dist.shape = torch.Size([72, 11])----criterion_result = 221.12046813964844
run_epoch---->batch_index = 1----Loss = 221.120468----batch.ntokens = 72.000000----batch average Loss = 3.071118----当前batch所用时间 = 0.152593 秒
run_epoch---->avg_loss_of_this_epoch = 3.041295----当前epoch所用时间 = 0.351615 秒
==================================================================================================== 贪婪解码 ====================================================================================================
greedy_decode---->start_symbol = 1
greedy_decode---->target_tensor = tensor([[1]])
greedy_decode---->target_tensor = tensor([[1, 1]])
greedy_decode---->target_tensor = tensor([[1, 1, 1]])
greedy_decode---->target_tensor = tensor([[ 1,  1,  1, 10]])
greedy_decode---->target_tensor = tensor([[ 1,  1,  1, 10,  4]])
greedy_decode---->target_tensor = tensor([[ 1,  1,  1, 10,  4,  5]])
greedy_decode---->target_tensor = tensor([[ 1,  1,  1, 10,  4,  5,  5]])
greedy_decode---->target_tensor = tensor([[ 1,  1,  1, 10,  4,  5,  5,  5]])
greedy_decode---->target_tensor = tensor([[ 1,  1,  1, 10,  4,  5,  5,  5,  5]])
greedy_decode---->target_tensor = tensor([[ 1,  1,  1, 10,  4,  5,  5,  5,  5,  5]])
贪婪解码最终结果：predict_tensor = tensor([[ 1,  1,  1, 10,  4,  5,  5,  5,  5,  5]])

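上面的训练轮数很少，每个epoch的平均损失始终停留在3.0左右，模型尚未学会copy规律，所以贪婪解码的结果（1, 1, 1, 10, 4, 5, 5, 5, 5, 5）后半段退化为重复输出同一个数字。如果设置训练epochs=100，则贪婪解码结果为：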

==================================================================================================== 贪婪解码 ====================================================================================================
greedy_decode---->start_symbol = 1
greedy_decode---->target_tensor = tensor([[1]])
greedy_decode---->target_tensor = tensor([[1, 3]])
greedy_decode---->target_tensor = tensor([[1, 3, 2]])
greedy_decode---->target_tensor = tensor([[1, 3, 2, 5]])
greedy_decode---->target_tensor = tensor([[1, 3, 2, 5, 4]])
greedy_decode---->target_tensor = tensor([[1, 3, 2, 5, 4, 6]])
greedy_decode---->target_tensor = tensor([[1, 3, 2, 5, 4, 6, 7]])
greedy_decode---->target_tensor = tensor([[1, 3, 2, 5, 4, 6, 7, 8]])
greedy_decode---->target_tensor = tensor([[1, 3, 2, 5, 4, 6, 7, 8, 9]])
greedy_decode---->target_tensor = tensor([[ 1,  3,  2,  5,  4,  6,  7,  8,  9, 10]])
贪婪解码最终结果：predict_tensor = tensor([[ 1,  3,  2,  5,  4,  6,  7,  8,  9, 10]])

深度学习-自然语言处理(NLP)-Pytorch：Transformer模型源码分析【自定义构建Transformer模型（Copy任务）】

猜你喜欢