BERT PyTorch 源码分析：一、嵌入层

# Token embedding: the plain lookup-table embedding layer.
# Takes word IDs and returns word vectors by delegating to `nn.Embedding`.
class TokenEmbedding(nn.Embedding):
    """Lookup table mapping word IDs to word vectors.

    A thin subclass of ``nn.Embedding`` that fixes index 0 as the padding
    index.
    """

    def __init__(self, vocab_size, embed_size=512):
        """
        :param vocab_size: number of rows in the lookup table
        :param embed_size: length of each word vector
        """
        super().__init__(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
            padding_idx=0,
        )
		
		
# Segment embedding: in effect a sentence embedding.
# Takes the ID of the sentence each word belongs to,
# e.g. [0, ..., 0, 1, ..., 1, 2, ..., 2], and returns a sentence vector.
class SegmentEmbedding(nn.Embedding):
    """Lookup table mapping segment (sentence) IDs to vectors.

    The table has three rows. Row 0 is the padding index, which leaves two
    usable IDs (1 and 2) for actual sentences.
    """

    def __init__(self, embed_size=512):
        """
        :param embed_size: length of each segment vector
        """
        super().__init__(
            num_embeddings=3,
            embedding_dim=embed_size,
            padding_idx=0,
        )

# Positional embedding: takes the batch of word IDs (only its sequence length
# is used) and returns one fixed vector per position.
class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional embedding.

    A table of shape ``1 x max_len x d_model`` is computed once in the
    constructor and registered as the buffer ``pe``; it is not a parameter
    and is never trained.
    """

    def __init__(self, d_model, max_len=512):
        """
        :param d_model: length of each positional vector (odd values allowed)
        :param max_len: number of positions to precompute
        """
        super().__init__()

        # Embedding table, max_len x d_model, initialised to zeros.
        pe = torch.zeros(max_len, d_model).float()
        # Do not track gradients for the table. The tensor attribute is
        # `requires_grad`; the misspelling `require_grad` only created an
        # unrelated attribute and had no effect.
        pe.requires_grad = False

        # Positions 0 .. max_len-1 as a column vector (max_len x 1).
        position = torch.arange(0, max_len).float().unsqueeze(1)
        # Per-column frequency term: 10000 ** (-2i / d_model), computed in
        # log space; one entry per even column index, i.e. ceil(d_model / 2).
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        # Outer product via broadcasting: max_len x ceil(d_model / 2).
        angles = position * div_term

        # Even columns hold the sine, odd columns the cosine of the angles
        # (the sinusoidal scheme from "Attention Is All You Need").
        pe[:, 0::2] = torch.sin(angles)
        # There are only d_model // 2 odd columns, one fewer than the even
        # columns when d_model is odd, so trim the angles to match.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Reshape to 1 x max_len x d_model: positions are the same for every
        # sentence, so the leading axis broadcasts over the batch.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        :param x: tensor whose second dimension is the sequence length
                  (batch_size x seq_len); its values are not read
        :return: the first ``seq_len`` positional vectors,
                 shape 1 x seq_len x d_model
        """
        return self.pe[:, :x.size(1)]

# The complete embedding layer, combining the three parts above.
class BERTEmbedding(nn.Module):
    """
    BERT embedding, built as the element-wise sum of three components:
        1. TokenEmbedding : ordinary embedding matrix over the vocabulary
        2. PositionalEmbedding : positional information encoded with sin, cos
        3. SegmentEmbedding : sentence segment info (sent_A:1, sent_B:2)

    Dropout is applied to the sum, and the result is the layer's output.
    """

    def __init__(self, vocab_size, embed_size, dropout=0.1):
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of token embedding
        :param dropout: dropout rate
        """
        super().__init__()
        # Build the token table first; the other two embeddings take their
        # width from it so that all three vectors can be summed.
        self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        hidden = self.token.embedding_dim
        self.position = PositionalEmbedding(d_model=hidden)
        self.segment = SegmentEmbedding(embed_size=hidden)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence, segment_label):
        """
        :param sequence: word IDs, batch_size x seq_len
        :param segment_label: sentence IDs, same shape as ``sequence``
        :return: summed embeddings after dropout
        """
        token_vecs = self.token(sequence)
        # The positional table is 1 x seq_len x dim and broadcasts over the batch.
        position_vecs = self.position(sequence)
        segment_vecs = self.segment(segment_label)
        summed = token_vecs + position_vecs + segment_vecs
        return self.dropout(summed)
BERT PyTorch 源码分析：一、嵌入层

猜你喜欢