基于Pytorch+Bert的预训练模型

# !/usr/bin/env Python3
# -*- coding: utf-8 -*-
# @version: v1.0
# @Author   : Meng Li
# @contact: [email protected]
# @FILE     : Torch_bert.py
# @Time     : 2022/7/7 14:32
# @Software : PyCharm
# @site: 
# @Description :  自己实现的Bert模型
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import re
import random
import numpy as np
import math

text = (
    'Hello, how are you? I am Romeo.\n'  # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n'  # J
    'Nice meet you too. How are you today?\n'  # R
    'Great. My baseball team won the competition.\n'  # J
    'Oh Congratulations, Juliet\n'  # R
    'Thank you Romeo\n'  # J
    'Where are you going today?\n'  # R
    'I am going shopping. What about you?\n'  # J
    'I am going to visit my grandmother. she is not very well'  # R
)
sentence = re.sub("[,.!?\\-]", "", text.lower()).split("\n")  # 去除字符串中的".,!?-"
vocab = " ".join([i for i in sentence])
vocab = list(set([i for i in vocab.split(" ")]))
word2idx = {'MASK': 0, 'CLS': 1, 'SEQ': 2, 'PAD': 3}
for i in range(len(vocab)):
    word2idx[vocab[i]] = i + 4
idx2word = {i: j for i, j in enumerate(word2idx)}
vocab_size = len(idx2word)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

token_list = []
for i in range(len(sentence)):
    token_list.append([word2idx[j] for j in sentence[i].split(" ")])

max_len = 30  # 句子最长长度
num_pred = 5  # 最长掩码长度
batch_size = 6  # batch大小
n_layers = 6
embedding_size = 768  # 向量的embed维度
segments_len = 2
embed_size = 768
dim = 64
num_heads = 12
d_ff = 64
dropout = 0.5


class my_dataset(Dataset):
    def __init__(self, input_ids, segment_ids, masked_pos, masked_tokens, isNext):
        super().__init__()
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.masked_pos = masked_pos
        self.masked_tokens = masked_tokens
        self.isNext = isNext

    def __getitem__(self, index):
        return self.input_ids[index], self.segment_ids[index], self.masked_pos[index], self.masked_tokens[index], \
               self.isNext[index]

    def __len__(self):
        return self.input_ids.size(0)


def make_data(seq_data):
    """
    :param seq_data:
    :return: 返回 [input_ids, segment_ids, masked_tokens, masked_pos, isNext]
    """
    batch = []
    left_cnt = right_cnt = 0
    while left_cnt <= batch_size / 2 or right_cnt <= batch_size / 2:
        rand_a_idx = rand_b_idx = 0
        sen_a_idx = random.randrange(len(seq_data))
        sen_b_idx = random.randrange(len(seq_data))
        tokens_a = seq_data[sen_a_idx]
        tokens_b = seq_data[sen_b_idx]
        # int型 和 list 相加可以采用这种方式
        input_ids = [word2idx['CLS']] + tokens_a + [word2idx['SEQ']] + tokens_b + [word2idx['SEQ']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        n_pred = min(num_pred, int(len(input_ids) * 0.15))  # 将一句话中15%的词替换成其他内容
        test_input_ids = [i for i, j in enumerate(input_ids) if idx2word[j] != 'CLS' and idx2word[j] != 'SEQ']
        random.shuffle(test_input_ids)  # 将input_ids随机打乱
        masked_tokens, masked_pos = [], []
        for word in range(n_pred):
            cand_rep_idx = test_input_ids[word]
            masked_pos.append(cand_rep_idx)
            masked_tokens.append(input_ids[cand_rep_idx])
            p = random.random()
            if p > 0.8:
                input_ids[cand_rep_idx] = word2idx['MASK']
            elif p > 0.1:
                other_idx = random.randrange(len(input_ids))
                input_ids[cand_rep_idx] = input_ids[other_idx]
            else:
                input_ids[cand_rep_idx] = input_ids[word]

        n_pad = max_len - len(input_ids)
        input_ids.extend(n_pad * [0])
        segment_ids.extend(n_pad * [0])

        if num_pred > n_pred:
            n_pad = num_pred - n_pred
            masked_pos.extend(n_pad * [0])
            masked_tokens.extend(n_pad * [0])

        if sen_a_idx + 1 != sen_b_idx and left_cnt <= batch_size / 2:
            isNext = False
            left_cnt = left_cnt + 1
            batch.append([input_ids, segment_ids, masked_pos, masked_tokens, isNext])
        elif sen_a_idx + 1 == sen_b_idx and right_cnt <= batch_size / 2:
            isNext = True
            right_cnt = right_cnt + 1
            batch.append([input_ids, segment_ids, masked_pos, masked_tokens, isNext])
    return batch


class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v(=len_k), d_v]
        attn_mask: [batch_size, n_heads, seq_len, seq_len]
        这里求取注意力相似度采用的是可缩放点积，普通的点乘，方差会很大，反向传播时梯度会变小
        """
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(dim)  # scores : [batch_size, n_heads, len_q, len_k]
        # 将矩阵scores中attn_mask为True时对应的元素索引置-1e9
        # scores.masked_fill_(attn_mask, -1e9)  # Fills elements of self tensor with value where mask is True.
        scores.masked_fill_(attn_mask, 0)  # Fills elements of self tensor with value where mask is True.
        # scores = torch.matmul(attn_mask.float(), scores)
        attn = nn.Softmax(dim=-1)(scores)  # [batch_size, n_heads, len_q]
        # atten_mask 与 V 相乘得到经过Masked后的注意力矩阵
        context = torch.matmul(attn, V)  # [batch_size, n_heads, len_q, d_v]
        return context


class Multi_Head_Attention(nn.Module):
    def __init__(self):
        super().__init__()
        self.W_Q = nn.Linear(embed_size, dim * num_heads, bias=False)  # 将输入矩阵映射为低维度
        self.W_K = nn.Linear(embed_size, dim * num_heads, bias=False)  # 将输入矩阵映射为低维度
        self.W_V = nn.Linear(embed_size, dim * num_heads, bias=False)  # 将输入矩阵映射为低维度
        self.projection = torch.nn.Linear(num_heads * dim, embed_size)  # 将atten的维度转换为与输入的维度一致

    def forward(self, input_Q, input_K, input_V, atten_mask):
        """
        :param input_Q: -> [Batch_size, len_q, embedding_size]
        :param input_K: -> [Batch_size, len_k, embedding_size]
        :param input_V: -> [Batch_size, len_v(=len_k), embedding_size]
        :param atten_mask:  -> [Batch_size, atten_len_k, atten_len_v]
        :return: 这里的dim是QKV矩阵的维度
        # 对输入求得Q、K、V三个矩阵，然后根据Q和K矩阵求得注意力矩阵，最后根据注意力矩阵求得经过Masked后的注意力矩阵
        # 返回的enc_inputs 和 atten 张量维度的一样的
        """
        torch.backends.cudnn.enabled = False
        residual = input_Q  # [Batch_size, len_q, embedding_size]  这里是残差项，多层注意力的输出与此项相加
        _, len_q, embedding_size = input_Q.size()
        _, len_k, _ = input_K.size()
        Batch_size, atten_len_k, atten_len_v = atten_mask.size()
        # 输入乘以矩阵得到Q、K、V矩阵
        Q = self.W_Q(input_Q).view(Batch_size, num_heads, len_q, dim)  # Q -> [Batch_size, len_q, dim*num_heads]
        K = self.W_K(input_K).view(Batch_size, num_heads, len_k, dim)  # K -> [Batch_size, len_k, dim*num_heads]
        V = self.W_V(input_V).view(Batch_size, num_heads, len_k, dim)  # V -> [Batch_size, len_v, dim*num_heads]

        atten_mask = atten_mask.unsqueeze(1)  # atten_mask -> [Batch_size, 1, atten_len_k, atten_len_v]
        # atten_mask -> [Batch_size, num_heads, atten_len_k, atten_len_v]  这里的 atten_len_v == len_q
        atten_mask = atten_mask.repeat(1, num_heads, 1, 1)
        atten = ScaledDotProductAttention()(Q, K, V, atten_mask)
        atten = atten.transpose(1, 2)  # atten -> [Batch_size, atten_len_k, num_heads, dim]

        atten = atten.reshape(Batch_size, atten_len_k, -1)  # atten -> [Batch_size, atten_len_k, num_heads * dim]
        atten = self.projection(atten).to(device)  # atten -> [Batch_size, atten_len_k, embed_size] atten_len_k == len_q
        # softmax  不改变矩阵的维度，这里对行方向对行向量进行归一化  这里对输出和残差 进行Add && Norm 操作
        atten_ret = (residual + torch.softmax(atten, dim=1))
        atten_ret = nn.LayerNorm(embed_size).to(device)(atten_ret)
        return atten_ret


class Feed_forward(nn.Module):
    """
    对应于原论文中的Feed-Forward流程
    查看某个数据是否存储于cuda上,可键入命令: x.is_cuda 其中x为变量
    """

    def __init__(self):
        super().__init__()
        self.W1 = nn.Linear(embed_size, d_ff).to(device)
        self.W2 = nn.Linear(d_ff, embed_size).to(device)
        self.b1 = torch.rand(d_ff).to(device)
        self.b2 = torch.rand(embed_size).to(device)
        self.relu = nn.ReLU().to(device)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, enc_inputs):
        """
        :param enc_inputs: # enc_inputs -> [Batch_size, seq_len, embedding_size]
        # atten -> [Batch_size, seq_len, embedding_size]
        :return:
        """
        fc1 = self.W1(enc_inputs) + self.b1
        fc1 = self.relu(fc1)
        fc2 = self.W2(fc1) + self.b2  # fc2 -> [Batch_size, seq_len, embedding_size]
        output = fc2  # output -> [Batch_size, seq_len, embedding_size]
        residual = enc_inputs
        Add_And_Norm = nn.LayerNorm(embed_size).cuda()(output + residual)
        return Add_And_Norm


class Encoder_layer(nn.Module):
    def __init__(self):
        super().__init__()
        self.multi_head_attention = Multi_Head_Attention()
        self.feed_forward = Feed_forward()

    def forward(self, enc_inputs, enc_atten_mask):
        """
        :param enc_inputs:  # enc_inputs -> [Batch_size, src_len, embedding_size]
        :param enc_atten_mask:   # enc_atten_mask -> [Batch_size, src_len, src_len]
        :return:
        """
        # 传入多层注意力机制的输入Q、K、V 都假定为一样的
        atten_output = self.multi_head_attention(enc_inputs, enc_inputs, enc_inputs, enc_atten_mask)  # 这里得到的是注意力矩阵
        output = self.feed_forward(atten_output).to(device)  # output -> [Batch_size, seq_len, embeded_size]
        return output, atten_output


def get_attn_pad_mask(seq_q, seq_k):
    """
    :param seq_q:  seq_q -> [Batch_size, len_q]
    :param seq_k:  seq_k -> [Batch_size, len_k]
    :return:
    """
    Batch_size, len_q = seq_q.size()
    Batch_size, len_k = seq_k.size()
    atten_mask = seq_k.eq(0).unsqueeze(1)  # atten_mask -> [Batch_size, 1, len_k]
    atten_mask = atten_mask.expand(Batch_size, len_q, len_k)  # atten_mask -> [Batch_size, len_q, len_k]
    return atten_mask


def gelu(x):
    """
      Implementation of the gelu activation function.
      For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
      0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
      Also see https://arxiv.org/abs/1606.08415
      论文中用GELU代替了RELU
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


class BERT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embed = torch.nn.Embedding(vocab_size, embedding_size).to(device)  # 对token进行向量化
        self.pos_embed = torch.nn.Embedding(max_len, embedding_size).to(device)  # 对position进行向量化
        self.seg_embed = torch.nn.Embedding(segments_len, embedding_size).to(device)  # 对不同的语句进行向量化
        self.layers = nn.ModuleList(Encoder_layer() for _ in range(n_layers))
        self.fc1 = nn.Sequential(
            nn.Linear(embedding_size, embedding_size),
            nn.Dropout(0.5),
            nn.Tanh(),
        )
        self.classifier = nn.Linear(embedding_size, 2)  # 对两句话进行分类,isNext or not is Next
        self.fc2 = nn.Linear(embedding_size, vocab_size)
        self.linear = nn.Linear(embedding_size, embedding_size)

    def forward(self, input_token, segments_, masked_pos):
        """
        :param masked_pos: [Batch_size, n_pred]
        :param input_token: [Batch_size, seq_len]
        :param segments_:  [Batch_size, seq_len]
        :return:
        """
        Batch_size, seq_len = input_token.size()
        pos = torch.arange(seq_len, dtype=torch.long)  # [seq_len]
        pos = pos.unsqueeze(0)  # [1, seq_len]
        pos = pos.repeat(Batch_size, 1).to(device)  # [Batch_size, seq_len]
        # input_token_embed -> [Batch_size, seq_len, embedding_size]
        input_token_embed = self.token_embed(input_token) + self.seg_embed(segments_) + self.pos_embed(pos)
        enc_atten_mask = get_attn_pad_mask(input_token, input_token)  # [Batch_size, seq_len, seq_len]
        output = input_token_embed
        for layer in self.layers:
            output, _ = layer(output, enc_atten_mask)  # output [Batch_size, seq_len, embedding_size]
        _, seq_len, _ = output.size()
        nsp_output = output
        nsp_output = self.fc1(nsp_output)  # [Batch_size, seq_len, embedding_size]
        nsp_output = self.classifier(nsp_output)  # [Batch_size, seq_len, 2]
        nsp_output = torch.sum(nsp_output.transpose(2, 1), dim=-1)  # [Batch_size, 2]
        nsp_output = torch.softmax(nsp_output, dim=-1)  # [Batch_size, 2]

        masked_pos = masked_pos.unsqueeze(-1).repeat(1, 1, embedding_size)  # [Batch_size, n_pred, embedding_size]
        nlm_output = torch.gather(output, 1, masked_pos)
        nlm_output = gelu(self.linear(nlm_output))  # [Batch_size, n_pred, embedding_size]
        nlm_output = self.fc2(nlm_output)  # [Batch_size, n_pred, vocab_size]
        return nsp_output, nlm_output


def train():
    batch = make_data(token_list)
    input_ids, segment_ids, masked_pos, masked_tokens, isNext = zip(*batch)
    input_ids, segment_ids, masked_pos, masked_tokens, isNext = torch.LongTensor(input_ids), torch.LongTensor(
        segment_ids), torch.LongTensor(masked_pos), torch.LongTensor(masked_tokens), torch.LongTensor(isNext)
    train_data = my_dataset(input_ids, segment_ids, masked_pos, masked_tokens, isNext)
    train_iter = DataLoader(train_data, batch_size, shuffle=True)
    crition = torch.nn.CrossEntropyLoss()  # 括号里面的参数顺序应该是(label, target)
    model = BERT().train()
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    for step in range(1000):
        for input_ids_i, segment_ids_i, masked_pos_i, masked_tokens_i, isNext_i in train_iter:
            input_ids_i, segment_ids_i, masked_pos_i, masked_tokens_i, isNext_i = input_ids_i.to(device), \
                                                                                  segment_ids_i.to(device), \
                                                                                  masked_pos_i.to(device), \
                                                                                  masked_tokens_i.to(device), \
                                                                                  isNext_i.to(device)
            optimizer.zero_grad()
            nsp_out, nlm_out = model(input_ids_i, segment_ids_i, masked_pos_i)
            classify_loss = crition(nsp_out, isNext_i)
            masked_tokens_i = masked_tokens_i.view(-1)
            nlm_out = nlm_out.view(-1, vocab_size)
            nlm_out = nlm_out
            nlm_loss = 0
            nlm_loss = crition(nlm_out, masked_tokens_i)
            nlm_loss = nlm_loss.mean()
            loss = nlm_loss + classify_loss
            loss.backward()
            optimizer.step()
            if step % 100 == 0:
                print("step {0} loss {1} loss {2}".format(step, nlm_loss, classify_loss))


if __name__ == '__main__':
    train()

目前NLP系列最后一道坎，Bert模型

Bert采用了Transformers模型中的Encoder模型，这里有6层Encoder。每层Encoder有12层多层注意力层（Multi-Head Attention）。

其中，输入语料库为有先后顺序的9句话。我这里将这9句话自制作为一个小型的数据集。

随机在语料库中选取两句话，如果这两句话有在整个文档中有先后顺序，那么isNext 字段为True

对于选取的两句话中的每一句话，随机Mask其中的某几个token，token就是字符级别的。

masked的方式有三种，80%的可能性将其变成'[MASKED]'，10% 的可能性将其变成其他的token，还剩下10%的可能性将其保持不变。

这样做的好处就是，训练模型能够根据上下文预测当前被Masked的token，灵盖是由Word2Vec的EBOW激发而来。因为EBOW也是由一个词周边的词语预测当前词。

至于要将剩下的10%的token 不进行任何处理，是因为下游任务中是没有训练集，也就是没有'MASKED'，为了让模型能够适配下游任务，这里做了相应的处理。

            if p > 0.8:
                input_ids[cand_rep_idx] = word2idx['MASK']
            elif p > 0.1:
                other_idx = random.randrange(len(input_ids))
                input_ids[cand_rep_idx] = input_ids[other_idx]
            else:
                input_ids[cand_rep_idx] = input_ids[word]

训练集中的每一条语句有三部分相加而成。input_ids, segment_ids, isNext

input_ids的构造方式上文讲了，segment_ids是为了区分两句话中，之间的分割

eg:segment_ids [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

将[input_ids , segment_ids, isNext] 字段经过一个Embedding层，得到一个大小为[Batch_size, seq_len, embedding_size]大小的张量。这里的embeding层中对pos位置的处理方法相对于Transformer有点不同，这里采用的是一个Embedding的矩阵来表示位置，该矩阵是可以学习的。

pos = torch.arange(seq_len, dtype=torch.long)  # [seq_len]
pos = pos.unsqueeze(0)  # [1, seq_len]
pos = pos.repeat(Batch_size, 1).to(device)  # [Batch_size, seq_len]

张量经过num_layers 个Encoder层，输出接两个全连接层，网络设计完毕

模型的loss 分两个，一个是MLM的loss 一个是NSP的loss

MLM的loss 仅仅针对被masked的token的loss(nlm_loss)，NSP的loss是对句向量进行分类的loss(nsp_loss)。但是由于nsp_loss 太小， nlm_loss 对整个loss的影响太大,nsp_loss下降不是很明显

基于Pytorch+Bert的预训练模型

猜你喜欢