Reading the Pretraining Source Code (Part 2)

Continuing from the previous part, we analyze the rest of the __call__ function in data_collator.py.

if self.mlm:
    #special_tokens_mask = None
    batch["input_ids"], batch["labels"] = self.mask_tokens(
        batch["input_ids"], special_tokens_mask=special_tokens_mask
    )

Here we need to step into self.mask_tokens:

    def mask_tokens(
        self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        print('data/data_collator.py mask_tokens')
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        r"""
        labels = tensor(
       [[  101,   169,   107,  ..., 10539,   107,   102],
        [  101,   169,   107,  ...,   100,   100,   102],
        [  101,   169,   107,  ...,   100,   100,   102],
        ...,
        [  101,   169,   107,  ...,   100,   100,   102],
        [  101,   169,   107,  ...,   100,   100,   102],
        [  101,   169,   107,  ...,   117,   169,   102]])
        """
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        r"""
        probability_matrix = 
        tensor([[0.1500,0.1500,...],
                [0.1500,0.1500,...],
                ..................
            
        """
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
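
Before walking through it line by line, here is a minimal, self-contained sketch of the same 80%/10%/10% sampling on a toy batch. The token ids, mask_token_id = 103, vocab_size = 21128, and the pretend special-token column are assumptions for illustration only, not values read from the real tokenizer.

import torch

mlm_probability, mask_token_id, vocab_size = 0.15, 103, 21128   # assumed values

inputs = torch.randint(1000, 2000, (4, 10))        # toy batch of ordinary token ids
labels = inputs.clone()

probability_matrix = torch.full(labels.shape, mlm_probability)
special_tokens_mask = torch.zeros_like(labels, dtype=torch.bool)
special_tokens_mask[:, 0] = True                    # pretend column 0 is [CLS]
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)

masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100                      # loss is computed only on masked positions

# 80% of the masked positions become [MASK]
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
inputs[indices_replaced] = mask_token_id

# half of the remaining 20% (i.e. 10% overall) become a random token
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
inputs[indices_random] = torch.randint(vocab_size, labels.shape)[indices_random]

# the last 10% keep the original token; inputs/labels now mirror mask_tokens' return values
print(inputs)
print(labels)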

First, labels is made as a copy of inputs:

labels = inputs.clone()
labels = tensor(
	   [[  101,   169,   107,  ..., 10539,   107,   102],
        [  101,   169,   107,  ...,   100,   100,   102],
        [  101,   169,   107,  ...,   100,   100,   102],
        ...,
        [  101,   169,   107,  ...,   100,   100,   102],
        [  101,   169,   107,  ...,   100,   100,   102],
        [  101,   169,   107,  ...,   117,   169,   102]]
)

Next, the probability_matrix is created:

probability_matrix = torch.full(labels.shape,self.mlm_probability)

This yields the following probability_matrix, with every entry equal to mlm_probability = 0.15:

probability_matrix = 
tensor([[0.1500,0.1500,...],
        [0.1500,0.1500,...],
        ..................
        [0.1500,0.1500,...]])

Next, look at how special_tokens_mask is built; it is used to exclude special tokens before masked_indices is sampled:

if special_tokens_mask is None:
    special_tokens_mask = [
        self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    print('special_tokens_mask1 = ')
    print(special_tokens_mask)
    special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
    print('special_tokens_mask2 = ')
    print(special_tokens_mask)
else:
    special_tokens_mask = special_tokens_mask.bool()

The printed output is:

special_tokens_mask1 = 
[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
.....................
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1]]

and the corresponding special_tokens_mask2 is:

special_tokens_mask2 = 
tensor([[ True, False, False,  ..., False, False,  True],
        ...,
        [ True, False, False,  ..., False, False,  True]])

Producing special_tokens_mask1 relies on the tokenizer's get_special_tokens_mask function:

special_tokens_mask = [
    self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
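
As a quick check of what get_special_tokens_mask returns, here is a small sketch; bert-base-chinese is used as a stand-in for the NeZha vocabulary (an assumption, but the special-token handling works the same way):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")   # stand-in for the NeZha vocab
ids = tokenizer("今天天气很好")["input_ids"]                      # [CLS] + 6 characters + [SEP]
mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
print(mask)   # [1, 0, 0, 0, 0, 0, 0, 1] -> 1 marks special tokens, 0 marks ordinary tokens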

Stepping into self.tokenizer.get_special_tokens_mask: here self.tokenizer is an instance of PreTrainedTokenizer (see Transformer/tokenization_utils/PreTrainedTokenizer.py).
Which tokenizer it is can be confirmed from the debug prints added in __call__, where tokenizer.pad is invoked:

if isinstance(examples[0], (dict, BatchEncoding)):
    batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
    print('|||self.tokenizer = |||')
    print(self.tokenizer)
|||self.tokenizer = |||
PreTrainedTokenizer is_fast
PreTrainedTokenizer is_fast
PreTrainedTokenizer(name_or_path='/home/xiaoguzai/数据/nezha-chinese-base/vocab.txt', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

Note that PreTrainedTokenizer ends up calling is_fast here, which is declared as a @property:

@property
def is_fast(self) -> bool:
	return False

It is not immediately obvious why is_fast is invoked at this point; most likely, printing self.tokenizer calls its __repr__, which reads the is_fast property (note the is_fast=False field in the output above).

self.pad_to_multiple_of = None

The call here is PreTrainedTokenizer.pad; the actual pad method is inherited from PreTrainedTokenizerBase in tokenization_utils_base. The padded result looks like this:

batch = {
    'input_ids': tensor(
        [[101, 169, ..., 102],
         ...
         [101, 169, ..., 102]]),
    'attention_mask': tensor(
        [[1, 1, ..., 1, 1],
         ...
         [1, 1, ..., 1, 1]])
}
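
The padding step can also be reproduced on its own; a minimal sketch, again with bert-base-chinese standing in for the NeZha tokenizer:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")   # stand-in for the NeZha vocab
examples = [tokenizer("预训练源码阅读"), tokenizer("今天天气很好,适合读源码")]
batch = tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=None)
print(batch["input_ids"])        # the shorter sequence is padded with [PAD] (id 0) on the right
print(batch["attention_mask"])   # 1 for real tokens, 0 for padding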


Here is an overview of the full batch returned by the collator (after mask_tokens has filled in labels):

batch = 
        {'input_ids': tensor([[  101,   169,   107,  ...,   100,   100,   102],
        [  101,   169,   107,  ...,   100,   100,   102],
        [  101,   169,   107,  ...,   107, 10539,   102],
        ...,
        [  101,   169,   107,  ...,   131,   107,   102],
        [  101,   169,   103,  ...,   100,   124,   102],
        [  101,   169,   107,  ...,   171,   117,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        ...,
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100,  107,  ..., -100, -100, -100],
        [-100, -100,  107,  ..., -100, -100, -100]])}

This batch of data then goes into the model for training.
The pretraining forward code that receives it is the following:

sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
# prediction_scores.shape = torch.Size([32, 90, 21128])
outputs = (prediction_scores,) + outputs[2:]
# outputs[0].shape = torch.Size([32, 90, 21128])
masked_lm_labels = None
if labels is not None:
    loss_fct = CrossEntropyLoss()  # -100 index = padding token
    # prediction_scores.shape = torch.Size([32, 90, 21128])
    # prediction_scores.view(-1, vocab_size).shape = torch.Size([2880, 21128])
    # labels.view(-1).shape = torch.Size([2880])
    # labels.view(-1) = tensor([-100, -100, ..., 117, -100, -100], device='cuda:0')
    masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
    outputs = (masked_lm_loss,) + outputs
return outputs  # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
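
The shape bookkeeping in these comments can be verified with random tensors; batch size 32, sequence length 90, and vocab size 21128 are taken from the shapes above:

import torch

prediction_scores = torch.randn(32, 90, 21128)           # [batch, seq_len, vocab_size]
labels = torch.full((32, 90), -100, dtype=torch.long)    # mostly -100, as in the batch dump

print(prediction_scores.view(-1, 21128).shape)   # torch.Size([2880, 21128])
print(labels.view(-1).shape)                     # torch.Size([2880])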

Finally, let's read through the code around the loss function used in pretraining:

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
        )
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)
        outputs = (prediction_scores,) + outputs[2:] 
        masked_lm_labels = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            outputs = (masked_lm_loss,) + outputs

Here self.bert is the NeZhaModel (used together with the cls head); the call that follows is:

prediction_scores = self.cls(sequence_output)

The layer invoked here is:

self.cls = BertOnlyMLMHead(config)

Step into the BertOnlyMLMHead(config) layer to see what it calls:

class BertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores

This calls the BertLMPredictionHead layer; stepping into BertLMPredictionHead (and the BertPredictionHeadTransform it uses):

class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            # config.hidden_act = 'gelu', so this first branch is taken
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            print('BertPredictionHeadTransform situation2')
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states

The complete sequence of layers applied is:

Linear(config.hidden_size, config.hidden_size)
'gelu' activation
LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
Linear(config.hidden_size, config.vocab_size)
(the bias of the final Linear is initialized to zeros)
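
To make these shapes concrete, here is a minimal sketch that chains the same layers by hand; hidden_size = 768, vocab_size = 21128 and eps = 1e-12 are assumed NeZha-base values, and the weight tying with the input embeddings is omitted:

import torch
import torch.nn as nn

hidden_size, vocab_size, eps = 768, 21128, 1e-12    # assumed NeZha-base config values

transform = nn.Sequential(
    nn.Linear(hidden_size, hidden_size),
    nn.GELU(),                                      # stands in for ACT2FN['gelu']
    nn.LayerNorm(hidden_size, eps=eps),
)
decoder = nn.Linear(hidden_size, vocab_size)        # tied to the input embeddings in the real model
nn.init.zeros_(decoder.bias)                        # bias starts at zeros, as in BertLMPredictionHead

sequence_output = torch.randn(32, 90, hidden_size)  # [batch, seq_len, hidden_size]
prediction_scores = decoder(transform(sequence_output))
print(prediction_scores.shape)                      # torch.Size([32, 90, 21128])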

Finally, the CrossEntropyLoss() loss is applied:

if labels is not None:
    loss_fct = CrossEntropyLoss()  # -100 index = padding token
    masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
    outputs = (masked_lm_loss,) + outputs

!!! Note: the modeling.py being read here must be the one under pretrain_code/modeling/modeling_nezha/modeling.py.
In other words, for the batch shown above, what is essentially computed is the cross entropy between prediction_scores.view(-1, self.config.vocab_size) of shape ([2880, 21128]), produced from input_ids, and labels.view(-1) of shape ([2880]), most of whose entries are -100. Since CrossEntropyLoss ignores index -100 by default, the positions labeled -100 contribute nothing to the loss, which excludes that part of the error and leaves only the masked tokens.
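
A tiny check confirms that the -100 positions are skipped rather than scored (CrossEntropyLoss has ignore_index=-100 by default):

import torch
from torch.nn import CrossEntropyLoss

loss_fct = CrossEntropyLoss()                 # default ignore_index = -100
scores = torch.randn(2, 21128)                # two token positions
labels = torch.tensor([117, -100])            # only the first position carries a real label

full_loss = loss_fct(scores, labels)
first_only = loss_fct(scores[:1], labels[:1])
print(torch.allclose(full_loss, first_only))  # True: the -100 position contributes nothing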


Reposted from: blog.csdn.net/znevegiveup1/article/details/118569990