SimCSE training with TensorFlow Keras and related BERT knowledge

SimCSE can be used for similar-text matching tasks, but the TensorFlow side of the transformers library does not ship a built-in SimCSE model for downstream training. The feasibility of the approach below has been verified.

The relevant code is as follows; the complete code can be found at bert_classification/4_bert_sentence_similarity.py in sparkle_code_guy/bert_related_task on Gitee (Code Cloud, Open Source China):

import tensorflow as tf
import numpy as np
from transformers import TFBertPreTrainedModel, BertConfig, TFBertMainLayer, BertTokenizer
from transformers.modeling_tf_outputs import TFSemanticSegmenterOutput
from typing import Optional, Tuple, Union
import pandas as pd
from transformers.models.bert.modeling_tf_bert import (
    TFModelInputType,
    TFSequenceClassificationLoss,
    unpack_inputs, BERT_INPUTS_DOCSTRING
)
from transformers.utils import add_start_docstrings_to_model_forward
class TFSimCSE(TFBertPreTrainedModel, TFSequenceClassificationLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
    _keys_to_ignore_on_load_missing = [r"dropout"]
    def __init__(self, config: BertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = TFBertMainLayer(config, name="bert")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def call(
            self,
            input_ids: Optional[TFModelInputType] = None,
            attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
            token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
            position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
            head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
            inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = False,
            return_dict: Optional[bool] = None,
            labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
            training: Optional[bool] = False,
    ) -> Union[TFSemanticSegmenterOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs1 = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Note: the result returned by a transformers model must be a tuple or a dict (i.e. a ModelOutput-style object)
        return TFSemanticSegmenterOutput(logits=outputs1.pooler_output, loss=None)

    def serving_output(self, output: dict) -> dict:
        return output


def simcse_loss(y_true, y_pred):
    """
    SimCSE loss.
    For an introduction see: https://blog.csdn.net/sslfk/article/details/123210756
    """
    # Adjacent rows (2k, 2k+1) of the batch form a positive pair, so each row's
    # positive target is its neighbour within the pair; the y_true passed in by Keras is ignored.
    idxs = tf.range(0, tf.shape(y_pred)[0])
    idxs_1 = idxs[None, :]
    idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
    y_true = tf.equal(idxs_1, idxs_2)
    y_true = tf.cast(y_true, tf.keras.backend.floatx())
    # Cosine similarity matrix with the diagonal masked out, scaled by temperature 0.05.
    y_pred = tf.math.l2_normalize(y_pred, axis=1)
    similarities = tf.matmul(y_pred, y_pred, transpose_b=True)
    similarities = similarities - tf.eye(tf.shape(y_pred)[0]) * 1e12
    similarities = similarities / 0.05
    loss = tf.keras.losses.categorical_crossentropy(y_true, similarities, from_logits=True)
    return tf.reduce_mean(loss)


def simcse_hard_neg_loss(y_true, y_pred):
    """
    SimCSE loss for hard or random negatives.
    Rows are laid out in triples: 3k anchor, 3k+1 positive, 3k+2 negative.
    """
    # Anchor rows and candidate (positive/negative) columns.
    row = tf.range(0, tf.shape(y_pred)[0], 3)
    col = tf.range(tf.shape(y_pred)[0])
    col = tf.squeeze(tf.where(col % 3 != 0), axis=1)
    # For each anchor, its positive sits at every second candidate column.
    y_true = tf.range(0, tf.shape(col)[0], 2)
    # Cosine similarities between anchors and candidates, scaled by temperature 0.05.
    y_pred = tf.math.l2_normalize(y_pred, axis=1)
    similarities = tf.matmul(y_pred, y_pred, transpose_b=True)
    similarities = tf.gather(similarities, row, axis=0)
    similarities = tf.gather(similarities, col, axis=1)
    similarities = similarities / 0.05
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, similarities, from_logits=True)
    return tf.reduce_mean(loss)


max_length = 60
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')


def simcse_generater():
    df_raw = pd.read_csv("data/sts_data/senteval_cn/ATEC/ATEC.train.data", sep="\t", header=None,
                         names=["x1", "x2", "y"])

    def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
        return {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_masks,
        }, label

    def encode_examples(ds, limit=-1):
        # prepare list, so that we can build up final TensorFlow dataset from slices.
        input_ids_list = []
        token_type_ids_list = []
        attention_mask_list = []
        label_list = []
        if (limit > 0):
            ds = ds.take(limit)

        for index, row in ds.iterrows():
            x1 = row["x1"]
            x2 = row["x2"]
            for each in (x1, x2):
                bert_input = tokenizer.encode_plus(each,
                                                   add_special_tokens=True,  # add [CLS], [SEP]
                                                   padding='max_length',
                                                   truncation=True,
                                                   max_length=max_length,  # max length of the text that can go to BERT
                                                   # pad_to_max_length=True,
                                                   return_attention_mask=True,
                                                   # add attention mask to not focus on pad tokens
                                                   )
                input_ids_list.append(bert_input['input_ids'])
                token_type_ids_list.append(bert_input['token_type_ids'])
                attention_mask_list.append(bert_input['attention_mask'])
                label_list.append([0])

        return tf.data.Dataset.from_tensor_slices(
            (input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

    # train dataset
    # NOTE: simcse_loss pairs rows 2k and 2k+1 of each batch, so shuffle at the
    # sentence-pair level (the dataframe rows) instead of shuffling the encoded
    # dataset element-wise, which would split the pairs apart.
    batch_size = 100
    ds_train_encoded = encode_examples(df_raw.sample(frac=1)).batch(batch_size)
    return ds_train_encoded


learning_rate = 2e-5
my_model = TFSimCSE.from_pretrained('bert-base-chinese')

# optimizer Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1)

# the labels from the data pipeline are ignored: simcse_loss builds its own in-batch targets
my_model.compile(optimizer=optimizer, loss=simcse_loss)
# fit model
bert_history = my_model.fit(simcse_generater(), epochs=1)
# save the trained model
tf.keras.models.save_model(my_model, filepath="my_model1")
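
For reference, here is a minimal usage sketch (not part of the original script) of how the trained encoder might be used for similarity scoring. It assumes the tokenizer, max_length and my_model objects defined above; the sentence_similarity helper is hypothetical.

# Usage sketch (assumption, not from the original post): compare two sentences by
# the cosine similarity of their pooler outputs.
def sentence_similarity(text_a, text_b):
    encoded = tokenizer(
        [text_a, text_b],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    # training=False disables dropout, so both sentences get deterministic embeddings
    embeddings = my_model(encoded, training=False).logits  # pooler outputs, shape (2, hidden_size)
    embeddings = tf.math.l2_normalize(embeddings, axis=1)
    return float(tf.reduce_sum(embeddings[0] * embeddings[1]))

# e.g. print(sentence_similarity("花呗怎么还款", "花呗如何还钱"))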

Notes:

  1. For training transformers-based models, the value returned by call must be a tuple or a dict, where the dict can be a concrete ModelOutput implementation class (a small illustration follows this list).

  1. Relevant STS data download: Link: https://pan.baidu.com/s/1JzzDVjaBRrDjYGgPJ6D4hQ?pwd=cxa6 Extraction code: cxa6

  1. The TensorFlow version of the BERT model actually adds a pooler layer on top of the original Transformer encoder. Its processing simply applies a dense layer plus tanh to the last-layer hidden state of the first token. Related code: TFBertPooler (a rough sketch follows this list).

  1. The original authors' code: https://github.com/princeton-nlp/SimCSE#model-list . The open-source SimCSE code defaults to English; to use it for Chinese you need to swap in a Chinese BERT pre-trained model and then a Chinese training set.

  1. A TF2 version: https://github.com/jifei/simcse-tf2.git . It depends on bert4keras; note that it is not compatible with higher versions of TF2.

  1. For an understanding of Transformer and BERT, see: Transformer and related technologies and branch trends in various fields - Feishu Cloud Documentation (feishu.cn)

  1. Transformer introduction and code implementation: Transformer model for language understanding | TensorFlow Core (google.cn)
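
To illustrate the first note, a minimal sketch (an assumed pattern following the transformers TF convention, not code from the original post): a custom call can return either a plain tuple or a ModelOutput-style object such as TFBaseModelOutputWithPooling.

# Sketch for note 1 (assumed pattern): both return forms below satisfy the tuple/dict requirement.
from transformers.modeling_tf_outputs import TFBaseModelOutputWithPooling

def return_as_model_output(bert_outputs):
    # dict-like ModelOutput: fields are accessible by name and by index
    return TFBaseModelOutputWithPooling(
        last_hidden_state=bert_outputs.last_hidden_state,
        pooler_output=bert_outputs.pooler_output,
    )

def return_as_tuple(bert_outputs):
    # a plain tuple also works
    return (bert_outputs.pooler_output,)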

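And a rough sketch of what the pooler described in the third note does (an approximation, not the library source of TFBertPooler): it applies Dense + tanh to the hidden state of the first ([CLS]) token.

# Approximate behaviour of the BERT pooler (sketch, not the transformers source):
import tensorflow as tf

class SimplePooler(tf.keras.layers.Layer):
    def __init__(self, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(hidden_size, activation="tanh")

    def call(self, last_hidden_state):
        first_token = last_hidden_state[:, 0]  # hidden state of the first ([CLS]) token
        return self.dense(first_token)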

Original post: https://blog.csdn.net/sslfk/article/details/129028969