ALBERT for text classification tasks


Preparation

Pre-trained model download: Albert_Large_zh
Dataset download: accident and disaster multi-class classification dataset (the dataset was collected with web crawlers; please point out any errors)

1. Processing Datasets

Put the pre-trained model into the pretraining_model folder.
Split the dataset into a training set, a test set, and a validation set, named train.txt, test.txt, and dev.txt respectively; a 7:2:1 ratio is typical. Put them into the datasets folder (a splitting sketch follows the sample data below).

Sample data:

The label and the content on each line are separated by '\t'.
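The splitting script itself is not included in the original post. Below is a minimal sketch of one way to produce the three files; the raw file name datasets/all_data.txt and the exact 7:2:1 cut points are assumptions.

# Minimal sketch (assumed, not from the original post): split a raw
# "label\tcontent" file into train/test/dev at a 7:2:1 ratio.
import random

with open("datasets/all_data.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

random.seed(42)
random.shuffle(lines)

n = len(lines)
n_train, n_test = int(n * 0.7), int(n * 0.2)
splits = {
    "datasets/train.txt": lines[:n_train],
    "datasets/test.txt": lines[n_train:n_train + n_test],
    "datasets/dev.txt": lines[n_train + n_test:],
}
for path, part in splits.items():
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(part)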

2. Build classification tasks

Project structure:

Keras_Bert_Class/
|-- datasets/
|   |-- train.txt
|   |-- test.txt
|   |-- dev.txt
|
|-- model/
|
|-- pretraining_model/
|   |-- albert_large/
|
|-- main.py
|-- requirements.txt
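The tree above lists a requirements.txt, but its contents are not shown in the original post. A plausible minimal set (package names only; exact versions are left to the reader) would be:

# requirements.txt (assumed contents, not from the original post)
bert4keras
tensorflow
keras
scikit-learn
numpy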

1. Import libraries

The code is as follows:

import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from bert4keras.tokenizers import Tokenizer
from bert4keras.backend import keras, set_gelu
from bert4keras.models import build_transformer_model
from bert4keras.snippets import DataGenerator, sequence_padding
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from keras.layers import Lambda, Dense
from contextlib import redirect_stdout

2. Configuration parameters

The code is as follows:

# Configuration
class Config:
    def __init__(self):
        # Name of the pre-trained model
        self.model_name = "bert"
        # Dataset
        self.train_path = "datasets/train.txt"
        # List of classes
        self.class_list = self.read_class()
        # Number of classes
        self.num_classes = len(self.class_list)
        # epochs
        self.epochs = 10
        # mini-batch size
        self.batch_size = 8
        # Maximum sequence length (shorter texts are padded, longer ones truncated)
        self.pad_size = 128
        # Learning rate
        self.learning_rate = 1e-5
        # Paths of the pre-trained model
        self.config_path = "./pretraining_model/albert_large/albert_config.json"
        self.checkpoint_path = "./pretraining_model/albert_large/albert_model.ckpt"
        self.dict_path = "./pretraining_model/albert_large/vocab.txt"
        # Tokenizer
        self.tokenizer = Tokenizer(self.dict_path)
        # Label dictionaries
        self.label2id, self.id2label = self.label_dict()

    def read_class(self):
        # Collect the distinct labels that appear in the training file
        class_list = []
        with open(self.train_path, 'r', encoding='utf-8') as f:
            for line in f:
                label = line.split('\t')[0]
                if label not in class_list:
                    class_list.append(label)
        return class_list

    def label_dict(self):
        # Build label <-> id mappings in order of first appearance
        label2id, id2label = {}, {}
        with open(self.train_path, 'r', encoding="utf-8") as data:
            for line in data:
                line = line.split('\t')
                label, text = line[0], line[1].replace("\n", "")
                if label not in label2id:
                    label2id[label] = len(label2id)
                    id2label[label2id[label]] = label
        return label2id, id2label


config = Config()

3. Read and process data

The code is as follows:

def split_data(ratio=0.2, transmit_data=config.train_path):
    # Read "label\tcontent" lines and map labels to integer ids
    lines = open(transmit_data, 'r', encoding='utf-8').readlines()
    x = [line.split('\t')[1].replace("\n", "") for line in lines]
    y = [config.label2id[line.split('\t')[0]] for line in lines]
    # Split off the training set first, then halve the remainder into test/dev
    train_x, test_val_x, train_y, test_val_y = train_test_split(x, y, test_size=ratio, stratify=y, random_state=42)
    test_x, val_x, test_y, val_y = train_test_split(test_val_x, test_val_y, test_size=0.5, stratify=test_val_y, random_state=42)
    train_data = [(x, y) for x, y in zip(train_x, train_y)]
    test_data = [(x, y) for x, y in zip(test_x, test_y)]
    val_data = [(x, y) for x, y in zip(val_x, val_y)]
    return train_data, test_data, val_data


train_data, test_data, dev_data = split_data(ratio=0.2, transmit_data=config.train_path)


class data_generator(DataGenerator):
    """Data generator"""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = config.tokenizer.encode(text, maxlen=config.pad_size)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == config.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []

train_generator = data_generator(train_data, config.batch_size)
test_generator = data_generator(test_data, config.batch_size)
dev_generator = data_generator(dev_data, config.batch_size)

4. Load the Albert pre-trained model

Use Su Jianlin's (苏神) bert4keras library to load the pre-trained model. The code is as follows:

bert = build_transformer_model(
    config_path=config.config_path,
    checkpoint_path=config.checkpoint_path,
    model="albert",
    return_keras_model=False
)

output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
output = Dense(
    units=config.num_classes,
    activation='softmax',
    kernel_initializer=bert.initializer
)(output)

model = keras.models.Model(bert.model.input, output)

# Write the model structure to a txt file
with open('model/modelsummary.txt', 'w+') as f:
    with redirect_stdout(f):
        model.summary()

AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdamLR(lr=config.learning_rate),
    metrics=['accuracy'])
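The point of extend_with_piecewise_linear_lr is its lr_schedule argument, which the compile call above does not pass. As an optional variant (the step boundaries 1000 and 2000 below are illustrative values, not from the original post), a warm-up-and-decay schedule could be supplied like this:

# Optional variant: ramp the learning rate up over the first 1000 steps,
# then decay it linearly to 10% of the peak by step 2000 (illustrative values).
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdamLR(lr=config.learning_rate, lr_schedule={
        1000: 1,
        2000: 0.1
    }),
    metrics=['accuracy'])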

5. Start training

The code is as follows:

# Compute accuracy on a data generator
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total


class Evaluator(keras.callbacks.Callback):
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):

        val_acc = evaluate(dev_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model.weights')
        test_acc = evaluate(test_generator)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
            (val_acc, self.best_val_acc, test_acc)
        )


evaluator = Evaluator()


model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=config.epochs,
    callbacks=[evaluator]
)

6. Verify the model

The code is as follows:

# Generate the classification report
def evaluate1(data):
    y1, y2 = [], []
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        y_pred = y_pred.tolist()
        y_true = y_true[:, 0]
        y_true = y_true.tolist()
        y1 = y1 + y_pred
        y2 = y2 + y_true

    y1, y2 = np.array(y1), np.array(y2)
    categories = list(config.label2id.keys())
    print(categories)
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y2, y1, target_names=categories))
    
evaluate1(test_generator)
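The original post ends with the classification report. As a small illustrative addition (not from the original; the Chinese string below is only a placeholder), a single sentence can be classified with the saved weights like this:

# Illustrative addition (not from the original post): classify one sentence
# using the best weights saved by the Evaluator callback.
model.load_weights('best_model.weights')

def predict_one(text):
    token_ids, segment_ids = config.tokenizer.encode(text, maxlen=config.pad_size)
    pred_id = model.predict([np.array([token_ids]), np.array([segment_ids])]).argmax(axis=1)[0]
    return config.id2label[int(pred_id)]

print(predict_one("待分类的文本"))  # placeholder text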

Summary

The model performs well overall, with accuracy fluctuating around 97%. However, some categories contain very few samples, so the classes are imbalanced and those categories are recognized poorly. Readers are encouraged to collect more data to balance the samples; an in-code alternative is sketched below.
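As an alternative to collecting more samples, the loss can be weighted by inverse class frequency. This is a sketch of that idea, not something from the original post; it assumes the train_data, train_generator, config, model, and evaluator objects defined in the earlier steps:

# Sketch (not in the original post): weight the loss by inverse class frequency
# to soften the class imbalance noted above.
from sklearn.utils.class_weight import compute_class_weight

train_labels = np.array([label for _, label in train_data])
weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels
)
class_weight = dict(enumerate(weights))

model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=config.epochs,
    class_weight=class_weight,
    callbacks=[evaluator]
)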
Thanks to 'Karabuka' in the comments section for the pointers.

References

https://github.com/bojone/bert4keras/blob/master/examples/task_sentiment_albert.py
Scientific Spaces (科学空间), Su Jianlin's blog: https://spaces.ac.cn

Original post: https://blog.csdn.net/black_lightning/article/details/111071172