Article directory
Preparation
Pre-training model download address: Albert_Large_zh
Dataset download address: Accident and disaster multi-classification dataset (the dataset is obtained by crawlers, please advise if there are any errors)
1. Processing Datasets
Put the pre-trained model into the pretraining_model folder.
Split the dataset into a training set, a test set and a validation set (train.txt, test.txt and dev.txt respectively), typically in a 7:2:1 ratio, and put them into the datasets folder.
Sample data:
Each line contains a label and its text content, separated by a tab ('\t').
2. Build classification tasks
project structure
Keras_Bert_Class/
|-- datasets/
| |-- train.txt
| |-- test.txt
| |-- dev.txt
|
|-- model/
|
|-- pretraining_model/
| |-- albert_large/
|
|-- main.py
|-- requirements.txt
1. Import library
The code is as follows:
import os
from contextlib import redirect_stdout

import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import DataGenerator, sequence_padding
from bert4keras.tokenizers import Tokenizer
from keras.layers import Lambda, Dense
from sklearn import metrics
from sklearn.model_selection import train_test_split
2. Configuration parameters
The code is as follows:
# Configuration
class Config:
    """Hyper-parameters, file paths and label vocabulary for the classifier.

    Instantiating the class reads ``train_path`` (once for the class list and
    once for the label dictionaries) and builds the tokenizer from
    ``dict_path``.
    """

    def __init__(self):
        # Name of the pre-trained model
        self.model_name = "bert"
        # Training data file: one "<label>\t<text>" sample per line
        self.train_path = "datasets/train.txt"
        # Distinct class labels, in first-seen order
        self.class_list = self.read_class()
        # Number of classes
        self.num_classes = len(self.class_list)
        # Number of training epochs
        self.epochs = 10
        # Mini-batch size
        self.batch_size = 8
        # Maximum sequence length passed to the tokenizer as ``maxlen``
        self.pad_size = 128
        # Learning rate
        self.learning_rate = 1e-5
        # Pre-trained model files
        self.config_path = "./pretraining_model/albert_large/albert_config.json"
        self.checkpoint_path = "./pretraining_model/albert_large/albert_model.ckpt"
        self.dict_path = "./pretraining_model/albert_large/vocab.txt"
        # Text tokenizer
        self.tokenizer = Tokenizer(self.dict_path)
        # Label <-> id lookup tables
        self.label2id, self.id2label = self.label_dict()

    def _iter_labels(self):
        """Yield the label column of every tab-separated line in ``train_path``.

        Lines without a tab (for example blank lines) are skipped.
        """
        with open(self.train_path, 'r', encoding='utf-8') as data:
            for line in data:
                label, sep, _ = line.partition('\t')
                if sep:
                    yield label

    def read_class(self):
        """Return the distinct labels found in ``train_path``.

        Returns:
            A list of label strings in the order they first appear.
        """
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(self._iter_labels()))

    def label_dict(self):
        """Build the label -> id and id -> label lookup tables.

        Ids are assigned consecutively from 0 in first-seen order, and the
        two dictionaries are exact inverses of each other.

        Returns:
            A ``(label2id, id2label)`` tuple of dictionaries.
        """
        label2id, id2label = {}, {}
        for label in self._iter_labels():
            if label not in label2id:
                # Take the index before inserting so both tables agree.
                index = len(label2id)
                label2id[label] = index
                id2label[index] = label
        return label2id, id2label
# Module-wide configuration instance used by the code below.
config = Config()
3. Read and process data
The code is as follows:
def split_data(ratio=0.2, transmit_data=None):
    """Split a labelled text file into train / test / validation samples.

    Args:
        ratio: Fraction of the samples held out from training. The held-out
            part is then divided equally between test and validation.
        transmit_data: Path of a file holding one tab-separated
            "label, text" sample per line. Defaults to ``config.train_path``.

    Returns:
        A ``(train_data, test_data, val_data)`` tuple; each element is a list
        of ``(text, label_id)`` pairs.

    Raises:
        KeyError: If the file contains a label missing from
            ``config.label2id``.
    """
    # Resolve the default lazily instead of binding it at definition time.
    if transmit_data is None:
        transmit_data = config.train_path

    x, y = [], []
    with open(transmit_data, 'r', encoding='utf-8') as data:
        for line in data:
            label, sep, text = line.rstrip('\n').partition('\t')
            if not sep:
                continue  # blank or malformed line
            x.append(text)
            # Integer ids are what sparse_categorical_crossentropy expects.
            y.append(config.label2id[label])

    train_x, test_val_x, train_y, test_val_y = train_test_split(
        x, y, test_size=ratio, stratify=y, random_state=42)
    test_x, val_x, test_y, val_y = train_test_split(
        test_val_x, test_val_y, test_size=0.5, stratify=test_val_y,
        random_state=42)

    train_data = list(zip(train_x, train_y))
    test_data = list(zip(test_x, test_y))
    val_data = list(zip(val_x, val_y))
    return train_data, test_data, val_data
# Hold out 20% of the samples from training and split them into test and validation data.
train_data, test_data, dev_data = split_data(ratio=0.2, transmit_data=config.train_path)
class data_gernerator(DataGenerator):
    """Data generator yielding ``([token_ids, segment_ids], labels)`` batches."""

    def __iter__(self, random=False):
        """Tokenize the samples and yield them as padded batches.

        Args:
            random: Forwarded to ``self.sample``.

        Yields:
            ``([batch_token_ids, batch_segment_ids], batch_labels)`` after
            ``sequence_padding`` has been applied to each of the three lists.
        """
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = config.tokenizer.encode(text, maxlen=config.pad_size)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            # Use the batch size given to the constructor so that batching
            # stays consistent with len(generator) / steps_per_epoch.
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
# One batch generator per split, all using the configured batch size.
train_generator, test_generator, dev_generator = [
    data_gernerator(samples, config.batch_size)
    for samples in (train_data, test_data, dev_data)
]
4. Load the Albert pre-trained model
Use Su Jianlin's bert4keras library to load the pre-trained model.
The code is as follows:
# Load the pre-trained ALBERT weights. With return_keras_model=False the
# wrapper object is returned; its Keras model is reached through `bert.model`.
bert = build_transformer_model(
    config_path=config.config_path,
    checkpoint_path=config.checkpoint_path,
    model="albert",
    return_keras_model=False
)
# Keep only the vector at position 0 (the layer is named 'CLS-token').
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
# Softmax classification head with one unit per class.
output = Dense(
    units=config.num_classes,
    activation='softmax',
    kernel_initializer=bert.initializer
)(output)
# End-to-end classifier: ALBERT inputs -> class probabilities.
model = keras.models.Model(bert.model.input, output)
# Write the model structure to a text file
os.makedirs('model', exist_ok=True)  # the output folder may not exist yet
# model.summary() prints to stdout, so redirect stdout into the file.
with open('model/modelsummary.txt', 'w', encoding='utf-8') as f:
    with redirect_stdout(f):
        model.summary()
# Adam extended with a piecewise-linear learning-rate schedule.
# NOTE(review): no lr_schedule is passed below, so the library default
# schedule applies — confirm that this is intended.
AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
# Labels are integer class ids, hence the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdamLR(lr=config.learning_rate),
    metrics=['accuracy'])
5. Start training
The code is as follows:
# Compute accuracy
def evaluate(data):
    """Return the fraction of samples in ``data`` that ``model`` labels correctly.

    ``data`` is iterated as ``(inputs, labels)`` batches; the true class id is
    read from column 0 of each label batch.
    """
    n_correct, n_seen = 0., 0.
    for inputs, labels in data:
        predicted = model.predict(inputs).argmax(axis=1)
        expected = labels[:, 0]
        n_seen += len(expected)
        n_correct += (expected == predicted).sum()
    return n_correct / n_seen
class Evaluator(keras.callbacks.Callback):
    """Callback that tracks validation accuracy and keeps the best weights.

    After every epoch it evaluates ``dev_generator`` and ``test_generator``,
    saves the weights to ``best_model.weights`` whenever the validation
    accuracy improves, and prints the three accuracy figures.
    """

    def __init__(self):
        # Let the base Callback initialise its own state first.
        super().__init__()
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the model and checkpoint it if validation accuracy improved."""
        val_acc = evaluate(dev_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model.weights')
        test_acc = evaluate(test_generator)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
            (val_acc, self.best_val_acc, test_acc)
        )
# Train the model; the evaluator callback runs at the end of every epoch.
# NOTE(review): fit_generator is deprecated in recent tf.keras releases in
# favour of fit — confirm against the installed Keras version.
evaluator = Evaluator()
model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=config.epochs,
    callbacks=[evaluator]
)
6. Verify the model
The code is as follows:
# Generate the classification report
def evaluate1(data):
    """Print per-class precision, recall and F1 for the batches in ``data``.

    Args:
        data: Iterable of ``(inputs, labels)`` batches; the true class id is
            read from column 0 of each label batch.
    """
    predicted, expected = [], []
    for x_true, y_true in data:
        # extend() avoids re-copying the accumulated lists on every batch.
        predicted.extend(model.predict(x_true).argmax(axis=1).tolist())
        expected.extend(y_true[:, 0].tolist())
    predicted, expected = np.array(predicted), np.array(expected)
    categories = list(config.label2id.keys())
    print(categories)
    print("Precision, Recall and F1-Score...")
    # Passing `labels` keeps the rows aligned with `target_names` even when a
    # class does not occur in the evaluated data.
    print(metrics.classification_report(
        expected,
        predicted,
        labels=list(config.label2id.values()),
        target_names=categories,
    ))
# Print the classification report for the test split.
evaluate1(test_generator)
Summary
Overall, the model performs well. However, a few categories have very few samples in the dataset, so the classes are imbalanced and recognition of those categories is poor. Readers are encouraged to collect more data to balance the samples. The model's accuracy fluctuates around 97%.
Thanks to 'Karabuka' in the comments section for the pointers.
References
https://github.com/bojone/bert4keras/blob/master/examples/task_sentiment_albert.py
Scientific Spaces (Su Jianlin's blog)