多标签分类是NLP领域的常见任务,最常用的做法是 sigmoid + BCE(二元交叉熵损失);根据标签数量和特点的不同,可以有不同的变形和优化。
本文实践的是一个包含 25 万个标签词的标签体系。由于标签数量巨大,若对全部标签做全量表示(输出维度等于标签总数),模型将难以训练。
基于该任务的若干思考:
(1)使用负采样的方式缩减输出向量的维度(每条样本只对抽样后的少量标签打分)
(2)标签具有相关性,负采样的量不是越大越好
(3)在loss上,采用margin loss 优化正负样本的间隔
效果见《文档标签化的几种方案》方案三
# coding=utf-8
"""
Negative-sampling version - speeds up training iterations.
"""
import keras
from keras import layers
from keras.layers import *
from keras.models import Model, load_model
from keras.utils.np_utils import to_categorical
from keras.callbacks import *
from keras.optimizers import *
import tensorflow as tf
import numpy as np
import os
import json
import random
from keras.losses import binary_crossentropy, categorical_crossentropy, categorical_hinge, mean_squared_logarithmic_error
from keras.metrics import mse, mean_absolute_error,cosine_proximity, sparse_categorical_crossentropy, categorical_crossentropy
from sklearn import preprocessing
import copy
from Atten_cy import Atten_cy
import keras.backend.tensorflow_backend as KTF
# Process-wide TensorFlow/Keras session setup; runs once when the module is imported.
# Restrict this process to GPU 0 (set before the session below is created).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
config = tf.ConfigProto()
config.gpu_options.allow_growth=True # do not reserve all GPU memory up front; allocate on demand
sess = tf.Session(config=config)
# Make the Keras TensorFlow backend use the session configured above.
KTF.set_session(sess)
# Path prefix; not used in this part of the file -- presumably the data/model
# directory (confirm against the code that reads it).
base = "tags/"
# base = ""
def multilabel_mag(y_true, y_pred, margin=0.5):
    """Margin (hinge-style) ranking loss between positive and negative scores.

    Along the last axis, the lowest score among the positive labels is
    compared with the highest score among the negative labels. The loss is
    zero once the lowest positive exceeds the highest negative by at least
    ``margin``.

    Args:
        y_true: Label tensor; entries equal to 1 mark positives. Assumed to
            be 0/1 valued and shaped like ``y_pred`` (TODO confirm against
            the data generator).
        y_pred: Score tensor. Assumed to lie in [0, 1], e.g. sigmoid
            outputs (TODO confirm for every model using this loss).
        margin: Required gap between the lowest positive score and the
            highest negative score. Defaults to 0.5.

    Returns:
        Tensor with the last axis reduced, holding
        ``max(0, highest_negative - lowest_positive + margin)``.
    """
    is_positive = tf.equal(y_true, 1)
    # Non-positive slots are replaced by 1 so that, for scores <= 1, they do
    # not determine the minimum; with no positive at all the minimum is 1.
    positive_scores = tf.where(is_positive, y_pred, tf.ones_like(y_pred))
    lowest_positive = K.min(positive_scores, axis=-1)
    # Multiplying by (1 - y_true) zeroes the positive slots, leaving only
    # negative scores to compete for the maximum.
    highest_negative = K.max((1. - y_true) * y_pred, axis=-1)
    return K.maximum(0., highest_negative - lowest_positive + margin)
class sent_tag_model(object):
    """Text-to-tag matching model that scores a text against sampled tags.

    Holds the hyper-parameters and vocabularies, small JSON helpers, and
    ``build_model``, which assembles and compiles the Keras network.
    """

    def __init__(self):
        """Set default hyper-parameters and empty vocabularies/model slots."""
        # Separator string; not used by the methods shown here.
        self.spl = "<->"
        # Length of the token-id input sequence.
        self.max_seq_len = 50
        # Embedding width shared by the text and tag representations.
        self.emd_size = 512
        # Number of filters in each Conv1D branch.
        self.filter_size = 512
        self.batch_size = 256
        # Number of tags per example after sampling.
        self.max_tag_size = 50
        # Token -> id vocabulary; id 0 is reserved for padding.
        self.word2id = {"<pad>": 0}
        # Tag <-> id vocabularies, empty until filled by the caller.
        self.tag2id = {}
        self.id2tag = {}
        # Model slots; build_model() returns the models rather than
        # assigning them here.
        self.model = None
        self.model_dense = None
        self.model_tag_emb = None
        self.model_text_emb = None

    @staticmethod
    def load_json(path):
        """Read the UTF-8 JSON file at ``path`` and return the parsed object."""
        with open(path, "r", encoding="utf-8") as reader:
            return json.load(reader)

    @staticmethod
    def dump_json(obj, path):
        """Write ``obj`` as JSON to ``path`` (UTF-8, non-ASCII kept verbatim)."""
        with open(path, "w", encoding="utf-8") as writer:
            json.dump(obj, writer, ensure_ascii=False)

    def build_model(self):
        """Build and compile the text/tag scoring network.

        The text branch embeds the token ids, concatenates the embeddings
        with four parallel Conv1D feature maps (kernel sizes 1-4), applies
        ``Atten_cy``, and projects to an L2-normalized vector of size
        ``emd_size``. That vector is repeated ``max_tag_size`` times and
        concatenated with the L2-normalized embedding of each candidate tag.
        A shared dense head maps each (text, tag) pair to a sigmoid score.

        Returns:
            tuple: ``(model, model_dense, model_tag_emb, model_text_emb)``

            * ``model``: inputs ``[token ids, tag ids]`` -> one score per tag
              slot; compiled with ``multilabel_mag`` and Adam.
            * ``model_dense``: the scoring head on its own input, mapping
              concatenated (text, tag) vectors to scores.
            * ``model_tag_emb``: tag ids -> L2-normalized tag embeddings.
            * ``model_text_emb``: token ids -> L2-normalized text vector.
        """
        # ---- text branch ----
        in_p = layers.Input(shape=(self.max_seq_len,))
        # The embedding table has one more row than the vocabulary size.
        emb = layers.Embedding(input_dim=len(self.word2id) + 1, output_dim=self.emd_size)(in_p)
        # Enrich the token features with convolutions of several widths.
        # Layers are created in kernel-size order 1, 2, 3, 4.
        conv_feats = [
            layers.Conv1D(kernel_size=width, filters=self.filter_size, strides=1, padding="same")(emb)
            for width in (1, 2, 3, 4)
        ]
        x = layers.Concatenate(axis=-1)([emb] + conv_feats)
        # Project-local attention layer; presumably pools the sequence into a
        # single vector, since RepeatVector below needs a 2D input (confirm
        # in Atten_cy).
        x = Atten_cy()(x)
        x = layers.Dense(self.emd_size)(x)
        x = layers.Lambda(lambda t: K.l2_normalize(t, axis=-1))(x)
        # One copy of the text vector per candidate tag slot.
        x_r = layers.RepeatVector(self.max_tag_size)(x)

        # ---- tag branch ----
        in_tag = layers.Input(shape=(self.max_tag_size,))
        emb_tag = layers.Embedding(input_dim=len(self.tag2id), output_dim=self.emd_size)(in_tag)
        emb_tag = layers.Lambda(lambda t: K.l2_normalize(t, axis=-1))(emb_tag)

        x_dense_in = layers.Concatenate()([x_r, emb_tag])

        # ---- scoring head ----
        # Built on its own Input so it is also returned as a standalone model.
        dense_in = layers.Input(shape=(None, self.emd_size * 2))
        x_dense = layers.Dense(units=2048, activation="relu")(dense_in)
        x_dense = layers.Dense(units=1)(x_dense)
        output = layers.Lambda(lambda t: K.squeeze(t, axis=-1))(x_dense)
        output = layers.Activation("sigmoid")(output)
        model_dense = Model(dense_in, output)

        output_all = model_dense(x_dense_in)
        model = Model([in_p, in_tag], output_all)

        opt = Adam(lr=0.00005)
        # NOTE(review): max_ng and min_pos are metric functions expected at
        # module level; they are not defined in this part of the file.
        model.compile(loss=multilabel_mag,
                      optimizer=opt,
                      metrics=["mae", max_ng, min_pos])

        model_tag_emb = Model(in_tag, emb_tag)
        model_text_emb = Model(in_p, x)

        # summary() prints the table itself and returns None, so it is not
        # wrapped in print() (that would emit an extra "None" line).
        model.summary()
        return model, model_dense, model_tag_emb, model_text_emb