Multi-label classification in practice

Multi-label classification is a common task in NLP. The most common approach is a sigmoid output layer trained with binary cross-entropy (BCE). Depending on the number and characteristics of the labels, there are different variants and optimizations.
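For reference, the standard sigmoid + BCE head looks like the minimal sketch below (the layer sizes and label count are placeholders, not the article's actual values). Each label gets its own independent sigmoid, so the final Dense layer grows linearly with the label space, which is exactly what becomes unwieldy at the scale discussed next.

from keras import layers
from keras.models import Model

num_labels = 1000  # placeholder; a 250,000-label output layer would be enormous
inp = layers.Input(shape=(128,))
hid = layers.Dense(256, activation="relu")(inp)
out = layers.Dense(num_labels, activation="sigmoid")(hid)  # one sigmoid per label
baseline = Model(inp, out)
baseline.compile(loss="binary_crossentropy", optimizer="adam")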

This article works through a label system of roughly 250,000 label words. With a label space this large, it is hard for training to learn a full representation of every label.

Some thoughts based on this task:

(1) Use negative sampling to shrink the output representation: score only a small sampled subset of labels per example instead of the full label space (see the sampling sketch after this list)

(2) Labels are correlated, so more negative samples are not always better

(3) For the loss, use a margin loss to widen the gap between positive and negative sample scores

For the results, see Scheme 3 of "Several Schemes of Document Tagging".
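Concretely, each training example pairs its positive tags with randomly drawn negatives, so the model only ever scores max_tag_size of the 250,000 tags at once (50 in the code below). The article does not show its data pipeline, so the helper below is a minimal sketch under my own assumptions; the function name and layout are hypothetical.

import random
import numpy as np

def sample_tags(pos_ids, num_tags, max_tag_size=50):
    # keep all positive tag ids, fill the remaining slots with random negatives
    # (assumes len(pos_ids) < max_tag_size)
    negs = set()
    while len(negs) < max_tag_size - len(pos_ids):
        t = random.randrange(num_tags)
        if t not in pos_ids:
            negs.add(t)
    tag_ids = list(pos_ids) + list(negs)
    y = [1] * len(pos_ids) + [0] * len(negs)
    return np.array(tag_ids), np.array(y)

Per point (2), the negative count is a tuning knob rather than "as many as possible": because labels are correlated, a randomly drawn "negative" may in fact be semantically close to a true label, so oversampling negatives injects noise.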

# coding=utf-8

"""
Negative-sampling version - speeds up iterative training
"""

import keras
from keras import layers
from keras.layers import *
from keras.models import Model, load_model
from keras.utils.np_utils import to_categorical
from keras.callbacks import *
from keras.optimizers import *
from keras import backend as K  # used by the loss function and Lambda layers below
import tensorflow as tf
import numpy as np
import os
import json
import random
from keras.losses import binary_crossentropy, categorical_crossentropy, categorical_hinge, mean_squared_logarithmic_error
from keras.metrics import mse, mean_absolute_error,cosine_proximity, sparse_categorical_crossentropy, categorical_crossentropy
from sklearn import preprocessing

import copy

from Atten_cy import Atten_cy

import keras.backend.tensorflow_backend as KTF

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # allocate GPU memory on demand instead of reserving it all
sess = tf.Session(config=config)
KTF.set_session(sess)



base = "tags/"
# base = ""



def multilabel_mag(y_true, y_pred):
    # margin loss: push the lowest-scored positive tag above the
    # highest-scored negative tag by a margin of 0.5
    pos_only = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    lowest_pos = K.min(pos_only, axis=-1)                 # lowest score among positive tags
    highest_neg = K.max((1. - y_true) * y_pred, axis=-1)  # highest score among negative tags
    return K.maximum(0., highest_neg - lowest_pos + 0.5)


# metrics referenced in model.compile below; reconstructed from the loss above
def min_pos(y_true, y_pred):
    # lowest predicted score over the positive tags
    pos_only = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    return K.min(pos_only, axis=-1)


def max_ng(y_true, y_pred):
    # highest predicted score over the negative tags
    return K.max((1. - y_true) * y_pred, axis=-1)

class sent_tag_model(object):

    def __init__(self):
        self.spl = "<->"
        self.max_seq_len = 50
        self.emd_size = 512
        self.filter_size = 512
        self.batch_size = 256

        self.max_tag_size = 50  # number of tags per example after sampling

        self.word2id = {"<pad>": 0}
        self.tag2id = {}
        self.id2tag = {}
        
        
        self.model = None
        self.model_dense = None
        self.model_tag_emb = None
        self.model_text_emb = None

    @staticmethod
    def load_json(path):
        with open(path, "r", encoding="utf-8") as reader:
            res = json.load(reader)
            return res

    @staticmethod
    def dump_json(obj, path):
        with open(path, "w", encoding="utf-8") as writer:
            json.dump(obj, writer, ensure_ascii=False)

    
    def build_model(self):
        in_p = layers.Input(shape=(self.max_seq_len,))
        emb = layers.Embedding(input_dim=len(self.word2id) + 1, output_dim=self.emd_size)(in_p)

        # enrich the features with convolutions of several kernel widths
        conv_2 = layers.Conv1D(kernel_size=1, filters=self.filter_size, strides=1, padding="same")(emb)
        conv_3 = layers.Conv1D(kernel_size=2, filters=self.filter_size, strides=1, padding="same")(emb)
        conv_4 = layers.Conv1D(kernel_size=3, filters=self.filter_size, strides=1, padding="same")(emb)
        conv_5 = layers.Conv1D(kernel_size=4, filters=self.filter_size, strides=1, padding="same")(emb)

        x = layers.Concatenate(axis=-1)([emb, conv_2, conv_3, conv_4, conv_5])
        x = Atten_cy()(x)
#         gm = layers.GlobalAveragePooling1D()(emb)
#         x = layers.Concatenate(axis=-1)([x, gm])
        x = layers.Dense(self.emd_size)(x)
        x = layers.Lambda(lambda x: K.l2_normalize(x, axis=-1))(x)  # unit-length text vector
        x_r = layers.RepeatVector(self.max_tag_size)(x)             # one copy per sampled tag



        in_tag = layers.Input(shape=(self.max_tag_size,))
        emb_tag = layers.Embedding(input_dim=len(self.tag2id), output_dim=self.emd_size)(in_tag)
        emb_tag = layers.Lambda(lambda x: K.l2_normalize(x, axis=-1))(emb_tag)  # unit-length tag vectors


        x_dense_in = layers.Concatenate()([x_r, emb_tag])

        # scoring head, built as a separate inner model so it can be reused on its own
        dense_in = layers.Input(shape=(None, self.emd_size * 2))
        x_dense = layers.Dense(units=2048, activation="relu")(dense_in)
        x_dense = layers.Dense(units=1)(x_dense)
        output = layers.Lambda(lambda x: K.squeeze(x, axis=-1))(x_dense)
        output = layers.Activation("sigmoid")(output)
        model_dense = Model(dense_in, output)

        # full model: text plus sampled tag ids -> one score per sampled tag
        output_all = model_dense(x_dense_in)
        model = Model([in_p, in_tag], output_all)

        opt = Adam(lr=0.00005)

        model.compile(loss=multilabel_mag,
                      optimizer=opt,
                      metrics=["mae", max_ng, min_pos])

        # auxiliary models for exporting tag and text embeddings separately
        model_tag_emb = Model(in_tag, emb_tag)
        model_text_emb = Model(in_p, x)

        model.summary()

        return model, model_dense, model_tag_emb, model_text_emb
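A hypothetical end-to-end call, assuming Atten_cy pools the sequence to a single vector (which the RepeatVector in build_model implies); the toy vocabulary, tag set, and data below are my placeholders, not the article's:

if __name__ == "__main__":
    stm = sent_tag_model()
    stm.word2id.update({"文": 1, "本": 2})                 # toy vocab
    stm.tag2id.update({"tagA": 0, "tagB": 1, "tagC": 2})   # toy tag set
    model, model_dense, model_tag_emb, model_text_emb = stm.build_model()

    X_text = np.zeros((8, stm.max_seq_len))   # padded word-id sequences
    X_tags = np.zeros((8, stm.max_tag_size))  # sampled tag ids per example
    y_true = np.zeros((8, stm.max_tag_size))
    y_true[:, 0] = 1                          # first sampled slot holds the positive tag
    model.fit([X_text, X_tags], y_true, batch_size=8, epochs=1)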


Origin: blog.csdn.net/cyinfi/article/details/107327846