Deep Learning for Extreme Multi-label Text Classification 实现

论文链接：

https://www.semanticscholar.org/paper/Deep-Learning-for-Extreme-Multi-label-Text-Liu-Chang/1a0365567850837931d04126714ae6e2cbfc6270

论文目的：

解决极度多标签文本分类问题，这里总的标签量一般超过1000（可以更多）。

模型结构：

此模型基于CNN-Kim（即一般的TextCNN）有两点改进。

一方面是对于文本卷积层，将Kim的对整个文本序列的max-pooling层进行分块（dynamic pooling），即将原来的待pooling序列沿文本方向分成若干子块，对这些子块分别pooling之后，再将各子块pooling的结果拼接起来传入后续神经层，想法是这样处理后能提取更多的文本特征。

另一方面是在损失函数处使用 binary cross-entropy 代替一般多标签分类常用的 softmax cross-entropy（即用 sigmoid 代替 softmax）。

扫描二维码关注公众号，回复： 1723389 查看本文章

模型结构图：

测试数据：

数据来源：

https://www.kaggle.com/stackoverflow/stacksample

数据说明：

这份数据是Stack Overflow的一份QA数据，这里仅用到其中的文本部分，即Question、Answer、Title、Tags。将其抽象成文本多标签分类问题，即

自变量集合Question Answer Title

因变量集合 Tags

选择这部分数据从模型角度的原因是其Tags集合标签总数多于10000个，符合极度多标签分类的定义，另一方面这是一个QA数据集，除了文本分类问题外还可以做一些QA及记忆网络的尝试。（从关于模型的描述可以看出，模型并非十分复杂，但经过一些数据处理过程可以让我们在QA或其它场景应用处理过的数据集）

数据处理：

数据源包含文件 Questions.csv Answers.csv Tags.csv

数据为英文数据，会涉及到一些简单的数据处理。

数据清洗：

用库：https://github.com/GauravBh1010tt/DL-text

import pandas as pd
from dl_text import dl
import re
import gc
from collections import defaultdict
from functools import reduce

# Compiled once at import time; matches the shortest "<...>" run on a single line.
_HTML_TAG_RE = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Remove HTML tags from ``raw_html`` and return the remaining text.

    Args:
        raw_html: The HTML string to clean. Any non-string value (``None``,
            or the float NaN that pandas uses for a missing CSV cell) is
            treated as missing text.

    Returns:
        The input with every ``<...>`` tag removed, or ``""`` when the input
        is empty or not a string.
    """
    # NaN is truthy, so a plain ``if not raw_html`` lets it through and
    # ``re.sub`` then raises TypeError; check the type explicitly instead.
    if not isinstance(raw_html, str) or not raw_html:
        return ""
    return _HTML_TAG_RE.sub('', raw_html)

def lower_tokenize(text):
    """Strip HTML from ``text``, lowercase it, then run ``dl.clean`` and ``dl.tokenize``.

    Returns whatever ``dl.tokenize`` produces for the cleaned text.
    """
    without_tags = cleanhtml(text)
    lowered = without_tags.lower()
    cleaned = dl.clean(lowered)
    return dl.tokenize(cleaned)

这里包含了去掉html标签及使用dl库将英文的一些常用简写映射进行分离的操作（如I’ve 到I have等）。

用于训练的embedding既可以使用glove也可以用word2vec，调用

https://github.com/GradySimon/tensorflow-glove

可以如下获得glove-embedding（不支持partial predict的更新估计方式且较为占内存，优化一般）

def train_glove_embedding(pre = True):
    """Build the training corpus, or train GloVe embeddings on it.

    Args:
        pre: When True, tokenize the ``Body`` column of ``Questions.csv`` and
            ``Answers.csv`` and write one space-joined sentence per line to
            ``corpus.txt``, then return. When False, read at most 1e5 lines
            of ``corpus.txt``, fit a ``tf_glove.GloVeModel`` and write
            ``glove_embed.txt`` as ``word<TAB>v1 v2 ...`` lines.

    Returns:
        None. All results are written to files in the working directory.
    """
    if pre:
        def row_process(df, min_seq_len = 100, max_r = 5e10):
            """Yield token lists longer than ``min_seq_len`` from ``df["Body"]``."""
            have_count = 0
            for r, row in df.iterrows():
                if r % 10000 == 0:
                    print("r %d final" % (r,))
                    # The cap is only checked on progress ticks, as before.
                    if have_count > max_r:
                        return

                body = row["Body"]
                # Missing cells arrive as float NaN; there is nothing to tokenize.
                if not isinstance(body, str):
                    continue

                tokens = lower_tokenize(body)
                if len(tokens) > min_seq_len:
                    have_count += 1
                    yield tokens

        with open("corpus.txt", "w") as f:
            # Load one CSV at a time: a generator keeps its DataFrame alive, so
            # deleting the frame while the generator is pending frees nothing.
            for csv_path in ("Questions.csv", "Answers.csv"):
                body_df = pd.read_csv(csv_path, encoding="latin1")[["Body"]]
                for tokens in row_process(body_df, min_seq_len=100):
                    if tokens:
                        f.write(" ".join(tokens) + "\n")
                del body_df
                gc.collect()
        return

    corpus = []
    with open("corpus.txt") as f:
        for line in f:
            # Drop the line terminator first; otherwise the last token of each
            # sentence becomes "word\n", a separate vocabulary entry that also
            # breaks the line-based format of glove_embed.txt.
            corpus.append(line.rstrip("\n").split(" "))
            if len(corpus) >= 1e5:
                break

    print("length {}".format(len(corpus)))

    from glove_embed import tf_glove
    model = tf_glove.GloVeModel(embedding_size=300, context_size=10, min_occurrences=1,
                                learning_rate=0.05, batch_size=512)
    model.fit_to_corpus(corpus)
    model.train(num_epochs=100)

    with open("glove_embed.txt", "w") as f:
        for i, word in enumerate(model.words):
            if i % 10000 == 0:
                print("file serlize {} end".format(i))

            vector_text = " ".join(model.embedding_for(word).astype(str).tolist())
            f.write("{}\t{}\n".format(word, vector_text))

或者如下使用word2vec

def train_w2v_embedding():
    """Train a gensim Word2Vec model on ``corpus.txt`` in chunks of 5e4 sentences.

    The corpus is streamed ``num_epoch`` times. The model is created from the
    first chunk and trained further on every later chunk; it is saved to
    ``w2v.model`` after each chunk. After each pass the current vectors are
    written to ``w2v_embed.txt`` as ``word<TAB>v1 v2 ...`` lines.

    Uses the gensim < 4 API (``size=``, ``iter=``, ``wv.vocab``), as the
    original code did.

    Raises:
        ValueError: if ``corpus.txt`` contains no sentences.
    """
    from gensim.models import Word2Vec
    w2v_path = "w2v.model"
    w2v_model = None
    embed_size = 300
    num_epoch = 10
    chunk_size = int(5e4)

    def fit_chunk(model, chunk):
        """Create the model from ``chunk`` or continue training on it, then save."""
        if model is None:
            model = Word2Vec(chunk, size=embed_size, min_count=1, workers=4, iter=10)
        else:
            # Without a vocabulary update, words first seen in later chunks are
            # ignored by train() and never receive a vector.
            model.build_vocab(chunk, update=True)
            model.train(chunk, total_examples=len(chunk), epochs=10)
        model.save(w2v_path)
        print("one train iter end")
        return model

    for epoch in range(num_epoch):
        with open("corpus.txt") as f:
            corpus = []
            for line in f:
                # Strip the newline so the last token is not stored as "word\n".
                corpus.append(line.rstrip("\n").split(" "))
                # Train as soon as the chunk is full, so the line that fills
                # the chunk is not dropped.
                if len(corpus) >= chunk_size:
                    w2v_model = fit_chunk(w2v_model, corpus)
                    corpus = []

            # The trailing partial chunk used to be discarded.
            if corpus:
                w2v_model = fit_chunk(w2v_model, corpus)

        print("epoch {} end".format(epoch))
        if w2v_model is None:
            raise ValueError("corpus.txt contains no sentences to train on")

        wv = w2v_model.wv
        with open("w2v_embed.txt", "w") as f:
            for word in wv.vocab.keys():
                f.write("{}\t{}\n".format(word, " ".join(wv[word].astype(str).tolist())))

下面实现Questions.csv Answers.csv Tags.csv 的数据合并

def classifier_process(first_time = False):
    """Join questions, answers and tags into one tab-separated sample file.

    For each question in the selected half of ``Questions.csv`` that has at
    least one answer and one tag, one line is written:
    ``question_tokens<TAB>answer_tokens<TAB>title_tokens<TAB>tags``.
    All answers to a question are joined into a single text.

    Args:
        first_time: True processes the first 6e5 question rows and writes
            ``X_y_file_6e5_0.txt``; False (default) processes the remaining
            rows and writes ``X_y_file_6e5_1.txt``.

    Returns:
        None. The result is written to the file named above.
    """
    q_columns = ["Id", "Body", "Title"]
    a_columns = ["ParentId", "Body"]
    tag_columns = ["Id", "Tag"]
    half_rows = int(6e5)

    def text_or_empty(value):
        """Return ``value`` if it is a string, else "" (pandas gives NaN for missing cells)."""
        return value if isinstance(value, str) else ""

    # use half datasets (6e5) two times
    if first_time:
        q_df = pd.read_csv("Questions.csv", nrows=half_rows, encoding="latin1")[q_columns]
    else:
        q_df = pd.read_csv("Questions.csv", encoding="latin1")[q_columns]
        q_df = q_df.iloc[half_rows:]
    q_df = q_df.rename(columns={"Body": "q_Body"})

    # Omitting nrows reads the whole file; no huge float sentinel is needed.
    a_df = pd.read_csv("Answers.csv", encoding="latin1")[a_columns]
    a_df = a_df.rename(columns={"Body": "a_Body", "ParentId": "Id"})

    X_df = pd.merge(q_df, a_df, how = "inner", on = "Id")

    del a_df
    del q_df
    gc.collect()
    print("X_df merge")

    # Collapse the one-row-per-answer frame to one row per question.
    X_df_constuct = defaultdict(list)
    unique_column = [col for col in X_df.columns if col != "a_Body"]
    for _, group_df in X_df.groupby("Id"):
        for col in unique_column:
            X_df_constuct[col].append(group_df[col].iloc[0])
        # Join with a space so words of adjacent answers do not fuse once tags
        # are stripped, and skip NaN bodies, which cannot be concatenated.
        answers = [body for body in group_df["a_Body"] if isinstance(body, str)]
        X_df_constuct["a_Body"].append(" ".join(answers))

    X_df = pd.DataFrame.from_dict(X_df_constuct)

    del X_df_constuct
    gc.collect()
    print("X_df unique")

    tag_df = pd.read_csv("Tags.csv", encoding="latin1")[tag_columns]
    # One row per question Id holding the list of its tags.
    y_df = (tag_df.groupby("Id")["Tag"].apply(list)
            .reset_index()
            .rename(columns={"Tag": "Tags"}))

    del tag_df
    gc.collect()
    print("tag_df unique")

    X_y_df = pd.merge(X_df, y_df, how = "inner", on = "Id")

    del X_df
    del y_df
    gc.collect()
    print("feature prepare start")

    # Each half gets its own file; previously both runs wrote the "_1" file, so
    # the second run overwrote the first and the "_0" file was never produced.
    out_path = "X_y_file_6e5_{}.txt".format(0 if first_time else 1)
    with open(out_path, "w") as f:
        for r_index, row in X_y_df.iterrows():
            q_body_tokenize = lower_tokenize(text_or_empty(row["q_Body"]))
            a_body_tokenize = lower_tokenize(text_or_empty(row["a_Body"]))
            title_tokenize = lower_tokenize(text_or_empty(row["Title"]))
            # Tags parsed as NaN by pandas are dropped, as are empty strings.
            tags = [tag for tag in row["Tags"] if isinstance(tag, str) and tag]

            f.write("{}\t{}\t{}\t{}\n".format(" ".join(q_body_tokenize),
                                              " ".join(a_body_tokenize),
                                              " ".join(title_tokenize),
                                              " ".join(tags)))

            if r_index % 10000 == 0:
                print("feature_prepare {} end".format(r_index))

在这里我们设定对于1000个最常见的Tag的多标签分类，下面根据这个分类目标进行样本过滤，删去较少出现的Tag对应样本（注意：下面的示例代码实际取的是最常见的10个Tag，对应文末的10分类实验）。

# use this script to identify used tags num
def eval_tag_count(top_k=10):
    """Keep only the ``top_k`` most frequent tags and the samples that use them.

    Reads ``X_y_file_6e5_0.txt`` and ``X_y_file_6e5_1.txt`` (tab-separated,
    tags space-separated in the last field), counts every tag, prints the
    count of the least frequent kept tag, and writes each sample that still
    has a kept tag to ``X_y_file_12e5_<top_k>.txt`` with its tag field
    reduced to the kept tags.

    Args:
        top_k: Number of most frequent tags to keep. The default of 10 gives
            the original output name ``X_y_file_12e5_10.txt``.

    Raises:
        ValueError: if a sample with a kept tag does not have exactly four
            tab-separated fields.
    """
    from collections import Counter

    source_paths = ("X_y_file_6e5_0.txt", "X_y_file_6e5_1.txt")

    def split_tags(line):
        """Return the non-empty tags in the last tab-separated field of ``line``."""
        tag_field = line.rstrip("\n").split("\t")[-1]
        return [tag for tag in tag_field.split(" ") if tag]

    # First pass: stream the counts instead of building one list of all tags.
    cnt = Counter()
    for path in source_paths:
        with open(path) as f:
            for line in f:
                cnt.update(split_tags(line))

    top = cnt.most_common(top_k)
    # most_common is sorted by descending count, so the last entry is the minimum.
    print("min_count :{}".format(top[-1][1] if top else 0))
    most_common_keys = {tag for tag, _ in top}

    # Second pass: copy the samples that still have a kept tag.
    o_num = 0
    with open("X_y_file_12e5_{}.txt".format(top_k), "w") as o:
        for path in source_paths:
            with open(path) as f:
                for line in f:
                    tags = [tag for tag in split_tags(line) if tag in most_common_keys]
                    if not tags:
                        continue

                    o_num += 1
                    if o_num % 10000 == 0:
                        print("o_num: {}".format(o_num))

                    # rstrip rather than [:-1]: a final line without a newline
                    # must not lose its last character.
                    fs, s, t, _ = line.rstrip("\n").split("\t")
                    o.write("{}\t{}\t{}\t{}\n".format(fs, s, t, " ".join(tags)))

最后对文本进行编码，为送入网络做准备：

def classifier_prepare():
    """Encode the filtered samples as integer ids and dump the lookup tables.

    Steps:
      1. Load ``w2v.model`` and build ``word2idx`` from its vocabulary, with
         ``<unk>`` and ``<pad>`` appended as the last two indices.
      2. Collect every tag in ``X_y_file_12e5_10.txt`` and build ``tag2idx``.
      3. Re-write each sample with words and tags replaced by their indices:
         lines whose number modulo 10 is 8 or 9 go to
         ``classifier_10_test.txt``, the rest to ``classifier_10_train.txt``.
      4. Build the embedding matrix (random rows for words without a
         vector) and pickle it with both index maps to
         ``embed_and_idx_10.pkl``.

    Uses the gensim < 4 API (``wv.vocab``), as the original code did.
    """
    import pickle

    import numpy as np
    from gensim.models import Word2Vec

    embed_dim = 300
    w2v_path = "w2v.model"
    w2v_model = Word2Vec.load(w2v_path)
    wv = w2v_model.wv
    words = list(wv.vocab.keys())
    # set padding and unknown <unk> idx len(words), <pad> idx len(words) + 1
    word2idx = {w: i for i, w in enumerate(words)}
    word2idx["<unk>"] = len(words)
    word2idx["<pad>"] = len(words) + 1

    tags_set = set()
    with open("X_y_file_12e5_10.txt") as f:
        for line in f:
            # update() mutates in place; union() rebuilt the set for every line.
            tags_set.update(line.rstrip("\n").split("\t")[-1].split(" "))
    # Sort so tag indices are reproducible across runs (set order is not).
    tag2idx = {t: i for i, t in enumerate(sorted(tags_set))}

    unk_idx_s = str(word2idx["<unk>"])

    def map_to_idx(input_str):
        """Replace each space-separated word by its index (``<unk>`` if unseen)."""
        return " ".join(str(word2idx[w]) if w in word2idx else unk_idx_s
                        for w in input_str.split(" "))

    def map_to_tdx(input_str):
        """Replace each space-separated tag by its index."""
        return " ".join(str(tag2idx[t]) for t in input_str.split(" "))

    # test_ratio = 0.2
    with open("classifier_10_test.txt", "w") as test_o, \
            open("classifier_10_train.txt", "w") as train_o, \
            open("X_y_file_12e5_10.txt") as f:
        line_num = 0
        for line in f:
            fs, s, t, ff = line.rstrip("\n").split("\t")
            encoded = "{}\t{}\t{}\t{}\n".format(map_to_idx(fs), map_to_idx(s),
                                                map_to_idx(t), map_to_tdx(ff))
            target = test_o if line_num % 10 >= 8 else train_o
            target.write(encoded)

            line_num += 1
            if line_num % 10000 == 0:
                print("line_num :{}".format(line_num))

    print("total embed num :{}".format(len(word2idx)))
    embed_array = np.zeros(shape=[len(word2idx), embed_dim])
    # Set membership is O(1); testing against the list made this loop O(V^2).
    known_words = set(words)
    embed_over_num = 0
    for w, i in word2idx.items():
        if w in known_words:
            embed_array[i] = wv[w]
        else:
            embed_array[i] = np.random.random(size=embed_dim)

        embed_over_num += 1
        if embed_over_num % 1000 == 0:
            print("embed {} end".format(embed_over_num))

    print("embed_array end")
    with open("embed_and_idx_10.pkl", "wb") as f:
        pickle.dump(
            {
                "embed_array": embed_array,
                "word2idx": word2idx,
                "tag2idx": tag2idx
            }, f
        )
    print("dump end")

数据导出（训练用导出数据函数）

import tensorflow as tf
import numpy as np

import pickle
# Load the word/tag index maps and the embedding matrix from the pickle file.
# NOTE(review): classifier_prepare in this file dumps to "embed_and_idx_10.pkl",
# not "embed_and_idx.pkl" — confirm which artifact this loader should read.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("embed_and_idx.pkl", "rb") as f:
    d = pickle.load(f)
    word2idx = d["word2idx"]
    tag2idx = d["tag2idx"]
    embed_array = d["embed_array"]

# String form of the <pad> index; data_generator appends it to short token sequences.
padding_idx_s = str(word2idx["<pad>"])

def data_generator(gen_type = "train" ,batch_num = 64, padding_size = 1000, tag_num = 1000,
                   category_limit = False):
    """Yield fixed-size batches read from ``classifier_<gen_type>.txt``.

    Each input line holds four tab-separated fields of space-separated
    integers: question ids, answer ids, title ids and tag ids.

    Args:
        gen_type: ``"train"`` or ``"test"``; selects the input file.
        batch_num: Number of samples per yielded batch.
        padding_size: Every id sequence is cut or padded to this length,
            using the module-level ``padding_idx_s`` as filler.
        tag_num: Width of the tag vector.
        category_limit: When True, keep only samples whose single tag has
            index 0 or 1 and leave the tag vector as 0/1. When False, keep
            every sample and divide its tag vector by its sum.

    Yields:
        ``(q_batch, a_batch, t_batch, tag_batch)``: three int32 arrays of
        shape ``[batch_num, padding_size]`` and one float32 array of shape
        ``[batch_num, tag_num]``. Samples left over at the end of the file
        that do not fill a batch are not yielded.
    """
    assert gen_type in ["train", "test"]

    def to_padded_ids(field):
        # Truncate to padding_size, then right-pad with the <pad> index string.
        ids = field.split(" ")[:padding_size]
        missing = padding_size - len(ids)
        return np.array(ids + [padding_idx_s] * missing).astype(np.int32)

    def to_multi_hot(field):
        # Parse every tag id first, then set the matching positions to 1.
        tag_ids = [int(token) for token in field.split(" ")]
        hot = np.zeros(tag_num, dtype=np.float32)
        for tag_id in tag_ids:
            hot[tag_id] = 1
        return hot

    def new_buffers():
        # A fresh set of arrays per batch, so yielded batches are never overwritten.
        seq_shape = (batch_num, padding_size)
        return (np.zeros(seq_shape, dtype=np.int32),
                np.zeros(seq_shape, dtype=np.int32),
                np.zeros(seq_shape, dtype=np.int32),
                np.zeros((batch_num, tag_num), dtype=np.float32))

    filled = 0
    emitted = 0
    q_buf, a_buf, t_buf, tag_buf = new_buffers()

    with open("classifier_{}.txt".format(gen_type)) as f:
        for line in f:
            fs, s, t, ff = line[:-1].split("\t")

            tag_array = to_multi_hot(ff)

            if category_limit:
                single_low_class = np.sum(tag_array) <= 1 and np.argmax(tag_array) in [0, 1]
                if not single_low_class:
                    continue
            else:
                tag_array = tag_array / np.sum(tag_array).astype(np.float32)

            q_buf[filled] = to_padded_ids(fs)
            a_buf[filled] = to_padded_ids(s)
            t_buf[filled] = to_padded_ids(t)
            tag_buf[filled] = tag_array

            filled += 1
            if filled != batch_num:
                continue

            emitted += 1
            # Hard stop after 1e10 batches, kept from the original.
            if emitted == 1e10:
                return

            yield (q_buf, a_buf, t_buf, tag_buf)

            filled = 0
            q_buf, a_buf, t_buf, tag_buf = new_buffers()

模型构建：

模型的基本结构为对Question、Answer、Title三部分直接拼接后送入分类模型：

class XML_CNN(object):
    """TextCNN with chunked max-pooling for multi-label tagging (TensorFlow 1.x graph).

    The question, answer and title id sequences are concatenated, embedded,
    convolved with one filter bank per size in ``filter_sizes``, max-pooled
    separately over ``p`` consecutive chunks of the convolution output,
    projected through a dense layer with dropout, and mapped to
    ``class_num`` logits.

    Args:
        embedding_array: 2-D array of pre-trained word vectors; only the first
            ``w2v_dim`` columns are used. It is wrapped in a ``tf.Variable``.
        max_seq_len: Length of each of the three input id sequences.
        num_filters: Number of convolution filters per filter size.
        output_channels: Stored on the instance; the graph does not use it.
        filter_sizes: Convolution window heights (in tokens).
        dense_dnn_size: One-element list giving the hidden layer width.
        p: Number of chunks the convolution output is split into for pooling.
        class_num: Number of output classes (tags).
        w2v_dim: Embedding width taken from ``embedding_array``.
        use_bec: Use sigmoid (binary) cross-entropy instead of softmax
            cross-entropy. Defaults to False, as before.
    """

    def __init__(self, embedding_array, max_seq_len = 1000, num_filters = 3, output_channels = 3,
                 filter_sizes = [3, 4, 5] ,
                 dense_dnn_size = [100],
                 p = 10, class_num = 1000, w2v_dim = 100, use_bec = False):

        # param init
        self.w2v_dim = w2v_dim
        self.max_seq_len = max_seq_len
        self.embedding_array = tf.Variable(embedding_array[:, :self.w2v_dim], name="embed_array")

        # Copy so the shared default list can never be mutated through the instance.
        self.filter_sizes = list(filter_sizes)
        self.num_filters = num_filters
        self.output_channels = output_channels

        assert len(dense_dnn_size) == 1
        self.dnn_layer_dim = dense_dnn_size[0]

        self.q_seq = tf.placeholder(dtype=tf.int32, shape=[None, self.max_seq_len], name="q_deq")
        self.a_seq = tf.placeholder(dtype=tf.int32, shape=[None, self.max_seq_len], name="a_seq")
        self.t_seq = tf.placeholder(dtype=tf.int32, shape=[None, self.max_seq_len], name="t_seq")

        # Question, answer and title are concatenated along the token axis.
        self.input_seq = tf.concat([self.q_seq, self.a_seq, self.t_seq], axis = -1, name="input_seq")

        self.tag_seq = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name="tag_seq")

        self.p = p
        self.class_num = class_num
        self.use_bec = use_bec
        self.keep_prob = tf.placeholder(dtype=tf.float32, name="keep_prob")

        # construct
        self.model_construct()
        # The loss type used to be unreachable from the constructor.
        self.opt_construct(use_bec=use_bec)

    def model_construct(self):
        """Build the forward graph up to ``final_layer``, ``softmax_pred`` and ``predictions``."""
        # input layer
        with tf.variable_scope("embed"):
            self.embed_seq = tf.nn.embedding_lookup(self.embedding_array, self.input_seq)
            self.embedded_content_expanded = tf.cast(tf.expand_dims(self.embed_seq, -1, name="embedded_content_expanded"), tf.float32)

        pooled_outputs = []
        for filter_size in self.filter_sizes:
            temp_pooled_outputs = []
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, self.w2v_dim, 1, self.num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
                b = tf.Variable(tf.constant(0.0, shape=[self.num_filters]), name='b')

                conv = tf.nn.conv2d(
                    self.embedded_content_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='conv')

                # NOTE(review): the op is named 'relu' but applies a sigmoid — confirm intent.
                h = tf.nn.sigmoid(tf.nn.bias_add(conv, b), name='relu')

                # Trim the convolution output to a multiple of p so it splits
                # into p equal chunks of m_d_p rows.
                m = int(h.get_shape()[-3])
                m_d_p = int(m / self.p)
                m_bar = int(m_d_p * self.p)
                h = tf.slice(h, [0, 0, 0, 0], [-1, m_bar ,-1, -1])

                for chunk_idx, start in enumerate(range(0, m_bar, m_d_p)):
                    slice_val = tf.slice(h, [0, start, 0, 0], [-1, m_d_p ,-1, -1], name="slice_{}".format(chunk_idx))

                    # NOTE(review): with this ksize each chunk produces filter_size
                    # pooled rows, not one; a full max over the chunk would use
                    # ksize m_d_p. Left unchanged to keep published results
                    # reproducible — confirm intent.
                    pooled = tf.nn.max_pool(
                        slice_val,
                        ksize=[1, m_d_p - filter_size + 1, 1, 1],
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name='pool_{}'.format(chunk_idx))
                    temp_pooled_outputs.append(pooled)

            # Combine all the pooled features
            content_pool = tf.concat(temp_pooled_outputs, 3)
            num_filters_total = int(content_pool.get_shape()[-1]) * int(content_pool.get_shape()[-3])
            content_pool_flat = tf.reshape(content_pool, [-1, num_filters_total])
            pooled_outputs.append(content_pool_flat)

        self.content_pool_flat = tf.concat(pooled_outputs, -1)

        with tf.name_scope("low_rank"):
            W = tf.get_variable(
                "low_W",
                shape=[int(self.content_pool_flat.get_shape()[-1]), self.dnn_layer_dim],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self.dnn_layer_dim]), name="low_b")

            self.dnn_layer_out = tf.nn.xw_plus_b(self.content_pool_flat, W, b)

        with tf.name_scope("dropout"):
            self.dropout_layer_out = tf.nn.dropout(self.dnn_layer_out, keep_prob=self.keep_prob, name="drop_keep_prob_layer")

        with tf.name_scope("final_layer"):
            W = tf.get_variable(
                "final_W",
                shape=[int(self.dropout_layer_out.get_shape()[-1]), self.class_num],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self.class_num]), name="final_b")

            self.final_layer = tf.nn.xw_plus_b(self.dropout_layer_out, W, b)
            self.softmax_pred = tf.nn.softmax(self.final_layer)
            self.predictions = tf.argmax(self.final_layer, 1, name="predictions")

    def opt_construct(self, use_bec = False):
        """Build the loss, the Adam training op and the two accuracy metrics.

        Args:
            use_bec: True selects sigmoid cross-entropy on 0/1 targets; False
                selects softmax cross-entropy on the labels as fed.
        """
        logits = self.final_layer
        labels = self.tag_seq

        if use_bec:
            # data_generator in this file divides each label row by its sum, so
            # positives arrive as 1/k; binarize them for the sigmoid loss.
            binary_labels = tf.cast(labels > 0, tf.float32)
            self.loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=binary_labels), name="loss")
        else:
            self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels), name="loss")

        self.opt = tf.train.AdamOptimizer(learning_rate=0.001)
        self.train_op = self.opt.minimize(self.loss)

        # self.predictions is already defined in model_construct; the argmax of
        # the softmax equals the argmax of the logits, so it is not rebuilt here.

        # Accuracy
        with tf.name_scope("accuracy"):
            self.tag_eval = tf.cast(self.tag_seq > 0, tf.float32)
            self.tag_row_sum = tf.reduce_sum(self.tag_eval, axis=1)
            self.pred_onehot = tf.one_hot(self.predictions, depth=self.class_num)
            # 1.0 for a sample when the predicted class is one of its true tags.
            greedy_correct_predictions = tf.reduce_prod(tf.cast(tf.subtract(self.tag_eval, self.pred_onehot) >= 0, tf.float32), axis=1)
            self.greedy_accuracy = tf.reduce_mean(tf.cast(greedy_correct_predictions, "float"), name="accuracy")

            # Compares the prediction with the argmax of the label row only.
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.tag_seq, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

    @staticmethod
    def train():
        """Train a default ``XML_CNN`` for 100 epochs, printing train and test metrics.

        Uses the module-level ``embed_array`` and ``data_generator``. Train
        metrics are printed every 10 steps; every 100 steps one test batch is
        evaluated.
        """
        test_batch_num = 300
        num_epoch = 100

        xml_cnn_ext = XML_CNN(embed_array)
        tg = data_generator(gen_type="train")
        ttg = data_generator(gen_type="test", batch_num=test_batch_num)

        def make_feed(q_batch, t_batch, tag_batch, keep_prob):
            # NOTE(review): question and answer inputs are fed as all-zero ids, so
            # only the title carries information — confirm whether this ablation
            # is intended before relying on the results.
            blank = np.zeros(q_batch.shape).astype(np.float32)
            return {
                xml_cnn_ext.q_seq: blank,
                xml_cnn_ext.a_seq: blank,
                xml_cnn_ext.t_seq: t_batch,
                xml_cnn_ext.tag_seq: tag_batch,
                xml_cnn_ext.keep_prob: keep_prob,
            }

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            for now_epoch in range(num_epoch):
                step = 0
                while True:
                    # Only generator exhaustion ends the epoch; a bare except used
                    # to hide real errors (missing file, bad data, Ctrl-C).
                    try:
                        q_batch_array, a_batch_array, t_batch_array, tag_batch_array = next(tg)
                    except StopIteration:
                        tg = data_generator(gen_type="train")
                        ttg = data_generator(gen_type="test", batch_num=test_batch_num)
                        print("epoch {} end".format(now_epoch))
                        break

                    _, loss, accuracy, greedy_accuracy = sess.run(
                        [xml_cnn_ext.train_op, xml_cnn_ext.loss,
                         xml_cnn_ext.accuracy, xml_cnn_ext.greedy_accuracy],
                        feed_dict=make_feed(q_batch_array, t_batch_array, tag_batch_array, 0.7))

                    if step % 10 == 0:
                        print("step: {}, train loss: {} acc: {} gred_acc: {}".format(step ,loss, accuracy, greedy_accuracy))

                    if step % 100 == 0:
                        test_batch = next(ttg, None)
                        if test_batch is None:
                            # Test data exhausted: restart with the same batch size
                            # used everywhere else (this path used to switch to 1000).
                            ttg = data_generator(gen_type="test", batch_num=test_batch_num)
                            test_batch = next(ttg, None)

                        if test_batch is None:
                            print("test set has fewer than {} samples, skip eval".format(test_batch_num))
                        else:
                            q_batch_array, a_batch_array, t_batch_array, tag_batch_array = test_batch
                            loss, accuracy, greedy_accuracy = sess.run(
                                [xml_cnn_ext.loss, xml_cnn_ext.accuracy, xml_cnn_ext.greedy_accuracy],
                                feed_dict=make_feed(q_batch_array, t_batch_array, tag_batch_array, 1.0))
                            print("test loss: {} acc: {} gred_acc: {}".format(loss, accuracy, greedy_accuracy))

                    step += 1

这里使用的是word2vec embedding，模型构造函数中的参数p为文中进行dynamic pooling时将卷积层输出沿序列方向划分的块数。opt_construct 的参数 use_bec 控制所使用的损失函数类型（True 为 sigmoid/binary cross-entropy，False 为 softmax cross-entropy）。
由于是多标签分类（一个样本可能有多个Tag），在accuracy层使用self.greedy_accuracy表征至少击中一个标签类别的情况，而self.accuracy则更多的是一种“随机”的准确率。

实验结果：
在p=1且use_bec=False时就对应一般的Kim-CNN。在该数据集上使用softmax的情况要好于sigmoid。在这种极度多标签情形(Tags 数量设定1000)，使用dynamic pooling如p=10与p=1精确度及收敛结果类似（精度在0.6左右稳定），当减少Tags数量，如设定只有10分类时,p=1明显好于p=10。

Deep Learning for Extreme Multi-label Text Classification 实现

猜你喜欢