TextCNN Autoencoder Implementation

A TextCNN encoder-decoder can be used to extract text features. It is fairly simple to build:

the only extra ingredient is transposed convolution (deconvolution). A minimal shape sketch follows, and the full implementation comes after it.
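
The sketch below (TensorFlow 1.x assumed; the sizes are illustrative and not taken from the model) shows the key idea: a VALID-padded tf.nn.conv2d shrinks the sequence axis, and tf.nn.conv2d_transpose with the same kernel maps it back to the original embedded-sequence shape, which is all the decoder relies on.

import tensorflow as tf

seq_len, emb_dim, filter_size, num_filters = 20, 8, 3, 5
x = tf.placeholder(tf.float32, [None, seq_len, emb_dim, 1])          # embedded sequence with a channel dim added

W = tf.Variable(tf.truncated_normal([filter_size, emb_dim, 1, num_filters], stddev=0.1))
conv = tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="VALID")
# conv shape: [batch, seq_len - filter_size + 1, 1, num_filters]

out_shape = tf.stack([tf.shape(x)[0], seq_len, emb_dim, 1])
deconv = tf.nn.conv2d_transpose(conv, W, out_shape, strides=[1, 1, 1, 1], padding="VALID")
# deconv shape: [batch, seq_len, emb_dim, 1] -- back to the encoder's input shape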

 

TextCnnEncoder.py

import tensorflow as tf
import numpy as np
from collections import OrderedDict
from format_parse.feature_construct import w2v_construct

class TextCnn_AutoEncoder(object):
    def __init__(self, embedding_array, max_seq_len = 1000, num_filters = 5, output_channels = 5,
                 filter_sizes = [3, 4, 5],
                 dense_dnn_size = [10, 10, 10],
                 batch_size = 50):
        self.w2v_dim = embedding_array.shape[1]
        self.max_seq_len = max_seq_len
        self.embedding_array = tf.Variable(embedding_array, name="Word_embed")
        self.filter_sizes = filter_sizes
        self.num_filters = num_filters
        self.batch_size = batch_size
        self.output_channels = output_channels

        # dense_dnn_size must contain exactly three layer sizes
        self.dnn_layer_0_dim, self.dnn_layer_1_dim, self.dnn_layer_2_dim = dense_dnn_size

        self.input_seq = tf.placeholder(dtype=tf.int32, shape=[None, self.max_seq_len])

        self.kernel_dim_dict = None
        self.kernel_dim_tuple_list = []
        self.conv2_dim_dict = None
        self.conv2_dim_tuple_list = []
        self.decode = None

    @staticmethod
    def deconvLayer(input, output_shape, strides, weight_variable):
        # Keep the batch dimension dynamic: take it from the runtime input shape;
        # only the non-batch parts of output_shape are taken from the argument.
        dyn_input_shape = tf.shape(input)
        batch_size = dyn_input_shape[0]
        output_shape = tf.stack([batch_size, output_shape[1], output_shape[2], output_shape[3]])
        output = tf.nn.conv2d_transpose(input, weight_variable, output_shape, strides, padding="VALID")
        return output

    def model_construct(self):
        # encoder layer
        with tf.variable_scope("embed"):
            self.embed_seq = tf.nn.embedding_lookup(self.embedding_array, self.input_seq)
            self.embedded_content_expanded = tf.expand_dims(self.embed_seq, -1, name="embedded_content_expanded")

        pooled_outputs = []
        for i, filter_size in enumerate(self.filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, self.w2v_dim, 1, self.num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
                b = tf.Variable(tf.constant(0.0, shape=[self.num_filters]), name='b')

                self.kernel_dim_tuple_list.append((i, filter_shape))

                conv = tf.nn.conv2d(
                    self.embedded_content_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='conv')

                self.conv2_dim_tuple_list.append((i, conv.get_shape()))
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, self.max_seq_len - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='pool')

                pooled_outputs.append(pooled)

        # kernel dim dict used for conv2 transpose
        self.kernel_dim_dict = OrderedDict(sorted(self.kernel_dim_tuple_list, key = lambda t2: t2[0]))
        self.conv2_dim_dict = OrderedDict(sorted(self.conv2_dim_tuple_list, key = lambda t2: t2[0]))

        # Combine all the pooled features
        num_filters_total = self.num_filters * len(self.filter_sizes)
        self.content_pool = tf.concat(pooled_outputs, 3)
        self.content_pool_flat = tf.reshape(self.content_pool, [-1, num_filters_total])

        # middle layer
        with tf.variable_scope("dnn_layer_0"):
            self.dnn_layer_0_out = tf.layers.dense(self.content_pool_flat, units=self.dnn_layer_0_dim)

        with tf.variable_scope("dnn_layer_1"):
            self.dnn_layer_1_out = tf.layers.dense(self.dnn_layer_0_out, units=self.dnn_layer_1_dim)

        with tf.variable_scope("dnn_layer_2"):
            self.dnn_layer_2_out = tf.layers.dense(self.dnn_layer_1_out, units=self.dnn_layer_2_dim)

        self.req_feature = tf.concat([self.dnn_layer_0_out, self.dnn_layer_1_out, self.dnn_layer_2_out], axis=-1)

        # decoder layer
        decode_list = []
        for filter_index, conv2_dim_shape in self.conv2_dim_dict.items():
            with tf.variable_scope("fake_pooling_{}".format(filter_index)):

                temp_out_dim_array = [int(conv2_dim_shape[1]), int(conv2_dim_shape[2]), int(conv2_dim_shape[3])]
                req_dim = np.prod(temp_out_dim_array, axis=-1)
                map_to_conv_layer = tf.layers.dense(self.dnn_layer_2_out, req_dim)

                fake_conv_for_decode = tf.reshape(map_to_conv_layer, [-1] + temp_out_dim_array)

                kernel_shape = self.kernel_dim_dict[filter_index]
                kernel = tf.Variable(tf.random_normal(shape=kernel_shape, stddev=0.1), name="kernel")

                batch_size = self.batch_size
                conv2d_transpose = TextCnn_AutoEncoder.deconvLayer(fake_conv_for_decode,
                                                                   [batch_size, self.max_seq_len, self.w2v_dim, 1],
                                                                   [1, 1, 1, 1], kernel)

                decrease_conv2d_transpose = tf.squeeze(conv2d_transpose, axis=[-1])
                decode_list.append(decrease_conv2d_transpose)

        # sum the reconstructions from the different filter-size branches
        self.decode = tf.add_n(decode_list)

    def opt_construct(self):
        # Mean squared reconstruction error between the embedded input and the decoded sequence.
        self.loss = tf.reduce_mean(tf.square(self.embed_seq - self.decode))
        self.opt = tf.train.AdamOptimizer(learning_rate=0.001)
        self.train_op = self.opt.minimize(self.loss)

    @staticmethod
    def train_pred():
        train_X_y, test_X_y = w2v_construct()

        import pickle
        with open("w2v_model/text_cnn_req.pkl", "rb") as f:
            text_cnn_req = pickle.load(f)
        word_list_content = text_cnn_req["word_list_content"]
        embed_array = text_cnn_req["embed_array"]
        # The padding index is one past the last real word; give it an all-zero embedding row.
        padding_word_idx = embed_array.shape[0]
        embed_array_with_unknown = np.append(embed_array, np.zeros([1, 100]), axis=0).astype(np.float32)
        print("load end")

        max_seq_len = max(map(len ,word_list_content))
        print("max_seq_len: {}".format(max_seq_len))

        def batch_generator(batch_size = 50):
            mask_array = np.full(shape=[len(word_list_content), max_seq_len], fill_value=padding_word_idx)
            for row_idx ,word_list in enumerate(word_list_content):
                for col_idx, word_idx in enumerate(word_list):
                    mask_array[row_idx][col_idx] = word_idx

            # include the array end so the final (possibly partial) batch is not dropped
            gap_list = np.append(np.arange(0, mask_array.shape[0], batch_size), mask_array.shape[0])
            for i in range(len(gap_list) - 1):
                start = gap_list[i]
                end = gap_list[i + 1]

                yield mask_array[start: end, :]

        with tf.Session() as sess:
            tcnn_ext = TextCnn_AutoEncoder(embed_array_with_unknown, max_seq_len=max_seq_len)
            tcnn_ext.model_construct()
            tcnn_ext.opt_construct()

            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()

            num_epoch = 100
            for epoch in range(num_epoch):
                sample_generator = batch_generator()

                step = 0
                while True:
                    try:
                        sample = next(sample_generator)
                    except StopIteration:
                        break

                    _, loss, req_feature = sess.run([tcnn_ext.train_op, tcnn_ext.loss, tcnn_ext.req_feature], feed_dict={
                        tcnn_ext.input_seq: sample
                    })
                    if step % 100 == 0:
                        print("step :{} end, loss {}".format(step, loss))
                    step += 1

                print("epoch {} end".format(epoch))

            train_path = "w2v_model/textcnn.model"
            saver.save(sess, train_path)
            print("model save end")

            X_cnt_train, X_w_train, y_train = train_X_y
            X_cnt_test, X_w_test, y_test = test_X_y

            X_w_train = sess.run(tcnn_ext.req_feature, feed_dict={
                tcnn_ext.input_seq: X_w_train
            })

            X_w_test = sess.run(tcnn_ext.req_feature, feed_dict={
                tcnn_ext.input_seq: X_w_test
            })

            with open("textcnn_feature.pkl", "wb") as f:
                pickle.dump({
                    "train_X_y": (X_cnt_train, X_w_train, y_train),
                    "test_X_y": (X_cnt_test, X_w_test, y_test)
                }, f)

    @staticmethod
    def fake_test():
        fake_embed = np.random.random(size=[1000, 100]).astype(np.float32)

        fake_list = []
        for _ in range(10000):
            fake_batch_inputs = np.random.randint(0, 1000, size=[50, 1000]).astype(np.int32)
            fake_list.append(fake_batch_inputs)

        with tf.Session() as sess:

            tcnn_ext = TextCnn_AutoEncoder(fake_embed)
            tcnn_ext.model_construct()
            tcnn_ext.opt_construct()

            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()

            for step in range(100):
                _, loss, req_feature = sess.run([tcnn_ext.train_op, tcnn_ext.loss, tcnn_ext.req_feature], feed_dict={
                    tcnn_ext.input_seq: fake_list[step]
                })

                print("step :{} end, loss {}".format(step, loss))

            train_path = "w2v_model/textcnn.model"
            saver.save(sess, train_path)


if __name__ == '__main__':
    TextCnn_AutoEncoder.fake_test()
    #TextCnn_AutoEncoder.train_pred()
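
After training, the saved checkpoint can be reused to extract features for new inputs. The snippet below is a hypothetical sketch (not part of the original script): it rebuilds the graph with the same constructor arguments used at save time and runs req_feature on a batch of padded word-index sequences; the embedding table and input batch here are random stand-ins.

import numpy as np
import tensorflow as tf

embed = np.random.random(size=[1000, 100]).astype(np.float32)       # stand-in embedding table, same shape as at save time
new_batch = np.random.randint(0, 1000, size=[50, 1000]).astype(np.int32)

with tf.Session() as sess:
    tcnn = TextCnn_AutoEncoder(embed, max_seq_len=1000)
    tcnn.model_construct()
    tcnn.opt_construct()
    tf.train.Saver().restore(sess, "w2v_model/textcnn.model")       # checkpoint written by fake_test()/train_pred()
    features = sess.run(tcnn.req_feature, feed_dict={tcnn.input_seq: new_batch})
    print(features.shape)                                            # (50, 30) with the default dense_dnn_size = [10, 10, 10]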

The quality of the extracted features can be tried out on the sentiment dataset used in

http://blog.csdn.net/sinat_30665603/article/details/79520012

(TensorFlow Fold 初探(一)——TreeLstm情感分类).

 

Goal: predict the sentiment value of a sentence from its text and the sentiment values of its individual words.

 

Steps:

First, parse the sentences and their corresponding sentiment values out of the tree structure.

A single tree-formatted line can be parsed recursively; the script is below (a usage example follows it):

json_retrieve.py

from nltk.tokenize import sexpr
import sys

def tokenize(s):
    # Strip the outer parentheses, then split off the sentiment label from the rest of the phrase.
    try:
        label, phrase = s[1:-1].split(None, 1)
    except ValueError:
        return
    return label, sexpr.sexpr_tokenize(phrase)

word_collect_list = []
target_collect_list = []
def iter_over_line(s):
    # Recursively walk the s-expression, collecting leaf words and their sentiment labels.
    global word_collect_list, target_collect_list

    tokens = tokenize(s)
    if tokens:
        tokens2 = tokens[1]
        if len(tokens2) >= 2:
            return list(map(iter_over_line,tokens2))
        else:
            word_collect_list.append(tokens2[0])
            target_collect_list.append(int(tokens[0]))

input_str = sys.argv[-1]
iter_over_line(input_str)

import json
req = json.dumps({
    "word_collect_list":word_collect_list,
    "target_collect_list":target_collect_list,
    "root_val": int(input_str[1:2])
})
sys.stdout.write(req)
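
For illustration, here is a hypothetical tree line (made up, in the same s-expression format as the trees/*.txt files, presumably the Stanford Sentiment Treebank) and what the script collects from it, invoked the same way parse.py does:

import os, json

line = "(3 (2 This) (4 (4 is) (4 great)))"
resp = os.popen('python json_retrieve.py -line "{}"'.format(line)).read()
print(json.loads(resp))
# {'word_collect_list': ['This', 'is', 'great'], 'target_collect_list': [2, 4, 4], 'root_val': 3}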

With that, all of the tree files can be parsed:

parse.py

import codecs, os
from joblib import Parallel, delayed

def retrieve_one_line(s):
    # Parse one tree line by shelling out to json_retrieve.py and capturing its JSON output.
    resp = os.popen("python json_retrieve.py -line \"{}\"".format(s.strip())).read()
    return resp

import json
def retrieve_parsed_context(name, n_jobs):
    with codecs.open("trees/{}.txt".format(name), "r", encoding="utf-8") as f:
        req = Parallel(n_jobs=n_jobs)(list(map(delayed(retrieve_one_line), f.readlines())))

    with codecs.open("trees_sentence/{}.txt".format(name), "w", encoding="utf-8") as o:
        json.dump({"content" :req}, o)
    print("{} end".format(name))

if __name__ == "__main__":
    retrieve_parsed_context("dev", n_jobs=7)
    retrieve_parsed_context("test", n_jobs=7)
    retrieve_parsed_context("train", n_jobs=7)

For Naive Bayes, the input features are built from word counts together with counts of the per-word sentiment values (a toy illustration is given below);

the features extracted by the TextCNN are then substituted for the word-count part as a comparison group.

These correspond to nb_predict and textcnn_feature_predict respectively.
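
As a toy illustration (the three-word vocabulary is hypothetical, not part of the original code), this is what the count features for one sentence look like: word counts over the vocabulary concatenated with counts of the five sentiment labels.

word2idx_dict = {"this": 0, "is": 1, "great": 2}     # assumed toy vocabulary
word_collect_list = ["this", "is", "great", "great"]
target_collect_list = [2, 4, 4, 4]

word_cnt = [0] * len(word2idx_dict)
for w in word_collect_list:
    word_cnt[word2idx_dict[w]] += 1                  # -> [1, 1, 2]

target_cnt = [0] * 5                                 # sentiment labels 0-4
for t in target_collect_list:
    target_cnt[t] += 1                               # -> [0, 0, 1, 0, 3]

feature = word_cnt + target_cnt                      # -> [1, 1, 2, 0, 0, 1, 0, 3]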
First, build the features:

feature_construct.py

import json
from copy import deepcopy
from functools import reduce
import numpy as np
from gensim.models import Word2Vec

def nb_feature_construct():
    def retrieve_content(name):
        with open("trees_sentence/{}.txt".format(name), "r", encoding="utf-8") as f:
            return json.load(f)["content"]

    def content_reconstruct(content_ele):
        try:
            content_ele = eval(content_ele)
        except:
            return None

        word_collect_list = content_ele["word_collect_list"]
        content_ele_req = deepcopy(content_ele)
        content_ele_req["word_collect_list"] = list(map(lambda w:w.lower() ,word_collect_list))
        return content_ele_req

    train_content, dev_content, test_content = list(map(retrieve_content ,["train", "dev", "test"]))
    train_content += dev_content

    train_content = list(filter(lambda x: x,map(content_reconstruct ,train_content)))
    test_content = list(filter(lambda x: x,map(content_reconstruct, test_content)))

    all_word_set = reduce(lambda x, y: x.union(y), list(map(lambda c: set(c["word_collect_list"]) ,train_content + test_content)))
    word2idx_dict = dict((word, idx) for idx, word in enumerate(list(all_word_set)))

    def target_cnt_list(target_collect_list):
        req = [0] * 5
        for ele in target_collect_list:
            req[ele] += 1
        return req

    def word_cnt_list(word_collect_list):
        req = [0] * len(word2idx_dict)
        for ele in word_collect_list:
            req[word2idx_dict[ele]] += 1
        return req

    def all_cnt_list(content_ele):
        return (content_ele["root_val"] ,word_cnt_list(content_ele["word_collect_list"]) + target_cnt_list(content_ele["target_collect_list"]))

    def map_to_Xy(content):
        X, y = [], []
        for content_ele in content:
            y_ele, x_ele = all_cnt_list(content_ele)
            X.append(x_ele)
            y.append(y_ele)
        return (np.array(X), np.array(y))

    train_X_y, test_X_y = list(map(map_to_Xy, [train_content, test_content]))

    return train_X_y, test_X_y

def w2v_construct(num_epoch = 100):

    def retrieve_content(name):
        with open("trees_sentence/{}.txt".format(name), "r", encoding="utf-8") as f:
            return json.load(f)["content"]

    def content_reconstruct(content_ele):
        try:
            content_ele = eval(content_ele)
        except:
            return None

        word_collect_list = content_ele["word_collect_list"]
        content_ele_req = deepcopy(content_ele)
        content_ele_req["word_collect_list"] = list(map(lambda w:w.lower() ,word_collect_list))
        return content_ele_req

    train_content, dev_content, test_content = list(map(retrieve_content ,["train", "dev", "test"]))
    train_content += dev_content

    train_content = list(filter(lambda x: x,map(content_reconstruct ,train_content)))
    test_content = list(filter(lambda x: x,map(content_reconstruct, test_content)))

    all_word_set = reduce(lambda x, y: x.union(y), list(map(lambda c: set(c["word_collect_list"]) ,train_content + test_content)))
    word2idx_dict = dict((word, idx) for idx, word in enumerate(list(all_word_set)))

    sentences = list(map(lambda x: x["word_collect_list"] ,train_content + test_content))

    w2v_path = "w2v_model/w2v.model"
    w2v_model = None
    embed_size = 100
    for epoch in range(num_epoch):
        if w2v_model is None:
            w2v_model = Word2Vec(sentences, size=embed_size ,min_count=1, workers=7, iter = 10)
        else:
            w2v_model = Word2Vec.load(w2v_path)
            w2v_model.train(sentences, total_examples = len(sentences), epochs = 10)
        w2v_model.save(w2v_path)

        print("poch :{} end".format(epoch))

    w2v_model = Word2Vec.load(w2v_path)

    wv = w2v_model.wv
    embed_array = np.zeros(shape=[len(word2idx_dict), embed_size])
    for word, idx in word2idx_dict.items():
        embed_array[idx] = wv[word]

    def content_reconstruct(content_ele):
        word_collect_list = content_ele["word_collect_list"]
        content_ele_req = deepcopy(content_ele)
        content_ele_req["word_collect_list"] = list(map(lambda w:word2idx_dict[w] ,word_collect_list))
        return content_ele_req

    train_content = list(filter(lambda x: x,map(content_reconstruct ,train_content)))
    test_content = list(filter(lambda x: x,map(content_reconstruct, test_content)))

    def target_cnt_list(target_collect_list):
        req = [0] * 5
        for ele in target_collect_list:
            req[ele] += 1
        return req

    def word_mask_index(word_collect_list, max_seq_len):
        req = [len(word2idx_dict)] * max_seq_len
        for idx, word in enumerate(word_collect_list):
            req[idx] = word
        return req

    max_seq_len = max(map(len, sentences))
    from functools import partial
    def all_cnt_list(content_ele):
        return (content_ele["root_val"] ,partial(word_mask_index, max_seq_len = max_seq_len)(content_ele["word_collect_list"]), target_cnt_list(content_ele["target_collect_list"]))

    def map_to_Xy(content):
        X_cnt, X_w, y = [], [], []
        for content_ele in content:
            y_ele, x_w_ele, x_cnt_ele = all_cnt_list(content_ele)
            X_cnt.append(x_cnt_ele)
            X_w.append(x_w_ele)
            y.append(y_ele)
        return (np.array(X_cnt), np.array(X_w), np.array(y))

    train_X_y, test_X_y = list(map(map_to_Xy, [train_content, test_content]))

    word_list_content = list(map(lambda x: x["word_collect_list"],train_content)) + \
                        list(map(lambda x: x["word_collect_list"],test_content))

    import pickle
    with open("w2v_model/text_cnn_req.pkl", "wb") as f:
        pickle.dump(
            {"word_list_content": word_list_content,
             "embed_array": embed_array}
        ,f)
    print("dump end")

    return train_X_y, test_X_y



if __name__ == "__main__":
    pass

Call TextCnn_AutoEncoder.train_pred() to run the encoder-decoder feature extraction; prediction is then done as follows:

nb_svc.py

from format_parse.feature_construct import nb_feature_construct
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np

def nb_predict(bi = False):
    train_X_y, test_X_y = nb_feature_construct()

    train_X, train_y = train_X_y
    test_X, test_y = test_X_y

    if bi:
        train_y_req = np.full(train_y.shape, 0)
        train_y_req[train_y == 2] = 1
        train_y_req[train_y > 2] = 2

        test_y_req = np.full(test_y.shape, 0)
        test_y_req[test_y == 2] = 1
        test_y_req[test_y > 2] = 2

        train_y = train_y_req
        test_y = test_y_req

    clf = GaussianNB()
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    equal_num = float(np.sum(np.equal(test_pred, test_y)))
    equal_rate = equal_num / len(test_y)

    print("gnb equal num :{}, equal rate :{}".format(equal_num, equal_rate))

    clf = MultinomialNB(alpha=1.0)
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    equal_num = float(np.sum(np.equal(test_pred, test_y)))
    equal_rate = equal_num / len(test_y)

    print("mnb equal num :{}, equal rate :{}".format(equal_num, equal_rate))

def textcnn_feature_predict(bi = False):
    import pickle
    with open("textcnn_feature.pkl", "rb") as f:
        textcnn_feature = pickle.load(f)
    X_cnt_train, X_w_train, train_y = textcnn_feature["train_X_y"]
    X_cnt_test, X_w_test, test_y = textcnn_feature["test_X_y"]

    if bi:
        train_y_req = np.full(train_y.shape, 0)
        train_y_req[train_y == 2] = 1
        train_y_req[train_y > 2] = 2

        test_y_req = np.full(test_y.shape, 0)
        test_y_req[test_y == 2] = 1
        test_y_req[test_y > 2] = 2

        train_y = train_y_req
        test_y = test_y_req

    train_X = np.append(X_cnt_train, X_w_train, axis = 1)
    test_X = np.append(X_cnt_test, X_w_test, axis= 1)

    clf = svm.SVC()
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    equal_num = float(np.sum(np.equal(test_pred, test_y)))
    equal_rate = equal_num / len(test_y)

    print("svc equal num :{}, equal rate :{}".format(equal_num, equal_rate))

    clf = RandomForestClassifier(n_estimators=100, n_jobs=7, min_samples_leaf=5)
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    equal_num = float(np.sum(np.equal(test_pred, test_y)))
    equal_rate = equal_num / len(test_y)

    print("rf equal num :{}, equal rate :{}".format(equal_num, equal_rate))

    clf = LogisticRegression()
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    equal_num = float(np.sum(np.equal(test_pred, test_y)))
    equal_rate = equal_num / len(test_y)

    print("log equal num :{}, equal rate :{}".format(equal_num, equal_rate))


if __name__ == "__main__":
    nb_predict()
    textcnn_feature_predict()

5-class results:

gnb equal num :605.0, equal rate :0.27867342238599724
mnb equal num :919.0, equal rate :0.42330723169046525
svc equal num :715.0, equal rate :0.32934131736526945
rf equal num :826.0, equal rate :0.380469829571626
log equal num :831.0, equal rate :0.38277291570704747

3-class results:

gnb equal num :990.0, equal rate :0.45601105481345
mnb equal num :1497.0, equal rate :0.6895439889451865
svc equal num :1232.0, equal rate :0.5674804237678489
rf equal num :1353.0, equal rate :0.6232151082450483







Reposted from blog.csdn.net/sinat_30665603/article/details/79606430