A TextCNN encoder-decoder can be used to extract text features. The idea is quite simple: on top of a standard TextCNN encoder, the only extra ingredient is the transposed convolution (deconvolution) used for decoding. A simple implementation follows:
TextCnnEncoder.py
import pickle

import tensorflow as tf
import numpy as np
from collections import OrderedDict
from format_parse.feature_construct import w2v_construct


class TextCnn_AutoEncoder(object):
    def __init__(self, embedding_array, max_seq_len=1000, num_filters=5,
                 output_channels=5, filter_sizes=[3, 4, 5],
                 dense_dnn_size=[10, 10, 10], batch_size=50):
        self.w2v_dim = embedding_array.shape[1]
        self.max_seq_len = max_seq_len
        self.embedding_array = tf.Variable(embedding_array, name="Word_embed")
        self.filter_sizes = filter_sizes
        self.num_filters = num_filters
        self.batch_size = batch_size
        self.output_channels = output_channels
        # dense_dnn_size must contain exactly 3 layer sizes
        self.dnn_layer_0_dim, self.dnn_layer_1_dim, self.dnn_layer_2_dim = dense_dnn_size
        self.input_seq = tf.placeholder(dtype=tf.int32, shape=[None, self.max_seq_len])
        self.kernel_dim_dict = None
        self.kernel_dim_tuple_list = []
        self.conv2_dim_dict = None
        self.conv2_dim_tuple_list = []
        self.decode = None

    @staticmethod
    def deconvLayer(input, output_shape, strides, weight_variable):
        # transposed convolution with a dynamic batch dimension
        dyn_input_shape = tf.shape(input)
        batch_size = dyn_input_shape[0]
        output_shape = tf.stack([batch_size, output_shape[1], output_shape[2], output_shape[3]])
        output = tf.nn.conv2d_transpose(input, weight_variable, output_shape,
                                        strides, padding="VALID")
        return output

    def model_construct(self):
        # encoder layer
        with tf.variable_scope("embed"):
            self.embed_seq = tf.nn.embedding_lookup(self.embedding_array, self.input_seq)
            self.embedded_content_expanded = tf.expand_dims(
                self.embed_seq, -1, name="embedded_content_expanded")

        pooled_outputs = []
        for i, filter_size in enumerate(self.filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, self.w2v_dim, 1, self.num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
                b = tf.Variable(tf.constant(0.0, shape=[self.num_filters]), name='b')
                self.kernel_dim_tuple_list.append((i, filter_shape))
                conv = tf.nn.conv2d(
                    self.embedded_content_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='conv')
                self.conv2_dim_tuple_list.append((i, conv.get_shape()))
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, self.max_seq_len - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='pool')
                pooled_outputs.append(pooled)

        # kernel and conv-output dims are reused for conv2d_transpose in the decoder
        self.kernel_dim_dict = OrderedDict(sorted(self.kernel_dim_tuple_list, key=lambda t2: t2[0]))
        self.conv2_dim_dict = OrderedDict(sorted(self.conv2_dim_tuple_list, key=lambda t2: t2[0]))

        # Combine all the pooled features
        num_filters_total = self.num_filters * len(self.filter_sizes)
        self.content_pool = tf.concat(pooled_outputs, 3)
        self.content_pool_flat = tf.reshape(self.content_pool, [-1, num_filters_total])

        # middle (bottleneck) layers
        with tf.variable_scope("dnn_layer_0"):
            self.dnn_layer_0_out = tf.layers.dense(self.content_pool_flat, units=self.dnn_layer_0_dim)
        with tf.variable_scope("dnn_layer_1"):
            self.dnn_layer_1_out = tf.layers.dense(self.dnn_layer_0_out, units=self.dnn_layer_1_dim)
        with tf.variable_scope("dnn_layer_2"):
            self.dnn_layer_2_out = tf.layers.dense(self.dnn_layer_1_out, units=self.dnn_layer_2_dim)
        self.req_feature = tf.concat(
            [self.dnn_layer_0_out, self.dnn_layer_1_out, self.dnn_layer_2_out], axis=-1)

        # decoder layer
        decode_list = []
        for filter_index, conv2_dim_shape in self.conv2_dim_dict.items():
            with tf.variable_scope("fake_pooling_{}".format(filter_index)):
                temp_out_dim_array = [int(conv2_dim_shape[1]), int(conv2_dim_shape[2]),
                                      int(conv2_dim_shape[3])]
                req_dim = np.prod(temp_out_dim_array, axis=-1)
                # map the bottleneck back to the conv-output shape ("fake" un-pooling)
                map_to_conv_layer = tf.layers.dense(self.dnn_layer_2_out, req_dim)
                fake_conv_for_decode = tf.reshape(map_to_conv_layer, [-1] + temp_out_dim_array)
                kernel_shape = self.kernel_dim_dict[filter_index]
                kernel = tf.Variable(tf.random_normal(shape=kernel_shape, stddev=0.1), name="kernel")
                batch_size = self.batch_size
                conv2d_transpose = TextCnn_AutoEncoder.deconvLayer(
                    fake_conv_for_decode,
                    [batch_size, self.max_seq_len, self.w2v_dim, 1],
                    [1, 1, 1, 1], kernel)
                decrease_conv2d_transpose = tf.squeeze(conv2d_transpose, axis=[-1])
                decode_list.append(decrease_conv2d_transpose)
        self.decode = tf.reduce_sum(decode_list, axis=0)

    def opt_construct(self):
        # mean squared reconstruction error between the embedded input and the decoder output
        self.loss = tf.reduce_mean(
            tf.pow(tf.subtract(self.embed_seq, self.decode),
                   tf.constant(2.0, shape=[self.batch_size, self.max_seq_len, self.w2v_dim])))
        self.opt = tf.train.AdamOptimizer(learning_rate=0.001)
        self.train_op = self.opt.minimize(self.loss)

    @staticmethod
    def train_pred():
        train_X_y, test_X_y = w2v_construct()
        with open("w2v_model/text_cnn_req.pkl", "rb") as f:
            text_cnn_req = pickle.load(f)
        word_list_content = text_cnn_req["word_list_content"]
        embed_array = text_cnn_req["embed_array"]
        padding_word_idx = embed_array.shape[0]
        # append an all-zero row used as the padding ("unknown") embedding
        embed_array_with_unknown = np.append(embed_array, np.zeros([1, 100]), axis=0).astype(np.float32)
        print("load end")
        max_seq_len = max(map(len, word_list_content))
        print("max_seq_len: {}".format(max_seq_len))

        def batch_generator(batch_size=50):
            # pad every sequence to max_seq_len with padding_word_idx
            mask_array = np.full(shape=[len(word_list_content), max_seq_len],
                                 fill_value=padding_word_idx)
            for row_idx, word_list in enumerate(word_list_content):
                for col_idx, word_idx in enumerate(word_list):
                    mask_array[row_idx][col_idx] = word_idx
            gap_list = np.arange(0, mask_array.shape[0], batch_size)
            for i in range(len(gap_list) - 1):
                start = gap_list[i]
                end = gap_list[i + 1]
                yield mask_array[start: end, :]

        with tf.Session() as sess:
            tcnn_ext = TextCnn_AutoEncoder(embed_array_with_unknown, max_seq_len=max_seq_len)
            tcnn_ext.model_construct()
            tcnn_ext.opt_construct()
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            num_epoch = 100
            for epoch in range(num_epoch):
                sample_generator = batch_generator()
                step = 0
                while True:
                    try:
                        sample = next(sample_generator)
                    except StopIteration:
                        break
                    _, loss, req_feature = sess.run(
                        [tcnn_ext.train_op, tcnn_ext.loss, tcnn_ext.req_feature],
                        feed_dict={tcnn_ext.input_seq: sample})
                    if step % 100 == 0:
                        print("step :{} end, loss {}".format(step, loss))
                    step += 1
                print("epoch {} end".format(epoch))

            train_path = "w2v_model/textcnn.model"
            saver.save(sess, train_path)
            print("model save end")

            # replace the padded word-index features with the encoder's bottleneck features
            X_cnt_train, X_w_train, y_train = train_X_y
            X_cnt_test, X_w_test, y_test = test_X_y
            X_w_train = sess.run(tcnn_ext.req_feature, feed_dict={tcnn_ext.input_seq: X_w_train})
            X_w_test = sess.run(tcnn_ext.req_feature, feed_dict={tcnn_ext.input_seq: X_w_test})
            with open("textcnn_feature.pkl", "wb") as f:
                pickle.dump({
                    "train_X_y": (X_cnt_train, X_w_train, y_train),
                    "test_X_y": (X_cnt_test, X_w_test, y_test)
                }, f)

    @staticmethod
    def fake_test():
        # smoke test on random embeddings and random index sequences
        fake_embed = np.random.random(size=[1000, 100]).astype(np.float32)
        fake_list = []
        for _ in range(10000):
            fake_batch_inputs = np.random.randint(0, 1000, size=[50, 1000]).astype(np.int32)
            fake_list.append(fake_batch_inputs)
        with tf.Session() as sess:
            tcnn_ext = TextCnn_AutoEncoder(fake_embed)
            tcnn_ext.model_construct()
            tcnn_ext.opt_construct()
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            for step in range(100):
                _, loss, req_feature = sess.run(
                    [tcnn_ext.train_op, tcnn_ext.loss, tcnn_ext.req_feature],
                    feed_dict={tcnn_ext.input_seq: fake_list[step]})
                print("step :{} end, loss {}".format(step, loss))
            train_path = "w2v_model/textcnn.model"
            saver.save(sess, train_path)


if __name__ == '__main__':
    TextCnn_AutoEncoder.fake_test()
    # TextCnn_AutoEncoder.train_pred()
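The decoder works because conv2d_transpose with the same kernel shape and strides inverts the shape change of a VALID convolution. Below is a minimal shape check (illustrative dimensions only, not the trained model):

import tensorflow as tf

# encoder direction: [batch, seq_len, embed_dim, 1] -> [batch, seq_len - filter_size + 1, 1, num_filters]
x = tf.zeros([2, 10, 8, 1])   # [batch, seq_len, embed_dim, 1]
k = tf.zeros([3, 8, 1, 5])    # [filter_size, embed_dim, 1, num_filters]; same kernel shape serves both directions
conv = tf.nn.conv2d(x, k, strides=[1, 1, 1, 1], padding="VALID")
print(conv.shape)             # (2, 8, 1, 5)

# decoder direction: conv2d_transpose maps the conv output back to the input shape
back = tf.nn.conv2d_transpose(conv, k, output_shape=[2, 10, 8, 1],
                              strides=[1, 1, 1, 1], padding="VALID")
print(back.shape)             # (2, 10, 8, 1)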
How well the extracted features work can be tried out on the sentiment dataset from
http://blog.csdn.net/sinat_30665603/article/details/79520012
(TensorFlow Fold 初探(一)——TreeLstm情感分类).
Goal: predict the sentiment value of a sentence from its text and the sentiment values of its individual words.
Steps:
First, parse the sentences and their corresponding sentiment values out of the tree structures. A single tree-formatted line can be parsed recursively; here is the script:
json_retrieve.py
import sys
import json

from nltk.tokenize import sexpr


def tokenize(s):
    # split one s-expression "(label phrase)" into its label and sub-expressions;
    # returns None for malformed input
    try:
        label, phrase = s[1:-1].split(None, 1)
    except:
        return
    return label, sexpr.sexpr_tokenize(phrase)


word_collect_list = []
target_collect_list = []


def iter_over_line(s):
    # recursively walk the tree; leaves contribute a word and its sentiment label
    global word_collect_list, target_collect_list
    tokens = tokenize(s)
    if tokens:
        tokens2 = tokens[1]
        if len(tokens2) >= 2:
            return list(map(iter_over_line, tokens2))
        else:
            word_collect_list.append(tokens2[0])
            target_collect_list.append(int(tokens[0]))


input_str = sys.argv[-1]
iter_over_line(input_str)

req = json.dumps({
    "word_collect_list": word_collect_list,
    "target_collect_list": target_collect_list,
    "root_val": int(input_str[1:2])
})
sys.stdout.write(req)
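A quick sanity check of the script (the input line is a made-up example in the Stanford Sentiment Treebank s-expression format):

import subprocess

out = subprocess.check_output(
    ["python", "json_retrieve.py", "-line", "(3 (2 good) (2 movie))"])
print(out.decode())
# {"word_collect_list": ["good", "movie"], "target_collect_list": [2, 2], "root_val": 3}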
With that, all of the files can be parsed:
parse.py
import codecs
import os
import json

from joblib import Parallel, delayed


def retrieve_one_line(s):
    # parse one tree line in a subprocess via json_retrieve.py
    resp = os.popen("python json_retrieve.py -line \"{}\"".format(s.strip())).read()
    return resp


def retrieve_parsed_context(name, n_jobs):
    with codecs.open("trees/{}.txt".format(name), "r", encoding="utf-8") as f:
        req = Parallel(n_jobs=n_jobs)(list(map(delayed(retrieve_one_line), f.readlines())))
    # write to trees_sentence/{name}.txt so feature_construct.py can read it back
    with codecs.open("trees_sentence/{}.txt".format(name), "w", encoding="utf-8") as o:
        json.dump({"content": req}, o)
    print("{} end".format(name))


if __name__ == "__main__":
    retrieve_parsed_context("dev", n_jobs=7)
    retrieve_parsed_context("test", n_jobs=7)
    retrieve_parsed_context("train", n_jobs=7)
The input features for naive Bayes consist of word counts plus counts of the word-level sentiment values; as a control group, the word-count part is then replaced with the features extracted by the TextCNN autoencoder. The two settings correspond to nb_predict and textcnn_feature_predict, respectively (a toy sketch of the count features follows).
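As a toy illustration of these count features (hypothetical three-word vocabulary, not the real word2idx_dict), a sentence's feature vector is its word counts concatenated with the counts of the five word-level sentiment labels:

word2idx = {"good": 0, "movie": 1, "bad": 2}  # hypothetical vocabulary
words, targets = ["good", "movie"], [2, 2]    # one parsed sentence

word_cnt = [0] * len(word2idx)
for w in words:
    word_cnt[word2idx[w]] += 1

target_cnt = [0] * 5
for t in targets:
    target_cnt[t] += 1

print(word_cnt + target_cnt)  # [1, 1, 0, 0, 0, 2, 0, 0]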
First, construct the features:
feature_construct.py
import json
import pickle
from copy import deepcopy
from functools import reduce, partial

import numpy as np
from gensim.models import Word2Vec


def nb_feature_construct():
    def retrieve_content(name):
        with open("trees_sentence/{}.txt".format(name), "r", encoding="utf-8") as f:
            return json.load(f)["content"]

    def content_reconstruct(content_ele):
        # each element is the JSON string emitted by json_retrieve.py
        try:
            content_ele = eval(content_ele)
        except:
            return None
        word_collect_list = content_ele["word_collect_list"]
        content_ele_req = deepcopy(content_ele)
        content_ele_req["word_collect_list"] = list(map(lambda w: w.lower(), word_collect_list))
        return content_ele_req

    train_content, dev_content, test_content = list(map(retrieve_content, ["train", "dev", "test"]))
    train_content += dev_content
    train_content = list(filter(lambda x: x, map(content_reconstruct, train_content)))
    test_content = list(filter(lambda x: x, map(content_reconstruct, test_content)))

    all_word_set = reduce(lambda x, y: x.union(y),
                          list(map(lambda c: set(c["word_collect_list"]),
                                   train_content + test_content)))
    word2idx_dict = dict((word, idx) for idx, word in enumerate(list(all_word_set)))

    def target_cnt_list(target_collect_list):
        # counts of the 5 word-level sentiment labels
        req = [0] * 5
        for ele in target_collect_list:
            req[ele] += 1
        return req

    def word_cnt_list(word_collect_list):
        req = [0] * len(word2idx_dict)
        for ele in word_collect_list:
            req[word2idx_dict[ele]] += 1
        return req

    def all_cnt_list(content_ele):
        return (content_ele["root_val"],
                word_cnt_list(content_ele["word_collect_list"]) +
                target_cnt_list(content_ele["target_collect_list"]))

    def map_to_Xy(content):
        X, y = [], []
        for content_ele in content:
            y_ele, x_ele = all_cnt_list(content_ele)
            X.append(x_ele)
            y.append(y_ele)
        return (np.array(X), np.array(y))

    train_X_y, test_X_y = list(map(map_to_Xy, [train_content, test_content]))
    return train_X_y, test_X_y


def w2v_construct(num_epoch=100):
    def retrieve_content(name):
        with open("trees_sentence/{}.txt".format(name), "r", encoding="utf-8") as f:
            return json.load(f)["content"]

    def content_reconstruct(content_ele):
        try:
            content_ele = eval(content_ele)
        except:
            return None
        word_collect_list = content_ele["word_collect_list"]
        content_ele_req = deepcopy(content_ele)
        content_ele_req["word_collect_list"] = list(map(lambda w: w.lower(), word_collect_list))
        return content_ele_req

    train_content, dev_content, test_content = list(map(retrieve_content, ["train", "dev", "test"]))
    train_content += dev_content
    train_content = list(filter(lambda x: x, map(content_reconstruct, train_content)))
    test_content = list(filter(lambda x: x, map(content_reconstruct, test_content)))

    all_word_set = reduce(lambda x, y: x.union(y),
                          list(map(lambda c: set(c["word_collect_list"]),
                                   train_content + test_content)))
    word2idx_dict = dict((word, idx) for idx, word in enumerate(list(all_word_set)))

    # train word2vec embeddings over all sentences
    sentences = list(map(lambda x: x["word_collect_list"], train_content + test_content))
    w2v_path = "w2v_model/w2v.model"
    w2v_model = None
    embed_size = 100
    for epoch in range(num_epoch):
        if w2v_model is None:
            w2v_model = Word2Vec(sentences, size=embed_size, min_count=1, workers=7, iter=10)
        else:
            w2v_model = Word2Vec.load(w2v_path)
            w2v_model.train(sentences, total_examples=len(sentences), epochs=10)
        w2v_model.save(w2v_path)
        print("epoch :{} end".format(epoch))

    w2v_model = Word2Vec.load(w2v_path)
    wv = w2v_model.wv
    embed_array = np.zeros(shape=[len(word2idx_dict), embed_size])
    for word, idx in word2idx_dict.items():
        embed_array[idx] = wv[word]

    def content_reconstruct(content_ele):
        # replace each word with its vocabulary index
        word_collect_list = content_ele["word_collect_list"]
        content_ele_req = deepcopy(content_ele)
        content_ele_req["word_collect_list"] = list(map(lambda w: word2idx_dict[w], word_collect_list))
        return content_ele_req

    train_content = list(filter(lambda x: x, map(content_reconstruct, train_content)))
    test_content = list(filter(lambda x: x, map(content_reconstruct, test_content)))

    def target_cnt_list(target_collect_list):
        req = [0] * 5
        for ele in target_collect_list:
            req[ele] += 1
        return req

    def word_mask_index(word_collect_list, max_seq_len):
        # right-pad index sequences with the out-of-vocabulary index len(word2idx_dict)
        req = [len(word2idx_dict)] * max_seq_len
        for idx, word in enumerate(word_collect_list):
            req[idx] = word
        return req

    max_seq_len = max(map(len, sentences))

    def all_cnt_list(content_ele):
        return (content_ele["root_val"],
                partial(word_mask_index, max_seq_len=max_seq_len)(content_ele["word_collect_list"]),
                target_cnt_list(content_ele["target_collect_list"]))

    def map_to_Xy(content):
        X_cnt, X_w, y = [], [], []
        for content_ele in content:
            y_ele, x_w_ele, x_cnt_ele = all_cnt_list(content_ele)
            X_cnt.append(x_cnt_ele)
            X_w.append(x_w_ele)
            y.append(y_ele)
        return (np.array(X_cnt), np.array(X_w), np.array(y))

    train_X_y, test_X_y = list(map(map_to_Xy, [train_content, test_content]))

    word_list_content = list(map(lambda x: x["word_collect_list"], train_content)) + \
                        list(map(lambda x: x["word_collect_list"], test_content))
    with open("w2v_model/text_cnn_req.pkl", "wb") as f:
        pickle.dump({"word_list_content": word_list_content,
                     "embed_array": embed_array}, f)
    print("dump end")
    return train_X_y, test_X_y


if __name__ == "__main__":
    pass
Call TextCnn_AutoEncoder.train_pred() to run the encode-decode feature extraction; prediction then proceeds as follows:
nb_svc.py
import pickle

from format_parse.feature_construct import nb_feature_construct
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np


def nb_predict(bi=False):
    train_X_y, test_X_y = nb_feature_construct()
    train_X, train_y = train_X_y
    test_X, test_y = test_X_y

    if bi:
        # collapse the 5 sentiment labels into 3 classes
        train_y_req = np.full(train_y.shape, 0)
        train_y_req[train_y == 2] = 1
        train_y_req[train_y > 2] = 2
        test_y_req = np.full(test_y.shape, 0)
        test_y_req[test_y == 2] = 1
        test_y_req[test_y > 2] = 2
        train_y = train_y_req
        test_y = test_y_req

    clf = GaussianNB()
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    equal_num = float(np.sum(np.equal(test_pred, test_y)))
    equal_rate = equal_num / len(test_y)
    print("gnb equal num :{}, equal rate :{}".format(equal_num, equal_rate))

    clf = MultinomialNB(alpha=1.0)
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    equal_num = float(np.sum(np.equal(test_pred, test_y)))
    equal_rate = equal_num / len(test_y)
    print("mnb equal num :{}, equal rate :{}".format(equal_num, equal_rate))


def textcnn_feature_predict(bi=False):
    with open("textcnn_feature.pkl", "rb") as f:
        textcnn_feature = pickle.load(f)
    X_cnt_train, X_w_train, train_y = textcnn_feature["train_X_y"]
    X_cnt_test, X_w_test, test_y = textcnn_feature["test_X_y"]

    if bi:
        train_y_req = np.full(train_y.shape, 0)
        train_y_req[train_y == 2] = 1
        train_y_req[train_y > 2] = 2
        test_y_req = np.full(test_y.shape, 0)
        test_y_req[test_y == 2] = 1
        test_y_req[test_y > 2] = 2
        train_y = train_y_req
        test_y = test_y_req

    # concatenate sentiment-count features with the TextCNN bottleneck features
    train_X = np.append(X_cnt_train, X_w_train, axis=1)
    test_X = np.append(X_cnt_test, X_w_test, axis=1)

    clf = svm.SVC()
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    equal_num = float(np.sum(np.equal(test_pred, test_y)))
    equal_rate = equal_num / len(test_y)
    print("svc equal num :{}, equal rate :{}".format(equal_num, equal_rate))

    clf = RandomForestClassifier(n_estimators=100, n_jobs=7, min_samples_leaf=5)
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    equal_num = float(np.sum(np.equal(test_pred, test_y)))
    equal_rate = equal_num / len(test_y)
    print("rf equal num :{}, equal rate :{}".format(equal_num, equal_rate))

    clf = LogisticRegression()
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    equal_num = float(np.sum(np.equal(test_pred, test_y)))
    equal_rate = equal_num / len(test_y)
    print("log equal num :{}, equal rate :{}".format(equal_num, equal_rate))


if __name__ == "__main__":
    nb_predict()
    textcnn_feature_predict()
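For reference, the bi=True branch collapses the five sentiment labels into three classes; a standalone check of the remapping:

import numpy as np

# {0, 1} -> 0 (negative), {2} -> 1 (neutral), {3, 4} -> 2 (positive)
y = np.array([0, 1, 2, 3, 4])
y_req = np.full(y.shape, 0)
y_req[y == 2] = 1
y_req[y > 2] = 2
print(y_req)  # [0 0 1 2 2]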
5-class results:
gnb equal num :605.0, equal rate :0.27867342238599724
mnb equal num :919.0, equal rate :0.42330723169046525
svc equal num :715.0, equal rate :0.32934131736526945
rf equal num :826.0, equal rate :0.380469829571626
log equal num :831.0, equal rate :0.38277291570704747
3-class results:
gnb equal num :990.0, equal rate :0.45601105481345
mnb equal num :1497.0, equal rate :0.6895439889451865
svc equal num :1232.0, equal rate :0.5674804237678489
rf equal num :1353.0, equal rate :0.6232151082450483