基于LSTM(Long-Short Term Memory,长短时记忆人工神经网络,RNN的一种)搭建一个文本意图分类的深度学习模型(基于Python3和Tensorflow1.2),其结构图如下:
如图1所示,整个模型包括两部分
第一部分:句子特征提取
Step1 读取数据(这里是经过结巴分词后的句子),按比例划分训练集和验证集,这里每个句子都生成了相应的mask向量,用以标记每个输入文本的实际长度(在后期的模型中根据mask向量将padding为0部分所对应的隐藏层输出砍掉)。这里有几个可选项:
1. reverse: 考虑到句子中越靠后的词重要程度越高,因此可对句子进行逆序输入;
2. enhance: 样本数较小的时候可选择数据增强,即打乱句子顺序来构建新样本;
3. sort_by_len: 对句子按照长短进行排序;
4. shuffle: 打乱样本顺序,随机采样。
import numpy as np
import sys
sys.path.append("..")
import random

# file path
# dataset_path = '/data/PycharmProjects/question_matching_framework/work_space/example/dataset/aaa'


def load_cn_data_from_files(classify_files):
    """Read one text file per class and build (sentences, one-hot labels).

    Each line of ``classify_files[i]`` becomes one sample whose label is the
    one-hot vector for class ``i``.

    Args:
        classify_files: list of file paths, one file per class.

    Returns:
        ``[x_text, y]`` where ``x_text`` is a list of cleaned sentences and
        ``y`` holds the matching one-hot labels (a numpy array when more
        than one file is given, a plain list for a single file).
    """
    count = len(classify_files)
    x_text = []
    y = []
    for index, classify_file in enumerate(classify_files):
        # Context manager guarantees the handle is closed (the original
        # leaked it via a bare open()).
        with open(classify_file, "r") as fin:
            lines = fin.readlines()
        label = [0] * count
        label[index] = 1
        labels = [label for _ in lines]
        if index == 0:
            x_text = lines
            y = labels
        else:
            x_text = x_text + lines
            y = np.concatenate([y, labels])
    x_text = [clean_str_cn(sent) for sent in x_text]
    return [x_text, y]


def clean_str_cn(string):
    """Minimal cleaning for Chinese text: strip whitespace and lowercase.

    Original taken from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    return string.strip().lower()


def load_data(classify_files, config, sort_by_len=True, enhance=True, reverse=True):
    """Load labelled sentences and split them into train/valid sets.

    Args:
        classify_files: one pre-tokenised (space-separated) text file per class.
        config: must provide ``valid_portion`` (validation fraction) and
            ``num_step`` (max sequence length used for the mask).
        sort_by_len: sort each split by sentence length (helps batching).
        enhance: augment by adding a word-shuffled copy of every sample.
        reverse: feed sentences in reversed word order.

    Returns:
        ``(train_data, valid_data)``, each a tuple ``(texts, labels, mask)``
        where ``mask`` has shape ``[num_step, n_samples]`` with 1s marking
        real (non-padding) tokens.
    """
    x_text, y = load_cn_data_from_files(classify_files)

    if reverse:
        # Later words are assumed more informative, so reverse word order.
        reversed_texts = []
        for text in x_text:
            words = text.strip().split(' ')
            words.reverse()
            reversed_texts.append(' '.join(words))
        x_text = reversed_texts

    y = list(y)
    original_dataset = list(zip(x_text, y))

    if enhance:
        # Data augmentation: add one word-shuffled copy of every sample.
        # NOTE(review): augmented copies are added BEFORE the train/valid
        # split, so a shuffled twin of a training sentence can land in the
        # validation set — possible leakage; confirm this is intended.
        for text, label in original_dataset:
            words = text.split(' ')
            random.shuffle(words)
            x_text.append(' '.join(words))
            y.append(label)

    # Randomly shuffle data.
    shuffle_indices = list(range(len(y)))
    random.shuffle(shuffle_indices)
    x_shuffled = [x_text[i] for i in shuffle_indices]
    y_shuffled = np.array([y[i] for i in shuffle_indices])

    # Shuffle once more and split into train/valid.
    n_samples = len(x_shuffled)
    sidx = np.random.permutation(n_samples)
    n_train = int(np.round(n_samples * (1. - config.valid_portion)))
    print("Train/Test split: {:d}/{:d}".format(n_train, (n_samples - n_train)))
    valid_set_x = [x_shuffled[s] for s in sidx[n_train:]]
    valid_set_y = [y_shuffled[s] for s in sidx[n_train:]]
    train_set_x = [x_shuffled[s] for s in sidx[:n_train]]
    train_set_y = [y_shuffled[s] for s in sidx[:n_train]]

    def len_argsort(seq):
        # Indices that would sort `seq` by element length (string length here).
        return sorted(range(len(seq)), key=lambda i: len(seq[i]))

    if sort_by_len:
        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]
        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]

    max_len = config.num_step

    def generate_mask(set_x, set_y):
        # mask[t, i] == 1 iff token t of sample i is a real (non-pad) token;
        # sentences longer than max_len get an all-ones column (truncation
        # happens downstream at embedding time).
        mask_x = np.zeros([max_len, len(set_x)])
        for i, x in enumerate(set_x):
            length = len(x.split(' '))
            if length < max_len:
                mask_x[0:length, i] = 1
            else:
                mask_x[:, i] = 1
        return (set_x, set_y, mask_x)

    train_data = generate_mask(train_set_x, train_set_y)
    valid_data = generate_mask(valid_set_x, valid_set_y)
    return train_data, valid_data


# return batch data set
def batch_iter(data, batch_size, shuffle=True):
    """Yield ``(x, y, mask)`` batches from ``data`` = (texts, labels, mask).

    ``mask`` arrives with shape ``[num_step, n]``; each yielded batch keeps
    that layout, sliced to ``[num_step, batch_size]``.
    """
    x, y, mask_x = data
    # Transpose to per-sample masks so they can be shuffled alongside x, y.
    mask_x = np.asarray(mask_x).T.tolist()
    data_size = len(x)

    if shuffle:
        shuffle_indices = list(range(data_size))
        random.shuffle(shuffle_indices)
        shuffled_x = [x[i] for i in shuffle_indices]
        shuffled_y = [y[i] for i in shuffle_indices]
        shuffled_mask_x = [mask_x[i] for i in shuffle_indices]
    else:
        shuffled_x = x
        shuffled_y = y
        shuffled_mask_x = mask_x

    shuffled_x = np.array(shuffled_x)
    shuffled_y = np.array(shuffled_y)
    shuffled_mask_x = np.asarray(shuffled_mask_x).T  # back to [num_step, n]

    # Floor division drops a final partial batch — presumably deliberate,
    # since the model's LSTM state is built for a fixed batch_size; confirm.
    num_batches_per_epoch = data_size // batch_size
    for batch_index in range(num_batches_per_epoch):
        start_index = batch_index * batch_size
        end_index = min((batch_index + 1) * batch_size, data_size)
        yield (shuffled_x[start_index:end_index],
               shuffled_y[start_index:end_index],
               shuffled_mask_x[:, start_index:end_index])
Step2 对输入到模型中的句子进行Word Embedding,将每个词表示成一个数值型的词向量。这个过程中对于不同长度的问题文本,pad和截断成一样长度的。太短的就补空格,太长的就截断。从而构建维数一致的模型句向量输入。(这里调用了别人训练好的词向量模型word2vec.bin)
# Look up pre-trained word2vec vectors for every padded/truncated sentence.
# Presumably returns a (batch, num_step, embed_dim) float tensor and the
# trailing 0 selects a padding/OOV index — confirm against the `wv` module.
x_embedded = wv.embedding_lookup(len(list(x)), config.num_step, config.embed_dim, list(x), 0)
第二部分:基于RNN的分类器模型
每个词经过embedding之后,进入LSTM层,这里用的是标准的LSTM,然后经过一个时间序列得到的n个隐藏LSTM神经单元的向量,这些向量经过mean pooling层之后,可以得到一个向量h,然后紧接着是一个Softmax层,得到一个类别分布概率向量,取概率值最大的类别作为最终预测结果。
import inspect
import tensorflow as tf


class RNN_Model(object):
    """Multi-layer LSTM sentence classifier (TensorFlow 1.x graph mode).

    Consumes pre-embedded sentences of shape [batch, num_step, embed_dim],
    unrolls a (possibly dropout-wrapped) stacked LSTM over time, mean-pools
    the mask-weighted hidden states, and classifies with a softmax layer.
    When ``is_training`` it also builds clipped-SGD training ops, gradient
    summaries, and a learning-rate update op.
    """

    def __init__(self, config, num_classes, is_training=True):
        keep_prob = config.keep_prob
        batch_size = config.batch_size
        num_step = config.num_step
        embed_dim = config.embed_dim

        # Inputs: embedded tokens, one-hot targets, and a [num_step, batch]
        # mask with 1s over real (non-padding) tokens.
        self.embedded_x = tf.placeholder(tf.float32, [None, num_step, embed_dim], name="embedded_chars")
        self.target = tf.placeholder(tf.int64, [None, num_classes], name='target')
        self.mask_x = tf.placeholder(tf.float32, [num_step, None], name="mask_x")

        hidden_neural_size = config.hidden_neural_size
        hidden_layer_num = config.hidden_layer_num

        # build LSTM network
        def lstm_cell():
            # Newer TF 1.x versions accept a `reuse` argument; pass it only
            # when the installed BasicLSTMCell supports it.
            if 'reuse' in inspect.signature(tf.contrib.rnn.BasicLSTMCell.__init__).parameters:
                return tf.contrib.rnn.BasicLSTMCell(
                    hidden_neural_size, forget_bias=0.0, state_is_tuple=True,
                    reuse=tf.get_variable_scope().reuse)
            else:
                return tf.contrib.rnn.BasicLSTMCell(
                    hidden_neural_size, forget_bias=0.0, state_is_tuple=True)

        attn_cell = lstm_cell
        if is_training and keep_prob < 1:
            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(
                    lstm_cell(), output_keep_prob=config.keep_prob)

        cell = tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(hidden_layer_num)], state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, dtype=tf.float32)

        inputs = self.embedded_x
        if keep_prob < 1:
            inputs = tf.nn.dropout(inputs, keep_prob)

        # Manually unroll the LSTM over time; weights are reused after the
        # first step.
        out_put = []
        state = self._initial_state
        with tf.variable_scope("LSTM_layer"):
            for time_step in range(num_step):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                out_put.append(cell_output)

        # Zero out outputs at padding positions before pooling.
        out_put = out_put * self.mask_x[:, :, None]
        with tf.name_scope("mean_pooling_layer"):
            # Mean over the *real* tokens only: divide by the mask sum,
            # not by num_step.
            out_put = tf.reduce_sum(out_put, 0) / (tf.reduce_sum(self.mask_x, 0)[:, None])

        with tf.name_scope("Softmax_layer_and_output"):
            softmax_w = tf.get_variable("softmax_w", [hidden_neural_size, num_classes], dtype=tf.float32)
            softmax_b = tf.get_variable("softmax_b", [num_classes], dtype=tf.float32)
            self.scores = tf.nn.xw_plus_b(out_put, softmax_w, softmax_b, name="scores")

        with tf.name_scope("loss"):
            # BUGFIX: the original fed `self.scores + 1e-10`; adding the same
            # constant to every logit leaves softmax cross-entropy unchanged,
            # so the spurious epsilon is dropped.
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=self.target, logits=self.scores)
            self.cost = tf.reduce_mean(self.loss)

        with tf.name_scope("accuracy"):
            self.prediction = tf.argmax(self.scores, 1, name="prediction")
            correct_prediction = tf.equal(self.prediction, tf.argmax(self.target, 1))
            self.correct_num = tf.reduce_sum(tf.cast(correct_prediction, tf.float32))
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
            self.probability = tf.nn.softmax(self.scores, name="probability")

        # add summary
        loss_summary = tf.summary.scalar("loss", self.cost)
        accuracy_summary = tf.summary.scalar("accuracy_summary", self.accuracy)

        if not is_training:
            return

        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.lr = tf.Variable(0.0, trainable=False)

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), config.max_grad_norm)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in zip(grads, tvars):
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        self.grad_summaries_merged = tf.summary.merge(grad_summaries)
        self.summary = tf.summary.merge([loss_summary, accuracy_summary, self.grad_summaries_merged])

        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        # BUGFIX: the original called optimizer.apply_gradients twice and
        # discarded the first result, leaving unused update ops in the graph.
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

    def assign_new_lr(self, session, lr_value):
        """Set the learning-rate variable to `lr_value` through `session`."""
        session.run(self._lr_update, feed_dict={self.new_lr: lr_value})
举例(QA中的问题意图分类):
输入:你好 呀
意图类别:greeting
具体代码参见上文各代码片段。