1. Main idea of the model
The model applies a CNN architecture to text classification: filters of several different sizes slide over the word embeddings to extract features, capturing local n-gram information in the text.
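As a toy illustration (a minimal numpy sketch, not part of the model code below): a filter of height 3 slides over a sentence's embedding matrix and emits one activation per 3-gram, which is exactly the kind of local feature the convolutional layer extracts.

import numpy as np

embeddings = np.random.rand(6, 4)   # a 6-word sentence with 4-dim word embeddings
filter_3 = np.random.rand(3, 4)     # one filter spanning 3 consecutive words
# "valid" convolution: one activation per 3-gram, 6 - 3 + 1 = 4 positions
features = [np.sum(embeddings[i:i+3] * filter_3) for i in range(6 - 3 + 1)]
print(len(features))  # 4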
2. Model architecture
1. Embedding layer: maps each word to its distributed (dense vector) representation;
2. Convolutional layer: applies filters of several different sizes to the output of the embedding layer to extract features;
3. Max-pooling layer: turns variable-length sentences into a fixed-length representation (max pooling is used instead of average pooling because the sentences were padded, and the padded positions would dilute an average; see the short numpy sketch after this list);
4. Fully connected layer.
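A stylized numpy sketch of that pooling choice (assuming, for illustration, that padded positions contribute zero activations): the zeros drag an average down, while the maximum ignores them.

import numpy as np

# feature map for a heavily padded sentence: 3 real activations, 5 padded positions
feature_map = np.array([0.9, 0.2, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0])
print(feature_map.max())   # 0.9   -> unaffected by the padding
print(feature_map.mean())  # 0.225 -> diluted by the zero-padded positions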
3. Implementing the model in TensorFlow (the code below uses the TensorFlow 1.x API, with standalone Keras for preprocessing)
import numpy as np
import pandas as pd
import tensorflow as tf
import math
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
Parameters
max_features = 10000 # size of the vocabulary
maxlen = 500 # every review is padded/truncated to this length
embedding_size = 128 # dimensionality of the word embeddings
batch_size = 512 # number of samples per batch
num_epochs = 20 # number of training epochs
max_learning_rate = 0.005 # initial learning rate
min_learning_rate = 0.0001 # floor the learning rate decays toward
decay_coefficient = 2.5 # decay coefficient of the learning-rate schedule
dropout_keep_prob = 0.5 # keep probability for dropout (the probability of keeping a unit, not of dropping it)
evaluate_every = 100 # evaluate on the dev set every 100 steps
num_filters = 128 # number of filters per filter size
filter_sizes = [3,4,5] # filter heights, i.e. the n-gram sizes
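Before touching the model, it helps to see what learning-rate schedule these parameters imply; the loop below reproduces the exact formula used later in the training loop (17,500 is len(y_train) after the 70/30 split, assuming the 25,000-review Kaggle training set):

decay_speed = decay_coefficient * 17500 / batch_size
for step in [0, 100, 500, 1000]:
    lr = min_learning_rate + (max_learning_rate - min_learning_rate) * math.exp(-step / decay_speed)
    print(step, round(lr, 5))  # decays smoothly from 0.005 toward 0.0001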
Data loading
# Kaggle IMDB sentiment data ("Bag of Words Meets Bags of Popcorn")
train = pd.read_csv("../input/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("../input/testData.tsv", header=0, delimiter="\t", quoting=3)
Data processing
# Build the tokenizer on the combined train and test reviews
tokenizer = Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(train['review']) + list(test['review']))
#word_index = tokenizer.word_index
x_train = tokenizer.texts_to_sequences(list(train['review']))
x_train = pad_sequences(x_train, maxlen=maxlen) # pad/truncate to maxlen
y_train = to_categorical(list(train['sentiment'])) # one-hot encode the labels
x_test = tokenizer.texts_to_sequences(list(test['review']))
x_test = pad_sequences(x_test, maxlen=maxlen) # pad/truncate to maxlen
# Split off a validation (dev) set
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.3, random_state=0)
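If the two Keras preprocessing calls are unfamiliar, here is a minimal sketch (reusing the imports above, with made-up toy sentences) of what they produce:

toy = ["the movie was great", "the movie was terrible"]
toy_tok = Tokenizer(num_words=10, lower=True)
toy_tok.fit_on_texts(toy)
print(toy_tok.texts_to_sequences(toy))  # word-index sequences, e.g. [[1, 2, 3, 4], [1, 2, 3, 5]]
print(pad_sequences(toy_tok.texts_to_sequences(toy), maxlen=6))
# each row is left-padded with zeros to length 6, e.g. [[0, 0, 1, 2, 3, 4], [0, 0, 1, 2, 3, 5]]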
Building the model
class TextCNN(object):
    def __init__(self,
                 sequence_length,
                 num_classes,
                 vocab_size,
                 embedding_size,
                 filter_sizes,
                 num_filters,
                 l2_reg_lambda=0.0):
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
        self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        l2_loss = tf.constant(0.0)
        # Embedding layer
        with tf.name_scope('embedding'):
            self.W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                                 name='W', trainable=True)
            # [batch_size, sequence_length, embedding_size]
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            # [batch_size, sequence_length, embedding_size, 1]
            # conv2d expects an image-like input of shape [batch_size, height, width, channels],
            # so a trailing channel dimension of 1 is added
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
        pooled_outputs = []
        # One convolution + max-pooling branch per filter size (len(filter_sizes) branches)
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope('conv-maxpool-%s' % filter_size):
                # [filter_height, filter_width, in_channels, out_channels]
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                # [batch_size, sequence_length - filter_size + 1, 1, num_filters]
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # [batch_size, sequence_length - filter_size + 1, 1, num_filters]
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # [batch_size, 1, 1, num_filters]
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)
        # Concatenate the outputs of all pooling branches
        num_filters_total = num_filters * len(filter_sizes)
        # [batch_size, 1, 1, num_filters * len(filter_sizes)]
        # concatenate along the last (channel) axis; the original code passed
        # len(filter_sizes) as the axis, which only worked by coincidence with 3 filter sizes
        self.h_pool = tf.concat(pooled_outputs, 3)
        # [batch_size, num_filters * len(filter_sizes)]
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        # Dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
        # Output projection for classification
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
        # Loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            # cross-entropy plus L2 regularization
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
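A small smoke test (not from the original post, using hypothetical miniature dimensions) confirms the graph wires up and that the pooled features concatenate along the channel axis:

with tf.Graph().as_default():
    tiny = TextCNN(sequence_length=20, num_classes=2, vocab_size=100,
                   embedding_size=8, filter_sizes=[2, 3], num_filters=4)
    print(tiny.h_pool_flat.shape)  # (?, 8): 2 filter sizes * 4 filters each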
# Generates batches over the data
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    data_size = len(data)
    # number of batches per epoch, rounded up so the final, smaller batch is kept
    # (the original used data_size // batch_size plus an extra loop iteration,
    # which yields an empty batch whenever data_size is divisible by batch_size)
    num_batches_per_epoch = (data_size - 1) // batch_size + 1
    for epoch in range(num_epochs):
        # optionally reshuffle the data at the start of every epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
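A quick usage sketch of batch_iter on toy data shows the final partial batch being kept:

toy_data = np.arange(10).reshape(5, 2)  # 5 "examples" with 2 columns each
for b in batch_iter(toy_data, batch_size=2, num_epochs=1, shuffle=False):
    print(b.shape)  # (2, 2), (2, 2), then the final partial batch (1, 2)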
Model training
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=True,   # fall back to another device if the requested one is unavailable
        log_device_placement=False)  # do not log device placement
    sess = tf.Session(config=session_conf)
    # Build the model
    nn = TextCNN(sequence_length=x_train.shape[1],
                 num_classes=y_train.shape[1],
                 vocab_size=max_features,
                 embedding_size=embedding_size,
                 filter_sizes=filter_sizes,
                 num_filters=num_filters)
    # Global step counter
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(nn.learning_rate)
    tvars = tf.trainable_variables()  # all trainable variables
    # tf.gradients(nn.loss, tvars) computes the gradients of the loss w.r.t. tvars;
    # clipping the global norm guards against exploding gradients
    grads, _ = tf.clip_by_global_norm(tf.gradients(nn.loss, tvars), 5)
    grads_and_vars = tuple(zip(grads, tvars))
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
    sess.run(tf.global_variables_initializer())
    # stack features and one-hot labels side by side so they are shuffled together
    batches = batch_iter(np.hstack((x_train, y_train)), batch_size, num_epochs)
    decay_speed = decay_coefficient * len(y_train) / batch_size
    counter = 0  # number of batches processed so far
    for batch in batches:
        # exponentially anneal the learning rate from max_learning_rate toward min_learning_rate
        learning_rate = min_learning_rate + (max_learning_rate - min_learning_rate) * math.exp(-counter / decay_speed)
        counter += 1
        # the last two columns are the one-hot labels
        x_batch, y_batch = batch[:, :-2], batch[:, -2:]
        # Training step
        feed_dict = {nn.input_x: x_batch,
                     nn.input_y: y_batch,
                     nn.dropout_keep_prob: dropout_keep_prob,
                     nn.learning_rate: learning_rate}
        _, step, loss, accuracy = sess.run(
            [train_op, global_step, nn.loss, nn.accuracy],
            feed_dict)
        current_step = tf.train.global_step(sess, global_step)
        # Evaluate on the dev set every evaluate_every steps (dropout disabled)
        if current_step % evaluate_every == 0:
            print("\nEvaluation:")
            feed_dict = {
                nn.input_x: x_dev,
                nn.input_y: y_dev,
                nn.dropout_keep_prob: 1.0
            }
            step, loss, accuracy = sess.run(
                [global_step, nn.loss, nn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            print("")
    # Predict on the test set
    all_predictions = []
    test_batches = batch_iter(x_test, batch_size, num_epochs=1, shuffle=False)
    for batch in test_batches:
        feed_dict = {
            nn.input_x: batch,
            nn.dropout_keep_prob: 1.0
        }
        predictions = sess.run([nn.predictions], feed_dict)[0]
        all_predictions.extend(list(predictions))
Evaluation:
2019-04-13T13:27:37.430788: step 100, loss 0.413421, acc 0.810133
Evaluation:
2019-04-13T13:27:50.482590: step 200, loss 0.376269, acc 0.838933
Evaluation:
2019-04-13T13:28:03.504445: step 300, loss 0.366715, acc 0.845333
Evaluation:
2019-04-13T13:28:16.554389: step 400, loss 0.361695, acc 0.849067
Evaluation:
2019-04-13T13:28:29.574545: step 500, loss 0.357396, acc 0.8512
Evaluation:
2019-04-13T13:28:42.598720: step 600, loss 0.353969, acc 0.8532
Evaluation:
2019-04-13T13:28:55.601439: step 700, loss 0.350428, acc 0.8536
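Finally, to turn all_predictions into a Kaggle-style submission file, a short hedged sketch (assuming the test DataFrame keeps its "id" column, as in the original dataset):

# "id" column assumed present in testData.tsv, as in the Kaggle competition data
submission = pd.DataFrame({"id": test["id"], "sentiment": all_predictions})
submission.to_csv("submission.csv", index=False, quoting=3)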