This post improves sentence-modelling accuracy using dynamic k-max pooling and wide convolutions. In NLP, when the input sequence is long relative to the convolution kernel, a wide convolution is required. In TensorFlow's CNN ops, padding='SAME' produces a wide convolution and padding='VALID' a narrow one; the difference is described in the referenced blog post. The code below is the GitHub source migrated from Python 2.7 to Python 3.6.8 and debugged. The paper itself is analysed thoroughly in the referenced blog, so it is not explained at length here.
K-max pooling with a dynamically adjusted k adapts to text sequences of different lengths. For simplicity, the implementation uses only two convolutional layers, and every sentence is padded to a fixed length of 37 during preprocessing.
Environment used for debugging: TensorFlow 1.13.1, Python 3.6.8. The modified code is posted below with explanatory comments.
model.py: definition of the DCNN model
import tensorflow as tf
class DCNN():
    """Dynamic Convolutional Neural Network for sentence modelling
    (Kalchbrenner et al., 2014), built from per-dimension wide 1-D
    convolutions, folding and (dynamic) k-max pooling.

    TensorFlow 1.x graph-mode implementation; all methods build graph ops.
    """

    def __init__(self, batch_size, sentence_length, num_filters, embed_size, top_k, k1):
        # Example values (from train.py) noted per line.
        self.batch_size = batch_size            # 50
        self.sentence_length = sentence_length  # 37
        self.num_filters = num_filters          # [6, 14]
        self.embed_size = embed_size            # 100
        self.top_k = top_k                      # 4, fixed k of the last pooling layer
        self.k1 = k1                            # 19, dynamic k of the first pooling layer

    def per_dim_conv_k_max_pooling_layer(self, x, w, b, k):
        """Per-embedding-dimension convolution followed by k-max pooling.

        NOTE(review): this method is not called by DCNN() below (the model
        uses per_dim_conv_layer + fold_k_max_pooling instead); kept as-is.
        """
        self.k1 = k
        # Split the input into embed_size slices of [batch_size, sentence_length, 1].
        input_unstack = tf.unstack(x, axis=2)
        w_unstack = tf.unstack(w, axis=1)
        b_unstack = tf.unstack(b, axis=1)
        convs = []
        with tf.name_scope("per_dim_conv_k_max_pooling"):
            for i in range(self.embed_size):
                # conv: [batch_size, sentence_length, num_filters[0]] (SAME padding)
                conv = tf.nn.relu(tf.nn.conv1d(input_unstack[i], w_unstack[i], stride=1, padding="SAME") + b_unstack[i])
                # NOTE(review): this reshape reorders elements rather than
                # transposing the length/filter axes — verify against tf.transpose.
                conv = tf.reshape(conv, [self.batch_size, self.num_filters[0], self.sentence_length])
                # k-max pooling: keep the k largest values along the last axis.
                values = tf.nn.top_k(conv, k, sorted=False).values
                values = tf.reshape(values, [self.batch_size, k, self.num_filters[0]])
                convs.append(values)
            # Result: [batch_size, k1, embed_size, num_filters[0]]
            conv = tf.stack(convs, axis=2)
        return conv

    def per_dim_conv_layer(self, x, w, b):
        """Wide 1-D convolution applied independently to every embedding dimension.

        x: [batch_size, sentence_length, embed_dim, 1]
        w: [ws, embed_dim, 1, num_filters]
        b: [num_filters, embed_dim]
        Returns [batch_size, sentence_length, embed_dim, num_filters].
        """
        # [batch_size, sentence_length, embed_dim, 1]
        #   => embed_dim tensors of [batch_size, sentence_length, 1]
        input_unstack = tf.unstack(x, axis=2)
        # [ws, embed_dim, 1, num_filters] => embed_dim tensors of [ws, 1, num_filters]
        w_unstack = tf.unstack(w, axis=1)
        # [num_filters, embed_dim] => embed_dim vectors of length num_filters
        b_unstack = tf.unstack(b, axis=1)
        convs = []
        with tf.name_scope("per_dim_conv"):
            for i in range(len(input_unstack)):  # embed_dim iterations
                # tf.nn.conv1d(value, filters, stride, padding):
                #   value  : [batch, in_width, in_channels] = [batch_size, sentence_length, 1]
                #   filters: [filter_width, in_channels, out_channels] = [ws, 1, num_filters]
                # out_channels is the number of convolution kernels.
                conv = tf.nn.relu(tf.nn.conv1d(input_unstack[i], w_unstack[i], stride=1, padding="SAME") + b_unstack[i])
                convs.append(conv)
            # Re-assemble the embedding axis: [batch_size, length, embed_dim, num_filters]
            conv = tf.stack(convs, axis=2)
        return conv

    def fold_k_max_pooling(self, x, k):
        """Folding (pairwise sum of adjacent embedding rows) followed by
        k-max pooling; halves the embedding axis and keeps the k largest
        activations per feature map, preserving their original order of axes.
        """
        # e.g. (batch_size, 37, 100, 6) => 100 tensors of (batch_size, 37, 6)
        input_unstack = tf.unstack(x, axis=2)
        out = []
        with tf.name_scope("fold_k_max_pooling"):
            # Step 2: add each pair of adjacent embedding dimensions => embed_size/2 maps.
            for i in range(0, len(input_unstack), 2):
                fold = tf.add(input_unstack[i], input_unstack[i+1])  # [batch_size, L, num_filters]
                conv = tf.transpose(fold, perm=[0, 2, 1])            # [batch_size, num_filters, L]
                # top_k returns (values, indices); keep the k largest per row, unsorted
                # so the original sequence order of the kept values is retained.
                values = tf.nn.top_k(conv, k, sorted=False).values   # [batch_size, num_filters, k]
                values = tf.transpose(values, perm=[0, 2, 1])
                out.append(values)
            # [batch_size, k, embed_size/2, num_filters]
            fold = tf.stack(out, axis=2)
        return fold

    def full_connect_layer(self, x, w, b, wo, dropout_keep_prob):
        """Fully connected tanh layer with dropout, then a linear output
        layer producing unnormalised class logits."""
        with tf.name_scope("full_connect_layer"):
            h = tf.nn.tanh(tf.matmul(x, w) + b)
            h = tf.nn.dropout(h, dropout_keep_prob)
            o = tf.matmul(h, wo)
        return o

    def DCNN(self, sent, W1, W2, b1, b2, k1, top_k, Wh, bh, Wo, dropout_keep_prob):
        """Full forward pass:
        conv -> fold + k1-max pool -> conv -> fold + top_k-max pool -> FC.
        Returns class logits.
        """
        conv1 = self.per_dim_conv_layer(sent, W1, b1)
        # Per the paper, with two conv layers and sentence length 37 the first
        # pooling keeps a dynamic k1 = 19 values, the second a fixed top_k.
        conv1 = self.fold_k_max_pooling(conv1, k1)
        conv2 = self.per_dim_conv_layer(conv1, W2, b2)
        fold = self.fold_k_max_pooling(conv2, top_k)
        # Flatten: top_k * (embed/4) * num_filters[1] = 4 * 25 * 14 = 1400.
        # NOTE(review): 100 here is the embedding size hard-coded — assumes
        # embed_size == 100; confirm before reusing with other dimensions.
        fold_flatten = tf.reshape(fold, [-1, int(top_k*100*14/4)])
        print(fold_flatten.get_shape())
        out = self.full_connect_layer(fold_flatten, Wh, bh, Wo, dropout_keep_prob)
        return out
train.py: training script for the model
import sys
sys.path.append(r'F:/pycharm_tensorflow/Dynamic-CNN-Sentence-Classification-TF-master/')
from model import *
import dataUtils
import numpy as np
import time
import os
# ---- Hyper-parameters ----
embed_dim = 100          # word-embedding dimensionality
ws = [7, 5]              # convolution filter widths of the two conv layers
top_k = 4                # fixed k of the final k-max pooling layer
k1 = 19                  # dynamic k of the first k-max pooling layer
num_filters = [6, 14]    # feature maps per conv layer
dev = 300                # size of the dev split
batch_size = 50
n_epochs = 30
num_hidden = 100         # hidden units of the fully connected layer
sentence_length = 37     # every sentence is padded to this length
num_class = 6
lr = 0.01
evaluate_every = 100     # dev evaluation every N steps
checkpoint_every = 100   # checkpoint report every N steps
num_checkpoints = 5

# Load data
print("Loading data...")
x_, y_, vocabulary, vocabulary_inv, test_size = dataUtils.load_data()
# x_: np.array of 5952 sentences (5452 train + 500 test), each padded to a
#     list of 37 word indices (padding index 0)
# y_: np.array of 5952 one-hot label vectors of length 6 (class membership)
# vocabulary: dict of 8789 entries (corpus has 8789 distinct words);
#     key = word, value = index
# vocabulary_inv: list of 8789 words ordered by frequency, e.g.
#     <PAD/>, \?, the, what, is, of, in, a, ...
# test_size: 500, size of the test set

# Split off the test set (last test_size rows), then randomly shuffle the rest
x, x_test = x_[:-test_size], x_[-test_size:]
y, y_test = y_[:-test_size], y_[-test_size:]
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
x_train, x_dev = x_shuffled[:-dev], x_shuffled[-dev:]
y_train, y_dev = y_shuffled[:-dev], y_shuffled[-dev:]
print("Train/Dev/Test split: {:d}/{:d}/{:d}".format(len(y_train), len(y_dev), len(y_test)))
#--------------------------------------------------------------------------------------#
def init_weights(shape, name):
    """Create a trainable weight tensor of the given shape, initialised
    from a truncated normal distribution with stddev 0.01."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01), name=name)
# ---- Graph inputs ----
sent = tf.placeholder(tf.int64, [None, sentence_length])
# float32, not float64: tf.nn.softmax_cross_entropy_with_logits requires the
# labels and logits to share one dtype, and the logits built below are float32.
y = tf.placeholder(tf.float32, [None, num_class])
dropout_keep_prob = tf.placeholder(tf.float32, name="dropout")

with tf.name_scope("embedding_layer"):
    W = tf.Variable(tf.random_uniform([len(vocabulary), embed_dim], -1.0, 1.0), name="embed_W")
    sent_embed = tf.nn.embedding_lookup(W, sent)
    # Convert to the NHWC layout the conv ops expect by appending a channel
    # dimension of 1: [batch_size, sentence_length, embed_dim, 1]
    input_x = tf.expand_dims(sent_embed, -1)

# Conv-layer weights/biases; see DCNN.per_dim_conv_layer for shape semantics.
W1 = init_weights([ws[0], embed_dim, 1, num_filters[0]], "W1")
b1 = tf.Variable(tf.constant(0.1, shape=[num_filters[0], embed_dim]), "b1")
W2 = init_weights([ws[1], int(embed_dim/2), num_filters[0], num_filters[1]], "W2")
# NOTE(review): only the first embed_dim/2 columns of b2 are consumed after
# folding halves the embedding axis; the rest are unused parameters.
b2 = tf.Variable(tf.constant(0.1, shape=[num_filters[1], embed_dim]), "b2")
# Fully connected layer: input size = top_k * embed_dim/4 * num_filters[1].
Wh = init_weights([int(top_k*embed_dim*num_filters[1]/4), num_hidden], "Wh")
bh = tf.Variable(tf.constant(0.1, shape=[num_hidden]), "bh")
Wo = init_weights([num_hidden, num_class], "Wo")

model = DCNN(batch_size, sentence_length, num_filters, embed_dim, top_k, k1)
out = model.DCNN(input_x, W1, W2, b1, b2, k1, top_k, Wh, bh, Wo, dropout_keep_prob)

with tf.name_scope("cost"):
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=y))
predict_op = tf.argmax(out, axis=1, name="predictions")
with tf.name_scope("accuracy"):
    acc = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(out, 1)), tf.float32))
#-------------------------------------------------------------------------------------------#
print('Started training')
with tf.Session() as sess:
    # Optimiser: track a global step so summaries/checkpoints are step-indexed.
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    grads_and_vars = optimizer.compute_gradients(cost)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # Keep track of gradient values and sparsity per variable.
    grad_summaries = []
    for g, v in grads_and_vars:
        if g is not None:
            grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    grad_summaries_merged = tf.summary.merge(grad_summaries)

    # Output directory for models and summaries, keyed by launch timestamp.
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    # Summaries for loss and accuracy
    loss_summary = tf.summary.scalar("loss", cost)
    acc_summary = tf.summary.scalar("accuracy", acc)

    # Train summaries
    train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

    # Dev summaries
    dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
    dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
    dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

    # Checkpoint directory. TensorFlow assumes it already exists, so create it.
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

    # Initialize all variables
    sess.run(tf.global_variables_initializer())

    def train_step(x_batch, y_batch):
        """Run one optimisation step on a training batch (dropout 0.5)."""
        feed_dict = {
            sent: x_batch,
            y: y_batch,
            dropout_keep_prob: 0.5
        }
        _, step, summaries, loss, accuracy = sess.run(
            [train_op, global_step, train_summary_op, cost, acc],
            feed_dict)
        print("TRAIN step {}, loss {:g}, acc {:g}".format(step, loss, accuracy))
        train_summary_writer.add_summary(summaries, step)

    def dev_step(x_batch, y_batch, writer=None):
        """
        Evaluates model on a dev set (dropout disabled).
        Returns (accuracy, loss).
        """
        feed_dict = {
            sent: x_batch,
            y: y_batch,
            dropout_keep_prob: 1.0
        }
        step, summaries, loss, accuracy = sess.run(
            [global_step, dev_summary_op, cost, acc],
            feed_dict)
        print("VALID step {}, loss {:g}, acc {:g}".format(step, loss, accuracy))
        if writer:
            writer.add_summary(summaries, step)
        return accuracy, loss

    batches = dataUtils.batch_iter(list(zip(x_train, y_train)), batch_size, n_epochs)
    # Training loop. For each batch: train, periodically evaluate on dev and
    # checkpoint whenever dev accuracy reaches a new best.
    max_acc = 0
    best_at_step = 0
    for batch in batches:
        x_batch, y_batch = zip(*batch)
        train_step(x_batch, y_batch)
        current_step = tf.train.global_step(sess, global_step)
        if current_step % evaluate_every == 0:
            print("\nEvaluation:")
            acc_dev, _ = dev_step(x_dev, y_dev, writer=dev_summary_writer)
            if acc_dev >= max_acc:
                # New best dev accuracy: remember the step and save a checkpoint.
                max_acc = acc_dev
                best_at_step = current_step
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
            print("")
        if current_step % checkpoint_every == 0:
            print('Best of valid = {}, at step {}'.format(max_acc, best_at_step))
    # After training: restore the best dev checkpoint and evaluate on test.
    # NOTE(review): the source was pasted without indentation; this placement
    # (outside the batch loop) follows the upstream repository — confirm.
    saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
    print('Finish training. On test set:')
    acc, loss = dev_step(x_test, y_test, writer=None)
    print(acc, loss)
dataUtils.py: data preprocessing
from collections import Counter
import itertools
import numpy as np
import re
import os
def clean_str(string):
    """Clean and tokenise a raw TREC line.

    Mirrors Yoon Kim's preprocessing: replace unexpected characters with
    spaces, split clitics ('s, n't, ...) and punctuation into separate
    tokens, collapse runs of whitespace, then strip and lowercase.
    """
    # Ordered (pattern, replacement) rules, applied left to right.
    substitutions = (
        (r"[^A-Za-z0-9:(),!?\'\`]", " "),
        (r" : ", ":"),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " \( "),
        (r"\)", " \) "),
        (r"\?", " \? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()
def load_data_and_labels():
    """
    Loads data from files, splits the data into words and generates labels.
    Returns [x_text, y, test_size] where x_text is a list of token lists,
    y a list of one-hot label vectors and test_size the number of test lines.
    """
    # Load data from files; context managers ensure the handles are closed
    # (the original left both files open).
    folder_prefix = 'F:/pycharm_tensorflow/Dynamic-CNN-Sentence-Classification-TF-master/data/'
    with open(folder_prefix+"train") as f:
        x_train = f.readlines()
    with open(folder_prefix+"test") as f:
        x_test = f.readlines()
    test_size = len(x_test)
    x_text = x_train + x_test
    x_text = [clean_str(sent) for sent in x_text]
    # First token looks like "LABEL:fine_grained"; the part before ':' is the
    # coarse class label.
    y = [s.split(' ')[0].split(':')[0] for s in x_text]
    # Drop the label token; keep the question words.
    x_text = [s.split(" ")[1:] for s in x_text]
    # Generate labels: assign each distinct label the next 1-based index,
    # then map to one-hot rows of an identity matrix.
    all_label = dict()
    for label in y:
        if label not in all_label:
            all_label[label] = len(all_label) + 1
    one_hot = np.identity(len(all_label))
    y = [one_hot[all_label[label] - 1] for label in y]
    return [x_text, y, test_size]
def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the
    longest sentence. Returns the padded sentences.

    Unlike the original, an empty input yields [] instead of raising
    ValueError from max() on an empty sequence.
    """
    # default=0 makes the empty-input case well-defined.
    sequence_length = max((len(x) for x in sentences), default=0)
    return [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]
def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns [vocabulary, vocabulary_inv].
    """
    # Count every token across all sentences.
    word_counts = Counter(itertools.chain(*sentences))
    # vocabulary_inv: index -> word, ordered from most to least frequent,
    # e.g. ['<PAD/>', 'the', ...]
    vocabulary_inv = [word for word, _count in word_counts.most_common()]
    # vocabulary: word -> index (NOT word -> frequency),
    # e.g. {'<PAD/>': 0, 'the': 1, ',': 2, ...}
    vocabulary = {word: index for index, word in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]
def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    Returns [x, y] as numpy arrays of word indices and labels.
    """
    indexed = [[vocabulary[word] for word in sentence] for sentence in sentences]
    return [np.array(indexed), np.array(labels)]
def load_data():
    """
    Loads and preprocesses the dataset end to end.
    Returns [x, y, vocabulary, vocabulary_inv, test_size]: input index
    vectors, one-hot labels, both vocabulary mappings and the test-set size.
    """
    raw_sentences, labels, test_size = load_data_and_labels()
    padded = pad_sentences(raw_sentences)
    vocabulary, vocabulary_inv = build_vocab(padded)
    x, y = build_input_data(padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, test_size]
def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset: for each epoch, shuffle the
    data and yield consecutive batches of batch_size items. A short final
    batch is back-filled from the end of the data so every yielded batch
    has batch_size items (when data_size >= batch_size).

    Fixes the original `int(len/batch) + 1` count, which emitted a duplicate
    extra batch whenever data_size was an exact multiple of batch_size, and
    clamps the start index so data_size < batch_size cannot go negative.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceil division: exactly enough batches to cover the data once per epoch.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch.
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            if end_index - start_index < batch_size:
                # Shift the window back so the last batch is still full-sized.
                start_index = max(0, end_index - batch_size)
            yield shuffled_data[start_index:end_index]