Debugging the Code for the Paper "A Convolutional Neural Network for Modelling Sentences"

Disclaimer: This is an original article by the blogger and may not be reproduced without permission. https://blog.csdn.net/cskywit/article/details/90733729

This paper improves sentence-modelling accuracy with dynamic k-max pooling and wide convolution. In NLP, when the convolution kernel is long relative to the input sequence, a wide convolution is required. In TensorFlow's CNN implementation, padding='SAME' gives the wide convolution and padding='VALID' gives the narrow convolution; wide versus narrow convolution is explained in the blog post referenced in the original article. The code below is based on the GitHub source, migrated from Python 2.7 to Python 3.6.8 and then debugged. For an analysis of the paper itself, see the blog post referenced in the original article, which explains it very well, so it is not repeated here.
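To make the padding behaviour concrete, here is a minimal sketch (not part of the repository code; the shapes are illustrative assumptions) showing what padding='SAME' and padding='VALID' produce for a length-7 input and a width-5 kernel in TensorFlow 1.x:

import tensorflow as tf

# Illustrative shapes only: batch 1, sequence length 7, one channel, kernel width 5.
x = tf.ones([1, 7, 1])   # [batch, in_width, in_channels]
w = tf.ones([5, 1, 1])   # [filter_width, in_channels, out_channels]

same_conv = tf.nn.conv1d(x, w, stride=1, padding="SAME")    # output width 7 (same as the input)
valid_conv = tf.nn.conv1d(x, w, stride=1, padding="VALID")  # output width 7 - 5 + 1 = 3 (narrow)

with tf.Session() as sess:
    print(sess.run(tf.shape(same_conv)))   # [1 7 1]
    print(sess.run(tf.shape(valid_conv)))  # [1 3 1]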

In dynamic k-max pooling, the pooling size k is adjusted dynamically according to the length of the input text sequence. For simplicity, the implementation here uses only two convolutional layers, and all sentences are padded to a fixed length of 37 during preprocessing.
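For reference, the paper computes the dynamic pooling size for layer l out of L total convolutional layers as k_l = max(k_top, ceil((L - l) / L * s)), where s is the padded sentence length and k_top is the fixed top-level pooling size. A small sketch of that rule (the function below is illustrative, not part of the repository code) reproduces the hard-coded k1 = 19 used later, for s = 37, L = 2, k_top = 4:

import math

def dynamic_k(layer, total_layers, sentence_length, top_k):
    # k_l = max(k_top, ceil((L - l) / L * s)) from the DCNN paper
    return max(top_k, math.ceil((total_layers - layer) / total_layers * sentence_length))

print(dynamic_k(1, 2, 37, 4))  # 19 -> the k1 used after the first convolutional layer
print(dynamic_k(2, 2, 37, 4))  # 4  -> the fixed top_k used after the last layer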

The debugging environment used here is TensorFlow 1.13.1 with Python 3.6.8. The modified and commented code is posted below.

model.py: defines the DCNN model

import tensorflow as tf

class DCNN():
    def __init__(self, batch_size, sentence_length, num_filters, embed_size, top_k, k1):
        self.batch_size = batch_size    #50
        self.sentence_length = sentence_length #37
        self.num_filters = num_filters  #[6,14]
        self.embed_size = embed_size    #100
        self.top_k = top_k              #4
        self.k1 = k1                    #19

    def per_dim_conv_k_max_pooling_layer(self, x, w, b, k):
        self.k1 = k
        input_unstack = tf.unstack(x, axis=2)
        w_unstack = tf.unstack(w, axis=1)
        b_unstack = tf.unstack(b, axis=1)
        convs = []
        with tf.name_scope("per_dim_conv_k_max_pooling"):
            for i in range(self.embed_size):
                # with padding="SAME", conv has shape [batch_size, sentence_length, num_filters[0]]
                conv = tf.nn.relu(tf.nn.conv1d(input_unstack[i], w_unstack[i], stride=1, padding="SAME") + b_unstack[i])
                # move the sentence dimension last so top_k pools over it: [batch_size, num_filters[0], sentence_length]
                conv = tf.transpose(conv, perm=[0, 2, 1])
                # k-max pooling over the sentence dimension
                values = tf.nn.top_k(conv, k, sorted=False).values
                # back to [batch_size, k, num_filters[0]]
                values = tf.transpose(values, perm=[0, 2, 1])
                convs.append(values)
            conv = tf.stack(convs, axis=2)
        #[batch_size, k1, embed_size, num_filters[0]]
        #print conv.get_shape()
        return conv

    def per_dim_conv_layer(self, x, w, b):
        # [batch_size, sentence_length, embed_dim, 1] => a list of embed_dim tensors of shape [batch_size, sentence_length, 1]
        # [50, 37, 100, 1] => a list of 100 tensors of shape [50, 37, 1]
        input_unstack = tf.unstack(x, axis=2)
        # [ws[0], embed_dim, 1, num_filters[0]]: [7, 100, 1, 6] => a list of 100 tensors of shape [7, 1, 6]
        w_unstack = tf.unstack(w, axis=1)
        # [num_filters[0], embed_dim]: [6, 100] => a list of 100 tensors of shape [6]
        b_unstack = tf.unstack(b, axis=1)
        convs = []
        with tf.name_scope("per_dim_conv"):
            for i in range(len(input_unstack)):  # 100 iterations: one per embedding dimension
                # conv1d(value, filters, stride, padding)
                #   value:   [batch, in_width, in_channels]            => [batch_size, sentence_length, 1]
                #   filters: [filter_width, in_channels, out_channels] => [ws[0], 1, num_filters[0]]
                #     filter_width: the number of rows convolved with value at each step
                #     in_channels:  the number of columns in value (must match value's in_channels)
                #     out_channels: the number of output channels, i.e. the number of convolution kernels
                conv = tf.nn.relu(tf.nn.conv1d(input_unstack[i], w_unstack[i], stride=1, padding="SAME") + b_unstack[i])
                convs.append(conv)
            conv = tf.stack(convs, axis=2)
            #print("conv.shape: ",conv.shape)
            #[batch_size, k1+ws-1, embed_size, num_filters[1]]
        return conv

    def fold_k_max_pooling(self, x, k):
        input_unstack = tf.unstack(x, axis=2)  # (batch_size, 37, 100, 6) => a list of 100 tensors of shape (batch_size, 37, 6)
        out = []
        with tf.name_scope("fold_k_max_pooling"):
            for i in range(0, len(input_unstack), 2):  # range(0, 100, 2): 50 iterations, folding (adding) each pair of adjacent embedding rows => embed_size/2
                fold = tf.add(input_unstack[i], input_unstack[i+1])  # [batch_size, k1, num_filters[1]]
                conv = tf.transpose(fold, perm=[0, 2, 1])   # [batch_size, num_filters[1], k1]
                # top_k returns a tuple (values, indices) of the k largest values along the last axis
                values = tf.nn.top_k(conv, k, sorted=False).values  # [batch_size, num_filters[1], top_k]
                values = tf.transpose(values, perm=[0, 2, 1])
                out.append(values)
            fold = tf.stack(out, axis=2)#[batch_size, k2, embed_size/2, num_filters[1]]
        return fold

    def full_connect_layer(self, x, w, b, wo, dropout_keep_prob):
        with tf.name_scope("full_connect_layer"):
            h = tf.nn.tanh(tf.matmul(x, w) + b)
            h = tf.nn.dropout(h, dropout_keep_prob)
            o = tf.matmul(h, wo)
        return o

    def DCNN(self, sent, W1, W2, b1, b2, k1, top_k, Wh, bh, Wo, dropout_keep_prob):
        conv1 = self.per_dim_conv_layer(sent, W1, b1)
        #print("after 1st per_dim_conv_layer: ",conv1.shape)   (batch_size, 37, 100, 6)
        # Per the paper: with two convolutional layers and sequence length 37, the first pooling layer uses the dynamic size k1 = 19 and the second uses the fixed value top_k
        conv1 = self.fold_k_max_pooling(conv1, k1)
        #print("after 1st fold_k_max_pooling: ",conv1.shape)    (batch_size, 19, 50, 6)
        conv2 = self.per_dim_conv_layer(conv1, W2, b2)
        #print("after 2nd per_dim_conv_layer: ",conv2.shape)      (batch_size, 19, 50, 14)
        fold = self.fold_k_max_pooling(conv2, top_k)
        #print("after 2nd fold_k_max_pooling: ",fold.shape)        (batch_size, 4, 25, 14) 
        fold_flatten = tf.reshape(fold, [-1, int(top_k*100*14/4)])  
        #print("after fold_flatten: ",fold_flatten.shape)           (batch_size, 1400)
        print(fold_flatten.get_shape()) 
        out = self.full_connect_layer(fold_flatten, Wh, bh, Wo, dropout_keep_prob)
        return out
 

train.py: model training

import sys
sys.path.append(r'F:/pycharm_tensorflow/Dynamic-CNN-Sentence-Classification-TF-master/')
from model import *
import dataUtils
import numpy as np
import time
import os

embed_dim = 100
ws = [7, 5]
top_k = 4
k1 = 19
num_filters = [6, 14]
dev = 300
batch_size = 50
n_epochs = 30
num_hidden = 100
sentence_length = 37
num_class = 6
lr = 0.01
evaluate_every = 100
checkpoint_every = 100
num_checkpoints = 5

# Load data
print("Loading data...")
x_, y_, vocabulary, vocabulary_inv, test_size = dataUtils.load_data()
# x_: np.array of length 5952 (5452 training examples + 500 test examples); each sentence is padded to a list of length 37 (the padding index is 0)
# y_: np.array of length 5952; each element is a one-hot vector of length 6 encoding the class
# vocabulary: dict of size 8789, i.e. the corpus contains 8789 distinct words; keys are words, values are indices
# vocabulary_inv: list of length 8789, ordered by word frequency: <PAD/>, \?, the, what, is, of, in, a, ...
# test_size: 500, the size of the test set

# Randomly shuffle data
x, x_test = x_[:-test_size], x_[-test_size:]
y, y_test = y_[:-test_size], y_[-test_size:]
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

x_train, x_dev = x_shuffled[:-dev], x_shuffled[-dev:]
y_train, y_dev = y_shuffled[:-dev], y_shuffled[-dev:]

print("Train/Dev/Test split: {:d}/{:d}/{:d}".format(len(y_train), len(y_dev), len(y_test)))
#--------------------------------------------------------------------------------------#

def init_weights(shape, name):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01), name=name)

sent = tf.placeholder(tf.int64, [None, sentence_length])
y = tf.placeholder(tf.float32, [None, num_class])  # float32 so the labels match the float32 logits
dropout_keep_prob = tf.placeholder(tf.float32, name="dropout")


with tf.name_scope("embedding_layer"):
    W = tf.Variable(tf.random_uniform([len(vocabulary), embed_dim], -1.0, 1.0), name="embed_W")
    sent_embed = tf.nn.embedding_lookup(W, sent)
    #input_x = tf.reshape(sent_embed, [batch_size, -1, embed_dim, 1])
    # convert to the NHWC layout expected by TF convolutions by adding a trailing channel dimension of 1
    # [batch_size, sentence_length, embed_dim, 1]
    input_x = tf.expand_dims(sent_embed, -1)
    

W1 = init_weights([ws[0], embed_dim, 1, num_filters[0]], "W1")
b1 = tf.Variable(tf.constant(0.1, shape=[num_filters[0], embed_dim]), "b1")

W2 = init_weights([ws[1], int(embed_dim/2), num_filters[0], num_filters[1]], "W2")
b2 = tf.Variable(tf.constant(0.1, shape=[num_filters[1], embed_dim]), "b2")

Wh = init_weights([int(top_k*embed_dim*num_filters[1]/4), num_hidden], "Wh")
bh = tf.Variable(tf.constant(0.1, shape=[num_hidden]), "bh")

Wo = init_weights([num_hidden, num_class], "Wo")

model = DCNN(batch_size, sentence_length, num_filters, embed_dim, top_k, k1)
out = model.DCNN(input_x, W1, W2, b1, b2, k1, top_k, Wh, bh, Wo, dropout_keep_prob)

with tf.name_scope("cost"):
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=y))
# train_step = tf.train.AdamOptimizer(lr).minimize(cost)

predict_op = tf.argmax(out, axis=1, name="predictions")
with tf.name_scope("accuracy"):
    acc = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(out, 1)), tf.float32))
#-------------------------------------------------------------------------------------------#

print('Started training')
with tf.Session() as sess:
    #init = tf.global_variables_initializer().run()

    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    grads_and_vars = optimizer.compute_gradients(cost)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # Keep track of gradient values and sparsity
    grad_summaries = []
    for g, v in grads_and_vars:
        if g is not None:
            grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    grad_summaries_merged = tf.summary.merge(grad_summaries)

    # Output directory for models and summaries
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    # Summaries for loss and accuracy
    loss_summary = tf.summary.scalar("loss", cost)
    acc_summary = tf.summary.scalar("accuracy", acc)

    # Train Summaries
    train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

    # Dev summaries
    dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
    dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
    dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

    # Initialize all variables
    sess.run(tf.global_variables_initializer())

    def train_step(x_batch, y_batch):
        feed_dict = {
            sent: x_batch,
            y: y_batch,
            dropout_keep_prob: 0.5
        }
        _, step, summaries, loss, accuracy = sess.run(
            [train_op, global_step, train_summary_op, cost, acc],
            feed_dict)
        print("TRAIN step {}, loss {:g}, acc {:g}".format(step, loss, accuracy))
        train_summary_writer.add_summary(summaries, step)

    def dev_step(x_batch, y_batch, writer=None):
        """
        Evaluates model on a dev set
        """
        feed_dict = {
            sent: x_batch,
            y: y_batch,
            dropout_keep_prob: 1.0
        }
        step, summaries, loss, accuracy = sess.run(
            [global_step, dev_summary_op, cost, acc],
            feed_dict)
        print("VALID step {}, loss {:g}, acc {:g}".format(step, loss, accuracy))
        if writer:
            writer.add_summary(summaries, step)
        return accuracy, loss


    batches = dataUtils.batch_iter(list(zip(x_train, y_train)), batch_size, n_epochs)
    # Training loop. For each batch...
    max_acc = 0
    best_at_step = 0
    for batch in batches:
        x_batch, y_batch = zip(*batch)
        train_step(x_batch, y_batch)
        current_step = tf.train.global_step(sess, global_step)
        if current_step % evaluate_every == 0:
            print("\nEvaluation:")
            acc_dev, _ = dev_step(x_dev, y_dev, writer=dev_summary_writer)
            if acc_dev >= max_acc:
                max_acc = acc_dev
                best_at_step = current_step
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
            print("")
        if current_step % checkpoint_every == 0:
            print('Best of valid = {}, at step {}'.format(max_acc, best_at_step))

    saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
    print('Finish training. On test set:')
    acc, loss = dev_step(x_test, y_test, writer=None)
    print(acc, loss)

dataUtils.py: data preprocessing

from collections import Counter
import itertools
import numpy as np
import re
import os

def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9:(),!?\'\`]", " ", string)
    string = re.sub(r" : ", ":", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def load_data_and_labels():
    """
    Loads data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    folder_prefix = 'F:/pycharm_tensorflow/Dynamic-CNN-Sentence-Classification-TF-master/data/'
    #print(os.path.abspath(folder_prefix+"train"))
    x_train = list(open(folder_prefix+"train").readlines())
    x_test = list(open(folder_prefix+"test").readlines())
    test_size = len(x_test)
    x_text = x_train + x_test

    x_text = [clean_str(sent) for sent in x_text]
    y = [s.split(' ')[0].split(':')[0] for s in x_text]
    x_text = [s.split(" ")[1:] for s in x_text]
    # Generate labels
    all_label = dict()
    for label in y:
        if not label in all_label:
            all_label[label] = len(all_label) + 1
    one_hot = np.identity(len(all_label))
    y = [one_hot[ all_label[label]-1 ] for label in y]
    return [x_text, y, test_size]

def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences

def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    # vocabulary_inv=['<PAD/>', 'the', ....]
    # ordered by word frequency, from highest to lowest
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    # vocabulary = {'<PAD/>': 0, 'the': 1, ',': 2, 'a': 3, 'and': 4, ..}
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}  # {word: index}
    return [vocabulary, vocabulary_inv]

def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]

def load_data():
    """
    Loads and preprocessed data
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels, test_size = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, test_size]

def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = (batch_num + 1) * batch_size
            if end_index > data_size:
                end_index = data_size
                start_index = end_index - batch_size
            yield shuffled_data[start_index:end_index]

 
