Paper link:
https://arxiv.org/abs/1611.01603
Paper objective:
Select the part of the context that is relevant to the query (in this post the example boils down to picking an index into the context); this can be viewed as a QA setup. For a related approach, see the earlier post on End-To-End Memory Networks.
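A minimal illustration of what the label looks like (hypothetical tokens, not taken from the actual dataset): the answer is simply the index of the answer word inside the context, which is later expanded into a one-hot start-position vector.

# Hypothetical tokens: the "answer" is the index of the matching word in the
# context, later expanded into a one-hot start vector of length T.
context = "how do i sort a dict by value in python".split()
answer = "python"
start_idx = context.index(answer)  # 9
p1 = [1 if i == start_idx else 0 for i in range(len(context))]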
Model highlights:
Overall, the model stacks three levels of embedding (character-level, word-level, and contextual-level) and uses bi-directional attention to obtain a query-aware representation of the context.
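To make the two attention directions concrete, here is a small NumPy sketch; the shapes and variable names are illustrative only, and the dot-product similarity is just a stand-in for the paper's trainable similarity function. Context-to-query attention softmaxes each row of S over the query axis and mixes the query vectors; query-to-context attention takes the maximum of each row of S (over query words), softmaxes it over context positions, mixes the context vectors, and tiles the result across T.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

T, J, d2 = 5, 3, 4                      # context length, query length, 2d
H = np.random.randn(T, d2)              # contextual embeddings of the context
U = np.random.randn(J, d2)              # contextual embeddings of the query
S = H @ U.T                             # stand-in for the trainable similarity S[t, j]

A = softmax(S, axis=-1)                 # context-to-query: attend over query words
U_bar = A @ U                           # [T, 2d], a query summary per context word

b = softmax(S.max(axis=-1), axis=-1)    # query-to-context: attend over context words
h_bar = b @ H                           # [2d], a single context summary
H_bar = np.tile(h_bar, (T, 1))          # tiled across T, as fed into the G layer
G = np.concatenate([H, U_bar, H * U_bar, H * H_bar], axis=-1)  # [T, 8d]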
Data description:
We reuse the QA dataset built for the earlier post Deep Learning for Extreme Multi-label Text Classification: the Question part is used as the Context, the Title as the Query, and the Tag as the answer to extract (the Tag is required to appear in the Question, and samples are filtered accordingly). Since a Tag is usually a single word, only the start index is estimated here.
Data preparation:
Start from the file produced last time, X_y_file_12e5.txt.
Check whether each sample meets the modeling requirement and split into training and test sets:
def context_query_data_process():
    def valid_func(question_text, tag_text):
        q_list = question_text.split(" ")
        t_list = tag_text.split(" ")
        for t in t_list:
            if t in q_list:
                return "{}\t{}".format(question_text, t)

    with open("context_query_tag_test.txt", "w") as test_o:
        with open("context_query_tag_train.txt", "w") as train_o:
            with open("X_y_file_12e5.txt") as f:
                line_num = 0
                while True:
                    line = f.readline()
                    if not line:
                        break
                    question_text, s, title_text, tag_text = line[:-1].split("\t")
                    valid = valid_func(question_text, tag_text)
                    if valid:
                        ff, s = valid.split("\t")
                        if line_num % 10 >= 8:
                            test_o.write("{}\t{}\t{}\n".format(ff, title_text, s))
                        else:
                            train_o.write("{}\t{}\t{}\n".format(ff, title_text, s))
                        line_num += 1
                        if line_num % 10000 == 0:
                            print("line_num :{}".format(line_num))
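For reference, each output line then has the form question<TAB>title<TAB>tag, with words separated by single spaces; a quick sanity check on a hypothetical sample (not from the real data):

# Hypothetical sample of the output format written above: question \t title \t tag.
sample = "how do i sort a dict in python\tsorting a dict by value\tpython"
question, title, tag = sample.split("\t")
assert tag in question.split(" ")  # guaranteed by valid_func above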
Dataset encoding:
def index_cq():
    from functools import reduce
    from collections import Counter

    # NOTE: both vocabularies are built from the test split only; a character that
    # appears only in the training split would raise a KeyError in map_char.
    with open("context_query_tag_test.txt") as f:
        test_context = f.read().replace("\t", " ").replace("\n", " ")
    cnt = Counter(test_context.split(" "))
    words = list(map(lambda x: x[0], cnt.most_common(10000)))
    # "<" and ">" are added so the characters of the special tokens are covered.
    all_char_set = reduce(lambda x, y: x.union(y), map(set, words)).union(set(["<", ">"]))
    print("w c start")
    word2idx = dict([(w, i) for i, w in enumerate(words)] +
                    [("<unk>", len(words)), ("<pad>", len(words) + 1)])
    char2idx = dict((c, i) for i, c in enumerate(all_char_set))
    print("w c end")

    def map_word(input_list):
        return map(lambda x: word2idx.get(x, word2idx["<unk>"]), input_list)

    def map_char(input_list):
        char_nest_map = map(lambda char_list: map(lambda x: str(char2idx[x]), char_list),
                            map(list, input_list))
        return map(lambda char_map: "_".join(char_map), char_nest_map)

    with open("cqtt.txt", "w") as o:
        with open("context_query_tag_test.txt") as f:
            line_num = 0
            while True:
                line = f.readline()
                if not line:
                    break
                q_t, t_t, t = map(lambda x: x.split(" "), line[:-1].split("\t"))
                qtw = map_word(q_t)
                qtc = map_char(q_t)
                ttw = map_word(t_t)
                ttc = map_char(t_t)
                tw = map_word(t)
                qtw, qtc, ttw, ttc, tw = map(lambda inner_list: " ".join(map(str, inner_list)),
                                             [qtw, qtc, ttw, ttc, tw])
                o.write("{}\t{}\t{}\t{}\t{}\n".format(qtw, qtc, ttw, ttc, tw))
                line_num += 1
                if line_num % 1000 == 0:
                    print(line_num)
    print("cqtt end")

    with open("cqtn.txt", "w") as o:
        with open("context_query_tag_train.txt") as f:
            line_num = 0
            while True:
                line = f.readline()
                if not line:
                    break
                q_t, t_t, t = map(lambda x: x.split(" "), line[:-1].split("\t"))
                qtw = map_word(q_t)
                qtc = map_char(q_t)
                ttw = map_word(t_t)
                ttc = map_char(t_t)
                tw = map_word(t)
                qtw, qtc, ttw, ttc, tw = map(lambda inner_list: " ".join(map(str, inner_list)),
                                             [qtw, qtc, ttw, ttc, tw])
                o.write("{}\t{}\t{}\t{}\t{}\n".format(qtw, qtc, ttw, ttc, tw))
                line_num += 1
                if line_num % 1000 == 0:
                    print(line_num)
    print("cqtn end")

    import pickle
    with open("idx.pkl", "wb") as f:
        pickle.dump({
            "word2idx": word2idx,
            "char2idx": char2idx
        }, f)
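A minimal sketch of reading the encoded files back (assuming index_cq() has been run): word ids are space-separated, and the character ids of each word are joined with "_".

import pickle

with open("idx.pkl", "rb") as f:
    idx = pickle.load(f)
idx2word = {v: k for k, v in idx["word2idx"].items()}

with open("cqtt.txt") as f:
    qtw, qtc, ttw, ttc, tw = f.readline().rstrip("\n").split("\t")
print([idx2word[int(i)] for i in qtw.split(" ")])  # decoded question words
print(qtc.split(" ")[0])                           # char ids of the first word, e.g. "3_17_42"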
Data export (the batch generator used for training):
import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np
import pickle

with open("idx.pkl", "rb") as f:
    d = pickle.load(f)
word2idx = d["word2idx"]
char2idx = d["char2idx"]
padding_idx_s = str(word2idx["<pad>"])
padding_char_idx = len(char2idx)


def data_generator(gen_type="n", batch_num=64, T=100, J=10, word_length=10):
    # gen_type "n" reads the training file cqtn.txt, "t" the test file cqtt.txt.
    assert gen_type in ["n", "t"]

    def split_word(word_join_str, word_padding):
        # Truncate / pad a space-joined word-id string to word_padding ids.
        input_list = list(map(int, word_join_str.split(" ")))[:word_padding]
        return np.array(input_list + [padding_idx_s] * (word_padding - len(input_list))).astype(np.int32)

    def split_char(char_join_str, char_padding, word_padding):
        # Truncate / pad the "_"-joined char ids of each word, then pad the word axis.
        def char_nest_process(char_inner_str):
            input_list = char_inner_str.split("_")[:char_padding]
            return input_list + [padding_char_idx] * (char_padding - len(input_list))
        input_split = char_join_str.split(" ")[:word_padding]
        req = np.array(list(map(char_nest_process, input_split))).astype(np.int32)
        trail = np.full(shape=[word_padding - len(input_split), char_padding],
                        fill_value=padding_char_idx)
        req = np.append(req, trail, axis=0).astype(np.int32)
        return req

    start_idx = 0
    c_word_batch = np.zeros(shape=[batch_num, T]).astype(np.int32)
    c_char_batch = np.zeros(shape=[batch_num, T, word_length]).astype(np.int32)
    q_word_batch = np.zeros(shape=[batch_num, J]).astype(np.int32)
    q_char_batch = np.zeros(shape=[batch_num, J, word_length]).astype(np.int32)
    p1_fake_batch = np.zeros(shape=[batch_num, T]).astype(np.int32)
    p2_fake_batch = np.zeros(shape=[batch_num, T]).astype(np.int32)
    times = 0

    with open("cqt{}.txt".format(gen_type)) as f:
        while True:
            line = f.readline()
            if not line:
                return
            qtw, qtc, ttw, ttc, tw = line[:-1].split("\t")
            tw = split_word(tw, 1)
            qtw = split_word(qtw, T)
            # Skip samples whose answer word is not inside the (truncated) context,
            # or sits on the last position (no room for the fake end index).
            if tw[0] not in qtw:
                continue
            else:
                p1idx = qtw.tolist().index(tw[0])
                if p1idx == (T - 1):
                    continue
            qtc = split_char(qtc, word_length, T)
            ttw = split_word(ttw, J)
            ttc = split_char(ttc, word_length, J)
            p1, p2 = [0] * T, [0] * T
            p1[p1idx] = 1
            p2[p1idx + 1] = 1
            p1 = np.array(p1).astype(np.int32)
            p2 = np.array(p2).astype(np.int32)
            c_word_batch[start_idx] = qtw
            c_char_batch[start_idx] = qtc
            q_word_batch[start_idx] = ttw
            q_char_batch[start_idx] = ttc
            p1_fake_batch[start_idx] = p1
            p2_fake_batch[start_idx] = p2
            start_idx += 1
            if start_idx == batch_num:
                times += 1
                if times == 1e10:
                    return
                yield (c_word_batch, c_char_batch, q_word_batch, q_char_batch,
                       p1_fake_batch, p2_fake_batch)
                start_idx = 0
                c_word_batch = np.zeros(shape=[batch_num, T]).astype(np.int32)
                c_char_batch = np.zeros(shape=[batch_num, T, word_length]).astype(np.int32)
                q_word_batch = np.zeros(shape=[batch_num, J]).astype(np.int32)
                q_char_batch = np.zeros(shape=[batch_num, J, word_length]).astype(np.int32)
                p1_fake_batch = np.zeros(shape=[batch_num, T]).astype(np.int32)
                p2_fake_batch = np.zeros(shape=[batch_num, T]).astype(np.int32)
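A minimal usage sketch of the generator (assuming cqtn.txt exists and holds at least one full batch): pull one batch and inspect the shapes.

# Expected shapes: (64, 100), (64, 100, 10), (64, 10), (64, 10, 10), (64, 100), (64, 100)
gen = data_generator(gen_type="n", batch_num=64, T=100, J=10, word_length=10)
c_word, c_char, q_word, q_char, p1, p2 = next(gen)
for name, arr in zip(["c_word", "c_char", "q_word", "q_char", "p1", "p2"],
                     [c_word, c_char, q_word, q_char, p1, p2]):
    print(name, arr.shape)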
Model construction:
class BIDAF(object):
    '''
    char_embed_size: embed size for a single character
    char_size: number of distinct characters (like [a-zA-Z...])
    word_length: max length of a single word
    '''
    def __init__(self, char_embed_size=10, char_size=padding_char_idx + 1,
                 word_length=10, word_size=len(word2idx), word_embed_size=50,
                 T=200, J=20, batch_num=64):
        self.char_embed_size = char_embed_size
        self.char_size = char_size
        self.word_length = word_length
        self.word_size = word_size
        self.word_embed_size = word_embed_size
        self.loss = None
        self.p1_accuracy = None
        self.p2_accuracy = None
        self.accuracy = None
        self.batch_num = batch_num

        with tf.device('/cpu:0'), tf.name_scope("char_embedding"):
            self.char_W = tf.Variable(
                tf.random_uniform([self.char_size, self.char_embed_size], -1.0, 1.0),
                name="char_W")
        with tf.device('/cpu:0'), tf.name_scope("word_embedding"):
            # this layer may be replaced by w2v or glove in the future
            self.word_W = tf.Variable(
                tf.random_uniform([self.word_size, self.word_embed_size], -1.0, 1.0),
                name="word_W")

        self.word_length = word_length
        self.T = T
        self.J = J

        self.c_char = tf.placeholder(dtype=tf.int32, shape=[None, T, word_length], name="c_char")
        self.c_word = tf.placeholder(dtype=tf.int32, shape=[None, T], name="c_word")
        self.q_char = tf.placeholder(dtype=tf.int32, shape=[None, J, word_length], name="q_char")
        self.q_word = tf.placeholder(dtype=tf.int32, shape=[None, J], name="q_word")
        self.p1_seq = tf.placeholder(dtype=tf.int32, shape=[None, T], name="p1_seq")
        self.p2_seq = tf.placeholder(dtype=tf.int32, shape=[None, T], name="p2_seq")

        self.model_construct()
        self.opt_construct()

    def model_construct(self):
        # Context embedding: char-CNN features concatenated with word embeddings
        self.c_char_embed_flat = self.char_embed_layer(tf.reshape(self.c_char, [-1, self.word_length]))
        self.c_char_embed = tf.reshape(self.c_char_embed_flat,
                                       [-1, self.T, int(self.c_char_embed_flat.get_shape()[-1])],
                                       name="c_char_embed")
        self.c_word_embed = self.word_embed_layer(self.c_word)
        self.c_embed = tf.concat([self.c_char_embed, self.c_word_embed], axis=-1, name="c_embed")
        c_embed_last_dim = int(self.c_embed.get_shape()[-1])

        # Query embedding
        self.q_char_embed_flat = self.char_embed_layer(tf.reshape(self.q_char, [-1, self.word_length]))
        self.q_char_embed = tf.reshape(self.q_char_embed_flat,
                                       [-1, self.J, int(self.q_char_embed_flat.get_shape()[-1])],
                                       name="q_char_embed")
        self.q_word_embed = self.word_embed_layer(self.q_word)
        self.q_embed = tf.concat([self.q_char_embed, self.q_word_embed], axis=-1, name="q_embed")
        q_embed_last_dim = int(self.q_embed.get_shape()[-1])

        # Highway layer, weights shared between context and query
        with tf.variable_scope("high_way_layer") as scope:
            self.X = tf.reshape(self.high_way_layer(tf.reshape(self.c_embed, [-1, c_embed_last_dim])),
                                [-1, self.T, c_embed_last_dim])
            scope.reuse_variables()
            self.Q = tf.reshape(self.high_way_layer(tf.reshape(self.q_embed, [-1, q_embed_last_dim])),
                                [-1, self.J, q_embed_last_dim])

        # First bi-LSTM layer (contextual embedding), weights shared as well
        with tf.variable_scope("first_lstm_layer") as scope:
            self.H = self.first_bilstm_layer(self.X)
            scope.reuse_variables()
            self.U = self.first_bilstm_layer(self.Q)

        # Similarity layer, [batch_num, T, J]
        self.S = self.similarity_layer(self.H, self.U)

        # Context-to-query attention
        self.A = tf.nn.softmax(self.S, dim=-1, name="A")
        d = int(self.U.get_shape()[-1]) / 2
        self.U_bar_list = []
        for i in range(self.batch_num):
            A = tf.squeeze(tf.slice(self.A, [i, 0, 0], [1, -1, -1]))
            U = tf.squeeze(tf.slice(self.U, [i, 0, 0], [1, -1, -1]))
            self.U_bar_list.append(tf.expand_dims(tf.matmul(A, U), 0))
        self.U_bar = tf.concat(self.U_bar_list, axis=0)

        # Query-to-context attention
        self.b = tf.nn.softmax(tf.reduce_max(self.S, axis=-1), dim=-1, name="b")
        self.h_bar_list = []
        for i in range(self.batch_num):
            b = tf.slice(self.b, [i, 0], [1, -1])
            H = tf.squeeze(tf.slice(self.H, [i, 0, 0], [1, -1, -1]))
            self.h_bar_list.append(tf.matmul(b, H))
        self.h_bar = tf.concat(self.h_bar_list, axis=1)
        self.H_bar = tf.reshape(tf.tile(self.h_bar, [1, self.T]), [-1, self.T, int(2 * d)])

        # G layer
        self.G = self.G_layer(self.H, self.H_bar, self.U_bar)

        # Second bi-LSTM layer
        self.M = self.second_bilstm_layer(self.G)

        # Third bi-LSTM layer
        self.M2 = self.third_bilstm_layer(self.M)

        # p1 layer (start-index logits)
        GM = tf.concat([self.G, self.M], axis=-1)
        Pw1 = tf.get_variable(
            "Pw1", shape=[10 * d, 1],
            initializer=tf.contrib.layers.xavier_initializer())
        self.p1 = tf.reshape(tf.matmul(tf.reshape(GM, [-1, int(10 * d)]), Pw1), [-1, self.T], name="p1")

        # p2 layer (end-index logits)
        GM2 = tf.concat([self.G, self.M2], axis=-1)
        Pw2 = tf.get_variable(
            "Pw2", shape=[10 * d, 1],
            initializer=tf.contrib.layers.xavier_initializer())
        self.p2 = tf.reshape(tf.matmul(tf.reshape(GM2, [-1, int(10 * d)]), Pw2), [-1, self.T], name="p2")

    def opt_construct(self, use_single_p=True):
        self.softmax_p1 = tf.nn.softmax(self.p1)
        self.softmax_p2 = tf.nn.softmax(self.p2)
        self.p1_labels = tf.cast(self.p1_seq, tf.float32)
        self.p2_labels = tf.cast(self.p2_seq, tf.float32)
        p1_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.p1, labels=self.p1_labels),
            name="p1_loss")
        p2_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.p2, labels=self.p2_labels),
            name="p2_loss")
        if use_single_p:
            self.loss = p1_loss
        else:
            self.loss = p1_loss + p2_loss

        self.opt = tf.train.AdamOptimizer(learning_rate=0.001)
        self.train_op = self.opt.minimize(self.loss)

        self.pred_p1 = tf.argmax(self.softmax_p1, 1, name="pred_p1")
        self.pred_p2 = tf.argmax(self.softmax_p2, 1, name="pred_p2")

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_pred_p1 = tf.equal(self.pred_p1, tf.argmax(self.p1_labels, 1))
            correct_pred_p2 = tf.equal(self.pred_p2, tf.argmax(self.p2_labels, 1))
            correct_pred = tf.multiply(tf.cast(correct_pred_p1, tf.float32),
                                       tf.cast(correct_pred_p2, tf.float32))
            if use_single_p:
                self.p1_accuracy = self.p2_accuracy = self.accuracy = \
                    tf.reduce_mean(tf.cast(correct_pred_p1, "float"), name="p1_accuracy")
            else:
                self.p1_accuracy = tf.reduce_mean(tf.cast(correct_pred_p1, "float"), name="p1_accuracy")
                self.p2_accuracy = tf.reduce_mean(tf.cast(correct_pred_p2, "float"), name="p2_accuracy")
                self.accuracy = tf.reduce_mean(correct_pred, name="accuracy")

    def char_embed_layer(self, input_char, num_filters=3, filter_size=2):
        with tf.name_scope("char_embedding"):
            embedded_chars = tf.nn.embedding_lookup(self.char_W, input_char)
            embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
            filter_shape = [filter_size, self.char_embed_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv = tf.nn.conv2d(
                embedded_chars_expanded,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Maxpooling over the outputs
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, self.word_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_shape = pooled.get_shape()
            total_size = None
            for idx in list(range(len(pooled_shape)))[1:]:
                if total_size is None:
                    total_size = int(pooled_shape[-1 * idx])
                else:
                    total_size *= int(pooled_shape[-1 * idx])
            print("total size of char_embed_layer: {}".format(total_size))
            return tf.reshape(pooled, [-1, int(total_size)])

    def word_embed_layer(self, input_word):
        with tf.name_scope("word_embedding"):
            embedded_word = tf.nn.embedding_lookup(self.word_W, input_word)
            return embedded_word

    def high_way_layer(self, input):
        input_shape = input.get_shape()
        last_dim = int(input_shape[-1])
        HW = tf.get_variable(
            "HW", shape=[last_dim, last_dim],
            initializer=tf.contrib.layers.xavier_initializer())
        Hb = tf.get_variable(
            shape=[last_dim], name="Hb",
            initializer=tf.contrib.layers.xavier_initializer())
        H = tf.nn.xw_plus_b(input, HW, Hb)
        TW = tf.get_variable(
            "TW", shape=[last_dim, last_dim],
            initializer=tf.contrib.layers.xavier_initializer())
        Tb = tf.get_variable(
            shape=[last_dim], name="Tb",
            initializer=tf.contrib.layers.xavier_initializer())
        T = tf.nn.xw_plus_b(input, TW, Tb)
        return H * T + input * (1 - T)

    def first_bilstm_layer(self, input):
        input_shape = input.get_shape()
        d = int(input_shape[-1])
        fw_cell = rnn.BasicLSTMCell(d, forget_bias=1., state_is_tuple=True,
                                    reuse=tf.get_variable_scope().reuse)
        bw_cell = rnn.BasicLSTMCell(d, forget_bias=1., state_is_tuple=True,
                                    reuse=tf.get_variable_scope().reuse)
        rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, input,
            scope='first-bi-lstm', dtype=tf.float32)
        return tf.concat(rnn_outputs, axis=2, name='first_bilstm_output')

    def similarity_layer(self, H, U):
        d = int(H.get_shape()[-1]) / 2
        h_dim = int(H.get_shape()[-2])
        u_dim = int(U.get_shape()[-2])
        print("similarity layer h_dim: {}, u_dim: {}".format(h_dim, u_dim))
        Sw = tf.get_variable(
            "Sw", shape=[6 * d, 1],
            initializer=tf.contrib.layers.xavier_initializer())
        H = tf.reshape(tf.transpose(H, [0, 2, 1]), [-1, h_dim])
        U = tf.reshape(tf.transpose(U, [0, 2, 1]), [-1, u_dim])
        fH = tf.tile(H, [u_dim, 1])
        fU = tf.tile(tf.expand_dims(tf.concat(tf.unstack(U, axis=-1), axis=0), -1), [1, h_dim])
        fHU = fH * fU
        fH = tf.reshape(fH, [-1, u_dim, int(2 * d), h_dim])
        fU = tf.reshape(fU, [-1, u_dim, int(2 * d), h_dim])
        fHU = tf.reshape(fHU, [-1, u_dim, int(2 * d), h_dim])
        f = tf.concat([fH, fU, fHU], axis=2)
        f = tf.reshape(tf.transpose(f, [0, 1, 3, 2]), [-1, int(6 * d)])
        # [batch_num, T, J]
        return tf.transpose(tf.reshape(tf.squeeze(tf.matmul(f, Sw)), [-1, u_dim, h_dim]), [0, 2, 1])

    def G_layer(self, H, H_bar, U_bar):
        d = int(H.get_shape()[-1]) / 2

        def g(h, h_bar, u_bar):
            return tf.concat([h, u_bar, h * u_bar, h * h_bar], axis=-1)

        h_list = tf.unstack(H, axis=1, name="h_list")
        h_bar_list = tf.unstack(H_bar, axis=1, name="h_bar_list")
        u_bar_list = tf.unstack(U_bar, axis=1, name="u_bar_list")
        g_list = []
        for idx in range(len(h_list)):
            h = h_list[idx]
            h_bar = h_bar_list[idx]
            u_bar = u_bar_list[idx]
            g_ele = g(h, h_bar, u_bar)
            g_list.append(g_ele)
        return tf.transpose(tf.reshape(tf.concat(g_list, axis=-1), [-1, int(8 * d), self.T]),
                            [0, 2, 1], name="G")

    def second_bilstm_layer(self, input):
        input_shape = input.get_shape()
        d = int(int(input_shape[-1]) / 8)
        with tf.name_scope("second_lstm_layer"):
            fw_cell = rnn.BasicLSTMCell(d, forget_bias=1., state_is_tuple=True)
            bw_cell = rnn.BasicLSTMCell(d, forget_bias=1., state_is_tuple=True)
            rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, input,
                scope='second-bi-lstm', dtype=tf.float32)
            return tf.concat(rnn_outputs, axis=2, name='second_bilstm_output')

    def third_bilstm_layer(self, input):
        input_shape = input.get_shape()
        d = int(int(input_shape[-1]) / 2)
        with tf.name_scope("third_lstm_layer"):
            fw_cell = rnn.BasicLSTMCell(d, forget_bias=1., state_is_tuple=True)
            bw_cell = rnn.BasicLSTMCell(d, forget_bias=1., state_is_tuple=True)
            rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, input,
                scope='third-bi-lstm', dtype=tf.float32)
            return tf.concat(rnn_outputs, axis=2, name='third_bilstm_output')

    @staticmethod
    def train():
        from time import time
        # T and J must match the data_generator defaults (T=100, J=10),
        # otherwise the placeholder shapes will not match the fed batches.
        bidaf_ext = BIDAF(T=100, J=10)
        print("model construct end :")
        tg = data_generator(gen_type="n")
        ttg = data_generator(gen_type="t")
        num_epoch = 100
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for now_epoch in range(num_epoch):
                step = 0
                while True:
                    try:
                        c_word, c_char, q_word, q_char, p1_fake, p2_fake = tg.__next__()
                    except:
                        tg = data_generator(gen_type="n")
                        ttg = data_generator(gen_type="t")
                        print("epoch {} end".format(now_epoch))
                        break
                    _, \
                    loss, \
                    p1_accuracy, \
                    p2_accuracy, \
                    accuracy, \
                    H, U_bar, H_bar \
                        = sess.run([
                            bidaf_ext.train_op,
                            bidaf_ext.loss,
                            bidaf_ext.p1_accuracy,
                            bidaf_ext.p2_accuracy,
                            bidaf_ext.accuracy,
                            bidaf_ext.H, bidaf_ext.U_bar, bidaf_ext.H_bar,
                        ],
                            feed_dict={
                                bidaf_ext.c_word: c_word,
                                bidaf_ext.c_char: c_char,
                                bidaf_ext.q_word: q_word,
                                bidaf_ext.q_char: q_char,
                                bidaf_ext.p1_seq: p1_fake,
                                bidaf_ext.p2_seq: p2_fake
                            })
                    if step % 10 == 0:
                        print("train loss :{} p1_accuracy :{} p2_accuracy :{} accuracy :{}".
                              format(loss, p1_accuracy, p2_accuracy, accuracy))
                    if step % 100 == 0:
                        try:
                            c_word, c_char, q_word, q_char, p1_fake, p2_fake = ttg.__next__()
                        except:
                            ttg = data_generator(gen_type="t")
                            c_word, c_char, q_word, q_char, p1_fake, p2_fake = ttg.__next__()
                        loss, \
                        p1_accuracy, \
                        p2_accuracy, \
                        accuracy = sess.run([
                            bidaf_ext.loss,
                            bidaf_ext.p1_accuracy,
                            bidaf_ext.p2_accuracy,
                            bidaf_ext.accuracy,
                        ],
                            feed_dict={
                                bidaf_ext.c_word: c_word,
                                bidaf_ext.c_char: c_char,
                                bidaf_ext.q_word: q_word,
                                bidaf_ext.q_char: q_char,
                                bidaf_ext.p1_seq: p1_fake,
                                bidaf_ext.p2_seq: p2_fake
                            })
                        print("test loss :{} p1_accuracy :{} p2_accuracy :{} accuracy :{}".
                              format(loss, p1_accuracy, p2_accuracy, accuracy))
                    step += 1
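One possible entry point: build the graph and start training via the static method above. Inside the training session, bidaf_ext.pred_p1 (and pred_p2 when use_single_p is False) holds the predicted start (end) index for each sample in the fed batch.

if __name__ == "__main__":
    BIDAF.train()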
Because longer sequences have a large impact on performance, relatively small values of T and J are used here.
use_single_p controls whether the end index is taken into account as well.
Test-set accuracy over the epochs:
test loss :0.6456390023231506 p1_accuracy :0.017999999225139618
test loss :0.050495319068431854 p1_accuracy :0.057999998331069946
epoch 0 end
test loss :0.0502440445125103 p1_accuracy :0.03999999910593033
test loss :0.05010032281279564 p1_accuracy :0.06199999898672104
epoch 1 end
test loss :0.05005083978176117 p1_accuracy :0.035999998450279236
test loss :0.04990968108177185 p1_accuracy :0.06599999964237213
epoch 2 end
test loss :0.04992210119962692 p1_accuracy :0.04399999976158142
test loss :0.04977176710963249 p1_accuracy :0.07199999690055847
epoch 3 end
test loss :0.04978775978088379 p1_accuracy :0.03400000184774399
test loss :0.049564428627491 p1_accuracy :0.07199999690055847
epoch 4 end
test loss :0.049547359347343445 p1_accuracy :0.057999998331069946
test loss :0.04935314133763313 p1_accuracy :0.09799999743700027
epoch 5 end
test loss :0.04918454959988594 p1_accuracy :0.06800000369548798
test loss :0.04861310124397278 p1_accuracy :0.14000000059604645
epoch 6 end
test loss :0.04811424762010574 p1_accuracy :0.10999999940395355
test loss :0.04380807653069496 p1_accuracy :0.22200000286102295
epoch 7 end
test loss :0.040549907833337784 p1_accuracy :0.24799999594688416
test loss :0.034186024218797684 p1_accuracy :0.414000004529953
epoch 8 end
test loss :0.032501496374607086 p1_accuracy :0.43799999356269836
test loss :0.02821703627705574 p1_accuracy :0.5040000081062317
epoch 9 end
test loss :0.02707063965499401 p1_accuracy :0.5440000295639038
test loss :0.024162035435438156 p1_accuracy :0.5879999995231628
epoch 10 end
test loss :0.02386888675391674 p1_accuracy :0.5979999899864197
test loss :0.021960947662591934 p1_accuracy :0.6320000290870667
epoch 11 end
test loss :0.022267578169703484 p1_accuracy :0.6240000128746033
test loss :0.0203940998762846 p1_accuracy :0.6620000004768372
epoch 12 end
test loss :0.021203691139817238 p1_accuracy :0.6380000114440918
test loss :0.019260596483945847 p1_accuracy :0.6779999732971191
epoch 13 end
test loss :0.020675132051110268 p1_accuracy :0.6499999761581421
test loss :0.018571924418210983 p1_accuracy :0.6859999895095825
epoch 14 end
test loss :0.020104004070162773 p1_accuracy :0.6639999747276306
test loss :0.01809442788362503 p1_accuracy :0.6940000057220459
epoch 15 end
test loss :0.019806142896413803 p1_accuracy :0.6620000004768372
test loss :0.018006963655352592 p1_accuracy :0.6980000138282776
epoch 16 end
test loss :0.019820837303996086 p1_accuracy :0.6639999747276306
test loss :0.018198983743786812 p1_accuracy :0.6959999799728394
epoch 17 end
test loss :0.01980511285364628 p1_accuracy :0.6679999828338623
test loss :0.018297001719474792 p1_accuracy :0.6919999718666077
epoch 18 end
test loss :0.0200297012925148 p1_accuracy :0.6639999747276306
test loss :0.0184195376932621 p1_accuracy :0.699999988079071
epoch 19 end
test loss :0.020458657294511795 p1_accuracy :0.6620000004768372
test loss :0.018728474155068398 p1_accuracy :0.699999988079071
epoch 20 end
test loss :0.020740188658237457 p1_accuracy :0.6620000004768372
test loss :0.01889195665717125 p1_accuracy :0.7039999961853027
epoch 21 end
test loss :0.021275488659739494 p1_accuracy :0.6660000085830688
test loss :0.019528238102793694 p1_accuracy :0.6959999799728394
epoch 22 end
test loss :0.021669382229447365 p1_accuracy :0.671999990940094
test loss :0.019826726987957954 p1_accuracy :0.6819999814033508
epoch 23 end
test loss :0.022092627361416817 p1_accuracy :0.656000018119812
test loss :0.020767534151673317 p1_accuracy :0.6819999814033508
epoch 24 end
test loss :0.022723300382494926 p1_accuracy :0.6579999923706055
test loss :0.021448861807584763 p1_accuracy :0.6800000071525574
epoch 25 end
test loss :0.023822899907827377 p1_accuracy :0.628000020980835
test loss :0.0220272745937109 p1_accuracy :0.671999990940094
epoch 26 end
test loss :0.02495754137635231 p1_accuracy :0.6299999952316284