Machine Learning: A TensorFlow Rewrite of the Original DeepQA-1 Example for the Insurance-Industry Open QA Dataset

        First of all, thanks to the author of https://github.com/chatopera/insuranceqa-corpus-zh for the hard work of building a Chinese corpus for the insurance industry and for providing a training and testing example program. It solved an urgent problem for many people; you could call it help delivered exactly when it was needed.

        The previous articles annotated the original example in detail, but that example is not based on TensorFlow, and it only reports loss and accuracy figures; it provides no way to demonstrate results to others (such as users or customers), for example by converting the ids in each test sentence back into words and separating the question from the reply, so that whether a question and its reply match is clear at a glance. To address these two shortcomings, I spent some time and effort rewriting the original example. The code is posted below for everyone to share.

        The source code follows (based on Anaconda):

import os
import sys
import numpy as np
import tensorflow as tf

import deep_qa_1.data as corpus

import visual.loss as visual_loss
import visual.accuracy as visual_acc

# __file__ is undefined in notebooks; fall back to the current working directory
try:
    curdir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    curdir = os.getcwd()
sys.path.insert(0, os.path.dirname(curdir))
print(curdir)
print(sys.path)

input_layer_size = 0
output_layer_size = 0
layers = []
layers_num = 0
epoch = 0
learning_rate = 0.0
batch_size = 0
eval_every_N_steps = 0

model_dir = "model/"

def init(hidden_layers = [100, 50], 
         question_max_length = 20, 
         utterance_max_length = 99, 
         lr = 0.001, 
         ep = 10, 
         batchsize = 100,
         eens = 500):
    '''
    Configure the neural network for the question-answering model.
    '''
    global input_layer_size
    global output_layer_size
    global layers
    global layers_num
    global epoch
    global learning_rate
    global batch_size
    global eval_every_N_steps
    
    input_layer_size = question_max_length + utterance_max_length + 1 # 1 is for <GO>, 20+99+1=120
    output_layer_size = 2 # just the same shape as labels
    layers = [input_layer_size] + hidden_layers + [output_layer_size] # [2] is for output layer
    layers_num = len(layers)
    epoch = ep
    learning_rate = lr
    batch_size = batchsize
    eval_every_N_steps = eens

#init(ep=10, lr=0.0001, eens=200)
#init(ep=50, lr=0.0001, eens=200)
init(ep=500, lr=0.0001, eens=200)
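
'''
With the defaults above, the network shape works out to
[input 120] -> [hidden 100] -> [hidden 50] -> [output 2],
i.e. layers == [120, 100, 50, 2]; the sizes hard-coded in the layer
definitions further down must match it. A quick sanity check:
'''
assert layers == [120, 100, 50, 2]
assert layers_num == 4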

# test dataset
test_data = corpus.load_test()

# input data (question + separator + reply)
x_data = tf.placeholder(dtype=tf.float32, shape=[None, input_layer_size], name="input_data")
# target labels
y_target = tf.placeholder(dtype=tf.float32, shape=[None, output_layer_size], name="output_label")


''' 
add_layer builds one fully connected layer and returns its output.
Its arguments are the input tensor inputs, the input dimension in_size,
the output dimension out_size, and an optional activation function
activation_function (linear if None).
Inside the function, the layer's weights and biases are initialized
explicitly; the input is multiplied by the weights, the bias is added,
and the activation function is applied if one was given.
'''  
def add_layer(inputs, in_size, out_size, activation_function=None):
    loc_w = tf.Variable(tf.random_normal([in_size, out_size]))
    loc_b = tf.Variable(tf.zeros([1, out_size]) + 0.1)  # a small nonzero initial bias value is commonly recommended
    #loc_y = tf.matmul(inputs, loc_w) + loc_b
    loc_y = tf.add(tf.matmul(inputs, loc_w), loc_b)
    if activation_function is None:
        loc_outputs = loc_y
    else:
        loc_outputs = activation_function(loc_y)
    return loc_outputs
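
'''
Note: add_layer is defined above but not used below. The three layers
are written out long-hand so that every Variable carries an explicit
name (w_1, b_1, ...) for the name-keyed tf.train.Saver further down.
For reference, a minimal sketch of the same network built with
add_layer (its Variables are unnamed, so it would not work with the
variables_dict saver used in this program):

    h1 = add_layer(x_data, 120, 100, activation_function=tf.sigmoid)
    h2 = add_layer(h1, 100, 50, activation_function=tf.sigmoid)
    logits = add_layer(h2, 50, 2)  # linear output layer
'''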

"""
定义feed_dict。
该函数先判断训练标记,如果训练标记为true,则从训练集中获取一个batch的样本; 
如果训练标记为false,则获取测试集数据。
"""
def feed_dict(train):
    xs = []
    ys = []
    if train:
        for mini_batch in corpus.load_train():
            for x,y_ in mini_batch:
                xs.append(x)
                ys.append(y_)
    else:
        for (x,y_) in test_data:
            xs.append(x)
            ys.append(y_)

    return {x_data: xs, y_target: ys}
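
'''
Note: feed_dict(True) flattens every mini-batch from corpus.load_train()
into one single feed, so each sess.run(train_step, ...) below is in
effect full-batch gradient descent, and batch_size / eval_every_N_steps
go unused. A minimal per-mini-batch sketch, assuming load_train()
yields lists of (x, y_) pairs as the loop above does:

    def run_one_epoch(sess, train_op):
        for mini_batch in corpus.load_train():
            xs = [x for x, _ in mini_batch]
            ys = [y_ for _, y_ in mini_batch]
            sess.run(train_op, feed_dict={x_data: xs, y_target: ys})
'''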

""" 
使用add_layer创建隐藏层1,输入维度是一条问答语句的尺寸(99+1+20=120),输出的维度是隐藏节点数100。 
"""  
#hidden_layer1 = add_layer(x_data, 120, 100, activation_function=tf.sigmoid)
w_1 = tf.Variable(tf.random_normal([120, 100]), name="w_1")
b_1 = tf.Variable(tf.zeros([1, 100]) + 0.1, name="b_1")
hidden_output1 = tf.sigmoid(tf.add(tf.matmul(x_data, w_1), b_1))

""" 
使用add_layer创建隐藏层2,输入维度是隐含层1的尺寸(100),输出的维度是隐藏节点数50。 
"""  
#hidden_layer2 = add_layer(hidden_layer1, 100, 50, activation_function=tf.sigmoid)
w_2 = tf.Variable(tf.random_normal([100, 50]), name="w_2")
b_2 = tf.Variable(tf.zeros([1, 50]) + 0.1, name="b_2")
hidden_output2 = tf.sigmoid(tf.add(tf.matmul(hidden_output1, w_2), b_2))

""" 
使用add_layer创建输出层,输入维度是隐含层2的尺寸(50),输出的维度是输出层节点数2。 
"""  
#output = add_layer(hidden_layer2, 50, 2)
w_output = tf.Variable(tf.random_normal([50, 2]), name="w_out")
b_output = tf.Variable(tf.zeros([1, 2]) + 0.1, name="b_out")
output = tf.add(tf.matmul(hidden_output2, w_output), b_output)

variables_dict = {'w_out': w_output,  'w_2': w_2, 'w_1': w_1, 'b_out': b_output, 'b_2': b_2, 'b_1': b_1}

""" 
使用tf.nn.sigmoid_cross_entropy_with_logits()对前面输出层的结果进行sigmoid处理并计算交叉熵损失cross_entropy。 
"""  
diff = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_target, logits=output)
cross_entropy = tf.reduce_mean(diff)
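
'''
Note: y_target is a one-hot label over two mutually exclusive classes
(question/reply match vs. mismatch), so softmax cross-entropy is the
more conventional choice here; the sigmoid form above treats the two
output units as independent binary labels. An equivalent softmax
version, if preferred:

    diff = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_target, logits=output)
    cross_entropy = tf.reduce_mean(diff)
'''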

""" 
使用Adma优化器对损失进行优化。
"""  
optimizer = tf.train.AdamOptimizer(learning_rate);
train_step = optimizer.minimize(cross_entropy);
"""
统计预测正确的样本数并计算正确率accuray。
"""
correct_prediction = tf.equal(tf.argmax(output, 1), tf.arg_max(y_target, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))  

#init_op = tf.initialize_all_variables() #deprecated
init_op = tf.global_variables_initializer()

""" 
使用tf.train.Saver()创建模型的保存器。 
"""
saver = tf.train.Saver(variables_dict)

# Set is_train to True to train and save the model; False to restore and evaluate.
#is_train = True
is_train = False

with tf.Session() as sess:
    sess.run(init_op)
    if is_train:
        for i in range(epoch):
            sess.run(train_step, feed_dict=feed_dict(True))
            print("i is: %d" %i)
            if (i+1) % 50 == 0:
                print("saving model parameters")
                # save the model to ./model/model.ckpt-<global_step>
                saver.save(sess, model_dir+"model.ckpt", global_step=i)
        print("train finished")
    else:
        #ckpt = tf.train.get_checkpoint_state(model_dir)
        #if ckpt and ckpt.model_checkpoint_path:
            #saver.restore(sess, ckpt.model_checkpoint_path)
        model_file = tf.train.latest_checkpoint(model_dir)
        print("model_file is: %s" %model_file)
        saver.restore(sess, model_file)
        val_acc = sess.run(accuracy, feed_dict=feed_dict(False))
        #print('val_loss:%f, val_acc:%f'%(val_loss,val_acc))
        print('val_acc: %f' %(val_acc))
        # convert each question from ids back to Chinese words, and do the same for the matching reply
        xs = []
        ys = []
        print("len of test_data is")
        print(len(test_data))
        for (x, y_) in test_data:
            xs.append(x)
            ys.append(y_)
        #print(xs[0:1])
        for sentence in xs[0:2]:
            for word in sentence:
                #print(word)
                # ids 24998 and 24999 are special marker ids in this corpus
                # (presumably the separator/padding tokens; see deep_qa_1.data)
                if word == 24998:
                    print("", end="")
                elif word == 24999:
                    print("?")
                else:
                    print(corpus.vocab_data["id2word"][str(word)], end="")
            print("\n")

Reposted from blog.csdn.net/phmatthaus/article/details/106753975