Tensorflow之基于LSTM神经网络写唐诗

最近看了不少关于写诗的博客，在前人的基础上做了一些小的改动，因比较喜欢一次输入很长的开头句，所以让机器人输出压缩为一个开头字生成两个诗句，写五言和七言诗，当然如果你想写更长的诗句是可以继续改动的。

在输入做了些改动，去除误输入的标点符号，例如输入下面词句：

怒发冲冠，凭栏处，潇潇雨歇。抬望眼，仰天长啸，壮怀激烈。

机器人写出如下：

怒漠多无度袍小，巡管山明恰见偷。                              
发杵共鸿莼散暮，家山曾住上阳台。
冲钩麻衣隐步障，楼舟复别赤轮楼。
冠盘一线倾中令，音信长思两足阴。
凭栏十字送月沈，莫待长筵韦与兵。
栏湿地闲鱼脚吠，匣中虚伴虎前羊。
处向梅香千万里，石城心中明碧簟。
潇湘夜塘独思不，几处深笼不尽迟。
潇湘十二海云天，五月今留报主人。
雨后青青沧练过，月明南渡雁来新。
歇鞭千里知贤哭，清镜草浮麋没繁。
抬吾谢安犹带减，醉来赤此不才齐。
望台已待隋家咏，楼上不能犹扫成。
眼看药炉香岭上，比惟无言旧青春。
仰归安得衰无事，三笑出师湘水春。
天初碧玉衣襟淡，国药满川霞彩寒。
长叹榆关家已远，从来父子屈襄郎。
啸花青石速望尽，宛逐汀洲随并年。
壮时天下还如旧，生计孤吟去杀无。
怀哉却寄终拘束，莫道人来有也才。
激眼剑旗喧并髻，新菰麦落破门骄。
烈灵不识槛西间，芳草青天有五禽。

怒搜温液切，若近太阳香。
发欲奔宾影，争得频人怜。
冲腾临缺曙，谢豹出红残。
冠盖若移在，想得绛皇皇。
凭高晋家雨，才不寄黄金。
栏落临巨浸，根孔恨仙桃。
处世愿越游，飘扬共行之。
潇湘南北洞，蜀国湘江湄。
潇湘弦管绝，上月洞庭时。
雨历道中朔，不起列仙风。
歇毂须江道，家歌住忽依。
抬山弄弓寞，装束岛霞裙。
望闻拜天子，幸有窦金狐。
眼看尽无些，香雨不兴天。
仰阮子不笔，不敢相思逸。
天与十二胆，彩笔取时七。
长安陇波归，银没乌方地。
啸作胡人船，大作康庄匠。
壮何时七两，隐括为匡庐。
怀君青纪肥，常带牺胆圣。
激逐鸱子子，人来儆此处。
烈太仓毛黄，家长受德吉。

代码如下：

main.py

import collections
import os
import sys
import re
import numpy as np
import tensorflow as tf
from model import rnn_model
from poems import process_poems, generate_batch

os.environ['TF_CPP_MIN_LOG_LEVEL']='2'


tf.flags.DEFINE_integer('batch_size', 64, 'batch size.')
tf.flags.DEFINE_float('learning_rate', 0.01, 'learning rate.')
# set this to 'main.py' relative path
tf.flags.DEFINE_string('checkpoints_dir', './checkpoints/', 'checkpoints save path.')
tf.flags.DEFINE_string('file_path', './data/poetry.txt', 'file name of poems.')

tf.flags.DEFINE_integer('epochs', 50, 'train how many epochs.')

FLAGS = tf.flags.FLAGS

start_token = 'G'
end_token = 'E'

#开始训练
def run_training():
    if not os.path.exists(os.path.dirname(FLAGS.checkpoints_dir)):
        os.mkdir(os.path.dirname(FLAGS.checkpoints_dir))
    if not os.path.exists(FLAGS.checkpoints_dir):
        os.mkdir(FLAGS.checkpoints_dir)
    # 单词转化的数字:向量，单词和数字一一对应的字典，单词
    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)
    # 真实值和目标值
    batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)
    # 数据占位符
    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets, vocab_size=len(
        vocabularies), rnn_size=128, num_layers=2, batch_size=64, learning_rate=FLAGS.learning_rate)
    # 实例化保存模型
    saver = tf.train.Saver(tf.global_variables())
    # 全局变量进行初始化
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess=sess)
        # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        # 先执行，全局变量初始化
        sess.run(init_op)

        start_epoch = 0
        # 把之前训练过的checkpoint拿出来
        checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoints_dir)
        if checkpoint:
            # 拿出训练保存模型
            saver.restore(sess, checkpoint)
            print("[INFO] restore from the checkpoint {0}".format(checkpoint))
            start_epoch += int(checkpoint.split('-')[-1])
        print('[INFO] start training...')
        try:
            for epoch in range(start_epoch, FLAGS.epochs):
                n = 0
                # 多少行唐诗//每次训练的个数
                n_chunk = len(poems_vector) // FLAGS.batch_size
                for batch in range(n_chunk):
                    loss, _, _ = sess.run([
                        end_points['total_loss'],  # 损失
                        end_points['last_state'],  # 最后一次输出
                        end_points['train_op']  # 训练优化损失
                    ], feed_dict={input_data: batches_inputs[n], output_targets: batches_outputs[n]})
                    n += 1
                    print('[INFO] Epoch: %d , batch: %d , training loss: %.6f' % (epoch, batch, loss))
                if epoch % 6 == 0:  # 每隔多少次保存
                    saver.save(sess, FLAGS.checkpoints_dir, global_step=epoch)
        except KeyboardInterrupt:
            print('[INFO] Interrupt manually, try saving checkpoint for now...')
            saver.save(sess, FLAGS.checkpoints_dir, global_step=epoch)
            print('[INFO] Last epoch were saved, next time will start from epoch {}.'.format(epoch))


def to_word(predict, vocabs):
    t = np.cumsum(predict)
    s = np.sum(predict)
    # searchsorted 在前面查找后面的
    sample = int(np.searchsorted(t, np.random.rand(1) * s))
    # sample = np.argmax(predict)
    if sample > len(vocabs):
        sample = len(vocabs) - 1
    return vocabs[sample]

#调用模型生成诗句
def gen_poem(begin_words, num):
    batch_size = 1
    print('[INFO] loading corpus from %s' % FLAGS.file_path)
    # 单词转化的数字:向量，单词和数字一一对应的字典，单词
    poems_vector, word_int_map, vocabularies = process_poems(FLAGS.file_path)
    # 此时输入为1个
    input_data = tf.placeholder(tf.int32, [batch_size, None])
    # 损失等
    end_points = rnn_model(model='lstm', input_data=input_data, output_data=None, vocab_size=len(
        vocabularies), rnn_size=128, num_layers=2, batch_size=64, learning_rate=FLAGS.learning_rate)

    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init_op)
        # 保存模型的位置，拿回sess
        checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoints_dir)
        # checkpoint = tf.train.latest_checkpoint('./model/')

        saver.restore(sess, checkpoint)
        # saver.restore(sess,'./model/-24')
        # 从字典里面获取到的开始值
        x = np.array([list(map(word_int_map.get, start_token))])

        [predict, last_state] = sess.run([end_points['prediction'], end_points['last_state']],
                                         feed_dict={input_data: x})
        poem = ''
        for begin_word in begin_words:

            while True:
                if begin_word:
                    word = begin_word
                else:
                    word = to_word(predict, vocabularies)
                sentence = ''
                while word != end_token:

                    sentence += word
                    x = np.zeros((1, 1))
                    x[0, 0] = word_int_map[word]
                    [predict, last_state] = sess.run([end_points['prediction'], end_points['last_state']],
                                                     feed_dict={input_data: x, end_points['initial_state']: last_state})
                    word = to_word(predict, vocabularies)
                # word = words[np.argmax(probs_)]
                if len(sentence) == 2 + 2 * num and ('，' or '？') not in sentence[:num] and ('，' or '？') not in sentence[num+1:-1] and sentence[num] == '，' and '□' not in sentence:
                    poem += sentence
                    # sentence = ''
                    break
                else:
                    print("我正在写诗呢")
                    
        return poem

#这里将生成的诗句，按照中文诗词的格式输出
#同时方便接入应用
def pretty_print_poem(poem):
    poem_sentences = poem.split('。')
    # print(poem_sentences)
    for s in poem_sentences:
        if s != '' and len(s) > 10:
        # if s != '':

            print(s + '。')


def main():
    if len(sys.argv) == 2:
        if sys.argv[1] == '1':
            print('[INFO] train tang poem...')
            run_training()
        elif sys.argv[1] == '2':
            num = int(input("请输入训练诗句（5：五言，7：七言):"))
            if num == 5 or num == 7:
                print('[INFO] write tang poem...')
                begin_word = input('开始作诗，请输入起始字:')
                if len(begin_word) == 0:
                    print("请输入词句")
                    return
                r1 = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
                begin_word = re.sub(r1, '', begin_word)
                poem2 = gen_poem(begin_word, num)
                pretty_print_poem(poem2)
            else:
                print('输入有误')
           
        else:
            print('a',sys.argv[1])
            print("请按照以下方式执行：")
            print("python xxxx.py 1(1：训练，2：写诗)")
    else:
        print(len(sys.argv))
        print("请按照以下方式执行：")
        print("python xxxx.py 1(1：训练，2：写诗)")

if __name__ == '__main__':
    main()

将诗歌和单词转换为一一对应的数字：

def process_poems(file_name):
    # 诗集
    poems = []
    with open(file_name, "r", encoding='utf-8', ) as f:
        for line in f.readlines():
            try:
                title, content = line.strip().split(':')
                content = content.replace(' ', '')
                if '_' in content or '(' in content  or '《' in content or '[' in content or \
                        start_token in content or end_token in content:
                    continue
                if len(content) < 5 or len(content) > 79:
                    continue
                content = start_token + content + end_token
                poems.append(content)
            except ValueError as e:
                pass
    # 按诗的字数排序
    poems = sorted(poems, key=lambda l: len(line))

    # 统计每个字出现次数
    all_words = []
    for poem in poems:
        all_words += [word for word in poem]
    # 这里根据包含了每个字对应的频率 Counter({'我': 3, '你': 2, '他': 1})
    counter = collections.Counter(all_words)
    
    # items转化为列表，里面为元组 [('他', 1), ('你', 2), ('我', 3)]
    count_pairs = sorted(counter.items(), key=lambda x: x[-1])
    # ('他', '你', '我')，(1, 2, 3)
    words, _ = zip(*count_pairs)

    # 取前多少个常用字
    words = words[:len(words)] + (' ',)
    # words = words[:len(words)]
    # 每个字映射为一个数字ID  {'他': 0, '你': 1, '我': 2}
    word_int_map = dict(zip(words, range(len(words))))
    # 将诗歌中每个字转换成一一对应的数字，输入poem中的每个字，转化成对应数字返回
    # 没有获得word对应数字Id,就返回len(words)
    poems_vector = [list(map(lambda word: word_int_map.get(word, len(words)), poem)) for poem in poems]

    return poems_vector, word_int_map, words

定义真实值和目标值：

def generate_batch(batch_size, poems_vec, word_to_int):
    # 每次取64首诗进行训练
    # 计算有多少个batch_size
    n_chunk = len(poems_vec) // batch_size
    x_batches = []
    y_batches = []
    for i in range(n_chunk):
        start_index = i * batch_size
        end_index = start_index + batch_size

        batches = poems_vec[start_index:end_index]
        # 找到这个batch的所有poem中最长的poem的长度
        length = max(map(len, batches))
        # 填充一个这么大小的空batch，空的地方放空格进行填充
        x_data = np.full((batch_size, length), word_to_int[' '], np.int32)
        for row in range(batch_size):
            # 每一行就是一首诗，在原本的长度上把诗还原上去
            x_data[row, :len(batches[row])] = batches[row]
        y_data = np.copy(x_data)
        # y的话就是x向左边也就是前面移动一个
        y_data[:, :-1] = x_data[:, 1:]
        """
        x_data             y_data
        [6,2,4,6,9]       [2,4,6,9,9]
        [1,4,2,8,5]       [4,2,8,5,5]
        """
        x_batches.append(x_data)
        y_batches.append(y_data)
    return x_batches, y_batches

定义训练模型：

# -*- coding: utf-8 -*-
import os
import tensorflow as tf
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'  #不显示一些提示警告信息

def rnn_model(model, input_data, output_data, vocab_size, rnn_size=128, num_layers=2, batch_size=64,
              learning_rate=0.01):
    """
    construct rnn seq2seq model.
    :param model: model class 模型种类
    :param input_data: input data placeholder 输入
    :param output_data: output data placeholder 输出
    :param vocab_size: 词长度
    :param rnn_size: 一个RNN单元的大小
    :param num_layers: RNN层数,神经元
    :param batch_size: 步长
    :param learning_rate: 学习速率
    :return:
    """
    end_points = {}

    def rnn_cell():
        if model == 'rnn':
            cell_fun = tf.contrib.rnn.BasicRNNCell
        elif model == 'gru':
            cell_fun = tf.contrib.rnn.GRUCell
        elif model == 'lstm':
            # 基础模型
            cell_fun = tf.contrib.rnn.BasicLSTMCell
        # 指定rnn_size大小，H,C，控制值和输出值，是否当做元组返回，默认为True
        cell = cell_fun(rnn_size, state_is_tuple=True)
        return cell
    # 基本单元
    cell = tf.contrib.rnn.MultiRNNCell([rnn_cell() for _ in range(num_layers)], state_is_tuple=True)
    # cell = tf.contrib.rnn.MultiRNNCell([rnn_cell()] * num_layers, state_is_tuple=True)


    if output_data is not None:
        # 初始化
        initial_state = cell.zero_state(batch_size, tf.float32)
    else:
        initial_state = cell.zero_state(1, tf.float32)
    # 指定用cpu运行
    with tf.device("/cpu:0"):
        # 输入的向量转化为128维向量，所以先构建隐层，指定值为+1 到-1区间
        embedding = tf.get_variable('embedding', initializer=tf.random_uniform(
            [vocab_size + 1, rnn_size], -1.0, 1.0))
        # embedding = tf.Variable(tf.random_uniform([vocab_size + 1,rnn_size],-1.0,1.0))
        # 输入的不是所有的词，是所有词的一部分，寻找属于哪个词
        inputs = tf.nn.embedding_lookup(embedding, input_data)

    # [batch_size, ?, rnn_size] = [64, ?, 128]
    # 输出，和最后一次的输出
    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state)
    # 转换为128列的输出，output相当于中间一个128维隐层结果，向量
    output = tf.reshape(outputs, [-1, rnn_size])
    # 128维的权重，和词汇量
    weights = tf.Variable(tf.truncated_normal([rnn_size, vocab_size + 1]))
    # 偏置
    bias = tf.Variable(tf.zeros(shape=[vocab_size + 1]))
    # bias加到前面没一行，预测值
    logits = tf.nn.bias_add(tf.matmul(output, weights), bias=bias)
    # [?, vocab_size+1]

    if output_data is not None:
        # output_data must be one-hot encode  指定深度为，词汇量深度+1，真实值
        labels = tf.one_hot(tf.reshape(output_data, [-1]), depth=vocab_size + 1)
        # should be [?, vocab_size+1]
        # 计算损失,真实值和预测值
        loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
        # loss shape should be [?, vocab_size+1]
        # 对损失求平均
        total_loss = tf.reduce_mean(loss)
        # 训练，优化损失
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

        end_points['initial_state'] = initial_state
        end_points['output'] = output
        end_points['train_op'] = train_op
        end_points['total_loss'] = total_loss
        end_points['loss'] = loss
        end_points['last_state'] = last_state
    else:
        # 预测值
        prediction = tf.nn.softmax(logits)

        end_points['initial_state'] = initial_state
        end_points['last_state'] = last_state
        end_points['prediction'] = prediction

    return end_points

训练的时候大概训练了半天吧，基于cpu,loss降到2到3之间就没训练了。古诗里面有些宋词，所以可能会写出一些宋词来。因此当写出宋词的时候，统统过滤了，让机器人从新写一次。

怒浩叛奴风入襄，相思犹记浇秋木。
发夜在关江伯均，屏帏倾酒醺醺时。
冲棱出与征轮钓，北阙朱门注胜囚。
冠子弄衣诗句苦，雁归羊祜得行戎。
凭仗眼巡分一已，拂眉下枕风吹鬓。
栏畔秋风听别情，无将日月悲卮花。
处飞更醉紫骝出，莫怕黄芽头孤酒。
潇湘似人风共恶，月影湿金横向波。
潇湘湘水涵红团，拖蜡桐花过酒楼。
雨丰寒鸟没疏角，雨过嵩门逐几层。
歇山地在秦栖狎，借问纯山色色微。
抬普高花量且白，也遭纱帽却离歌。
望华衔泥不复相，时时避缴如红兰。
眼眼南天南岳畔，十年章岸一何当。
仰头忽要天台绿，九没街边雪里临。
天下别容兵起力，怀王过写复何人。
长江九泽分明月，应逐征帆转汉阳。
啸骨立亡何所赠，别时身去坦云中。
壮时烟雨分雪发，战马翩翩一千尺。
怀乡生抛怀甲得，终竟颠应年子人。
激尔饷于无一事，岛层明月上高台。
烈士不能还续竹，应经此去虽迷定。

Tensorflow之基于LSTM神经网络写唐诗

猜你喜欢