手把手教您使用LSTM生成唐诗

一、引入安装包

import time
from collections import namedtuple
import numpy as np
import tensorflow as tf

二、将全部古诗转变成数字

# Load the whole poem corpus into memory as a single string.
with open(r'./tangshi.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Distinct characters that occur in the corpus.
vocab = set(text)
# Iterating a set of str has no stable order across interpreter runs (str
# hashing is randomised per process), so ids taken straight from the set would
# differ on every run and no longer match a model restored from a checkpoint.
# Assign ids from a sorted list to make the mapping reproducible.
sorted_vocab = sorted(vocab)
# character -> integer id
vocab_to_int = {c: i for i, c in enumerate(sorted_vocab)}
# integer id -> character
int_to_vocab = dict(enumerate(sorted_vocab))
# The whole corpus as a 1-D array of ids; the dtype is given explicitly so the
# array is int32.
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

进行一个小验证，观察一下输出是不是挺合理

# Sanity check: report the corpus length (in characters) and the number of
# distinct characters.
for label, size in (("len(encoded):", len(encoded)), ("len(vocab):", len(vocab))):
    print(label, size)
# The string below records the output of the author's run, which the author
# judged reasonable.
"""
len(encoded): 3451603
len(vocab): 7649
挺合理
"""

三、创建mini-batch

batch_size = 64
time_steps = 50


def generate_batch(encoded, batch_size, time_steps):
    """Yield (x, y) mini-batches for next-token training.

    The sequence is truncated to a whole number of batches, laid out as
    ``batch_size`` parallel rows, and cut into consecutive windows of
    ``time_steps`` columns.

    Args:
        encoded: 1-D sequence of integer token ids (array or plain sequence).
        batch_size: number of rows (parallel sequences) in every batch.
        time_steps: number of columns (tokens per row) in every batch.

    Yields:
        Tuples ``(x, y)`` of arrays with shape ``(batch_size, time_steps)``.
        ``y[r, t]`` is the token that follows ``x[r, t]`` in ``encoded``.
        The only exception is the very last retained token when nothing
        follows it in ``encoded``: its target wraps around to the first token.
        Nothing is yielded when ``encoded`` is shorter than one full batch.
    """
    encoded = np.asarray(encoded)
    # Keep only as many tokens as fill a whole number of batches.
    num_batch = len(encoded) // batch_size // time_steps
    if num_batch == 0:
        return
    usable = num_batch * batch_size * time_steps

    inputs_2d = encoded[:usable].reshape(batch_size, -1)

    # Targets are the inputs shifted left by one position in the flat
    # sequence, so the last column of the final window still gets the real
    # next token instead of an arbitrary filler value.
    shifted = np.roll(encoded[:usable], -1)
    if len(encoded) > usable:
        # A real successor exists just past the truncated region.
        shifted[-1] = encoded[usable]
    targets_2d = shifted.reshape(batch_size, -1)

    for start in range(0, inputs_2d.shape[1], time_steps):
        stop = start + time_steps
        yield inputs_2d[:, start:stop], targets_2d[:, start:stop]


# Usage example (a generator, so batches are produced lazily):
#   batches = generate_batch(encoded, batch_size, time_steps)
#   x, y = next(batches)

四、构建计算图

# Set the hyperparameters and build the TensorFlow 1.x computation graph.
# Everything defined here is module-global and is used by the training code.
tf.reset_default_graph()
batch_size=64
input_dim=len(vocab)
time_steps=50
learning_rate=0.01
num_units=128
num_layers=2
output_dim=len(vocab)
grad_clip=5
# num_units (128) is the size passed to every LSTM cell below.
# NOTE(review): input_dim is assigned but not used in this section; the
# one-hot depth below is taken from len(vocab) directly.
inputs=tf.placeholder(name="inputs",shape=[None,time_steps],dtype=tf.int32)
# Placeholder for the input token ids (the x half of a mini-batch).
targets=tf.placeholder(name="targets",shape=[None,time_steps],dtype=tf.int32)
# Placeholder for the target token ids (the y half of a mini-batch).
# Both placeholders are 2-D: [batch, time_steps].
input_data=tf.one_hot(inputs,len(vocab))
print("input_data.shape:",input_data.shape)
# one_hot appends a vocabulary axis, making the input 3-D.
# Build the stacked LSTM: num_layers layer-normalised LSTM cells.
# lstm_cell=tf.contrib.rnn.LayerNormBasicLSTMCell(num_units)
cells=tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LayerNormBasicLSTMCell(num_units) for _ in range(num_layers)])
initial_state=cells.zero_state(batch_size,dtype=tf.float32)
# The zero state is created for exactly batch_size rows, so fed batches are
# presumably expected to have batch_size rows even though the placeholders
# leave the batch axis open. The author's note below says the state holds
# state.c and state.h.
"""
这里的state就是初始化了state.c以及state.h分别表示长时记忆以及短时记忆
"""
out,state=tf.nn.dynamic_rnn(cells, input_data, initial_state=initial_state)
# Output layer: flatten the 3-D RNN output to one row per (sequence, step).
out=tf.reshape(out,[-1,num_units])
print("out.shape:",out.shape)
out_weights=tf.get_variable(name="out_weights",shape=[num_units,output_dim],dtype=tf.float32)
out_bias=tf.get_variable(name="out_bias",shape=[output_dim],dtype=tf.float32)
logits=tf.add(tf.matmul(out,out_weights),out_bias)
print("logits.shape:",logits.shape)
predictions=tf.nn.softmax(logits,name="predictions")# softmax over the vocabulary axis for every row of logits
# Loss: one-hot encode the targets, flatten them to match the logits rows,
# and average the per-row softmax cross-entropy.
outputs=tf.one_hot(targets,len(vocab))
print("outputs.shape:",outputs.shape)
outputs=tf.reshape(outputs,[-1,len(vocab)])
print("outputs.shape:",outputs.shape)
loss=tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=outputs)
print("loss.shape:",loss.shape)
loss=tf.reduce_mean(loss)
# Optimisation: clip the gradients by global norm, then apply them with Adam.
tvars=tf.trainable_variables()# all trainable variables in the graph
grads,_=tf.clip_by_global_norm(tf.gradients(loss,tvars),grad_clip)
# Note the naming: train_op is the AdamOptimizer object, while optimizer is
# the op that applies the clipped gradients (the one run during training).
train_op=tf.train.AdamOptimizer(learning_rate)
optimizer=train_op.apply_gradients(zip(grads,tvars))

# The string below records the shapes printed during the author's run.
"""
输出：
input_data.shape: (?, 50, 7649)
out.shape: (3200, 128)
logits.shape: (3200, 7649)
outputs.shape: (?, 50, 7649)
outputs.shape: (?, 7649)
loss.shape: (3200,)
"""

【在这个过程中遇到的小问题：】

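tf.one_hot()：在输入张量的最后一个维度之后新增一维进行扩展，例如形状为(?, 50)的输入会变成(?, 50, 7649)。
tf.clip_by_global_norm的理解：先计算所有梯度的全局范数，如果超过阈值(这里是grad_clip=5)，就把全部梯度按同一比例缩小，用来防止梯度爆炸。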

在Tensorboard中展示计算图：
在这里插入图片描述

五、进行训练

# Train the model from scratch, logging every 500 steps and writing a
# checkpoint every `save_every_n` steps plus one at the very end.
init = tf.global_variables_initializer()
epochs = 20
save_every_n = 400
# Not consumed by the loop below (each epoch builds its own generator); kept
# so `next(batches)` is still available for interactive inspection.
batches = generate_batch(encoded, batch_size, time_steps)
saver = tf.train.Saver(max_to_keep=10)
# Saver.save refuses to write when the parent directory is missing, so make
# sure the checkpoint directory exists before training starts.
tf.gfile.MakeDirs("./save")
with tf.Session() as sess:
    sess.run(init)
    # The zero initial state is already wired into dynamic_rnn in the graph,
    # so it does not need to be evaluated separately here.
    count = 0
    for index in range(epochs):
        for x, y in generate_batch(encoded, batch_size, time_steps):
            count += 1
            start = time.time()
            feed = {inputs: x, targets: y}
            # Fetch the loss in the same run as the update: one forward pass
            # per step instead of two. The reported loss is the one the
            # gradients of this step were computed from.
            _, loss_ = sess.run([optimizer, loss], feed_dict=feed)
            end = time.time()
            if count % 500 == 0:
                print("轮数:{}/{}".format(index, epochs),
                      "训练步数:{}".format(count),
                      # four decimal places ("{:4f}" only set a minimum width)
                      "损失:{:.4f}".format(loss_),
                      "时间{:.4f}".format((end - start)),
                      )
            if count % save_every_n == 0:
                saver.save(sess, "./save/{}_{}_{}.ckpt".format(count, index, epochs))
    # Final checkpoint after the last epoch.
    saver.save(sess, "./save/{}_{}_{}.ckpt".format(count, index, epochs))

在训练的过程中，如果由于特殊情况中断了训练，就需要从头开始重新训练，非常麻烦。所以有了下边的代码，加入了断点续训的功能：训练中断之后，可以从最近一次保存的检查点继续训练。

# Train with resume support: if ./save/ holds a checkpoint, its variables
# (including the global step) are restored before training continues.

# Step counter that is stored in checkpoints, so the number of training steps
# survives an interruption. It is created before the initializer and the Saver
# so that both of them cover it.
# NOTE(review): a checkpoint written without this variable cannot be restored
# by the Saver below.
global_steps = tf.train.get_or_create_global_step()
increment_global_steps = tf.assign_add(global_steps, 1)

init = tf.global_variables_initializer()
epochs = 2
save_every_n = 200
# Not consumed by the loop below (each epoch builds its own generator); kept
# so `next(batches)` is still available for interactive inspection.
batches = generate_batch(encoded, batch_size, time_steps)
saver = tf.train.Saver(max_to_keep=10)
# Saver.save refuses to write when the parent directory is missing.
tf.gfile.MakeDirs("./save")
# NOTE(review): the original GPU settings are not shown anywhere in this file;
# allocating GPU memory on demand is assumed here as a safe default.
gpu_options = tf.GPUOptions(allow_growth=True)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    sess.run(init)
    # Resume from the latest checkpoint, if there is one.
    ckpt = tf.train.get_checkpoint_state("./save/")
    print(ckpt)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
    # `count` counts steps of this run only; `gs` is the persisted total.
    count = 0
    for index in range(epochs):
        for x, y in generate_batch(encoded, batch_size, time_steps):
            count += 1
            start = time.time()
            feed = {inputs: x, targets: y}
            # One run per step: apply the update, read the loss the update was
            # computed from, and advance the global step.
            _, loss_, gs = sess.run(
                [optimizer, loss, increment_global_steps], feed_dict=feed)
            end = time.time()
            if count % 10 == 0:
                print("轮数:{}/{}".format(index, epochs),
                      "训练步数:{}".format(gs),
                      "计数次数：{}".format(count),
                      "损失:{:.4f}".format(loss_),
                      "{:.4f}".format((end - start)))
            if count % save_every_n == 0:
                saver.save(sess, "./save/{}_{}_{}.ckpt".format(count, index, epochs))
    # Final checkpoint after the last epoch.
    saver.save(sess, "./save/{}_{}_{}.ckpt".format(count, index, epochs))

部分输出展示：
轮数:0/2 训练步数:210 计数次数：10 损失:7.1789 0.6681
轮数:0/2 训练步数:220 计数次数：20 损失:6.7573 0.6814
轮数:0/2 训练步数:230 计数次数：30 损失:6.6701 0.6787
轮数:0/2 训练步数:240 计数次数：40 损失:6.7215 0.6784
轮数:0/2 训练步数:250 计数次数：50 损失:6.6892 0.6803
轮数:0/2 训练步数:260 计数次数：60 损失:6.6715 0.6945
轮数:0/2 训练步数:270 计数次数：70 损失:6.7109 0.6776
轮数:0/2 训练步数:280 计数次数：80 损失:6.6895 0.6671
轮数:0/2 训练步数:290 计数次数：90 损失:6.7066 0.6769
轮数:0/2 训练步数:300 计数次数：100 损失:6.6703 0.7228
轮数:0/2 训练步数:310 计数次数：110 损失:6.7029 0.6847
轮数:0/2 训练步数:320 计数次数：120 损失:6.6948 0.6739
轮数:0/2 训练步数:330 计数次数：130 损失:6.6475 0.6792
轮数:0/2 训练步数:340 计数次数：140 损失:6.7281 0.6739
轮数:0/2 训练步数:350 计数次数：150 损失:6.6740 0.6796
轮数:0/2 训练步数:360 计数次数：160 损失:6.6269 0.7110
轮数:0/2 训练步数:370 计数次数：170 损失:6.6863 0.6875
轮数:0/2 训练步数:380 计数次数：180 损失:6.6472 0.6754
轮数:0/2 训练步数:390 计数次数：190 损失:6.6592 0.7117
轮数:0/2 训练步数:400 计数次数：200 损失:6.7368 0.6814
轮数:0/2 训练步数:410 计数次数：210 损失:6.7022 0.6690
轮数:0/2 训练步数:420 计数次数：220 损失:6.7272 0.6770
轮数:0/2 训练步数:430 计数次数：230 损失:6.6623 0.6487
轮数:0/2 训练步数:440 计数次数：240 损失:6.6454 0.6741
轮数:0/2 训练步数:450 计数次数：250 损失:6.6999 0.6757
轮数:0/2 训练步数:460 计数次数：260 损失:6.6248 0.6642
轮数:0/2 训练步数:470 计数次数：270 损失:6.6507 0.6734
轮数:0/2 训练步数:480 计数次数：280 损失:6.5726 0.6740
轮数:0/2 训练步数:490 计数次数：290 损失:6.5282 0.6848
轮数:0/2 训练步数:500 计数次数：300 损失:6.5218 0.6756
轮数:0/2 训练步数:510 计数次数：310 损失:6.4808 0.6770
轮数:0/2 训练步数:520 计数次数：320 损失:6.4407 0.6653
轮数:0/2 训练步数:530 计数次数：330 损失:6.4833 0.6680
轮数:0/2 训练步数:540 计数次数：340 损失:6.4142 0.6830
轮数:1/2 训练步数:550 计数次数：350 损失:6.3838 0.6533
轮数:1/2 训练步数:560 计数次数：360 损失:6.3213 0.6808
轮数:1/2 训练步数:570 计数次数：370 损失:6.2740 0.6749
轮数:1/2 训练步数:580 计数次数：380 损失:6.2192 0.6863
轮数:1/2 训练步数:590 计数次数：390 损失:6.1916 0.6983
轮数:1/2 训练步数:600 计数次数：400 损失:6.1218 0.6693
轮数:1/2 训练步数:610 计数次数：410 损失:6.0958 0.6999
轮数:1/2 训练步数:620 计数次数：420 损失:6.0795 0.6878

补充知识：

1、np.random.choice(a,b,p)
表示的含义：从a中按照概率p选择b个数字。
2、tensorflow如何实现断点续训:参考博客
3、参考链接：https://github.com/NELSONZHAO/zhihu/blob/master/anna_lstm/anna_lstm-tf1.0.ipynb

欢迎批评指正