# Data processing
import collections
import numpy as np

start_token = 'B'  # marks the beginning of a poem
end_token = 'E'    # marks the end of a poem

# Dataset: 34,646 poems in total, 1,721,655 characters (6,110 unique characters)
def process_poems(file_name):
    # poems -> list of poems as strings
    poems = []  # two-dimensional in spirit, but not a matrix, since poems differ in length
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                title, content = line.strip().split(':')  # split title and body on the colon
                content = content.replace(' ', '')  # strip all spaces
                # Skip poems containing special symbols.
                if '_' in content or '(' in content or '（' in content or '《' in content or '[' in content or \
                        start_token in content or end_token in content:
                    continue
                # Poems with fewer than 5 or more than 79 characters are outliers; skip them.
                if len(content) < 5 or len(content) > 79:
                    continue
                content = start_token + content + end_token
                poems.append(content)
            except ValueError:
                pass
    # Flatten all poems into a one-dimensional list of characters, e.g. ['低','头','思','故','乡','低',...]
    all_words = [word for poem in poems for word in poem]
    # Count occurrences per character, giving a dict like {'不': 6000, '的': 5800, ...}
    counter = collections.Counter(all_words)
    # Sort the counts in descending order; the result is a list of
    # (character, count) pairs, e.g. [('不', 6000), ('的', 5800), ...]
    count_pairs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    # Unzip the pairs into one tuple of characters, ordered by frequency: ('不', '的', ...)
    words, _ = zip(*count_pairs)
    # Append a space as the padding character: ('不', '的', ..., ' ')
    words = words + (' ',)
    # Give every character an index starting from 0, high-frequency characters
    # first, forming a dict like {'不': 0, '的': 1, ..., ' ': 6110}
    word_int_map = dict(zip(words, range(len(words))))
    # Every character of every poem is looked up in word_int_map. poems_vector is
    # a two-dimensional structure: one row per poem, holding that poem's character indices.
    poems_vector = [list(map(lambda word: word_int_map.get(word, len(words)), poem)) for poem in poems]
    return poems_vector, word_int_map, words
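To make the vocabulary construction concrete, here is a minimal sketch of the same Counter / sorted / zip pipeline, run on two made-up one-line "poems" (toy data, for illustration only):

import collections

toy_poems = ['B床前明月光E', 'B低头思故乡E']
all_words = [w for poem in toy_poems for w in poem]
counter = collections.Counter(all_words)                 # per-character counts
count_pairs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
words, _ = zip(*count_pairs)                             # characters by frequency
words = words + (' ',)                                   # padding character
word_int_map = dict(zip(words, range(len(words))))
print(word_int_map)  # 'B' and 'E' occur twice, so they get the smallest indices
print([word_int_map[w] for w in toy_poems[0]])  # the first poem as an index vector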
def generate_batch(batch_size, poems_vec, word_to_int):
    # Number of chunks. With batch_size = 64, integer division gives 34646 // 64 = 541 chunks.
    n_chunk = len(poems_vec) // batch_size
    x_batches = []
    y_batches = []
    for i in range(n_chunk):
        start_index = i * batch_size
        end_index = start_index + batch_size
        # batches is a two-dimensional list holding exactly batch_size poems.
        batches = poems_vec[start_index:end_index]
        # Number of characters in the longest poem of this batch.
        length = max(map(len, batches))
        # x_data starts as a batch_size * length matrix filled with the padding space.
        x_data = np.full((batch_size, length), word_to_int[' '], np.int32)
        # row is the poem's position in the batch. Each poem is copied in from
        # position 0; positions it cannot fill stay padded.
        for row, batch in enumerate(batches):
            x_data[row, :len(batch)] = batch
        # Copy, so that later changes to x_data do not affect y_data.
        y_data = np.copy(x_data)
        # y_data is the training target: x_data shifted one position to the left
        # (the last column is meaningless). Given each character of a poem, the
        # model predicts the next one, until the whole poem is predicted.
        y_data[:, :-1] = x_data[:, 1:]
        """
        x_data                 y_data
        poem 1: [6,2,4,6,9]    poem 1: [2,4,6,9,9]
        poem 2: [1,4,2,8,5]    poem 2: [4,2,8,5,5]
        """
        # x_batches and y_batches each end up with 541 entries, one per chunk.
        x_batches.append(x_data)
        y_batches.append(y_data)
    return x_batches, y_batches
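The padding-and-shift logic is easiest to see on a toy batch. A minimal sketch with batch_size = 2 and two made-up index vectors of different lengths (index 0 stands in for word_to_int[' ']):

import numpy as np

toy_vec = [[6, 2, 4, 6, 9], [1, 4, 2]]
length = max(map(len, toy_vec))
x_data = np.full((2, length), 0, np.int32)   # 0 plays the role of word_to_int[' ']
for row, poem in enumerate(toy_vec):
    x_data[row, :len(poem)] = poem
y_data = np.copy(x_data)
y_data[:, :-1] = x_data[:, 1:]
print(x_data)   # [[6 2 4 6 9], [1 4 2 0 0]]
print(y_data)   # [[2 4 6 9 9], [4 2 0 0 0]]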
# Build the model
import tensorflow as tf
import numpy as np
tf.reset_default_graph()
def rnn_model(model, input_data, output_data, vocab_size, rnn_size=128, num_layers=2, batch_size=64,
              learning_rate=0.01):
    """
    construct rnn seq2seq model.
    :param model: model class, one of 'rnn', 'gru', 'lstm'
    :param input_data: input data placeholder
    :param output_data: output data placeholder
    :param vocab_size: len(words) = 6110
    :param rnn_size: 128
    :param num_layers: 2
    :param batch_size: 64
    :param learning_rate: 0.01
    :return: end_points dict
    """
    end_points = {}
    # Pick the basic RNN cell; state_is_tuple only applies to the LSTM cell.
    if model == 'rnn':
        cells = [tf.contrib.rnn.BasicRNNCell(rnn_size) for _ in range(num_layers)]
    elif model == 'gru':
        cells = [tf.contrib.rnn.GRUCell(rnn_size) for _ in range(num_layers)]
    elif model == 'lstm':
        cells = [tf.contrib.rnn.BasicLSTMCell(rnn_size, state_is_tuple=True) for _ in range(num_layers)]
    # Stack the RNN; two layers are used here. A fresh cell is built per layer,
    # since reusing one cell object across layers fails in newer TF 1.x releases.
    cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
    # In training mode output_data is not None and the initial state covers the
    # whole batch ([batch_size, rnn_size] per layer); in generation mode
    # output_data is None and the initial state covers one sequence ([1, rnn_size]).
    if output_data is not None:
        initial_state = cell.zero_state(batch_size, tf.float32)
    else:
        initial_state = cell.zero_state(1, tf.float32)
    # Build the embedding layer.
    with tf.device("/cpu:0"):
        # A (vocab_size + 1) x rnn_size matrix, uniformly distributed in [-1, 1]:
        # each character gets one 128-dimensional row, and the row index is the
        # character's word_int mapping.
        embedding = tf.get_variable('embedding', initializer=tf.random_uniform(
            [vocab_size + 1, rnn_size], -1.0, 1.0))
        # input_data is one batch: a matrix whose rows are poems represented by
        # their index vectors. Looking up each character's index picks out the
        # corresponding embedding row.
        inputs = tf.nn.embedding_lookup(embedding, input_data)
    # [batch_size, ?, rnn_size] = [64, ?, 128]
    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state)
    # The matmul below requires output's column count to equal weights' row count.
    output = tf.reshape(outputs, [-1, rnn_size])
    weights = tf.Variable(tf.truncated_normal([rnn_size, vocab_size + 1]))  # truncated-normal weights
    bias = tf.Variable(tf.zeros(shape=[vocab_size + 1]))  # one bias per weights column
    logits = tf.nn.bias_add(tf.matmul(output, weights), bias=bias)
    # logits shape: [?, vocab_size + 1]
    if output_data is not None:
        # output_data must be one-hot encoded
        labels = tf.one_hot(tf.reshape(output_data, [-1]), depth=vocab_size + 1)
        # labels shape: [?, vocab_size + 1]
        loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
        # loss shape: [?], one cross-entropy value per character position
        total_loss = tf.reduce_mean(loss)
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
        end_points['initial_state'] = initial_state
        end_points['output'] = output
        end_points['train_op'] = train_op
        end_points['total_loss'] = total_loss
        end_points['loss'] = loss
        end_points['last_state'] = last_state
    else:
        prediction = tf.nn.softmax(logits)
        end_points['initial_state'] = initial_state
        end_points['last_state'] = last_state
        end_points['prediction'] = prediction
    return end_points
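The only shape juggling in the model is around the output layer: dynamic_rnn returns [batch_size, steps, rnn_size], which is flattened so a single matmul can score every time step at once. A minimal numpy sketch with made-up sizes (batch_size=2, 5 steps, rnn_size=4, vocab_size=9, illustration only):

import numpy as np

batch_size, steps, rnn_size, vocab_size = 2, 5, 4, 9
outputs = np.zeros((batch_size, steps, rnn_size))  # stands in for what dynamic_rnn returns
output = outputs.reshape(-1, rnn_size)             # [batch_size * steps, rnn_size]
weights = np.zeros((rnn_size, vocab_size + 1))
logits = output @ weights                          # [batch_size * steps, vocab_size + 1]
print(output.shape, logits.shape)                  # (10, 4) (10, 10)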
# Start training
import os
import numpy as np
import tensorflow as tf
from poems.model import rnn_model
from poems.poems import process_poems, generate_batch

# tf.app.flags.DEFINE_xxx() adds an optional command-line argument.
# os.path.abspath(path) returns the normalized absolute version of path.
# epoch: one epoch means training once over all samples in the training set.
tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size.')
tf.app.flags.DEFINE_float('learning_rate', 0.01, 'learning rate.')
tf.app.flags.DEFINE_string('model_dir', os.path.abspath('./model'), 'model save path.')
tf.app.flags.DEFINE_string('file_path', os.path.abspath('./data/poems.txt'), 'file name of poems.')
tf.app.flags.DEFINE_string('model_prefix', 'poems', 'model save prefix.')
tf.app.flags.DEFINE_integer('epochs', 50, 'train how many epochs.')
# tf.app.flags.FLAGS reads the parameter values back from the command line.
FLAGS = tf.app.flags.FLAGS
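Because these are tf.app.flags definitions, every default above can be overridden on the command line when the script is launched, e.g. python train.py --batch_size 32 --learning_rate 0.005 --epochs 100 (the script file name here is hypothetical).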
def run_training():
    if not os.path.exists(FLAGS.model_dir):
        os.makedirs(FLAGS.model_dir)
    # Produce the poem vectors, the character-to-index map, and the character set.
    poems_vector, word_to_int, words = process_poems(FLAGS.file_path)
    batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)
    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    # Build the model (FLAGS.batch_size rather than a hard-coded 64).
    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets, vocab_size=len(
        words), rnn_size=128, num_layers=2, batch_size=FLAGS.batch_size, learning_rate=FLAGS.learning_rate)
    # tf.train.Saver() is TF's module for saving models; the best way to inspect
    # a checkpoint's contents is to load it back with a Saver.
    saver = tf.train.Saver(tf.global_variables())
    # Group several ops into a single node: running init_op runs all of its
    # input ops; the operation itself returns nothing.
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        # Run the initialization node defined above.
        sess.run(init_op)
        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)
        # Resume training from the last interrupted checkpoint, if one exists.
        if checkpoint:
            saver.restore(sess, checkpoint)
            print("## restore from the checkpoint {0}".format(checkpoint))
            start_epoch += int(checkpoint.split('-')[-1])
        print('## start training...')
        try:
            for epoch in range(start_epoch, FLAGS.epochs):
                n = 0
                # How many chunks the dataset was cut into.
                n_chunk = len(poems_vector) // FLAGS.batch_size
                for batch in range(n_chunk):
                    loss, _, _ = sess.run([
                        end_points['total_loss'],
                        end_points['last_state'],
                        end_points['train_op']
                        # batches_inputs[n] is the n-th chunk of the dataset
                    ], feed_dict={input_data: batches_inputs[n], output_targets: batches_outputs[n]})
                    n += 1
                    print('Epoch: %d, batch: %d, training loss: %.6f' % (epoch, batch, loss))
                if epoch % 6 == 0:
                    saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_prefix), global_step=epoch)
        except KeyboardInterrupt:
            # On Ctrl+C, try to save a checkpoint before exiting.
            print('## Interrupt manually, try saving checkpoint for now...')
            saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_prefix), global_step=epoch)
            print('## Last epoch was saved; next time training will resume from epoch {}.'.format(epoch))

def main(_):
    run_training()

if __name__ == '__main__':
    tf.app.run()
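Resuming works because saver.save() is called with global_step=epoch, so checkpoint names end in the epoch number, which checkpoint.split('-')[-1] recovers. A minimal sketch with a made-up checkpoint path (illustration only):

checkpoint = './model/poems-48'         # tf.train.latest_checkpoint() returns a path of this shape
start_epoch = int(checkpoint.split('-')[-1])
print(start_epoch)                      # 48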
# Generate poems
import tensorflow as tf
from poems.model import rnn_model
from poems.poems import process_poems
import numpy as np

start_token = 'B'
end_token = 'E'
model_dir = './model/'
corpus_file = './data/poems.txt'
lr = 0.0002

# predict plays the role of the output (assembled from the sequence of hidden states h)
def to_word(predict, vocabs):
    # predict[0] looks like [0.3, 0.9, 0.6, ...]; its length is vocab_size + 1,
    # one more than the length of vocabularies.
    predict = predict[0]
    # Normalize so the values form a probability distribution.
    predict /= np.sum(predict)
    # sample is an index: an integer from 0 to len(predict) - 1.
    sample = np.random.choice(np.arange(len(predict)), p=predict)
    # >= (rather than >) guards the extra padding index, which has no entry in vocabs.
    if sample >= len(vocabs):
        return vocabs[-1]
    else:
        return vocabs[sample]
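A minimal sketch of the normalize-and-sample step in to_word, with a made-up four-entry distribution and a three-character vocabulary (both invented for illustration):

import numpy as np

predict = np.array([0.1, 0.6, 0.2, 0.1])
predict /= np.sum(predict)                   # normalize (already sums to 1 here)
vocabs = ('不', '的', '人')                   # deliberately one entry shorter than predict
sample = np.random.choice(np.arange(len(predict)), p=predict)
word = vocabs[-1] if sample >= len(vocabs) else vocabs[sample]
print(sample, word)   # '的' is drawn most often, with probability 0.6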
def gen_poem(begin_word):
    batch_size = 1
    print('## loading corpus from %s' % model_dir)
    poems_vector, word_int_map, vocabularies = process_poems(corpus_file)
    input_data = tf.placeholder(tf.int32, [batch_size, None])
    # output_data=None switches the model into generation mode, so batch_size
    # here is the local 1, not the training value of 64.
    end_points = rnn_model(model='lstm', input_data=input_data, output_data=None, vocab_size=len(
        vocabularies), rnn_size=128, num_layers=2, batch_size=batch_size, learning_rate=lr)
    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init_op)
        checkpoint = tf.train.latest_checkpoint(model_dir)
        saver.restore(sess, checkpoint)
        # The first input is the single start token 'B' as a 1x1 index matrix.
        x = np.array([list(map(word_int_map.get, start_token))])
        # predict corresponds to output, with shape [batch_size, step, cell_num].
        # With batch_size = 1 that collapses to [step, cell_num], e.g.:
        # [[0.3, 0.9, 0.6],
        #  [0.8, 0.5, 0.7]]
        [predict, last_state] = sess.run([end_points['prediction'], end_points['last_state']],
                                         feed_dict={input_data: x})
        if begin_word:
            word = begin_word
        else:
            word = to_word(predict, vocabularies)
        poem_ = ''
        i = 0
        while word != end_token:
            poem_ += word
            i += 1
            # Cap the generated poem at 24 characters.
            if i >= 24:
                break
            # x is [[0]], i.e. x[0, 0] = 0.
            x = np.zeros((1, 1))
            # Store word's index in x[0, 0]; e.g. if '雨' maps to index 100,
            # x[0, 0] becomes 100.
            x[0, 0] = word_int_map[word]
            # Feed last_state back in, so generation continues where it left off.
            [predict, last_state] = sess.run([end_points['prediction'], end_points['last_state']],
                                             feed_dict={input_data: x, end_points['initial_state']: last_state})
            word = to_word(predict, vocabularies)
    return poem_
def pretty_print_poem(poem_):
    poem_sentences = poem_.split('。')
    for s in poem_sentences:
        if s != '' and len(s) > 10:
            print(s + '。')

if __name__ == '__main__':
    begin_char = input('## please input the first character:')
    poem = gen_poem(begin_char)
    pretty_print_poem(poem_=poem)
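A quick check of pretty_print_poem on a made-up input (illustration only): the string is split on '。', and every non-empty piece longer than 10 characters is printed with its '。' restored.

pretty_print_poem('床前明月光,疑是地上霜。举头望明月,低头思故乡')
# 床前明月光,疑是地上霜。
# 举头望明月,低头思故乡。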
A sample run:
## please input the first character:雨
## loading corpus from ./model/
INFO:tensorflow:Restoring parameters from D:\python运行文件\poems-mater\model\poems-48
雨寒入槛近,瀑布雀雀嘲。
人世共寂寥,何人似全禄
You can get the dataset from me in the QQ group, group number: 228735640.