German-English translation program code

#encoding=utf-8
#1. Import the required libraries and create a graph session
import os
import string
import requests
import io
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from zipfile import ZipFile
from collections import Counter
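#Note: seq2seq_model below comes from the legacy seq2seq translation tutorial bundled
#with old TensorFlow releases (0.x era); in later releases it lives in the separate
#tensorflow/models repository, so that source tree must be on the Python path.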
from tensorflow.models.rnn.translate import seq2seq_model
sess = tf.Session()




#2. Set the model parameters
learning_rate = 0.1 #learning rate
lr_decay_rate = 0.99 #decay the learning rate by 1%
lr_decay_every = 100 #decay every 100 generations
max_gradient = 5.0 #clip gradients at this maximum norm
batch_size = 50
num_layers = 3
rnn_size = 500 #RNN size (units per layer)
layer_size = 512
generations = 10000
vocab_size = 10000 #vocabulary size
save_every = 1000
eval_every = 500
output_every = 50
punct = string.punctuation #punctuation characters
data_dir = 'temp'
data_file = 'eng_ger.txt'
model_path = 'seq2seq_model'
full_model_dir = os.path.join(data_dir,model_path)




#3. English test sentences used to check how well the trained model performs
test_english = ['hello where is my computer','the quick brown fox jumped over the lazy dog','is it going to rain tomorrow']




#4. Create the model folder. Check whether the corpus file has already been downloaded; if so, read it directly, otherwise download it and save it to the data folder.
if not os.path.exists(full_model_dir):
	os.makedirs(full_model_dir)

#Create the data folder: data_dir
if not os.path.exists(data_dir):
	os.makedirs(data_dir)
print('Loading English-German Data')

#Check for the data; if it does not exist, download and save it
#Join data_file onto data_dir and test whether the file exists; if not, report and download it
if not os.path.isfile(os.path.join(data_dir, data_file)):
	print('Data not found, downloading Eng-Ger sentences from www.manythings.org')
	sentence_url = 'http://www.manythings.org/anki/deu-eng.zip'
	r = requests.get(sentence_url)
	z = ZipFile(io.BytesIO(r.content)) #wrap r.content in an in-memory BytesIO stream so the zip archive can be read without saving it to disk
	file = z.read('deu.txt')

	#Format the data
	eng_ger_data = file.decode() #decode the raw bytes to a string
	eng_ger_data = eng_ger_data.encode('ascii',errors = 'ignore') #re-encode as ASCII, dropping characters that cannot be represented
	eng_ger_data = eng_ger_data.decode().split('\n') #decode back to a string and split into lines

	#Write to file
	#'with open() as' opens the file and guarantees it is closed (and its contents flushed) when the block ends
	with open(os.path.join(data_dir, data_file), 'w') as out_conn:
		for sentence in eng_ger_data:
			out_conn.write(sentence + '\n')
else: #data_file already exists
	eng_ger_data = []
	with open(os.path.join(data_dir, data_file), 'r') as in_conn:
		for row in in_conn:
			eng_ger_data.append(row[:-1])




#5. Clean the corpus: remove punctuation, split each line into its English and German halves, and lowercase everything
eng_ger_data = [''.join(char for char in sent if char not in punct) for sent in eng_ger_data]

#Split each line on the tab character
eng_ger_data = [x.split('\t') for x in eng_ger_data if len(x) >= 1]
[english_sentence,german_sentence] = [list(x) for x in zip(*eng_ger_data)]
english_sentence = [x.lower().split() for x in english_sentence]
german_sentence = [x.lower().split() for x in german_sentence]
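#Illustration only (assuming a raw line of the form "Hi.\tHallo!"): after removing the
#punctuation, splitting on the tab and lowercasing, the pair becomes
#english_sentence[i] == ['hi'] and german_sentence[i] == ['hallo'].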




#6. Build English and German vocabularies from the vocab_size (10,000) most frequent words; all other words are marked 0 (unknown). Most low-frequency words are proper nouns (personal or place names).
all_english_words = [word for sentence in english_sentence for word in sentence]
all_english_counts = Counter(all_english_words)
eng_word_keys = [x[0] for x in all_english_counts.most_common(vocab_size-1)] #vocab_size - 1 because index 0 is reserved for unknown words
eng_vocab2ix = dict(zip(eng_word_keys, range(1,vocab_size)))
eng_ix2vocab = {val:key for key, val in eng_vocab2ix.items()}
english_processed = []
for sent in english_sentence:
	temp_sentence = []
	for word in sent:
		try:
			temp_sentence.append(eng_vocab2ix[word])
		except:
			temp_sentence.append(0)
	english_processed.append(temp_sentence)
all_german_words = [word for sentence in german_sentence for word in sentence]
all_german_counts = Counter(all_german_words)
ger_word_keys = [x[0] for x in all_german_counts.most_common(vocab_size - 1)]
ger_vocab2ix = dict(zip(ger_word_keys, range(1, vocab_size)))
ger_ix2vocab = {val:key for key, val in ger_vocab2ix.items()}
german_processed = []
for sent in german_sentence:
	temp_sentence = []
	for word in sent:
		try:
			temp_sentence.append(ger_vocab2ix[word])
		except:
			temp_sentence.append(0)
	german_processed.append(temp_sentence)
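#A minimal sketch of how the vocabulary lookups above behave (uses the dictionaries
#built above; any word outside the vocab_size - 1 most frequent ones maps to 0 = unknown):
sample_words = 'where is my house'.split()
sample_ix = [eng_vocab2ix.get(word, 0) for word in sample_words] #words -> indices
sample_back = [eng_ix2vocab.get(ix, 'UNK') for ix in sample_ix] #indices -> words; 0 has no vocabulary entry
print('Sample indices: {}'.format(sample_ix))
print('Recovered words: {}'.format(sample_back))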




#7. Preprocess the test sentences by converting their words to vocabulary indices
test_data = []
for sentence in test_english:
	temp_sentence = []
	for word in sentence.split(' '):
		try:
			temp_sentence.append(eng_vocab2ix[word])
		except:
			temp_sentence.append(0)
	test_data.append(temp_sentence)




#8. Sentences vary in length. Handling different sentence lengths separately minimizes the effect of padding on short sentences. The approach: group sentences of similar length into buckets, each with its own maximum lengths.
x_maxs = [5, 7, 11, 50]
y_maxs = [10, 12, 17, 60]
buckets = [x for x in zip(x_maxs, y_maxs)]
bucketed_data = [[] for _ in range(len(x_maxs))]
for eng, ger in zip(english_processed, german_processed):
	for ix, (x_max, y_max) in enumerate(buckets):
		if (len(eng) <= x_max) and (len(ger) <= y_max):
			bucketed_data[ix].append([eng, ger])
			break
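#Illustration only: with buckets [(5, 10), (7, 12), (11, 17), (50, 60)], a pair of a
#6-token English sentence and an 11-token German sentence fails the first bucket but
#fits (7, 12), so it is appended to bucketed_data[1]; pairs longer than (50, 60)
#match no bucket and are silently dropped.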




#9. Pass the parameters above into TensorFlow's seq2seq model. The translation_model() function lets the training model and the test model share the same variables.
def translation_model(sess, input_vocab_size, output_vocab_size, buckets, rnn_size, num_layers, max_gradient, learning_rate, lr_decay_rate, forward_only):
	model = seq2seq_model.Seq2SeqModel(input_vocab_size, output_vocab_size, buckets, rnn_size, num_layers, max_gradient, batch_size, learning_rate, lr_decay_rate, forward_only = forward_only, dtype = tf.float32)
	return model




#10. Build the training model inside a tf.variable_scope, declare its variables reusable within that scope, then build the test model, which uses a batch size of 1.
input_vocab_size = vocab_size
output_vocab_size = vocab_size
with tf.variable_scope('translate_model') as scope:
	translate_model = translation_model(sess, vocab_size, vocab_size, buckets, rnn_size, num_layers, max_gradient, learning_rate, lr_decay_rate, False)
	#reuse the training model's variables for the test model
	scope.reuse_variables()
	test_model = translation_model(sess, vocab_size, vocab_size, buckets, rnn_size, num_layers, max_gradient, learning_rate, lr_decay_rate, True)
	test_model.batch_size = 1




#11. Initialize the model variables
init = tf.global_variables_initializer()
sess.run(init)




#12. Train the seq2seq model by calling step() on each iteration. The model's get_batch() function draws a batch of sentences from a chosen bucket. Along the way, decay the learning rate, save the model, and evaluate it on the test sentences.
train_loss = []
for i in range(generations):
	rand_bucket_ix = np.random.choice(len(bucketed_data))

	model_outputs = translate_model.get_batch(bucketed_data, rand_bucket_ix)
	encoder_inputs, decoder_inputs,target_weights = model_outputs

	#Run one training step to get (gradient norm, loss, outputs)
	_, step_loss, _ = translate_model.step(sess, encoder_inputs, decoder_inputs, target_weights, rand_bucket_ix, False)

	#Report training status
	if (i + 1) % output_every == 0:
		train_loss.append(step_loss)
		print('Gen #{} out of {}. Loss: {:.4}'.format(i + 1, generations, step_loss))

	#Check whether the learning rate should be decayed
	if (i + 1) % lr_decay_every ==0:
		sess.run(translate_model.learning_rate_decay_op)

	#Save the model
	if (i + 1) % save_every == 0:
		print('Saving model to {}.'.format(full_model_dir))
		model_save_path = os.path.join(full_model_dir, "eng_ger_translation.ckpt")
		translate_model.saver.save(sess, model_save_path, global_step = i)

	#Evaluate on the test sentences
	if (i + 1) % eval_every == 0:
		for ix, sentence in enumerate(test_data):
			#Find which bucket the sentence falls into
			bucket_id = next(index for index, val in enumerate(x_maxs) if val >= len(sentence))
			#Get the model inputs for this single sentence
			encoder_inputs, decoder_inputs, target_weights = test_model.get_batch({bucket_id: [(sentence, [])]}, bucket_id)
			#Run a forward-only step to get the output logits
			_, test_loss, output_logits = test_model.step(sess,encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
			ix_output = [int(np.argmax(logit, axis = 1)) for logit in output_logits]
			#Truncate the output at the first 0 (unknown token)
			ix_output = ix_output[0:[ix for ix, x in enumerate(ix_output + [0]) if x == 0][0]]
			#Map the indices back to German words
			test_german = [ger_ix2vocab[x] for x in ix_output]
			print('English:{}'.format(test_english[ix]))
			print('German:{}'.format(test_german))

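#matplotlib.pyplot is imported above but otherwise unused; a minimal sketch for plotting
#the training loss recorded every output_every generations (the plot labels are assumptions):
loss_generations = range(output_every, generations + 1, output_every)
plt.plot(loss_generations, train_loss, 'k-')
plt.title('Sequence-to-Sequence Training Loss')
plt.xlabel('Generation')
plt.ylabel('Loss')
plt.show()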

Reposted from blog.csdn.net/kudou1994/article/details/80617318