[Notes] Machine Translation: Transformer Code Notes


The code in this post comes from GitHub: transformer/tf1.2_legacy at master · Kyubyong/transformer · GitHub
The author has since published updated Transformer code for newer versions of TensorFlow; these notes are based on the older (tf1.2_legacy) code.

Code 1: hyperparams.py, which defines the hyperparameters

# -*- coding: utf-8 -*-
#/usr/bin/python2
'''
June 2017 by kyubyong park. 
[email protected].
https://www.github.com/kyubyong/transformer
'''
class Hyperparams: # hyperparameters
    '''Hyperparameters'''
    # data: training and test sets
    source_train = 'corpora/train.tags.de-en.de'
    target_train = 'corpora/train.tags.de-en.en'
    source_test = 'corpora/IWSLT16.TED.tst2014.de-en.de.xml'
    target_test = 'corpora/IWSLT16.TED.tst2014.de-en.en.xml'
    
    # training
    # batch_size is a key tuning parameter
    # Mini-batch gradient descent: the data is split into batches and the parameters are updated
    # once per batch, so the examples in a batch jointly determine the gradient direction; this
    # reduces the randomness of the updates and keeps the descent from drifting.
    batch_size = 32 # alias = N. In practical MT training the batch size is usually set anywhere from 4000 to 8000; choose it case by case.
    lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step.
    # In practice the learning rate is usually scheduled dynamically, decaying from large to small
    # so the optimizer can home in on a good optimum (see the warmup-schedule sketch after this code block).
    logdir = 'logdir' # log directory
    
    # model
    maxlen = 10 # alias = T. Maximum sentence length in words; in real training this limit is usually relaxed.
                # Feel free to increase this if you are ambitious.
    # min_cnt is a tunable parameter
    min_cnt = 20 # words that occur fewer than min_cnt times are encoded as <UNK>.
    # key tuning parameter
    hidden_units = 512 # alias = C. Hidden size (model dimension).
    num_blocks = 6 # number of encoder/decoder blocks
    num_epochs = 20 # iterate over the full training set 20 times
    num_heads = 8 # number of attention heads h in multi-head attention
    dropout_rate = 0.1 # residual dropout regularization; can be increased depending on the setup
    sinusoid = False # If True, use sinusoidal positional encoding. If False, learned positional embeddings. (Not using sinusoids here.)
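
The comment on the learning rate above refers to the schedule used in the paper, where the rate depends on the global step: it warms up linearly for warmup_steps steps and then decays with the inverse square root of the step. Below is a minimal sketch of that schedule with the paper's defaults (d_model = 512, warmup_steps = 4000); this helper is written for these notes and is not code from the repo.

def noam_lr(step, d_model=512, warmup_steps=4000):
    # Learning-rate schedule from "Attention Is All You Need":
    # lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    step = max(step, 1)  # avoid division by zero at step 0
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

# The peak is reached at step == warmup_steps; with the defaults above,
# noam_lr(4000) is roughly 7e-4, after which the rate decays slowly.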

Code 2: prepro.py, which builds the vocabulary files

# -*- coding: utf-8 -*-
#/usr/bin/python2
'''
June 2017 by kyubyong park. 
[email protected].
https://www.github.com/kyubyong/transformer
'''
from __future__ import print_function # running Python 2 locally; use Python 3's print() function
from hyperparams import Hyperparams as hp # hyperparameters
import tensorflow as tf
import numpy as np
import codecs # use codecs.open() to read/write files so encoding mismatches don't raise errors
import os
import regex # regular expressions (supports Unicode properties such as \p{Latin})
from collections import Counter # Counter, for word frequencies

def make_vocab(fpath, fname): # build the vocabulary
    '''Constructs vocabulary.
    
    Args:
      fpath: A string. Input file path (the training corpus).
      fname: A string. Output file name (the vocabulary file).
    
    Writes vocabulary line by line to `preprocessed/fname`
    '''  
    text = codecs.open(fpath, 'r', 'utf-8').read() # read the file as Unicode (UTF-8)
    text = regex.sub("[^\s\p{Latin}']", "", text) # keep only whitespace, Latin-script letters, and apostrophes
    words = text.split()
    word2cnt = Counter(words) # Counter: dict-like, key = word, value = count
    if not os.path.exists('preprocessed'): os.mkdir('preprocessed') # output directory
    # with statement: no explicit close() needed, and the file is closed even if an exception occurs
    # str.format() formats the string, much like %-style formatting
    with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout:
        # write the four special tokens first
        # <PAD>: used for padding, index 0
        # <UNK>: unknown / low-frequency words, index 1
        # <S>: start-of-sentence marker, index 2
        # </S>: end-of-sentence marker, index 3
        fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<PAD>", "<UNK>", "<S>", "</S>"))
        # collections.Counter.most_common(N) returns the N most frequent words, sorted by count in descending order.
        for word, cnt in word2cnt.most_common(len(word2cnt)):
            fout.write(u"{}\t{}\n".format(word, cnt)) # write each entry as word\tcount\n

if __name__ == '__main__':
    make_vocab(hp.source_train, "de.vocab.tsv")
    make_vocab(hp.target_train, "en.vocab.tsv") # the two vocabulary files
    print("Done")
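
To make the output format concrete, here is a toy run of the same counting and writing logic on a made-up two-sentence corpus (a hypothetical example for these notes, not the actual IWSLT data):

from collections import Counter

words = u"wie geht es dir wie geht es".split()
word2cnt = Counter(words)
for word, cnt in word2cnt.most_common(len(word2cnt)):
    print(u"{}\t{}".format(word, cnt))
# Prints (ties may appear in any order):
# wie   2
# geht  2
# es    2
# dir   1
# In the real de.vocab.tsv / en.vocab.tsv these lines come after the four
# special-token lines (<PAD>, <UNK>, <S>, </S>), each written with a dummy
# count of 1000000000 so they always pass the min_cnt filter in data_load.py.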

Code 3: data_load.py, which formats the data and generates batches

# -*- coding: utf-8 -*-
#/usr/bin/python2
'''
June 2017 by kyubyong park. 
[email protected].
https://www.github.com/kyubyong/transformer
'''
from __future__ import print_function
from hyperparams import Hyperparams as hp
import tensorflow as tf
import numpy as np
import codecs
import regex

# convert the vocabulary files into dicts, dropping low-frequency words
def load_de_vocab(): # German
    # splitlines() splits the file into lines (\n, \r, \r\n); line.split()[0] splits each line (one word and one count) and takes element 0, the word; a word is kept only if its count is at least the min_cnt set in hyperparams.py
    vocab = [line.split()[0] for line in codecs.open('preprocessed/de.vocab.tsv', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt]
    word2idx = {word: idx for idx, word in enumerate(vocab)} # store the words in a dict, assigning each word an index
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word

def load_en_vocab(): # English
    vocab = [line.split()[0] for line in codecs.open('preprocessed/en.vocab.tsv', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word
'''
Example (illustrative; these entries come from a Chinese vocabulary, but the structure is the
same for the German/English vocabularies built above):
word2idx = {'<PAD>': 0, '<UNK>': 1, '<S>': 2, '</S>': 3, '有': 4,
            '的': 5, '`': 6, '-': 7, '卦': 8, '八': 9, ..., '爬': 1642, 'U': 1643}
idx2word = {0: '<PAD>', 1: '<UNK>', 2: '<S>', 3: '</S>', 4: '有',
            5: '的', 6: '`', 7: '-', 8: '卦', 9: '八', ..., 1642: '爬', 1643: 'U'}
'''

# data processing
def create_data(source_sents, target_sents): # source_sents: list of source-language sentences; target_sents: list of target-language sentences
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    
    # Index
    x_list, y_list, Sources, Targets = [], [], [], []
    for source_sent, target_sent in zip(source_sents, target_sents): # zip() iterates over the two sentence lists in parallel
        # x, y: one new (ID-encoded) sentence each
        x = [de2idx.get(word, 1) for word in (source_sent + u" </S>").split()] # 1: OOV, </S>: End of Text
        y = [en2idx.get(word, 1) for word in (target_sent + u" </S>").split()] # append the end-of-sentence marker </S> to each sentence, then map every word to its ID in word2idx; a word missing from word2idx is mapped to ID 1, i.e. <UNK>, yielding a new 'ID sentence'
        if max(len(x), len(y)) <= hp.maxlen: # the maximum sentence length set in hyperparams.py
            x_list.append(np.array(x)) # source-language ID sentence
            y_list.append(np.array(y)) # target-language ID sentence
            Sources.append(source_sent) # source-language sentence (text)
            Targets.append(target_sent) # target-language sentence (text)
            # sentence pairs exceeding the length threshold are dropped
    
    # Pad with 0, the index of the special token <PAD>
    X = np.zeros([len(x_list), hp.maxlen], np.int32) # 2-D zero matrix: number of sentences x maximum sentence length
    Y = np.zeros([len(y_list), hp.maxlen], np.int32)
    for i, (x, y) in enumerate(zip(x_list, y_list)):
        # make every ID sentence the same length
        X[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) # pad each ID sentence with 0 zeros on the left and hp.maxlen-len(x) zeros on the right; 0 is the index of the special token <PAD>
        Y[i] = np.lib.pad(y, [0, hp.maxlen-len(y)], 'constant', constant_values=(0, 0))
    
    return X, Y, Sources, Targets # X and Y have shape [len(x_list), hp.maxlen]; Sources and Targets are lists of length len(x_list)
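
# Worked example with hp.maxlen = 10: a source sentence such as "wie geht es dir"
# (assuming all four words are in the vocabulary) becomes
#   x = [de2idx['wie'], de2idx['geht'], de2idx['es'], de2idx['dir'], 3]   # 3 = </S>
# and after padding X[i] holds those five IDs followed by five <PAD> zeros.
# Note: np.lib.pad is the same function as np.pad; newer code calls np.pad directly.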

# load the training set, process it, and return fixed-length ID sentences
def load_train_data():
    de_sents = [regex.sub("[^\s\p{Latin}']", "", line) for line in codecs.open(hp.source_train, 'r', 'utf-8').read().split("\n") if line and line[0] != "<"] # skip XML/meta lines (lines starting with "<")
    en_sents = [regex.sub("[^\s\p{Latin}']", "", line) for line in codecs.open(hp.target_train, 'r', 'utf-8').read().split("\n") if line and line[0] != "<"]
    
    X, Y, Sources, Targets = create_data(de_sents, en_sents)
    return X, Y

# load the test set, process it, and return fixed-length ID sentences
def load_test_data():
    def _refine(line):
        line = regex.sub("<[^>]+>", "", line) # remove all <...> tags
        line = regex.sub("[^\s\p{Latin}']", "", line) 
        return line.strip() # strip leading/trailing whitespace
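    # e.g. a test-file line like '<seg id="1"> Wie geht es dir? </seg>' (hypothetical
    # sentence) becomes 'Wie geht es dir' after _refine: the tags are removed first,
    # then digits and punctuation are stripped, and the result is trimmed.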
    # read the source- and target-language test files, split them into lines, and keep (after cleaning) only the lines whose first 4 characters are "<seg"
    de_sents = [_refine(line) for line in codecs.open(hp.source_test, 'r', 'utf-8').read().split("\n") if line and line[:4] == "<seg"]
    en_sents = [_refine(line) for line in codecs.open(hp.target_test, 'r', 'utf-8').read().split("\n") if line and line[:4] == "<seg"]
        
    X, Y, Sources, Targets = create_data(de_sents, en_sents) # process into fixed-length ID sentences
    return X, Sources, Targets # (1064, 150). For the test set the target side is kept as raw sentences, not converted to ID sentences.

# generate batch data; each run of the returned ops yields one batch
def get_batch_data():
    # Load data
    X, Y = load_train_data() # fixed-length ID sentences for the training set

    # calc total batch count
    num_batch = len(X) // hp.batch_size # number of batches needed to cover the whole data set; len(X) is the number of rows/sentences
    
    # Convert to tensor
    # convert the Python/NumPy data into TensorFlow tensors
    X = tf.convert_to_tensor(X, tf.int32)
    Y = tf.convert_to_tensor(Y, tf.int32)
    
    # Create Queues
    # create the input queues
    # slice_input_producer takes one slice of the input tensors at a time and puts it into an input queue; that queue is then passed to tf.train.shuffle_batch to produce one batch of data.
    input_queues = tf.train.slice_input_producer([X, Y])
            
    # create batch queues
    # dequeue from the head while new data is enqueued at the tail, shuffling the order; num_threads: number of enqueuing threads; batch_size: number of examples dequeued per batch;
    # capacity: maximum number of elements in the queue; min_after_dequeue: minimum number of elements kept in the queue (for good shuffling); allow_smaller_final_batch=False: a final batch smaller than batch_size is not emitted
    x, y = tf.train.shuffle_batch(input_queues,
                                num_threads=8,
                                batch_size=hp.batch_size, 
                                capacity=hp.batch_size*64,   
                                min_after_dequeue=hp.batch_size*32, 
                                allow_smaller_final_batch=False)
    # Note: newer TensorFlow code usually builds this kind of input pipeline with the tf.data API instead (see the sketch after this code block).
    return x, y, num_batch # x and y each have shape (N, T), where N is the batch_size and T is the maximum sentence length maxlen; one batch
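
As the last comment says, tf.train.slice_input_producer and tf.train.shuffle_batch are queue-based APIs that were later deprecated in favor of tf.data. Below is a minimal sketch of a roughly equivalent pipeline, written for these notes rather than taken from the repo, and assuming a TensorFlow release new enough to support tf.data with drop_remainder (roughly TF 1.10+ or TF 2.x).

import tensorflow as tf
from hyperparams import Hyperparams as hp
from data_load import load_train_data

def get_batch_dataset():
    X, Y = load_train_data()  # NumPy arrays of shape (num_sentences, hp.maxlen)
    num_batch = len(X) // hp.batch_size
    dataset = (tf.data.Dataset.from_tensor_slices((X, Y))
               .shuffle(buffer_size=hp.batch_size * 32)    # roughly mirrors min_after_dequeue
               .repeat()                                    # cycle through the data across epochs
               .batch(hp.batch_size, drop_remainder=True))  # mirrors allow_smaller_final_batch=False
    return dataset, num_batch

# In graph-mode TF 1.x one would then pull tensors from the dataset with
#   x, y = dataset.make_one_shot_iterator().get_next()
# which play the same role as the x, y returned by get_batch_data() above.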

Reposted from blog.csdn.net/nyist_yangguang/article/details/122724962