Python / TensorFlow 2.0 Project in Practice: Sequence-to-Sequence with an Attention Mechanism for Machine Translation

Table of Contents

1. Model introduction

2. Data source and model purpose

3. Model in practice

3.1 Importing the required libraries

3.2 Model architecture and data preprocessing

3.3 Implementing Sequence-to-Sequence with an attention mechanism

3.4 Building the loss function and training the model

3.5 Model evaluation and attention visualization

4. Summary


1. Model introduction

2. Data source and model purpose

Data source: API download address

Model purpose: build a Spanish → English translation model
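
If the dataset is not already on disk, it can be downloaded with keras.utils.get_file. The sketch below is an assumption on my part: it uses the spa-eng archive URL from the official TensorFlow NMT tutorial, so adjust the URL or path if your copy of the data lives elsewhere.

import os
import tensorflow as tf

# Assumed download location (the archive used by the official TensorFlow NMT tutorial)
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)
en_spa_file_path = os.path.join(os.path.dirname(path_to_zip),'spa-eng','spa.txt')
print(en_spa_file_path)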

3. Model in practice

3.1 Importing the required libraries

The environment used for this experiment (printed by the code below) is as follows:

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
2.0.0
sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)
matplotlib 3.3.0
numpy 1.18.5
pandas 1.1.0
sklearn 0.23.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
# Enable GPU memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__,module.__version__)

3.2 Model architecture and data preprocessing

# 1. preprocessing
# 2. build model
# 2.1 encoder
# 2.2 attention
# 2.3 decoder
# 2.4 loss & optimizer
# 2.5 train
# 3. evaluation
# 3.1 given a sentence, return the translation result
# 3.2 visualize the results (attention)
en_spa_file_path = './spa-eng/spa.txt'
import unicodedata
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c) !='Mn') # NFD splits each character into a base character plus combining marks; keep everything that is not a combining mark ('Mn'), i.e. drop accents
en_sentence = 'Then what?'
sp_sentence = '¿Entonces qué?'
print(unicode_to_ascii(en_sentence))
print(unicode_to_ascii(sp_sentence)) # the accent on the 'e' in 'qué' is removed
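
To see why filtering on category 'Mn' removes accents, here is a minimal, purely illustrative check of what NFD normalization produces for 'qué':

for c in unicodedata.normalize('NFD','qué'):
    print(repr(c),unicodedata.category(c))
# 'q' Ll, 'u' Ll, 'e' Ll, '\u0301' Mn  -- the combining accent has category 'Mn' and is dropped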


import re
def preprocess_sentence(s):
    '''Separate punctuation from the words'''
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([¿,?.!])",r" \1 ",s) # add spaces around punctuation
    s = re.sub(r"[' ']+",r" ",s) # collapse repeated spaces
    s = re.sub(r"[^a-zA-Z¿,?.!]",r" ",s) # replace everything except letters and the kept punctuation with a space
    s = s.rstrip().strip() # strip leading and trailing spaces
    s = '<start> '+s + ' <end>' # add start and end tokens
    return s

print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence))
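
Based on the rules above, the two sentences should come out roughly as follows (lowercased, accents removed, punctuation surrounded by spaces, start/end tokens added):

# <start> then what ? <end>
# <start> ¿ entonces que ? <end>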


def parse_data(filename):
    lines = open(filename,encoding="UTF-8").read().strip().split('\n')
    sentence_pairs = [line.split('\t') for line in lines]
    preprocessed_sentence_pairs =[
        (preprocess_sentence(en),preprocess_sentence(sp)) for en,sp in sentence_pairs
    ]
    return zip(*preprocessed_sentence_pairs) # return the English list and the Spanish list

en_dataset,sp_dataset = parse_data(en_spa_file_path)
print(en_dataset[-1])
print(sp_dataset[-1])


def tokenizer(lang):
    '''Convert text to id sequences'''
    lang_tokenizer = keras.preprocessing.text.Tokenizer(
        num_words=None,
        filters='',
        split=' '
    )
    lang_tokenizer.fit_on_texts(lang) # build the vocabulary
    tensor = lang_tokenizer.texts_to_sequences(lang) # convert each sentence to a sequence of ids
    tensor = keras.preprocessing.sequence.pad_sequences(tensor,padding = 'post') # pad at the end of each sentence
    return tensor,lang_tokenizer

input_tensor,input_tokenizer = tokenizer(sp_dataset[0:30000])
output_tensor,output_tokenizer = tokenizer(en_dataset[0:30000]) # use only the first 30,000 pairs to speed up training

def max_length(tensor):
    return max(len(t) for t in tensor)

max_length_input = max_length(input_tensor)
max_length_output = max_length(output_tensor)
print(max_length_input)
print(max_length_output)


from sklearn.model_selection import train_test_split # split into training and evaluation sets
input_train,input_eval,output_train,output_eval =  train_test_split(
    input_tensor,
    output_tensor,
    test_size = 0.2
)
print(len(input_train),len(input_eval),len(output_train),len(output_eval))


def convert(example,tokenizer):
    '''Check that the tokenizer works as expected'''
    for t in example:
        if t !=0:
            print('%d --> %s'%(t,tokenizer.index_word[t])) # look up each id in the vocabulary
            
convert(input_train[0],input_tokenizer)
convert(output_train[0],output_tokenizer)


def make_dataset(input_tensor,output_tensor,batch_size,epochs,shuffle):
    '''Build an efficient tf.data input pipeline'''
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor,output_tensor)
    )
    if shuffle:
        dataset = dataset.shuffle(30000)
    dataset = dataset.repeat(epochs).batch(batch_size,drop_remainder=True) # drop the last incomplete batch
    return dataset
batch_size = 64
epochs = 20
train_dataset = make_dataset(input_train,output_train,batch_size,epochs,True)
test_dataset = make_dataset(input_eval,output_eval,batch_size,1,False)
for x,y in train_dataset.take(1):
    print(x.shape,y.shape)
    print(x)
    print(y)
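
For reference, the first batch should print shapes of roughly the following form (the exact lengths depend on the padded data), followed by the two padded id matrices:

# x.shape: (64, max_length_input)   padded Spanish id matrix
# y.shape: (64, max_length_output)  padded English id matrix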


Part of the final data-preprocessing output is shown in the figure below.

3.3 Implementing Sequence-to-Sequence with an attention mechanism

embedding_units = 256
units = 1024 # output dimension of the RNN
input_vocab_size = len(input_tokenizer.word_index)+1
output_vocab_size = len(output_tokenizer.word_index)+1


# note: this model can run out of GPU memory here!
class Encoder(keras.Model):
    def __init__(self,vocab_size,embedding_units,encoding_units,batch_size):
        super(Encoder,self).__init__()
        self.batch_size = batch_size
        self.encoding_units = encoding_units
        self.embedding = keras.layers.Embedding(vocab_size,embedding_units)
        self.gru = keras.layers.GRU(self.encoding_units,return_sequences=True,return_state=True,
                                   recurrent_initializer='glorot_uniform') # GRU: an LSTM variant that couples the forget and input gates into a single update gate
        
    
    def call(self,x,hidden):
        x = self.embedding(x)
        output,state = self.gru(x,initial_state=hidden) # the output at every step and the hidden state of the last step
        return output,state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size,self.encoding_units))
    
encoder = Encoder(input_vocab_size,embedding_units,units,batch_size)
sample_hidden = encoder.initialize_hidden_state() # initialize the hidden state to zeros
sample_output,sample_hidden = encoder(x,sample_hidden)
print("sample_hidden shape ",sample_hidden.shape)
print("sample_output shape ",sample_output.shape)



class BahdanauAttention(keras.Model):
    # implements the Bahdanau (additive) attention mechanism
    def __init__(self,units):
        super(BahdanauAttention,self).__init__()
        self.W1 = keras.layers.Dense(units) # one dense layer each for the encoder outputs and the decoder hidden state
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)
        
    def call(self,decoder_hidden,encoder_outputs): # the decoder hidden state at one step, and the encoder outputs at every step
        # decoder_hidden.shape = (batch_size,units)
        # encoder_outputs.shape = (batch_size,length,units)
        # expand decoder_hidden so it can be broadcast-added to encoder_outputs
        decoder_hidden_with_time_axis = tf.expand_dims(decoder_hidden,axis=1)
        # before V: (batch_size,length,units)
        # after V: (batch_size,length,1)
        score = self.V(
                    tf.nn.tanh(
                        self.W1(encoder_outputs) + self.W2(decoder_hidden_with_time_axis)))
        # shape:(batch_size,length,1)
        attention_weights = tf.nn.softmax(score,axis=1)
        # context_vector.shape = (batch_size,length,units)
        context_vector = attention_weights*encoder_outputs # broadcast
        # (batchsize,units)
        context_vector = tf.reduce_sum(context_vector,axis=1)
        return context_vector,attention_weights
    
attention_model = BahdanauAttention(units=10)
attention_results,attention_weights = attention_model(sample_hidden,sample_output)
print("attention reuslts shape ",attention_results.shape)
print("attention weights shape ",attention_weights.shape)


class Decoder(keras.Model):
    def __init__(self,vocab_size,embedding_units,decoding_units,batch_size):
        super(Decoder,self).__init__()
        self.batch_size = batch_size
        self.decoding_units = decoding_units
        self.embedding = keras.layers.Embedding(vocab_size,embedding_units)
        self.gru = keras.layers.GRU(self.decoding_units,return_sequences=True,return_state=True,recurrent_initializer='glorot_uniform')
        self.fc = keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.decoding_units)
    
    def call(self,x,hidden,encoding_outputs): # hidden is the decoder hidden state from the previous step
        # context_v:(batch_size,units)
        context_vector,attention_weights = self.attention(hidden,encoding_outputs)
        # before embedding: x.shape = (batch_size, 1)
        # after embedding:  x.shape = (batch_size, 1, embedding_units)
        x = self.embedding(x)
        # the last-dimension sizes add up (concatenation)
        combined_x = tf.concat([
            tf.expand_dims(context_vector,axis=1),x],axis=-1) # https://blog.csdn.net/leviopku/article/details/82380118 : equivalent to concatenating along the last dimension
        # output.shape:(batchsize,1,decoding_units)
        # state.shape:batchsize,decoding_units
        output,state = self.gru(combined_x)
        # output.shape: (batch_size, decoding_units); reshaping to 2-D drops the dimension of size 1
        output = tf.reshape(output,(-1,output.shape[2]))
        # out.shape:(batchsize,vocab_size)
        output = self.fc(output)
        return output,state,attention_weights
decoder = Decoder(output_vocab_size,embedding_units,units,batch_size)
outputs = decoder(tf.random.uniform((batch_size,1)),
                 sample_hidden,sample_output)
decoder_output,decoder_hidden,decoder_aw = outputs
print("decoder_output shape ",decoder_output.shape)
print("decoder_hidden shape", decoder_hidden.shape)
print("decoder_aw shape",decoder_aw.shape)

Here I had forgotten how reshape behaves, so I wrote a small demo (it can be skipped):

test = tf.constant([[[1,1,1],[2,2,2]]])
print(test.shape)
test = tf.reshape(test,(-1,test.shape[2]))
print(test.shape)
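
This prints (1, 2, 3) followed by (2, 3): reshaping with -1 merges the leading dimensions and keeps only the last one, which is exactly what the decoder does to squeeze out the time dimension of size 1.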

3.4 Building the loss function and training the model

optimizer = keras.optimizers.Adam()
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                        reduction='none') # the decoder outputs raw logits (no softmax applied), so from_logits=True
def loss_function(real,pred):
    '''Compute the loss for a single decoding step'''
    mask = tf.math.logical_not(tf.math.equal(real,0)) # logical_not: exclude the padding positions (id 0) from the loss
    loss_ = loss_object(real,pred)
    mask = tf.cast(mask,dtype=loss_.dtype)
    loss_*=mask # keep the padding positions from polluting the loss
    return tf.reduce_mean(loss_)
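
A quick toy check of the mask (a minimal sketch; the two positions and the vocabulary size of 5 are made up for illustration) shows that the padded position is zeroed out before the mean is taken:

real = tf.constant([2,0])        # the second position is padding (id 0)
pred = tf.random.uniform((2,5))  # fake logits for 2 positions over a vocabulary of 5
print(loss_function(real,pred))  # only the first position contributes to the loss before averaging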


@tf.function
def train_step(inp,targ,encoding_hidden):
    '''Accumulate the loss over all decoding steps for one batch'''
    loss = 0
    with tf.GradientTape() as tape:
        encoding_outputs,encoding_hidden = encoder(inp,encoding_hidden) # run the encoder
        decoding_hidden = encoding_hidden
        '''
        e.g. <start> I am here <end>
        1. <start> -> I
        2. I -> am {also carries the information from <start>}
        3. am -> here
        4. here -> <end>
        '''
        for t in range(0,targ.shape[1]-1):
            decoding_input = tf.expand_dims(targ[:,t],1) # teacher forcing: the decoder input is the ground-truth token at step t, reshaped to (batch_size, 1)
            predictions,decoding_hidden,_ = decoder(
            decoding_input,decoding_hidden,encoding_outputs
            )
            )
            loss += loss_function(targ[:,t+1],predictions)
    batch_loss = loss / int(targ.shape[0])
    variables = encoder.trainable_variables+decoder.trainable_variables
    gradients = tape.gradient(loss,variables) # taking the gradient of batch_loss would also work
    optimizer.apply_gradients(zip(gradients,variables))
    return batch_loss



epochs = 10
steps_per_epoch = len(input_tensor)//batch_size

# model training
for epoch in range(epochs):
    start = time.time()
    encoding_hidden =  encoder.initialize_hidden_state()
    total_loss = 0
    for (batch,(inp,targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss  = train_step(inp,targ,encoding_hidden)
        total_loss +=batch_loss
        if batch %100==0:
            print('Epoch:{} Batch:{} Loss:{:.4f}'.format(epoch+1,batch,batch_loss.numpy()))
    print('Epoch:{} Loss:{:.4f}'.format(epoch+1,total_loss/steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time()-start))
    

Part of the training output is shown in the figure below:

3.5 Model evaluation and attention visualization

def evaluate(input_sentence):
    attention_matrix = np.zeros((max_length_output,max_length_input)) # every output step gets an attention weight over every input step
    input_sentence = preprocess_sentence(input_sentence) # preprocess the text
    inputs = [input_tokenizer.word_index[token] for token in input_sentence.split(' ')] # convert the text to ids
    inputs =keras.preprocessing.sequence.pad_sequences(
        [inputs],maxlen=max_length_input,padding='post'
    ) # padding
    inputs =tf.convert_to_tensor(inputs)
    result = ''
    # encoding_hidden = encoder.initialize_hidden_state()
    encoding_hidden = tf.zeros((1,units))
    encoding_outputs,encoding_hidden = encoder(inputs,encoding_hidden)
    decoding_hidden = encoding_hidden
    # e.g. <start> ->A
    # A ->B ->C->D
    # decoding_input.shape [1,1]
    # note the difference from training: training feeds the ground-truth targ (teacher forcing), while evaluation feeds back the previous prediction
    decoding_input = tf.expand_dims([output_tokenizer.word_index['<start>']],0)
    for t in range(max_length_output):
        predictions,decoding_hidden,attention_weights = decoder(decoding_input,decoding_hidden,encoding_outputs)
        # attention_weights.shape: (batch_size, input_length, 1) -> (1, 16, 1)
        attention_weights = tf.reshape(attention_weights,(-1,)) # flatten to a vector of length input_length (16)
        attention_matrix[t] = attention_weights.numpy()
        # predictions.shape: (batch_size, vocab_size) -> (1, 4935)
        predicted_id =tf.argmax(predictions[0]).numpy() # greedy decoding: take the id with the highest probability as the next prediction
        result += output_tokenizer.index_word[predicted_id] + ' '
        if output_tokenizer.index_word[predicted_id] =='<end>':
            return result,input_sentence,attention_matrix
        decoding_input = tf.expand_dims([predicted_id],0)
    return result,input_sentence,attention_matrix
        
def plot_attention(attention_matrix,input_sentence,predicted_sentence):
    fig  = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1,1,1)
    ax.matshow(attention_matrix,cmap='viridis') # 'viridis' colormap
    font_dict = {'fontsize':14}
    ax.set_xticklabels(['']+input_sentence,fontdict=font_dict,rotation=90)
    ax.set_yticklabels(['']+ predicted_sentence,fontdict=font_dict,)
    plt.show()
    
def translate(input_sentence):
    '''End-to-end call: translate a sentence and plot its attention'''
    results,input_sentence,attention_matrix =evaluate(input_sentence)
    print("Input: %s"%(input_sentence))
    print("Predicted translation:%s"%(results))
    
    # some results do not reach max_length_output, so the unused (padded) part of the attention matrix must also be trimmed
    attention_matrix = attention_matrix[:len(results.split(' ')),:len(input_sentence.split(' '))]
    plot_attention(attention_matrix,input_sentence.split(' '),results.split(' '))


translate(u'hace mucho calor aquí.') # words that are not in the vocabulary will raise a KeyError during the word_index lookup
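
To avoid the KeyError for out-of-vocabulary words, one option (an illustrative tweak, not part of the original code) is to skip unknown tokens when building the id list inside evaluate:

inputs = [input_tokenizer.word_index[token]
          for token in input_sentence.split(' ')
          if token in input_tokenizer.word_index] # drop tokens that are not in the vocabulary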

The output is shown in the figure below (the input sentence here was produced with Google Translate):

4. Summary

In essence, the attention mechanism is a set of weights that ties the encoder's output at every step to each decoding step, which avoids the limitation of the original Seq2Seq model of relying only on the encoder's last-step output.

Origin blog.csdn.net/weixin_40539952/article/details/108194780