自然语言处理--使用序列到序列网络（编码-解码架构）构建一个简单聊天机器人

聊天机器人采用编码-解码架构。该架构的前半部分是序列编码器，它将序列（如自然语言文本）转换为较低维的表示形式；后半部分是序列解码器，它将这一表示重新转换回人类可读的文本。
结构图：
在这里插入图片描述
from nlpia.loaders import get_data
from tensorflow.compat.v1.keras.layers import CuDNNLSTM as LSTM
from keras.models import Model
from keras.layers import Input, Dense
import numpy as np

# Prepare the training corpus.
df = get_data('moviedialog')
print(df.head())
input_texts, target_texts = [], []
input_vocabulary = set()
output_vocabulary = set()
# Start and stop tokens that wrap every target sequence.
start_token = '\t'
stop_token = '\n'
# max_training_samples is the number of rows used for training: the smaller
# of the configured cap and the number of rows loaded from the dataset.
max_training_samples = min(25000, len(df) - 1)
# Actually apply the cap. Previously the limit was computed but never used,
# so every row was consumed and the one-hot tensors built from these lists
# grew with the whole corpus.
# NOTE(review): assumes df.statement / df.reply are pandas Series (df is used
# with .head() above) - confirm against nlpia.loaders.get_data.
training_pairs = zip(df.statement.iloc[:max_training_samples],
                     df.reply.iloc[:max_training_samples])
for input_text, target_text in training_pairs:
    # The target text must be wrapped in the start and stop tokens.
    target_text = start_token + target_text + stop_token
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds every character of the string; duplicates are ignored.
    input_vocabulary.update(input_text)
    output_vocabulary.update(target_text)

# Character-level sequence-to-sequence model parameters.
input_vocabulary = sorted(input_vocabulary)
output_vocabulary = sorted(output_vocabulary)
input_vocab_size = len(input_vocabulary)
output_vocab_size = len(output_vocabulary)
# Largest number of tokens (characters) in any input / target sequence.
max_encoder_seq_length = max(len(text) for text in input_texts)
max_decoder_seq_length = max(len(text) for text in target_texts)
# Character -> index lookup tables.
input_token_index = {char: idx for idx, char in enumerate(input_vocabulary)}
target_token_index = {char: idx for idx, char in enumerate(output_vocabulary)}
# Index -> character (reverse) lookup tables.
reverse_input_char_index = {idx: char for char, idx in input_token_index.items()}
reverse_target_char_index = {idx: char for char, idx in target_token_index.items()}
# Allocate the one-hot encoded training tensors (all zeros to start with).
num_samples = len(input_texts)
encoder_input_data = np.zeros(
    (num_samples, max_encoder_seq_length, input_vocab_size), dtype='float32')
decoder_input_data = np.zeros(
    (num_samples, max_decoder_seq_length, output_vocab_size), dtype='float32')
decoder_target_data = np.zeros(
    (num_samples, max_decoder_seq_length, output_vocab_size), dtype='float32')
# For the decoder we fill both decoder_input_data and decoder_target_data;
# the target is the input shifted one time step earlier.
for sample, (source, target) in enumerate(zip(input_texts, target_texts)):
    for step, char in enumerate(source):
        encoder_input_data[sample, step, input_token_index[char]] = 1.
    for step, char in enumerate(target):
        token = target_token_index[char]
        decoder_input_data[sample, step, token] = 1.
        if step > 0:
            # The target at step - 1 is the character fed in at this step.
            decoder_target_data[sample, step - 1, token] = 1

# Build and train a chatbot based on a sequence-to-sequence network.
batch_size = 64
epochs = 100
num_neurons = 256

# Thought encoder in Keras: only the final LSTM states are kept.
encoder_inputs = Input(shape=(None, input_vocab_size))
encoder = LSTM(units=num_neurons, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Thought decoder in Keras: starts from the encoder states and returns the
# full output sequence.
decoder_inputs = Input(shape=(None, output_vocab_size))
decoder_lstm = LSTM(units=num_neurons, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Dense softmax layer over the output vocabulary.
decoder_dense = Dense(units=output_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Keras functional API (Model()): when there are several inputs or outputs,
# they are passed as lists.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

# Train the sequence-to-sequence model. The training input is a list: its
# first element is fed to the encoder network, the second to the decoder.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

# Assemble the sequence-generation (reply generator) models.
# To generate replies with the trained network, the trained layers are
# re-wired into separate encoder and decoder models.

# Standalone encoder: calling predict on this model returns the thought
# vector (the encoder LSTM states) for an input text.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Standalone decoder (sequence generator) whose initial state is supplied
# through two extra inputs.
thought_input = [Input(shape=(num_neurons,)) for _ in range(2)]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=thought_input)
# The updated LSTM states are returned so the caller can feed them back in
# on the next iteration.
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *thought_input],
    outputs=[decoder_outputs, *decoder_states],
)

# Predict the output sequence.
# decode_sequence drives the trained models to generate a reply sequence.
def decode_sequence(input_seq):
    """Generate a reply, one character at a time, for an encoded input.

    The encoder model turns ``input_seq`` into the initial decoder states.
    Starting from the start token, each predicted character is fed back in
    as the next decoder input together with the updated LSTM states.
    Generation ends once the stop token is produced or the reply grows
    longer than ``max_decoder_seq_length``.

    Returns the generated characters joined into one string; a produced
    stop token is included.
    """
    # The thought vector (encoder states) seeds the decoder.
    states = encoder_model.predict(input_seq)
    # The first decoder input is the one-hot encoded start token.
    next_input = np.zeros((1, 1, output_vocab_size))
    next_input[0, 0, target_token_index[start_token]] = 1.
    generated_chars = []
    while True:
        # Feed the latest token and states to the decoder to predict the
        # next element of the sequence.
        output_tokens, h, c = decoder_model.predict([next_input] + states)
        token_idx = np.argmax(output_tokens[0, -1, :])
        char = reverse_target_char_index[token_idx]
        generated_chars.append(char)
        if char == stop_token or len(generated_chars) > max_decoder_seq_length:
            break
        # The token just generated becomes the next decoder input.
        next_input = np.zeros((1, 1, output_vocab_size))
        next_input[0, 0, token_idx] = 1.
        # Carry the updated states into the next step.
        states = [h, c]
    return ''.join(generated_chars)

# Generate a reply.
def response(input_text):
    """One-hot encode ``input_text``, decode a reply, print it and return it.

    Characters missing from ``input_token_index`` are skipped (their time
    step stays all-zero) instead of raising ``KeyError``, and text longer
    than ``max_encoder_seq_length`` is truncated instead of raising
    ``IndexError`` when writing past the end of the buffer.

    Returns the decoded sentence produced by ``decode_sequence``.
    """
    input_seq = np.zeros((1, max_encoder_seq_length, input_vocab_size), dtype='float32')
    # The buffer only has max_encoder_seq_length time steps, so anything
    # beyond that cannot be encoded.
    for t, char in enumerate(input_text[:max_encoder_seq_length]):
        token_idx = input_token_index.get(char)
        if token_idx is not None:
            input_seq[0, t, token_idx] = 1.
    decoded_sentence = decode_sequence(input_seq)
    print('Bot Reply (Decoded sentence):', decoded_sentence)
    return decoded_sentence

# Chat with the bot: send each sample prompt in turn.
for prompt in (
    "what is the internet?",
    "why?",
    "do you like coffee?",
    "do you like football?",
):
    response(prompt)
自然语言处理--使用序列到序列网络（编码-解码架构）构建一个简单聊天机器人

猜你喜欢