自然语言处理--Keras 实现LSTM循环神经网络分类 IMDB 电影评论数据集

LSTM 对于循环网络的每一层都引入了状态（state）的概念，状态作为网络的记忆（memory）。但什么是记忆呢？记忆将由一个向量来表示，这个向量与元胞中神经元的元素数量相同。记忆单元将是一个由 n个元素长的浮点数（float）向量。

LSTM循环神经网络展开后的结构：
在这里插入图片描述
其中某个时刻LSTM单元的结构：
包含3 个门：遗忘门、输入/候选门和输出门

有了 LSTM，模型可以开始学习人类习以为常和在潜意识层面上处理的语言模式。有了这些模式，我们不仅可以更精确地预测样本类别，还可以开始使用这些语言模式生成新的文本。

现使用包含lstm的循环神经网络分类 IMDB 电影评论数据集：
该算法的美妙之处在于，通过“记忆”它可以学习看到的词条之间的关系。网络现在能够对这些关系建模，尤其是在我们提供的代价函数的上下文中。

import numpy as np
import glob
import os
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
from nlpia.loaders import get_data

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM
from keras.models import model_from_json
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# 为NumPy和TensorFlow设置随机种子以确保可以得到一致的结果
np.random.seed(123)
tf.set_random_seed(123)

# 向量化及分词器
word_vectors = KeyedVectors.load_word2vec_format('xxx\\googlenews-vectors-negative300.bin.gz',
                                                   binary=True)

# 加载数据
# 文档加载预处理
def pre_process_data(filepath):
    """
    This is dependent on your training data source but we will
    try to generalize it as best as possible.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 0
    dataset = []
    # glob是实用的文件名匹配库，glob.glob()函数将会匹配给定路径下的所有pattern，并以列表形式返回。
    # 用它可以查找符合特定规则的文件路径名
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r', encoding='UTF-8') as f:
            dataset.append((pos_label, f.read()))

    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r', encoding='UTF-8') as f:
            dataset.append((neg_label, f.read()))

    shuffle(dataset)

    return dataset

'''
这里有一些信息损失。谷歌新闻的 Word2vec 词汇表中只包含了一部分停用词，
使很多像“a”这样的常用词将在函数处理过程中被丢弃，这个处理不是特别理想，不过大家可
以通过这个方法得到一个信息有损失情况下的卷积神经网络的基线性能。如果想要避免信息损
失，大家可以单独训练 word2vec 模型，以确保有更好的向量覆盖率。另外，数据中还有很多类
似于<br\>的 HTML 标签，它们通常与文本的情感无关，必须从数据中去除。
'''
# 数据分词和向量化的方法
def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass # No matching token in the Google w2v vocab

        vectorized_data.append(sample_vecs)

    return vectorized_data

# 目标标签
def collect_expected(dataset):
    """ Peel off the target values from the dataset """
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

# 分析填充或截断分别样本数
def test_len(data, maxlen):
    total_len = truncated = exact = padded = 0
    for sample in data:
        total_len += len(sample)
        if len(sample) > maxlen:
            truncated += 1
        elif len(sample) < maxlen:
            padded += 1
        else:
            exact +=1
    print('Padded: {}'.format(padded))
    print('Equal: {}'.format(exact))
    print('Truncated: {}'.format(truncated))
    print('Avg length: {}'.format(total_len/len(data)))

dataset = pre_process_data('xxx\\aclImdb\\train')
# vectorized_data结构：[[[词向量], [], ...], [[], [], ...], ...]
vectorized_data = tokenize_and_vectorize(dataset)
# [...]
expected = collect_expected(dataset)
# 分析填充或截断分别样本数，以决定maxlen大小
print(test_len(vectorized_data, 400))

# 划分训练集/测试集
split_point = int(len(vectorized_data)*.8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

# RNN 参数
# maxlen 变量用于设置评论的最大长度，
# 需要展开RNN网络 400 次，可能发生梯度消失相关的问题，但是即使这样，这个网络也是非常有效的
maxlen = 205
# 在后向传播误差和更新权重前，向网络输入的样本数量
batch_size = 32
# 词向量的长度
embedding_dims = 300
# 整个训练数据集在网络中的传入次数
epochs = 2

# 填充及截断词条序列，长度不够的填充元素为0的词向量，
# 通常我们不需要对循环神经网络使用填充或截断，
# 因为它们可以处理任意长度的输入序列
def pad_trunc(data, maxlen):
    """
    For a given dataset pad with zero vectors or truncate to maxlen
    """
    new_data = []
    # Create a vector of 0s the length of our word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)

    for sample in data:
        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            # Append the appropriate number 0 vectors to the list
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        # 最后将扩展后的数据放在扩展数据列表的最后
        new_data.append(temp)
    return new_data

# 收集经过扩展和截断的数据，并将其转换为 numpy 数组，以便在 Keras 中使用
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)
# 大小为样本数量×序列长度×词向量长度
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

# RNN
# 初始化一个空的 Keras 网络
num_neurons = 50
# 标准的分层模型-Sequential()（分层的）模型
model = Sequential()

# 神奇的 Keras 处理了组装神经网络的各个复杂环节：我们只需要将想要
# 的循环层添加到我们的网络中
# 添加一个循环层
# 这一层的输出将是一个 400 个元素的向量，其中每个元素都是一个 50 个元素的向量
# return_sequences：True则每个时刻都要返回网络输出,输出为400 个向量，每个向量为 50 维
# return_sequences：False则只返回最后一个时刻的 50 维向量
model.add(LSTM( num_neurons, return_sequences=True,
                     input_shape=(maxlen, embedding_dims)))
# 添加一个 dropout 层
model.add(Dropout(.2))
# 网络层 Flatten()将输入从 400 × 50 的张量扁平化为一个长度为 20 000 个元素的向量
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# 编译循环神经网络
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
# model.summary():用于审察模型内部情况
print("模型摘要:\n", model.summary())

# 训练并保存模型
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

model_structure = model.to_json()
with open("lstm_model1.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("lstm_weights1.h5")

# 预测
# 加载模型
with open("lstm_model1.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights('lstm_weights1.h5')

sample_1 = "I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? " \
           "The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
print("Sample's sentiment, 1 - pos, 2 - neg : {}".format(model.predict_classes(test_vec)))
print("Raw output of sigmoid function: {}".format(model.predict(test_vec)))

自然语言处理--Keras 实现LSTM循环神经网络分类 IMDB 电影评论数据集

猜你喜欢