自然语言处理--Keras 实现循环神经网络分类 IMDB 电影评论数据集

那么为什么要使用 RNN 呢？其实并不一定要选择循环神经网络：与前馈网络或卷积神经网络相比，它训练和预测新样本的成本相对较高（循环层的计算成本较高）。
但是循环网络在记忆能力方面具有特殊优势，即能够记住句子中出现过的词，这是进入 NLP 乃至所有其他序列数据这一更大世界的起点。
结构：
在这里插入图片描述
import numpy as np
import glob
import os
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
from nlpia.loaders import get_data

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN
from keras.models import model_from_json
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Seed the random number generators this script relies on so that results
# are consistent between runs: NumPy, TensorFlow, and the stdlib `random`
# module, whose `shuffle` is what mixes the samples in pre_process_data().
# NOTE: glob.glob() returns files in arbitrary order, so the sample order can
# still differ between machines/filesystems even with a fixed seed.
import random

random.seed(123)
np.random.seed(123)
tf.set_random_seed(123)

# Load the pre-trained Google News word2vec vectors used to vectorize tokens
word_vectors = KeyedVectors.load_word2vec_format('xxx\\googlenews-vectors-negative300.bin.gz',
                                                   binary=True)

# Data loading
# Read and label the review documents found on disk
def pre_process_data(filepath):
    """
    Load the labelled reviews stored under ``filepath``.

    Every ``*.txt`` file in ``filepath/pos`` is read as a positive sample
    (label 1) and every ``*.txt`` file in ``filepath/neg`` as a negative
    sample (label 0). Files are read as UTF-8.

    Returns a shuffled list of ``(label, text)`` tuples.
    """
    # (sub-directory, label) pairs, processed in this order
    labelled_dirs = (('pos', 1), ('neg', 0))

    dataset = []
    for subdir, label in labelled_dirs:
        # glob.glob() returns, as a list, every path matching the pattern
        pattern = os.path.join(filepath, subdir, '*.txt')
        for path in glob.glob(pattern):
            with open(path, 'r', encoding='UTF-8') as handle:
                dataset.append((label, handle.read()))

    # Mix positive and negative samples in place
    shuffle(dataset)
    return dataset

'''
这里有一些信息损失。谷歌新闻的 Word2vec 词汇表中只包含了一部分停用词，
这使得很多像“a”这样的常用词在函数处理过程中被丢弃。这个处理不是特别理想，不过大家可
以通过这个方法得到一个信息有损失情况下的循环神经网络的基线性能。如果想要避免信息损
失，大家可以单独训练 word2vec 模型，以确保有更好的向量覆盖率。另外，数据中还有很多类
似于 <br /> 的 HTML 标签，它们通常与文本的情感无关，应当从数据中去除（下面的代码并未执行这一步）。
'''
# Tokenize the data and turn the tokens into word vectors
def tokenize_and_vectorize(dataset):
    """
    Convert every sample of ``dataset`` into a list of word vectors.

    The text (element 1 of each sample) is split with NLTK's
    TreebankWordTokenizer and each token is looked up in the module-level
    ``word_vectors``. Tokens whose lookup raises KeyError are dropped.

    Returns one list of vectors per sample, in input order.
    """
    tokenizer = TreebankWordTokenizer()

    def vectors_for(tokens):
        # Collect the vectors of the tokens that exist in the vocabulary
        found = []
        for word in tokens:
            try:
                vector = word_vectors[word]
            except KeyError:
                # No matching token in the Google w2v vocab
                continue
            found.append(vector)
        return found

    return [vectors_for(tokenizer.tokenize(sample[1])) for sample in dataset]

# Target labels
def collect_expected(dataset):
    """ Return the target value (element 0) of every sample, in order """
    return [sample[0] for sample in dataset]

# Load the labelled training reviews
dataset = pre_process_data('xxx\\aclImdb\\train')
# vectorized_data layout: [[[word vector], [], ...], [[], [], ...], ...]
vectorized_data = tokenize_and_vectorize(dataset)
# expected layout: [...]
expected = collect_expected(dataset)

# Train/test split: the first 80% of the samples train, the rest test
split_point = int(0.8 * len(vectorized_data))
x_train, x_test = vectorized_data[:split_point], vectorized_data[split_point:]
y_train, y_test = expected[:split_point], expected[split_point:]

# RNN parameters
# maxlen sets the maximum length of a review. The RNN has to be unrolled
# 400 times, which may lead to vanishing-gradient problems, but even so the
# network is very effective.
maxlen = 400
# Number of samples fed to the network before the error is back-propagated
# and the weights are updated
batch_size = 32
# Length of a word vector
embedding_dims = 300
# Number of passes of the whole training set through the network
epochs = 3

# Pad or truncate the token sequences; sequences that are too short are
# padded with all-zero word vectors.
# Recurrent networks do not usually need padding or truncation, because
# they can process input sequences of any length.
def pad_trunc(data, maxlen):
    """
    For a given dataset pad with zero vectors or truncate to maxlen.

    Args:
        data: list of samples, each sample being a list of word vectors
            that all have the same length.
        maxlen: number of vectors every returned sample must contain.

    Returns:
        A new list holding one new list per sample, each exactly ``maxlen``
        vectors long. The samples in ``data`` are never modified. All padding
        slots refer to one shared zero-vector object, so treat the padding
        as read-only.

    Raises:
        ValueError: if a sample needs padding but every sample in ``data``
            is empty, so the word-vector length cannot be determined.
    """
    # Take the word-vector length from the first sample that has a vector;
    # the first sample itself may be empty when none of its tokens were in
    # the vocabulary.
    vector_len = next(
        (len(sample[0]) for sample in data if len(sample) > 0), None)
    zero_vector = None if vector_len is None else [0.0] * vector_len

    new_data = []
    for sample in data:
        missing = maxlen - len(sample)
        if missing > 0 and zero_vector is None:
            raise ValueError(
                "cannot pad: every sample is empty, word-vector length unknown")
        # Slicing copies, so the caller's sample is left untouched; a
        # non-positive `missing` contributes no padding.
        padded = list(sample[:maxlen])
        padded.extend([zero_vector] * max(missing, 0))
        new_data.append(padded)
    return new_data

# Pad/truncate the samples, then convert everything to NumPy arrays so it
# can be used with Keras. The inputs end up shaped
# (number of samples, sequence length, word-vector length).
x_train = pad_trunc(x_train, maxlen)
x_train = np.asarray(x_train).reshape((len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)

x_test = pad_trunc(x_test, maxlen)
x_test = np.asarray(x_test).reshape((len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

# RNN
# Number of units in the recurrent layer
num_neurons = 50
# Start from an empty Keras network: Sequential() is the standard layered model
model = Sequential()

# Keras handles the details of assembling the network; we only have to add
# the layers we want. First, a recurrent layer.
# With return_sequences=True the layer returns its output at every time
# step: 400 vectors of 50 dimensions each. With return_sequences=False it
# would return only the 50-dimensional vector of the last time step.
model.add(SimpleRNN(num_neurons, return_sequences=True,
                    input_shape=(maxlen, embedding_dims)))
# Add a dropout layer
model.add(Dropout(.2))
# Flatten() turns the 400 x 50 tensor into one 20,000-element vector
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# Compile the recurrent network
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
# model.summary() prints the model's internals itself and returns None, so
# it is called directly; wrapping it in print() only adds a stray "None".
model.summary()

# Train the model; the held-out split is passed as validation data
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

# Save the architecture as JSON and the learned weights as HDF5
model_structure = model.to_json()
with open("simplernn_model1.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("simplernn_weights1.h5")

# Prediction
sample_1 = "I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."
# Rebuild the model from the saved architecture and weights
with open("simplernn_model1.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights('simplernn_weights1.h5')
# tokenize_and_vectorize() only reads the text, so the label 1 is a dummy
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
# Sequential.predict_classes() was removed from newer Keras/TensorFlow
# releases; thresholding the sigmoid output at 0.5 gives the same class ids.
print((model.predict(test_vec) > 0.5).astype("int32"))
自然语言处理--Keras 实现循环神经网络分类 IMDB 电影评论数据集

猜你喜欢