Procesamiento del lenguaje natural: implementación con Keras de una red convolucional unidimensional para construir un clasificador de sentimientos sobre el conjunto de datos de reseñas de películas de IMDB

¿Por qué elegir una CNN para las tareas de clasificación de PLN?
1. Una red neuronal convolucional (CNN) puede procesar el texto como si fuera una imagen y "entenderlo".
2. Su principal ventaja es la alta eficiencia.
3. En muchos sentidos, las limitaciones impuestas por la capa de agrupación (pooling) y por el tamaño del kernel de convolución (aunque este puede hacerse más grande) hacen que se descarte información, pero eso no significa que no sean modelos útiles: una CNN puede detectar y predecir sentimientos de manera eficaz en conjuntos de datos relativamente grandes.
4. Aunque aquí se apoya en las incrustaciones de palabras de Word2vec, una CNN también puede funcionar con incrustaciones mucho menos ricas, que no cubran todo el idioma.

import numpy as np
# 处理填充输入数据的辅助模块
from keras.preprocessing import sequence
# 基础的 Keras 神经网络模型
from keras.models import Sequential
# 模型中常用的层对象
from keras.layers import Dense, Dropout, Activation
# 卷积层和池化
from keras.layers import Conv1D, GlobalMaxPooling1D
import glob
import os
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
from nlpia.loaders import get_data
from keras.models import model_from_json
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Seed the NumPy and TensorFlow random number generators so that results are consistent between runs
np.random.seed(123)
tf.set_random_seed(123)

# Load the data
# Document loading and preprocessing
def pre_process_data(filepath):
    """
    Load labelled review texts from the ``pos`` and ``neg`` subfolders.

    Every ``*.txt`` file under ``<filepath>/pos`` is read as a positive
    sample (label 1) and every ``*.txt`` file under ``<filepath>/neg`` as a
    negative sample (label 0). Files are decoded as UTF-8.

    Returns a list of ``(label, text)`` tuples in shuffled order.
    """
    # Positive folder first, then negative, each paired with its label.
    labelled_dirs = ((1, 'pos'), (0, 'neg'))
    samples = []
    for label, subdir in labelled_dirs:
        # glob.glob() returns, as a list, every path matching the pattern.
        pattern = os.path.join(filepath, subdir, '*.txt')
        for path in glob.glob(pattern):
            with open(path, 'r', encoding='UTF-8') as handle:
                samples.append((label, handle.read()))

    # Shuffle in place so samples are no longer grouped by label.
    shuffle(samples)
    return samples

# NOTE(review): the 'xxx' prefix in both paths below looks like a placeholder — confirm the real locations
dataset = pre_process_data('xxx\\aclImdb\\aclImdb\\train')
# The first element of each tuple is the sentiment target: 1 means positive, 0 means negative
print(dataset[0])

# Vectorization and tokenizer
word_vectors = KeyedVectors.load_word2vec_format('xxx\\googlenews-vectors-negative300.bin.gz',
                                                   binary=True)

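'''
Some information is lost here. The Google News Word2vec vocabulary contains only some
of the stop words, so many common words such as "a" are dropped while this function runs.
That is not ideal, but it gives a baseline for how a convolutional neural network performs
on lossy input. To avoid the loss, train a separate word2vec model so that the vector
coverage is better. The data also contains many HTML tags such as <br />; they are usually
unrelated to the sentiment of the text and should be removed from the data.
'''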
def tokenize_and_vectorize(dataset):
    """
    Convert the text of every sample into a list of word vectors.

    Each element of ``dataset`` is indexed as ``sample[1]`` to obtain its
    text; ``sample[0]`` is not read here. The text is split with NLTK's
    ``TreebankWordTokenizer`` and every token is looked up in the
    module-level ``word_vectors``. Tokens whose lookup raises ``KeyError``
    are dropped, so a sample may yield fewer vectors than tokens, or none.

    Returns a list with one list of vectors per sample, in input order.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(sample[1]):
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                # No matching token in the Google w2v vocab: skip it.
                continue
        vectorized_data.append(sample_vecs)

    return vectorized_data

# Target labels
def collect_expected(dataset):
    """Return the target value (element 0) of every sample, in input order."""
    return [entry[0] for entry in dataset]

# Structure of vectorized_data: [[[word vector], [], ...], [[], [], ...], ...]
vectorized_data = tokenize_and_vectorize(dataset)
# expected is a flat list holding one label per sample: [...]
expected = collect_expected(dataset)

# Train/test split: the first 80% of the samples are used for training, the rest for testing
split_point = int(len(vectorized_data)*.8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

# CNN parameters
# maxlen sets the maximum length of a review.
# Every input to the convolutional network must have the same dimensions, so samples
# longer than 400 tokens are truncated and samples shorter than 400 tokens are padded;
# the padding value can be Null or 0
maxlen = 400
# Number of samples fed to the network before backpropagating the error and updating the weights
batch_size = 32
# Length of the token vectors passed into the convolutional network
embedding_dims = 300
# Number of convolution kernels (filters) to train
filters = 250
# Kernel size: each kernel is a matrix of embedding_dims × kernel_size,
# which here is 300 × 3
kernel_size = 3
# Number of neurons in the plain feed-forward layer at the end of the chain
hidden_dims = 250
# Number of times the whole training set is passed through the network
epochs = 4

# Pad and truncate the token sequences; sequences that are too short are padded with all-zero word vectors
def pad_trunc(data, maxlen, vector_len=None):
    """
    Return a copy of ``data`` in which every sample holds exactly ``maxlen`` vectors.

    Samples longer than ``maxlen`` are truncated; shorter samples are padded
    at the end with zero vectors. The input samples are never modified: each
    returned sample is a new list.

    Args:
        data: sequence of samples, each a sequence of word vectors.
        maxlen: number of vectors every returned sample must contain.
        vector_len: length of the zero vectors used for padding. When omitted
            it is taken from the first vector of the first non-empty sample.

    Returns:
        A list of lists of vectors. All padding slots reference the same
        zero-vector list object.

    Raises:
        ValueError: if padding is required but the vector length is unknown
            (no sample contains a vector and ``vector_len`` was not given).
    """
    if vector_len is None:
        # The first sample may be empty (no token had a vector), so look for
        # the first sample that actually contains one.
        for sample in data:
            if len(sample) > 0:
                vector_len = len(sample[0])
                break

    zero_vector = None if vector_len is None else [0.0] * vector_len

    new_data = []
    for sample in data:
        # Slicing plus list() copies the sample, so padding below cannot
        # leak back into the caller's data.
        temp = list(sample[:maxlen])
        missing = maxlen - len(temp)
        if missing > 0:
            if zero_vector is None:
                raise ValueError(
                    'cannot pad: no sample contains a vector; pass vector_len')
            temp.extend([zero_vector] * missing)
        new_data.append(temp)
    return new_data

# Collect the padded and truncated data and convert it to numpy arrays for use in Keras
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)
# Shape: number of samples × sequence length × word vector length
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

# Convolutional neural network
# Build a one-dimensional CNN
print('Build model...')
# Standard way of defining a model in Keras
model = Sequential()
# Convolution layer: `filters` kernels of width `kernel_size`, stride 1, 'valid' padding,
# ReLU activation, applied to inputs of shape (maxlen, embedding_dims)
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1,
                 input_shape=(maxlen, embedding_dims)))
# Available pooling options are GlobalMaxPooling1D(), MaxPooling1D(n) or AveragePooling1D(n),
# where n is the size of the pooling window and defaults to 2
# Global max pooling
model.add(GlobalMaxPooling1D())
# Fully connected layer with dropout
# Start with a plain fully connected hidden layer, then add dropout and ReLU
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# Output layer, which is the actual classifier
model.add(Dense(1))
model.add(Activation('sigmoid'))

# Compile the CNN into its initial, untrained state
# compile() finishes building the model
# loss options include binary_crossentropy and categorical_crossentropy...
# optimizer options include stochastic gradient descent, Adam and RMSprop...
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train the CNN; fit() performs the training:
# backpropagating the error of each sample learns the weights between the last
# convolution kernels and the feed-forward fully connected network,
# as well as the weights of each of the 250 different kernels
# batch_size: number of samples processed before backpropagation updates the weights.
# The accumulated error of the n samples in a batch is handled together

# When the training loss keeps decreasing while the validation loss val_loss at the end of an
# epoch starts to increase compared with the previous epoch, the model is clearly overfitting.
# Finding the point where the validation loss curve starts to bend upward is key to a good model.
# Note that the held-out test split (x_test, y_test) is what is passed as validation data here.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

# Save the model
# Store the model structure in a JSON file and the trained weights in a separate
# file so that the model can be re-instantiated later
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("cnn_weights.h5")

# Load the saved model: rebuild the structure from JSON, then restore the weights
with open("cnn_model.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights('cnn_weights.h5')

# Prediction
# Sample text to classify
sample_1 = "I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."
# The label in the tuple (1) is a dummy value: tokenize_and_vectorize only reads the text.
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
# Run the network once and reuse the probabilities for both printouts.
probabilities = model.predict(test_vec)
print(probabilities)
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single sigmoid
# output, thresholding the probability at 0.5 yields the same class labels.
print((probabilities > 0.5).astype('int32'))

Procesamiento del lenguaje natural: implementación con Keras de una red convolucional unidimensional para construir un clasificador de sentimientos sobre el conjunto de datos de reseñas de películas de IMDB

Supongo que te gusta