Natural Language Processing -- Character-Level LSTM Modeling with Keras

Words carry meaning, but when we model language, much of that meaning is hidden down in the characters. Intonation, alliteration, rhyme: all of these can be modeled if we break text apart to the character level. Humans do not need to decompose language this finely in order to model it, but the definitions that emerge from such modeling are complex and not easily taught to a machine, which is why the character level is worth discussing. By looking at which character follows which in the text we have seen, we can discover many of the text's inherent patterns. In this paradigm, a space, a comma, or a period simply becomes just another character.
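
To make the "which character follows which" idea concrete, here is a minimal sketch (the sample string and the counting code are illustrative additions, not part of the original listing) that counts character bigrams while treating spaces and punctuation as ordinary tokens:

from collections import Counter

text = "the cat sat on the mat."
chars = list(text)                        # every character, including spaces and '.', is a token
bigrams = Counter(zip(chars, chars[1:]))  # count which character follows which
print(bigrams.most_common(3))             # ('a', 't') is the most frequent pair here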

Now let's try an LSTM at the character level on the IMDB movie review dataset:

import numpy as np
import glob
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only the first GPU to TensorFlow
from random import shuffle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten, LSTM
from tensorflow.keras.models import model_from_json
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Set the random seeds for NumPy and TensorFlow so the results are reproducible
np.random.seed(123)
tf.set_random_seed(123)

# Load the data
# Document loading and preprocessing
def pre_process_data(filepath):
    """
    This is dependent on your training data source but we will
    try to generalize it as best as possible.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 0
    dataset = []
    # glob is a handy filename-matching library: glob.glob() returns a list of every path
    # under the given directory that matches the pattern, so we can collect files by rule.
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r', encoding='UTF-8') as f:
            dataset.append((pos_label, f.read()))

    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r', encoding='UTF-8') as f:
            dataset.append((neg_label, f.read()))

    shuffle(dataset)

    return dataset

# Target labels
def collect_expected(dataset):
    """ Peel off the target values from the dataset """
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

# Compute the average sample length to decide how far to unroll the network
def avg_len(data):
    total_len = 0
    for sample in data:
        total_len += len(sample[1])
    return total_len/len(data)

# Prepare the strings for the character-based model: strip out token data that is not part of the text's natural language
def clean_data(data):
    """Shift to lower case, replace unknowns with UNK, and listify"""
    new_data = []
    VALID = 'abcdefghijklmnopqrstuvwxyz0123456789"\'?!.,:; '
    for sample in data:
        new_sample = []
        for char in sample[1].lower():
            if char in VALID:
                new_sample.append(char)
            else:
                new_sample.append('UNK')
        new_data.append(new_sample)
    return new_data
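# Note: 'UNK' is appended as a single list element (one token), not as the three characters 'U', 'N', 'K'.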

# Pad and truncate the character sequences
def char_pad_trunc(data, maxlen=1500):
    """ We truncate to maxlen or add in PAD tokens """
    new_dataset = []
    for sample in data:
        if len(sample) > maxlen:
            new_data = sample[:maxlen]
        elif len(sample) < maxlen:
            pads = maxlen - len(sample)
            new_data = sample + ['PAD'] * pads
        else:
            new_data = sample
        new_dataset.append(new_data)
    return new_dataset

# Use one-hot encoded characters instead of Word2vec
# The "dictionary" for the character-based model
def create_dicts(data):
    """ Modified from Keras LSTM example"""
    chars = set()
    for sample in data:
        chars.update(set(sample))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    return char_indices, indices_char

# One-hot encoding of the characters
def onehot_encode(dataset, char_indices, maxlen=1500):
    """
    One-hot encode the tokens

    Args:
        dataset list of lists of tokens
        char_indices
            dictionary of {key=character, value=index to use encoding vector}
        maxlen int Length of each sample
    Return:
        np array of shape (samples, tokens, encoding length)
     """
    X = np.zeros((len(dataset), maxlen, len(char_indices.keys())))
    print("len(char_indices.keys()):", len(char_indices.keys()))
    for i, sentence in enumerate(dataset):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
    return X
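# For a toy vocabulary char_indices = {'a': 0, 'b': 1, 'PAD': 2} and maxlen=3, the padded
# sample ['a', 'b', 'PAD'] encodes to the matrix [[1, 0, 0], [0, 1, 0], [0, 0, 1]].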

# Load and preprocess the IMDB data
dataset = pre_process_data('xxx\\aclImdb\\train')  # 'xxx' is a placeholder: point this at your local aclImdb directory
maxlen = 1500
# [...]
expected = collect_expected(dataset)
print(avg_len(dataset))
listified_data = clean_data(dataset)
common_length_data = char_pad_trunc(listified_data, maxlen=1500)
char_indices, indices_char = create_dicts(common_length_data)
encoded_data = onehot_encode(common_length_data, char_indices, 1500)

# Split the dataset into a training set (80%) and a test set (20%)
split_point = int(len(encoded_data)*.8)
x_train = encoded_data[:split_point]
y_train = expected[:split_point]
x_test = encoded_data[split_point:]
y_test = expected[split_point:]

# Shape: number of samples × sequence length × length of each one-hot character vector
x_train = np.reshape(x_train, (len(x_train), maxlen, len(char_indices.keys())))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, len(char_indices.keys())))
y_test = np.array(y_test)

# Build the character-based LSTM network
num_neurons = 40

model = Sequential()
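# return_sequences=True makes the LSTM emit its 40-dimensional hidden state at every one of
# the 1500 time steps; Flatten then turns that (1500, 40) output into a single 60,000-element
# vector for the sigmoid classifier.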
model.add(LSTM(num_neurons, return_sequences=True, input_shape=(maxlen, len(char_indices.keys()))))
model.add(Dropout(.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train the character-based LSTM network
batch_size = 32
epochs = 10
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
            validation_data=(x_test, y_test))

# Save the model
model_structure = model.to_json()
with open("char_lstm_model3.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("char_lstm_weights3.h5")
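
The saved architecture and weights can be loaded back later. Here is a minimal sketch, assuming the two files written above sit in the current working directory (model_from_json is already imported at the top of the listing):

# Rebuild the model from the saved JSON structure and the HDF5 weights
with open("char_lstm_model3.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights("char_lstm_weights3.h5")
# Re-compile before evaluating or continuing to train
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])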

Reposted from blog.csdn.net/fgg1234567890/article/details/113532702