Natural language processing: character-level LSTM modeling with Keras

Words have meaning, but when we model language, much of that meaning hides at the character level. Phonetic intonation, alliteration, prosody: if we break text down to the character level, we can model all of these. Humans do not need this level of detail to model language, but the definitions that come out of such modeling are very complex and hard to hand over to a machine, which is why we take this approach. Once we look at which character follows which character in a text, we can find many of the patterns inherent in that text. In this paradigm, a space, comma, or period simply becomes just another character.
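
As a rough, standalone illustration of that idea (a minimal sketch using a made-up sentence, not part of the Keras example below), we can count how often each character follows another:

# Count which character follows which in a small piece of text
from collections import Counter

text = "the cat sat on the mat."
bigram_counts = Counter(zip(text, text[1:]))   # pairs of (current char, next char)
print(bigram_counts.most_common(3))            # the three most frequent character pairs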

Let us now try an LSTM at the character level on the IMDB movie review dataset:

import numpy as np
import glob
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from random import shuffle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten, LSTM
from tensorflow.keras.models import model_from_json
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Set random seeds for NumPy and TensorFlow so that results are reproducible
np.random.seed(123)
tf.set_random_seed(123)

# Load the data
# Document loading and preprocessing
def pre_process_data(filepath):
    """
    This is dependent on your training data source but we will
    try to generalize it as best as possible.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 0
    dataset = []
    # glob is a handy filename-matching library: glob.glob() matches every file
    # under the given path that fits the pattern and returns the paths as a list
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r', encoding='UTF-8') as f:
            dataset.append((pos_label, f.read()))

    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r', encoding='UTF-8') as f:
            dataset.append((neg_label, f.read()))

    shuffle(dataset)

    return dataset

# Target labels
def collect_expected(dataset):
    """ Peel off the target values from the dataset """
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

# Compute the average sample length to decide how far to unroll the network
def avg_len(data):
    total_len = 0
    for sample in data:
        total_len += len(sample[1])
    return total_len/len(data)

# Prepare the strings for the character-based model, removing tokens that have nothing to do with the text's natural language
def clean_data(data):
    """Shift to lower case, replace unknowns with UNK, and listify"""
    new_data = []
    VALID = 'abcdefghijklmnopqrstuvwxyz0123456789"\'?!.,:; '
    for sample in data:
        new_sample = []
        for char in sample[1].lower():
            if char in VALID:
                new_sample.append(char)
            else:
                new_sample.append('UNK')
        new_data.append(new_sample)
    return new_data

# Pad and truncate the character sequences
def char_pad_trunc(data, maxlen=1500):
    """ We truncate to maxlen or add in PAD tokens """
    new_dataset = []
    for sample in data:
        if len(sample) > maxlen:
            new_data = sample[:maxlen]
        elif len(sample) < maxlen:
            pads = maxlen - len(sample)
            new_data = sample + ['PAD'] * pads
        else:
            new_data = sample
        new_dataset.append(new_data)
    return new_dataset

# One-hot encode the characters instead of using Word2vec
# The "dictionary" for the character-based model
def create_dicts(data):
    """ Modified from Keras LSTM example"""
    chars = set()
    for sample in data:
        chars.update(set(sample))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    return char_indices, indices_char

# One-hot encoding of the characters
def onehot_encode(dataset, char_indices, maxlen=1500):
    """
    One-hot encode the tokens

    Args:
        dataset list of lists of tokens
        char_indices
            dictionary of {key=character, value=index to use encoding vector}
        maxlen int Length of each sample
    Return:
        np array of shape (samples, tokens, encoding length)
     """
    X = np.zeros((len(dataset), maxlen, len(char_indices.keys())))
    print("len(char_indices.keys()):", len(char_indices.keys()))
    for i, sentence in enumerate(dataset):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
    return X

# Load and preprocess the IMDB data
dataset = pre_process_data('xxx\\aclImdb\\train')
maxlen = 1500
# [...]
expected = collect_expected(dataset)
print(avg_len(dataset))
listified_data = clean_data(dataset)
common_length_data = char_pad_trunc(listified_data, maxlen=1500)
char_indices, indices_char = create_dicts(common_length_data)
encoded_data = onehot_encode(common_length_data, char_indices, 1500)

# Split the dataset into a training set (80%) and a test set (20%)
split_point = int(len(encoded_data)*.8)
x_train = encoded_data[:split_point]
y_train = expected[:split_point]
x_test = encoded_data[split_point:]
y_test = expected[split_point:]

# Shape: number of samples x sequence length x one-hot encoding length
x_train = np.reshape(x_train, (len(x_train), maxlen, len(char_indices.keys())))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, len(char_indices.keys())))
y_test = np.array(y_test)

# Build the character-based LSTM network
num_neurons = 40

model = Sequential()
model.add(LSTM(num_neurons, return_sequences=True, input_shape=(maxlen, len(char_indices.keys()))))
model.add(Dropout(.2))
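# return_sequences=True above emits the LSTM output at every time step (maxlen x num_neurons),
# so Flatten() below collapses that sequence into a single vector for the Dense output layer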
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
print(model.summary())

# Train the character-based LSTM network
batch_size = 32
epochs = 10
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
            validation_data=(x_test, y_test))

# Save the model
model_structure = model.to_json()
with open("char_lstm_model3.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("char_lstm_weights3.h5")
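
The model_from_json import at the top is only needed when loading the model back in. A minimal sketch of reloading the saved structure and weights (assuming the two files written above) could look like this:

# Reload the saved model structure and weights (sketch; file names match those saved above)
with open("char_lstm_model3.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights("char_lstm_weights3.h5")
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])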
