Words carry meaning, but when you model language, much of that meaning hides inside the characters themselves. Tone of voice, alliteration, rhyme: you can model all of these if you break text down to the character level. Humans don't need to decompose language this finely in order to model it, but the definitions that emerge from such modeling are complex and not easy to teach to a machine, which is why we discuss it here. For the characters we have seen, looking at which character follows which in the text reveals many of the text's inherent patterns. In this paradigm, a space, a comma, or a period becomes just another character.
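To make the pattern idea concrete, here is a minimal sketch (illustrative only, not part of the listing that follows) that counts character bigrams in a made-up snippet of text; note that the space and the comma are counted like any other character:
from collections import Counter

text = "the cat sat, the cat sat."
# every adjacent pair of characters, punctuation and spaces included
bigrams = Counter(zip(text, text[1:]))
print(bigrams.most_common(3))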
Now let's try an LSTM at the character level, on the IMDB movie review dataset:
import numpy as np
import glob
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from random import shuffle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten, LSTM
from tensorflow.keras.models import model_from_json
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
# Set the random seeds for NumPy and TensorFlow so that the results are reproducible
np.random.seed(123)
tf.set_random_seed(123)
# Load the data
# Document loading and preprocessing
def pre_process_data(filepath):
    """
    This is dependent on your training data source but we will
    try to generalize it as best as possible.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 0
    dataset = []
    # glob is a handy filename-matching library: glob.glob() matches the given
    # pattern under a path and returns the results as a list, so you can find
    # all file paths that follow a particular rule
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r', encoding='UTF-8') as f:
            dataset.append((pos_label, f.read()))
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r', encoding='UTF-8') as f:
            dataset.append((neg_label, f.read()))
    shuffle(dataset)
    return dataset
# Target labels
def collect_expected(dataset):
    """ Peel off the target values from the dataset """
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected
# Compute the average sample length, to decide how far to unroll the network
def avg_len(data):
    total_len = 0
    for sample in data:
        total_len += len(sample[1])
    return total_len / len(data)
# Prepare the strings for a character-based model, removing tokens that are not part of the text's natural language
def clean_data(data):
    """Shift to lower case, replace unknowns with UNK, and listify"""
    new_data = []
    VALID = 'abcdefghijklmnopqrstuvwxyz0123456789"\'?!.,:; '
    for sample in data:
        new_sample = []
        for char in sample[1].lower():
            if char in VALID:
                new_sample.append(char)
            else:
                new_sample.append('UNK')
        new_data.append(new_sample)
    return new_data
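Note that 'UNK' is a single multi-character token: this works because each sample is "listified" into a list of tokens rather than kept as a string. A quick illustrative check (the review text here is made up):
print(clean_data([(1, 'Great movie: 9/10')])[0])
# ['g', 'r', 'e', 'a', 't', ' ', 'm', 'o', 'v', 'i', 'e', ':', ' ', '9', 'UNK', '1', '0']
# the '/' falls outside VALID and becomes the single token 'UNK'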
# Pad and truncate the character sequences
def char_pad_trunc(data, maxlen=1500):
    """ We truncate to maxlen or add in PAD tokens """
    new_dataset = []
    for sample in data:
        if len(sample) > maxlen:
            new_data = sample[:maxlen]
        elif len(sample) < maxlen:
            pads = maxlen - len(sample)
            new_data = sample + ['PAD'] * pads
        else:
            new_data = sample
        new_dataset.append(new_data)
    return new_dataset
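For instance, with a made-up two-token sample and a maxlen of 5:
print(char_pad_trunc([['h', 'i']], maxlen=5))
# [['h', 'i', 'PAD', 'PAD', 'PAD']]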
# Use one-hot encoding for the characters instead of Word2vec
# The "dictionary" for the character-based model
def create_dicts(data):
    """ Modified from Keras LSTM example"""
    chars = set()
    for sample in data:
        chars.update(set(sample))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    return char_indices, indices_char
# One-hot encoding of the characters
def onehot_encode(dataset, char_indices, maxlen=1500):
    """
    One-hot encode the tokens
    Args:
        dataset  list of lists of tokens
        char_indices  dictionary of {key=character, value=index to use encoding vector}
        maxlen  int  Length of each sample
    Return:
        np array of shape (samples, tokens, encoding length)
    """
    X = np.zeros((len(dataset), maxlen, len(char_indices.keys())))
    print("len(char_indices.keys()):", len(char_indices.keys()))
    for i, sentence in enumerate(dataset):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
    return X
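To see the resulting shape in miniature, here is an illustrative toy example (two made-up token lists, maxlen of 3):
toy = [['a', 'b', 'PAD'], ['b', 'a', 'PAD']]
toy_indices, _ = create_dicts(toy)
print(onehot_encode(toy, toy_indices, maxlen=3).shape)
# (2, 3, 3): samples x tokens x encoding length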
# Load and preprocess the IMDB data
dataset = pre_process_data('xxx\\aclImdb\\train')
maxlen = 1500
# [...]
expected = collect_expected(dataset)
print(avg_len(dataset))
listified_data = clean_data(dataset)
common_length_data = char_pad_trunc(listified_data, maxlen=1500)
char_indices, indices_char = create_dicts(common_length_data)
encoded_data = onehot_encode(common_length_data, char_indices, 1500)
# Split the dataset into a training set (80%) and a test set (20%)
split_point = int(len(encoded_data)*.8)
x_train = encoded_data[:split_point]
y_train = expected[:split_point]
x_test = encoded_data[split_point:]
y_test = expected[split_point:]
# The shape is number of samples x sequence length x one-hot encoding length
# (the slices already have this shape, so the reshape below is a safety net)
x_train = np.reshape(x_train, (len(x_train), maxlen, len(char_indices.keys())))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, len(char_indices.keys())))
y_test = np.array(y_test)
# Build the character-based LSTM network
num_neurons = 40
model = Sequential()
model.add(LSTM(num_neurons, return_sequences=True, input_shape=(maxlen, len(char_indices.keys()))))
model.add(Dropout(.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
print(model.summary())
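As a sanity check on the summary, assume the vocabulary works out to 47 symbols (the 45 characters in VALID plus the UNK and PAD tokens; the printout from onehot_encode confirms the actual number). The LSTM layer then has 4 × ((47 + 40 + 1) × 40) = 14,080 parameters, and the Dense layer after the Flatten has 1500 × 40 + 1 = 60,001.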
# Train the character-based LSTM network
batch_size = 32
epochs = 10
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(x_test, y_test))
# Save the model
model_structure = model.to_json()
with open("char_lstm_model3.json", "w") as json_file:
json_file.write(model_structure)
model.save_weights("char_lstm_weights3.h5")
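The model_from_json import at the top of the listing is there for loading the model back later. A minimal sketch of that round trip, using the same file names:
with open("char_lstm_model3.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights("char_lstm_weights3.h5")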