import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
import numpy as np
import matplotlib.pyplot as plt
# Load the raw IMDB training reviews together with their sentiment labels.
# Every review is a separate .txt file stored under train/neg or train/pos.
imdb_dir = '/Users/zheng/Desktop/pythonmachinelearning/aclImdb/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')
labels = []  # 0 for a review read from 'neg', 1 for one read from 'pos'
texts = []   # raw review text; texts[k] belongs to labels[k]
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if not fname.endswith('.txt'):
            continue
        # Pass the encoding explicitly so the result does not depend on the
        # platform default (a 'gbk' default fails with UnicodeDecodeError).
        # UTF-8 keeps non-ASCII characters intact, whereas 'mac_roman' decodes
        # every byte separately and turns them into unrelated symbols.
        # The `with` block closes the file even when read() raises.
        with open(os.path.join(dir_name, fname), 'r', encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(0 if label_type == 'neg' else 1)
# --- Hyper-parameters -------------------------------------------------------
max_words = 10000            # vocabulary size handed to the Tokenizer
maxlen = 100                 # every review is padded/truncated to this length
embedding_dim = 100          # width of each embedding vector
training_samples = 200       # number of shuffled samples used for training
validation_samples = 10000   # number of shuffled samples used for validation

# --- Text -> integer sequences ----------------------------------------------
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# --- Fixed-size arrays ------------------------------------------------------
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:',labels.shape)

# --- Shuffle samples and labels with one shared permutation -----------------
# The files were read class by class, so shuffle before slicing.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]

# --- Train / validation split -----------------------------------------------
x_train, y_train = data[:training_samples], labels[:training_samples]
x_val = data[training_samples:training_samples + validation_samples]
y_val = labels[training_samples:training_samples + validation_samples]
# Parse the pretrained GloVe file into a {token: float32 vector} dictionary.
# Each line of the file is a token followed by space-separated float components.
glove_dir = '/Users/zheng/Desktop/pythonmachinelearning/glove.6B'
embeddings_index = {}
# Decode as UTF-8: with 'mac_roman' every byte becomes its own character, so
# multi-byte tokens are garbled and some bytes turn into whitespace-like
# characters that split() breaks on, which makes the float conversion fail.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on values[0]
        # Split from the right so exactly `embedding_dim` components are taken
        # and whatever precedes them is kept whole as the token.
        values = line.rstrip().rsplit(' ', embedding_dim)
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))
# Build the weight matrix for the Embedding layer: row i holds the pretrained
# vector of the word whose tokenizer index is i. Rows for words without a
# pretrained vector stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue  # index falls outside the matrix
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue  # no pretrained vector for this word
    embedding_matrix[i] = embedding_vector
# Define the classifier: embedding -> flatten -> dense(32, relu) -> dense(1, sigmoid).
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# Load the prepared embedding matrix into the first layer and freeze it before
# compiling, so training does not update these weights.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

# validation_data is passed as an (inputs, targets) tuple, the form documented
# for Model.fit; a list can be interpreted as a multi-input structure.
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val),
                    callbacks=None,
                    verbose=2)
model.save_weights('pre_trained_glove_model.h5')
# Pull the per-epoch curves recorded by model.fit.
acc, val_acc = history.history['acc'], history.history['val_acc']
loss, val_loss = history.history['loss'], history.history['val_loss']
epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

# First figure: accuracy (dots = training, line = validation).
plt.plot(epochs, acc, 'bo', label='Training_acc')
plt.plot(epochs, val_acc, 'b', label='Validation_acc')
plt.title('Training and Validation accuracy')
plt.legend()

# Second figure: loss, drawn with the same markers.
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training_loss')
plt.plot(epochs, val_loss, 'b', label='Validation_loss')
plt.title('Training and Validation loss')
plt.legend()

plt.show()
遇到的问题及解决:
f = open(os.path.join(dir_name, fname), 'r', encoding='mac_roman')
open() 报错(这里用的是内置函数 open,不是 os.open)。
解决办法:指定 encoding='mac_roman',或者尝试 encoding='utf-8' 也可以。
UnicodeDecodeError: 'gbk' codec can't decode byte 0x93 in position 130: illegal multibyte sequence
原因是所读取文本的编码不是 GBK,用 gbk 解码会失败,因此需要显式指定 encoding。
# Excerpt of the GloVe parsing loop from the main script, repeated here to
# illustrate the dtype conversion problem described in the note below.
for line in f:
# Break the line into whitespace-separated fields.
values = line.split()
# The first field is the token itself.
word = values[0]
# Turn the list of strings into a NumPy array before slicing.
values = np.array(values)
# Everything after the token is the vector, still stored as strings.
list_values = values[1:]
# Convert the string components to float32.
coefs = np.asarray(list_values, dtype='float32')
embeddings_index[word] = coefs
f.close()
遇到问题:无法把 str 转换成 dtype='float32' 的数值。原因是 line.split() 得到的是 list,其中每个元素默认都是 str。先执行 values = np.array(values),把 values[0](也就是每行的单词)取出,剩下的部分作为一维数组,即可转换为 float32。
可能是由于下载或者解压问题,40w个vector里单词的编码有计算机不识别的符号,修改了半角或者全角依然报错,索性直接手动改掉了,40w个vectors读入了399994个,剩下6个问题有待寻找…
贴一下我寻找的过程:
import numpy as np
import os
# Diagnostic pass over the GloVe file: load every well-formed line and print
# the leading token of each line that does not have the expected field count.
glove_dir = '/Users/zheng/Desktop/pythonmachinelearning/glove.6B'
expected_fields = 101  # one token followed by 100 vector components
embeddings_index = {}
# Join the directory with the file name only. The original passed an absolute
# path as the second argument, which makes os.path.join discard glove_dir.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # blank line: nothing to parse or report
        word = values[0]
        if len(values) == expected_fields:
            # A line with 101 fields always leaves 100 components, so no
            # second length check is needed.
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        else:
            # Malformed line: print its first field so it can be located.
            print(word)
print('Found %s word vectors.' % len(embeddings_index))
训练过程中出现报错:与 callbacks 相关的错误。
https://www.cnblogs.com/BlueBlueSea/p/11063712.html
在上面这篇文章里找到了报错原因,把 model.fit 的 verbose 改为 2 即可。
下面是训练的loss和acc
以上。
希望我能每次都记录下复现的错误。
扫描二维码关注公众号,回复:
10366469 查看本文章