Reproducing Deep Learning with Python, Section 6-1 (study notes)

import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
import numpy as np
import matplotlib.pyplot as plt
# Paths to the author's local copy of the raw IMDB dataset
imdb_dir = '/Users/zheng/Desktop/pythonmachinelearning/aclImdb/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []
# Read every review into `texts` and record its label (0 = negative, 1 = positive).
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            f = open(os.path.join(dir_name, fname), 'r', encoding='mac_roman')
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
maxlen = 100                  # cut reviews off after 100 words
training_samples = 200        # train on only 200 samples
validation_samples = 10000    # validate on 10,000 samples
max_words = 10000             # consider only the top 10,000 words in the dataset
embedding_dim = 100           # the GloVe vectors used below are 100-dimensional

# Tokenize the raw texts, keeping only the max_words most frequent words.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad (or truncate) every sequence to exactly maxlen tokens.
data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:',labels.shape)

# Shuffle the samples, since they are ordered (all negative reviews first, then all positive).
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples:training_samples + validation_samples]
y_val = labels[training_samples:training_samples + validation_samples]

glove_dir = '/Users/zheng/Desktop/pythonmachinelearning/glove.6B'

# Parse the GloVe file into a dict mapping each word to its 100-d vector.
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='mac_roman')
for line in f:
    values = line.split()
    word = values[0]                                 # the first token is the word itself
    coefs = np.asarray(values[1:], dtype='float32')  # the remaining 100 tokens are its coefficients
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# Build the (max_words, embedding_dim) embedding matrix.
# Words without a GloVe vector (including the reserved index 0) stay all-zeros.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# A small classifier on top of the (soon-to-be-frozen) embeddings.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# Load the pretrained GloVe weights into the Embedding layer and freeze it,
# so the embeddings are not updated during training.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val),  # pass a tuple, not a list
                    callbacks=None,
                    verbose=2)
model.save_weights('pre_trained_glove_model.h5')

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

Problems encountered and solutions:

f = open(os.path.join(dir_name, fname), 'r', encoding='mac_roman')

The open() call raised an error:
UnicodeDecodeError: 'gbk' codec can't decode byte 0x93 in position 130: illegal multibyte sequence
The cause is that the default codec ('gbk' on my system) cannot decode the content of these text files.
Fix: pass encoding='mac_roman' explicitly; encoding='utf-8' also works.
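
A minimal sketch of a more defensive reader, using only the standard library (the helper name read_review is my own, not from the book):

def read_review(path):
    # Try UTF-8 first; fall back to mac_roman, a single-byte codec
    # that can decode any byte sequence.
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, 'r', encoding='mac_roman') as f:
            return f.read()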

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

Problem encountered: the str values could not be cast to a dtype='float32' array. Each line read from the text file splits into a list whose elements are str by default; values[0], the word itself, has to be taken out first, and only the remaining numeric tokens can be converted with np.asarray(values[1:], dtype='float32').
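
A tiny illustration of the conversion (the sample line here is made up in the GloVe format, truncated to 3 dimensions):

import numpy as np

values = 'the 0.418 0.24968 -0.41242'.split()   # ['the', '0.418', '0.24968', '-0.41242']
word = values[0]                                 # 'the'
coefs = np.asarray(values[1:], dtype='float32')  # array([ 0.418, 0.24968, -0.41242], dtype=float32)

# Including the word in the cast is exactly what fails:
# np.asarray(values, dtype='float32')  ->  ValueError: could not convert string to float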

Possibly because of a download or unzip problem, a few of the 400,000 vectors contain word encodings with characters the machine does not recognize. Switching the characters between half-width and full-width still raised errors, so I simply edited them out by hand. In the end 399,994 of the 400,000 vectors were read in; the remaining 6 problems are still to be tracked down...
Here is the search process I used:

import numpy as np
import os

glove_dir = '/Users/zheng/Desktop/pythonmachinelearning/glove.6B'

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    if len(values) == 101:    # a well-formed line: the word plus 100 coefficients
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    else:                     # a malformed line: print it for inspection
        print(word)
        print(line)
f.close()
print('Found %s word vectors.' % len(embeddings_index))
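
An alternative parse I did not end up running (an assumption on my part, not verified against my copy of the file): split each line from the right, so the last 100 fields are always taken as the vector and everything before them, embedded spaces included, is treated as the word. This tolerates word tokens that contain odd whitespace:

import os
import numpy as np

embeddings_index = {}
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        parts = line.rstrip().rsplit(' ', 100)  # at most 101 parts: the word, then 100 coefficients
        if len(parts) != 101:
            print('Skipping malformed line starting with:', parts[0])
            continue
        embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')
print('Found %s word vectors.' % len(embeddings_index))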

During training there was an error related to callbacks.

The article linked below explains the cause; changing to verbose=2 was enough to fix it:

https://www.cnblogs.com/BlueBlueSea/p/11063712.html
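
For reference, a fit call that runs cleanly (callbacks=None is already the default, so the argument can simply be dropped):

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val),
                    verbose=2)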

Below are the training/validation loss and accuracy curves:

[Figure: Training and validation accuracy]
[Figure: Training and validation loss]

That's all.

I hope I can record the errors I run into every time I reproduce something.
