吴裕雄--天生自然神经网络与深度学习实战Python+Keras+TensorFlow:使用训练好的单词向量实现新闻摘要分类

import pandas as pd
df = pd.read_json('/Users/chenyi/Documents/News_Category_Dataset.json', lines=True)
df.head()
df.category = df.category.map(lambda x:"WORLDPOST" if x == "THE WORLDPOST" else x)
categories = df.groupby('category')
print("total categories: ", categories.ngroups)
print(categories.size())

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence, one_hot
df['text'] = df.headline + " " + df.short_description

# 将单词进行标号
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
X = tokenizer.texts_to_sequences(df.text)
df['words'] = X

#记录每条数据的单词数
df['word_length'] = df.words.apply(lambda i: len(i))
#清除单词数不足5个的数据条目
df = df[df.word_length >= 5]
df.word_length.describe()

maxlen = 50
X = list(sequence.pad_sequences(df.words, maxlen=maxlen))

# 将分类进行编号
categories = df.groupby('category').size().index.tolist()
category_int = {}
int_category = {}
for i, k in enumerate(categories):
    category_int.update({k:i})
    int_category.update({i:k})

df['c2id'] = df['category'].apply(lambda x: category_int[x])
import numpy as np
import keras.utils as utils
from sklearn.model_selection import train_test_split
import numpy as np



X = np.array(X)
Y = utils.to_categorical(list(df.c2id))


# 将数据分成两部分,80%用于训练,20%用于测试

seed = 29
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)
word_index = tokenizer.word_index

EMBEDDING_DIM = 100
embeddings_index = {}
f = open('/Users/chenyi/Documents/glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' %len(embeddings_index))
from keras.initializers import Constant

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    #根据单词挑选出对应向量
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
     
embedding_layer = Embedding(len(word_index)+1, EMBEDDING_DIM, 
                           embeddings_initializer=Constant(embedding_matrix),
                           input_length = maxlen,
                            trainable=False
                           )
model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
model.add(layers.Dense(64, activation='relu'))
#当结果是输出多个分类的概率时,用softmax激活函数,它将为30个分类提供不同的可能性概率值
model.add(layers.Dense(len(int_category), activation='softmax'))

#对于输出多个分类结果,最好的损失函数是categorical_crossentropy
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(x_train, y_train, epochs=20, validation_data=(x_val, y_val), batch_size=512)

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

plt.title('Training and validation accuracy')
plt.plot(epochs, acc, 'red', label='Training acc')
plt.plot(epochs, val_acc, 'blue', label='Validation acc')
plt.legend()

plt.figure()
plt.title('Training and validation loss')
plt.plot(epochs, loss, 'red', label='Training loss')
plt.plot(epochs, val_loss, 'blue', label='Validation loss')
plt.legend()

plt.show()

 

猜你喜欢

转载自www.cnblogs.com/tszr/p/12237900.html
今日推荐