Wu Yuxiong - Natural Neural Networks and Deep Learning in Practice with Python + Keras + TensorFlow: Classifying News Topics with a Neural Network

import pandas as pd

df = pd.read_json('/Users/chenyi/Documents/News_Category_Dataset.json', lines=True)
df.head()

categories = df.groupby('category')
print("total categories:", categories.ngroups)
print(categories.size())

# The dataset labels the same topic as both "WORLDPOST" and "THE WORLDPOST";
# merge them into a single category, then recount.
df.category = df.category.map(lambda x: "WORLDPOST" if x == "THE WORLDPOST" else x)
categories = df.groupby('category')
print("total categories:", categories.ngroups)
print(categories.size())

from keras.preprocessing.text import Tokenizer
df['text'] = df.headline + " " + df.short_description

# Assign an integer index to each word
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
X = tokenizer.texts_to_sequences(df.text)
df['words'] = X
# Record the word count of each entry
df['word_length'] = df.words.apply(lambda i: len(i))
# Drop entries with fewer than 5 words
df = df[df.word_length >= 5]
df.word_length.describe()

# Count the occurrences of each word id across all sequences
def word2Frequent(sequences):
    word_index = {}
    for sequence in sequences:
        for word in sequence:
            word_index[word] = word_index.get(word, 0) + 1
    return word_index
word_index = word2Frequent(df.words)


count = 10000
# Sort the words by frequency in descending order; s[0] is the most frequent word
s = [(k, word_index[k]) for k in sorted(word_index, key=word_index.get, reverse=True)]
print(s[0])
# Map the 10,000 most frequent words to positions 0..9999 (most frequent word gets 9999)
frequent_to_index = {}
for i in range(count):
    frequent_to_index[s[i][0]] = 9999 - i

# Assign an integer id to each category
categories = df.groupby('category').size().index.tolist()
category_int = {}
int_category = {}
for i, k in enumerate(categories):
    category_int.update({k:i})
    int_category.update({i:k})

df['c2id'] = df['category'].apply(lambda x: category_int[x])
import numpy as np
import keras.utils as utils
from sklearn.model_selection import train_test_split

def vectorize_sequences(sequences, dimension=10000):
    # Multi-hot encode: set column j to 1 when the sequence contains
    # the word whose frequency rank maps to position j
    results = np.zeros((len(sequences), dimension))
    for i in range(len(sequences)):
        for word in sequences[i]:
            if frequent_to_index.get(word, None) is not None:
                pos = frequent_to_index[word]
                results[i, pos] = 1.0   
    return results

X = np.array(df.words)
X = vectorize_sequences(X)
print(X[0])
Y = utils.to_categorical(list(df.c2id))


# Split the data into two parts: 80% for training, 20% for testing

seed = 29
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(64, activation='relu'))
# When the result is a probability over multiple classes, use the softmax activation; it yields a probability value for each of the 30 categories
model.add(layers.Dense(len(int_category), activation='softmax'))

# For multi-class classification the appropriate loss function is categorical_crossentropy
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(x_train, y_train, epochs=20, validation_data=(x_val, y_val), batch_size=512)

import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.xlabel('Epochs')
plt.ylabel('acc')
plt.legend()
plt.show()

from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
# A deliberately small hidden layer: squeezing the 10,000-dimensional input
# through only 4 units creates an information bottleneck
model.add(layers.Dense(4, activation='relu'))
# Softmax again yields one probability per category, as before
model.add(layers.Dense(len(int_category), activation='softmax'))

# For multi-class classification the appropriate loss function is categorical_crossentropy
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(x_train, y_train, epochs=20, batch_size=512)

results = model.evaluate(x_val, y_val)
print(results)

 


Reposted from: www.cnblogs.com/tszr/p/12232595.html