keras新闻分类：多分类问题

本文链接： https://blog.csdn.net/yitian1585531/article/details/86558354

from keras.datasets import reuters
import keras
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Activation
from keras.models import Sequential


# Vocabulary size: only the max_words most frequent words are kept.
max_words = 10_000
# Number of samples per batch.
batch_size = 10
# Number of training epochs.
epochs = 5

print('Loading data...')
# Fetch the train and test sets: num_words keeps only the most frequent
# words, test_split is the fraction of samples held out for testing.
train_data, test_data = reuters.load_data(
    num_words=max_words,
    test_split=0.2,
)
x_train, y_train = train_data
x_test, y_test = test_data
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Labels are assumed to be 0-based integer class ids, so the number of
# classes is the largest id plus one.
num_classes = y_train.max() + 1
print(num_classes, 'classes')

# Vectorize the sequence data.
# Tokenizer is a utility for vectorizing text / converting text to sequences;
# here it is only used to turn the word-index sequences into a matrix.
# Background on the encoding: https://blog.csdn.net/lovebyz/article/details/77712003
tokenizer = Tokenizer(num_words=max_words)
# Binary mode: one row per sample, one column per word of the max_words
# vocabulary; an entry is 1 if that word occurs in the article, else 0.
x_train, x_test = (
    tokenizer.sequences_to_matrix(sequences, mode='binary')
    for sequences in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# One-hot encode the topic labels of both splits.
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

# Model: two ReLU hidden layers, each followed by dropout, and a softmax
# output layer with one unit per class.
model = Sequential([
    Dense(512, activation='relu', input_shape=(max_words,)),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])
model.summary()

# Categorical cross-entropy pairs with the one-hot encoded labels.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train, holding out 10% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)

# Evaluate on the test set; with one metric compiled in, the result is
# the loss followed by the accuracy.
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
test_loss, test_accuracy = score

print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

#http://www.sharewen.com/article/20171205231801.html
#https://blog.csdn.net/cskywit/article/details/80900093

#模型评估，数据预处理，特征工程，过拟合

#监督学习，无监督学习，自监督学习，强化学习

机器学习流程：
#1.定义问题，收集数据，验证，
#2.选择衡量成功的指标：精度、准确率、召回率，或 ROC AUC
#3.评估方法
#4.数据格式化，张量形式，标准化
#5.模型
#6.扩大模型规模，开发过拟合的模型：优化与泛化的对立，过拟合与欠拟合，容量不足与容量过大之间的权衡
#7.正则化与调节超参数

深度学习最全优化方法总结比较（SGD，Adagrad，Adadelta，Adam，Adamax，Nadam）

源参考

keras新闻分类：多分类问题

猜你喜欢