《机器学习Python实践》第三章——第一个机器学习项目

第三章——第一个机器学习项目
一个机器学习项目的步骤:
1)导入数据;
2)概述数据;
3)数据可视化;
4)评估算法;
5)实施预测。

导入类库

# Third-party imports for loading, plotting, model selection and evaluation.
# scatter_matrix is imported from pandas.plotting: the top-level
# `pandas.scatter_matrix` alias was deprecated and later removed, so
# `from pandas import scatter_matrix` fails on current pandas releases.
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

1)导入数据

# Load the CSV file; it has no header row, so column labels are supplied here.
# The measured flower parts are sepals and petals ("separ" was a typo).
filename = 'iris.data.csv'
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(filename, names=names)

2)概述数据

从以下四个方面概述数据:数据的维度、数据本身(前几行)、所有数据特征的统计描述、数据分类的分布情况。

# Dimensions of the data set (rows, columns).
row_count, column_count = dataset.shape
print('数据维度:行%s,列 %s' % (row_count, column_count))

# First 10 rows of the raw data.
first_rows = dataset.head(10)
print(first_rows)

# Summary statistics per feature (count, median, min, max, quartiles, ...).
summary = dataset.describe()
print(summary)

# Number of rows in each class.
class_counts = dataset.groupby('class').size()
print(class_counts)

3)数据可视化

# Box-and-whisker plots: one subplot per column on a 2x2 grid, with
# independent x and y axes for each subplot.
box_options = dict(
    kind='box',
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
)
dataset.plot(**box_options)
pyplot.show()

# Histograms of the columns.
dataset.hist()
pyplot.show()

# Scatter-plot matrix of the columns.
scatter_matrix(dataset)
pyplot.show()

4)评估算法

1)分离出评估数据集
2)采用10折交叉验证来评估算法模型
3)生成6个不同的模型来预测新数据
4)选择最优模型。

分离出评估数据

# Split the data set: 80% for training, 20% held out for validation.
validation_size = 0.2
seed = 7

# The first four columns are the features; the fifth column is the label.
array = dataset.values
X, Y = array[:, 0:4], array[:, 4]

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

创建模型、10折交叉验证

# Candidate algorithms to spot-check, keyed by a short display name.
# All estimators are created with their default parameters.
models = {
    'LR': LogisticRegression(),
    'LDA': LinearDiscriminantAnalysis(),
    'KNN': KNeighborsClassifier(),
    'CART': DecisionTreeClassifier(),
    'NB': GaussianNB(),
    'SVM': SVC(),
}

# Evaluate every candidate with 10-fold cross-validation on the training set.
# random_state only has an effect when shuffle=True; recent scikit-learn
# releases raise a ValueError if it is set while shuffle is left at False.
# The splitter does not depend on the model, so it is built once and reused,
# which also gives every model the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = []
for key, model in models.items():
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    # Report the mean accuracy and its standard deviation across the folds.
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))

选择最优模型:比较上面各算法交叉验证的平均准确度和标准差,选出表现最好的模型(文末的完整代码中还用箱线图对比了各算法的结果)。

5)实施预测

选取最优的模型进行预测

# Fit the chosen model on the training set and score it on the held-out
# validation set.
svm = SVC()
svm.fit(X_train, Y_train)
predictions = svm.predict(X_validation)

# Print accuracy, then the confusion matrix, then the per-class report.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

完整的python代码:

# -*- coding: utf-8 -*-
"""
Created on Mon Jun  4 09:42:41 2018

@author: np
"""

# Third-party imports for loading, plotting, model selection and evaluation.
# scatter_matrix is imported from pandas.plotting: the top-level
# `pandas.scatter_matrix` alias was deprecated and later removed, so
# `from pandas import scatter_matrix` fails on current pandas releases.
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Load the CSV file; it has no header row, so column labels are supplied here.
# The measured flower parts are sepals and petals ("separ" was a typo).
filename = 'iris.data.csv'
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(filename, names=names)

# Split the data set: 80% for training, 20% held out for validation.
validation_size = 0.2
seed = 7

# The first four columns are the features; the fifth column is the label.
array = dataset.values
X, Y = array[:, 0:4], array[:, 4]

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

# Candidate algorithms to spot-check, keyed by a short display name.
# All estimators are created with their default parameters.
models = {
    'LR': LogisticRegression(),
    'LDA': LinearDiscriminantAnalysis(),
    'KNN': KNeighborsClassifier(),
    'CART': DecisionTreeClassifier(),
    'NB': GaussianNB(),
    'SVM': SVC(),
}

# Evaluate every candidate with 10-fold cross-validation on the training set.
# random_state only has an effect when shuffle=True; recent scikit-learn
# releases raise a ValueError if it is set while shuffle is left at False.
# The splitter does not depend on the model, so it is built once and reused,
# which also gives every model the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = []
for key, model in models.items():
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    # Report the mean accuracy and its standard deviation across the folds.
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))


# Box plot comparing the cross-validation accuracy distribution per algorithm.
fig = pyplot.figure()
fig.suptitle("Algorithm Comparison")  # title spelling fixed (was "Comparision")
ax = fig.add_subplot(111)
ax.boxplot(results)
# `results` was filled while iterating over `models`, so the dictionary's key
# order matches the box order; pass the labels as an explicit list.
ax.set_xticklabels(list(models.keys()))
pyplot.show()

# Fit the chosen model on the training set and score it on the held-out
# validation set.
svm = SVC()
svm.fit(X_train, Y_train)
predictions = svm.predict(X_validation)

# Print accuracy, then the confusion matrix, then the per-class report.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

猜你喜欢

转载自blog.csdn.net/zhenaoxi1077/article/details/80564572