第一个机器学习项目(鸢尾花分类问题)

鸢尾花分类

1、下载并安装 Python 机器学习所需的各个类库

2、 导入数据,并通过描述性统计、可视化等手段对数据进行分析

3、 创建六个模型,并从中选择准确度最高的模型

鸢尾花数据集特点:

1、所有的特征数据都是数字

2、这是一个分类问题,可以方便地通过有监督学习算法来解决问题

3、所有的特征采用相同的单位,不需要进行尺度的转换

按照下面的步骤实现这个项目:

(1) 导入数据

(2) 概述数据

(3) 数据可视化

(4) 评估算法

(5) 实施预测

#导入类库

from pandas import read_csv

from pandas.plotting import scatter_matrix

from matplotlib import pyplot

from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score

from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

# Load the data set. Column names are passed explicitly, so pandas treats
# the first line of the file as data rather than as a header row.
filename = 'iris.data.csv'
names = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
    'class',
]
dataset = read_csv(filename, names=names)

# Show the dtype pandas inferred for each column.
print(dataset.dtypes)

# Dimensions of the data set: (number of rows, number of columns).
print('数据维度:%s,%s' % dataset.shape)

# Peek at the first ten records.
print(dataset.head(n=10))

# Summary statistics as reported by DataFrame.describe().
print(dataset.describe())

# Class distribution: number of records per class label.
print(dataset.groupby('class').size())

# Box-and-whisker plots, one subplot per column on a 2x2 grid with
# independent axes.
dataset.plot(
    kind='box',
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
)
pyplot.show()

# Histograms of the columns.
dataset.hist()
pyplot.show()

# Scatter-plot matrix of the columns.
scatter_matrix(dataset)
pyplot.show()

# Split the data set: 80% of the rows for training, 20% held out for
# validation. The split is seeded so it is reproducible.
validation_size = 0.2
seed = 7

# The first four columns are the features, the fifth is the class label.
array = dataset.values
X, y = array[:, :4], array[:, 4]

X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=validation_size,
    random_state=seed,
)

# Evaluation approach: 10-fold cross-validation on the training data,
# i.e. nine folds train the model and the remaining fold scores it.

# Candidate algorithms, keyed by a short label. Every estimator is created
# with its library-default hyper-parameters.
models = {
    'LR': LogisticRegression(),
    'LDA': LinearDiscriminantAnalysis(),
    'KNN': KNeighborsClassifier(),
    'CART': DecisionTreeClassifier(),
    'NB': GaussianNB(),
    'SVM': SVC(),
}

# Evaluate every candidate with 10-fold cross-validation on the training
# set and report the mean accuracy and its standard deviation per model.
#
# The splitter is built once because it does not depend on the model.
# shuffle=True is required for random_state to take effect: recent
# scikit-learn releases raise ValueError when random_state is set while
# shuffle is left at its default of False. With an integer seed the same
# folds are produced on every call, so all models are scored on identical
# splits.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

results = []
for key, model in models.items():
    cv_results = cross_val_score(
        model, X_train, y_train, cv=kfold, scoring='accuracy'
    )
    results.append(cv_results)
    # Report inside the loop so every model is printed, not only the last.
    print('%s:%f(%f)' % (key, cv_results.mean(), cv_results.std()))

# Box plot comparing the cross-validation accuracy of each algorithm.
# (The original author observed that SVM achieved the highest accuracy;
# the exact ranking may differ between library versions.)
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')

# A single subplot holds one box per model, labelled with the model keys
# in the same order the scores were collected.
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(models.keys())

pyplot.show()

# Fit an SVM on the full training set and score it on the held-out
# validation set.
svm = SVC().fit(X_train, y_train)
predictions = svm.predict(X_validation)

# Print, in order: overall accuracy, the confusion matrix, and the
# per-class classification report.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_validation, predictions))

猜你喜欢

转载自blog.csdn.net/zhangyuee19501107/article/details/81051676