A demo of K-nearest neighbors (KNN), support vector machine (SVM), random forest (RF), and logistic regression (LR)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris
# Step 1: load the dataset
iris = load_iris()
# Step 2: preprocessing -- convert to a DataFrame and drop missing values
iris_d = pd.DataFrame(iris['data'], columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'])  # wrap the feature array in a DataFrame
iris_d['Species'] = iris.target  # attach the target labels
iris_d.dropna(inplace=True)  # drop rows with missing values (the iris data has none, so this is a no-op)
iris_d
     Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Species
0             5.1          3.5           1.4          0.2        0
1             4.9          3.0           1.4          0.2        0
2             4.7          3.2           1.3          0.2        0
3             4.6          3.1           1.5          0.2        0
4             5.0          3.6           1.4          0.2        0
..            ...          ...           ...          ...      ...
145           6.7          3.0           5.2          2.3        2
146           6.3          2.5           5.0          1.9        2
147           6.5          3.0           5.2          2.0        2
148           6.2          3.4           5.4          2.3        2
149           5.9          3.0           5.1          1.8        2

[150 rows x 5 columns]
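As an aside, newer scikit-learn releases (0.23 and later) can return the iris data as a DataFrame directly, which saves the manual wrapping above; a minimal sketch, assuming such a version is installed:

from sklearn.datasets import load_iris

# load_iris(as_frame=True) returns a Bunch whose .frame attribute is a
# DataFrame holding the four features plus a 'target' column
iris_frame = load_iris(as_frame=True).frame
print(iris_frame.head())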

# Step 3: exploratory analysis and visualization
iris_d.groupby('Species').size()  # number of samples per class
Species
0    50
1    50
2    50
dtype: int64
iris_d.describe()  # descriptive statistics of the dataset
       Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
count    150.000000   150.000000    150.000000   150.000000  150.000000
mean       5.843333     3.057333      3.758000     1.199333    1.000000
std        0.828066     0.435866      1.765298     0.762238    0.819232
min        4.300000     2.000000      1.000000     0.100000    0.000000
25%        5.100000     2.800000      1.600000     0.300000    0.000000
50%        5.800000     3.000000      4.350000     1.300000    1.000000
75%        6.400000     3.300000      5.100000     1.800000    2.000000
max        7.900000     4.400000      6.900000     2.500000    2.000000
iris_d.plot(kind='box')  # box plot of each variable, showing how spread out the data is
plt.show()

[Figure output_5_0.png: box plots of the four features and the species label]

iris_d.hist()  # histograms of the dataset, showing how each variable is distributed
plt.show()

[Figure output_6_0.png: histograms of each variable]

scatter_matrix(iris_d)  # pairwise relationships between the variables, both linear and nonlinear
plt.show()

[Figure output_7_0.png: scatter matrix of the four features]
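To put numbers behind the scatter matrix, the pairwise Pearson correlations can be checked directly from the DataFrame; a small illustrative addition, not part of the original post:

# Pearson correlation between the four feature columns
print(iris_d.drop(columns='Species').corr())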

# Step 4: feature engineering
array = iris_d.values
X = array[:, 0:4]   # the four measurement columns are the features
Y = array[:, 4]     # the species column is the label
# Split the data: 80% for training, 20% for testing
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.2, random_state=7)
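The three classes are perfectly balanced here, so a plain split works, but stratifying keeps the class proportions identical in both halves; a minimal sketch using train_test_split's standard stratify argument (an optional variant, not what the results below were produced with):

# Stratified variant: preserves the 50/50/50 class ratio in train and test sets
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=7, stratify=Y)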
# Step 5: model training and evaluation
# K-nearest neighbors (KNN)
model = KNeighborsClassifier()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Evaluate: accuracy on the held-out test set
print(accuracy_score(y_test, predictions))
0.9
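The original comments speak of cross-validation, but the code above only scores a single 80/20 split. A minimal sketch of actual k-fold cross-validation on the training data, using scikit-learn's cross_val_score (an addition for illustration, not part of the original run):

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of KNN on the training portion
cv_scores = cross_val_score(KNeighborsClassifier(), x_train, y_train, cv=10)
print(cv_scores.mean(), cv_scores.std())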
# Support vector machine (SVM)
model = SVC()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Evaluate: accuracy on the held-out test set
print(accuracy_score(y_test, predictions))
0.8666666666666667
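SVMs (like KNN) are sensitive to feature scale, so standardizing the inputs inside a pipeline is a common refinement; a sketch using scikit-learn's Pipeline and StandardScaler (illustrative, no claim is made here about the accuracy it yields on this split):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then fit the SVM on the scaled training data
svm_pipe = make_pipeline(StandardScaler(), SVC())
svm_pipe.fit(x_train, y_train)
print(accuracy_score(y_test, svm_pipe.predict(x_test)))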
# Random forest (RF)
model = RandomForestClassifier()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Evaluate: accuracy on the held-out test set
print(accuracy_score(y_test, predictions))
0.8666666666666667
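A useful by-product of the random forest is a per-feature importance score; a minimal sketch of reading it off a fitted model (illustrative, the exact numbers depend on the random state):

# feature_importances_ is available after fit(); pair it with the column names
rf = RandomForestClassifier(random_state=7).fit(x_train, y_train)
for name, imp in zip(['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'],
                     rf.feature_importances_):
    print(name, round(imp, 3))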
# Logistic regression (LR)
model = LogisticRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Evaluate: accuracy on the held-out test set
print(accuracy_score(y_test, predictions))
0.8666666666666667


d:\program files\python3.7\lib\site-packages\sklearn\linear_model\_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
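The warning above means the lbfgs solver hit its default iteration limit before converging. As the message itself suggests, either raising max_iter or scaling the features resolves it; a minimal sketch of both fixes (an illustrative addition, not from the original post):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Option 1: give the lbfgs solver more iterations
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Option 2: standardize the features, which usually lets lbfgs converge quickly
lr_pipe = make_pipeline(StandardScaler(), LogisticRegression())
lr_pipe.fit(x_train, y_train)
print(accuracy_score(y_test, lr_pipe.predict(x_test)))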

Reposted from blog.csdn.net/qq_42830971/article/details/125825869