Logistic Regression Classification on the Iris Dataset

Step1: Import the required libraries

# Basic data-handling libraries
import numpy as np
import pandas as pd

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Iris dataset
from sklearn.datasets import load_iris

Step2: Data reading/loading

data = load_iris()  # load the Iris dataset
iris_target = data.target
iris_features = pd.DataFrame(data=data.data, columns=data.feature_names)  # convert the feature data to a DataFrame

Step3: Simple inspection of the data

# Use info() to view overall information about the dataset
iris_features.info()

# Use head() or tail() to take a quick look at the first or last rows
iris_features.head()

# View the label array
iris_target

# Count the number of samples in each class
pd.Series(iris_target).value_counts()

## Statistical description of the features
iris_features.describe()

Step4: Visualize the data

## Merge the labels and the features into one DataFrame
iris_all = iris_features.copy()  # work on a copy so the original data is not modified
iris_all["target"] = iris_target

## Pairwise scatter plots of the features, colored by label
sns.pairplot(data=iris_all, diag_kind='hist', hue = 'target')
plt.show()

# Box plots of each feature, grouped by class
for col in iris_features.columns:
    sns.boxplot(x='target',y=col, saturation=0.5, palette='pastel',data=iris_all)
    plt.title(col)
    plt.show()
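
Violin plots are an optional complement to the box plots, since they also show the shape of each feature's distribution per class. Below is a minimal sketch reusing the iris_all DataFrame from above (the palette choice is arbitrary and not part of the original post):

# Violin plots: distribution of each feature for each class
for col in iris_features.columns:
    sns.violinplot(x='target', y=col, data=iris_all, palette='pastel')
    plt.title(col)
    plt.show()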

Step5: Train and predict with logistic regression on the binary classification task

## To evaluate the model properly, split the data into a training set and a test set: train on the training set, evaluate on the test set
from sklearn.model_selection import train_test_split

## Select only the samples of classes 0 and 1 (the first 100 rows; the samples are ordered by class, so class 2 is excluded)
iris_features_part = iris_features.iloc[:100]
iris_target_part = iris_target[:100]

## Use 20% of the data as the test set
x_train, x_test, y_train, y_test = train_test_split(iris_features_part, iris_target_part,test_size=0.2, random_state=2020)

## Import the logistic regression model from sklearn
from sklearn.linear_model import LogisticRegression

## Define the logistic regression model
clf = LogisticRegression(random_state=0, solver='lbfgs')

## Train the logistic regression model on the training set
clf.fit(x_train, y_train)

print("逻辑回归的权重为:",clf.coef_)
print("逻辑回归的截距为:",clf.intercept_)

train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)

from sklearn import metrics

## Evaluate the model with accuracy
print("Accuracy on the training set:", metrics.accuracy_score(y_true=y_train, y_pred=train_predict))
print("Accuracy on the test set:", metrics.accuracy_score(y_true=y_test, y_pred=test_predict))

## Compute the confusion matrix (y_true first, then y_pred)
confusion_matrix_result = metrics.confusion_matrix(y_test, test_predict)
print('The confusion matrix is:\n', confusion_matrix_result)
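
The confusion matrix is easier to read as a heatmap; a small sketch with seaborn (the figure size and colormap are arbitrary choices):

# Visualize the confusion matrix as a heatmap (rows: true labels, columns: predicted labels)
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()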

Step6: Train and predict with logistic regression on the multiclass task

## Use 20% of the data as the test set
x_train, x_test, y_train, y_test = train_test_split(iris_features,iris_target,test_size=0.2, random_state = 2020)

## Define the logistic regression model
clf = LogisticRegression(random_state=0,solver='lbfgs')

# Train the logistic regression model on the training set
clf.fit(x_train,y_train)

print("逻辑回归的权重为:\n",clf.coef_)
print("逻辑回归的截距为:\n",clf.intercept_)

train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)

## Because logistic regression is a probabilistic model, predict_proba can be used to obtain the predicted class probabilities
train_predict_proba = clf.predict_proba(x_train)
test_predict_proba = clf.predict_proba(x_test)
## The first column is the predicted probability of class 0, the second column of class 1, and the third column of class 2.
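
In the multiclass setting, clf.coef_ holds one row of weights per class and clf.intercept_ one bias per class; with the lbfgs solver, recent scikit-learn versions fit a multinomial model by default, so the probabilities above are a softmax over the three per-class linear scores. A minimal sketch of that relationship plus two sanity checks (assuming the multinomial default; not part of the original post):

## Sketch: class probabilities as a softmax over per-class linear scores
scores = x_test.values @ clf.coef_.T + clf.intercept_            # shape (n_samples, 3)
exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))  # subtract the max for numerical stability
softmax_probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
print(np.allclose(softmax_probs, test_predict_proba))            # expected: True for the multinomial model
print(np.allclose(test_predict_proba.sum(axis=1), 1))            # each row sums to 1
print(np.array_equal(np.argmax(test_predict_proba, axis=1), test_predict))  # argmax reproduces predict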

## Evaluate the model with accuracy
print("Accuracy on the training set:", metrics.accuracy_score(y_true=y_train, y_pred=train_predict))
print("Accuracy on the test set:", metrics.accuracy_score(y_true=y_test, y_pred=test_predict))

## Compute the confusion matrix (y_true first, then y_pred)
confusion_matrix_result = metrics.confusion_matrix(y_test, test_predict)
print('The confusion matrix is:\n', confusion_matrix_result)
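
Beyond overall accuracy, per-class precision, recall, and F1 can be inspected with classification_report; a minimal sketch (the class names come from data.target_names loaded in Step2):

## Sketch: per-class precision/recall/F1 on the multiclass test set
print(metrics.classification_report(y_test, test_predict, target_names=data.target_names))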

Origin blog.csdn.net/BigCabbageFy/article/details/108127843