本文使用logistic regression算法,对Breast Cancer Wisconsin (Diagnostic) Database数据集中malignant(恶性)与benign(良性)两种病情进行分类预测。步骤包括数据预处理、特征选择、特征对比、建模与评价几个步骤。
读取数据集
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
import os
os.chdir('/Users/zhaohaibo/Desktop')
# Load the breast-cancer data set and export the feature table to Excel.
data = load_breast_cancer()
X = data.data    # feature matrix
y = data.target  # 0 = malignant, 1 = benign (see the label mapping below)
# Save the features as a spreadsheet.  The context manager closes/saves the
# writer even if to_excel raises; the deprecated writer.save() is no longer
# needed, and sheet_name is passed by keyword (the positional form is
# deprecated in modern pandas).
df = pd.DataFrame(data.data, columns=data.feature_names)
with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
对标签重新编码
- 因为数据集已经将编码工作做好了,这步可以省略。
from sklearn.preprocessing import LabelEncoder

# Re-encode the labels as consecutive integers.  The sklearn targets are
# already 0/1 (as the text above notes), so this is a no-op here, but it
# keeps the pipeline general for string labels.
y = LabelEncoder().fit_transform(y)
特征选择
- 使用RFECV进行特征选择,保留排名前三的数据特征时交叉验证分数最高,
- 对特征进行排名:worst radius、worst texture、worst concave points。保留这三个特征。
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

# Recursive feature elimination with 2-fold stratified cross-validation,
# using a decision tree as the base estimator.
dt = DecisionTreeClassifier()
rfecv = RFECV(estimator=dt, step=1, cv=StratifiedKFold(2), scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)
# BUG FIX: the two labels below were swapped in the original ("nums"
# printed the names and "names" printed the ranks).
print("Ranking of features nums: %s" % rfecv.ranking_)
# NOTE(review): indexing feature_names by the rank values looks suspicious;
# data.feature_names[feature_list] would give the *selected* names — confirm
# which output was intended.
print("Ranking of features names: %s" % data.feature_names[rfecv.ranking_])

# Cross-validation score as a function of the number of selected features.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.savefig("feature.jpg")
plt.show()

# Keep only the features ranked 1 (the selected subset).
feature_list = [i for i, rank in enumerate(rfecv.ranking_) if rank == 1]
X = X[:, feature_list]
特征分析
- 使用seaborn.pairplot()
import seaborn as sns

# Pairwise scatter/KDE plot of the three selected features, coloured by
# the diagnosis label (target 0 -> "malignant", 1 -> "benign").
XX = pd.DataFrame(X, columns=data.feature_names[feature_list])
XX['label'] = ["malignant" if label == 0 else "benign" for label in y]
sns.pairplot(
    XX,
    vars=["worst radius", "worst texture", "worst concave points"],
    hue="label",
    palette="husl",
    markers=["o", "x"],
    diag_kind="kde",
)
plt.savefig("duibi.jpg")
切分数据集
- 使用sklearn.model_selection.train_test_split
from sklearn.model_selection import train_test_split
# Hold out 33% of the samples as the test set.
# NOTE(review): with shuffle=False the random_state has no effect and the
# test set is simply the last 33% of the rows — confirm this
# ordering-dependent split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=666, shuffle = False)# shuffle defaults to True
数据归一化处理
- 使用最大最小值归一化
from sklearn import preprocessing

# Min-max scale both sets.
# BUG FIX: the scaler must be fitted on the training data only and then
# applied to the test data with transform(); the original called
# fit_transform() on X_test too, scaling the test set with its *own*
# min/max (train/test inconsistency and information leakage).
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)
网格搜索寻参(注意:下面的参数网格有几处错误——正则化参数应为大写 'C' 而非 'c','balenced' 应为 'balanced',class_weight 应使用 None 对象而非字符串 'None',且 liblinear 求解器不支持 multi_class='multinomial')
- (模型使用logistic regression)
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Grid search over LogisticRegression hyper-parameters.
# BUG FIX (this is the "unsolved problem" the original noted):
#   * the regularisation parameter is 'C' (capital), not 'c';
#   * 'balenced' is a typo for 'balanced', and None must be the object
#     None, not the string 'None';
#   * the 'liblinear' solver does not support multi_class='multinomial',
#     so it gets its own sub-grid restricted to 'ovr'.
param_grid = [
    {
        'tol': [0.00001, 0.0001, 0.001, 0.01, 0.1],
        'multi_class': ['multinomial', 'ovr'],
        'C': [0.01, 0.1, 1, 10],
        'class_weight': ['balanced', None],
        'solver': ['sag', 'saga', 'newton-cg'],
        'max_iter': [10, 100, 1000, 10000, 100000],
    },
    {
        'tol': [0.00001, 0.0001, 0.001, 0.01, 0.1],
        'multi_class': ['ovr'],
        'C': [0.01, 0.1, 1, 10],
        'class_weight': ['balanced', None],
        'solver': ['liblinear'],
        'max_iter': [10, 100, 1000, 10000, 100000],
    },
]
grid_search = GridSearchCV(LogisticRegression(), param_grid)
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)
print(grid_search.best_score_)
使用logistic regression建模
from sklearn.linear_model import LogisticRegression

# Fit the final logistic-regression model with explicit hyper-parameters
# and report accuracy on the held-out test split.
clf = LogisticRegression(penalty='l2', dual=False, tol=0.001,
                         C=1.0, fit_intercept=True, intercept_scaling=1,
                         class_weight=None, random_state=0, solver='sag',
                         max_iter=100, multi_class='multinomial', verbose=1,
                         warm_start=False, n_jobs=-1)
clf.fit(X_train, y_train)
# (The original called clf.predict(X_test) here and discarded the result;
# that dead call is removed — predictions are computed where they are used.)
score = clf.score(X_test, y_test)
print("The accuracy of Logistic Regression classifier:", score)
绘制混淆矩阵
def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Render a confusion matrix as a matplotlib image and save it.

    Parameters
    ----------
    cm : array of shape (n_classes, n_classes)
        Confusion matrix, e.g. from sklearn.metrics.confusion_matrix.
    classes : sequence of str
        Tick labels for both axes.
    normalize : bool
        If True, each row is divided by its sum before plotting.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colour map for the image.
    """
    # BUG FIX: normalisation must happen *before* imshow; the original
    # normalised after drawing, so the image showed raw counts while the
    # threshold and cell text used the normalised matrix.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print("Confusion matrix, without normalization")
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Write each count into its cell; white text on dark cells.
    thresh = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig("matrix.jpg")
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt

# Predict on the held-out set and draw the confusion matrix.
prediction = clf.predict(X_test)
cm_plot_labels = ['malignant', 'benign']
plot_confusion_matrix(confusion_matrix(y_test, prediction),
                      cm_plot_labels, title='Confusion Matrix')
模型评价
# accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Overall accuracy on the test split.
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print("The accuracy of Logistic Regression classifier:", accuracy)
# precision & recall & f1-score, per class.
print(classification_report(y_true=y_test, y_pred=prediction))
类别 | Precision | Recall | F1-score | Support |
---|---|---|---|---|
Healthy | 0.93 | 0.98 | 0.95 | 43 |
Cancer | 0.99 | 0.98 | 0.99 | 145 |
Avg/total | 0.98 | 0.98 | 0.98 | 188 |
绘制ROC曲线
from sklearn.metrics import roc_curve, auc

# BUG FIX: an ROC curve needs a continuous score, not hard 0/1 labels —
# with class predictions the "curve" collapses to a single operating
# point.  Use the predicted probability of the positive class instead
# (clf and X_test are in scope from the modelling step above).
y_score = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1, label='ROC(area = %0.2f)' % (roc_auc))
plt.xlabel("FPR (False Positive Rate)")
plt.ylabel("TPR (True Positive Rate)")
plt.title("Receiver Operating Characteristic, ROC(AUC = %0.2f)" % (roc_auc))
plt.legend(loc="lower right")  # the label above is invisible without a legend
plt.savefig("ROC.jpg")
plt.show()
全部代码:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 9 23:39:32 2018
@author: zhaohaibo
"""
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
import os
os.chdir('/Users/zhaohaibo/Desktop')
class Cancer(object):
    """End-to-end breast-cancer classification pipeline.

    Pipeline (see ``main``): label encoding -> RFECV feature selection ->
    pairplot -> train/test split -> min-max scaling -> logistic regression
    -> confusion matrix -> accuracy / report / ROC.

    NOTE(review): several methods read the module-level ``data`` bunch
    (created in the ``__main__`` guard) for feature names — the class is
    not self-contained; confirm before reusing it elsewhere.
    """

    def Encode(self, y):
        """Re-encode labels as consecutive integers (no-op for 0/1 targets)."""
        encoder = LabelEncoder()
        return encoder.fit_transform(y)

    def RFECV(self, X, y):
        """Select features with recursive feature elimination + 2-fold CV.

        Returns the reduced matrix and the list of selected column indices.
        """
        dt = DecisionTreeClassifier()
        rfecv = RFECV(estimator=dt, step=1, cv=StratifiedKFold(2),
                      scoring='accuracy')
        rfecv.fit(X, y)
        print("Optimal number of features : %d" % rfecv.n_features_)
        # BUG FIX: the two labels below were swapped in the original.
        print("Ranking of features nums: %s" % rfecv.ranking_)
        # NOTE(review): indexing feature_names by rank values looks odd;
        # feature_names[feature_list] would give the *selected* names.
        print("Ranking of features names: %s" % data.feature_names[rfecv.ranking_])
        # CV score vs. number of selected features.
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score (nb of correct classifications)")
        plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
        plt.savefig("feature.jpg")
        plt.show()
        # Indices of the features ranked 1 (the kept subset).
        feature_list = [i for i, rank in enumerate(rfecv.ranking_) if rank == 1]
        return X[:, feature_list], feature_list

    def Seaborn(self, X, y, feature_list):
        """Pairplot of the selected features coloured by diagnosis label."""
        XX = pd.DataFrame(X, columns=data.feature_names[feature_list])
        XX['label'] = ["malignant" if v == 0 else "benign" for v in y]
        import seaborn as sns
        sns.pairplot(XX, vars=["worst radius", "worst texture", "worst concave points"],
                     hue="label", palette="husl",
                     markers=["o", "x"], diag_kind="kde")
        plt.savefig("duibi.jpg")

    def Train_Test_Split(self, X, y):
        """Hold out 33% of the rows as the test set.

        NOTE(review): shuffle=False makes random_state irrelevant — the
        test set is simply the last 33% of the rows (shuffle defaults to
        True); confirm this is intended.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=666, shuffle=False)
        return X_train, X_test, y_train, y_test

    def Scaler(self, X_train, X_test):
        """Min-max scale train and test sets.

        BUG FIX: fit on the training data only and transform() the test
        data; the original fit_transform()'d X_test too (leakage and
        inconsistent scaling).
        """
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train = min_max_scaler.fit_transform(X_train)
        X_test = min_max_scaler.transform(X_test)
        return X_train, X_test

    def GridSearch(self, X_train, y_train):
        """Grid-search LogisticRegression hyper-parameters.

        BUG FIX: the original signature was missing ``self`` and used the
        undefined globals X_train/y_train; the grid used lowercase 'c',
        the typo 'balenced', the string 'None' instead of None, and
        combined 'liblinear' with multi_class='multinomial' (unsupported),
        so liblinear gets its own 'ovr'-only sub-grid.
        """
        param_grid = [
            {
                'tol': [0.00001, 0.0001, 0.001, 0.01, 0.1],
                'multi_class': ['multinomial', 'ovr'],
                'C': [0.01, 0.1, 1, 10],
                'class_weight': ['balanced', None],
                'solver': ['sag', 'saga', 'newton-cg'],
                'max_iter': [10, 100, 1000, 10000, 100000],
            },
            {
                'tol': [0.00001, 0.0001, 0.001, 0.01, 0.1],
                'multi_class': ['ovr'],
                'C': [0.01, 0.1, 1, 10],
                'class_weight': ['balanced', None],
                'solver': ['liblinear'],
                'max_iter': [10, 100, 1000, 10000, 100000],
            },
        ]
        grid_search = GridSearchCV(LogisticRegression(), param_grid)
        grid_search.fit(X_train, y_train)
        print(grid_search.best_estimator_)
        print(grid_search.best_score_)

    def Model(self, X_train, y_train, X_test, y_test):
        """Fit the final model, print test accuracy and return the classifier."""
        clf = LogisticRegression(penalty='l2', dual=False, tol=0.001,
                                 C=1.0, fit_intercept=True, intercept_scaling=1,
                                 class_weight=None, random_state=0, solver='sag',
                                 max_iter=100, multi_class='multinomial', verbose=1,
                                 warm_start=False, n_jobs=-1)
        clf.fit(X_train, y_train)
        # (dead clf.predict(X_test) call removed; main() predicts itself)
        score = clf.score(X_test, y_test)
        print("The accuracy of Logistic Regression classifier:", score)
        return clf

    def plot_confusion_matrix(self, cm, classes, normalize=False,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """Render ``cm`` as an image and save it as matrix.jpg.

        BUG FIX: normalisation now happens *before* imshow, so the drawn
        image, the threshold and the cell text all use the same matrix
        (the original normalised after drawing).
        """
        plt.figure()
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print("Confusion matrix, without normalization")
        print(cm)
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        # Write each count into its cell; white text on dark cells.
        thresh = cm.max() / 2
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, cm[i, j],
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.savefig("matrix.jpg")

    def Matrix(self, y_test, prediction):
        """Compute and plot the confusion matrix for the test predictions."""
        cm = confusion_matrix(y_test, prediction)
        cm_plot_labels = ['malignant', 'benign']
        self.plot_confusion_matrix(cm, cm_plot_labels, title='Confusion Matrix')

    def Metrics(self, y_test, prediction):
        """Print accuracy and classification report; plot the ROC curve."""
        accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
        print("The accuracy of Logistic Regression classifier:", accuracy)
        # precision & recall & f1-score
        print(classification_report(y_true=y_test, y_pred=prediction))
        # ROC curve.  NOTE(review): built from hard 0/1 predictions, so it
        # is a single operating point; probability scores would give a
        # proper curve — confirm whether that is wanted here.
        fpr, tpr, thresholds = roc_curve(y_test, prediction)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr, tpr, lw=1, label='ROC(area = %0.2f)' % (roc_auc))
        plt.xlabel("FPR (False Positive Rate)")
        plt.ylabel("TPR (True Positive Rate)")
        plt.title("Receiver Operating Characteristic, ROC(AUC = %0.2f)" % (roc_auc))
        plt.savefig("ROC.jpg")
        plt.show()

    def SaveExl(self, data):
        """Export the raw feature table to output.xlsx.

        BUG FIX: the original signature lacked ``self`` (a call through the
        instance would have bound the instance to ``data``); the writer is
        now closed via a context manager instead of the deprecated save().
        """
        df = pd.DataFrame(data.data, columns=data.feature_names)
        with pd.ExcelWriter('output.xlsx') as writer:
            df.to_excel(writer, sheet_name='Sheet1')

    def main(self, X, y):
        """Run the whole pipeline on a feature matrix X and labels y."""
        y = self.Encode(y)
        X, feature_list = self.RFECV(X, y)
        self.Seaborn(X, y, feature_list)
        X_train, X_test, y_train, y_test = self.Train_Test_Split(X, y)
        X_train, X_test = self.Scaler(X_train, X_test)
        # self.GridSearch(X_train, y_train)
        clf = self.Model(X_train, y_train, X_test, y_test)
        prediction = clf.predict(X_test)
        self.Matrix(y_test, prediction)
        self.Metrics(y_test, prediction)
        # self.SaveExl(data)
if __name__ == '__main__':
    # Load the data set (module-level, since the class reads `data` for
    # feature names) and run the full pipeline.
    data = load_breast_cancer()
    X = data.data
    y = data.target
    Cancer().main(X, y)