# ML-LogisticRegression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.feature_selection import SelectFromModel,RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Load the breast-cancer diagnostic dataset bundled with scikit-learn.
dataset = datasets.load_breast_cancer()
# Split the bunch into the feature matrix and the binary target vector.
X = dataset.data
y = dataset.target

def preProcessing(X):
    """Standardize every feature column to zero mean and unit variance.

    Fits a StandardScaler on X and returns the transformed matrix.
    """
    return preprocessing.StandardScaler().fit_transform(X)

# Standardize all features once; the functions below slice columns from this matrix.
inputVec = preProcessing(X)

def selectFeatureSubset(inputVec,y):
    """Print two candidate feature subsets for comparison.

    1. Tree-based selection: ExtraTrees importances fed to SelectFromModel
       (keeps features whose importance exceeds the mean importance).
    2. Recursive feature elimination down to 10 features with a
       LogisticRegression estimator.

    Parameters:
        inputVec: standardized feature matrix, shape (n_samples, n_features).
        y: target labels, shape (n_samples,).
    """
    clf = ExtraTreesClassifier(n_estimators=15,criterion='gini',random_state=1)
    clf.fit(inputVec,y)
    print('feature score:\n',clf.feature_importances_)
    selector = SelectFromModel(clf,prefit=True)
    # get_support(indices=True) returns the selected column indices directly.
    # The original recovered them by comparing raw feature values of row 0
    # against the transformed row, which breaks on duplicate values and
    # relies on exact float equality.
    print('feature index:',list(selector.get_support(indices=True)))

    # n_features_to_select must be passed by keyword: the positional form
    # was deprecated and later removed from scikit-learn's RFE signature.
    rfe = RFE(LogisticRegression(),n_features_to_select=10)
    rfe.fit(inputVec,y)
    print('feature index:',list(np.arange(inputVec.shape[1])[rfe.support_]))

# Report the candidate feature subsets; the column list used below was chosen from this output.
selectFeatureSubset(inputVec,y) 

def selectModelBestParams(inputVec,y,feature_idx=(7,10,13,15,20,21,22,23,26,27)):
    """Grid-search LogisticRegression's C and penalty on a feature subset.

    Parameters:
        inputVec: standardized feature matrix.
        y: target labels.
        feature_idx: column indices to keep (defaults to the subset chosen
            by selectFeatureSubset), so the function is reusable with other
            subsets without editing the body.

    Prints the best hyper-parameter combination found by cross-validation.
    """
    x = inputVec[:,list(feature_idx)]
    params = {
            'C':[0.01,0.1,1.0,10,100,1000],
            'penalty':['l1','l2']
        }
    # solver='liblinear' supports both l1 and l2 penalties; the default
    # lbfgs solver rejects penalty='l1', which would make half of the
    # parameter grid raise instead of being scored.
    model = GridSearchCV(LogisticRegression(solver='liblinear'),params)
    model.fit(x,y)
    print('best params:\n',model.best_params_)

# Search hyper-parameters on the 10-feature subset chosen above.
selectModelBestParams(inputVec,y)

def _printMetrics(y_true,y_pred):
    """Print accuracy, per-class report and confusion matrix for one split."""
    print('accuracy:',accuracy_score(y_true,y_pred))
    # target_names come from the module-level dataset bunch loaded above.
    print(classification_report(y_true,y_pred,target_names=dataset.target_names))
    print(pd.DataFrame(confusion_matrix(y_true,y_pred)))

def _plotResiduals(y_train,train_pred,y_test,test_pred):
    """Scatter-plot prediction-minus-actual per sample for both splits and save to lr.png."""
    fig,ax = plt.subplots(1,2,figsize=(20,4))
    # Labeling the artist directly (label=...) is more robust than
    # legend(['...']), which silently attaches labels to unlabeled artists.
    ax[0].scatter(np.arange(len(train_pred)),y_train-train_pred,marker='.',color='lime',s=50,label='predict-actual')
    ax[0].set_title('train set',fontsize=20,color='black',fontweight='bold')
    ax[0].grid(linestyle='--',color='gray')
    ax[0].legend(fontsize=16,facecolor='silver')
    ax[1].scatter(np.arange(len(test_pred)),y_test-test_pred,marker='.',color='blue',s=30,label='predict-actual')
    ax[1].set_title('test set',fontsize=20,color='black',fontweight='bold')
    ax[1].grid(linestyle='--',color='gray')
    ax[1].legend(fontsize=16,facecolor='silver',loc='center right')
    fig.savefig('lr.png')
    plt.show()

def trainModel(inputVec,y):
    """Train a LogisticRegression on the selected features and evaluate it.

    Uses an 80/20 train/test split (random_state=1 for reproducibility),
    prints metrics for both splits, and plots per-sample residuals.

    Parameters:
        inputVec: standardized feature matrix.
        y: target labels.
    """
    x = inputVec[:,[7,10,13,15,20,21,22,23,26,27]]
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)
    # C=1.0 / penalty='l2' matches the grid-search result reported above.
    clf = LogisticRegression(C=1.0,penalty='l2')
    clf.fit(x_train,y_train)

    train_pred = clf.predict(x_train)
    print('训练集:\n')  # "training set" header (kept verbatim: runtime output)
    # Reference figures observed for this split (random_state=1):
    #   accuracy = (162+283)/(162+283+3) = 0.978
    #   precision: class0 162/(162+2)=0.99, class1 283/(283+8)=0.97
    #   recall:    class0 162/(162+8)=0.95, class1 283/(283+2)=0.99
    #   f1-score:  class0 2*162/(2*162+2+8)=0.97, class1 2*283/(2*283+2+8)=0.98
    _printMetrics(y_train,train_pred)

    test_pred = clf.predict(x_test)
    print('测试集:\n')  # "test set" header (kept verbatim: runtime output)
    # Reference figure observed: accuracy = (38+72)/(38+72+4) = 0.96
    _printMetrics(y_test,test_pred)

    _plotResiduals(y_train,train_pred,y_test,test_pred)

# Fit, evaluate and plot the final model.
trainModel(inputVec,y)



# 猜你喜欢 ("you may also like" — related-posts footer left over from the scraped blog page)
# Reposted from blog.csdn.net/qq_42394743/article/details/80990945
# Tag: ML