LogisticRegression 预测分类

版权声明:本文为博主原创文章,转载请标明出处。 https://blog.csdn.net/chuan403082010/article/details/85049614
import numpy as np
import pandas as pd
path = "breast-cancer-wisconsin.data"
names = [
    'id', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
    'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei',
    'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class',
]

## Load the data
# The file has no header row, so the column names are supplied explicitly.
# Without header=None pandas would consume the first sample as the header
# and silently drop it. '?' marks a missing value in this file.
data = pd.read_csv(path, header=None, names=names, na_values='?')

## Drop rows that contain missing values
data = data.dropna()

# The sample id is an identifier, not a predictor; keep it out of the features.
data = data.drop(columns=['id'])
data.dtypes

## Oversample the minority class (Class == 4) to balance the classes
class_counts = data['Class'].value_counts()
print(class_counts)
malignant = data[data['Class'] == 4]
# Derive the number of extra rows from the counts instead of hard-coding it.
deficit = max(int(class_counts.loc[2] - class_counts.loc[4]), 0)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): duplicating rows before the train/test split lets copies of
# the same sample land on both sides of the split, which inflates the score.
data = pd.concat([data, malignant.iloc[:deficit]])

## Separate features X and target Y
Y = data['Class']
X = data.drop(columns=['Class'])

## Standardize the features
# NOTE(review): the scaler is fitted on all rows, including future test rows.
from sklearn.preprocessing import StandardScaler
ssCoder = StandardScaler()
X = pd.DataFrame(ssCoder.fit_transform(X))

## Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=0)

## Train the logistic-regression model
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0)
clf.fit(X_train, Y_train)
clf.predict(X_test)
# Print the accuracy so it is visible when run as a script, not only in a notebook.
print(clf.score(X_test, Y_test))
# The original post reported 0.9729119638826185; that figure predates the
# fixes above (lost first row, id used as a feature), so expect a small change.
import pandas as pd
import numpy as np
path = "datas/breast-cancer-wisconsin.data"
names = [
    'id', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
    'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei',
    'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class',
]

## Load the data
df = pd.read_csv(path, names=names)
df.drop(columns=["id"], inplace=True)

## Drop rows that contain missing values ('?' marks a missing entry)
df = df.replace("?", np.nan).dropna()
df["Bare Nuclei"] = df["Bare Nuclei"].astype(np.int64)
# Map the class labels 2 / 4 to 0.0 / 1.0.
df["Class"] = df["Class"] / 2 - 1

## Oversample the minority class to balance the classes
# Counts are read from the data (0.0 -> 444, 1.0 -> 239) to avoid hard-coding.
class_counts = df["Class"].value_counts()
minority = df[df["Class"] == 1]
extra_rows = minority.sample(
    n=int(class_counts.loc[0.0] - class_counts.loc[1.0]),
    random_state=0,
)
# NOTE(review): oversampling before the split puts duplicated rows in both
# the training and test sets, which inflates the reported score.
df = pd.concat([df, extra_rows])

## Separate features X and target Y (Class is the last column)
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

## Standardize the features
# NOTE(review): the scaler is fitted on all rows, including future test rows.
from sklearn.preprocessing import StandardScaler
ssCoder = StandardScaler()
X = pd.DataFrame(ssCoder.fit_transform(X))

## Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

## Train the logistic-regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, Y_train)
print(model.score(X_test, Y_test))

## ROC / AUC on the test set
from sklearn import metrics
proba = model.predict_proba(X_test)
# Column 1 holds the probability of the positive class (label 1).
fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_score=proba[:, 1], pos_label=1)
print(metrics.auc(fpr, tpr))

## Save the model and the fitted scaler
# sklearn.externals.joblib was removed from scikit-learn; prefer the
# standalone joblib package and fall back to the old path on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(model, 'LR.model')
joblib.dump(ssCoder, 'ssCoder.model')

将模型保存后加载出来

import pandas as pd
import numpy as np
path = "datas/breast-cancer-wisconsin.data"
names = [
    'id', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
    'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei',
    'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class',
]

## Load the data
df = pd.read_csv(path, names=names)
df.drop(columns=["id"], inplace=True)

## Drop rows that contain missing values ('?' marks a missing entry)
df = df.replace("?", np.nan).dropna()
df["Bare Nuclei"] = df["Bare Nuclei"].astype(np.int64)
# Map the class labels 2 / 4 to 0.0 / 1.0.
df["Class"] = df["Class"] / 2 - 1

## Oversample the minority class to balance the classes
# Counts are read from the data (0.0 -> 444, 1.0 -> 239) to avoid hard-coding.
class_counts = df["Class"].value_counts()
minority = df[df["Class"] == 1]
extra_rows = minority.sample(
    n=int(class_counts.loc[0.0] - class_counts.loc[1.0]),
    random_state=0,
)
df = pd.concat([df, extra_rows])

## Separate features X and target Y (Class is the last column)
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

## Load the persisted model and scaler
# sklearn.externals.joblib was removed from scikit-learn; prefer the
# standalone joblib package and fall back to the old path on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load('LR.model')
ssCoder = joblib.load('ssCoder.model')

# Reuse the scaler fitted at training time (transform only, no refit).
# Sample of transformed values from the original post:
# -2.90711233e-02, -8.89522504e-01, -9.28310730e-01
X = pd.DataFrame(ssCoder.transform(X))
# NOTE(review): this scores the full oversampled data set; if the model was
# trained on part of the same file, training rows are included and the
# result is not a held-out estimate.
print(model.score(X, Y))

猜你喜欢

转载自blog.csdn.net/chuan403082010/article/details/85049614