Titanic Survivor Prediction
The sinking of the Titanic is one of the worst maritime disasters in history. Here we build a decision tree classifier to predict which passengers were likely to survive.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
Read the data
data = pd.read_csv(r"E:\machineLearning\cai\decisiontree\data.csv")
data
Explore the data
data.info()
# explore the data: column types, non-null counts, memory usage
data.head(10)
# show the first 10 rows (the default is 5)
data.drop(['Cabin', 'Name', 'Ticket'], inplace=True, axis=1)
# select features: drop the columns we don't need; inplace=True overwrites the original table,
# and axis indicates the direction: axis=1 means columns, axis=0 means rows
# equivalent to: data = data.drop(['Cabin', 'Name', 'Ticket'], axis=1)
data
Data preprocessing
data["Age"] = data["Age"].fillna(data["Age"].mean()) #处理缺失值,用均值填补 data = data.dropna(axis=0) #删除有缺失值的行
data["Embarked"].unique() #提取“Embarked”的类型,即去重后返回 >>array(['S', 'C', 'Q'], dtype=object)
labels = data["Embarked"].unique().tolist() #将数组转换成列表
data["Embarked"] = data["Embarked"].apply(lambda x:labels.index(x)) #将列表的元素转换成其索引值, S-->0 , C-->1 , Q-->2
data["Sex"] == "male" #判断 >>0 True 1 False 2 False 3 False 4 True ... 886 True 887 False 888 False 889 True 890 True Name: Sex, Length: 889, dtype: bool
data["Sex"] = (data["Sex"] == "male").astype("int") #将True和False的布尔值,转换成1/0
Feature selection is complete, and the non-numeric features have all been converted to numeric values.
x = data.iloc[:, data.columns != "Survived"]  # all rows, every column except "Survived"
y = data.iloc[:, data.columns == "Survived"]  # the "Survived" label column
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.3)
Xtrain.shape  # confirm that the training set holds 70% of the data
>>> (622, 8)
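A hedged aside, not part of the original run: passing stratify=y would keep the survivor ratio identical in both splits, which is often worth doing in classification problems:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.3, stratify=y, random_state=25)  # hypothetical variant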
Xtrain  # note that the index is now shuffled
Xtrain.index = range(Xtrain.shape[0])
# Xtrain.shape returns (622, 8), so Xtrain.shape[0] is 622 and range(622) yields the integers 0-621
for i in [Xtrain, Xtest, Ytrain, Ytest]:
    i.index = range(i.shape[0])
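A more idiomatic way to do the same reset, offered here as a hedged aside, is pandas' reset_index:
for i in [Xtrain, Xtest, Ytrain, Ytest]:
    i.reset_index(drop=True, inplace=True)  # drop=True discards the old shuffled index instead of keeping it as a column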
Features and labels are now separated; the dataset construction is complete.
Modeling
clf = DecisionTreeClassifier(random_state=25)
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)
score
>>> 0.7340823970037453
This score is not high; let's use cross-validation to get a more reliable estimate of the model's accuracy.
clf = DecisionTreeClassifier(random_state=25)
score = cross_val_score(clf, x, y, cv=10).mean()
score
>>> 0.7469611848825333
The score is still on the low side, so we try to raise it by pruning. We draw a learning curve to explore the optimal value of max_depth.
tr = []
te = []
for i in range(10):
    clf = DecisionTreeClassifier(random_state=25
                                 , max_depth=i+1)
    clf = clf.fit(Xtrain, Ytrain)
    score_tr = clf.score(Xtrain, Ytrain)
    score_te = cross_val_score(clf, x, y, cv=10).mean()
    tr.append(score_tr)
    te.append(score_te)
print(max(te))
>>> 0.8143896833503576
plt.plot(range(1,11), tr, color="red", label="train")
plt.plot(range(1,11), te, color="blue", label="test")
plt.xticks(range(1,11))
plt.legend()
plt.show()
# if the model does very well on the training set but badly on the test set, it is overfitting;
# if it does badly on the training set as well, it is underfitting
As max_depth grows, the tree fits the training set more and more tightly and overfitting sets in; the test score peaks around max_depth=3, so that is where parameter tuning is most likely to yield the best model.
tr = []
te = []
for i in range(10):
    clf = DecisionTreeClassifier(random_state=25
                                 , max_depth=i+1
                                 , criterion="entropy")  # switch from Gini impurity to entropy; typically tried when the model underfits
    clf = clf.fit(Xtrain, Ytrain)
    score_tr = clf.score(Xtrain, Ytrain)
    score_te = cross_val_score(clf, x, y, cv=10).mean()
    tr.append(score_tr)
    te.append(score_te)
print(max(te))
>>> 0.8166624106230849
plt.plot(range(1,11), tr, color="red", label="train")
plt.plot(range(1,11), te, color="blue", label="test")
plt.xticks(range(1,11))
plt.legend()
plt.show()
Grid search to determine the optimal parameters
Grid search helps us tune several parameters at the same time; in essence it enumerates every combination of the candidate values, as the sketch below makes concrete.
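To see what "enumeration" means here, a minimal sketch (the toy_grid dictionary is made up for illustration) that lists every combination a grid search would try:
from sklearn.model_selection import ParameterGrid

toy_grid = {"max_depth": [1, 2, 3], "criterion": ["gini", "entropy"]}  # hypothetical toy grid
for combo in ParameterGrid(toy_grid):  # yields each of the 3 * 2 = 6 parameter combinations
    print(combo)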
import numpy as np
gini_thresholds = np.linspace(0, 0.5, 20)  # 20 ordered, evenly spaced values between 0 and 0.5
parameters = {'splitter': ('best', 'random')
              , 'criterion': ("gini", "entropy")
              , "max_depth": [*range(1, 10)]
              , 'min_samples_leaf': [*range(1, 50, 5)]
              , 'min_impurity_decrease': [*gini_thresholds]  # minimum impurity decrease: a node no longer splits below this value
              }
clf = DecisionTreeClassifier(random_state=25)
GS = GridSearchCV(clf, parameters, cv=10)  # parameters is a dict mapping parameter names to their candidate values
GS.fit(Xtrain, Ytrain)
GS.best_params_  # the best combination from the parameters and candidate values we supplied
>>> {'criterion': 'entropy', 'max_depth': 6, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'splitter': 'random'}
GS.best_score_
>>> 0.8183307731694829
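Finally, a hedged sketch, not part of the original run: GridSearchCV refits the best parameter combination on all of Xtrain by default, and the refit model in GS.best_estimator_ can be scored on the held-out test split to check that the tuned parameters generalize:
best_clf = GS.best_estimator_  # the model refit with the best parameters
print(best_clf.score(Xtest, Ytest))  # accuracy on the 30% held-out test set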