0.前言
本文对 Kaggle 泰坦尼克(Titanic)比赛的训练集和测试集进行分析,并对测试集中乘客的生存结果进行预测。作为数据挖掘的入门项目,本人将思路记录下来,以供参考。如有不足之处,欢迎指正。
1.导入数据
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the two Kaggle Titanic splits from the working directory.
train = pd.read_csv('train.csv')
# Row count of the labelled split; used later to cut the combined frame back
# into its train / test parts.
train_len = len(train)
test = pd.read_csv('test.csv')
# Keep a second handle on the raw test frame: `test` is rebound after feature
# engineering, while the submission still needs the original PassengerId column.
test_initial = test
'''每个特征的含义:
PassengerId (乘客编号)
Survived (存活与否)
Pclass (客舱等级)
Name (姓名)
Sex (性别)
Age (年龄)
SibSp (同船的兄弟姐妹及配偶人数)
Parch (父母子女人数)
Ticket (船票编号)
Fare (票价)
Cabin (客舱位置)
Embarked (登船地点)
'''
# Preview the first ten rows of the training split.
train.head(10)
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th… |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
5 |
6 |
0 |
3 |
Moran, Mr. James |
male |
NaN |
0 |
0 |
330877 |
8.4583 |
NaN |
Q |
6 |
7 |
0 |
1 |
McCarthy, Mr. Timothy J |
male |
54.0 |
0 |
0 |
17463 |
51.8625 |
E46 |
S |
7 |
8 |
0 |
3 |
Palsson, Master. Gosta Leonard |
male |
2.0 |
3 |
1 |
349909 |
21.0750 |
NaN |
S |
8 |
9 |
1 |
3 |
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) |
female |
27.0 |
0 |
2 |
347742 |
11.1333 |
NaN |
S |
9 |
10 |
1 |
2 |
Nasser, Mrs. Nicholas (Adele Achem) |
female |
14.0 |
1 |
0 |
237736 |
30.0708 |
NaN |
C |
# Preview the first ten rows of the test split (it has no Survived column).
test.head(10)
|
PassengerId |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
892 |
3 |
Kelly, Mr. James |
male |
34.5 |
0 |
0 |
330911 |
7.8292 |
NaN |
Q |
1 |
893 |
3 |
Wilkes, Mrs. James (Ellen Needs) |
female |
47.0 |
1 |
0 |
363272 |
7.0000 |
NaN |
S |
2 |
894 |
2 |
Myles, Mr. Thomas Francis |
male |
62.0 |
0 |
0 |
240276 |
9.6875 |
NaN |
Q |
3 |
895 |
3 |
Wirz, Mr. Albert |
male |
27.0 |
0 |
0 |
315154 |
8.6625 |
NaN |
S |
4 |
896 |
3 |
Hirvonen, Mrs. Alexander (Helga E Lindqvist) |
female |
22.0 |
1 |
1 |
3101298 |
12.2875 |
NaN |
S |
5 |
897 |
3 |
Svensson, Mr. Johan Cervin |
male |
14.0 |
0 |
0 |
7538 |
9.2250 |
NaN |
S |
6 |
898 |
3 |
Connolly, Miss. Kate |
female |
30.0 |
0 |
0 |
330972 |
7.6292 |
NaN |
Q |
7 |
899 |
2 |
Caldwell, Mr. Albert Francis |
male |
26.0 |
1 |
1 |
248738 |
29.0000 |
NaN |
S |
8 |
900 |
3 |
Abrahim, Mrs. Joseph (Sophie Halaut Easu) |
female |
18.0 |
0 |
0 |
2657 |
7.2292 |
NaN |
C |
9 |
901 |
3 |
Davies, Mr. John Samuel |
male |
21.0 |
2 |
0 |
A/4 48871 |
24.1500 |
NaN |
S |
# Count missing values per column in each split.
train.isnull().sum()
test.isnull().sum()
2.特征分析
2.1 数值数据
# Correlation heat map between the label and the numeric features.
numeric_cols = ["Survived", "Age", "SibSp", "Parch", "Fare"]
corr_matrix = train[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title('Pearson Correlation of Numerical Features')
plt.show()

# Age density, one curve per survival outcome.
g = sns.FacetGrid(train, hue="Survived", aspect=4)
g.map(sns.kdeplot, 'Age', shade=True)
oldest_age = train['Age'].max()
g.set(xlim=(0, oldest_age))
g.add_legend()
plt.show()

# Mean survival rate for each value of the two family-count columns.
# NOTE(review): factorplot was renamed catplot in seaborn 0.9 -- confirm the
# installed seaborn version before upgrading.
for count_col in ("SibSp", "Parch"):
    g = sns.factorplot(x=count_col, y="Survived", data=train, kind="bar")
    g.set_ylabels("survival probability")
    plt.show()

# Fare distribution, with its skewness shown in the legend.
fare_skew = train['Fare'].skew()
g = sns.distplot(train['Fare'], label='skewness:{:.2f}'.format(fare_skew))
g.legend(loc="best")
plt.show()
2.2 分类数据
# Survival rate by sex and by passenger class.
for category_col in ("Sex", "Pclass"):
    g = sns.barplot(x=category_col, y="Survived", data=train)
    g.set_ylabel("Survival Probability")
    plt.show()

# Survival rate by port of embarkation.
sns.factorplot(data=train, x="Embarked", y="Survived")
plt.show()

# Flag whether a cabin was recorded at all (1) or is missing (0).
has_cabin = train["Cabin"].notnull()
train['Cabin_Bool'] = has_cabin.astype('int')
sns.barplot(x="Cabin_Bool", y="Survived", data=train)
plt.show()
3.填充缺失数据
# Stack train on top of test (fresh 0..n-1 index) so imputation and feature
# engineering are applied to both splits in one pass.
frames = [train, test]
combined = pd.concat(frames, axis=0, ignore_index=True)
# Missing-value count per column of the stacked frame.
combined.isnull().sum()
3.1 填充Fare, Embarked
# Inspect port frequencies before choosing a fill value.
combined['Embarked'].value_counts()
# Rows with no recorded port are assigned 'S'.
port_missing = combined['Embarked'].isnull()
combined.loc[port_missing, 'Embarked'] = 'S'
# Show the rows that still lack a fare.
combined[combined['Fare'].isnull()]
|
Age |
Cabin |
Cabin_Bool |
Embarked |
Fare |
Name |
Parch |
PassengerId |
Pclass |
Sex |
SibSp |
Survived |
Ticket |
1043 |
60.5 |
NaN |
NaN |
S |
NaN |
Storey, Mr. Thomas |
0 |
1044 |
3 |
male |
0 |
NaN |
3701 |
# Mean fare over all third-class passengers in the combined data.
combined[combined.Pclass==3]['Fare'].mean()
13.302888700564969
# The passenger with the missing fare travels in third class, so fill with the
# mean third-class fare.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the in-place form is chained assignment, which may operate on a
# temporary copy (and does under pandas Copy-on-Write), leaving `combined`
# unchanged. This also matches how Embarked is filled above.
third_class_mean_fare = combined.loc[combined['Pclass'] == 3, 'Fare'].mean()
combined['Fare'] = combined['Fare'].fillna(third_class_mean_fare)
3.2 填充Age
# 2x2 grid of box plots: age distribution against each candidate predictor.
fig = plt.figure(figsize=(10, 10))
ax1, ax2, ax3, ax4 = (fig.add_subplot(2, 2, position) for position in range(1, 5))
age_predictors = ('Sex', 'Pclass', 'Parch', 'SibSp')
for predictor, axis in zip(age_predictors, (ax1, ax2, ax3, ax4)):
    sns.boxplot(y='Age', x=predictor, data=combined, ax=axis)
plt.show()

# Encode Sex numerically so it can take part in the correlation matrix.
sex_codes = {"male": 0, "female": 1}
combined["Sex"] = combined["Sex"].map(sex_codes)
age_corr = combined[["Age", "Sex", "SibSp", "Parch", "Pclass"]].corr()
sns.heatmap(age_corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.show()
# Impute missing ages with the median age of passengers that share the same
# SibSp / Parch / Pclass combination; if no passenger in that group has a
# known age, fall back to the median of all observed ages.
#
# This replaces a row-by-row loop that wrote through chained indexing
# (combined['Age'][i] = ...), which may write to a temporary copy, and that
# recomputed the column median on partially imputed data at every step.
# Both medians are now taken from observed ages only, so the result does not
# depend on the order in which rows are filled.
index_nan_age = list(combined.index[combined['Age'].isnull()])
observed_age_median = combined['Age'].median()
group_age_median = combined.groupby(['SibSp', 'Parch', 'Pclass'])['Age'].transform('median')
combined['Age'] = combined['Age'].fillna(group_age_median).fillna(observed_age_median)
combined.head()
|
Age |
Cabin |
Cabin_Bool |
Embarked |
Fare |
Name |
Parch |
PassengerId |
Pclass |
Sex |
SibSp |
Survived |
Ticket |
0 |
22.0 |
NaN |
0.0 |
S |
7.2500 |
Braund, Mr. Owen Harris |
0 |
1 |
3 |
0 |
1 |
0.0 |
A/5 21171 |
1 |
38.0 |
C85 |
1.0 |
C |
71.2833 |
Cumings, Mrs. John Bradley (Florence Briggs Th… |
0 |
2 |
1 |
1 |
1 |
1.0 |
PC 17599 |
2 |
26.0 |
NaN |
0.0 |
S |
7.9250 |
Heikkinen, Miss. Laina |
0 |
3 |
3 |
1 |
0 |
1.0 |
STON/O2. 3101282 |
3 |
35.0 |
C123 |
1.0 |
S |
53.1000 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
0 |
4 |
1 |
1 |
1 |
1.0 |
113803 |
4 |
35.0 |
NaN |
0.0 |
S |
8.0500 |
Allen, Mr. William Henry |
0 |
5 |
3 |
0 |
0 |
0.0 |
373450 |
4.特征工程
4.1 从Name中提取头衔
# The title sits between the comma and the first period of the name,
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
title = [full_name.split(',')[1].split('.')[0].strip() for full_name in combined['Name']]
# Assign the list positionally so the result does not depend on index alignment.
combined['Title'] = title

plt.figure(figsize=(10, 6))
sns.countplot(x="Title", data=combined)
# rotation must be a number: current matplotlib rejects the string '45'.
plt.xticks(rotation=45)
plt.show()

# Collapse the infrequent titles into a single 'Rare' bucket. The result is
# assigned back rather than replaced in place on a column selection, which is
# chained assignment and may not modify `combined`.
rare_titles = ['Don', 'Rev', 'Dr', 'Major', 'Lady', 'Sir', 'Col', 'Capt',
               'the Countess', 'Jonkheer', 'Dona']
combined["Title"] = combined["Title"].replace(rare_titles, 'Rare')
# Integer codes: 0 = Master, 1 = female titles, 2 = Mr, 3 = Rare.
combined["Title"] = combined["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
combined["Title"].value_counts()

g = sns.barplot(x="Title", y="Survived", data=combined)
g.set(xticklabels=["Master", "Miss-Mrs", "Mr", "Rare"], ylabel='survival probability')
plt.show()
4.2 从Parch和SibSp中提取家庭人数
# Family size = parents/children + siblings/spouses + the passenger.
relatives_aboard = combined[['Parch', 'SibSp']].sum(axis=1)
combined['Fam_Size'] = relatives_aboard + 1
sns.factorplot(x="Fam_Size", y="Survived", data=combined)
plt.show()
def make_category(size):
    """Bucket a family size into a coarse label.

    Args:
        size: Number of people in the family, including the passenger.

    Returns:
        'single' for 1, 'small' for any other value up to 3, 'medium' for 4,
        and 'large' for everything else.
    """
    if size == 1:
        return 'single'
    if size <= 3:
        return 'small'
    if size == 4:
        return 'medium'
    return 'large'
# Label every passenger's family size, then one-hot encode the labels.
family_labels = combined['Fam_Size'].map(make_category)
combined['Fam_Size_Class'] = family_labels
dummy_fam_size = pd.get_dummies(family_labels, prefix='Fam_Size')
combined = pd.concat([combined, dummy_fam_size], axis=1)
combined.head()
|
Age |
Cabin |
Cabin_Bool |
Embarked |
Fare |
Name |
Parch |
PassengerId |
Pclass |
Sex |
SibSp |
Survived |
Ticket |
Title |
Fam_Size |
Fam_Size_Class |
Fam_Size_large |
Fam_Size_medium |
Fam_Size_single |
Fam_Size_small |
0 |
22.0 |
NaN |
0.0 |
S |
7.2500 |
Braund, Mr. Owen Harris |
0 |
1 |
3 |
0 |
1 |
0.0 |
A/5 21171 |
2 |
2 |
small |
0 |
0 |
0 |
1 |
1 |
38.0 |
C85 |
1.0 |
C |
71.2833 |
Cumings, Mrs. John Bradley (Florence Briggs Th… |
0 |
2 |
1 |
1 |
1 |
1.0 |
PC 17599 |
1 |
2 |
small |
0 |
0 |
0 |
1 |
2 |
26.0 |
NaN |
0.0 |
S |
7.9250 |
Heikkinen, Miss. Laina |
0 |
3 |
3 |
1 |
0 |
1.0 |
STON/O2. 3101282 |
1 |
1 |
single |
0 |
0 |
1 |
0 |
3 |
35.0 |
C123 |
1.0 |
S |
53.1000 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
0 |
4 |
1 |
1 |
1 |
1.0 |
113803 |
1 |
2 |
small |
0 |
0 |
0 |
1 |
4 |
35.0 |
NaN |
0.0 |
S |
8.0500 |
Allen, Mr. William Henry |
0 |
5 |
3 |
0 |
0 |
0.0 |
373450 |
2 |
1 |
single |
0 |
0 |
1 |
0 |
4.3 从Cabin提取首字母
# Reduce each cabin to its deck letter; passengers with no recorded cabin get 'X'.
letter = [cabin[0] if pd.notnull(cabin) else 'X' for cabin in combined['Cabin']]
# Assign the list positionally so the result does not depend on index alignment.
combined['Cabin'] = letter

deck_order = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X']
fig = plt.figure(figsize=(10, 5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
# Pass the data as x=...: current seaborn no longer accepts it positionally,
# and the keyword form matches the barplot call below.
sns.countplot(x=combined['Cabin'], order=deck_order, ax=ax1)
sns.barplot(x=combined['Cabin'], y=combined['Survived'], order=deck_order, ax=ax2)
plt.show()

combined = pd.get_dummies(combined, columns=["Cabin"], prefix="Cabin")
4.4 从Ticket中提取字母
def _ticket_prefix(ticket):
    """Return the first token of a ticket string, or "X" for an all-digit ticket.

    Dots and slashes are removed before the first space-separated token is
    taken, e.g. "A/5 21171" -> "A5".
    """
    if ticket.isdigit():
        return "X"
    return ticket.replace(".", "").replace("/", "").strip().split(' ')[0]


# Replace each ticket by its prefix token, then one-hot encode the prefixes.
Ticket = [_ticket_prefix(raw_ticket) for raw_ticket in combined['Ticket']]
combined["Ticket"] = Ticket
combined = pd.get_dummies(combined, columns=["Ticket"], prefix="Ticket")
combined.head()
|
Age |
Cabin_Bool |
Embarked |
Fare |
Name |
Parch |
PassengerId |
Pclass |
Sex |
SibSp |
… |
Ticket_SOTONO2 |
Ticket_SOTONOQ |
Ticket_SP |
Ticket_STONO |
Ticket_STONO2 |
Ticket_STONOQ |
Ticket_SWPP |
Ticket_WC |
Ticket_WEP |
Ticket_X |
0 |
22.0 |
0.0 |
S |
7.2500 |
Braund, Mr. Owen Harris |
0 |
1 |
3 |
0 |
1 |
… |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
38.0 |
1.0 |
C |
71.2833 |
Cumings, Mrs. John Bradley (Florence Briggs Th… |
0 |
2 |
1 |
1 |
1 |
… |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
2 |
26.0 |
0.0 |
S |
7.9250 |
Heikkinen, Miss. Laina |
0 |
3 |
3 |
1 |
0 |
… |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
3 |
35.0 |
1.0 |
S |
53.1000 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
0 |
4 |
1 |
1 |
1 |
… |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
4 |
35.0 |
0.0 |
S |
8.0500 |
Allen, Mr. William Henry |
0 |
5 |
3 |
0 |
0 |
… |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
5 rows × 64 columns
combined.info()
# One-hot encode the two remaining categorical columns in a single call; the
# Pclass_* columns are appended first, then the Embarked_* columns.
combined = pd.get_dummies(
    combined,
    columns=["Pclass", "Embarked"],
    prefix=["Pclass", "Embarked"],
)
# Drop helper and identifier columns that are not used as model features.
unused_columns = ['Cabin_Bool', 'Name', 'PassengerId', 'Fam_Size_Class']
combined = combined.drop(unused_columns, axis=1)
combined.head()
|
Age |
Fare |
Parch |
Sex |
SibSp |
Survived |
Title |
Fam_Size |
Fam_Size_large |
Fam_Size_medium |
… |
Ticket_SWPP |
Ticket_WC |
Ticket_WEP |
Ticket_X |
Pclass_1 |
Pclass_2 |
Pclass_3 |
Embarked_C |
Embarked_Q |
Embarked_S |
0 |
22.0 |
7.2500 |
0 |
0 |
1 |
0.0 |
2 |
2 |
0 |
0 |
… |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
38.0 |
71.2833 |
0 |
1 |
1 |
1.0 |
1 |
2 |
0 |
0 |
… |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
2 |
26.0 |
7.9250 |
0 |
1 |
0 |
1.0 |
1 |
1 |
0 |
0 |
… |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
3 |
35.0 |
53.1000 |
0 |
1 |
1 |
1.0 |
1 |
2 |
0 |
0 |
… |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
1 |
4 |
35.0 |
8.0500 |
0 |
0 |
0 |
0.0 |
2 |
1 |
0 |
0 |
… |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
5 rows × 64 columns
在建模前查看各列的数据类型,确保全部为数值型,才能放进模型里。
# Print each column's dtype and non-null count for the engineered frame.
combined.info()
5.建模
# Cut the engineered frame back into the labelled and unlabelled parts.
# `train` is copied because a column is assigned on it below; writing into a
# plain slice of `combined` triggers SettingWithCopyWarning and is ambiguous
# about whether `combined` itself is modified.
train = combined.iloc[:train_len].copy()
test = combined.iloc[train_len:]
# Survived became float after concatenation (the test rows hold NaN); restore
# integer labels for the classifiers.
train['Survived'] = train['Survived'].astype(int)
train_Y = train["Survived"]
train_X = train.drop(labels=["Survived"], axis=1)
test_X = test.drop(labels=["Survived"], axis=1)
5.1 交叉验证
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier,GradientBoostingClassifier,ExtraTreesClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


def _cv_accuracy(estimator):
    """Return the per-fold accuracies of `estimator` under 10-fold CV on the training data."""
    return cross_val_score(estimator, X=train_X, y=train_Y, scoring='accuracy', cv=10, n_jobs=-1)


# Baseline comparison of candidate models with default hyper-parameters.
rf_score = _cv_accuracy(RandomForestClassifier(random_state=2))
print('Random Forrest: {:.3f}'.format(rf_score.mean()))
knn_score = _cv_accuracy(KNeighborsClassifier())
print('KNN: {:.3f}'.format(knn_score.mean()))
lr_score = _cv_accuracy(LogisticRegression(random_state=2))
print('logistic regresssion: {:.3f}'.format(lr_score.mean()))
gb_score = _cv_accuracy(GradientBoostingClassifier(random_state=2))
print('GradientBoosting: {:.3f}'.format(gb_score.mean()))
et_score = _cv_accuracy(ExtraTreesClassifier(random_state=2))
print('Extra Tree: {:.3f}'.format(et_score.mean()))
ada_score = _cv_accuracy(AdaBoostClassifier(DecisionTreeClassifier(random_state=2), random_state=2, learning_rate=0.1))
print('Ada Boost: {:.3f}'.format(ada_score.mean()))
Random Forrest: 0.813
KNN: 0.723
logistic regresssion: 0.824
Gradient Boosting: 0.829
Extra Tree: 0.807
Ada Boost: 0.813
5.2 调参
# The same 10-fold splitter is reused for every grid search.
kfold = KFold(n_splits=10)


def _run_grid_search(estimator, param_grid):
    """Fit an accuracy-scored grid search over `param_grid` and return it."""
    search = GridSearchCV(estimator, param_grid=param_grid, cv=kfold, scoring="accuracy", n_jobs=-1, verbose=1)
    search.fit(train_X, train_Y)
    return search


# Random forest
rf = RandomForestClassifier()
rf_param_grid = {
    "max_depth": [None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators": [100, 300],
    "criterion": ["gini"],
}
gs_rf = _run_grid_search(rf, rf_param_grid)
rf_best = gs_rf.best_estimator_

# Logistic regression: search the regularisation parameter C.
lr = LogisticRegression()
lr_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
gs_lr = _run_grid_search(lr, lr_param_grid)
lr_best = gs_lr.best_estimator_

# Gradient boosting
gb = GradientBoostingClassifier()
gb_param_grid = {
    'loss': ["deviance"],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [4, 8],
    'min_samples_leaf': [100, 150],
    'max_features': [0.3, 0.1],
}
gs_gb = _run_grid_search(gb, gb_param_grid)
gb_best = gs_gb.best_estimator_
5.3 模型融合
使用投票分类法,将3种模型融合
# Soft-voting ensemble over the three tuned models.
ensemble_members = [('rf', rf_best), ('lr', lr_best), ('gb', gb_best)]
voting_est = VotingClassifier(estimators=ensemble_members, voting='soft', n_jobs=-1)
# fit() returns the estimator itself, so fitting and predicting can be chained.
predictions = voting_est.fit(train_X, train_Y).predict(test_X)
5.4 生成预测结果
# Build the submission: original passenger ids paired with predicted labels.
submission_columns = {
    'PassengerId': test_initial['PassengerId'],
    'Survived': predictions,
}
result = pd.DataFrame(submission_columns)
result.to_csv('result.csv', index=False)
提交结果,分数如下. 第一次参赛,结果不算差, 但还有改进的空间, 要继续努力呀.