版权声明:博主是初学者,博文可能会有错误,望批评指正!转载请注明本博客地址,谢谢! https://blog.csdn.net/LieQueov/article/details/87904619
1.题目链接
Titanic: Machine Learning from Disaster
2.参考资料
3.线上成绩
截至2019年2月24日
排名:413/9909 前4%
线上成绩:0.82296
4.流程及代码
4.1 载入数据
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Load the Kaggle Titanic train/test splits (expects the CSVs under ./data/).
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')
train_y = train_data["Survived"]
train_x = train_data
# Stack train and test so feature engineering runs on both at once; test rows
# get NaN in 'Survived' (the column is dropped again before training).
all_data = pd.concat((train_x, test_data)).reset_index(drop=True)
4.2 缺失值填充
## 根据计算不同仓位的在不同港口的中位数售价,最接近票价的港口,填充Embarked
## Fill the two missing Embarked values: compare the median fare per port
## within the same class and pick the port whose median is closest to the
## passenger's fare.
P1 = all_data[all_data['Pclass']==1][['Fare','Embarked']]
P1.groupby('Embarked')['Fare'].median()  # exploratory: medians inspected by hand
# Rows 61 and 829 — presumably the two passengers with missing Embarked;
# 'C' was judged the closest match interactively. TODO confirm indices.
all_data.loc[829,'Embarked'] = 'C'
all_data.loc[61,'Embarked'] = 'C'
P3 = all_data[all_data['Pclass']==3][['Fare','Embarked']]
P3.groupby('Embarked')['Fare'].median()  # exploratory
# Row 1043 has a missing Fare; 8.05 presumably taken from the 3rd-class
# median computed above. TODO confirm against the interactive output.
all_data.loc[1043,'Fare'] = 8.05
# Derive a surname-based 'Family' feature, then impute missing ages for
# passengers travelling only with siblings/spouses (SibSp > 0, Parch == 0):
# copy the age of a companion on the same ticket, since siblings/spouses are
# typically close in age.
all_data['Family'] = all_data['Name'].str.split(',', expand=True)[0]

# Passengers whose age is missing and who travel with siblings/spouses only.
# (Fixed: the original computed these two statements twice back to back —
# the exact duplicate has been removed; no behavior change.)
only_have_Sibsp = all_data[all_data['Age'].isnull() == True][(all_data['SibSp'] > 0) & (all_data['Parch'] == 0)]
Sibsp_ticket = only_have_Sibsp['Ticket'].unique()
for f in Sibsp_ticket:
    tmp = all_data[all_data['Ticket'] == f][['Ticket', 'Family', 'Age']]
    # i ranges over same-ticket rows with missing Age; j over those with Age known.
    for i in tmp['Age'][all_data['Age'].isnull() == True].index:
        index_notnull = tmp['Age'][all_data['Age'].isnull() == False].index
        for j in index_notnull:
            # Same surname and same SibSp count: assume sibling/spouse, copy age.
            if (all_data.loc[i, 'Family'] == all_data.loc[j, 'Family'] and all_data.loc[i, 'SibSp'] == all_data.loc[j, 'SibSp']):
                all_data.loc[i, 'Age'] = all_data.loc[j, 'Age']
            # NOTE(review): this second rule copies the age from ANY same-ticket
            # passenger with Parch == 0, regardless of surname — looks overly
            # broad, but preserved as in the original. TODO confirm intent.
            if (all_data.loc[i, 'Parch'] == 0 and all_data.loc[j, 'Parch'] == 0):
                all_data.loc[i, 'Age'] = all_data.loc[j, 'Age']
# Remaining missing ages: fall back to the overall median.
all_data['Age'] = all_data['Age'].fillna(all_data['Age'].median())
4.3 增加特征
# Engineered features:
#   1. family survival rate (Family_Survival)
#   2. age rank within each passenger class
#   3. fare rank within each passenger class
#   4. number of relatives (simple sum)
#   5. whether the passenger has siblings/spouse aboard
#   6. whether the passenger has children/parents aboard
#   7. whether the passenger travels alone

# Feature 1: Family_Survival. 0.5 = unknown, 1 = at least one other group
# member is known to have survived, 0 = every other member with a known
# outcome died.
DEFAULT_SURVIVAL_VALUE = 0.5
all_data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE
# First pass: passengers sharing a surname AND an identical fare are treated
# as one family group.
for grp, grp_df in all_data[['Survived','Name', 'Family', 'Fare', 'Ticket', 'PassengerId',
                             'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Family', 'Fare']):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            # Outcomes of the OTHER members only; NaN (test rows) is ignored
            # by max/min.
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 0
print("Number of passengers with family survival information:",
      all_data.loc[all_data['Family_Survival']!=0.5].shape[0])
# Second pass: passengers sharing a ticket also form a travel group; only
# rows still at 0 or 0.5 may be upgraded here (a 1 from the first pass wins).
for _, grp_df in all_data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 0
print("Number of passenger with family/group survival information: "
      +str(all_data[all_data['Family_Survival']!=0.5].shape[0]))
# Features 2 & 3: dense rank of Age and of Fare within each passenger class.
def _dense_rank_feature(df, pclass, src_col, dst_col):
    """Write 1-based dense ranks of ``src_col`` into ``dst_col`` for the rows
    whose Pclass equals ``pclass``; equal values share a rank. Rows of other
    classes are left untouched (NaN if ``dst_col`` is new)."""
    subset = df[df['Pclass'] == pclass]
    rank = 0
    prev = None
    for idx in subset.sort_values(src_col).index:
        value = df.loc[idx, src_col]
        # Only a strictly larger value opens a new rank, so ties share one.
        if prev is None or prev < value:
            rank = rank + 1
        df.loc[idx, dst_col] = rank
        prev = value

# NOTE(review): the original hand-unrolled six copies of this loop but forgot
# to update ``pre`` in five of them, turning those "ranks" into plain row
# counters with no tie handling. The helper applies tie-aware ranking
# uniformly, matching the one correctly-written loop (Pclass-1 Age).
for _pclass in (1, 2, 3):
    _dense_rank_feature(all_data, _pclass, 'Age', 'Pclass{}_Age_Rank'.format(_pclass))
    _dense_rank_feature(all_data, _pclass, 'Fare', 'Pclass{}_Fare_Rank'.format(_pclass))

# Rows belonging to the other classes never received a rank; fill with 0.
for _col in ('Pclass1_Age_Rank', 'Pclass2_Age_Rank', 'Pclass3_Age_Rank',
             'Pclass1_Fare_Rank', 'Pclass2_Fare_Rank', 'Pclass3_Fare_Rank'):
    all_data[_col] = all_data[_col].fillna(0)
# Features 4-7: family size plus simple 0/1 indicator features.
all_data['Family_Size'] = all_data['Parch'] + all_data['SibSp'] + 1
all_data['Have_SibSp'] = all_data['SibSp'].apply(lambda x: 1 if x > 0 else 0)
all_data['Have_Parch'] = all_data['Parch'].apply(lambda x: 1 if x > 0 else 0)
# Fixed: the original used chained indexing
# (all_data['Is_Alone'].loc[mask] = 0), which pandas may apply to a
# temporary copy (SettingWithCopy). A single vectorized assignment produces
# the same values (1 when travelling alone, else 0) reliably.
all_data['Is_Alone'] = (all_data['Family_Size'] == 1).astype(int)
4.4 删除不参与模型训练的特征
# Remove columns that must not reach the model: identifiers and free text
# (PassengerId, Name, Ticket), the mostly-missing Cabin, the helper Family
# column, and the target itself.
for _unused in ("PassengerId", "Cabin", "Ticket", "Name", "Family", "Survived"):
    all_data.drop(_unused, axis=1, inplace=True)
4.5对类别型变量进行编码
def encoder(x):
    """Ordinal-encode an embarkation port: 'C' -> 1, 'Q' -> 2, anything else -> 3."""
    return {'C': 1, 'Q': 2}.get(x, 3)
## Encode the Embarked feature ordinally, then one-hot encode the remaining
## categorical columns (Sex).
all_data['Embarked'] = all_data['Embarked'].map(encoder)
all_data['Sex'] = all_data['Sex'].astype(str)
## One-hot encode every remaining object-typed column.
all_data = pd.get_dummies(all_data)
4.6 将数据分为训练集和测试集
# Split the combined frame back into train/test partitions by row position
# (the concat at the top preserved order: train rows first, then test rows).
n_train = train_x.shape[0]
train_x, test_x = all_data.iloc[:n_train], all_data.iloc[n_train:]
4.7 模型训练
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm,feature_selection
from sklearn.model_selection import StratifiedKFold, cross_val_score
import warnings
warnings.filterwarnings('ignore')
def acc_cv(model):
    """Return (mean, std) of 5-fold stratified CV accuracy on the training set.

    Uses the module-level ``train_x`` / ``train_y``; the folds are shuffled
    with a fixed seed so results are reproducible.
    """
    splitter = StratifiedKFold(5, shuffle=True, random_state=8)
    scores = cross_val_score(model, train_x.values, train_y,
                             scoring="accuracy", cv=splitter)
    return (scores.mean(), scores.std())
# RBF-kernel SVM on standardized features; putting the scaler inside the
# pipeline keeps each CV fold scaled only on its own training part.
rbf_svc = make_pipeline(StandardScaler(), svm.SVC(C=1.0,random_state=2))
acc_cv(rbf_svc)  # (mean, std) CV accuracy — inspected interactively
rbf_svc.fit(train_x.values,train_y.values)
test_y = rbf_svc.predict(test_x.values)
# Build and write the Kaggle submission file.
sub = pd.DataFrame()
sub['PassengerId'] = test_data['PassengerId']
sub['Survived'] = test_y
sub.to_csv('submission_0224.csv',index=False)
5.总结
(1)特征的结果的影响远远大于模型。根据参考增加了‘Family_Survival’特征,线上成绩提高了2%。将时间放在寻找或者生成新特征上相较于模型调参更划算。
(2)集成模型或者说复杂模型的成绩不一定好过单个模型。复杂的模型更容易过拟合。
(3)交叉验证得到的平均准确率与线上成绩之间存在偏差。比如线下为0.822时,线上为0.80。但是,通过调整参数,线下为0.827时,线上可能只有0.79。不过,如果线下从0.81提升到0.83,线上成绩通常也会随之提升。
(4)组合特征的向下分解过细不一定有利于结果,比如本代码中特征'Pclass1_Age_Rank'表示在舱位1中年龄的排名,我们还可以将'Pclass1_Age_Rank'分解成'Pclass1_Age_Male_Rank'和'Pclass1_Age_Female_Rank',表示在舱位1的男性中年龄排名和在舱位1的女性中年龄排名,但线下和线上结果都比之前差。可能的原因是:通过这样的分解,训练数据中会变得更加稀疏,分解过细,也导致符合这个要求的数据更少,更难获得一定的特性。比如 在舱位1中女性可能就只有1个人(只是举例,和真实数据不同)。