Kaggle练习——Titanic

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

inputData = pd.read_csv('./train.csv')
trainData = inputData.copy(deep = True)

拷贝一份trainData用于数据处理,不动原数据。(其实好像没啥用)

del trainData['PassengerId']
del trainData['Cabin']
del trainData['Ticket']

删除机舱和乘客id

trainData.describe()
Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
trainData['Age'].mode(1)
0    24.0
dtype: float64

pandas中mode可以用来求众数,试一试.
不给具体列的时候,每一列都是众数。
axis = 1是统计每一行的众数
axis = 0是我对于一行,抽取每一列的众数

# print(trainData['Name'])
trainData['Name'] = trainData['Name'].str.extract(r", (.*?)\.", expand=False)
print(trainData["Name"])
trainData.rename(columns={
    
    'Name':'Title'}, inplace=True)
0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: Name, Length: 891, dtype: object
#将title合并为几个组
trainData["Title"]=trainData["Title"].replace(['Mr','Don'],'Mr')
trainData["Title"]=trainData["Title"].replace(['Mrs','Miss','Mme','Ms','Lady','Dona','Mlle'],'Ms')
trainData["Title"]=trainData["Title"].replace(['Sir','Major','Col','Capt'],'Major')
trainData["Title"]=trainData["Title"].replace(['Master','Jonkheer','the Countess'],'Jonkheer')
trainData["Title"]=trainData["Title"].replace(['Rev','Dr'],'Rev')

把title分类

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

a = LabelEncoder().fit_transform(trainData['Sex'])
OneHotEncoder( sparse=False ).fit_transform(a.reshape(-1,1)) # 注意: 这里把 a 用 reshape 转换成 2-D array

listUniq = trainData['Title'].unique().reshape(-1,1)
print(listUniq)
enc = OneHotEncoder()
enc.fit(listUniq)
enc.transform(trainData[['Title']]).toarray()
# b = LabelEncoder().fit_transform(trainData['Title'])
# trainData['Title'] = OneHotEncoder( sparse=False ).fit_transform(b.reshape(-1,1))

[['Mr']
 ['Ms']
 ['Jonkheer']
 ['Rev']
 ['Major']]





array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]])

对性别和类别进行onehot编码。成功是成功了,发现不会用……无语子,不知道怎么放进模型)

trainData[trainData['Title'] == 'Mr']['Age'].mode(1)
0    19.0
1    25.0
dtype: float64
trainData[trainData['Title'] == 'Ms']['Age'].mode(1)
0    24.0
dtype: float64
trainData[trainData['Title'] == 'Major']['Age'].mode(1)
0    45.0
1    49.0
2    52.0
3    56.0
4    60.0
5    70.0
dtype: float64
trainData[trainData['Title'] == 'Jonkheer']['Age'].mode(1)
0    1.0
1    4.0
dtype: float64
trainData[trainData['Title'] == 'Rev']['Age'].mode(1)
0    54.0
dtype: float64
trainData = pd.get_dummies(trainData, columns=['Sex'])
trainData = pd.get_dummies(trainData, columns=['Title'])

trainData
Survived Pclass Age SibSp Parch Fare Embarked Sex_female Sex_male Title_Jonkheer Title_Major Title_Mr Title_Ms Title_Rev
0 0 3 22.0 1 0 7.2500 S 0 1 0 0 1 0 0
1 1 1 38.0 1 0 71.2833 C 1 0 0 0 0 1 0
2 1 3 26.0 0 0 7.9250 S 1 0 0 0 0 1 0
3 1 1 35.0 1 0 53.1000 S 1 0 0 0 0 1 0
4 0 3 35.0 0 0 8.0500 S 0 1 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 0 2 27.0 0 0 13.0000 S 0 1 0 0 0 0 1
887 1 1 19.0 0 0 30.0000 S 1 0 0 0 0 1 0
888 0 3 NaN 1 2 23.4500 S 1 0 0 0 0 1 0
889 1 1 26.0 0 0 30.0000 C 0 1 0 0 1 0 0
890 0 3 32.0 0 0 7.7500 Q 0 1 0 0 1 0 0

891 rows × 14 columns

目前只会用get_dummies进行独热编码(编码前就应该处理好每个类的年龄的众数,用于填充age)

trainData.loc[trainData.Title_Jonkheer == 1, 'Age'] = trainData.loc[trainData.Title_Jonkheer == 1, 'Age'].fillna(2.0)
print(trainData[trainData['Title_Jonkheer']==1])
# print(trainData.loc[trainData['Title_Jonkheer']==1])
# trainData[trainData['Title_Jonkheer']==1]['Age'] = trainData[trainData['Title_Jonkheer']==1]['Age'].fillna(2.0)

     Survived  Pclass    Age  SibSp  Parch      Fare Embarked  Sex_female  \
7           0       3   2.00      3      1   21.0750        S           0   
16          0       3   2.00      4      1   29.1250        Q           0   
50          0       3   7.00      4      1   39.6875        S           0   
59          0       3  11.00      5      2   46.9000        S           0   
63          0       3   4.00      3      2   27.9000        S           0   
65          1       3   2.00      1      1   15.2458        C           0   
78          1       2   0.83      0      2   29.0000        S           0   
125         1       3  12.00      1      0   11.2417        C           0   
159         0       3   2.00      8      2   69.5500        S           0   
164         0       3   1.00      4      1   39.6875        S           0   
165         1       3   9.00      0      2   20.5250        S           0   
171         0       3   4.00      4      1   29.1250        Q           0   
176         0       3   2.00      3      1   25.4667        S           0   
182         0       3   9.00      4      2   31.3875        S           0   
183         1       2   1.00      2      1   39.0000        S           0   
193         1       2   3.00      1      1   26.0000        S           0   
261         1       3   3.00      4      2   31.3875        S           0   
278         0       3   7.00      4      1   29.1250        Q           0   
305         1       1   0.92      1      2  151.5500        S           0   
340         1       2   2.00      1      1   26.0000        S           0   
348         1       3   3.00      1      1   15.9000        S           0   
386         0       3   1.00      5      2   46.9000        S           0   
407         1       2   3.00      1      1   18.7500        S           0   
445         1       1   4.00      0      2   81.8583        S           0   
480         0       3   9.00      5      2   46.9000        S           0   
489         1       3   9.00      1      1   15.9000        S           0   
549         1       2   8.00      1      1   36.7500        S           0   
709         1       3   2.00      1      1   15.2458        C           0   
751         1       3   6.00      0      1   12.4750        S           0   
755         1       2   0.67      1      1   14.5000        S           0   
759         1       1  33.00      0      0   86.5000        S           1   
787         0       3   8.00      4      1   29.1250        Q           0   
788         1       3   1.00      1      2   20.5750        S           0   
802         1       1  11.00      1      2  120.0000        S           0   
803         1       3   0.42      0      1    8.5167        C           0   
819         0       3  10.00      3      2   27.9000        S           0   
822         0       1  38.00      0      0    0.0000        S           0   
824         0       3   2.00      4      1   39.6875        S           0   
827         1       2   1.00      0      2   37.0042        C           0   
831         1       2   0.83      1      1   18.7500        S           0   
850         0       3   4.00      4      2   31.2750        S           0   
869         1       3   4.00      1      1   11.1333        S           0   

     Sex_male  Title_Jonkheer  Title_Major  Title_Mr  Title_Ms  Title_Rev  
7           1               1            0         0         0          0  
16          1               1            0         0         0          0  
50          1               1            0         0         0          0  
59          1               1            0         0         0          0  
63          1               1            0         0         0          0  
65          1               1            0         0         0          0  
78          1               1            0         0         0          0  
125         1               1            0         0         0          0  
159         1               1            0         0         0          0  
164         1               1            0         0         0          0  
165         1               1            0         0         0          0  
171         1               1            0         0         0          0  
176         1               1            0         0         0          0  
182         1               1            0         0         0          0  
183         1               1            0         0         0          0  
193         1               1            0         0         0          0  
261         1               1            0         0         0          0  
278         1               1            0         0         0          0  
305         1               1            0         0         0          0  
340         1               1            0         0         0          0  
348         1               1            0         0         0          0  
386         1               1            0         0         0          0  
407         1               1            0         0         0          0  
445         1               1            0         0         0          0  
480         1               1            0         0         0          0  
489         1               1            0         0         0          0  
549         1               1            0         0         0          0  
709         1               1            0         0         0          0  
751         1               1            0         0         0          0  
755         1               1            0         0         0          0  
759         0               1            0         0         0          0  
787         1               1            0         0         0          0  
788         1               1            0         0         0          0  
802         1               1            0         0         0          0  
803         1               1            0         0         0          0  
819         1               1            0         0         0          0  
822         1               1            0         0         0          0  
824         1               1            0         0         0          0  
827         1               1            0         0         0          0  
831         1               1            0         0         0          0  
850         1               1            0         0         0          0  
869         1               1            0         0         0          0  

用2.0填充所有的第五类,以下也是同种操作

trainData.loc[trainData.Title_Rev == 1, 'Age'] = trainData.loc[trainData.Title_Rev == 1, 'Age'].fillna(54.0)
trainData.loc[trainData.Title_Major == 1, 'Age'] = trainData.loc[trainData.Title_Major == 1, 'Age'].fillna(55.34)
trainData.loc[trainData.Title_Mr == 1, 'Age'] = trainData.loc[trainData.Title_Mr == 1, 'Age'].fillna(22.0)
trainData.loc[trainData.Title_Ms == 1, 'Age'] = trainData.loc[trainData.Title_Ms == 1, 'Age'].fillna(24.0)
import numpy as np
fig = plt.figure(figsize=(20,8),dpi=80) # figsize设置图片大小,dpi设置清晰度
# plt.xticks(trainData['Age'])
ax = fig.add_subplot(111) # 创建一个一行一列的图
ax.scatter(trainData['Age'], trainData['Survived']) 

<matplotlib.collections.PathCollection at 0x224610cae50>

在这里插入图片描述

#将Age合并为几个组
trainData["Age"]=trainData["Age"].replace(r'^[0-8](\.|$)',0,regex=True)
trainData["Age"]=trainData["Age"].replace(r'^[9-16](\.|$)',1,regex=True)
trainData["Age"]=trainData["Age"].replace(r'^[17-24](\.|$)',2,regex=True)
trainData["Age"]=trainData["Age"].replace(r'^[25-32](\.|$)',3,regex=True)
trainData["Age"]=trainData["Age"].replace(r'^[33-40](\.|$)',4,regex=True)
trainData["Age"]=trainData["Age"].replace(r'^[41-48](\.|$)',5,regex=True)
trainData["Age"]=trainData["Age"].replace(r'^[49-56](\.|$)',6,regex=True)
trainData["Age"]=trainData["Age"].replace(r'^[57-64](\.|$)',7,regex=True)
trainData["Age"]=trainData["Age"].replace(r'^[65-100](\.|$)',8,regex=True)
print(trainData['Age'])

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    24.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64
trainData['Embarked'] = trainData['Embarked'].fillna(trainData['Embarked'].mode(1))
print(trainData['Embarked'])
0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object
trainData = pd.get_dummies(trainData, columns=['Embarked'])

对年龄分段,对embarked独热编码

from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
trainData_minmax = min_max_scaler.fit_transform(trainData)
print(trainData_minmax)
[[0.         1.         0.27117366 ... 0.         0.         1.        ]
 [1.         0.         0.4722292  ... 1.         0.         0.        ]
 [1.         1.         0.32143755 ... 0.         0.         1.        ]
 ...
 [0.         1.         0.2963056  ... 0.         0.         1.        ]
 [1.         0.         0.32143755 ... 1.         0.         0.        ]
 [0.         1.         0.39683338 ... 0.         1.         0.        ]]

数据进行归一化,避免某一特征对其有过大影响

from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection as cv

split_train, split_cv = cv.train_test_split(trainData_minmax, test_size=0.25, random_state=0)

# # 训练集
# train_df = split_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Sex_.*|Title_.*|Embarked_.*') #过滤规则
# print (train_df.shape) # 大小
# print (train_df.columns.tolist()) # 列名
# train_df.head() #PassengerId age  fare  embarked  都没有了  说明上述过滤规则没问题,并且第一列是标签列

# 生成模型
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(split_train[:,1:], split_train[:,0])

#用生成的模型 对 验证集 数据进行预测
# cv_df =  split_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Sex_.*|Title_.*|Embarked_.*')
predictions = knn.predict(split_cv[:,1:]) # 返回的是验证集的预测标签


# 计算正确率
from sklearn.metrics import accuracy_score
accuracy_score(split_cv[:,0],predictions.astype(np.int32))
0.8071748878923767

验证正确率为0.8,下面我们导入测试数据。

testData = pd.read_csv('./test.csv')
del testData['Cabin']
del testData['Ticket']

testData.describe()
PassengerId Pclass Age SibSp Parch Fare
count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000
mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188
std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576
min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000
25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800
50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200
75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000
max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200
testData['Fare'].mode(1)
0    7.75
dtype: float64
# print(trainData['Name'])
testData['Name'] = testData['Name'].str.extract(r", (.*?)\.", expand=False)
print(testData["Name"])
testData.rename(columns={
    
    'Name':'Title'}, inplace=True)
0          Mr
1         Mrs
2          Mr
3          Mr
4         Mrs
        ...  
413        Mr
414      Dona
415        Mr
416        Mr
417    Master
Name: Name, Length: 418, dtype: object
testData["Title"]=testData["Title"].replace(['Mr','Don'],'Mr')
testData["Title"]=testData["Title"].replace(['Mrs','Miss','Mme','Ms','Lady','Dona','Mlle'],'Ms')
testData["Title"]=testData["Title"].replace(['Sir','Major','Col','Capt'],'Major')
testData["Title"]=testData["Title"].replace(['Master','Jonkheer','the Countess'],'Jonkheer')
testData["Title"]=testData["Title"].replace(['Rev','Dr'],'Rev')

print(testData[testData['Title'] == 'Rev']['Age'].mode(1))
print(testData[testData['Title'] == 'Mr']['Age'].mode(1))
print(testData[testData['Title'] == 'Ms']['Age'].mode(1))
print(testData[testData['Title'] == 'Major']['Age'].mode(1))
print(testData[testData['Title'] == 'Jonkheer']['Age'].mode(1))
0    30.0
1    41.0
2    53.0
dtype: float64
0    21.0
dtype: float64
0    22.0
dtype: float64
0    47.0
1    53.0
dtype: float64
0     6.0
1    13.0
dtype: float64
testData = pd.get_dummies(testData, columns=['Sex'])
testData = pd.get_dummies(testData, columns=['Title'])
testData.loc[testData.Title_Jonkheer == 1, 'Age'] = testData.loc[testData.Title_Jonkheer == 1, 'Age'].fillna(9.5)
testData.loc[testData.Title_Rev == 1, 'Age'] = testData.loc[testData.Title_Rev == 1, 'Age'].fillna(41.34)
testData.loc[testData.Title_Major == 1, 'Age'] = testData.loc[testData.Title_Major == 1, 'Age'].fillna(50.0)
testData.loc[testData.Title_Mr == 1, 'Age'] = testData.loc[testData.Title_Mr == 1, 'Age'].fillna(21.0)
testData.loc[testData.Title_Ms == 1, 'Age'] = testData.loc[testData.Title_Ms == 1, 'Age'].fillna(22.0)
testData['Embarked'] = testData['Embarked'].fillna(testData['Embarked'].mode(1))
testData['Fare'] = testData['Fare'].fillna(7.75)
testData = pd.get_dummies(testData, columns=['Embarked'])
testData_minmax = min_max_scaler.fit_transform(testData) # 这里pid也归一化了
output = knn.predict(testData_minmax[:,1:])
print(testData_minmax[:,1:])
[[1.         0.4527232  0.         ... 0.         1.         0.        ]
 [1.         0.61756561 0.125      ... 0.         0.         1.        ]
 [0.5        0.8153765  0.         ... 0.         1.         0.        ]
 ...
 [1.         0.50547277 0.         ... 0.         0.         1.        ]
 [1.         0.27469339 0.         ... 0.         0.         1.        ]
 [1.         0.12303838 0.125      ... 1.         0.         0.        ]]
testData.isnull()
PassengerId Pclass Age SibSp Parch Fare Sex_female Sex_male Title_Jonkheer Title_Major Title_Mr Title_Ms Title_Rev Embarked_C Embarked_Q Embarked_S
0 False False False False False False False False False False False False False False False False
1 False False False False False False False False False False False False False False False False
2 False False False False False False False False False False False False False False False False
3 False False False False False False False False False False False False False False False False
4 False False False False False False False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
413 False False False False False False False False False False False False False False False False
414 False False False False False False False False False False False False False False False False
415 False False False False False False False False False False False False False False False False
416 False False False False False False False False False False False False False False False False
417 False False False False False False False False False False False False False False False False

418 rows × 16 columns

np.savetxt("tmp.csv", testData, delimiter=',')
outSet = pd.DataFrame(output, index=testData['PassengerId'], dtype=int)
print(outSet)
outSet.columns=['Survived']
             0
PassengerId   
892          0
893          0
894          0
895          0
896          0
...         ..
1305         0
1306         1
1307         0
1308         0
1309         0

[418 rows x 1 columns]
print(outSet)
outSet.to_csv('./gender_submission.csv', sep=',', header=True, index=True, float_format='%d')
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 0
...               ...
1305                0
1306                1
1307                0
1308                0
1309                0

[418 rows x 1 columns]

猜你喜欢

转载自blog.csdn.net/m0_50470999/article/details/108553981