import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the raw training set, then work on an independent (deep) copy so that
# inputData keeps the untouched original values.
inputData = pd.read_csv('./train.csv')
trainData = inputData.copy()
拷贝一份trainData用于数据处理,不动原数据。(其实好像没啥用)
# Remove the identifier-like columns that are not used as features.
trainData.drop(columns=['PassengerId', 'Cabin', 'Ticket'], inplace=True)
删除乘客 id(PassengerId)、机舱号(Cabin)和船票号(Ticket)三列。
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
trainData.describe()
Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Most frequent age value(s); missing values are ignored (default dropna=True).
trainData['Age'].mode()
0 24.0
dtype: float64
pandas 的 mode 可以用来求众数,试一试。
对 DataFrame 调用且不指定列时,默认(axis=0)返回每一列的众数。
axis=1 则是统计每一行的众数。
注意:Series.mode 没有 axis 参数,对 Series 写 mode(1) 时,1 会被当作 dropna=True,效果与 mode() 相同。
# Keep only the honorific of each passenger: the text between ", " and the
# first "." of the full name (e.g. "Mr", "Mrs", "Miss").
honorifics = trainData['Name'].str.extract(r", (.*?)\.", expand=False)
trainData['Name'] = honorifics
print(trainData["Name"])
# The column now holds titles rather than names, so rename it accordingly.
trainData.rename(columns={'Name': 'Title'}, inplace=True)
0 Mr
1 Mrs
2 Miss
3 Mrs
4 Mr
...
886 Rev
887 Miss
888 Miss
889 Mr
890 Mr
Name: Name, Length: 891, dtype: object
# Collapse the raw titles into five groups.  Each key is the group label and
# each value lists the raw titles folded into it; groups are applied in order.
title_groups = {
    'Mr': ['Mr', 'Don'],
    'Ms': ['Mrs', 'Miss', 'Mme', 'Ms', 'Lady', 'Dona', 'Mlle'],
    'Major': ['Sir', 'Major', 'Col', 'Capt'],
    'Jonkheer': ['Master', 'Jonkheer', 'the Countess'],
    'Rev': ['Rev', 'Dr'],
}
for group_label, raw_titles in title_groups.items():
    trainData["Title"] = trainData["Title"].replace(raw_titles, group_label)
把title分类
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Exploratory one-hot encoding of Sex: label-encode to integers, then expand
# into indicator columns.  The encoder's default sparse result is densified
# with .toarray(); this avoids the `sparse=` keyword, which newer scikit-learn
# releases renamed to `sparse_output=`.
a = LabelEncoder().fit_transform(trainData['Sex'])
OneHotEncoder().fit_transform(a.reshape(-1, 1)).toarray()  # reshape: the encoder expects a 2-D array

# One-hot encoding of Title.  The encoder is fitted on the same single-column
# frame it later transforms, so fit and transform see matching input.
listUniq = trainData['Title'].unique().reshape(-1,1)
print(listUniq)
enc = OneHotEncoder()
enc.fit(trainData[['Title']])
enc.transform(trainData[['Title']]).toarray()
[['Mr']
['Ms']
['Jonkheer']
['Rev']
['Major']]
array([[0., 0., 1., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 0., 0., 1., 0.],
...,
[0., 0., 0., 1., 0.],
[0., 0., 1., 0., 0.],
[0., 0., 1., 0., 0.]])
对性别(Sex)和称谓(Title)进行 one-hot 编码。编码本身成功了,但还不知道怎么把编码结果放进模型。
# Most frequent age(s) among passengers in the 'Mr' title group.
trainData.loc[trainData['Title'] == 'Mr', 'Age'].mode()
0 19.0
1 25.0
dtype: float64
# Most frequent age(s) among passengers in the 'Ms' title group.
trainData.loc[trainData['Title'] == 'Ms', 'Age'].mode()
0 24.0
dtype: float64
# Most frequent age(s) among passengers in the 'Major' title group.
trainData.loc[trainData['Title'] == 'Major', 'Age'].mode()
0 45.0
1 49.0
2 52.0
3 56.0
4 60.0
5 70.0
dtype: float64
# Most frequent age(s) among passengers in the 'Jonkheer' title group.
trainData.loc[trainData['Title'] == 'Jonkheer', 'Age'].mode()
0 1.0
1 4.0
dtype: float64
# Most frequent age(s) among passengers in the 'Rev' title group.
trainData.loc[trainData['Title'] == 'Rev', 'Age'].mode()
0 54.0
dtype: float64
# One-hot encode Sex and Title in one pass; the indicator columns are appended
# after the untouched columns, Sex_* first and Title_* second.
trainData = pd.get_dummies(trainData, columns=['Sex', 'Title'])
trainData
Survived | Pclass | Age | SibSp | Parch | Fare | Embarked | Sex_female | Sex_male | Title_Jonkheer | Title_Major | Title_Mr | Title_Ms | Title_Rev | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | S | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | C | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | S | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | S | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | S | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 0 | 2 | 27.0 | 0 | 0 | 13.0000 | S | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
887 | 1 | 1 | 19.0 | 0 | 0 | 30.0000 | S | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
888 | 0 | 3 | NaN | 1 | 2 | 23.4500 | S | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
889 | 1 | 1 | 26.0 | 0 | 0 | 30.0000 | C | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
890 | 0 | 3 | 32.0 | 0 | 0 | 7.7500 | Q | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
891 rows × 14 columns
目前只会用 get_dummies 进行独热编码。(其实应该在编码之前就求好每个称谓类别的年龄众数,用来填充缺失的 Age。)
# Fill missing ages of the 'Jonkheer' title group with 2.0, then show the group.
# Assigning through .loc writes into trainData itself rather than into a copy.
is_jonkheer = trainData['Title_Jonkheer'] == 1
trainData.loc[is_jonkheer, 'Age'] = trainData.loc[is_jonkheer, 'Age'].fillna(2.0)
print(trainData[is_jonkheer])
Survived Pclass Age SibSp Parch Fare Embarked Sex_female \
7 0 3 2.00 3 1 21.0750 S 0
16 0 3 2.00 4 1 29.1250 Q 0
50 0 3 7.00 4 1 39.6875 S 0
59 0 3 11.00 5 2 46.9000 S 0
63 0 3 4.00 3 2 27.9000 S 0
65 1 3 2.00 1 1 15.2458 C 0
78 1 2 0.83 0 2 29.0000 S 0
125 1 3 12.00 1 0 11.2417 C 0
159 0 3 2.00 8 2 69.5500 S 0
164 0 3 1.00 4 1 39.6875 S 0
165 1 3 9.00 0 2 20.5250 S 0
171 0 3 4.00 4 1 29.1250 Q 0
176 0 3 2.00 3 1 25.4667 S 0
182 0 3 9.00 4 2 31.3875 S 0
183 1 2 1.00 2 1 39.0000 S 0
193 1 2 3.00 1 1 26.0000 S 0
261 1 3 3.00 4 2 31.3875 S 0
278 0 3 7.00 4 1 29.1250 Q 0
305 1 1 0.92 1 2 151.5500 S 0
340 1 2 2.00 1 1 26.0000 S 0
348 1 3 3.00 1 1 15.9000 S 0
386 0 3 1.00 5 2 46.9000 S 0
407 1 2 3.00 1 1 18.7500 S 0
445 1 1 4.00 0 2 81.8583 S 0
480 0 3 9.00 5 2 46.9000 S 0
489 1 3 9.00 1 1 15.9000 S 0
549 1 2 8.00 1 1 36.7500 S 0
709 1 3 2.00 1 1 15.2458 C 0
751 1 3 6.00 0 1 12.4750 S 0
755 1 2 0.67 1 1 14.5000 S 0
759 1 1 33.00 0 0 86.5000 S 1
787 0 3 8.00 4 1 29.1250 Q 0
788 1 3 1.00 1 2 20.5750 S 0
802 1 1 11.00 1 2 120.0000 S 0
803 1 3 0.42 0 1 8.5167 C 0
819 0 3 10.00 3 2 27.9000 S 0
822 0 1 38.00 0 0 0.0000 S 0
824 0 3 2.00 4 1 39.6875 S 0
827 1 2 1.00 0 2 37.0042 C 0
831 1 2 0.83 1 1 18.7500 S 0
850 0 3 4.00 4 2 31.2750 S 0
869 1 3 4.00 1 1 11.1333 S 0
Sex_male Title_Jonkheer Title_Major Title_Mr Title_Ms Title_Rev
7 1 1 0 0 0 0
16 1 1 0 0 0 0
50 1 1 0 0 0 0
59 1 1 0 0 0 0
63 1 1 0 0 0 0
65 1 1 0 0 0 0
78 1 1 0 0 0 0
125 1 1 0 0 0 0
159 1 1 0 0 0 0
164 1 1 0 0 0 0
165 1 1 0 0 0 0
171 1 1 0 0 0 0
176 1 1 0 0 0 0
182 1 1 0 0 0 0
183 1 1 0 0 0 0
193 1 1 0 0 0 0
261 1 1 0 0 0 0
278 1 1 0 0 0 0
305 1 1 0 0 0 0
340 1 1 0 0 0 0
348 1 1 0 0 0 0
386 1 1 0 0 0 0
407 1 1 0 0 0 0
445 1 1 0 0 0 0
480 1 1 0 0 0 0
489 1 1 0 0 0 0
549 1 1 0 0 0 0
709 1 1 0 0 0 0
751 1 1 0 0 0 0
755 1 1 0 0 0 0
759 0 1 0 0 0 0
787 1 1 0 0 0 0
788 1 1 0 0 0 0
802 1 1 0 0 0 0
803 1 1 0 0 0 0
819 1 1 0 0 0 0
822 1 1 0 0 0 0
824 1 1 0 0 0 0
827 1 1 0 0 0 0
831 1 1 0 0 0 0
850 1 1 0 0 0 0
869 1 1 0 0 0 0
用 2.0 填充 Title_Jonkheer 这一类中缺失的年龄,其余各类也按同样的方式填充。
# Fill the missing ages of the remaining title groups, each with its own
# hand-picked fallback value.
fallback_ages = {
    'Title_Rev': 54.0,
    'Title_Major': 55.34,
    'Title_Mr': 22.0,
    'Title_Ms': 24.0,
}
for flag_column, fallback_age in fallback_ages.items():
    in_group = trainData[flag_column] == 1
    trainData.loc[in_group, 'Age'] = trainData.loc[in_group, 'Age'].fillna(fallback_age)
import numpy as np
# Scatter plot of Age against the Survived label on a single wide axes.
fig = plt.figure(figsize=(20, 8), dpi=80)  # figsize: canvas size in inches, dpi: resolution
ax = fig.add_subplot(1, 1, 1)  # one row, one column, first (only) axes
ax.scatter(x=trainData['Age'], y=trainData['Survived'])
<matplotlib.collections.PathCollection at 0x224610cae50>
# Age stays a continuous feature.
# The previous attempt to bucket it with Series.replace(..., regex=True) never
# changed a value: regex replacement only acts on strings while Age is float64,
# and patterns such as r'^[9-16]' are character classes, not numeric ranges
# (several are invalid ranges that can raise re.error on pandas versions that
# compile the pattern before checking the column dtype).
# NOTE(review): if age buckets are wanted, build them with pd.cut and explicit
# bin edges, and apply the identical binning to the test set as well.
print(trainData['Age'])
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
...
886 27.0
887 19.0
888 24.0
889 26.0
890 32.0
Name: Age, Length: 891, dtype: float64
# Fill missing ports with the most frequent one.  mode() returns a Series, and
# fillna(Series) aligns on the index (so only the row labelled 0 could ever be
# filled); taking the first element passes a scalar that fills every gap.
most_common_port = trainData['Embarked'].mode()[0]
trainData['Embarked'] = trainData['Embarked'].fillna(most_common_port)
print(trainData['Embarked'])
0 S
1 C
2 S
3 S
4 S
..
886 S
887 S
888 S
889 C
890 Q
Name: Embarked, Length: 891, dtype: object
# Replace Embarked with one indicator column per port.
trainData = pd.get_dummies(data=trainData, columns=["Embarked"])
年龄仍保持原始连续值(上面的输出中 Age 并没有被分段);对 Embarked 做独热编码。
from sklearn import preprocessing

# Rescale every column of the training frame (label column included) to [0, 1].
# The fitted scaler is kept in min_max_scaler; the result is a NumPy array.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(trainData)
trainData_minmax = min_max_scaler.transform(trainData)
print(trainData_minmax)
[[0. 1. 0.27117366 ... 0. 0. 1. ]
[1. 0. 0.4722292 ... 1. 0. 0. ]
[1. 1. 0.32143755 ... 0. 0. 1. ]
...
[0. 1. 0.2963056 ... 0. 0. 1. ]
[1. 0. 0.32143755 ... 1. 0. 0. ]
[0. 1. 0.39683338 ... 0. 1. 0. ]]
对数据进行归一化,避免数值范围较大的特征对结果产生过大的影响。
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection as cv
from sklearn.metrics import accuracy_score

# Hold out 25% of the scaled rows for validation (fixed seed for repeatability).
split_train, split_cv = cv.train_test_split(trainData_minmax, test_size=0.25, random_state=0)

# Column 0 is the Survived label; every other column is a feature.
fit_features, fit_labels = split_train[:, 1:], split_train[:, 0]
cv_features, cv_labels = split_cv[:, 1:], split_cv[:, 0]

# Fit a 20-nearest-neighbours classifier on the training part.
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(fit_features, fit_labels)

# Predict the held-out labels and report the fraction predicted correctly.
predictions = knn.predict(cv_features)
accuracy_score(cv_labels, predictions.astype(np.int32))
0.8071748878923767
验证集上的正确率约为 0.81,下面我们导入测试数据。
# Load the test set and drop the unused Cabin and Ticket columns.
# PassengerId is kept here; it later becomes the index of the submission.
testData = pd.read_csv('./test.csv')
testData.drop(columns=['Cabin', 'Ticket'], inplace=True)
testData.describe()
PassengerId | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
count | 418.000000 | 418.000000 | 332.000000 | 418.000000 | 418.000000 | 417.000000 |
mean | 1100.500000 | 2.265550 | 30.272590 | 0.447368 | 0.392344 | 35.627188 |
std | 120.810458 | 0.841838 | 14.181209 | 0.896760 | 0.981429 | 55.907576 |
min | 892.000000 | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000 |
25% | 996.250000 | 1.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 |
50% | 1100.500000 | 3.000000 | 27.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 1204.750000 | 3.000000 | 39.000000 | 1.000000 | 0.000000 | 31.500000 |
max | 1309.000000 | 3.000000 | 76.000000 | 8.000000 | 9.000000 | 512.329200 |
# Most frequent fare; used below to fill the missing Fare value.
testData['Fare'].mode()
0 7.75
dtype: float64
# Same title extraction as for the training set: keep the text between ", "
# and the first "." of each name, then rename the column to Title.
test_honorifics = testData['Name'].str.extract(r", (.*?)\.", expand=False)
testData['Name'] = test_honorifics
print(testData["Name"])
testData.rename(columns={'Name': 'Title'}, inplace=True)
0 Mr
1 Mrs
2 Mr
3 Mr
4 Mrs
...
413 Mr
414 Dona
415 Mr
416 Mr
417 Master
Name: Name, Length: 418, dtype: object
# Collapse the raw test-set titles into the same five groups as the training
# set; groups are applied in the listed order.
test_title_groups = {
    'Mr': ['Mr', 'Don'],
    'Ms': ['Mrs', 'Miss', 'Mme', 'Ms', 'Lady', 'Dona', 'Mlle'],
    'Major': ['Sir', 'Major', 'Col', 'Capt'],
    'Jonkheer': ['Master', 'Jonkheer', 'the Countess'],
    'Rev': ['Rev', 'Dr'],
}
for group_label, raw_titles in test_title_groups.items():
    testData["Title"] = testData["Title"].replace(raw_titles, group_label)

# Print the most frequent age(s) of every title group.
for group_label in ('Rev', 'Mr', 'Ms', 'Major', 'Jonkheer'):
    print(testData.loc[testData['Title'] == group_label, 'Age'].mode())
0 30.0
1 41.0
2 53.0
dtype: float64
0 21.0
dtype: float64
0 22.0
dtype: float64
0 47.0
1 53.0
dtype: float64
0 6.0
1 13.0
dtype: float64
# One-hot encode Sex and Title (indicator columns are appended Sex_* first,
# Title_* second, exactly as two separate calls would do).
testData = pd.get_dummies(testData, columns=['Sex', 'Title'])

# Fill missing ages per title group with a hand-picked fallback value.
test_fallback_ages = {
    'Title_Jonkheer': 9.5,
    'Title_Rev': 41.34,
    'Title_Major': 50.0,
    'Title_Mr': 21.0,
    'Title_Ms': 22.0,
}
for flag_column, fallback_age in test_fallback_ages.items():
    in_group = testData[flag_column] == 1
    testData.loc[in_group, 'Age'] = testData.loc[in_group, 'Age'].fillna(fallback_age)

# mode() returns a Series and fillna(Series) aligns on the index, so pass the
# first mode as a scalar to make sure every missing port is filled.
testData['Embarked'] = testData['Embarked'].fillna(testData['Embarked'].mode()[0])
testData['Fare'] = testData['Fare'].fillna(7.75)
testData = pd.get_dummies(testData, columns=['Embarked'])

# Scale with the scaler already fitted on the training frame instead of
# re-fitting it on the test data, so both sets share one feature scale.
# The test frame is aligned to the training columns first: PassengerId is
# dropped, and the label column (absent here) becomes a zero placeholder in
# position 0, which the [:, 1:] slices below discard again.
testFeatures = testData.drop(columns=['PassengerId']).reindex(columns=trainData.columns, fill_value=0)
testData_minmax = min_max_scaler.transform(testFeatures)
output = knn.predict(testData_minmax[:,1:])
print(testData_minmax[:,1:])
[[1. 0.4527232 0. ... 0. 1. 0. ]
[1. 0.61756561 0.125 ... 0. 0. 1. ]
[0.5 0.8153765 0. ... 0. 1. 0. ]
...
[1. 0.50547277 0. ... 0. 0. 1. ]
[1. 0.27469339 0. ... 0. 0. 1. ]
[1. 0.12303838 0.125 ... 1. 0. 0. ]]
# Boolean mask of missing values in the processed test frame.
testData.isna()
PassengerId | Pclass | Age | SibSp | Parch | Fare | Sex_female | Sex_male | Title_Jonkheer | Title_Major | Title_Mr | Title_Ms | Title_Rev | Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
1 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
2 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
3 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
4 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
413 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
414 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
415 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
416 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
417 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
418 rows × 16 columns
# Dump the processed test frame to a scratch CSV file.
np.savetxt("tmp.csv", testData, delimiter=',')

# Wrap the predictions in an integer frame indexed by PassengerId, print it
# with its default column label, and only then name the column 'Survived'.
outSet = pd.DataFrame(data=output, index=testData['PassengerId'], dtype=int)
print(outSet)
outSet = outSet.rename(columns={0: 'Survived'})
0
PassengerId
892 0
893 0
894 0
895 0
896 0
... ..
1305 0
1306 1
1307 0
1308 0
1309 0
[418 rows x 1 columns]
# Show the final predictions and write them, with header and PassengerId index,
# as a comma-separated submission file.
print(outSet)
outSet.to_csv('./gender_submission.csv', float_format='%d')
Survived
PassengerId
892 0
893 0
894 0
895 0
896 0
... ...
1305 0
1306 1
1307 0
1308 0
1309 0
[418 rows x 1 columns]