kaggle入门学习 泰坦尼克号生存预测
参考博客:Kaggle|入门:泰坦尼克号生存预测(线性回归)
特征选择
准确率为: 0.7878787878787878
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 418 non-null int64
1 Pclass 418 non-null int64
2 Name 418 non-null object
3 Sex 418 non-null object
4 Age 332 non-null float64
5 SibSp 418 non-null int64
6 Parch 418 non-null int64
7 Ticket 418 non-null object
8 Fare 417 non-null float64
9 Cabin 91 non-null object
10 Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
Process finished with exit code 0
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import math
from sklearn.linear_model import LinearRegression #导入线性回归
#训练集进行交叉验证,得到均值
from sklearn.model_selection import KFold
# --- Load and clean the training data ---
data_train = pd.read_csv('train.csv')

# Age: replace missing values with the column median.
data_train['Age'] = data_train['Age'].fillna(data_train['Age'].median())

# Embarked: replace missing values with the most frequent port of embarkation.
most_common_port = data_train['Embarked'].value_counts().idxmax()
data_train['Embarked'] = data_train['Embarked'].fillna(most_common_port)

# Encode Sex and Embarked as small integers so they can feed a linear model.
data_train.loc[data_train["Sex"] == "male", "Sex"] = 0
data_train.loc[data_train["Sex"] == "female", "Sex"] = 1
data_train.loc[data_train['Embarked'] == 'C', 'Embarked'] = 0
data_train.loc[data_train['Embarked'] == 'Q', 'Embarked'] = 1
data_train.loc[data_train['Embarked'] == 'S', 'Embarked'] = 2
#计算相关系数
def cal_corr(a, b):
    """Return the Pearson correlation coefficient of two equal-length sequences.

    Computed directly from the definition: covariance of a and b divided by
    the product of their standard deviations (lengths assumed equal and > 0,
    and neither sequence constant).
    """
    mean_a = sum(a) / len(a)
    mean_b = sum(b) / len(b)
    dev_a = [x - mean_a for x in a]
    dev_b = [y - mean_b for y in b]
    covariance = sum(da * db for da, db in zip(dev_a, dev_b))
    denom = math.sqrt(sum(da * da for da in dev_a) * sum(db * db for db in dev_b))
    return covariance / denom
# Pairwise correlation matrix of the numeric features (PassengerId dropped:
# it is just a row identifier and carries no predictive signal).
train_corr = data_train.drop('PassengerId',axis=1).corr()
# Correlation heatmap for a visual feature-selection pass.
plt.subplots(figsize=(15,9)) # enlarge the canvas so the annotations fit
sns.heatmap(train_corr,vmin=-1,vmax=1,annot=True,square=True)
# plt.show()
# --- Multivariate linear regression with 3-fold cross-validation ---
# Simple numeric features used as predictors.
predictors = ["Pclass", "Age", "Parch", "Fare", "Sex", "Embarked", 'SibSp']
# Linear regression estimator; refit from scratch on every fold.
alg = LinearRegression()
# 3-fold, unshuffled split. `random_state` removed: scikit-learn (>= 0.24)
# raises ValueError when random_state is set while shuffle=False, and it had
# no effect on an unshuffled split anyway, so behavior is unchanged.
kf = KFold(n_splits=3, shuffle=False)
predictions = []
for train, test in kf.split(data_train):
    # Fit on the training fold, then predict the held-out fold.
    train_predictors = data_train[predictors].iloc[train, :]
    train_target = data_train["Survived"].iloc[train]
    alg.fit(train_predictors, train_target)
    fold_predictions = alg.predict(data_train[predictors].iloc[test, :])
    predictions.append(fold_predictions)
# Stitch the out-of-fold predictions back into one array and binarize at 0.5
# (strictly greater than 0.5 counts as survived).
predictions = np.concatenate(predictions, axis=0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# Out-of-fold accuracy against the true labels.
accuracy = sum(predictions == data_train["Survived"]) / len(predictions)
print ("准确率为: ", accuracy)
# --- Apply the same preprocessing to the test set ---
data_test = pd.read_csv('test.csv')

# Embarked: fill missing values with the most frequent port in the test set.
most_common_port = data_test['Embarked'].value_counts().idxmax()
data_test['Embarked'] = data_test['Embarked'].fillna(most_common_port)
data_test.info()

# Encode Sex and Embarked with the same integer codes used for training.
data_test.loc[data_test["Sex"] == "male", "Sex"] = 0
data_test.loc[data_test["Sex"] == "female", "Sex"] = 1
data_test.loc[data_test['Embarked'] == 'C', 'Embarked'] = 0
data_test.loc[data_test['Embarked'] == 'Q', 'Embarked'] = 1
data_test.loc[data_test['Embarked'] == 'S', 'Embarked'] = 2

# NOTE(review): Age and Fare are imputed with the *test-set* medians; using
# the training-set medians instead would keep train/test preprocessing truly
# identical — confirm which is intended before changing.
data_test['Age'] = data_test['Age'].fillna(data_test['Age'].median())
data_test['Fare'] = data_test['Fare'].fillna(value=data_test['Fare'].median())

# Predict and binarize with the SAME threshold rule used during training
# (strictly greater than 0.5 -> 1). The original used `>= 0.5` here, which
# classified a score of exactly 0.5 differently than the CV step did.
test_predictions = alg.predict(data_test[predictors])
test_predictions[test_predictions > 0.5] = 1
test_predictions[test_predictions <= 0.5] = 0

# Export the Kaggle submission file (PassengerId, Survived).
result = pd.DataFrame({
    'PassengerId': data_test['PassengerId'].values,
    'Survived': test_predictions.astype(np.int32),
})
result.to_csv(r"submission.csv", index=False)