Kaggle Beginner Tutorial 1: Titanic Survival Prediction

Reference blog: Kaggle | Getting Started: Titanic Survival Prediction (Linear Regression)

Feature Selection

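Before picking features, it helps to look at which columns train.csv contains and how many values are missing in each. A minimal sketch, assuming the standard train.csv from the Kaggle Titanic competition:

import pandas as pd

# List the columns, their dtypes, and how many non-null values each one has
df = pd.read_csv('train.csv')
df.info()
print(df.head())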

Running the full script below prints the 3-fold cross-validation accuracy on the training set, followed by data_test.info() for the test set:

Accuracy:  0.7878787878787878
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB

Process finished with exit code 0

The complete script:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import math

from sklearn.linear_model import LinearRegression    # linear regression model

# KFold for cross-validation on the training set
from sklearn.model_selection import KFold

data_train = pd.read_csv('train.csv')
# Fill in missing values
# Fill Age with the median age
data_train['Age'] = data_train['Age'].fillna(data_train['Age'].median())
# Fill Embarked with its most frequent value
ans = data_train['Embarked'].value_counts()    # count how often each port appears
fillstr = ans.idxmax()    # index (port label) of the most frequent value

data_train['Embarked'] = data_train['Embarked'].fillna(fillstr)

# Encode Sex and Embarked as numbers
data_train.loc[data_train["Sex"] == "male","Sex"] = 0
data_train.loc[data_train["Sex"] == "female","Sex"] = 1

data_train.loc[data_train['Embarked']=='C','Embarked'] = 0
data_train.loc[data_train['Embarked']=='Q','Embarked'] = 1
data_train.loc[data_train['Embarked']=='S','Embarked'] = 2
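
# Note (not in the original post): the .loc assignments above can be written
# equivalently with Series.map, which also yields 0/1 and 0/1/2 columns, e.g.:
#   data_train['Sex'] = data_train['Sex'].map({'male': 0, 'female': 1})
#   data_train['Embarked'] = data_train['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
# Use one approach or the other, not both, on the same DataFrame.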


# Hand-rolled Pearson correlation coefficient between two sequences
def cal_corr(a,b):
    a_avg = sum(a) / len(a)
    b_avg = sum(b) / len(b)

    cov_ab = sum([(x - a_avg) * (y - b_avg) for x,y in zip(a,b)])
    sq = math.sqrt(sum([(x - a_avg) **2 for x in a]) * sum([(x - b_avg) **2 for x in b]))

    corr_factor = cov_ab /sq

    return corr_factor
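
# Quick check (not in the original post): cal_corr is defined but never called
# below; it should agree with pandas' built-in Series.corr on numeric columns.
print(cal_corr(data_train['Pclass'], data_train['Survived']))
print(data_train['Pclass'].corr(data_train['Survived']))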

# Correlation matrix of the numeric features (pandas built-in corr)
train_corr = data_train.drop('PassengerId',axis=1).corr()

# Correlation heatmap
plt.subplots(figsize=(15,9))  # set the figure size
sns.heatmap(train_corr,vmin=-1,vmax=1,annot=True,square=True)
# plt.show()

# Multiple linear regression

# Select simple numeric features as predictors
predictors = ["Pclass", "Age", "Parch", "Fare", "Sex", "Embarked", 'SibSp']

# Initialize the linear regression model
alg = LinearRegression()
# Split the samples into 3 equal folds for 3-fold cross-validation
# (no shuffling, so random_state is unnecessary; recent scikit-learn versions
# reject random_state when shuffle=False)
kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(data_train):
    # Fit on the training folds
    train_predictors = data_train[predictors].iloc[train, :]
    train_target = data_train["Survived"].iloc[train]
    alg.fit(train_predictors, train_target)
    # Predict on the held-out fold
    test_predictions = alg.predict(data_train[predictors].iloc[test, :])
    predictions.append(test_predictions)

# Accuracy on the out-of-fold predictions

predictions = np.concatenate(predictions, axis=0)

# Threshold the regression outputs at 0.5 to get 0/1 survival predictions
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0

accuracy = sum(predictions == data_train["Survived"]) / len(predictions)

print("Accuracy: ", accuracy)
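
# Alternative sketch (not in the original post): scikit-learn's cross_val_predict
# can replace the manual KFold loop; with cv=3 and a regressor it uses the same
# unshuffled 3-fold split, so the thresholded accuracy should come out the same.
from sklearn.model_selection import cross_val_predict

cv_pred = cross_val_predict(LinearRegression(), data_train[predictors],
                            data_train["Survived"], cv=3)
cv_acc = ((cv_pred > 0.5).astype(int) == data_train["Survived"]).mean()
print("Accuracy (cross_val_predict): ", cv_acc)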



# Apply the same preprocessing to the test set (data_test)

data_test = pd.read_csv('test.csv')

ans = data_test['Embarked'].value_counts()

# port label with the highest count
fillstr = ans.idxmax()

data_test['Embarked'] = data_test['Embarked'].fillna(fillstr)
data_test.info()    # prints the test-set summary shown above
data_test.loc[data_test["Sex"] == "male", "Sex"] = 0
data_test.loc[data_test["Sex"] == "female", "Sex"] = 1

data_test.loc[data_test['Embarked'] == 'C', 'Embarked'] = 0
data_test.loc[data_test['Embarked'] == 'Q', 'Embarked'] = 1
data_test.loc[data_test['Embarked'] == 'S', 'Embarked'] = 2

data_test['Age'] = data_test['Age'].fillna(data_test['Age'].median())

mid = data_test['Fare'].median()
data_test['Fare'] = data_test['Fare'].fillna(value=mid)

# Predict on the test set and threshold at 0.5
test_predictions = alg.predict(data_test[predictors])
test_predictions[test_predictions >= 0.5] = 1
test_predictions[test_predictions < 0.5] = 0

# Export the results in Kaggle's submission format
result = pd.DataFrame({
    'PassengerId': data_test['PassengerId'].values,
    'Survived': test_predictions.astype(np.int32)
})

result.to_csv("submission.csv", index=False)
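
# Quick sanity check (not in the original script): the exported file should have
# 418 rows and exactly the two columns Kaggle expects.
check = pd.read_csv('submission.csv')
print(check.shape)               # expected: (418, 2)
print(check.columns.tolist())    # expected: ['PassengerId', 'Survived']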

Reprinted from blog.csdn.net/weixin_41281151/article/details/108569521