# Logistic regression (LR) model for the Kaggle introductory project "Titanic"

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
# Load the Titanic training and test sets.
train = pd.read_csv('../1/train.csv')
test = pd.read_csv('../1/test.csv')

# Show each frame's columns, dtypes and non-null counts so that columns with
# missing values can be spotted. DataFrame.info() prints the report itself and
# returns None, so it is called directly instead of being wrapped in print()
# (which would emit an extra "None" line after each report).
train.info()
test.info()

# Columns used as model features.
FEATURES = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']

# Encode the two categorical columns as integers, in place on both frames.
# Series.map yields NaN wherever the value is missing or is not a key of the
# mapping dict.
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map({'female': 0, 'male': 1})
    frame['Embarked'] = frame['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# The earlier overview of the data showed that Embarked has missing values in
# the training set; they are few relative to the sample size, so those rows
# are dropped. A single mask selects both the features and the labels so the
# two always stay aligned. The .copy() makes x_train an independent frame
# that can be modified safely below.
has_port = train['Embarked'].notna()
x_train = train.loc[has_port, FEATURES].copy()
y_train = train.loc[has_port, 'Survived']

# The competition requires a prediction for every test row, so test rows are
# never dropped: a missing port is filled with the most common training port.
x_test = test[FEATURES].copy()
x_test['Embarked'] = x_test['Embarked'].fillna(x_train['Embarked'].mode().iloc[0])

# Age has too many missing values to drop, so it is filled with the column
# mean. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# pandas warns about and which does not modify the frame under Copy-on-Write.
x_train['Age'] = x_train['Age'].fillna(x_train['Age'].mean())
x_test['Age'] = x_test['Age'].fillna(x_test['Age'].mean())

# Fare can be missing in the test set; fill it with the mean as well.
x_test['Fare'] = x_test['Fare'].fillna(x_test['Fare'].mean())

# Inspect the processed frames again (info() prints its own report).
x_train.info()
x_test.info()

# Logistic regression; C is the inverse regularisation strength, so C=10000
# means the model is only very weakly regularised.
lr = LogisticRegression(C=10000)

# 5-fold cross-validation on the training data; report the mean fold score.
score = model_selection.cross_val_score(lr, x_train, y_train, cv=5)

print("分类器交叉验证结果:")
print(score.mean())

# Train the classifier on the full training set.
lr.fit(x_train, y_train)

y_predict = lr.predict(x_test)

# Take the passenger ids through x_test's index so that each id is paired
# with the prediction made for that same row, even if x_test holds fewer rows
# than test.
ruselt = pd.DataFrame({
    'PassengerId': test.loc[x_test.index, 'PassengerId'],
    'Survived': y_predict,
})
ruselt.to_csv(r'../1/ruselt2.csv', index=False)  # save the submission file

# (Blog page footer) You may also like
#
# Reposted from blog.csdn.net/lonely2018/article/details/80285221