kaggle入门篇二【Titanic】

作为新手入门，泰坦尼克是必过的坎。

我用了 Keras，但准确率不高，只有 0.78。

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout
from keras.optimizers import RMSprop
from keras.utils import np_utils

def nomalizerandStandard(data):
    """Standardize *data*, then min-max scale the standardized result.

    Both scalers are freshly created and fitted on *data* itself, so the
    statistics used always come from the array passed in.

    Returns the output of ``MinMaxScaler.fit_transform``.
    """
    standardized = StandardScaler().fit_transform(data)
    return MinMaxScaler().fit_transform(standardized)
       
def loadData(train_path=r"F:\kaggle_file\Titanic\train.csv",
             test_path=r"F:\kaggle_file\Titanic\test.csv"):
    """Read the training and test CSV files into DataFrames.

    Args:
        train_path: Location of the training CSV. Defaults to the path the
            script was originally written against.
        test_path: Location of the test CSV. Same default convention.

    Returns:
        A ``(train, test)`` tuple of DataFrames as parsed by ``pd.read_csv``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return train, test

def dividedata(data):
    """Split a frame into its feature columns and the 'Survived' column.

    Returns:
        ``(features, labels)`` where *features* is a copy of *data* without
        the 'Survived' column and *labels* is that column. *data* itself is
        not modified.
    """
    labels = data['Survived']
    features = data.drop('Survived', axis=1)
    return features, labels

def FamilyandName(data):
    """Add a 'family' column and replace 'Name' with a title code, in place.

    * ``family`` is ``SibSp + Parch``.
    * ``Name`` becomes 0 for every row whose 'Sex' is ``'male'``; otherwise
      1 when the name contains Dona/Miss/Ms/Mlle, otherwise 2 when it
      contains Countess/Mme/Mrs/Dr. Rows matching none of these keep their
      original name.

    Must run while 'Sex' still holds the strings 'male'/'female'.

    Args:
        data: DataFrame with 'SibSp', 'Parch', 'Sex' and 'Name' columns.
            It is modified in place; nothing is returned.
    """
    data['family'] = data['SibSp'] + data['Parch']

    names = data['Name']
    # Compute every mask from the untouched name strings of *this* frame
    # before writing any code, so later matches never see earlier writes.
    is_male = data['Sex'] == 'male'
    is_miss = names.str.contains('Dona|Miss|Ms|Mlle', regex=True, na=False)
    is_mrs = names.str.contains('Countess|Mme|Mrs|Dr', regex=True, na=False)

    # Object dtype lets integer codes sit next to any unmatched name strings.
    encoded = names.astype(object)
    # Assign from lowest to highest priority; later writes win.
    encoded[is_mrs] = 2
    encoded[is_miss] = 1
    encoded[is_male] = 0
    data['Name'] = encoded

    
def preAgeandSex(data):
    """Encode 'Sex' as 1/0 and fill missing 'Age' values, in place.

    * ``Sex`` becomes 1 where it is ``'male'`` (or already 1) and 0 for
      everything else, stored as an integer column.
    * Missing ``Age`` values are replaced by the median of the non-missing
      ages in *data*.

    Args:
        data: DataFrame with 'Sex' and 'Age' columns. It is modified in
            place; nothing is returned.
    """
    sex = data['Sex']
    # Accepting an existing 1 keeps a second call on the same frame a no-op.
    is_male = (sex == 'male') | (sex == 1)
    # Replace the whole column at once instead of writing integers into a
    # string column cell by cell, which leaves a mixed object column.
    data['Sex'] = is_male.astype('int64')
    data['Age'] = data['Age'].fillna(data['Age'].median())

def fillEmbarkedNan(data):
    """Fill missing 'Embarked' values with "S" and encode the ports, in place.

    Encoding: "S" -> 0, "C" -> 1, "Q" -> 2. Any other value is left as is.

    Args:
        data: DataFrame with an 'Embarked' column. It is modified in place;
            nothing is returned.
    """
    port_codes = {'S': 0, 'C': 1, 'Q': 2}
    ports = data['Embarked'].fillna("S")
    # One pass over the column; unknown values fall through unchanged.
    data['Embarked'] = ports.map(lambda port: port_codes.get(port, port))
    
def dropCabinAndOthers(data):
    """Return a copy of *data* without 'Cabin', 'Ticket' and 'PassengerId'.

    The input frame is left untouched; callers must use the return value.
    """
    unused_columns = ["Cabin", 'Ticket', 'PassengerId']
    return data.drop(unused_columns, axis=1)

def network():
    """Build and compile the feed-forward classifier.

    Architecture: 9 input features -> Dense(10) -> Dense(7) -> Dense(4),
    each ReLU-activated and followed by dropout (0.2, 0.3, 0.2), ending in
    a 2-unit sigmoid output layer. Compiled with RMSprop and
    binary cross-entropy, reporting accuracy.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(10, input_shape=(9,), activation='relu'),
        Dropout(0.2),
        Dense(7, activation='relu'),
        Dropout(0.3),
        Dense(4, activation='relu'),
        Dropout(0.2),
        Dense(2, activation='sigmoid'),
    ]
    classifier = Sequential()
    for layer in layers:
        classifier.add(layer)

    classifier.compile(
        optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier
    
    
train,test=loadData()

## Feature engineering for the training set
FamilyandName(train)
preAgeandSex(train)
fillEmbarkedNan(train)
train=dropCabinAndOthers(train)


## Same feature engineering for the test set
FamilyandName(test)
preAgeandSex(test)
fillEmbarkedNan(test)
# PassengerId is dropped on the next line, but the submission file needs it.
passenger_ids=test['PassengerId']
test=dropCabinAndOthers(test)
# Fill missing fares with the median fare (not the median age).
test['Fare']=test['Fare'].fillna(test['Fare'].median())


## Training inputs
traindata,traintarget=dividedata(train)
# Fit both scalers on the training features only and reuse them for the
# test set, so train and test go through the same transformation.
# nomalizerandStandard() refits on whatever it receives, so it cannot be
# used to scale the test set consistently.
standardizer=StandardScaler().fit(traindata)
minmax=MinMaxScaler().fit(standardizer.transform(traindata))
traindata=minmax.transform(standardizer.transform(traindata))
traintarget=np.array(traintarget)
traintarget = np_utils.to_categorical(traintarget, num_classes=2)


## Train and predict
test=minmax.transform(standardizer.transform(test))
model=network()
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
model.fit(traindata,traintarget,epochs=1000,batch_size=50)
predict=model.predict(test)
# The index of the larger of the two outputs is the predicted class.
predict=predict.argmax(axis=1)
# Write PassengerId and Survived columns, without the DataFrame index.
predict=pd.DataFrame({'PassengerId':passenger_ids.values,'Survived':predict},
                     columns=['PassengerId','Survived'])
predict.to_csv('F:\\kaggle_file\\Titanic\\result.csv',index=False)





猜你喜欢

转载自blog.csdn.net/qq_36440163/article/details/70991198