机器学习实战之Titanic(Kaggle)

一、乘客数据分析


  1. PassengerId :每一个乘客的标识符
  2. Survived:Label值,代表是否获救(0 = 未获救,1 = 获救)
  3. Pclass:乘客的船舱等级(1、2、3 等舱)
  4. Name:姓名
  5. Sex:性别
  6. Age:年龄 
  7. SibSp:同船的兄弟姐妹及配偶人数
  8. Parch:同船的父母及子女人数
  9. Ticket:船票的编号
  10. Fare:船票价格
  11. Cabin:船舱位置,此列出现大量缺失,可以不要
  12. Embarked:上船地点

二、数据预处理

1.导入需要的包

import pandas as pa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold

2.观察数据的前几行

# Read the training CSV into a DataFrame and preview its first rows
# (head() shows the first 5 rows by default).
filename = "train.csv"
titanic = pa.read_csv(filename)
titanic.head()
结果:


3.观察数据的简单数据特征

# Summary statistics of the numeric columns; the call form of print works
# on both Python 2 and Python 3 for a single argument.
print(titanic.describe())

结果:

    PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000         NaN    0.000000   
50%     446.000000    0.000000    3.000000         NaN    0.000000   
75%     668.500000    1.000000    3.000000         NaN    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
  • 可以看到Age列数据只有714个,其余列均有891个,因此此列需要对缺失值进行填充
    # Replace missing ages with the median age, then re-check the summary:
    # the Age count should now equal the row count.
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    print(titanic.describe())

    结果:

     PassengerId    Survived      Pclass         Age       SibSp  \
    count   891.000000  891.000000  891.000000  891.000000  891.000000   
    mean    446.000000    0.383838    2.308642   29.361582    0.523008   
    std     257.353842    0.486592    0.836071   13.019697    1.102743   
    min       1.000000    0.000000    1.000000    0.420000    0.000000   
    25%     223.500000    0.000000    2.000000   22.000000    0.000000   
    50%     446.000000    0.000000    3.000000   28.000000    0.000000   
    75%     668.500000    1.000000    3.000000   35.000000    1.000000   
    max     891.000000    1.000000    3.000000   80.000000    8.000000   
    
                Parch        Fare  
    count  891.000000  891.000000  
    mean     0.381594   32.204208  
    std      0.806057   49.693429  
    min      0.000000    0.000000  
    25%      0.000000    7.910400  
    50%      0.000000   14.454200  
    75%      0.000000   31.000000  
    max      6.000000  512.329200 
  • 将string值转为int/float值
            1) 首先,观察相应列有几种字符串
    • print titanic["Sex"].unique()
      print titanic["Embarked"].unique()

      结果:

      ['male' 'female']
      ['S' 'C' 'Q' nan]
      2) 然后,将相应字符串的位置赋上对应的int/float值

      # Embarked contains NaN (see the unique() output); fill it with "S"
      # first so that no un-encoded value is left in the column.
      titanic["Embarked"] = titanic["Embarked"].fillna("S")

      # Sex: male -> 0, female -> 1
      titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
      titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

      # Embarked: S -> 0, C -> 1, Q -> 2
      titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
      titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
      titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2
      titanic.head()

      结果:


      替换成功

三、分类

def data_proprocess(filename="train.csv"):
    """Load a Titanic CSV and return it with imputed, numerically encoded columns.

    Args:
        filename: Path of the CSV file to read. Defaults to ``"train.csv"``
            so that existing zero-argument calls keep working.

    Returns:
        A pandas DataFrame where missing ``Age`` (and ``Fare``, if that
        column exists) values are replaced by the column median, missing
        ``Embarked`` values by ``"S"``, ``Sex`` is encoded as
        male=0 / female=1 and ``Embarked`` as S=0 / C=1 / Q=2.
        Any other ``Sex`` / ``Embarked`` value becomes NaN.
    """
    import pandas as pa

    titanic = pa.read_csv(filename)

    # Median imputation for the numeric columns.
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    if "Fare" in titanic.columns:
        # No-op when Fare is complete; prevents NaN from reaching the
        # classifiers when a split has a missing fare.
        titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    titanic["Embarked"] = titanic["Embarked"].fillna("S")

    # map() produces genuinely numeric columns, whereas assigning through
    # .loc leaves the column with object dtype.
    titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})
    titanic["Embarked"] = titanic["Embarked"].map({"S": 0, "C": 1, "Q": 2})
    return titanic

def classify_LinearRegression(titanic):
    """Estimate survival-prediction accuracy of linear regression with 3-fold CV.

    A LinearRegression model is fitted on two folds and predicts the third;
    the continuous outputs are thresholded at 0.5 to obtain 0/1 labels.

    Args:
        titanic: DataFrame containing the numeric predictor columns listed
            below and a 0/1 ``Survived`` column.

    Returns:
        The fraction of rows whose thresholded out-of-fold prediction equals
        ``Survived``.
    """
    import numpy as np
    from sklearn.linear_model import LinearRegression

    # Feature columns
    predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

    # sklearn.cross_validation was removed in scikit-learn 0.20; prefer
    # model_selection and fall back for old installations. Both variants
    # yield the same three unshuffled, contiguous folds.
    try:
        from sklearn.model_selection import KFold
        folds = KFold(n_splits=3).split(titanic[predictors])
    except ImportError:
        from sklearn.cross_validation import KFold
        folds = KFold(titanic.shape[0], n_folds=3, random_state=1)

    alg = LinearRegression()
    fold_predictions = []
    for train, test in folds:
        alg.fit(titanic[predictors].iloc[train, :], titanic["Survived"].iloc[train])
        fold_predictions.append(alg.predict(titanic[predictors].iloc[test, :]))

    # Folds are contiguous and in row order, so the concatenated predictions
    # line up with the rows of `titanic`.
    predictions = np.concatenate(fold_predictions, axis=0)
    predicted_labels = (predictions > 0.5).astype(int)

    # Accuracy is the share of matching labels. Summing the matching
    # predictions instead would only count correctly predicted 1s.
    return float(np.mean(predicted_labels == titanic["Survived"].values))

def classify_LogisticRegression(titanic):
    """Return the mean 3-fold cross-validation score of logistic regression.

    Args:
        titanic: DataFrame containing the numeric predictor columns listed
            below and a ``Survived`` target column.

    Returns:
        Mean of the three scores returned by ``cross_val_score``.
    """
    from sklearn.linear_model import LogisticRegression

    # sklearn.cross_validation was removed in scikit-learn 0.20; prefer
    # model_selection and fall back for old installations.
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        from sklearn.cross_validation import cross_val_score

    # Feature columns
    predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    alg = LogisticRegression(random_state=1)
    scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
    return scores.mean()
# Load and encode the training data once; neither classifier modifies it.
titanic_data = data_proprocess()
print("LinearRegression Classification result is :")
print(classify_LinearRegression(titanic_data))
print("LogisticRegression Classification result is :")
print(classify_LogisticRegression(titanic_data))
结果:
LinearRegression Classification result is :
0.261503928171
LogisticRegression Classification result is :
0.787878787879

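从结果可以看出,还是用逻辑回归做分类问题精度更高。(注意:上面线性回归输出的 0.2615 是由只统计“预测为 1 且预测正确”的样本的算式得到的,并不是真正的准确率,明显低估了线性回归的表现;真正的准确率应为预测标签与 Survived 相等的样本所占的比例,因此这里两个数值的差距仅供参考。)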

四、使用随机森林提高分类精度并将结果传到kaggle

 
 
def classify_RandomForestClassifier(train_data, test_data):
    """Cross-validate a random forest, then fit it and write test predictions.

    Args:
        train_data: DataFrame with the predictor columns listed below and a
            ``Survived`` target column.
        test_data: DataFrame with the same predictor columns and a
            ``PassengerId`` column.

    Returns:
        Mean 3-fold cross-validation score on ``train_data``.

    Side effects:
        Writes ``logistic_regression_predictions.csv`` (columns
        ``PassengerId`` and ``Survived``) to the current directory.
    """
    import numpy as np
    import pandas as pa
    from sklearn.ensemble import RandomForestClassifier

    # sklearn.cross_validation was removed in scikit-learn 0.20; prefer
    # model_selection and fall back for old installations.
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        from sklearn.cross_validation import cross_val_score

    predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    clf = RandomForestClassifier(
        n_estimators=10, max_depth=None, min_samples_split=2, random_state=0
    )
    scores = cross_val_score(clf, train_data[predictors], train_data["Survived"], cv=3)

    # Refit on the full training set before predicting the test set.
    clf.fit(train_data[predictors], train_data["Survived"])
    predict_result = clf.predict(test_data[predictors])

    # .values replaces Series.as_matrix(), which was removed in pandas 1.0.
    result = pa.DataFrame({
        'PassengerId': test_data['PassengerId'].values,
        'Survived': predict_result.astype(np.int32),
    })
    # NOTE(review): the file name says "logistic_regression" although the
    # model is a random forest; kept unchanged in case something reads it.
    result.to_csv("logistic_regression_predictions.csv", index=False)
    return scores.mean()
print("train")
titanic_train = data_proprocess("train.csv")
print("test")
titanic_test = data_proprocess("test.csv")

# Print the cross-validation score explicitly; outside an interactive
# session a bare expression's value would be silently dropped.
print(classify_RandomForestClassifier(titanic_train, titanic_test))


猜你喜欢

转载自blog.csdn.net/xuanweichangran/article/details/80165157
今日推荐