Kaggle: Costa Rican Household Poverty Level Prediction (2): Baseline


  Continuing from the previous post: after the simple EDA there, we now move on to training a model, using LightGBM.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
train = pd.read_csv('../fix_train.csv')
test = pd.read_csv('../fix_test.csv')
feature_desp = pd.read_csv('../feature_description.csv', error_bad_lines=False,index_col='F_name')

Feature categories

hh : household features

ind : individual features

ids : Id, idhogar, Target

ind_bool = ['v18q', 'dis', 'male', 'female', 'estadocivil1', 'estadocivil2', 'estadocivil3', 
            'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 
            'parentesco1', 'parentesco2',  'parentesco3', 'parentesco4', 'parentesco5', 
            'parentesco6', 'parentesco7', 'parentesco8',  'parentesco9', 'parentesco10', 
            'parentesco11', 'parentesco12', 'instlevel1', 'instlevel2', 'instlevel3', 
            'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 
            'instlevel9', 'mobilephone']

ind_non_bool = ['rez_esc', 'escolari', 'age','SQBescolari','SQBage','agesq']

hh_bool = ['hacdor', 'hacapo', 'v14a', 'refrig', 'paredblolad', 'paredzocalo', 
           'paredpreb','pisocemento', 'pareddes', 'paredmad',
           'paredzinc', 'paredfibras', 'paredother', 'pisomoscer', 'pisoother', 
           'pisonatur', 'pisonotiene', 'pisomadera',
           'techozinc', 'techoentrepiso', 'techocane', 'techootro', 'cielorazo', 
           'abastaguadentro', 'abastaguafuera', 'abastaguano',
            'public', 'planpri', 'noelec', 'coopele', 'sanitario1', 
           'sanitario2', 'sanitario3', 'sanitario5',   'sanitario6',
           'energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4', 
           'elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 
           'elimbasu5', 'elimbasu6', 'epared1', 'epared2', 'epared3',
           'etecho1', 'etecho2', 'etecho3', 'eviv1', 'eviv2', 'eviv3', 
           'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5', 
           'computer', 'television', 'lugar1', 'lugar2', 'lugar3',
           'lugar4', 'lugar5', 'lugar6', 'area1', 'area2']

hh_non_bool = ['v2a1', 'v18q1', 'meaneduc', 'SQBovercrowding', 'SQBdependency',
               'SQBmeaned', 'overcrowding', 'rooms', 'r4h1', 'r4h2', 'r4h3', 'r4m1',
               'r4m2', 'r4m3', 'r4t1', 'r4t2', 'r4t3', 'tamhog', 'tamviv', 'hhsize',
               'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total',  'bedrooms',
               'qmobilephone', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin']

hh_cont = [ 'dependency', 'edjefe', 'edjefa']


ids = ['Id', 'idhogar', 'Target']

Merge train and test data

test['Target'] = np.nan
data = train.append(test)  # in newer pandas, use pd.concat([train, test])
data.info()
train.idhogar.nunique(), test.idhogar.nunique(), data.idhogar.nunique()
(2988, 7352, 10340)
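Note that 2988 + 7352 = 10340: no household (idhogar) appears in both train and test.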

Missing value count

# columns with at least one missing value
miss_count = data.isnull().sum() > 0
# missing value counts for those columns
misvalue_counts = data.isnull().sum()[miss_count]
# percentage of missing values
misvalue_percent = misvalue_counts/data.shape[0]*100

misvalue_percent
v2a1         72.615449
v18q1        76.221830
rez_esc      82.545716
meaneduc      0.107742
SQBmeaned     0.107742
Target       71.397360
dtype: float64
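Target is missing only because of the test rows we just appended; a quick sanity check (a sketch, not in the original notebook):

# Target was set to NaN for every test row above, so the counts must match
assert data.Target.isnull().sum() == test.shape[0]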

Fill missing values

from sklearn.preprocessing import MinMaxScaler,Imputer

imputer = Imputer(missing_values=np.nan, strategy='mean', axis = 0)
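Note: Imputer was removed in scikit-learn 0.22. On newer versions, a minimal equivalent sketch uses SimpleImputer:

# equivalent of Imputer(missing_values=np.nan, strategy='mean', axis=0)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')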

Train data

  • hh_ : household-level data
  • ind_ : individual-level data
  • hh_train : with missing values
  • ind_train : with missing values
  • hh_train_df : after imputation
  • ind_train_df : after imputation
hh_train = train.loc[train.parentesco1 == 1, ids+hh_bool+hh_non_bool+hh_cont].reset_index()

target = hh_train[['Target']]
hh_train_ids = hh_train[['idhogar']]
# before imputation: drop identifier columns
hh_train = hh_train.drop(['Id','idhogar','Target','index'], axis=1)
# impute missing values with column means
hh_train_df = pd.DataFrame(imputer.fit_transform(hh_train),columns=list(hh_train.columns))

# add idhogar and Target columns
hh_train['idhogar'] = hh_train_ids
hh_train_df['idhogar'] = hh_train_ids
hh_train['Target'] = target
hh_train_df['Target'] = target
# individual-level data on the train set
ind_train = train.loc[ :, ids+ind_bool+ind_non_bool].reset_index()

ind_train_ids = ind_train[['idhogar']]
ind_target = ind_train[['Target']]

# before imputation: drop identifiers and the old index
ind_train = ind_train.drop(['Id','idhogar','Target','index'], axis=1)

# after imputation: fill missing values with column means
ind_train_df=pd.DataFrame(imputer.fit_transform(ind_train),columns=list(ind_train.columns))

# add idhogar, Target
ind_train['idhogar'] = ind_train_ids
ind_train['Target'] = ind_target
ind_train_df['idhogar'] = ind_train_ids
ind_train_df['Target'] = ind_target

KDE before vs. after missing-value imputation

  • hh_train : before imputation
  • hh_train_df : after imputation
    • v2a1
    • v18q1
    • meaneduc
    • SQBmeaned
from collections import OrderedDict

mis_cols = ['v2a1','v2a1','v18q1','v18q1','meaneduc','meaneduc','SQBmeaned','SQBmeaned']


# Color mapping
colors = OrderedDict({1: 'red', 2: 'orange', 3: 'blue', 4: 'green'})
label_mapping = OrderedDict({1: 'extreme', 2: 'moderate', 3: 'vulnerable', 
                               4: 'non vulnerable'})
#----------------------------------------------------------------------------

plt.figure(figsize = (12, 7))
for i, col in enumerate(mis_cols):
    ax = plt.subplot(4, 2, i + 1)
    # Iterate through the poverty levels
    for poverty_level, color in colors.items():
        # kernel density estimate per poverty level
        if (i%2 == 0):
            # even panels: imputed data
            sns.kdeplot(hh_train_df.loc[hh_train_df.Target == poverty_level,col].dropna(), 
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s after filling KDE'%(col.capitalize()))
            plt.xlabel('%s'%col)
            plt.ylabel('Density')
        else :
            # odd panels: original data, missing values dropped
            sns.kdeplot(hh_train.loc[hh_train.Target == poverty_level, col].dropna(),
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s before filling KDE'%(col.capitalize()))
            plt.xlabel('%s'%col)
            plt.ylabel('Density')
plt.subplots_adjust(top = 2.5)

[Figure: KDE of v2a1, v18q1, meaneduc and SQBmeaned by poverty level, after vs. before imputation]

KDE before vs. after imputation for individual-level data

  • ind_train : before imputation
  • ind_train_df : after imputation
    • rez_esc
cols = ['rez_esc','rez_esc']
plt.figure(figsize=(14, 2.5))
for i, col in enumerate(cols):
    ax = plt.subplot(1, 2, i + 1)
    for poverty_level, color in colors.items():
        if (i%2 == 0):
            # even panel: imputed data
            sns.kdeplot(ind_train_df.loc[ind_train_df.Target == poverty_level,col].dropna(), 
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s after filling KDE'%(col.capitalize()))
            plt.xlabel('%s'%col)
            plt.ylabel('Density')
        else :
            # odd panel: original data, missing values dropped
            sns.kdeplot(ind_train.loc[ind_train.Target == poverty_level, col].dropna(),
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s before filling KDE'%(col.capitalize()))
            plt.xlabel('%s'%col)
            plt.ylabel('Density')
plt.subplots_adjust(top = 2)

[Figure: KDE of rez_esc by poverty level, after vs. before imputation]

Test data
In the test data, 18 households have no designated head of household. To align hh_test with ind_test, one member of each must be designated as the head.

  • hh_test : before imputation
  • hh_test_df : after imputation
  • ind_test : before imputation
  • ind_test_df : after imputation
# some households in the test data have no designated head, so one must be assigned
mis_hh = test.groupby(by='idhogar').parentesco1.agg('sum')==0

# idhogar of the households missing a head
mis_idhogar = test.groupby(by='idhogar').parentesco1.agg('sum')[mis_hh].index

The 26 rows below belong to the test-set households with no head of household.

pd.options.display.max_columns = 10
test.loc[test.idhogar.isin(mis_idhogar),:][['Id','idhogar','parentesco1']].sort_values(by='idhogar')
Id idhogar parentesco1
22791 ID_99d27ab2f 0e2a3453d 0
22790 ID_f09603838 0e2a3453d 0
15544 ID_49d05f9e6 198fc274a 0
18735 ID_b0874f522 2dc45d484 0
18643 ID_ceeb5dfe2 5a667591a 0
23547 ID_aa8f26c06 676750a21 0
23253 ID_e42c1dde2 91aff0a8e 0
23252 ID_bbc0959ef 91aff0a8e 0
15090 ID_9c12f6ebc 9d874b0d6 0
22833 ID_26d95edff b115b4536 0
12753 ID_93fa2f7cc b59f5b526 0
17053 ID_bca8a1dde ce6154327 0
23711 ID_4036d87e3 d14b3e03a 0
22006 ID_9f025fde6 d678c45ad 0
17163 ID_6094ce990 df06e01c6 0
17162 ID_d6cbeec15 df06e01c6 0
17132 ID_00e8a868f e3f69768c 0
19318 ID_d0beee31f e4df1caaf 0
19317 ID_3805bdb08 e4df1caaf 0
21654 ID_894de66bc f2fd28dbb 0
21655 ID_56a407d03 f2fd28dbb 0
21656 ID_960e558e0 f2fd28dbb 0
21657 ID_cc28b0331 f2fd28dbb 0
18549 ID_aa650fb4a f6d6fad32 0
19299 ID_139a474f3 fc6c8d241 0
19300 ID_f447c7c54 fc6c8d241 0

Below, one member of each of these 18 households is designated as head of household.

# one manually chosen member per head-less household
head_ids = ['ID_99d27ab2f', 'ID_49d05f9e6', 'ID_b0874f522', 'ID_ceeb5dfe2',
            'ID_aa8f26c06', 'ID_e42c1dde2', 'ID_9c12f6ebc', 'ID_26d95edff',
            'ID_93fa2f7cc', 'ID_bca8a1dde', 'ID_4036d87e3', 'ID_9f025fde6',
            'ID_6094ce990', 'ID_00e8a868f', 'ID_d0beee31f', 'ID_894de66bc',
            'ID_aa650fb4a', 'ID_139a474f3']
test.loc[test.Id.isin(head_ids), 'parentesco1'] = 1
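The IDs above were read off the table by hand. As an alternative (a sketch, not what this post does), one could promote the oldest member of each head-less household programmatically:

# hypothetical automation: designate the oldest member of each head-less
# household; this may pick different members than the manual list above
for hh in mis_idhogar:
    oldest_idx = test.loc[test.idhogar == hh, 'age'].idxmax()
    test.loc[oldest_idx, 'parentesco1'] = 1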

Filling missing values on the test set

# household level test data
hh_test = test.loc[test.parentesco1 == 1, ids+hh_bool+hh_non_bool+hh_cont].reset_index()

hh_test_ids = hh_test[['idhogar']]

hh_test = hh_test.drop(['Id','idhogar','Target','index'], axis = 1)

# impute missing values with column means
hh_test_df = pd.DataFrame(imputer.fit_transform(hh_test),columns=list(hh_test.columns))

# add idhogar columns
hh_test_df['idhogar'] = hh_test_ids
hh_test['idhogar'] = hh_test_ids
# individual-level test data
ind_test = test.loc[:, ids+ind_bool+ind_non_bool].reset_index()

ind_test_ids = ind_test[['idhogar']]
ind_test = ind_test.drop(['Id','idhogar','Target','index'], axis = 1)
ind_test_df = pd.DataFrame(imputer.fit_transform(ind_test),columns=list(ind_test.columns))

# add idhogar columns
ind_test['idhogar'] = ind_test_ids
ind_test_df['idhogar'] = ind_test_ids

Create new individual-level features

Individual train data

ind_train_groupobj = ind_train_df.groupby(by='idhogar')

ind_train_data = pd.DataFrame({'idhogar':ind_train_df.idhogar.unique()})
def AddFeatures(feature_df, cols, funcs, groupobj):
    # aggregate each column with each function per household, then merge on idhogar
    for func in funcs:
        for col in cols:
            group_object = groupobj[col].agg(func).reset_index()
            group_object.rename(index=str, columns={col:col+'_'+func}, inplace=True)
            feature_df = feature_df.merge(group_object, on='idhogar', how='left')
    return feature_df
# individual boolean features: mean and sum per household
ind_train_data = AddFeatures(ind_train_data, ind_bool, ['mean','sum'], ind_train_groupobj)

# individual non-boolean features: richer per-household aggregations
funcs = ['mean','min','max','median','sum','nunique']
ind_train_data = AddFeatures(ind_train_data, ind_non_bool, funcs, ind_train_groupobj)
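After these calls, ind_train_data holds one row per household, with aggregate columns named by the col + '_' + func pattern from AddFeatures. A quick check (a sketch):

# expect one row per household and columns like 'age_mean', ..., 'age_nunique'
print(ind_train_data.shape)
print([c for c in ind_train_data.columns if c.startswith('age_')])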
Individual test data

ind_test_groupobj = ind_test_df.groupby(by='idhogar')
ind_test_data = pd.DataFrame({'idhogar':ind_test_df.idhogar.unique()})

ind_test_data = AddFeatures(ind_test_data, ind_bool, ['mean','sum'], ind_test_groupobj)

ind_test_data = AddFeatures(ind_test_data, ind_non_bool, funcs, ind_test_groupobj)

Merge household and individual data

train_data = hh_train_df.merge(ind_train_data, on = 'idhogar', how='left')
test_data = hh_test_df.merge(ind_test_data, on = 'idhogar', how='left')

Now train the model with LightGBM

import gc
import lightgbm as lgb
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

def model(train_data, test_data, n_folds = 10):
    # household id
    train_ids = train_data[['idhogar']]
    test_ids = test_data[['idhogar']]
    # Target/label
    labels = train_data[['Target']].astype(int)
    # drop idhogar, Target
    train_data = train_data.drop(['idhogar','Target'],axis = 1)
    test_data = test_data.drop(['idhogar'], axis = 1)
    # feature columns name
    feature_names = list(train_data.columns)
    # 10 folds cross validation
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 2018)
    # test predictions
    test_predictions = list()
    # validation predictions
    out_of_fold = np.zeros(train_data.shape[0])
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    # record per-fold macro-F1 scores
    Valid_F1 = []
    Train_F1 = []
    # LightGBM has no built-in macro-F1, so multi_error is tracked during training
    Valid_Score = []
    Train_Score = []
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(train_data):
        # Training data for the fold
        train_features = train_data.loc[train_indices, :]
        train_labels = labels.loc[train_indices, :]
        # Validation data for the fold
        valid_features = train_data.loc[valid_indices, :]
        valid_labels = labels.loc[valid_indices, :]
        # Create the model
        model = lgb.LGBMClassifier(boosting_type='gbdt',n_estimators=1000, 
                                   objective = 'multiclass', class_weight = 'balanced',
                                   learning_rate = 0.03,  num_leaves = 31,
                                   reg_alpha = 0.1, reg_lambda = 0.3, num_class = 4,
                                   subsample = 0.8, n_jobs = -1, random_state = 2018)

        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'multi_error',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = 'auto',
                  early_stopping_rounds = 100, verbose = 200)
        # Record the best iteration
        best_iteration = model.best_iteration_
        # predict on the test set using the best iteration
        test_predictions.append(model.predict(test_data, num_iteration = best_iteration))
        # accumulate feature importances, averaged across folds
        feature_importance_values += model.feature_importances_ / n_folds
        # Record the best multi error
        valid_score = model.best_score_['valid']['multi_error']
        train_score = model.best_score_['train']['multi_error']
        Valid_Score.append(valid_score)
        Train_Score.append(train_score)
        # Record F1_macro score
        pred_valid = model.predict(valid_features, num_iteration = best_iteration)
        pred_train = model.predict(train_features, num_iteration = best_iteration)
        valid_f1 = f1_score(valid_labels, pred_valid, average='macro')
        train_f1 = f1_score(train_labels, pred_train, average='macro')
        Valid_F1.append(valid_f1)
        Train_F1.append(train_f1)

        # validation set result
        out_of_fold[valid_indices] = pred_valid

        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        print('................................................')

    # feature importance
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    # overall validation scores
    Valid_F1.append(f1_score(labels, out_of_fold, average='macro'))
    Train_F1.append(np.mean(Train_F1))
    Valid_Score.append(np.mean(Valid_Score))
    Train_Score.append(np.mean(Train_Score))
    # dataframe of per-fold and overall scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    metrics = pd.DataFrame({'fold': fold_names,
                            'train error': Train_Score,
                            'valid error': Valid_Score,
                            'train f1' : Train_F1,
                            'valid f1' : Valid_F1}) 

    # make submission.csv: majority vote over the per-fold test predictions
    predict_df = pd.DataFrame(np.array(test_predictions).T)
    # idxmax of value_counts gives the most frequent predicted class per row
    voting_result = [predict_df.iloc[x, :].value_counts().idxmax() for x in range(predict_df.shape[0])]
    submission = test_ids.copy()
    submission['Target'] = voting_result
    # metrics, feature importances, household-level predictions
    return metrics, feature_importances, submission
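Since LightGBM has no built-in macro-F1, the function above tracks multi_error during training and computes F1 separately. A sketch of a custom eval metric that could be passed as eval_metric instead (an assumption: lightgbm >= 4, where the scikit-learn API hands custom metrics class probabilities of shape (n_samples, n_classes)):

def macro_f1(y_true, y_pred):
    # columns are ordered by sorted class labels, here 1..4
    pred_labels = y_pred.argmax(axis=1) + 1
    return 'macro_f1', f1_score(y_true, pred_labels, average='macro'), True

# usage sketch: model.fit(..., eval_metric = macro_f1, ...)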
metric, feature_importance, submission = model(train_data, test_data, 10)
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0517394  valid's multi_error: 0.365772
Early stopping, best iteration is:
[245]   train's multi_error: 0.0368913  valid's multi_error: 0.352349
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0458956  valid's multi_error: 0.389262
[400]   train's multi_error: 0.012987   valid's multi_error: 0.38255
Early stopping, best iteration is:
[433]   train's multi_error: 0.0108696  valid's multi_error: 0.369128
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0474782  valid's multi_error: 0.436242
[400]   train's multi_error: 0.012372   valid's multi_error: 0.395973
Early stopping, best iteration is:
[399]   train's multi_error: 0.012372   valid's multi_error: 0.395973
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0463667  valid's multi_error: 0.43771
[400]   train's multi_error: 0.013058   valid's multi_error: 0.420875
Early stopping, best iteration is:
[393]   train's multi_error: 0.0134813  valid's multi_error: 0.417508
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0446141  valid's multi_error: 0.363636
Early stopping, best iteration is:
[286]   train's multi_error: 0.0266365  valid's multi_error: 0.329966
................................................
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[35]    train's multi_error: 0.216065   valid's multi_error: 0.37037
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0465465  valid's multi_error: 0.407407
[400]   train's multi_error: 0.0121707  valid's multi_error: 0.360269
Early stopping, best iteration is:
[478]   train's multi_error: 0.00687285 valid's multi_error: 0.346801
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0458219  valid's multi_error: 0.393939
[400]   train's multi_error: 0.0107405  valid's multi_error: 0.350168
Early stopping, best iteration is:
[417]   train's multi_error: 0.00989259 valid's multi_error: 0.340067
................................................
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[26]    train's multi_error: 0.230932   valid's multi_error: 0.410774
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0458289  valid's multi_error: 0.380471
Early stopping, best iteration is:
[226]   train's multi_error: 0.0344418  valid's multi_error: 0.360269
................................................
metric
       fold  train error  train f1  valid error  valid f1
0         0     0.036891  0.895244     0.352349  0.470469
1         1     0.010870  0.965888     0.369128  0.459939
2         2     0.012372  0.959736     0.395973  0.377560
3         3     0.013481  0.957862     0.417508  0.401015
4         4     0.026636  0.922596     0.329966  0.408451
5         5     0.216065  0.683224     0.370370  0.487074
6         6     0.006873  0.979038     0.346801  0.392146
7         7     0.009893  0.968767     0.340067  0.467333
8         8     0.230932  0.672731     0.410774  0.451792
9         9     0.034442  0.899205     0.360269  0.447325
10  overall     0.059845  0.890429     0.369321  0.440105
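The gap between the overall train F1 (0.890) and valid F1 (0.440) shows the model overfits heavily; the out-of-fold macro F1 of about 0.44 is the more realistic estimate of leaderboard performance.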

Make submission.csv

submit = test[['Id','idhogar']]

submit = submit.merge(submission, on = 'idhogar')

submit = submit.drop(['idhogar'],axis = 1)

submit.to_csv('../submit_0.csv',index = False)

Public score

[Figure: public leaderboard score screenshot]
Feature Importances

feature_importance = feature_importance.sort_values(by = 'importance')

feature_importance.set_index('feature').plot(kind='barh', figsize=(10, 40))
plt.title('Feature Importances')
Text(0.5,1,'Feature Importances')

[Figure: feature importance bar chart]
