Santander ML Explainability

1 准备步骤

1-1.导包
1-2 设置
1-3 版本

2 问题描述
3 EDA

3-1 数据采集

减小内存
3-1-1 数据集字段
3-1-2 数值描述

3-2可视化

3-2-1 直方图
3-2-2 平均频率
3-2-3 countplot
3-2-4 hist
3-2-5 distplot
3-2-6 散点图

3-3 数据预处理
3-3-1 缺失值检查
3-3-2 二值化
3-3-3 数据平衡
3-3-4 偏度和峰度

4 特征工程

4-1 Permutation Importance
4-2 如何计算和显示重要性
4-4 Partial Dependence Plots
4-5 Partial Dependence Plot
4-6 Chart analysis
4-7 SHAP Values
4-8 特征之间的相关性

5 模型

准备：Augment
5-1 lightgbm
5-2 RandomForestClassifier
5-3 DecisionTreeClassifier
5-4 Logistic Regression

6 提交

1 准备步骤

1-1.导包

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier,Pool
from IPython.display import display
import matplotlib.patches as patch
import matplotlib.pyplot as plt
from sklearn.svm import NuSVR
from scipy.stats import norm
from sklearn import svm
import lightgbm as lgb
import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import time
import glob
import sys
import os
import gc

1-2 设置

# Number of cross-validation folds; for better results set fold_n to 5.
fold_n=5
# Stratified K-fold splitter reused by the modelling cells further down
# (shuffled with a fixed seed).
folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=10)
# IPython magics: these lines only work inside a notebook/IPython session.
%matplotlib inline
%precision 4
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
# suppress=True makes numpy print floats without scientific notation.
np.set_printoptions(suppress=True)
pd.set_option("display.precision", 15)

1-3 版本

# Report the library and interpreter versions in use.
print(f'pandas: {pd.__version__}')
print(f'numpy: {np.__version__}')
print(f'Python: {sys.version}')

2 问题描述

3 EDA

3-1 数据采集

# Show which files are available in the input directory.
print(os.listdir("../input/"))
# Load the training set, the test set and the sample submission file.
train= pd.read_csv("../input/train.csv")
test = pd.read_csv('../input/test.csv')
sample_submission = pd.read_csv('../input/sample_submission.csv')
# The bare expressions below are notebook-style inspections (first rows and
# shapes); outside a notebook they produce no output.
sample_submission.head()
train.shape, test.shape, sample_submission.shape
train.head(5)

减小内存

def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Each numeric column whose values are all (near-)integers is cast to the
    smallest unsigned/signed integer dtype that holds its range; every other
    numeric column is cast to ``float32``. Non-numeric columns are left
    untouched. Progress and memory figures are printed along the way.

    Missing values are replaced by ``column minimum - 1`` before the integer
    check, because integer dtypes cannot hold NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    (pandas.DataFrame, list)
        The same frame and the list of column names that contained
        non-finite values (NaN or inf).
    """
    start_mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        # Strings, datetimes, categoricals etc. cannot be downcast numerically.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",df[col].dtype)

        mx = df[col].max()
        mn = df[col].min()

        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(df[col]).all():
            NAlist.append(col)
            if np.isfinite(mn):
                df[col] = df[col].fillna(mn - 1)
                # The fill value is the new minimum; without this update a
                # column with min 0 would be cast to an unsigned dtype and
                # the -1 fill value would wrap around.
                mn = mn - 1

        # Test if the column can be converted to an integer. Absolute
        # residuals are summed so that e.g. 0.5 and -0.5 cannot cancel out.
        if np.isfinite(df[col]).all():
            asint = df[col].astype(np.int64)
            IsInt = bool(np.abs(df[col] - asint).sum() < 0.01)
        else:
            # Still holds inf (or is entirely NaN): keep it floating point.
            IsInt = False

        if IsInt:
            # Pick the narrowest integer dtype whose range covers [mn, mx].
            candidates = unsigned_types if mn >= 0 else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= mn and mx <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Make float datatypes 32 bit
            df[col] = df[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",df[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df, NAlist

#训练集
train, NAlist = reduce_mem_usage(train)
print("_________________")
print("")
print("Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ")
print("_________________")
print("")
print(NAlist)
#测试集
test, NAlist = reduce_mem_usage(test)
print("_________________")
print("")
print("Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ")
print("_________________")
print("")
print(NAlist)

3-1-1 数据集字段

# Inspect the training columns: names (notebook display), count, then dtypes
# and memory summary. DataFrame.info() prints by itself and returns None, so
# the outer print adds a trailing "None" line.
train.columns
print(len(train.columns))
print(train.info())

3-1-2 数值描述

# Summary statistics of the training columns (notebook display).
train.describe()

3-2可视化

3-2-1 直方图

# Bar chart of the class counts of the target column.
train['target'].value_counts().plot.bar();
# Side-by-side histograms of var_0, split by target value.
f,ax=plt.subplots(1,2,figsize=(20,10))
# Left panel: rows with target == 0.
train[train['target']==0].var_0.plot.hist(ax=ax[0],bins=20,edgecolor='black',color='red')
ax[0].set_title('target= 0')
# Fixed x ticks every 5 units from 0 to 80 so both panels share the same grid.
x1=list(range(0,85,5))
ax[0].set_xticks(x1)
# Right panel: rows with target == 1.
train[train['target']==1].var_0.plot.hist(ax=ax[1],color='green',bins=20,edgecolor='black')
ax[1].set_title('target= 1')
x2=list(range(0,85,5))
ax[1].set_xticks(x2)
plt.show()

3-2-2 平均频率

# Histogram of the per-column means of every column after the first two
# (presumably ID_code and target — confirm against the CSV layout).
# The plot kind must be given by keyword: Series.plot() raises TypeError on
# positional arguments in pandas >= 1.0.
train[train.columns[2:]].mean().plot(kind='hist');
plt.title('Mean Frequency');

3-2-3 countplot

# Target distribution shown twice: as a pie chart (left) and a count plot (right).
f,ax=plt.subplots(1,2,figsize=(18,8))
train['target'].value_counts().plot.pie(explode=[0,0.1],autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('target')
ax[0].set_ylabel('')
# The column is passed as x=...: current seaborn releases no longer accept
# the data variable as a positional argument.
sns.countplot(x='target',data=train,ax=ax[1])
ax[1].set_title('target')
plt.show()

3-2-4 hist

# Histogram of the var_0 feature over the whole training set.
train["var_0"].hist();

3-2-5 distplot

# Set the default figure size, then draw the distribution of the target column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot are the replacements) — confirm the installed version.
sns.set(rc={'figure.figsize':(9,7)})
sns.distplot(train['target']);

3-2-6 散点图

def plot_feature_scatter(df1, df2, features):
    """Scatter ``df1[feature]`` against ``df2[feature]`` on a 4x4 grid.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames providing the x and y values; for every feature the two
        columns must have the same length.
    features : sequence of str
        Column names to plot, one panel each (at most 16).

    Raises
    ------
    ValueError
        If more features are given than the grid has panels.
    """
    n_rows, n_cols = 4, 4
    if len(features) > n_rows * n_cols:
        raise ValueError(
            "plot_feature_scatter can show at most {} features, got {}".format(
                n_rows * n_cols, len(features)))

    sns.set_style('whitegrid')
    # A single subplots() call creates the figure; the extra plt.figure()
    # the cell used to make only produced an empty figure.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14,14))

    for axis, feature in zip(axes.flat, features):
        axis.scatter(df1[feature], df2[feature], marker='+')
        axis.set_xlabel(feature, fontsize=9)
    plt.show();

# First 16 features, compared between train and test on every 20th row.
features = ['var_0', 'var_1','var_2','var_3', 'var_4', 'var_5', 'var_6', 'var_7',
           'var_8', 'var_9', 'var_10','var_11','var_12', 'var_13', 'var_14', 'var_15',
           ]
# The loaded frames are named `train` and `test`; `train_df`/`test_df` are
# never defined in this notebook and raised NameError.
plot_feature_scatter(train[::20],test[::20], features)

def plot_new_feature_distribution(df1, df2, label1, label2, features):
    """Overlay the kernel density of each feature for two frames on a 2x4 grid.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The two populations to compare.
    label1, label2 : str
        Legend labels for ``df1`` and ``df2``.
    features : sequence of str
        Column names to plot, one panel each (at most 8).

    Raises
    ------
    ValueError
        If more features are given than the grid has panels.
    """
    n_rows, n_cols = 2, 4
    if len(features) > n_rows * n_cols:
        raise ValueError(
            "plot_new_feature_distribution can show at most {} features, got {}".format(
                n_rows * n_cols, len(features)))

    sns.set_style('whitegrid')
    # One subplots() call creates the figure; a preceding plt.figure() would
    # only leave an empty figure behind.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18,8))

    for axis, feature in zip(axes.flat, features):
        # NOTE(review): `bw` is deprecated in newer seaborn in favour of
        # `bw_method`/`bw_adjust`; kept for the seaborn version this notebook
        # was written against — confirm before upgrading.
        sns.kdeplot(df1[feature], bw=0.5,label=label1, ax=axis)
        sns.kdeplot(df2[feature], bw=0.5,label=label2, ax=axis)
        axis.set_xlabel(feature, fontsize=11)
        axis.tick_params(axis='x', which='major', labelsize=8)
        axis.tick_params(axis='y', which='major', labelsize=8)
        # Make sure the two labels are actually shown.
        axis.legend()
    plt.show();

# Split the training rows by class. The frame is named `train`; `train_df`
# is never defined in this notebook and raised NameError.
t0 = train.loc[train['target'] == 0]
t1 = train.loc[train['target'] == 1]
# NOTE(review): this selects the columns from position 202 onwards, i.e. it
# presumably targets engineered features appended after the original columns.
# If no such columns were added the selection is empty and nothing is drawn.
features = train.columns.values[202:]
plot_new_feature_distribution(t0, t1, 'target: 0', 'target: 1', features)

3-3 数据预处理

3-3-1 缺失值检查

def check_missing_data(df):
    """Summarise the missing values of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame or bool
        ``False`` when no value is missing. Otherwise a frame with one column
        per column of ``df`` and the rows ``Total`` (number of missing
        values), ``Percent`` (share of missing values, 0-100) and ``Types``
        (dtype name).
    """
    null_mask = df.isnull()
    total = null_mask.sum()
    if not total.any():
        return False

    # Divide first, then scale: sum / (count * 100) would be 10,000 times
    # too small.
    percent = total / null_mask.count() * 100
    output = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    output['Types'] = [str(df[col].dtype) for col in df.columns]
    return np.transpose(output)

# Alternative version: always returns the summary table.
def missing_data(data):
    """Return per-column missing-value statistics of ``data``.

    The result has one column per column of ``data`` and three rows:
    ``Total`` (missing count), ``Percent`` (missing share, 0-100) and
    ``Types`` (dtype name).
    """
    null_mask = data.isnull()
    summary = pd.concat(
        [null_mask.sum(), null_mask.sum() / null_mask.count() * 100],
        axis=1,
        keys=['Total', 'Percent'],
    )
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return np.transpose(summary)

# Check both frames for missing values; each call returns False when there are
# none. In a notebook only the last result (test) is displayed.
check_missing_data(train)
check_missing_data(test)

3-3-2 二值化

# Distinct values taken by the target column (notebook display).
train['target'].unique()

3-3-3 数据平衡

# Number of rows per target value (notebook display).
train['target'].value_counts()

def check_balance(df,target):
    """Print the share of rows belonging to class 0 and class 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the label column.
    target : str
        Name of the binary label column.

    Prints the number of rows, then for each of the classes 0 and 1 its
    percentage of all rows. A class that does not occur is reported as 0 %
    rather than raising ``KeyError``; for an empty frame only the size line
    is printed.
    """
    # written by MJ Bahmani for binary target
    n_rows = df.shape[0]
    print('size of data is:',n_rows )
    if n_rows == 0:
        # No rows: percentages are undefined.
        return

    counts = df[target].value_counts()
    for i in [0,1]:
        print('for target  {} ='.format(i))
        print(counts.get(i, 0)/n_rows*100,'%')

3-3-4 偏度和峰度

# Skewness and kurtosis of the target column, printed with six decimals.
target_column = train['target']
print(f"Skewness: {target_column.skew():f}")
print(f"Kurtosis: {target_column.kurt():f}")

4 特征工程

4-1 Permutation Importance

1.哪些特征对预测影响最大？
2.如何从模型中提取信息？

# Feature matrix: every training column except the label and the row identifier.
cols=["target","ID_code"]
X = train.drop(cols,axis=1)
y = train["target"]

# Test features: the test frame without its identifier column.
X_test  = test.drop("ID_code",axis=1)

# Hold out a validation split (default test size, fixed seed) and fit a random
# forest on the training part; rfc_model and val_X/val_y are used by the
# importance and SHAP cells below.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
rfc_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)

4-2 如何计算和显示重要性

import eli5
from eli5.sklearn import PermutationImportance

# Fit permutation importance for the random forest on the validation split.
perm = PermutationImportance(rfc_model, random_state=1).fit(val_X, val_y)

# Display the weights of up to the top 150 features, labelled with the
# validation column names.
eli5.show_weights(perm, feature_names = val_X.columns.tolist(), top=150)

4-4 Partial Dependence Plots

特征重要性显示的是哪些变量对预测的影响最大，而 Partial Dependence Plots（部分依赖图）显示的是某个特征如何影响预测。部分依赖图是在模型拟合之后计算的。

# Same split as before (same seed); fit a shallow decision tree (depth <= 5)
# that is small enough to draw.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
tree_model = DecisionTreeClassifier(random_state=0, max_depth=5, min_samples_split=5).fit(train_X, train_y)

# Feature names: every training column except the identifier and the label.
features = [c for c in train.columns if c not in ['ID_code', 'target']]
from sklearn import tree
import graphviz
# Export the fitted tree as DOT source (out_file=None returns a string) and
# render it in the notebook.
tree_graph = tree.export_graphviz(tree_model, out_file=None, feature_names=features)
graphviz.Source(tree_graph)

4-5 Partial Dependence Plot

from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

# Create the data that we will plot: partial dependence of the decision tree
# on var_81, evaluated on the validation split.
pdp_goals = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=features, feature='var_81')

# plot it
pdp.pdp_plot(pdp_goals, 'var_81')
plt.show()

4-6 Chart analysis

# Create the data that we will plot: same partial-dependence computation as
# the previous cell, this time for var_82.
pdp_goals = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=features, feature='var_82')

# plot it
pdp.pdp_plot(pdp_goals, 'var_82')
plt.show()

4-7 SHAP Values

# Pick one validation row to explain.
row_to_show = 5
data_for_prediction = val_X.iloc[row_to_show]  # use 1 row of data here. Could use multiple rows if desired
# Reshape to a (1, n_features) array, the 2-D input predict_proba expects.
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)

# Class probabilities of the random forest for that row (result is discarded).
rfc_model.predict_proba(data_for_prediction_array);

import shap  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.TreeExplainer(rfc_model)

# Calculate Shap values
shap_values = explainer.shap_values(data_for_prediction)

# Load the JS visualisation code, then draw the force plot using index 1 of
# the expected values and SHAP values (presumably the positive class — confirm
# against the installed shap version's return layout).
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)

4-8 特征之间的相关性

%%time
correlations = train_df[features].corr().abs().unstack().sort_values(kind="quicksort").reset_index()
correlations = correlations[correlations['level_0'] != correlations['level_1']]
correlations.head(10)

5 模型

准备：Augment

def augment(x,y,t=2):
    """Augment a binary data set by shuffling each column within a class.

    For the positive class (``y > 0``) ``t`` shuffled copies are made, for
    the negative class (``y == 0``) ``t // 2`` copies. In each copy every
    column is permuted independently among the rows of that class, using the
    global ``np.random`` state.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Feature matrix; converted with ``np.asarray``.
    y : array-like, shape (n_samples,)
        Binary labels.
    t : int, default 2
        Number of shuffled copies of the positive rows. Values below 2
        (including 0) are supported and simply yield fewer or no copies.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The original rows followed by the positive copies and then the
        negative copies, with the matching labels.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    def shuffled_columns(rows):
        # `rows` comes from boolean indexing and is therefore already a copy.
        ids = np.arange(rows.shape[0])
        for c in range(rows.shape[1]):
            np.random.shuffle(ids)
            # Index the single column directly instead of materialising a
            # fully permuted matrix for every column.
            rows[:, c] = rows[ids, c]
        return rows

    xs = [shuffled_columns(x[y > 0]) for _ in range(t)]
    xn = [shuffled_columns(x[y == 0]) for _ in range(t // 2)]

    # Stacking the pieces as one list keeps this valid when xs or xn is
    # empty; np.vstack on an empty list raises ValueError.
    ys = [np.ones(block.shape[0]) for block in xs]
    yn = [np.zeros(block.shape[0]) for block in xn]
    x = np.vstack([x] + xs + xn)
    y = np.concatenate([y] + ys + yn)
    return x,y

5-1 lightgbm

# LightGBM training parameters passed to lgb.train below: binary objective
# evaluated with AUC, a small learning rate, and heavy feature/row subsampling.
# NOTE(review): 'boost_from_average' is given as the string "false", not a
# bool — confirm LightGBM parses it as intended.
params = {'objective' : "binary", 
               'boost':"gbdt",
               'metric':"auc",
               'boost_from_average':"false",
               'num_threads':8,
               'learning_rate' : 0.01,
               'num_leaves' : 13,
               'max_depth':-1,
               'tree_learner' : "serial",
               'feature_fraction' : 0.05,
               'bagging_freq' : 5,
               'bagging_fraction' : 0.4,
               'min_data_in_leaf' : 80,
               'min_sum_hessian_in_leaf' : 10.0,
               'verbosity' : 1}

%%time
y_pred_lgb = np.zeros(len(X_test))
num_round = 1000000
for fold_n, (train_index, valid_index) in enumerate(folds.split(X,y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)
        
    lgb_model = lgb.train(params,train_data,num_round,#change 20 to 2000
                    valid_sets = [train_data, valid_data],verbose_eval=1000,early_stopping_rounds = 3500)##change 10 to 200
            
    y_pred_lgb += lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)/5

5-2 RandomForestClassifier

# Re-creates the same split and random forest as the Permutation Importance
# cell (identical arguments and seeds), then predicts the test set.
# NOTE(review): predict() yields hard class labels, not probabilities — verify
# this is what the submission cell should average.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
rfc_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)
y_pred_rfc = rfc_model.predict(X_test)

5-3 DecisionTreeClassifier

# Re-creates the same split and shallow decision tree as the Partial
# Dependence cell (identical arguments and seeds), then predicts the test set
# with hard class labels.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
tree_model = DecisionTreeClassifier(random_state=0, max_depth=5, min_samples_split=5).fit(train_X, train_y)
y_pred_tree = tree_model.predict(X_test)

5-4 Logistic Regression

def augment(x,y,t=2):
    """Augment a binary data set by shuffling each column within a class.

    Memory-conscious variant that also prints size diagnostics. For the
    positive class (``y > 0``) ``t`` shuffled copies are made, for the
    negative class (``y == 0``) ``t // 2`` copies; in each copy every column
    is permuted independently among the rows of that class, using the global
    ``np.random`` state.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Feature matrix.
    y : array-like, shape (n_samples,)
        Binary labels.
    t : int, default 2
        Number of shuffled copies of the positive rows. With ``t == 0`` the
        inputs are returned unchanged; ``t == 1`` yields positive copies only.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The original rows followed by the positive copies and then the
        negative copies, with the matching labels.
    """
    if t==0:
        return x, y

    x = np.asarray(x)
    y = np.asarray(y)

    def shuffled_copies(mask, n_copies):
        copies = []
        for _ in range(n_copies):
            x1 = x[mask]  # boolean indexing already returns a copy
            ids = np.arange(x1.shape[0])
            for c in range(x1.shape[1]):
                np.random.shuffle(ids)
                x1[:,c] = x1[ids, c]
            copies.append(x1)
            del x1
            gc.collect()
        return copies

    def stacked(blocks):
        # np.vstack rejects an empty list (the case for xn when t == 1), so
        # fall back to an array with zero rows.
        if blocks:
            return np.vstack(blocks)
        return np.empty((0, x.shape[1]), dtype=x.dtype)

    xs = shuffled_copies(y>0, t)
    xn = shuffled_copies(y==0, t//2)

    # xs and xn are still lists here, so sys.getsizeof reports only the list
    # objects themselves, not the arrays they hold.
    print("The sizes of x, xn, and xs are {}, {}, {}, respectively.".format(sys.getsizeof(x),
                                                                            sys.getsizeof(xn),
                                                                            sys.getsizeof(xs)
                                                                           )
         )

    xs = stacked(xs)
    xn = stacked(xn)

    print("The sizes of x, xn, and xs are {}, {}, {}, respectively.".format(sys.getsizeof(x)/1024**3,
                                                                            sys.getsizeof(xn),
                                                                            sys.getsizeof(xs)
                                                                           )
         )

    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])

    y = np.concatenate([y,ys,yn])
    print("The sizes of y, yn, and ys are {}, {}, {}, respectively.".format(sys.getsizeof(y),
                                                                            sys.getsizeof(yn),
                                                                            sys.getsizeof(ys)
                                                                           )
         )

    gc.collect()

    return np.vstack([x,xs, xn]), y

# Cross-validated logistic regression with optional shuffle augmentation.
# The imports and accumulators below were missing from this cell, so it
# stopped with NameError on first use.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

NFOLDS = folds.get_n_splits()
NAUGMENTATIONS=1#5
NSHUFFLES=0#2  # turning off the augmentation by shuffling since it did not help

# Plain arrays allow the positional fold indexing used below; X[trn_, :] is
# not valid on a DataFrame.
X_values, y_values = X.values, y.values
roc_cv = []                                  # per-fold validation AUC
oof_preds = np.zeros((len(X_values), 1))     # out-of-fold predictions
test_preds = np.zeros((len(X_test), 1))      # summed test predictions

for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    print("Current Fold: {}".format(fold_))
    trn_x, trn_y = X_values[trn_, :], y_values[trn_]
    val_x, val_y = X_values[val_, :], y_values[val_]
    # Same column naming as the training frame built below
    # (presumably matching X_test's var_* columns — confirm).
    val_frame = pd.DataFrame(val_x).add_prefix('var_')

    val_pred, test_fold_pred = 0, 0
    for i in range(NAUGMENTATIONS):

        print("\nFold {}, Augmentation {}".format(fold_, i+1))

        trn_aug_x, trn_aug_y = augment(trn_x, trn_y, NSHUFFLES)
        trn_aug_x = pd.DataFrame(trn_aug_x)
        trn_aug_x = trn_aug_x.add_prefix('var_')

        clf = Pipeline([
            ('lr_clf', LogisticRegression(solver='lbfgs', max_iter=1500, C=10))
        ])

        clf.fit(trn_aug_x, trn_aug_y)

        print("Making predictions for the validation data")
        val_pred += clf.predict_proba(val_frame)[:,1]

        print("Making predictions for the test data")
        test_fold_pred += clf.predict_proba(X_test)[:,1]

    # Average over the augmentation rounds.
    val_pred /= NAUGMENTATIONS
    test_fold_pred /= NAUGMENTATIONS

    roc_cv.append(roc_auc_score(val_y, val_pred))

    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    oof_preds[val_, :] = val_pred.reshape((-1, 1))
    test_preds += test_fold_pred.reshape((-1, 1))

# Average the test predictions over the folds and summarise the CV scores.
test_preds /= NFOLDS
roc_score_1 = round(roc_auc_score(y_values, oof_preds.ravel()), 5)
roc_score = round(sum(roc_cv)/len(roc_cv), 5)
st_dev = round(np.array(roc_cv).std(), 5)

print("Average of the folds' AUCs = {}".format(roc_score))
print("Combined folds' AUC = {}".format(roc_score_1))
print("The standard deviation = {}".format(st_dev))

6 提交

# Build the submission frame (test IDs plus the mean of two prediction
# vectors) and write it to CSV without the index.
# NOTE(review): `y_pred_cat` is not assigned anywhere in the visible notebook
# (no CatBoost model is trained), so this cell raises NameError as written.
# Also, y_pred_rfc holds hard class labels from predict() — verify the
# intended inputs.
submission_rfc_cat = pd.DataFrame({
        "ID_code": test["ID_code"],
        "target": (y_pred_rfc +y_pred_cat)/2
    })
submission_rfc_cat.to_csv('submission_rfc_cat.csv', index=False)

数据挖掘比赛模板

Santander ML Explainability

1 准备步骤

1-1.导包

1-2 设置

1-3 版本

2 问题描述

3 EDA

3-1 数据采集

减小内存

3-1-1 数据集字段

3-1-2 数值描述

3-2可视化

3-2-1 直方图

3-2-2 平均频率

3-2-3 countplot

3-2-4 hist

3-2-5 distplot

3-2-6 散点图

3-3 数据预处理

3-3-1 缺失值检查

3-3-2 二值化

3-3-3 数据平衡

3-3-4 偏度和峰度

4 特征工程

4-1 Permutation Importance

4-2 如何计算和显示重要性

4-4 Partial Dependence Plots

4-5 Partial Dependence Plot

4-6 Chart analysis

4-7 SHAP Values

4-8 特征之间的相关性

5 模型

准备：Augment

5-1 lightgbm

5-2 RandomForestClassifier

5-3 DecisionTreeClassifier

5-4 Logistic Regression

6 提交

猜你喜欢