import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
# Silence library warnings for cleaner output.
warnings.filterwarnings('ignore')

# Reduce earliesCreditLine to an integer year by taking its last 4 characters
# (assumes every value ends with a 4-digit year, e.g. 'Aug-2001' — TODO confirm
# the column's format). Applied in place to both the train and test frames.
for data in [train, testA]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(
        lambda s: int(s[-4:]))
# Categorical feature processing
cate_features =['grade','subGrade','employmentTitle','homeOwnership','verificationStatus','purpose','postCode','regionCode','applicationType','initialListStatus','title','policyCode']for f in cate_features:print(f,'类型数:', data[f].nunique())
# Outlier handling
# Detection method: 3-sigma rule (flag points farther than 3 standard
# deviations from the mean)
def find_outliers_by_3segama(data, fea):
    """Flag 3-sigma outliers in column *fea* of *data*.

    Adds a column ``<fea>_outliers`` containing '异常值' (outlier) for values
    more than three standard deviations from the column mean, and '正常值'
    (normal) otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the numeric column *fea*; modified in place.
    fea : str
        Name of the numeric column to scan.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the extra flag column added.
    """
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    data[fea + '_outliers'] = data[fea].apply(
        lambda x: '异常值' if x > upper_rule or x < lower_rule else '正常值')
    return data
# Flag 3-sigma outliers for every numeric feature, then show the per-class
# counts and how many defaults (isDefault == 1) fall in each class.
data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3segama(data_train, fea)
    print(data_train[fea + '_outliers'].value_counts())
    print(data_train.groupby(fea + '_outliers')['isDefault'].sum())
    print('*' * 10)
# label-encode: subGrade, postCode, title
# High-cardinality categorical features are integer-encoded. Each encoder is
# fit on the union of train and test values so no test label is unseen at
# transform time.
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    le = LabelEncoder()
    le.fit(list(data_train[col].astype(str).values) +
           list(data_test_a[col].astype(str).values))
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label Encoding 完成')
# Feature selection
# Filter methods: a. variance threshold  b. correlation coefficient (Pearson)
# c. chi-squared test  d. mutual information