1、移除取值单一、对分类没有贡献的特征
# Removing empty features
# A column counts as "empty" when nunique() reports at most one distinct
# value for it (nunique() skips NaN by default), i.e. it cannot separate rows.
nun = data.nunique()
empty = [column for column, n_distinct in nun.items() if n_distinct <= 1]
data.drop(columns=empty, inplace=True)
print('After removing empty features there are {0:d} features'.format(data.shape[1]))
2、移除在TARGET取0和取1两类样本上分布相同的特征
# Removing features with the same distribution on 0 and 1 classes
# One column per feature, holding [diff, p] as returned by
# corr_feature_with_target(); transposed below so features become rows.
corr = pd.DataFrame(index=['diff', 'p'])
# Only rows with a known TARGET can be split into the two classes.
ind = data[data['TARGET'].notnull()].index
for c in data.columns.drop('TARGET'):
    corr[c] = corr_feature_with_target(data.loc[ind, c], data.loc[ind, 'TARGET'])
corr = corr.T
# Scale the class difference by the column mean (taken over all rows of
# `data`, not just `ind`) so features on different scales are comparable.
corr['diff_norm'] = abs(corr['diff'] / data.mean(axis=0))

# Group 1: no difference at all between the classes and not significant.
to_del_1 = corr[(corr['diff'] == 0) & (corr['p'] > .05)].index
# Group 2: small relative difference and not significant, minus group 1.
# errors='ignore' is required: a group-1 feature whose mean is 0 has
# diff_norm == NaN, so it is absent from this selection and a plain
# drop() would raise KeyError.
to_del_2 = (
    corr[(corr['diff_norm'] < .5) & (corr['p'] > .05)]
    .drop(to_del_1, errors='ignore')
    .index
)
to_del = list(to_del_1) + list(to_del_2)
# Never drop the row identifier, whatever its statistics look like.
if 'SK_ID_CURR' in to_del:
    to_del.remove('SK_ID_CURR')
data.drop(to_del, axis=1, inplace=True)
print('After removing features with the same distribution on 0 and 1 classes there are {0:d} features'.format(
    data.shape[1]))
其中corr_feature_with_target()的定义如下:
def corr_feature_with_target(feature, target):
    """Measure how differently ``feature`` is distributed in the two classes.

    Args:
        feature: pandas Series with the feature values; may contain NaN.
        target: Series aligned with ``feature`` whose values 0 and 1 select
            the two groups to compare. Rows with any other value are ignored.

    Returns:
        A two-element list ``[diff, p]``:
        ``diff`` is the absolute difference of the group means when the
        feature is a 0/1 flag, otherwise of the group medians.
        ``p`` is the p-value of ``ranksums`` between the groups, or the
        sentinel ``2`` (outside the valid p-value range) when either group
        has fewer than 20 non-null values.
    """
    c0 = feature[target == 0].dropna()
    c1 = feature[target == 1].dropna()

    # NaN must be dropped before the binary check: unique() reports NaN as a
    # value, so a 0/1 flag with missing entries would otherwise be compared
    # by median, which can only ever be 0, 0.5 or 1.
    if set(feature.dropna().unique()) == {0, 1}:
        diff = abs(c0.mean() - c1.mean())
    else:
        diff = abs(c0.median() - c1.median())

    # Too few observations: skip the test and return the sentinel instead.
    if len(c0) >= 20 and len(c1) >= 20:
        p = ranksums(c0, c1)[1]
    else:
        p = 2
    return [diff, p]
3、只保留训练集和测试集同分布的特征
现在知道的有两种方法
(1)用均值(或中位数)之差和秩和检验的P值
# Removing features with not the same distribution on train and test datasets
corr_test = pd.DataFrame(index=['diff', 'p'])
# Group rows by whether TARGET is present: 1 = has TARGET, 0 = TARGET missing
# (presumably train vs. test rows - confirm against how `data` was built).
target = data['TARGET'].notnull().astype(int)
for c in data.columns.drop('TARGET'):
    corr_test[c] = corr_feature_with_target(data[c], target)
corr_test = corr_test.T
corr_test['diff_norm'] = abs(corr_test['diff'] / data.mean(axis=0))

# Candidates: significantly different between the two groups of rows AND
# the difference is larger than the column mean.
bad_features = corr_test[(corr_test['p'] < .05) & (corr_test['diff_norm'] > 1)].index
# Of those, only drop the ones that `corr` (computed earlier against TARGET)
# says do not differ between classes. The mask is built from the subset
# itself so that it lines up with the rows being filtered; masking the
# subset with the full-length corr['diff_norm'] forces an implicit reindex
# and a pandas UserWarning.
shifted = corr.loc[bad_features]
bad_features = shifted[shifted['diff_norm'] == 0].index

data.drop(bad_features, axis=1, inplace=True)
print(
    'After removing features with not the same distribution on train and test datasets there are {0:d} features'.format(
        data.shape[1]))
(2)通过KS检验
def get_safe_KS(df, thr=0.1):
    """Return the columns of ``df`` that look alike on train and test rows.

    Each column is compared between the rows selected by ``train_idx`` and
    ``test_idx`` with a two-sample Kolmogorov-Smirnov test. A column is
    rejected when the test is significant (p-value < 0.05) and the KS
    statistic exceeds ``thr``.

    Args:
        df: DataFrame holding the features for both train and test rows.
        thr: KS-statistic threshold above which a significant column is
            dropped.

    Returns:
        List of column labels to keep, in the column order of ``df``.
        Empty when ``df`` has no columns.

    Note:
        ``train_idx`` and ``test_idx`` are not parameters; they are read from
        the enclosing (module) scope and must be valid row labels of ``df``.
    """
    # Book-keeping
    drop_features = []
    n_columns = len(df.columns)

    # Go through all columns
    with tqdm(total=n_columns) as pbar:
        for col in df.columns:
            pbar.update(1)
            # Look at distribution in feature
            statistic, pvalue = ks_2samp(
                df.loc[train_idx, col].values,
                df.loc[test_idx, col].values,
            )
            if pvalue < 0.05 and statistic > thr:
                # Count reported before this column is removed.
                features_left = n_columns - len(drop_features)
                pbar.set_description(f"Dropping: {col}. KS: {statistic}. p-value: {pvalue}. {features_left} features left.")
                drop_features.append(col)

    # Build the keep-list once, after every column has been tested, so the
    # last rejected column is excluded as well.
    dropped = set(drop_features)
    return [c for c in df.columns if c not in dropped]
4、丢弃异常点的值
通常情况都是利用四分位间距定义异常数据
# Index labels of every row flagged as an outlier in at least one feature.
outliers = []

# For each feature, find the data points with extremely high or low values.
for feature in log_data.keys():
    # Q1: 25th percentile of the feature.
    Q1 = np.percentile(log_data[feature], 25)
    # Q3: 75th percentile of the feature.
    Q3 = np.percentile(log_data[feature], 75)
    # Outlier step: 1.5 times the interquartile range.
    step = 1.5 * (Q3 - Q1)

    # Rows outside [Q1 - step, Q3 + step]. Written as a negated "inside"
    # test, so NaN values are flagged as outliers too.
    flagged = log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))]

    # Show the outliers.
    print("Data points considered outliers for the feature '{}':".format(feature))
    print(flagged)

    # Keep one flat list of labels; a list of per-feature lists is ragged
    # and cannot be used to select rows.
    outliers.extend(flagged.index)

# A row can be flagged by several features; keep each label once, in
# first-seen order.
outliers = list(dict.fromkeys(outliers))

# `outliers` holds index labels, so drop by label rather than by position.
good_data = log_data.drop(outliers).reset_index(drop=True)
5、通过模型来移除那些与目标值无关的特征
# Removing features not interesting for classifier
# Repeatedly fit a classifier, set aside the features it actually used, and
# refit on the remainder. Once the remainder can no longer reach a training
# AUC above 0.7, whatever is still in `train_columns` is dropped from `data`.
clf = LGBMClassifier(random_state=0)
train_index = data[data['TARGET'].notnull()].index
train_columns = data.drop('TARGET', axis=1).columns
# NOTE(review): 'SK_ID_CURR' is not excluded here, so it can be used as a
# model input and can end up dropped - confirm that is intended.

score = 1
new_columns = []
while score > .7:
    # Remove the features the previous fit found useful (none on first pass).
    train_columns = train_columns.drop(new_columns)
    if train_columns.empty:
        # Every feature was useful in some round; nothing is left to fit on
        # and nothing is left to discard.
        break
    clf.fit(data.loc[train_index, train_columns], data.loc[train_index, 'TARGET'])
    f_imp = pd.Series(clf.feature_importances_, index=train_columns)
    # AUC is measured on the same rows the model was fitted on.
    score = roc_auc_score(data.loc[train_index, 'TARGET'],
                          clf.predict_proba(data.loc[train_index, train_columns])[:, 1])
    new_columns = f_imp[f_imp > 0].index

data.drop(train_columns, axis=1, inplace=True)
print('After removing features not interesting for classifier there are {0:d} features'.format(data.shape[1]))