Notes from a round of feature engineering in Kaggle-HomeRisk (Home Credit Default Risk)

1. Remove features with only a single value, which contribute nothing to classification

# Remove "empty" features: columns with at most one unique non-NaN value
# (constant columns and all-NaN columns), which carry no information for the classifier
nun = data.nunique()
empty = list(nun[nun <= 1].index)

data.drop(empty, axis=1, inplace=True)
print('After removing empty features there are {0:d} features'.format(data.shape[1]))
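
As a minimal toy illustration (the column names below are made up), this filter catches both constant columns and all-NaN columns, since nunique() ignores NaN by default:

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'constant': [1, 1, 1],      # single unique value -> dropped
    'all_nan': [np.nan] * 3,    # nunique() == 0      -> dropped
    'useful': [1, 2, 3],        # kept
})
nun = toy.nunique()
print(list(nun[nun <= 1].index))  # ['constant', 'all_nan']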

2. Remove features whose distribution is the same for TARGET = 0 and TARGET = 1

import pandas as pd

# Remove features whose distribution is the same for the 0 and 1 classes
corr = pd.DataFrame(index=['diff', 'p'])
ind = data[data['TARGET'].notnull()].index  # train rows only (TARGET is NaN on test rows)

for c in data.columns.drop('TARGET'):
    corr[c] = corr_feature_with_target(data.loc[ind, c], data.loc[ind, 'TARGET'])

corr = corr.T
# Normalise the raw class difference by the feature's overall mean, so one threshold works across features
corr['diff_norm'] = abs(corr['diff'] / data.mean(axis=0))

# Drop features with zero (or small) class difference whose rank-sum test is not significant
to_del_1 = corr[((corr['diff'] == 0) & (corr['p'] > .05))].index
to_del_2 = corr[((corr['diff_norm'] < .5) & (corr['p'] > .05))].drop(to_del_1).index
to_del = list(to_del_1) + list(to_del_2)
if 'SK_ID_CURR' in to_del:
    to_del.remove('SK_ID_CURR')

data.drop(to_del, axis=1, inplace=True)
print('After removing features with the same distribution on 0 and 1 classes there are {0:d} features'.format(
    data.shape[1]))

where corr_feature_with_target() is defined as follows:

from scipy.stats import ranksums

def corr_feature_with_target(feature, target):
    c0 = feature[target == 0].dropna()
    c1 = feature[target == 1].dropna()

    # For binary (0/1) features compare the class means, otherwise compare the class medians
    if set(feature.unique()) == set([0, 1]):
        diff = abs(c0.mean(axis=0) - c1.mean(axis=0))
    else:
        diff = abs(c0.median(axis=0) - c1.median(axis=0))

    # Wilcoxon rank-sum p-value; 2 is a sentinel "no p-value" when either class has fewer than 20 samples
    p = ranksums(c0, c1)[1] if ((len(c0) >= 20) & (len(c1) >= 20)) else 2

    return [diff, p]
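
A quick sanity check of what the helper returns, on made-up data (the two classes below are drawn from shifted normal distributions, so the difference should come out significant):

import numpy as np
import pandas as pd

feature = pd.Series(np.r_[np.random.normal(0, 1, 50), np.random.normal(2, 1, 50)])
target = pd.Series([0] * 50 + [1] * 50)

# diff: absolute difference of the class medians (means for binary 0/1 features)
# p:    Wilcoxon rank-sum p-value, or the sentinel 2 when either class has < 20 samples
diff, p = corr_feature_with_target(feature, target)
print(diff, p)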

3. Keep only features with the same distribution in the train and test sets

There are two approaches I know of so far:

(1) Using the mean/median difference and the p-value

The idea is to reuse corr_feature_with_target() with a pseudo target that marks which set a row comes from: 1 if the row is in the train set (TARGET is not null), 0 if it is in the test set.

# Remove features that do not have the same distribution on the train and test datasets
corr_test = pd.DataFrame(index=['diff', 'p'])
target = data['TARGET'].notnull().astype(int)  # pseudo target: 1 = train row, 0 = test row

for c in data.columns.drop('TARGET'):
    corr_test[c] = corr_feature_with_target(data[c], target)

corr_test = corr_test.T
corr_test['diff_norm'] = abs(corr_test['diff'] / data.mean(axis=0))

# Candidate features whose train/test distributions differ significantly ...
bad_features = corr_test[((corr_test['p'] < .05) & (corr_test['diff_norm'] > 1))].index
# ... and that also showed no class separation on the train set (corr from step 2)
bad_features = corr.loc[bad_features][corr['diff_norm'] == 0].index

data.drop(bad_features, axis=1, inplace=True)
print(
    'After removing features with not the same distribution on train and test datasets there are {0:d} features'.format(
        data.shape[1]))

(2) Using the Kolmogorov–Smirnov (KS) test

from scipy.stats import ks_2samp
from tqdm import tqdm

def get_safe_KS(df, train_idx, test_idx, thr=0.1):
    """Drop columns whose train/test KS statistic exceeds thr (with p < 0.05); return the columns to keep."""

    # Book-keeping
    drop_features = []

    # Go through all columns
    with tqdm(total=len(df.columns)) as pbar:
        for col in df.columns:
            pbar.update(1)

            # Compare the feature's distribution on the train rows vs. the test rows
            statistic, pvalue = ks_2samp(
                df.loc[train_idx, col].dropna().values,
                df.loc[test_idx, col].dropna().values
            )
            if pvalue < 0.05 and statistic > thr:
                drop_features.append(col)
                pbar.set_description(
                    f"Dropping: {col}. KS: {statistic:.3f}. p-value: {pvalue:.3g}. "
                    f"{len(df.columns) - len(drop_features)} features left."
                )

    # Return the columns to keep
    return [c for c in df.columns if c not in drop_features]
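
A usage sketch, assuming data is the combined train + test frame from the earlier steps, so that train rows are exactly those with a non-null TARGET:

train_idx = data[data['TARGET'].notnull()].index
test_idx = data[data['TARGET'].isnull()].index
feature_cols = data.columns.drop('TARGET')

# Keep only the columns whose train/test distributions pass the KS check
keep_cols = get_safe_KS(data[feature_cols], train_idx, test_idx, thr=0.1)
data = data[list(keep_cols) + ['TARGET']]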

4. Drop outlier values

Outliers are usually defined with the interquartile range (IQR): any point more than 1.5 × IQR below Q1 or above Q3 is treated as an outlier.

import numpy as np

outliers = []
# For each feature, find the data points with unusually high or unusually low values
for feature in log_data.keys():

    # Q1: the 25th percentile of the feature
    Q1 = np.percentile(log_data[feature], 25)

    # Q3: the 75th percentile of the feature
    Q3 = np.percentile(log_data[feature], 75)

    # Outlier step: 1.5 times the interquartile range
    step = 1.5 * (Q3 - Q1)

    # Points outside [Q1 - step, Q3 + step] are outliers for this feature
    feature_outliers = log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))].index
    print("Data points considered outliers for the feature '{}': {}".format(feature, list(feature_outliers)))
    outliers.extend(feature_outliers)

# Drop the (de-duplicated) outlier rows and re-index
good_data = log_data.drop(list(set(outliers))).reset_index(drop=True)
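
The same 1.5 × IQR rule can also be applied to all columns at once without the explicit loop; a minimal sketch, assuming log_data is entirely numeric with no missing values:

Q1 = log_data.quantile(0.25)
Q3 = log_data.quantile(0.75)
step = 1.5 * (Q3 - Q1)

# Keep only the rows that lie within [Q1 - step, Q3 + step] for every feature
mask = ((log_data >= Q1 - step) & (log_data <= Q3 + step)).all(axis=1)
good_data = log_data[mask].reset_index(drop=True)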

5. Use a model to remove features that are uninformative about the target

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

# Removing features not interesting for the classifier:
# repeatedly set aside the features LightGBM finds useful (importance > 0) and refit on the
# remaining pool; once the leftover features alone can no longer reach a train AUC of 0.7, drop them.
clf = LGBMClassifier(random_state=0)
train_index = data[data['TARGET'].notnull()].index
train_columns = data.drop('TARGET', axis=1).columns

score = 1
new_columns = []
while score > .7:
    train_columns = train_columns.drop(new_columns)  # remove the columns already set aside as useful
    clf.fit(data.loc[train_index, train_columns], data.loc[train_index, 'TARGET'])
    f_imp = pd.Series(clf.feature_importances_, index=train_columns)
    score = roc_auc_score(data.loc[train_index, 'TARGET'],
                          clf.predict_proba(data.loc[train_index, train_columns])[:, 1])
    new_columns = f_imp[f_imp > 0].index  # features with non-zero importance in this round

data.drop(train_columns, axis=1, inplace=True)
print('After removing features not interesting for classifier there are {0:d} features'.format(data.shape[1]))

Reposted from blog.csdn.net/Gin077/article/details/82385022