lightGBM的categorical_feature(类别特征)使用
http://www.luyixian.cn/news_show_253681.aspx
LightGBM相比XGBoost的1个改进之处在于对类别特征的处理: 不再需要将类别特征转为one-hot形式, 具体可参考LightGBM官方文档中关于类别特征支持(Categorical Feature Support)的说明.
在使用python API时(参考官方文档)
1.可以使用pd.DataFrame存放特征X, 每一列表示1个特征, 并将类别特征列转换为category类型: X[cat_cols] = X[cat_cols].astype('category'). 这样模型在fit时会自动识别类别特征.
2.在模型的fit方法中传入参数categorical_feature, 指明哪些列是类别特征.
3.类别特征的值应编码为非负整数(小于Int32的最大值), 最好是从0开始的连续整数, 比如0,1,2,...; 不能使用负数, 因为负数会被当作缺失值处理.
import lightgbm as lgb
import pandas as pd
import numpy as np
# Feature columns fed to the model. Despite the name, 'sex' is handled as a
# categorical feature further down in the script.
NUMERIC_COLS = ['start_distance', 'price', 'time_diff', 'sex']

# Time-based split: rows logged before `yesterday` go to training, rows logged
# from `yesterday` onwards go to testing.
# NOTE(review): `df` and `yesterday` are defined outside this snippet; the
# comparison is against a formatted string, so 'parsed_log_time' presumably
# holds "%Y-%m-%d %H:%M:%S" strings -- confirm.
split_time = yesterday.strftime("%Y-%m-%d %H:%M:%S")

# .copy() gives every split its own data, so the column assignments made later
# in the script do not trigger pandas' SettingWithCopyWarning.
df_train = df[df['parsed_log_time'] < split_time].drop_duplicates().copy()
df_test = df[df['parsed_log_time'] >= split_time].drop_duplicates().copy()

y_train = df_train['success']  # training label
y_test = df_test['success']  # testing label
X_train = df_train[NUMERIC_COLS].copy()  # training features
X_test = df_test[NUMERIC_COLS].copy()  # testing features
def map_value(x):
    """Return 0 when ``x`` equals -1; otherwise return ``x`` unchanged.

    The script applies this to the 'sex' column to remove the negative code
    before the column is converted to a categorical feature.
    NOTE(review): -1 presumably means "unknown"; mapping it to 0 merges it
    with any rows that already carry the value 0 -- confirm this is intended.
    """
    return 0 if x == -1 else x
# Replace the -1 code in 'sex', then store the column as a pandas categorical
# so that it is picked up as a category feature when the model is fitted.
# Series.map works element-wise on the single column, which avoids building a
# row object for every record just to read one value.
X_train['sex'] = X_train['sex'].map(map_value).astype('category')
X_test['sex'] = X_test['sex'].map(map_value).astype('category')
num_leaf = 128

# Train the GBDT model. (The original note mentions saving the model as PMML;
# no export is performed in this snippet.)
gbdt_params = dict(
    boosting_type='gbdt',
    num_leaves=num_leaf,
    reg_alpha=0.0,
    reg_lambda=1,
    max_depth=7,
    n_estimators=100,
    objective='binary',
    learning_rate=0.06,
    random_state=20,
    n_jobs=4,
)
model = lgb.LGBMClassifier(**gbdt_params)
# 'sex' is declared explicitly as a categorical feature for fitting.
model.fit(X_train, y_train, categorical_feature=['sex'])
# Print each feature column next to the model's importance value for it,
# sorted from the largest importance to the smallest.
importance_table = pd.DataFrame({
    'column': NUMERIC_COLS,
    'importance': model.feature_importances_,
})
print(importance_table.sort_values(by='importance', ascending=False))
# Maps a column name to its number of distinct values; filled in by
# unqiue_element.
A = {}


def unqiue_element(x):
    """Record the number of distinct values of ``x`` in ``A`` under ``x.name``.

    ``x`` must expose a ``name`` attribute (for example a pandas Series handed
    over by ``DataFrame.apply``). Nothing is returned; the count is stored in
    the module-level dict ``A``.
    """
    A[x.name] = len(np.unique(x))
# df[NUMERIC_COLS].apply(unqiue_element)
# print(sorted(A.items(), key=lambda x: x[1], reverse=True))
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Evaluate on the test split: model score, precision, recall, then ROC AUC
# computed from the probability of the second class.
predictions = model.predict(X_test)

accuracy = model.score(X_test, y_test)
print('正确率:', accuracy)

precision = precision_score(y_test, predictions)
print('精确率:', precision)

recall = recall_score(y_test, predictions)
print('召回率:', recall)

positive_proba = model.predict_proba(X_test)[:, 1]
print('auc值:', roc_auc_score(y_test, positive_proba))
# Score the test rows, then rank them inside each
# (driver_id, parsed_log_time) group under two orderings.
group_keys = ['driver_id', 'parsed_log_time']
df_test['p0'] = model.predict_proba(X_test)[:, 1]

# 'rk': rank by predicted probability, highest first (ties share the average).
df_test['rk'] = df_test.groupby(group_keys)['p0'].rank(
    method='average', ascending=False)
# 'orderNew': rank by the existing 'orderNum' column, lowest first.
df_test['orderNew'] = df_test.groupby(group_keys)['orderNum'].rank(
    method='average', ascending=True)

# Mean rank of the rows with success == 1 under each ordering.
succeeded = df_test[df_test['success'] == 1]
print('测试集新排位平均值:', succeeded['rk'].mean())
print('测试集老排位平均值:', succeeded['orderNew'].mean())