Feature Engineering and Baseline Model

Feature engineering

Overview:

Data and features determine the upper limit of machine learning, and models and algorithms only approach this upper limit.

Feature engineering is essentially an engineering activity whose purpose is to extract as much useful information as possible from raw data, in the form of features that algorithms and models can consume.

It is necessary to think about whether the features you construct are reasonable and whether they are sufficient to express the problem to be solved.

Methods:

  1. Based on data type (categorical and numerical columns call for different treatments; a small sketch follows this list)
  2. Based on multiple analysis perspectives (e.g., per-file frequency counts and per-file numerical statistics, as in the baseline below)
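
As a small illustration of the first point, a minimal sketch on toy data (the values are made up; the column names mirror the dataset used below): categorical columns get encodings, numerical columns get aggregate statistics, which is exactly what the baseline below does per file_id.

import pandas as pd

# Hypothetical toy frame: 'api' is categorical, 'index' is numerical
df = pd.DataFrame({'api': ['open', 'read', 'open'],
                   'index': [1, 5, 9]})
cat_fea = pd.get_dummies(df['api'], prefix='api')   # one-hot encode the categorical column
num_fea = df['index'].agg(['mean', 'std', 'max'])   # statistics for the numerical column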

Feature selection:

  1. Filter (score each feature independently of any model)
  2. Wrapper (search over feature subsets with a model in the loop)
  3. Embedded (selection happens as part of model training; a minimal sketch of all three follows)
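
A minimal sketch of the three families using scikit-learn on synthetic data (this is an illustration, not part of the baseline):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=20, random_state=0)

# 1. Filter: score each feature independently of any model
X_filter = SelectKBest(f_classif, k=10).fit_transform(X, y)

# 2. Wrapper: recursive feature elimination with a model in the loop
X_wrapper = RFE(LogisticRegression(max_iter=1000), n_features_to_select=10).fit_transform(X, y)

# 3. Embedded: importances fall out of model training itself
rf = RandomForestClassifier(random_state=0).fit(X, y)
importances = rf.feature_importances_  # keep the highest-scoring features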

Constructing an offline validation set

Three aspects need to be considered when constructing the offline validation set:

  1. Leakage across the split (the validation data must not contain information that would be unavailable at prediction time)
  2. Feature differences between the training set and the test set
  3. Distribution differences between the training set and the test set (an adversarial-validation sketch follows this list)
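
For point 3, a common check is adversarial validation: train a classifier to distinguish training rows from test rows; an AUC near 0.5 means the two distributions look alike, so a random offline split is trustworthy. A minimal sketch, assuming feature DataFrames like the ones built below (the helper name adversarial_auc is ours):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def adversarial_auc(train_df, test_df, feature_cols):
    # Label train rows 0 and test rows 1, then measure how separable they are
    X = pd.concat([train_df[feature_cols], test_df[feature_cols]], axis=0)
    y = np.r_[np.zeros(len(train_df)), np.ones(len(test_df))]
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    return cross_val_score(clf, X, y, cv=3, scoring='roc_auc').mean()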

Baseline model

# -*- coding: utf-8 -*-
'''
@Time    : 2021/1/29 18:21
@Author  : Hjh
@File    : baseline.py
'''

# Import required packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb

# Suppress warning messages
import warnings

warnings.filterwarnings("ignore")

# When calling matplotlib.pyplot's plot() or creating a figure canvas,
# render the image directly in the Python console / notebook output.
# %matplotlib inline

# Load the data
path = './dataset/'
train = pd.read_csv(path + 'train.csv')  # training set
test = pd.read_csv(path + 'test.csv')  # test set
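
# The raw logs are in long format: one row per API call. The columns used
# below are file_id, api, tid, index, plus label in the training set only.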

# Feature engineering
# Use count() and nunique() to build features reflecting how frequently each
# sample (file_id) calls api / tid / index and how many distinct values appear.
def simple_sts_features(df):
    simple_fea = pd.DataFrame()
    simple_fea['file_id'] = df['file_id'].unique()
    simple_fea = simple_fea.sort_values('file_id')

    df_grp = df.groupby('file_id')
    simple_fea['file_id_api_count'] = df_grp['api'].count().values
    simple_fea['file_id_api_nunique'] = df_grp['api'].nunique().values

    simple_fea['file_id_tid_count'] = df_grp['tid'].count().values
    simple_fea['file_id_tid_nunique'] = df_grp['tid'].nunique().values

    simple_fea['file_id_index_count'] = df_grp['index'].count().values
    simple_fea['file_id_index_nunique'] = df_grp['index'].nunique().values

    return simple_fea
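
# Quick illustration on a toy frame (hypothetical values): the output has one
# row per file_id, e.g. file 1 below yields api_count=2 and api_nunique=1.
# toy = pd.DataFrame({'file_id': [1, 1, 2], 'api': ['open', 'open', 'read'],
#                     'tid': [10, 11, 10], 'index': [0, 1, 0]})
# simple_sts_features(toy)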

# Use mean(), min(), std() and max() to build features: tid and index can be
# treated as numerical, so extract the corresponding statistics per file_id.
def simple_numerical_sts_features(df):
    simple_numerical_fea = pd.DataFrame()
    simple_numerical_fea['file_id'] = df['file_id'].unique()
    simple_numerical_fea = simple_numerical_fea.sort_values('file_id')

    df_grp = df.groupby('file_id')

    simple_numerical_fea['file_id_tid_mean'] = df_grp['tid'].mean().values
    simple_numerical_fea['file_id_tid_min'] = df_grp['tid'].min().values
    simple_numerical_fea['file_id_tid_std'] = df_grp['tid'].std().values
    simple_numerical_fea['file_id_tid_max'] = df_grp['tid'].max().values

    simple_numerical_fea['file_id_index_mean'] = df_grp['index'].mean().values
    simple_numerical_fea['file_id_index_min'] = df_grp['index'].min().values
    simple_numerical_fea['file_id_index_std'] = df_grp['index'].std().values
    simple_numerical_fea['file_id_index_max'] = df_grp['index'].max().values

    return simple_numerical_fea

# Apply the feature-generation functions to the training and test sets
# (1) frequency and uniqueness of api, tid and index per sample
simple_train_fea1 = simple_sts_features(train)
simple_test_fea1 = simple_sts_features(test)
# (2) statistics of the numerical features tid and index
simple_train_fea2 = simple_numerical_sts_features(train)
simple_test_fea2 = simple_numerical_sts_features(test)

# Baseline construction
# Extract the labels: one (file_id, label) row per file
train_label = train[['file_id', 'label']].drop_duplicates(subset=['file_id', 'label'], keep='first')
test_submit = test[['file_id']].drop_duplicates(subset=['file_id'], keep='first')
# Build the training and test tables
train_data = train_label.merge(simple_train_fea1, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea2, on='file_id', how='left')

test_submit = test_submit.merge(simple_test_fea1, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea2, on='file_id', how='left')
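
# Quick sanity check: after the left merges there should be one row per
# file_id. Note the *_std features are NaN for files with a single record;
# LightGBM handles missing values natively, so they can stay as-is.
print(train_data.shape, test_submit.shape)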


# Model evaluation function: a custom multi-class logloss used as LightGBM's feval
def lgb_logloss(preds, data):
    labels_ = data.get_label()
    classes_ = np.unique(labels_)
    # LightGBM passes multi-class predictions as one flat array grouped by
    # class: all samples' class-0 scores first, then class 1, and so on
    preds_prob = []
    for i in range(len(classes_)):
        preds_prob.append(preds[i * len(labels_):(i + 1) * len(labels_)])

    preds_prob_ = np.vstack(preds_prob)

    loss = []
    for i in range(preds_prob_.shape[1]):  # number of samples
        sum_ = 0
        for j in range(preds_prob_.shape[0]):  # number of classes
            pred = preds_prob_[j, i]  # predicted probability that sample i belongs to class j
            if j == labels_[i]:
                sum_ += np.log(pred)
            else:
                sum_ += np.log(1 - pred)
        loss.append(sum_)
    # feval contract: (eval_name, eval_result, is_higher_better)
    return 'loss', -1 * (np.sum(loss) / preds_prob_.shape[1]), False
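
# Toy check of lgb_logloss (hypothetical numbers): 2 classes, 3 samples,
# with a tiny stand-in object exposing get_label() like lgb.Dataset does.
class _FakeData:
    def get_label(self):
        return np.array([0, 1, 0])

_toy_preds = np.array([0.9, 0.2, 0.8,   # class-0 probabilities for the 3 samples
                       0.1, 0.8, 0.2])  # class-1 probabilities for the 3 samples
# lgb_logloss(_toy_preds, _FakeData()) -> ('loss', small positive value, False)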


# Model validation: since the data are not strongly time-dependent, use standard
# 5-fold cross-validation to build the offline validation set
train_features = [col for col in train_data.columns if col not in ['label', 'file_id']]
label_col = 'label'  # target column name (avoid shadowing the train_label frame above)

import time
from sklearn.model_selection import KFold

start1 = time.perf_counter()  # start the training timer

params = {
    'task': 'train',
    'num_leaves': 255,          # maximum number of leaves per tree
    'objective': 'multiclass',
    'num_class': 8,             # eight target classes
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,   # column subsampling ratio per tree
    'bagging_fraction': 0.85,   # row subsampling ratio
    'bagging_freq': 5,          # re-sample rows every 5 iterations
    'max_bin': 128,
    'random_state': 100
}

folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros((len(train_data), params['num_class']))  # out-of-fold probability matrix

models = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features], label=train_data.iloc[trn_idx][label_col].values)
    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features], label=train_data.iloc[val_idx][label_col].values)

    # verbose_eval / early_stopping_rounds follow the LightGBM 3.x train() API
    clf = lgb.train(params, trn_data, num_boost_round=2000, valid_sets=[trn_data, val_data], verbose_eval=50,
                    early_stopping_rounds=100, feval=lgb_logloss)
    oof[val_idx] = clf.predict(train_data.iloc[val_idx][train_features])  # collect out-of-fold predictions
    models.append(clf)

print("final is in : %s Seconds " % (end1-start1))

# Feature importance analysis (importances come from the last fold's model)
feature_importance = pd.DataFrame()
feature_importance['fea_name'] = train_features
feature_importance['fea_imp'] = clf.feature_importance()
feature_importance = feature_importance.sort_values('fea_imp', ascending=False)
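
# Visualize the top features with the seaborn/matplotlib imports above
# (a simple sketch; the column names are the ones created just above)
plt.figure(figsize=(8, 6))
sns.barplot(x='fea_imp', y='fea_name', data=feature_importance.head(20))
plt.title('LightGBM feature importance (top 20)')
plt.tight_layout()
plt.show()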

# Test-set prediction: average the probabilities of the 5 fold models
pred_res = 0
n_fold = 5
for model in models:
    pred_res += model.predict(test_submit[train_features]) * 1.0 / n_fold

prob_cols = ['prob%d' % i for i in range(8)]  # one probability column per class
for col in prob_cols:
    test_submit[col] = 0.0
test_submit[prob_cols] = pred_res
test_submit[['file_id'] + prob_cols].to_csv('baseline.csv', index=None)
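
Because the cross-validation loop above also stores out-of-fold predictions, an offline score can be computed without submitting; a minimal sketch using scikit-learn's standard multi-class log loss:

from sklearn.metrics import log_loss

print('OOF multiclass logloss: %.5f' % log_loss(train_data['label'], oof))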
