自动特征工具包featuretools的使用方法

# -*- coding: utf-8 -*-
"""
@ModuleName:feature_tools
@Function: 
@Author: H2017824
@Time: 2019/11/8 下午 02:45
"""
# -*- coding: utf-8 -*-
"""
@ModuleName:entity_test2
@Function:
@Author: H2017824
@Time: 2019/11/8 下午 02:45
"""

import os

import featuretools as ft
import pandas as pd
from sqlalchemy import create_engine

def next_year_week(year_week):
    """Return the 'YYYY_WW' label of the week that follows *year_week*.

    Weeks below 52 advance by one and are zero-padded to two digits; week 52
    (or anything higher) rolls over to week '01' of the following year.

    NOTE(review): years with 53 weeks are not handled -- a '_53' label is
    never produced. Confirm this matches the week convention of the tables.
    """
    year, week = (int(part) for part in year_week.split('_'))
    if week < 52:
        week += 1
    else:
        year, week = year + 1, 1
    return '{}_{:02d}'.format(year, week)


# '(' and '.' become '_', ')' is dropped.
_COLUMN_NAME_TABLE = str.maketrans({'(': '_', '.': '_', ')': None})


def sanitize_column_name(name):
    """Return feature name *name* with '(', ')' and '.' replaced or removed.

    Generated feature names such as 'MEAN(leave.plan_leave_hours)' become
    'MEAN_leave_plan_leave_hours'.
    """
    return name.translate(_COLUMN_NAME_TABLE)


if __name__ == '__main__':
    # 'postgresql://' is the dialect name SQLAlchemy accepts on every version;
    # the old 'postgres://' alias is rejected by SQLAlchemy 1.4 and later.
    # NOTE(review): the credential/host part of this URL looks redacted
    # ('[email protected]') -- supply the real values, ideally from configuration.
    conn = create_engine('postgresql://pmart_per:[email protected]:5432/personnel')
    model_type = 'N_Y_ZH'  # N_Y_Q  N_Y_ZH
    year_week = '2019_33'
    end_year_week = '2019_34'  # exclusive upper bound of the weeks processed
    # Lower bound of the history window; it is the same for every iteration.
    before_year_week = '2019_30'

    # The query templates do not change between iterations, so build them once.
    sql_list_template = """select * from pmart_per.DM_ZZK_PERSON_LIST_W
    where year_week ='{year_week}'
    and model_type='{model_type}'
    AND EMP_STATE!='diaochu'
    """
    sql_leave_template = """select stat_week,emp_no,leave_type,plan_leave_hours,actual_leave_hours,actual_leave_times
    from pdata_per.dw_zzk_hrm_leave_info_w
    where '{before_year_week}'<= stat_week
    and stat_week <='{year_week}'
    """
    sql_attendance_template = """select stat_week,emp_no,ovhours,msc,mxc,mac,attflag,holidays,yc
    from pdata_per.dw_zzk_hrm_attendance_report_w
    where '{before_year_week}'<= stat_week
    and stat_week <='{year_week}'
    """
    sql_absent_template = """select stat_week,emp_no,absent_hours,absent_times,dept_name
    from pdata_per.dw_zzk_hrm_emp_absent_info_w
    where '{before_year_week}'<= stat_week
    and stat_week <='{year_week}'
    union all
    select 'gdgd','hhhhf',8.0,1,'hhhhf'
    """

    # to_csv does not create missing directories, so make sure 'data/' exists.
    os.makedirs('data', exist_ok=True)

    # Labels are zero-padded 'YYYY_WW', so string order equals chronological
    # order; '<' (instead of '!=') guarantees the loop stops even if the label
    # ever steps past the end value.
    while year_week < end_year_week:
        print(year_week)
        sql_list = sql_list_template.format(year_week=year_week, model_type=model_type)
        sql_leave = sql_leave_template.format(before_year_week=before_year_week,
                                              year_week=year_week)
        sql_attendance = sql_attendance_template.format(before_year_week=before_year_week,
                                                        year_week=year_week)
        sql_absent = sql_absent_template.format(before_year_week=before_year_week,
                                                year_week=year_week)

        list_df = pd.read_sql(sql_list, conn)
        leave_df = pd.read_sql(sql_leave, conn)
        attendance_df = pd.read_sql(sql_attendance, conn)
        absent_df = pd.read_sql(sql_absent, conn)

        # Step 1: create an empty entity set.
        # NOTE: entity_from_dataframe / variable_types / target_entity belong to
        # the featuretools API before 1.0; this script targets that API.
        es = ft.EntitySet(id='list')
        print('空實體集:\n', es)

        # Step 2: add the 'list' entity plus the 'leave', 'attendance' and
        # 'absent' entities. The three child tables get a generated index.
        es = es.entity_from_dataframe(entity_id='list', dataframe=list_df,
                                      index='emp_no', time_index='stat_date')
        es = es.entity_from_dataframe(entity_id='leave', dataframe=leave_df,
                                      variable_types={'leave_type': ft.variable_types.Categorical},
                                      make_index=True, index='leave_df_id')
        es = es.entity_from_dataframe(entity_id='attendance', dataframe=attendance_df,
                                      variable_types={'attflag': ft.variable_types.Categorical,
                                                      'holidays': ft.variable_types.Categorical},
                                      make_index=True, index='attendance_df_id')
        es = es.entity_from_dataframe(entity_id='absent', dataframe=absent_df,
                                      variable_types={'dept_name': ft.variable_types.Categorical},
                                      make_index=True, index='absent_df_id')
        print('添加三個實體的實體集:\n', es)

        # Step 3: relate every child entity to 'list' through emp_no.
        for child_entity in ('leave', 'attendance', 'absent'):
            es = es.add_relationship(
                ft.Relationship(es['list']['emp_no'], es[child_entity]['emp_no']))
        print('添加上三個實體關係后的實體集:\n', es)

        # Step 4: run deep feature synthesis with 'list' as the target entity.
        features, feature_names = ft.dfs(entityset=es,
                                         target_entity='list',
                                         agg_primitives=['mean', 'max', 'percent_true', 'last'],
                                         max_depth=2)
        print('特征工程輸出的內容:\n', features.head())
        print('特征工程輸出的數據的維度:\n', features.shape)

        # Step 5: rename columns containing '(', ')' or '.' so the result could
        # also be uploaded to a database table (e.g. with features.to_sql).
        features.columns = [sanitize_column_name(column) for column in features.columns]
        features.to_csv('data/{year_week}_{model_type}.csv'.format(year_week=year_week,model_type=model_type), encoding='utf-8-sig')

        year_week = next_year_week(year_week)

    # 自定義基元
    # from featuretools.primitives import make_agg_primitive, make_trans_primitive
    # from featuretools.variable_types import Text, Numeric
    #
    #
    # def absolute(column):
    #     return abs(column)
    #
    #
    # absolute_re = make_trans_primitive(function=absolute,
    #                                    input_types=[Numeric],
    #                                    return_type=Numeric)
    #
    #
    # def maximum(column):
    #     return max(column)
    #
    #
    # maximum_re = make_agg_primitive(function=maximum,
    #                                 input_types=[Numeric],
    #                                 return_type=Numeric)
    #
    # features, feature_names = ft.dfs(entityset=es, target_entity='list', agg_primitives=[maximum_re],
    #                                  trans_primitives=[absolute_re], max_depth=2)

发布了197 篇原创文章 · 获赞 35 · 访问量 12万+

猜你喜欢

转载自blog.csdn.net/PoGeN1/article/details/104279819
今日推荐