# -*- coding: utf-8 -*-
"""
@ModuleName:feature_tools
@Function:
@Author: H2017824
@Time: 2019/11/8 下午 02:45
"""
# -*- coding: utf-8 -*-
"""
@ModuleName:entity_test2
@Function:
@Author: H2017824
@Time: 2019/11/8 下午 02:45
"""
from sqlalchemy import create_engine
import pandas as pd
import featuretools as ft
# SQL templates, filled in with str.format() once per processed week.
# NOTE(review): values are spliced in as text; this is tolerable only because
# they are constants defined in main(), never external input.
_SQL_LIST = """select * from pmart_per.DM_ZZK_PERSON_LIST_W
where year_week ='{year_week}'
and model_type='{model_type}'
AND EMP_STATE!='diaochu'
"""

_SQL_LEAVE = """select stat_week,emp_no,leave_type,plan_leave_hours,actual_leave_hours,actual_leave_times
from pdata_per.dw_zzk_hrm_leave_info_w
where '{before_year_week}'<= stat_week
and stat_week <='{year_week}'
"""

_SQL_ATTENDANCE = """select stat_week,emp_no,ovhours,msc,mxc,mac,attflag,holidays,yc
from pdata_per.dw_zzk_hrm_attendance_report_w
where '{before_year_week}'<= stat_week
and stat_week <='{year_week}'
"""

# The trailing "union all" appends one literal placeholder row to the result.
# NOTE(review): presumably there so the frame is never empty — confirm.
_SQL_ABSENT = """select stat_week,emp_no,absent_hours,absent_times,dept_name
from pdata_per.dw_zzk_hrm_emp_absent_info_w
where '{before_year_week}'<= stat_week
and stat_week <='{year_week}'
union all
select 'gdgd','hhhhf',8.0,1,'hhhhf'
"""

# '(' and '.' become '_', ')' is dropped.
_COLUMN_NAME_TABLE = str.maketrans({'(': '_', ')': None, '.': '_'})


def next_year_week(year_week):
    """Return the 'YYYY_WW' label that follows *year_week*.

    The week number is zero-padded to two digits. Week 52 (or anything
    above it) rolls over to week 01 of the following year; 53-week years
    are not modelled.
    """
    year, week = year_week.split('_')
    week_no = int(week)
    if week_no < 52:
        return '{}_{:02d}'.format(year, week_no + 1)
    return '{}_01'.format(int(year) + 1)


def sanitize_feature_name(name):
    """Return *name* with '(' and '.' replaced by '_' and ')' removed."""
    return name.translate(_COLUMN_NAME_TABLE)


def main():
    """Generate featuretools features per employee, one CSV file per week.

    For every week from the start week up to (not including) the end week,
    this reads the person list and the leave / attendance / absence tables,
    links them on ``emp_no`` in an EntitySet, runs deep feature synthesis
    with the person list as target, and writes the result to
    ``data/<year_week>_<model_type>.csv``.
    """
    import os

    # 'postgresql://' is the dialect name SQLAlchemy expects; the legacy
    # 'postgres://' alias no longer loads on SQLAlchemy 1.4 and later.
    # NOTE(review): credentials are hard-coded and the password/host part
    # looks like a redaction placeholder — confirm and move to configuration.
    conn = create_engine('postgresql://pmart_per:[email protected]:5432/personnel')
    model_type = 'N_Y_ZH'  # values noted by the original author: N_Y_Q, N_Y_ZH
    year_week = '2019_33'  # first week to process
    # Lower bound of the look-back window; it stays fixed while year_week advances.
    before_year_week = '2019_30'
    end_year_week = '2019_34'  # exclusive upper bound of the weeks to process

    # to_csv() does not create missing directories.
    os.makedirs('data', exist_ok=True)

    # Zero-padded 'YYYY_WW' labels sort chronologically as plain strings, so an
    # ordered comparison also terminates when the start is already past the end.
    while year_week < end_year_week:
        print(year_week)
        window = {'before_year_week': before_year_week, 'year_week': year_week}

        list_df = pd.read_sql(
            _SQL_LIST.format(year_week=year_week, model_type=model_type), conn)
        leave_df = pd.read_sql(_SQL_LEAVE.format(**window), conn)
        attendance_df = pd.read_sql(_SQL_ATTENDANCE.format(**window), conn)
        absent_df = pd.read_sql(_SQL_ABSENT.format(**window), conn)

        # Step 1: create an empty entity set.
        es = ft.EntitySet(id='list')
        print('空實體集:\n', es)

        # Step 2: add the list entity and the leave / attendance / absent
        # entities. The three child tables get a generated index column.
        es = es.entity_from_dataframe(entity_id='list', dataframe=list_df,
                                      index='emp_no', time_index='stat_date')
        es = es.entity_from_dataframe(
            entity_id='leave', dataframe=leave_df,
            variable_types={'leave_type': ft.variable_types.Categorical},
            make_index=True, index='leave_df_id')
        es = es.entity_from_dataframe(
            entity_id='attendance', dataframe=attendance_df,
            variable_types={'attflag': ft.variable_types.Categorical,
                            'holidays': ft.variable_types.Categorical},
            make_index=True, index='attendance_df_id')
        es = es.entity_from_dataframe(
            entity_id='absent', dataframe=absent_df,
            variable_types={'dept_name': ft.variable_types.Categorical},
            make_index=True, index='absent_df_id')
        print('添加三個實體的實體集:\n', es)

        # Step 3: relate every child entity to the list entity via emp_no.
        for child in ('leave', 'attendance', 'absent'):
            es = es.add_relationship(
                ft.Relationship(es['list']['emp_no'], es[child]['emp_no']))
        print('添加上三個實體關係后的實體集:\n', es)

        # Step 4: run deep feature synthesis with 'list' as the target entity.
        features, feature_names = ft.dfs(
            entityset=es,
            target_entity='list',
            agg_primitives=['mean', 'max', 'percent_true', 'last'],
            max_depth=2)
        print('特征工程輸出的內容:\n', features.head())
        print('特征工程輸出的數據的維度:\n', features.shape)

        # Step 5: rename columns containing characters that are illegal in
        # database field names, so the result could be uploaded to a database
        # (the upload itself is not performed here; only a CSV is written).
        features.columns = [sanitize_feature_name(col) for col in features.columns]
        features.to_csv(
            'data/{year_week}_{model_type}.csv'.format(year_week=year_week,
                                                       model_type=model_type),
            encoding='utf-8-sig')

        year_week = next_year_week(year_week)


if __name__ == '__main__':
    main()
# Custom primitives (example, left commented out)
# from featuretools.primitives import make_agg_primitive, make_trans_primitive
# from featuretools.variable_types import Text, Numeric
#
#
# def absolute(column):
# return abs(column)
#
#
# absolute_re = make_trans_primitive(function=absolute,
# input_types=[Numeric],
# return_type=Numeric)
#
#
# def maximum(column):
# return max(column)
#
#
# maximum_re = make_agg_primitive(function=maximum,
# input_types=[Numeric],
# return_type=Numeric)
#
# features, feature_names = ft.dfs(entityset=es, target_entity='list', agg_primitives=[maximum_re],
# trans_primitives=[absolute_re], max_depth=2)
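# Source article: "自动特征工具包featuretools的使用方法"
# (How to use the automated feature-engineering toolkit featuretools)
# Reposted from blog.csdn.net/PoGeN1/article/details/104279819
# The remaining lines of the scraped page were navigation labels only
# ("You may also like", "Today's recommendations", "Weekly ranking").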