- 1. Add a distributed cache ([Alluxio](http://www.alluxio.org/docs/master/cn/)) between HDFS and the computing engine, the data generated by create table ... select is cached by Alluxio, The next sql read table data is directly read from the cache, reducing disk IO, but the job still generates 13 actions, and the optimization effect is not too obvious;
- 2. Parse the sql, get the select part of each create table...select statement, call spark.sql to generate a dataframe, and then call registertemptable to register as a temporary table, the create table...select statement no longer triggers action, and the job has only one insert statement , trigger an action to maximize the advantages of spark iterative computing.
In the second method, you cannot use spark sql or spark thriftserver directly. You need to provide a spark driver for conversion processing. At the same time, you can customize various algorithms on the sparkdriver, such as the implementation document of the kernel probability estimation algorithm: https://github.com /melin/sparkdriver
-- prepare data tdl_eunomia_access_ddg_sht_event create table tdl_eunomia_access_ddg_sht_event LIFECYCLE 10 as select event_id, prodcateleafid, isvrtlprod, byr_byrstarlvl, sel_isb2cuser, byrid, substr(ds,1,8) as ds, substr(ds,1,10) as ds_hour, dispatch_time from secods.odl_event_crm_rights_start where ds>='201611300000' and ds<='201611302359' and perf_test = 'false'; create table tdl_eunomia_access_ddg_sht_indicator_tmp LIFECYCLE 10 as select byrId as byrId, 'byrId' as keys_list, '1' as dis_index from tdl_eunomia_access_ddg_sht_event; create table tdl_eunomia_access_ddg_sht_indicator LIFECYCLE 10 as select distinct byrId, keys_list from tdl_eunomia_access_ddg_sht_indicator_tmp; --index create table tdl_eunomia_calc_once_fg_sdt LIFECYCLE 10 as select a.* from ( select dispatch_time , byrId ,if(( selId > 0 AND byrId > 0),1,0) as condition_ENN ,substr(ds,1,8) as ds ,if(substr(ds,1,8)>='20161116',1,0) as ds_ENN from secods.odl_event_crm_rights_start where ds<='201611302359' and ds>='201611160000' and byrId is not null and (( selId > 0 AND byrId > 0)) ) a join ( select byrId from tdl_eunomia_access_ddg_sht_indicator where keys_list='byrId' ) b on a.byrId=b.byrId create table tdl_eunomia_calc_once_fg_sdt1 LIFECYCLE 10 as select byrId ,COUNT(1) as value ,ds ,'ENN' as indi_code from tdl_eunomia_calc_once_fg_sdt where ds<substr('2016113023',1,8) and ds_ENN=1 and condition_ENN=1 group by ds,byrId; create table tdl_eunomia_calc_once_fg_sdt2 LIFECYCLE 10 as select dispatch_time as event_time , byrId ,COUNT(1) over(partition by ds,byrId order by dispatch_time rows 500 preceding) as value ,ds ,'ENN' as indi_code from tdl_eunomia_calc_once_fg_sdt where condition_ENN=1 and ds>=substr('2016113000',1,8); set spark.cache.table.tdl_pub_sec_date_cfg_ENN = true; create table tdl_pub_sec_date_cfg_ENN LIFECYCLE 10 as select date_num as bdate from secdw.pub_sec_date_cfg where date_num<=substr('2016113023',1,8) and date_num>=substr('2016113000',1,8) and date_type='date'; create table tdl_eunomia_calc_once_fg_sdt40 LIFECYCLE 10 as select a.*,b.bdate from tdl_eunomia_calc_once_fg_sdt1 a join tdl_pub_sec_date_cfg_ENN b where a.ds<b.bdate and (a.ds>=to_char(dateadd(to_date(b.bdate,'yyyymmdd'),1-15,'dd') ,'yyyymmdd') and a.indi_code='ENN'); create table tdl_eunomia_calc_once_fg_sdt4 LIFECYCLE 10 as select byrId,SUM(cast(value as bigint)) as ENN,bdate as ds,indi_code from tdl_eunomia_calc_once_fg_sdt40 where to_char(dateadd(to_date(bdate,'yyyymmdd'),1-15,'dd') ,'yyyymmdd')<=ds and indi_code='ENN' group by byrId,bdate,indi_code; create table tdl_eunomia_calc_once_fg_sdt2 LIFECYCLE 10 as select * from tdl_eunomia_calc_once_fg_sdt2 where ds<=substr('2016113023',1,8) and ds>=substr('2016113000',1,8) and indi_code in('ENN') and value<500; create table tdl_eunomia_calc_once_fg_sdt3 LIFECYCLE 10 as select cast(event_time as bigint) as event_time , byrId ,value ,ds ,indi_code from( select a.event_time ,a.byrId ,cast(case when a.indi_code='ENN' then coalesce(a.value,0)+coalesce(b.value,0) end as string) as value ,a.indi_code ,a.ds from tdl_eunomia_calc_once_fg_sdt2 a left outer join tdl_eunomia_calc_once_fg_sdt4 b on a.byrId=b.byrId and a.indi_code=b.indi_code and a.ds=b.ds and a.indi_code in('ENN') and b.indi_code in('ENN') union all select cast(ds as bigint) as event_time , byrId ,value ,indi_code ,ds from tdl_eunomia_calc_once_fg_sdt4 where indi_code in('ENN') ) a; create table adl_eunomia_calc_once_fg_sdt LIFECYCLE 10 as select byrId ,if(gmt_begin=ds,unix_timestamp(to_date(ds,'yyyymmdd'))*1000,gmt_begin) as gmt_begin ,if(gmt_end is null,unix_timestamp(dateadd(to_date(ds,'yyyymmdd'),1,'dd'))*1000,gmt_end) as gmt_end ,value ,ds ,indi_code from( select byrId ,event_time as gmt_begin ,lead(event_time,1) over(partition by byrId, ds, indi_code order by event_time asc) as gmt_end ,case when indi_code='ENN' then cast(value as bigint) end as value ,indi_code ,ds from tdl_eunomia_calc_once_fg_sdt3 where indi_code in('ENN') ) a; insert OVERWRITE TABLE adl_eunomia_access_ddg_sht PARTITION(ds) select a.event_id, a.prodcateleafid, a.isvrtlprod, a.byr_byrstarlvl, a.sel_isb2cuser, a.byrid, secods:indi_accdata_filter_value(a.dispatch_time, b.aet_a_GROUP) as indireltmaeta3d, a.ds_hour as ds from tdl_eunomia_access_ddg_sht_event a left outer join ( select byrId, substr(ds, 1, 10) ds, WM_CONCAT(concat(value, ";", gmt_begin, ';', gmt_end)) aet_a_GROUP from adl_eunomia_calc_once_fg_sdt where substr(ds,1,10)>='2016113000' and substr(ds,1,10)<='2016113023' and indi_code = 'ENN' group by byrId, substr(ds, 1, 10) ) b on FillBlankFunc(a.byrId) = b.byrId and a.ds=substr(b ```.ds, 1, 8);