How to write a Hive SQL statement that initializes historical data into a zipper (SCD Type 2) table

I asked this question on Zhihu, but no one answered it, so I worked out the solution myself.

The results are still worth writing up, so I will show them here.

The first step is to split the half year of data into single months and initialize each month separately. Only the data for 201801 is shown below; the other months are handled the same way — just change the date range:

--HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set  hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;

-- Step 1: compress one month of daily balance snapshots into zipper (SCD-2) ranges.
-- NOTE(review): the `lag(dt) + '1'` date arithmetic in tmp3 increments the yyyymmdd
-- string numerically (Hive coerces string operands to decimal), so it only works
-- WITHIN a single month (20180131 + 1 = 20180132, not 20180201) — presumably this is
-- why the half year is initialized one month at a time; confirm before reusing.
with tmp1 as (
  -- pull the month's daily snapshots and look ahead to tomorrow's balance (zc_bal1);
  -- zc_bal1 is NULL on each customer's last row of the month
  select cust_id,zc_bal,cast(dt as string ) as dt
       ,lead(zc_bal,1,NULL)over(partition by cust_id order by dt) as zc_bal1 
   from   FACT_RPSM.F_CUST_BAL_SUM a
where dt>='20180101' and dt <='20180131' and coalesce(cust_id,'')<>''  
),
tmp2 as (
-- mark continuing rows (tomorrow's balance equals today's) with the sentinel '29991231';
-- change points and last rows (zc_bal1 NULL makes the equality false) keep DT as DT_END
SELECT A.cust_id,A.DT,A.zc_bal,zc_bal1,(CASE WHEN zc_bal=zc_bal1  THEN '29991231' ELSE DT END ) DT_END FROM tmp1 a
),
tmp3 as (
-- keep only the change-point rows; DATE_2 = previous change point's date + 1, i.e. the
-- day the current balance run began (string -> decimal -> int -> string round trip)
SELECT A.cust_id,A.DT,A.zc_bal,zc_bal1,DT_END,cast(cast( lag(dt,1)over(partition by cust_id order by dt)+'1' as int) as string) as DATE_2 from tmp2 a WHERE DT_END<>'29991231'
),
tmp4 as (
-- the first change point has no predecessor (DATE_2 is NULL), so its own DT is the start
SELECT cust_id,zc_bal,coalesce(DATE_2, DT) AS START_DATE,DT_END AS END_DATE FROM tmp3 a
)
-- re-open each customer's latest range if it survives to month end ('29991231');
-- flag customers whose data stops before month end as deleted (IS_FLAG = '1')
SELECT a.cust_id,zc_bal,START_DATE
,CASE WHEN max(END_DATE)over(partition by cust_id)=END_DATE and END_DATE='20180131' THEN '29991231' ELSE END_DATE END END_DATE
,CASE WHEN max(END_DATE)over(partition by cust_id)=END_DATE and END_DATE<'20180131' THEN '1'   ELSE '0' END IS_FLAG
FROM tmp4 a

Note: Hive's basic arithmetic operators implicitly convert string operands to a decimal type, so after adding 1 to the date string the result must be cast to int and then back to string.

The second step is stitching the months together. A single SQL statement could do the whole stitch, but due to data skew (or other reasons) it never finishes, so instead the still-open records at the end of each month are force-closed and the monthly results are merged:

--HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set  hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;


-- Step 2: stitch the six monthly zipper partitions into one table.
-- Every earlier month's still-open records ('29991231') are force-closed at that
-- month's last calendar day; only the final month (03) keeps its open-ended chains.
SELECT
    m.cust_id,
    m.zc_bal,
    m.start_date,
    m.end_date,
    m.is_flag
FROM (
    SELECT t.cust_id, t.zc_bal, t.start_date,
           CASE WHEN t.end_date = '29991231' THEN '20171031' ELSE t.end_date END AS end_date,
           t.is_flag
      FROM tmp.cust_bal_zip_2 t
     WHERE t.dt = '20180412'
    UNION ALL
    SELECT t.cust_id, t.zc_bal, t.start_date,
           CASE WHEN t.end_date = '29991231' THEN '20171130' ELSE t.end_date END AS end_date,
           t.is_flag
      FROM tmp.cust_bal_zip_11 t
     WHERE t.dt = '20180412'
    UNION ALL
    SELECT t.cust_id, t.zc_bal, t.start_date,
           CASE WHEN t.end_date = '29991231' THEN '20171231' ELSE t.end_date END AS end_date,
           t.is_flag
      FROM tmp.cust_bal_zip_12 t
     WHERE t.dt = '20180412'
    UNION ALL
    SELECT t.cust_id, t.zc_bal, t.start_date,
           CASE WHEN t.end_date = '29991231' THEN '20180131' ELSE t.end_date END AS end_date,
           t.is_flag
      FROM tmp.cust_bal_zip_01 t
     WHERE t.dt = '20180412'
    UNION ALL
    SELECT t.cust_id, t.zc_bal, t.start_date,
           CASE WHEN t.end_date = '29991231' THEN '20180228' ELSE t.end_date END AS end_date,
           t.is_flag
      FROM tmp.cust_bal_zip_02 t
     WHERE t.dt = '20180412'
    UNION ALL
    -- latest month: open chains stay open
    SELECT t.cust_id, t.zc_bal, t.start_date, t.end_date, t.is_flag
      FROM tmp.cust_bal_zip_03 t
     WHERE t.dt = '20180412'
) m

The third step removes redundant data: customers whose balance stayed at zc_bal = 0, unchanged, for the entire half year:

--HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set  hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;set hive.map.aggr=true;set hive.groupby.skewindata=true;

-- Step 3: collapse customers whose balance was 0 and unchanged for the whole half year
-- (6 monthly records, all with zc_bal = 0) into a single open-ended zero-balance chain.
-- Improvement over the original: the source table was scanned twice (total count and
-- zero-count) and then joined; a single grouped scan with a conditional count is
-- equivalent and halves the I/O.
with all_zero_custs as (
    -- a customer qualifies when every one of its 6 records has zc_bal = 0
    -- (row count = zero-balance row count = 6); NULL balances do not count as zero,
    -- matching the original `zc_bal = 0` filter
    select cust_id
      from tmp.cust_bal_zip_init
     where dt = '20180412'
     group by cust_id
    having count(1) = 6
       and sum(case when zc_bal = 0 then 1 else 0 end) = 6
)
select * from
(
    -- keep every record of customers that are NOT all-zero (anti-join)
    select b.cust_id, b.zc_bal, b.start_date, b.end_date, b.is_flag
      from tmp.cust_bal_zip_init b
      left join all_zero_custs c
        on b.cust_id = c.cust_id
     where b.dt = '20180412'
       and c.cust_id is null
    union all
    -- replace each all-zero customer with one open chain covering the half year
    select a.cust_id,
           cast(0 as decimal(20,2)) as zc_bal,
           '20171001' as start_date,
           '29991231' as end_date,
           '0' as is_flag
      from all_zero_custs a
) d

The fourth step appends to the zipper table; this is the statement that runs as the daily batch job:

----HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;
--APPEND_RELY=SELF

--取数据表今日数据t2
with cust_zc_bal_now as (
select     
                            cust_id ,
                            zc_bal 
  from FACT_RPSM.F_CUST_BAL_SUM 
 WHERE DT = '${start|yyyyMMdd}' 
   and coalesce(cust_id,'')<>'' 
)
,

--取拉链表昨日数据t1
cust_zc_bal_zipper_y1 as 
(
select     
                            cust_id ,
                            zc_bal ,
                            start_date as start_dt,
                            end_date as end_dt,
                            is_flag as del_flag,
                            '${start-1d|yyyyMMdd}' as data_date
from tmp.cust_bal_zip_init2 
 where dt ='20180416'
--   from view.CUST_BAL_Z_T 
--  WHERE DT = '${start-1d|yyyyMMdd}'
)
,

--t1 闭链的数据
cust_zc_bal_zipper_y2 as 
(
select     
                            cust_id ,
                            zc_bal ,
                            start_dt,
                            end_dt,
                            del_flag
  from cust_zc_bal_zipper_y1 
 WHERE end_dt <> '29991231' 
)
,

--t1和t2 的客户 的资产余额相同的客户,即数据不变的.
cust_zc_bal_zipper_y3 as (
select     
                            t1.cust_id ,
                            t1.zc_bal ,
                            t1.start_dt,
                            '29991231' as end_dt,         --闭链日期为MAX_DATE
                            '0' as del_flag             --删除标记为0
  from cust_zc_bal_zipper_y1  t1
  left join cust_zc_bal_now   t2
    on t1.cust_id = t2.cust_id  
 where t1.end_dt = '29991231' 
   and t1.del_flag = '0'  
   and t2.cust_id is not null  
   and t1.zc_bal = t2.zc_bal       --T1未删除且结束日期是29991231 ,T2客户不为空,t1和t2资产相同.
)
,


--t1比t2 多的客户,即这些客户失效了
cust_zc_bal_zipper_y4 as (
select     
                            t1.cust_id ,
                            t1.zc_bal ,
                            t1.start_dt,
                            t1.data_date as end_dt ,   --闭链日期为昨日
                            '1' as del_flag          --删除标记为1
  from cust_zc_bal_zipper_y1  t1
  left join cust_zc_bal_now   t2
    on t1.cust_id = t2.cust_id  
 where t1.end_dt = '29991231' 
   and t1.del_flag = '0'  
   and t2.cust_id is null   --T1未删除且结束日期是29991231 ,T2客户为空.
)
,

--t1比t2 变更的客户  进行闭链
cust_zc_bal_zipper_y5 as (
select     
                            t1.cust_id ,
                            t1.zc_bal ,
                            t1.start_dt,
                            t1.data_date as end_dt  ,     --闭链日期为昨日
                            '0' as del_flag              --删除标记为0
  from cust_zc_bal_zipper_y1  t1
  left join cust_zc_bal_now   t2
    on t1.cust_id = t2.cust_id  
 where t1.end_dt = '29991231' 
   and t1.del_flag = '0'  
   and t2.cust_id is not null   
   and t1.zc_bal <> t2.zc_bal --T1未删除且结束日期是29991231 ,T2客户不为空,t1和t2资产不相同
) 
,

--t2新增和变更 新增开链数据

cust_zc_bal_zipper_y6 as (

select         
                            t1.cust_id ,
                            t1.zc_bal ,
                            '${start|yyyyMMdd}' as start_dt, --开链日期为dt
                            '29991231' as end_dt   ,          --闭链日期为昨日
                            '0' as del_flag                  --删除标记为0
  from cust_zc_bal_zipper_y1  t1
  left join cust_zc_bal_now   t2
    on t1.cust_id = t2.cust_id  
 where t1.end_dt = '29991231' 
   and t1.del_flag = '0'  
   and t2.cust_id is not null   
   and t1.zc_bal <> t2.zc_bal --T1未删除且结束日期是29991231 ,T2客户不为空,t1和t2资产不相同

union all 

select         
                            t1.cust_id ,
                            t1.zc_bal ,
                            '${start|yyyyMMdd}' as start_dt, --开链日期为dt
                            '29991231' as end_dt   ,          --闭链日期为昨日
                            '0' as del_flag                  --删除标记为0
  from cust_zc_bal_now   t1
  left join cust_zc_bal_zipper_y1   t2
    on t1.cust_id = t2.cust_id  
 where t2.cust_id is null                                  --t2历史没有此客户号
    or ( t2.cust_id is not null and t2.del_flag ='1' )     --t2历史有此客户号,但是失效了
) 
select cust_id ,
                            zc_bal ,
                            start_dt,
                            end_dt,
                            del_flag 
from(
select cust_id ,
                            zc_bal ,
                            start_dt,
                            end_dt,
                            del_flag  from cust_zc_bal_zipper_y2
union all
select cust_id ,
                            zc_bal ,
                            start_dt,
                            end_dt,
                            del_flag  from cust_zc_bal_zipper_y3
union all
select cust_id ,
                            zc_bal ,
                            start_dt,
                            end_dt,
                            del_flag from cust_zc_bal_zipper_y4
union all 
select cust_id ,
                            zc_bal ,
                            start_dt,
                            end_dt,
                            del_flag  from cust_zc_bal_zipper_y5
union all
select cust_id ,
                            zc_bal ,
                            start_dt,
                            end_dt,
                            del_flag  from cust_zc_bal_zipper_y6
) a

The fifth step: on the first day after the half-year initialization has been loaded, switch the source of cust_zc_bal_zipper_y1 from the initialization table to yesterday's partition of the production zipper table (the commented-out lines in the query above).

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325122427&siteId=291194637