Hive行列转换

行列转换在ETL中是非常常见的场景,在Hive中也毫不意外地会遇到:
事例1
-- Source table for the example-1 pivot. Each row carries a name, a category
-- and an id, all stored as strings in a '|'-delimited text file.
-- Dropped first so the script can be re-run from scratch.
drop table if exists row_column_test;
create table row_column_test (
    name     string,
    category string,
    id       string
)
row format delimited fields terminated by '|'
stored as textfile;

-- Seed four sample rows: ids '1' and '2', each with one 'qls' row and one
-- 'ecif' row.
-- NOTE(review): `dual` is not created anywhere in this script; presumably it
-- is a pre-existing one-row helper table -- confirm it exists before running.
insert into table row_column_test
select '123' as name, 'qls' as category, '1' as id
from dual;

insert into table row_column_test
select '124' as name, 'ecif' as category, '1' as id
from dual;

insert into table row_column_test
select '223' as name, 'ecif' as category, '2' as id
from dual;

insert into table row_column_test
select '224' as name, 'qls' as category, '2' as id
from dual;


-- Rows-to-columns pivot: collapse all rows of one id into a single row with
-- one output column per (category, attribute) pair.
-- A CASE without ELSE yields NULL for non-matching rows and MAX() skips NULLs,
-- so each column keeps the value from its own category (the largest value if
-- a category occurs more than once for the same id).
select
    id,
    max(case when category = 'qls'  then name     end) as qlsname,
    max(case when category = 'ecif' then name     end) as ecifname,
    max(case when category = 'qls'  then category end) as qlscategory,
    max(case when category = 'ecif' then category end) as ecifcategory
from row_column_test
group by id;


事例2
把单个复杂的ETL值计算抽取出来,单独计算,存入通用的中间表(rrs_grzx_common_etl_tmp)。然后在生成目标表数据的时候,再join行列转换后的中间表,获取键值。

-- Generic key/value staging table used by example 2.
-- Each row stores one column name, its value and a lending reference (the
-- column comments read "column name", "column value", "loan receipt number").
-- Data is partitioned by ds and tb_name and stored as '|'-delimited text.
drop table if exists rrs_grzx_common_etl_tmp;
create table rrs_grzx_common_etl_tmp (
    col_name    string comment '列名',
    col_value   string comment '列值',
    lending_ref string comment '借据号'
)
partitioned by (ds string, tb_name string)
row format delimited fields terminated by '|'
stored as textfile;

-- Seed one key/value row (balance = '5000' for lending_ref '0001') into the
-- static partition ds='2015-01-30', tb_name='grzx_bf_base_info_loan'.
-- NOTE(review): `dual` is not created in this script; presumably a one-row
-- helper table -- confirm it exists.
insert into table rrs_grzx_common_etl_tmp
partition (ds = '2015-01-30', tb_name = 'grzx_bf_base_info_loan')
select 'balance' as col_name, '5000' as col_value, '0001' as lending_ref
from dual;

-- Additional sample key/value rows for example 2.
-- These target rrs_grzx_common_etl_tmp, the table created above and read by
-- the pivot query below. The original target, test_general_etl_result, is
-- never created in this script and its 4-column layout did not match.
-- tb_name is supplied through the static partition spec rather than as a
-- selected column, and the key column is named lending_ref as in the DDL.
-- NOTE(review): `dual` is not created in this script; presumably a one-row
-- helper table -- confirm it exists.
insert into table rrs_grzx_common_etl_tmp
partition (ds = '2015-01-30', tb_name = 'grzx_bf_base_info_loan')
select 'balance' as col_name, '20.34' as col_value, '0001' as lending_ref
from dual;

insert into table rrs_grzx_common_etl_tmp
partition (ds = '2015-01-30', tb_name = 'grzx_bf_base_info_loan')
select 'billing_date' as col_name, '20150123' as col_value, '0001' as lending_ref
from dual;

insert into table rrs_grzx_common_etl_tmp
partition (ds = '2015-01-30', tb_name = 'grzx_bf_base_info_loan')
select 'account_status' as col_name, 'wip' as col_value, '0002' as lending_ref
from dual;

insert into table rrs_grzx_common_etl_tmp
partition (ds = '2015-01-30', tb_name = 'grzx_bf_base_info_loan')
select 'balance' as col_name, '4453' as col_value, '0002' as lending_ref
from dual;

insert into table rrs_grzx_common_etl_tmp
partition (ds = '2015-01-30', tb_name = 'grzx_bf_base_info_loan')
select 'billing_date' as col_name, '20150124' as col_value, '0002' as lending_ref
from dual;


-- Pivot the key/value rows of one partition into one row per lending_ref,
-- with one output column per col_name of interest.
-- This is a plain SELECT: the pivoted 3-column shape does not match the
-- key/value layout, so it must not be inserted back into the staging data.
-- col_value is a string, so MAX() compares values as strings; it is used here
-- only to pick the non-NULL value produced by the matching CASE branch.
select
    lending_ref,
    max(case when col_name = 'balance'      then col_value end) as BALANCE,
    max(case when col_name = 'billing_date' then col_value end) as BILLING_DATE
from rrs_grzx_common_etl_tmp
where tb_name = 'grzx_bf_base_info_loan'
  and ds = '2015-01-30'
group by lending_ref;
 
 
 
-- Excerpt from a larger query: the left-hand side of this join (referenced as
-- `a` in the ON clause) is not shown here, so this fragment cannot run alone.
-- The derived table `h` pivots the key/value rows of rrs_grzx_common_etl_tmp
-- into one row per lending_ref with one column per col_name, and is joined
-- back to the outer query on lending_ref.
-- NOTE(review): `$report_date10` looks like a placeholder substituted before
-- execution -- confirm against the job that runs this.
-- NOTE(review): col_name is matched in upper case here, while the sample
-- inserts earlier in this file store lower-case names such as 'balance';
-- string comparison is case-sensitive, so verify the stored casing.
left join
        (select
         a.lending_ref,
         max(a.ACT_PAY_AMOUNT) as ACT_PAY_AMOUNT,
         max(a.BILLING_DATE) as BILLING_DATE,
         max(a.DELINQUENCY_COUNT) as DELINQUENCY_COUNT,
         max(a.ID_INDICATOR) as ID_INDICATOR,
         max(a.LAST_PAY_DAY) as LAST_PAY_DAY,
         max(a.LEFT_MONTH) as LEFT_MONTH,
         max(a.MAX_CREDIT) as MAX_CREDIT,
         max(a.MAX_DELINQUENCY_TERM) as MAX_DELINQUENCY_TERM,
         max(a.PAST_COUNT) as PAST_COUNT,
         max(a.PAY_MONTH) as PAY_MONTH,
         max(a.PAY_STATUS) as PAY_STATUS,
         max(a.SHARE_CREDIT_LIMIT) as SHARE_CREDIT_LIMIT,
         max(a.PAST_DUE) as PAST_DUE,
         max(a.SCHE_PAY_AMOUNT) as SCHE_PAY_AMOUNT
        from
            -- Inner step: emit each col_value under the column matching its
            -- col_name (NULL in every other column) for the selected partition.
            (select
             lending_ref as lending_ref,
             case when col_name='ACT_PAY_AMOUNT' then col_value end as ACT_PAY_AMOUNT,
             case when col_name='BILLING_DATE' then col_value end as BILLING_DATE,
             case when col_name='DELINQUENCY_COUNT' then col_value end as DELINQUENCY_COUNT,
             case when col_name='ID_INDICATOR' then col_value end as ID_INDICATOR,
             case when col_name='LAST_PAY_DAY' then col_value end as LAST_PAY_DAY,
             case when col_name='LEFT_MONTH' then col_value end as LEFT_MONTH,
             case when col_name='MAX_CREDIT' then col_value end as MAX_CREDIT,
             case when col_name='MAX_DELINQUENCY_TERM' then col_value end as MAX_DELINQUENCY_TERM,
             case when col_name='PAST_COUNT' then col_value end as PAST_COUNT,
             case when col_name='PAY_MONTH' then col_value end as PAY_MONTH,
             case when col_name='PAY_STATUS' then col_value end as PAY_STATUS,
             case when col_name='SHARE_CREDIT_LIMIT' then col_value end as SHARE_CREDIT_LIMIT,
             case when col_name='PAST_DUE' then col_value end as PAST_DUE,
             case when col_name='SCHE_PAY_AMOUNT' then col_value end as SCHE_PAY_AMOUNT
             from rrs_grzx_common_etl_tmp
             where tb_name='grzx_bf_base_info_loan' and ds='$report_date10'
             -- Outer step: MAX() skips the NULLs, leaving one row per lending_ref.
             ) a group by a.lending_ref
        ) as h on h.LENDING_REF=a.LENDING_REF

猜你喜欢

转载自zzhonghe.iteye.com/blog/2209008
今日推荐