【hive】exists/in/left join/left semi join 一些实际使用时运行效率对比

背景

目标:有一批号码,要从中统计出近60天日均主叫次数(按有通话天数计)<= 3 且月累计流量 <= 1M(低流量)的号码。

数据源:号码表(tmp_acc_table)、通话时间表(tmp_calling_list)、流量使用表(tmp_traffic_list)


最初思路

先分别对话单表、流量表做全量统计,各自得到符合条件的号码,再通过 inner join 筛选出同时满足两个条件的号码。

存在问题

没有考虑到流量表数据量巨大(上百亿)所带来的运行效率问题:对全表统计低流量耗时非常长。

调整思路

先将需要的号码中“60天日均主叫(有通话天数) <= 3”的号码过滤出来,再去匹配这批号码的流量使用情况,最后统计出“低流量(月累计)<=1M”的号码。


代码实现

-- Variant 1: INNER JOIN.
--
-- Inputs:
--   tmp_acc_avg_below3 : numbers whose average daily call count over the last
--                        60 days is below 3 (per the original author's note).
--   tmp_traffic_list   : traffic usage table.
-- Output:
--   tmp_traffic_info_test(acc_nbr, cell_traffic_month_sum): numbers whose
--   summed traffic for 2022-08-01 .. 2022-08-31 is below 1 after dividing
--   by 1048576.
--
-- Elapsed time reported by the original author: 183.679 seconds. That figure
-- was measured before the ">= 20220801" lower bounds below were added, so it
-- covers a wider date range than this query now scans.
--
-- NOTE(review): an inner join repeats each traffic row once per matching row
-- in tmp_acc_avg_below3, which would inflate the sum if acc_nbr is not unique
-- there -- confirm uniqueness.
-- NOTE(review): the article's stated goal is "<= 1M" while the final filter
-- is "< 1"; confirm which boundary is intended.
drop table if exists tmp_traffic_info_test;
create table tmp_traffic_info_test as
select
    m.acc_nbr,
    m.cell_traffic_month_sum
from (
    select
        d.acc_nbr,
        -- 1048576 = 1024 * 1024; presumably converts bytes to MB -- confirm
        -- the unit of sum_amount.
        sum(d.sum_amount) / 1048576 as cell_traffic_month_sum
    from (
        select
            s.acc_nbr,
            t.sum_amount
        from tmp_acc_avg_below3 s
        join tmp_traffic_list t
            on s.acc_nbr = t.billing_nbr
        -- Keep only 11-character numbers made of digits and starting with 1.
        where t.billing_nbr regexp('^1[0-9]+$')
            and length(t.billing_nbr) = 11
            -- Restrict both date columns to the month being measured; without
            -- the lower bounds the sum would cover all history up to the end
            -- of the month instead of a single month.
            and cast(t.start_date as int) >= 20220801
            and cast(t.start_date as int) <= 20220831
            and cast(t.etl_cycle_id as int) >= 20220801
            and cast(t.etl_cycle_id as int) <= 20220831
    ) d
    group by d.acc_nbr
) m
where m.cell_traffic_month_sum < 1;


-- Variant 2: correlated EXISTS.
--
-- Sums sum_amount per billing_nbr from tmp_traffic_list for rows dated
-- 2022-08-01 .. 2022-08-31 whose number also appears in tmp_acc_avg_below3,
-- then keeps the numbers whose total divided by 1048576 is below 1.
--
-- Elapsed time reported by the original author: 135.573 seconds.
--
-- NOTE(review): the article's stated goal is "<= 1M" while the final filter
-- is "< 1"; confirm which boundary is intended.
drop table tmp_traffic_info_test;
create table tmp_traffic_info_test as
select monthly.*
from (
    select
        tr.billing_nbr,
        -- 1048576 = 1024 * 1024; presumably converts bytes to MB -- confirm
        -- the unit of sum_amount.
        sum(tr.sum_amount) / 1048576 as cell_traffic_month_sum
    from tmp_traffic_list tr
    -- Keep only 11-character numbers made of digits and starting with 1.
    where length(tr.billing_nbr) = 11
        and tr.billing_nbr regexp('^1[0-9]+$')
        -- Both date columns must fall inside the measured month.
        and cast(tr.start_date as int) between 20220801 and 20220831
        and cast(tr.etl_cycle_id as int) between 20220801 and 20220831
        -- Membership test against the pre-filtered number list.
        and exists (
            select acc.acc_nbr
            from tmp_acc_avg_below3 acc
            where acc.acc_nbr = tr.billing_nbr
        )
    group by tr.billing_nbr
) monthly
where monthly.cell_traffic_month_sum < 1;

-- Variant 3: IN subquery.
--
-- Sums sum_amount per billing_nbr from tmp_traffic_list for rows dated
-- 2022-08-01 .. 2022-08-31 whose number is in tmp_acc_avg_below3, then keeps
-- the numbers whose total divided by 1048576 is below 1.
--
-- Elapsed time reported by the original author: 167.664 seconds. That figure
-- was measured with an extra correlated predicate
-- (t.billing_nbr = k.acc_nbr) inside the IN subquery. The predicate repeated
-- the membership test that IN already performs, so it has been removed; the
-- result set is unchanged, but the timing may differ.
--
-- NOTE(review): the article's stated goal is "<= 1M" while the final filter
-- is "< 1"; confirm which boundary is intended.
drop table if exists tmp_traffic_info_test;
create table tmp_traffic_info_test as
select
    m.billing_nbr,
    m.cell_traffic_month_sum
from (
    select
        t.billing_nbr,
        -- 1048576 = 1024 * 1024; presumably converts bytes to MB -- confirm
        -- the unit of sum_amount.
        sum(t.sum_amount) / 1048576 as cell_traffic_month_sum
    from tmp_traffic_list t
    -- Plain (uncorrelated) membership test against the number list.
    where t.billing_nbr in (
            select k.acc_nbr
            from tmp_acc_avg_below3 k
        )
        -- Both date columns must fall inside the measured month.
        and cast(t.start_date as int) >= 20220801
        and cast(t.start_date as int) <= 20220831
        and cast(t.etl_cycle_id as int) >= 20220801
        and cast(t.etl_cycle_id as int) <= 20220831
        -- Keep only 11-character numbers made of digits and starting with 1.
        and t.billing_nbr regexp('^1[0-9]+$')
        and length(t.billing_nbr) = 11
    group by t.billing_nbr
) m
where m.cell_traffic_month_sum < 1;


-- Variant 4: LEFT SEMI JOIN.
--
-- Filters tmp_traffic_list to rows dated 2022-08-01 .. 2022-08-31 with a
-- well-formed number, keeps the rows whose number has a match in
-- tmp_acc_avg_below3, sums sum_amount per number, and keeps the numbers whose
-- total divided by 1048576 is below 1.
--
-- Elapsed time reported by the original author: 184.042 seconds. That figure
-- was measured before the "<= 20220831" upper bounds below were added, so it
-- also covered rows dated after the end of the month.
--
-- NOTE(review): the article's stated goal is "<= 1M" while the final filter
-- is "< 1"; confirm which boundary is intended.
drop table if exists tmp_traffic_info_test;
create table tmp_traffic_info_test as
select
    m.acc_nbr,
    m.cell_traffic_month_sum
from (
    select
        j.acc_nbr,
        -- 1048576 = 1024 * 1024; presumably converts bytes to MB -- confirm
        -- the unit of sum_amount.
        sum(j.sum_amount) / 1048576 as cell_traffic_month_sum
    from (
        select
            f.billing_nbr as acc_nbr,
            f.sum_amount
        from (
            select
                t.billing_nbr,
                t.sum_amount
            from tmp_traffic_list t
            -- Keep only 11-character numbers made of digits and starting
            -- with 1.
            where t.billing_nbr regexp('^1[0-9]+$')
                and length(t.billing_nbr) = 11
                -- Restrict both date columns to the month being measured;
                -- without the upper bounds, rows after 2022-08-31 would be
                -- included in the monthly sum.
                and cast(t.start_date as int) >= 20220801
                and cast(t.start_date as int) <= 20220831
                and cast(t.etl_cycle_id as int) >= 20220801
                and cast(t.etl_cycle_id as int) <= 20220831
        ) f
        -- Semi join: each traffic row is emitted at most once, and only
        -- columns from the left side are available afterwards.
        left semi join tmp_acc_avg_below3 s
            on f.billing_nbr = s.acc_nbr
    ) j
    group by j.acc_nbr
) m
where m.cell_traffic_month_sum < 1;



总结

在上述四种写法中,exists 的运行时间最少(135.573 秒);in 为 167.664 秒,inner join 为 183.679 秒,left semi join 为 184.042 秒。

猜你喜欢

转载自blog.csdn.net/sodaloveer/article/details/126953852