背景
目标:有一批号码,要从中筛选出同时满足以下两个条件的号码:近60天日均主叫次数(按有通话的天数计算)<= 3,且低流量(月累计流量)<= 1M
数据源:号码表(tmp_acc_table)、通话时间表(tmp_calling_list)、流量使用表(tmp_traffic_list)
最初思路
先单独对话单表、流量表全量统计出合适的号码,再inner join筛选出合适的号码。
存在问题
没有考虑到流量表数据量巨大(上百亿),全表统计低流量号码存在严重的运行效率问题,耗时非常长!
调整思路
先从目标号码中过滤出满足“近60天日均主叫(有通话天数)<= 3”的号码,再用这批号码去匹配流量使用表,最后统计出满足“低流量(月累计)<= 1M”的号码。
代码实现
-- Approach 1: INNER JOIN.
-- Inputs:
--   tmp_acc_avg_below3 : numbers whose average daily call count over the last
--                        60 days is below 3 (as described by the original note)
--   tmp_traffic_list   : traffic usage table
-- Output:
--   tmp_traffic_info_test(acc_nbr, cell_traffic_month_sum) holding the numbers
--   whose accumulated traffic for 2022-08 is at most 1 unit.
-- Recorded elapsed time of the original statement: 183.679 seconds.
-- NOTE(review): that timing was measured while the lower date bounds were
-- missing (the query then summed everything up to 20220831); re-measure.
-- NOTE(review): if tmp_acc_avg_below3 can hold duplicate acc_nbr values, the
-- inner join multiplies traffic rows and inflates the sum — confirm uniqueness.
drop table if exists tmp_traffic_info_test;
create table tmp_traffic_info_test as
select
    monthly.acc_nbr,
    monthly.cell_traffic_month_sum
from (
    select
        s.acc_nbr,
        -- 1048576 = 1024 * 1024; presumably sum_amount is in bytes so the
        -- quotient is in MB — TODO confirm the unit of sum_amount.
        sum(t.sum_amount) / 1048576 as cell_traffic_month_sum
    from tmp_acc_avg_below3 s
    inner join tmp_traffic_list t
        on s.acc_nbr = t.billing_nbr
    where t.billing_nbr regexp('^1[0-9]+$')
      and length(t.billing_nbr) = 11
      -- Both bounds are required to restrict the sum to August 2022; the
      -- original only had the upper bounds.
      and cast(t.start_date as int) >= 20220801
      and cast(t.start_date as int) <= 20220831
      and cast(t.etl_cycle_id as int) >= 20220801
      and cast(t.etl_cycle_id as int) <= 20220831
    group by s.acc_nbr
) monthly
-- The stated requirement is "monthly traffic <= 1M", hence <= rather than <.
where monthly.cell_traffic_month_sum <= 1;
-- Approach 2: EXISTS (correlated membership test against tmp_acc_avg_below3).
-- Output:
--   tmp_traffic_info_test(billing_nbr, cell_traffic_month_sum) holding the
--   numbers whose accumulated traffic for 2022-08 is at most 1 unit.
-- Recorded elapsed time of the original statement: 135.573 seconds.
drop table if exists tmp_traffic_info_test;
create table tmp_traffic_info_test as
select
    monthly.billing_nbr,
    monthly.cell_traffic_month_sum
from (
    select
        t.billing_nbr,
        -- 1048576 = 1024 * 1024; presumably sum_amount is in bytes so the
        -- quotient is in MB — TODO confirm the unit of sum_amount.
        sum(t.sum_amount) / 1048576 as cell_traffic_month_sum
    from tmp_traffic_list t
    where exists (
              -- Only existence matters, so no column needs to be projected.
              select 1
              from tmp_acc_avg_below3 k
              where k.acc_nbr = t.billing_nbr
          )
      -- Restrict to August 2022 on both date columns.
      and cast(t.start_date as int) >= 20220801
      and cast(t.start_date as int) <= 20220831
      and cast(t.etl_cycle_id as int) >= 20220801
      and cast(t.etl_cycle_id as int) <= 20220831
      and t.billing_nbr regexp('^1[0-9]+$')
      and length(t.billing_nbr) = 11
    group by t.billing_nbr
) monthly
-- The stated requirement is "monthly traffic <= 1M", hence <= rather than <.
where monthly.cell_traffic_month_sum <= 1;
-- Approach 3: IN (membership test against tmp_acc_avg_below3).
-- Output:
--   tmp_traffic_info_test(billing_nbr, cell_traffic_month_sum) holding the
--   numbers whose accumulated traffic for 2022-08 is at most 1 unit.
-- Recorded elapsed time of the original statement: 167.664 seconds.
-- NOTE(review): the original subquery also carried the correlated predicate
-- "t.billing_nbr = k.acc_nbr", which made it an EXISTS in disguise; it is
-- removed here (the result set is unchanged), so re-measure the timing.
drop table if exists tmp_traffic_info_test;
create table tmp_traffic_info_test as
select
    monthly.billing_nbr,
    monthly.cell_traffic_month_sum
from (
    select
        t.billing_nbr,
        -- 1048576 = 1024 * 1024; presumably sum_amount is in bytes so the
        -- quotient is in MB — TODO confirm the unit of sum_amount.
        sum(t.sum_amount) / 1048576 as cell_traffic_month_sum
    from tmp_traffic_list t
    where t.billing_nbr in (
              select k.acc_nbr
              from tmp_acc_avg_below3 k
          )
      -- Restrict to August 2022 on both date columns.
      and cast(t.start_date as int) >= 20220801
      and cast(t.start_date as int) <= 20220831
      and cast(t.etl_cycle_id as int) >= 20220801
      and cast(t.etl_cycle_id as int) <= 20220831
      and t.billing_nbr regexp('^1[0-9]+$')
      and length(t.billing_nbr) = 11
    group by t.billing_nbr
) monthly
-- The stated requirement is "monthly traffic <= 1M", hence <= rather than <.
where monthly.cell_traffic_month_sum <= 1;
-- Approach 4: LEFT SEMI JOIN (keeps traffic rows that have a match in
-- tmp_acc_avg_below3 without duplicating them).
-- Output:
--   tmp_traffic_info_test(acc_nbr, cell_traffic_month_sum) holding the
--   numbers whose accumulated traffic for 2022-08 is at most 1 unit.
-- Recorded elapsed time of the original statement: 184.042 seconds.
-- NOTE(review): that timing was measured while the upper date bounds were
-- missing (the query then summed everything from 20220801 onward); re-measure.
drop table if exists tmp_traffic_info_test;
create table tmp_traffic_info_test as
select
    monthly.acc_nbr,
    monthly.cell_traffic_month_sum
from (
    select
        t.billing_nbr as acc_nbr,
        -- 1048576 = 1024 * 1024; presumably sum_amount is in bytes so the
        -- quotient is in MB — TODO confirm the unit of sum_amount.
        sum(t.sum_amount) / 1048576 as cell_traffic_month_sum
    from tmp_traffic_list t
    left semi join tmp_acc_avg_below3 s
        on t.billing_nbr = s.acc_nbr
    where t.billing_nbr regexp('^1[0-9]+$')
      and length(t.billing_nbr) = 11
      -- Both bounds are required to restrict the sum to August 2022; the
      -- original only had the lower bounds.
      and cast(t.start_date as int) >= 20220801
      and cast(t.start_date as int) <= 20220831
      and cast(t.etl_cycle_id as int) >= 20220801
      and cast(t.etl_cycle_id as int) <= 20220831
    group by t.billing_nbr
) monthly
-- The stated requirement is "monthly traffic <= 1M", hence <= rather than <.
where monthly.cell_traffic_month_sum <= 1;
总结
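在本次测试中,exists 写法运行时间最少(135.573 秒),其次是 in(167.664 秒),inner join(183.679 秒)和 left semi join(184.042 秒)最慢。以上耗时均为单次运行结果,比较时需确保各写法的过滤条件(日期范围等)完全一致。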