Hive 部分业务数据指标计算

需求：统计连续出现三次的数据的id

文件名称：series.txt
数据格式如下：

id,number
1,1
2,1
3,1
4,2
5,1
6,2
7,2
8,3
9,3
10,3
11,3
12,4

思路：可以做一个自连接，使用笛卡尔积的方式来实现

CREATE  TABLE  series(
id int ,
number int 
)ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' 


set hive.fetch.task.conversion=more;

第一步先自连接：

select * from series a join series b on a.id = b.id-1  
join series c on b.id=c.id-1;
 
a.id  a.number	b.id  b.number  c.id	c.number
1		1		2		1		3		1
2		1		3		1		4		2
3		1		4		2		5		1
4		2		5		1		6		2
5		1		6		2		7		2
6		2		7		2		8		3
7		2		8		3		9		3
8		3		9		3		10		3
9		3		10		3		11		3
10		3		11		3		12		4

第二部：加上筛选条件

select a.id,distinct(a.number) from series a 
join series b on a.id = b.id-1  join series c on b.id=c.id-1 
where a.number=b.number and b.number=c.number

a.id	a.number	b.id	b.number	c.id	c.number
1	1	2	1	3	1
8	3	9	3	10	3
9	3	10	3	11	3

最终结结果只需去重即可

数据文件 score.txt

需求：统计每个学生成绩最好的课程和分数、最差的课程和分数、求平均分

数据样本：

peck|math:90,english:100,sport:80
alice|math:80,english:99,sport:85
tom|math:62,english:99,sport:88

采用Map类型创建表，使用collection items 指定列分隔符，使用Map 指定数据的key分割符

create table score(
name string,
score map<string, int>
)ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' 
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';

查询数据：

load data local inpath '/home/hadoop/data/hive/score.txt' into table score;

hive (default)> select * from score

score.name	score.score
peck	{"math":90,"english":100,"sport":80}
alice	{"math":80,"english":99,"sport":85}
tom	{"math":62,"english":99,"sport":88}
Time taken: 0.046 seconds, Fetched: 3 row(s)

select name,score['math'] math,score['english'] english,score['sport'] sport from score
OK
name	math	english	sport
peck	90	100	80
alice	80	99	85
tom	62	99	88

使用explode函数炸开数据

select explode(score) as (course,score) from score;

hive (default)> select explode(score) as (course,score) from score;
course	score
math	90
english	100
sport	80
math	80
english	99
sport	85
math	62
english	99
sport	88

使用lateral view 进行炸裂

create table result_tmp as 
select ss.name,t.course,t.score
from
score ss
lateral view explode(ss.score) t as course,score;

ss.name	t.course	t.score
peck	math	90
peck	english	100
peck	sport	80
alice	math	80
alice	english	99
alice	sport	85
tom	math	62
tom	english	99
tom	sport	88

进一步操作得到结果数据

select ss.name,max(t.score) max_score,min(t.score) min_score,avg(t.score) avg_score
from
score ss
lateral view explode(ss.score) t as course,score 
group by ss.name

ss.name	max_score	min_score	avg_score
alice	99	80	88.0
peck	100	80	90.0
tom	99	62	83.0

select a.name,b.course,a.max_score,a.min_score,a.avg_score 
from (
select ss.name,max(t.score) max_score,min(t.score) min_score,avg(t.score) avg_score
from
score ss
lateral view explode(ss.score) t as course,score 
group by ss.name ) a 
join
(
select ss.name,t.course,t.score
from
score ss
lateral view explode(ss.score) t as course,score) b
on a.name = b.name and  a.max_score=b.score ;

换一种写法：使用With进行操作

with 
a as (select ss.name,max(t.score) max_score,min(t.score) min_score,avg(t.score) avg_score from score ss lateral view explode(ss.score) t as course,score  group by ss.name),
b as (select ss.name,t.course,t.score from score ss lateral view explode(ss.score) t as course,score) 
select a.name,b.course,a.max_score,a.min_score,a.avg_score
from a join b  
on a.name = b.name and  a.max_score=b.score ;

OK
a.name	b.course	a.max_score	a.min_score	a.avg_score
alice	english	99	80	88.0
peck	english	100	80	90.0
tom	english	99	62	83.0

将数据进行格式的转换

create table result_tmp as 
select ss.name,t.course,t.score
from
score ss
lateral view explode(ss.score) t as course,score;

ss.name	t.course	t.score
peck	math	90
peck	english	100
peck	sport	80
alice	math	80
alice	english	99
alice	sport	85
tom	math	62
tom	english	99
tom	sport	88

select t.name ,collect_list(concat_ws("-",t.course,cast(t.score as string))) as score
from result_tmp t
group by t.name

OK
t.name	score
alice	["math-80","english-99","sport-85"]
peck	["math-90","english-100","sport-80"]
tom		["math-62","english-99","sport-88"]

将上述拆分出来
select s.name,
split(s.score[0],"-")[1] as math_score,
split(s.score[1],"-")[1] as english_score,
split(s.score[2],"-")[1] as sport_score
from
(select t.name ,collect_list(concat_ws("-",t.course,cast(t.score as string))) as score
from result_tmp t
group by t.name) s

s.name	math_score	english_score	sport_score
alice	80	99	85
peck	90	100	80
tom	62	99	88

上述是行转列

需求：每个域名截止到每个月最大的访问次数累计到月的总访问次数

数据形式：

google.com,2020-01-02,5
google.com,2020-01-03,15
baidu.com,2020-01-01,5
youdao.com,2020-01-04,8
google.com,2020-01-02,25
google.com,2020-01-05,5
baidu.com,2020-02-02,4
youdao.com,2020-02-02,6
google.com,2020-02-02,10
google.com,2020-02-02,5
baidu.com,2020-02-02,16
youdao.com,2020-02-02,22
google.com,2020-03-02,23
google.com,2020-03-02,10
baidu.com,2020-03-02,11
youdao.com,2020-03-02,13


 			pv			max	sum
google.com,2020-01,50	50	50
google.com,2020-02,15	50	65
google.com,2020-03,33	50	98

创建表

create table access(
domain string,
day string,
pv int
)row format delimited fields terminated by ',';

加载数据

load data local inpath '/home/hadoop/data/hive/access.txt' into table access

hive (default)> select * from access;

access.domain	access.day	access.pv
google.com	2020-01-02	5
google.com	2020-01-03	15
baidu.com	2020-01-01	5

1.先把每个月的指标算出来

select domain,date_format(day,'yyyy-MM') month,sum(pv) pv
from access
group by domain,date_format(day,'yyyy-MM');

domain	month	pv
baidu.com	2020-01	5
baidu.com	2020-02	20
baidu.com	2020-03	11
google.com	2020-01	50
google.com	2020-02	15
google.com	2020-03	33
youdao.com	2020-01	8
youdao.com	2020-02	28
youdao.com	2020-03	13

2.根据上述计算结果算出每个月的指标

select 
t.domain,
t.month,
t.pv,
max(t.pv) over(partition by t.domain order by t.domain,t.month rows between unbounded preceding and current row) max_pv,
sum(t.pv) over(partition by t.domain order by t.domain,t.month rows between unbounded preceding and current row) sum_pv
from
(
select domain,date_format(day,'yyyy-MM') month,sum(pv) pv
from access
group by domain,date_format(day,'yyyy-MM')
) t

t.domain	t.month	t.pv	max_pv	sum_pv
baidu.com	2020-01	5	5	5
baidu.com	2020-02	20	20	25
baidu.com	2020-03	11	20	36
google.com	2020-01	50	50	50
google.com	2020-02	15	50	65
google.com	2020-03	33	50	98
youdao.com	2020-01	8	8	8
youdao.com	2020-02	28	28	36
youdao.com	2020-03	13	28	49

第二种方式采用自连接的方式实现

create table access_tmp as 
select domain,date_format(day,'yyyy-MM') month,sum(pv) pv
from access
group by domain,date_format(day,'yyyy-MM');

select * from access_tmp

access_tmp.domain	access_tmp.month	access_tmp.pv
baidu.com	2020-01	5
baidu.com	2020-02	20
baidu.com	2020-03	11
google.com	2020-01	50
google.com	2020-02	15
google.com	2020-03	33
youdao.com	2020-01	8
youdao.com	2020-02	28
youdao.com	2020-03	13

create table access_tmp2 as 
select a.domain a_domain,a.month a_month,a.pv a_pv,b.domain b_domain,b.month b_month,b.pv b_pv
from
access_tmp a join access_tmp b 
on a.domain = b.domain


select t.b_domain,t.b_month,t.b_pv,max(t.a_pv) max_pv,sum(t.a_pv) sum_pv
from (
select * from access_tmp2 where a_month <= b_month
) t 
group by t.b_domain,t.b_month,t.b_pv


t.b_domain	t.b_month	t.b_pv	max_pv	sum_pv
baidu.com	2020-01	5	5	5
baidu.com	2020-02	20	20	25
baidu.com	2020-03	11	20	36
google.com	2020-01	50	50	50
google.com	2020-02	15	50	65
google.com	2020-03	33	50	98
youdao.com	2020-01	8	8	8
youdao.com	2020-02	28	28	36
youdao.com	2020-03	13	28	49

Hive 部分业务数据指标计算

猜你喜欢