Hive SQL窗口函数实现页面统计(以腾云天下页面访问为例)

埋点数据字段为:

userid,at,sid,pid分别表示用户id,访问时间,sessionId(区分一次启动),页面id

表名为beacon

所有数据均为模拟数据

2018-07-04 11:46:37	2856	efda26adec1c3eb8	h_01
2018-07-04 11:46:47	2856	efda26adec1c3eb8	h_03
2018-07-04 11:46:54	2856	efda26adec1c3eb8	h_02
2018-07-04 11:47:04	2856	efda26adec1c3eb8	h_02
2018-07-04 11:47:39	2856	efda26adec1c3eb8	h_04
2018-07-04 11:47:39	2856	efda26adec1c3eb8	h_09
2018-07-04 11:47:39	2856	efda26adec1c3eb8	h_01
2018-07-04 11:47:39	2856	efda26adec1c3eb8	h_03
2018-07-04 11:48:40	2856	efda26adec1c3eb8	h_07
2018-07-04 12:48:13	2856	b975601de0e1c2fc	h_01
2018-07-04 12:48:40	2856	b975601de0e1c2fc	h_03
2018-07-04 12:49:07	2856	b975601de0e1c2fc	h_02
2018-07-04 12:49:52	2856	b975601de0e1c2fc	h_07
2018-07-04 12:50:02	2856	5f52c96c52c98367	h_01
2018-07-04 12:50:47	2823	5f52c96c52c98367	h_03
2018-07-04 12:51:09	2823	5f52c96c52c98367	h_02

埋点原因无法统计到最后一个页面停留时间

最终可视化效果为如下图所示

无法查看图片可直接去腾云天下官网查看http://doc.talkingdata.com/posts/522

页面停留时间:

需要按sid分组后,访问时间从小到大排序,后一条时间减去前一条时间为上一条数据里页面的停留时间,故需要用到lead函数

1.求页面受访人数,页面受访(次数|比率)

-- Daily page traffic: one row per (visit date, page).
--   pv = number of page-view records, uv = number of distinct users.
-- `date` is a reserved keyword in Hive 1.2+ and is therefore back-quoted;
-- the output column name itself is unchanged.
select
    to_date(at)            as `date`,
    page                   as p,
    count(1)               as pv,
    count(distinct userid) as uv
from tmp
group by
    to_date(at),
    page

  结果如下

比率:需要每个页面的pv/总的pv,这里用窗口函数sum() over()

-- Daily page traffic plus each page's share of that day's page views.
-- The share divides a page's pv by the total pv of the SAME date: the rows are
-- per (date, page), so an empty over() would divide by the grand total of every
-- date in tmp and understate the share as soon as more than one day is loaded.
-- `date` is back-quoted because it is a reserved keyword in Hive 1.2+.
select
    t.`date`,
    t.p,
    t.uv,
    t.pv,
    round(t.pv / sum(t.pv) over (partition by t.`date`), 3) as pv_rate
from (
    select
        to_date(at)            as `date`,
        page                   as p,
        count(1)               as pv,
        count(distinct userid) as uv
    from tmp
    group by
        to_date(at),
        page
) t

结果如下:

2.求受访总时长占比,平均停留时间(使用lead函数)

-- For every page view, attach the next page and the next timestamp inside the
-- same session (sid), ordered by visit time.
-- The last view of a session has no successor and receives the sentinel
-- values 'endpage' / 'endat'.
-- `date` is back-quoted because it is a reserved keyword in Hive 1.2+.
-- NOTE(review): views of one session that share the same second have no defined
-- relative order under this window; confirm whether a tie-breaker column exists.
select
    to_date(at)                     as `date`,
    page                            as p,
    lead(page, 1, 'endpage') over w as nextpage,
    at,
    lead(at, 1, 'endat') over w     as nextat
from tmp
window w as (partition by sid order by unix_timestamp(at));

结果如下:

接下来求所有页面的停留时长,并过滤掉最后一个页面(下个页面为endpage)与页面与下个页面相同的数据

受访总时长占比为:每个页面总的访问时长/所有页面总的访问时间

-- Stay time per page, still one output row per qualifying page view
-- (rows of the same date and page therefore repeat the same figures).
--   avglen = average stay in seconds of the page on that date
--   rate   = the page's total stay / total stay of all pages on that date
-- Every window is scoped to the date: partitioning the page totals by page
-- alone would mix several days into the numerator while the denominator is
-- per day, giving wrong averages and shares once tmp holds more than one date.
-- `date` is back-quoted because it is a reserved keyword in Hive 1.2+.
select
    s.`date` as `date`,
    s.p      as page,
    round(sum(s.stay_sec) over (partition by s.`date`, s.p)
          / count(1) over (partition by s.`date`, s.p), 3) as avglen,
    round(sum(s.stay_sec) over (partition by s.`date`, s.p)
          / sum(s.stay_sec) over (partition by s.`date`), 3) as rate
from (
    select
        v.`date` as `date`,
        v.p      as p,
        -- stay time of a view = timestamp of the next view in the session - its own
        unix_timestamp(v.nextat) - unix_timestamp(v.at) as stay_sec
    from (
        select
            to_date(at) as `date`,
            page        as p,
            lead(page, 1, 'endpage') over (partition by sid order by unix_timestamp(at)) as nextpage,
            at,
            lead(at, 1, 'endat') over (partition by sid order by unix_timestamp(at)) as nextat
        from tmp
    ) v
    -- drop the last view of each session (no successor) and views followed by the same page
    where v.p != v.nextpage
      and v.nextpage != 'endpage'
) s

结果如下:

因为使用over(),页面相同的数据都一样,故去重一下

-- Stay time per page, one row per (date, page).
--   avg  = average stay in seconds of the page on that date
--   rate = the page's total stay / total stay of all pages on that date
-- The per-page totals come from a real group by instead of page-only windows
-- followed by de-duplication on the computed values: that form mixed all dates
-- into the page totals and could emit several rows per (date, page).
-- `date` is back-quoted because it is a reserved keyword in Hive 1.2+.
select
    g.`date` as `date`,
    g.page   as p,
    round(g.total_sec / g.views, 3) as avg,
    round(g.total_sec / sum(g.total_sec) over (partition by g.`date`), 3) as rate
from (
    select
        v.`date` as `date`,
        v.p      as page,
        -- stay time of a view = timestamp of the next view in the session - its own
        sum(unix_timestamp(v.nextat) - unix_timestamp(v.at)) as total_sec,
        count(1) as views
    from (
        select
            to_date(at) as `date`,
            page        as p,
            lead(page, 1, 'endpage') over (partition by sid order by unix_timestamp(at)) as nextpage,
            at,
            lead(at, 1, 'endat') over (partition by sid order by unix_timestamp(at)) as nextat
        from tmp
    ) v
    -- drop the last view of each session (no successor) and views followed by the same page
    where v.p != v.nextpage
      and v.nextpage != 'endpage'
    group by
        v.`date`,
        v.p
) g

结果如下:

:

3.求离开应用

-- Leave rate per (date, page): share of the page's views that were the last
-- view of their session (no next page, marked with the sentinel 'end').
-- Views followed by the same page are excluded from both numerator and denominator.
-- `date` is back-quoted (reserved keyword in Hive 1.2+); the inner timestamp
-- alias is visit_at rather than time, which is reserved from Hive 3.0.
select
    to_date(v.visit_at) as `date`,
    v.p                 as p,
    round(sum(case when v.nextpage = 'end' then 1 else 0 end) / count(1), 3) as lrate
from (
    select
        at   as visit_at,
        page as p,
        lead(page, 1, 'end') over (partition by sid order by unix_timestamp(at)) as nextpage
    from tmp
) v
where v.p != v.nextpage
group by
    to_date(v.visit_at),
    v.p

结果如下:

4.走向

-- Page flow per (date, page): the list of next pages, each encoded as
-- '<nextpage>_<share>', where share = transitions to that next page divided by
-- all counted transitions leaving the page on that date.
-- Session-ending views and views followed by the same page are not counted.
-- `date` is back-quoted (reserved keyword in Hive 1.2+); the inner timestamp
-- alias is visit_at rather than time, which is reserved from Hive 3.0.
select
    j.`date` as `date`,
    j.p      as p,
    collect_list(concat_ws('_', j.nextpage, j.rate)) as l
from (
    select
        b.`date`   as `date`,
        b.p        as p,
        b.nextpage as nextpage,
        -- the denominator is scoped to date AND page: rows are per date, so a
        -- page-only partition would divide by the page's transitions of all dates
        cast(b.c / sum(b.c) over (partition by b.`date`, b.p) as string) as rate
    from (
        select
            to_date(v.visit_at) as `date`,
            v.p                 as p,
            v.nextpage          as nextpage,
            count(1)            as c
        from (
            select
                at   as visit_at,
                page as p,
                lead(page, 1, 'end') over (partition by sid order by unix_timestamp(at)) as nextpage
            from tmp
        ) v
        where v.p != v.nextpage
          and v.nextpage != 'end'
        group by
            to_date(v.visit_at),
            v.p,
            v.nextpage
    ) b
) j
group by
    j.`date`,
    j.p

结果如下:

接下来就是把sql join一下:

-- Final page report, one row per (date, page):
--   uv    distinct users
--   pv    '<page views>_<share of that date's page views>'
--   rate  share of that date's total stay time spent on the page
--   avg   average stay in seconds
--   lrate leave rate (views that ended their session / views counted)
--   path  list of '<nextpage>_<share>' transitions
--
-- Changes against the nested-subquery form:
--   * every ratio is scoped to the date (page-only / empty windows mixed all
--     dates in tmp into the numerator or denominator);
--   * the report is driven from the pv/uv set with left joins, so pages that are
--     only ever the last page of a session (no measurable stay time) keep their
--     row with NULL rate/avg instead of disappearing from the report;
--   * the session walk (lead) is defined once and reused;
--   * `date` is back-quoted (reserved keyword in Hive 1.2+).
with
-- every page view paired with its successor inside the same session;
-- the last view of a session gets the sentinels 'end' / 'endat'
step as (
    select
        to_date(at) as `date`,
        page        as p,
        at          as visit_at,
        lead(page, 1, 'end') over (partition by sid order by unix_timestamp(at)) as nextpage,
        lead(at, 1, 'endat') over (partition by sid order by unix_timestamp(at)) as nextat
    from tmp
),
-- page views / distinct users, with the pv share of the date appended to pv
pu as (
    select
        t.`date` as `date`,
        t.p      as p,
        concat_ws('_',
                  cast(t.pv as string),
                  cast(round(t.pv / sum(t.pv) over (partition by t.`date`), 3) as string)) as pv,
        t.uv     as uv
    from (
        select
            to_date(at)            as `date`,
            page                   as p,
            count(1)               as pv,
            count(distinct userid) as uv
        from tmp
        group by
            to_date(at),
            page
    ) t
),
-- average stay and share of the date's total stay time
len as (
    select
        g.`date` as `date`,
        g.p      as p,
        round(g.total_sec / g.views, 3) as avg,
        round(g.total_sec / sum(g.total_sec) over (partition by g.`date`), 3) as rate
    from (
        select
            s.`date` as `date`,
            s.p      as p,
            sum(unix_timestamp(s.nextat) - unix_timestamp(s.visit_at)) as total_sec,
            count(1) as views
        from step s
        -- the last view of a session has no successor, hence no stay time
        where s.p != s.nextpage
          and s.nextpage != 'end'
        group by
            s.`date`,
            s.p
    ) g
),
-- leave rate
lr as (
    select
        s.`date` as `date`,
        s.p      as p,
        round(sum(case when s.nextpage = 'end' then 1 else 0 end) / count(1), 3) as lrate
    from step s
    where s.p != s.nextpage
    group by
        s.`date`,
        s.p
),
-- next-page distribution
browse as (
    select
        j.`date` as `date`,
        j.p      as p,
        collect_list(concat_ws('_', j.nextpage, j.rate)) as path
    from (
        select
            b.`date`   as `date`,
            b.p        as p,
            b.nextpage as nextpage,
            cast(b.c / sum(b.c) over (partition by b.`date`, b.p) as string) as rate
        from (
            select
                s.`date`   as `date`,
                s.p        as p,
                s.nextpage as nextpage,
                count(1)   as c
            from step s
            where s.p != s.nextpage
              and s.nextpage != 'end'
            group by
                s.`date`,
                s.p,
                s.nextpage
        ) b
    ) j
    group by
        j.`date`,
        j.p
)
select
    pu.`date`,
    pu.p,
    pu.uv,
    pu.pv,
    len.rate,
    len.avg,
    lr.lrate,
    browse.path
from pu
left join len
    on pu.`date` = len.`date` and pu.p = len.p
left join lr
    on pu.`date` = lr.`date` and pu.p = lr.p
left join browse
    on pu.`date` = browse.`date` and pu.p = browse.p;

这就ok啦,有不足的地方欢迎大家评论!

猜你喜欢

转载自blog.csdn.net/zhaolq1024/article/details/81081710
今日推荐