离线数据分析

离线数据分析
1.hive创建外部分区表管理数据
hdfs有数据了 在hive中创建外部表来关联数据 进行数据的清洗 处理
在hive中创建外部分区表 管理数据
EXTERNAL 关键字可以让用户创建一个外部表,在建表的同时指定一个指向实际数据的路径(LOCATION),Hive 创建内部表时,会将数据移动到数据仓库指向的路径;若创建外部表,仅记录数据所在的路径,不对数据的位置做任何改变。在删除表的时候,内部表的元数据和数据会被一起删除,而外部表只删除元数据,不删除数据。
LIKE 允许用户复制现有的表结构,但是不复制数据。
有分区的表可以在创建的时候使用 PARTITIONED BY 语句。一个表可以拥有一个或者多个分区,每一个分区单独存在一个目录下。
#创建外部分区表关联文件夹
-- External partitioned table over the raw click logs already sitting in HDFS.
-- Dropping this table removes only the metadata; the files under /flux stay.
create external table flux (
    url      string,
    urlname  string,
    title    string,
    chset    string,
    scr      string,
    col      string,
    lg       string,
    je       string,
    ec       string,
    fv       string,
    cn       string,
    ref      string,
    uagent   string,
    stat_uv  string,
    stat_ss  string,
    cip      string
)
PARTITIONED BY (reportTime string)
row format delimited fields terminated by '|'
location '/flux';
#增加分区信息

-- Register one day's HDFS directory as a partition of the external table.
alter table flux
    add partition (reportTime = '2016-05-18')
    location '/flux/reportTime=2016-05-18';

2.数据清洗
在大数据分析处理数据时 往往会发现数据本身有一定的缺陷
包括:
数据格式不统一
字段缺失
丢弃数据 补充固定值 根据已有数据推测 咨询业务人员 单独提取出来处理
字段格式、数据范围错误
格式错误 调整格式 参看上面字段缺失的解决方案
数据需要预先经过合并排序等处理
按照需求处理

需要在数据处理之前解决数据本身如上的问题 这个过程就称之为数据清洗的过程


数据清洗并没有严格工具上的限制 什么工具合适 最适合业务需求 和 公司自身的技术栈 就用什么技术。


解决如上问题有很多方案,总的来说要结合业务特点 灵活的进行方案设计。


3.flux的数据清洗
只保留需要的字段
将会话信息拆分 为 会话编号 会话页面数 会话时间


数据格式
url urlname title chset scr col lg je ec fv cn ref uagent stat_uv stat_ss cip
访问地址 资源名 网页标题 字符集 屏幕信息 颜色 语言环境 是否支持java 是否支持cookie flash版本 随机数 前跳地址 用户agent uv编号(uv_id) vv信息(会话id_会话次数_当前时间) 客户ip(服务器端获取)


#创建清洗表
-- Cleaned-log table: keeps only the fields the reports need, with the
-- session blob (stat_ss) already split into ssid / sscount / sstime.
create table dataclear (
    reportTime string,
    url        string,
    urlname    string,
    ref        string,
    uagent     string,
    uvid       string,
    ssid       string,
    sscount    string,
    sstime     string,
    cip        string
)
row format delimited fields terminated by '|';
#清洗数据
-- Populate the cleaned table; stat_ss is "ssid_count_time", so split on '_'.
insert overwrite table dataclear
select
    reportTime,
    url,
    urlname,
    ref,
    uagent,
    stat_uv,
    split(stat_ss, '_')[0],
    split(stat_ss, '_')[1],
    split(stat_ss, '_')[2],
    cip
from flux;

4.hive处理业务逻辑
pv
点击量 - 一次访问就是一个pv - 统计一段时间内日志的数量就是这段时间内的pv
-- pv: one log line per page view, so the day's row count is the pv count.
select count(*) as pv
from dataclear
where reportTime = '2016-05-18';

uv
独立访客数 - 一天之内 所有的独立访客数的总量 - 
-- uv: distinct visitor ids seen on the day.
select count(distinct uvid) as uv
from dataclear
where reportTime = '2016-05-18';
vv
会话总数 - 一天之内 所有会话的总数 - 
-- vv: distinct session ids seen on the day.
select count(distinct ssid) as vv
from dataclear
where reportTime = '2016-05-18';


br
跳出率 - 一天之内 跳出的会话/总的会话 得到的比率 - 跳出的会话是指 一个会话中只访问过一个页面 这样的会话称之为跳出的会话 总的会话 所有的会话总数其实就是上面的vv 
#所有的会话总数
-- Total sessions for the day (the denominator of the bounce rate).
select count(distinct ssid) as vv_count
from dataclear
where reportTime = '2016-05-18';
#跳出的会话
-- Bounced sessions: sessions that logged exactly one page view.
select count(*) as br_count
from (
    select ssid
    from dataclear
    where reportTime = '2016-05-18'
    group by ssid
    having count(*) = 1
) as br_tab;
#将两个结果 想象成两张表 来进行连接 将两个字段相除得到跳出率
-- Bounce rate = bounced sessions / total sessions. The two single-row
-- subqueries are combined with a comma (cross) join, then divided.
select round(br_a_tab.br_count / br_b_tab.vv_count, 4) as br
from (
    select count(*) as br_count
    from (
        select ssid
        from dataclear
        where reportTime = '2016-05-18'
        group by ssid
        having count(*) = 1
    ) as br_tab
) as br_a_tab,
(
    select count(distinct ssid) as vv_count
    from dataclear
    where reportTime = '2016-05-18'
) as br_b_tab;

newip
新增ip总数 - 今天一天内 所有ip 去重后 在历史数据中从未出现过的总数 - 今天去重后的ip  历史上的ip
-- newip: today's distinct ips never seen on any earlier day.
-- Fix: rewritten from NOT IN to a correlated NOT EXISTS. NOT IN over a
-- subquery returns NO rows at all if the history scan ever yields a NULL
-- cip, silently zeroing the metric; NOT EXISTS is NULL-safe.
select count(distinct dc.cip) as newip
from dataclear dc
where dc.reportTime = '2016-05-18'
  and not exists (
      select 1
      from dataclear h
      where h.cip = dc.cip
        and datediff(h.reportTime, '2016-05-18') < 0
  );


newcust
新增客户数 - 今天一天内 所有客户编号 去重后 在历史数据中从未出现过的总数 - 今天去重后的uvid 历史上的uvid
-- newcust: today's distinct visitor ids never seen on any earlier day.
-- Fix: rewritten from NOT IN to a correlated NOT EXISTS. NOT IN over a
-- subquery returns NO rows at all if the history scan ever yields a NULL
-- uvid, silently zeroing the metric; NOT EXISTS is NULL-safe.
select count(distinct dc.uvid) as newcust
from dataclear dc
where dc.reportTime = '2016-05-18'
  and not exists (
      select 1
      from dataclear h
      where h.uvid = dc.uvid
        and datediff(h.reportTime, '2016-05-18') < 0
  );


avgtime
平均访问时长 - 今天一天内所有会话 访问时长的平均值 - 一个会话的访问时长 - 这个会话所有访问的时间 avg(结束时间 - 开始时间)
-- avgtime: average session duration, as max(sstime) - min(sstime) per ssid.
-- NOTE(review): sstime is declared string; the subtraction relies on Hive's
-- implicit numeric cast -- assumes sstime is an epoch timestamp, TODO confirm.
select round(avg(usetime), 4)
from (
    select max(sstime) - min(sstime) as usetime
    from dataclear
    where reportTime = '2016-05-18'
    group by ssid
) as avgtime_tab;

avgdeep
平均访问深度 - 今天一天内所有会话 访问深度的平均值 - 一个会话中访问页面去重后的总数
-- avgdeep: average number of distinct pages visited per session.
select round(avg(deep), 2) avgdeep
from (
    select count(distinct urlname) as deep
    from dataclear
    where reportTime = '2016-05-18'
    group by ssid
) as avgdeep_tab;


5.结构导入最终表
方案1:
-- Narrow key/value staging table: one row per (day, metric-name, value).
create table tongji1 (
    reporttime string,
    fname      string,
    fvalue     string
)
row format delimited fields terminated by '|';


-- Stage pv / uv / vv for the day as name/value rows in tongji1.
insert into tongji1
select '2016-10-10', 'pv', pv
from (select count(*) as pv
      from dataclear
      where reporttime = '2016-10-10') as pv_tabx;

insert into tongji1
select '2016-10-10', 'uv', uv
from (select count(distinct uvid) as uv
      from dataclear
      where reporttime = '2016-10-10') as uv_tabx;

insert into tongji1
select '2016-10-10', 'vv', vv
from (select count(distinct ssid) as vv
      from dataclear
      where reporttime = '2016-10-10') as vv_tabx;


-- Stage the bounce rate: bounced sessions / total sessions.
-- Fix: the original used the non-standard '==' in HAVING; standard SQL
-- (and portable HiveQL) uses '='.
insert into tongji1
select '2016-10-10', 'br', br
from (
    select round(br_right_tab.brsc / br_left_tab.sc, 4) as br
    from (
        select count(distinct ssid) as sc
        from dataclear
        where reporttime = '2016-10-10'
    ) as br_left_tab,
    (
        select count(br_tab.ssid) as brsc
        from (
            select ssid
            from dataclear
            where reporttime = '2016-10-10'
            group by ssid
            having count(*) = 1
        ) as br_tab
    ) as br_right_tab
) as br_tabx;


-- Stage newip / newcust (first-ever-seen ips and visitor ids).
-- Fix: rewritten from NOT IN to NOT EXISTS -- NOT IN yields zero rows
-- whenever the history subquery contains a NULL, silently breaking both
-- metrics; NOT EXISTS is NULL-safe.
insert into tongji1
select '2016-10-10', 'newip', newip
from (
    select count(distinct dc.cip) as newip
    from dataclear dc
    where dc.reporttime = '2016-10-10'
      and not exists (
          select 1
          from dataclear h
          where h.cip = dc.cip
            and datediff(h.reporttime, '2016-10-10') < 0
      )
) as newip_tabx;

insert into tongji1
select '2016-10-10', 'newcust', newcust
from (
    select count(distinct dc.uvid) as newcust
    from dataclear dc
    where dc.reporttime = '2016-10-10'
      and not exists (
          select 1
          from dataclear h
          where h.uvid = dc.uvid
            and datediff(h.reporttime, '2016-10-10') < 0
      )
) as newcust_tabx;


-- Stage avgtime (mean session duration) and avgdeep (mean pages/session).
insert into tongji1
select '2016-10-10', 'avgtime', avgtime
from (
    select round(avg(usetime), 4) as avgtime
    from (
        select max(sstime) - min(sstime) as usetime
        from dataclear
        where reporttime = '2016-10-10'
        group by ssid
    ) as avgtime_tab
) as avgtime_tabx;

insert into tongji1
select '2016-10-10', 'avgdeep', avgdeep
from (
    select round(avg(ssdeep), 4) as avgdeep
    from (
        select count(distinct urlname) as ssdeep
        from dataclear
        where reporttime = '2016-10-10'
        group by ssid
    ) as avgdeep_tab
) as avgdeep_tabx;




-- Wide final table: one row per day with every metric as its own column.
create table tongji (
    reporttime string,
    pv         int,
    uv         int,
    vv         int,
    br         double,
    newip      int,
    newcust    int,
    avgtime    double,
    avgdeep    double
)
row format delimited fields terminated by '|';


-- Pivot the key/value rows of tongji1 into one wide row: each subquery
-- returns exactly one value for the day, and the comma join is an 8-way
-- cross join of single-row derived tables.
insert into tongji
select
    '2016-10-10',
    pv_q.pv,
    uv_q.uv,
    vv_q.vv,
    br_q.br,
    newip_q.newip,
    newcust_q.newcust,
    avgtime_q.avgtime,
    avgdeep_q.avgdeep
from
    (select fvalue as pv from tongji1 where reporttime = '2016-10-10' and fname = 'pv') as pv_q,
    (select fvalue as uv from tongji1 where reporttime = '2016-10-10' and fname = 'uv') as uv_q,
    (select fvalue as vv from tongji1 where reporttime = '2016-10-10' and fname = 'vv') as vv_q,
    (select fvalue as br from tongji1 where reporttime = '2016-10-10' and fname = 'br') as br_q,
    (select fvalue as newip from tongji1 where reporttime = '2016-10-10' and fname = 'newip') as newip_q,
    (select fvalue as newcust from tongji1 where reporttime = '2016-10-10' and fname = 'newcust') as newcust_q,
    (select fvalue as avgtime from tongji1 where reporttime = '2016-10-10' and fname = 'avgtime') as avgtime_q,
    (select fvalue as avgdeep from tongji1 where reporttime = '2016-10-10' and fname = 'avgdeep') as avgdeep_q;


--insert into tongji  select  '2016-10-10', pv_tabx.pv, uv_tabx.uv, vv_tabx.vv, br_tabx.br, newip_tabx.newip, newcust_tabx.newcust, avgtime_tabx.avgtime, avgdeep_tabx.avgdeep from (select fvalue as pv from tongji1 where reporttime='2016-10-10' and fname ='pv') as pv_tabx, (select fvalue as uv from tongji1 where reporttime='2016-10-10' and fname ='uv') as uv_tabx, (select fvalue as vv from tongji1 where reporttime='2016-10-10' and fname ='vv') as vv_tabx, (select fvalue as br from tongji1 where reporttime='2016-10-10' and fname ='br') as br_tabx, (select fvalue as newip from tongji1 where reporttime='2016-10-10' and fname ='newip') as newip_tabx, (select fvalue as newcust from tongji1 where reporttime='2016-10-10' and fname ='newcust') as newcust_tabx, (select fvalue as avgtime from tongji1 where reporttime='2016-10-10' and fname ='avgtime') as avgtime_tabx, (select fvalue as avgdeep from tongji1 where reporttime='2016-10-10' and fname ='avgdeep') as avgdeep_tabx; 




方案2:
-- Same wide final table as in 方案1: one row per day, one column per metric.
create table tongji (
    reporttime string,
    pv         int,
    uv         int,
    vv         int,
    br         double,
    newip      int,
    newcust    int,
    avgtime    double,
    avgdeep    double
)
row format delimited fields terminated by '|';


-- 方案2 insert, rewritten with its bugs fixed:
--  * the uv / vv subqueries had no column alias, so uv_tabx.uv / vv_tabx.vv
--    could not resolve (Hive auto-names the unaliased column _c0);
--  * br_tabx selected only the total session count, not the bounce rate --
--    it now computes bounced-sessions / total-sessions as in 方案1;
--  * NOT IN replaced with NULL-safe NOT EXISTS;
--  * the statement ended in a dangling comma with no terminating ';'.
insert into tongji
select
    '2016-10-10',
    pv_tabx.pv,
    uv_tabx.uv,
    vv_tabx.vv,
    br_tabx.br,
    newip_tabx.newip,
    newcust_tabx.newcust,
    avgtime_tabx.avgtime,
    avgdeep_tabx.avgdeep
from
    (select count(*) as pv from dataclear where reporttime = '2016-10-10') as pv_tabx,
    (select count(distinct uvid) as uv from dataclear where reporttime = '2016-10-10') as uv_tabx,
    (select count(distinct ssid) as vv from dataclear where reporttime = '2016-10-10') as vv_tabx,
    (select round(bounced.brsc / total.sc, 4) as br
     from (select count(distinct ssid) as sc
           from dataclear
           where reporttime = '2016-10-10') as total,
          (select count(*) as brsc
           from (select ssid
                 from dataclear
                 where reporttime = '2016-10-10'
                 group by ssid
                 having count(*) = 1) as one_page) as bounced) as br_tabx,
    (select count(distinct dc.cip) as newip
     from dataclear dc
     where dc.reporttime = '2016-10-10'
       and not exists (select 1
                       from dataclear h
                       where h.cip = dc.cip
                         and datediff(h.reporttime, '2016-10-10') < 0)) as newip_tabx,
    (select count(distinct dc.uvid) as newcust
     from dataclear dc
     where dc.reporttime = '2016-10-10'
       and not exists (select 1
                       from dataclear h
                       where h.uvid = dc.uvid
                         and datediff(h.reporttime, '2016-10-10') < 0)) as newcust_tabx,
    (select round(avg(usetime), 4) as avgtime
     from (select max(sstime) - min(sstime) as usetime
           from dataclear
           where reporttime = '2016-10-10'
           group by ssid) as avgtime_tab) as avgtime_tabx,
    (select round(avg(ssdeep), 4) as avgdeep
     from (select count(distinct urlname) as ssdeep
           from dataclear
           where reporttime = '2016-10-10'
           group by ssid) as avgdeep_tab) as avgdeep_tabx;


6.开发定时脚本
====hive中的变量使用 和 文件执行=================
hive定义变量
./hive --define key=value

./hive -d key=value
hive使用变量
${hivevar:key} 
或 
${key}
hive调用hql脚本
./hive -e "HQL"

./hive -f HQL文件
===========================================


为了自动化执行,将如上的hql改造,将其中变化的内容用占位符替代 每次启动时 动态指定
-- Parameterized daily roll-up, meant for `hive -f tongji.hql -d rtime=...`.
-- Fixes vs. the original one-liner: the statement is now terminated with ';'
-- (required when executed from an .hql script file), and NOT IN is replaced
-- with NULL-safe NOT EXISTS for the newip / newcust subqueries.
insert overwrite table tongji
select
    '${rtime}',
    tab1.pv,
    tab2.uv,
    tab3.vv,
    tab4.br,
    tab5.newip,
    tab6.newcust,
    tab7.avgtime,
    tab8.avgdeep
from
    (select count(*) as pv from dataclear where reportTime = '${rtime}') as tab1,
    (select count(distinct uvid) as uv from dataclear where reportTime = '${rtime}') as tab2,
    (select count(distinct ssid) as vv from dataclear where reportTime = '${rtime}') as tab3,
    (select round(br_a_tab.br_count / br_b_tab.vv_count, 4) as br
     from (select count(*) as br_count
           from (select ssid
                 from dataclear
                 where reportTime = '${rtime}'
                 group by ssid
                 having count(*) = 1) as br_tab) as br_a_tab,
          (select count(distinct ssid) as vv_count
           from dataclear
           where reportTime = '${rtime}') as br_b_tab) as tab4,
    (select count(distinct dc.cip) as newip
     from dataclear dc
     where dc.reportTime = '${rtime}'
       and not exists (select 1
                       from dataclear h
                       where h.cip = dc.cip
                         and datediff(h.reportTime, '${rtime}') < 0)) as tab5,
    (select count(distinct dc.uvid) as newcust
     from dataclear dc
     where dc.reportTime = '${rtime}'
       and not exists (select 1
                       from dataclear h
                       where h.uvid = dc.uvid
                         and datediff(h.reportTime, '${rtime}') < 0)) as tab6,
    (select round(avg(usetime), 4) as avgtime
     from (select max(sstime) - min(sstime) as usetime
           from dataclear
           where reportTime = '${rtime}'
           group by ssid) as avgtime_tab) as tab7,
    (select round(avg(deep), 2) as avgdeep
     from (select count(distinct urlname) as deep
           from dataclear
           where reportTime = '${rtime}'
           group by ssid) as avgdeep_tab) as tab8;


再在启动时通过以下方式动态指定变量值:
./hive -f /root/work/tongji.hql -d rtime='2016-05-18'


增加定时任务:
/root/work/apache-hive-1.2.0-bin/bin/hive -f /root/work/flux.hql -d rtime=$(date +%Y-%m-%d)    #注意:-d 定义的变量名必须与 hql 脚本中的占位符(上文为 ${rtime})一致


7.使用sqoop导出数据到mysql
create database fluxdb;
use fluxdb;
-- MySQL target table for the sqoop export below.
-- Fix: sqoop maps the '|'-delimited export fields to columns BY POSITION,
-- so the column order here must match the Hive table `tongji` exactly
-- (reporttime, pv, uv, vv, br, newip, newcust, avgtime, avgdeep). The
-- original declared avgtime before newcust, silently loading the session
-- duration into newcust and the customer count into avgtime.
create table tongji_1(
reportTime date,
pv int,
uv int,
vv int,
br double,
newip int,
newcust int,
avgtime double,
avgdeep double
); 


sqoop export --connect jdbc:mysql://192.168.242.101:3306/fluxdb --username root --password root --export-dir '/user/hive/warehouse/fluxdb.db/tongji' --table tongji_1 -m 1 --fields-terminated-by '|'

猜你喜欢

转载自blog.csdn.net/vitaair/article/details/80220624