离线数据分析

离线数据分析
1.hive创建外部分区表管理数据
hdfs有数据了 在hive中创建外部表来关联数据 进行数据的清洗 处理
在hive中创建外部分区表 管理数据
EXTERNAL 关键字可以让用户创建一个外部表,在建表的同时指定一个指向实际数据的路径(LOCATION),Hive 创建内部表时,会将数据移动到数据仓库指向的路径;若创建外部表,仅记录数据所在的路径,不对数据的位置做任何改变。在删除表的时候,内部表的元数据和数据会被一起删除,而外部表只删除元数据,不删除数据。
LIKE 允许用户复制现有的表结构,但是不复制数据。
有分区的表可以在创建的时候使用 PARTITIONED BY 语句。一个表可以拥有一个或者多个分区,每一个分区单独存在一个目录下。
#创建外部分区表关联文件夹
-- External partitioned table over the raw click logs already sitting in HDFS.
-- Dropping this table removes only the metadata; the files under /flux stay.
create external table flux (
    url      string,
    urlname  string,
    title    string,
    chset    string,
    scr      string,
    col      string,
    lg       string,
    je       string,
    ec       string,
    fv       string,
    cn       string,
    ref      string,
    uagent   string,
    stat_uv  string,
    stat_ss  string,
    cip      string
)
PARTITIONED BY (reportTime string)
row format delimited fields terminated by '|'
location '/flux';
#增加分区信息

-- Register one day's HDFS directory as a partition of the external table.
alter table flux
    add partition (reportTime = '2016-05-18')
    location '/flux/reportTime=2016-05-18';

2.数据清洗
在大数据分析处理数据时 往往会发现数据本身有一定的缺陷
包括:
数据格式不统一
字段缺失
丢弃数据 补充固定值 根据已有数据推测 咨询业务人员 单独提取出来处理
字段格式、数据范围错误
格式错误 调整格式 参看上面字段缺失的解决方案
数据需要预先经过合并排序等处理
按照需求处理

需要在数据处理之前解决数据本身如上的问题 这个过程就称之为数据清洗的过程


数据清洗并没有严格工具上的限制 什么工具合适 最适合业务需求 和 公司自身的技术栈 就用什么技术。


解决如上问题有很多方案,总的来说要结合业务特点 灵活的进行方案设计。


3.flux的数据清洗
只保留需要的字段
将会话信息拆分 为 会话编号 会话页面数 会话时间


数据格式
url urlname title chset scr col lg je ec fv cn ref uagent stat_uv stat_ss cip
访问地址 资源名 网页标题 字符集 屏幕信息 颜色 语言环境 是否支持java 是否支持cookie flash版本 随机数 前跳地址 用户agent uv编号(uv_id) vv信息(会话id_会话次数_当前时间) 客户ip(服务器端获取)


#创建清洗表
-- Cleaned-log table: keeps only the fields the reports need, with the
-- session blob (stat_ss) already split into ssid / sscount / sstime.
create table dataclear (
    reportTime string,
    url        string,
    urlname    string,
    ref        string,
    uagent     string,
    uvid       string,
    ssid       string,
    sscount    string,
    sstime     string,
    cip        string
)
row format delimited fields terminated by '|';
#清洗数据
-- Populate the cleaned table; stat_ss is "ssid_count_time", so split on '_'.
insert overwrite table dataclear
select
    reportTime,
    url,
    urlname,
    ref,
    uagent,
    stat_uv,
    split(stat_ss, '_')[0],
    split(stat_ss, '_')[1],
    split(stat_ss, '_')[2],
    cip
from flux;

4.hive处理业务逻辑
pv
点击量 - 一次访问就是一个pv - 统计一段时间内日志的数量就是这段时间内的pv
-- pv: one log line per page view, so the day's row count is the pv count.
select count(*) as pv
from dataclear
where reportTime = '2016-05-18';

uv
独立访客数 - 一天之内 所有的独立访客数的总量 - 
-- uv: distinct visitor ids seen on the day.
select count(distinct uvid) as uv
from dataclear
where reportTime = '2016-05-18';
vv
会话总数 - 一天之内 所有会话的总数 - 
-- vv: distinct session ids seen on the day.
select count(distinct ssid) as vv
from dataclear
where reportTime = '2016-05-18';


br
跳出率 - 一天之内 跳出的会话/总的会话 得到的比率 - 跳出的会话是指 一个会话中只访问过一个页面 这样的会话称之为跳出的会话 总的会话 所有的会话总数其实就是上面的vv 
#所有的会话总数
-- Total sessions for the day (the denominator of the bounce rate).
select count(distinct ssid) as vv_count
from dataclear
where reportTime = '2016-05-18';
#跳出的会话
-- Bounced sessions: sessions that logged exactly one page view.
select count(*) as br_count
from (
    select ssid
    from dataclear
    where reportTime = '2016-05-18'
    group by ssid
    having count(*) = 1
) as br_tab;
#将两个结果 想象成两张表 来进行连接 将两个字段相除得到跳出率
-- Bounce rate = bounced sessions / total sessions. The two single-row
-- subqueries are combined with a comma (cross) join, then divided.
select round(br_a_tab.br_count / br_b_tab.vv_count, 4) as br
from (
    select count(*) as br_count
    from (
        select ssid
        from dataclear
        where reportTime = '2016-05-18'
        group by ssid
        having count(*) = 1
    ) as br_tab
) as br_a_tab,
(
    select count(distinct ssid) as vv_count
    from dataclear
    where reportTime = '2016-05-18'
) as br_b_tab;

newip
新增ip总数 - 今天一天内 所有ip 去重后 在历史数据中从未出现过的总数 - 今天去重后的ip  历史上的ip
-- newip: today's distinct ips never seen on any earlier day.
-- Fix: rewritten from NOT IN to a correlated NOT EXISTS. NOT IN over a
-- subquery returns NO rows at all if the history scan ever yields a NULL
-- cip, silently zeroing the metric; NOT EXISTS is NULL-safe.
select count(distinct dc.cip) as newip
from dataclear dc
where dc.reportTime = '2016-05-18'
  and not exists (
      select 1
      from dataclear h
      where h.cip = dc.cip
        and datediff(h.reportTime, '2016-05-18') < 0
  );


newcust
新增客户数 - 今天一天内 所有客户编号 去重后 在历史数据中从未出现过的总数 - 今天去重后的uvid 历史上的uvid
-- newcust: today's distinct visitor ids never seen on any earlier day.
-- Fix: rewritten from NOT IN to a correlated NOT EXISTS. NOT IN over a
-- subquery returns NO rows at all if the history scan ever yields a NULL
-- uvid, silently zeroing the metric; NOT EXISTS is NULL-safe.
select count(distinct dc.uvid) as newcust
from dataclear dc
where dc.reportTime = '2016-05-18'
  and not exists (
      select 1
      from dataclear h
      where h.uvid = dc.uvid
        and datediff(h.reportTime, '2016-05-18') < 0
  );


avgtime
平均访问时长 - 今天一天内所有会话 访问时长的平均值 - 一个会话的访问时长 - 这个会话所有访问的时间 avg(结束时间 - 开始时间)
-- avgtime: average session duration, as max(sstime) - min(sstime) per ssid.
-- NOTE(review): sstime is declared string; the subtraction relies on Hive's
-- implicit numeric cast -- assumes sstime is an epoch timestamp, TODO confirm.
select round(avg(usetime), 4)
from (
    select max(sstime) - min(sstime) as usetime
    from dataclear
    where reportTime = '2016-05-18'
    group by ssid
) as avgtime_tab;

avgdeep
平均访问深度 - 今天一天内所有会话 访问深度的平均值 - 一个会话中访问页面去重后的总数
-- avgdeep: average number of distinct pages visited per session.
select round(avg(deep), 2) avgdeep
from (
    select count(distinct urlname) as deep
    from dataclear
    where reportTime = '2016-05-18'
    group by ssid
) as avgdeep_tab;


5.结构导入最终表
方案1:
-- Narrow key/value staging table: one row per (day, metric-name, value).
create table tongji1 (
    reporttime string,
    fname      string,
    fvalue     string
)
row format delimited fields terminated by '|';


-- Stage pv / uv / vv for the day as name/value rows in tongji1.
insert into tongji1
select '2016-10-10', 'pv', pv
from (select count(*) as pv
      from dataclear
      where reporttime = '2016-10-10') as pv_tabx;

insert into tongji1
select '2016-10-10', 'uv', uv
from (select count(distinct uvid) as uv
      from dataclear
      where reporttime = '2016-10-10') as uv_tabx;

insert into tongji1
select '2016-10-10', 'vv', vv
from (select count(distinct ssid) as vv
      from dataclear
      where reporttime = '2016-10-10') as vv_tabx;


-- Stage the bounce rate: bounced sessions / total sessions.
-- Fix: the original used the non-standard '==' in HAVING; standard SQL
-- (and portable HiveQL) uses '='.
insert into tongji1
select '2016-10-10', 'br', br
from (
    select round(br_right_tab.brsc / br_left_tab.sc, 4) as br
    from (
        select count(distinct ssid) as sc
        from dataclear
        where reporttime = '2016-10-10'
    ) as br_left_tab,
    (
        select count(br_tab.ssid) as brsc
        from (
            select ssid
            from dataclear
            where reporttime = '2016-10-10'
            group by ssid
            having count(*) = 1
        ) as br_tab
    ) as br_right_tab
) as br_tabx;


-- Stage newip / newcust (first-ever-seen ips and visitor ids).
-- Fix: rewritten from NOT IN to NOT EXISTS -- NOT IN yields zero rows
-- whenever the history subquery contains a NULL, silently breaking both
-- metrics; NOT EXISTS is NULL-safe.
insert into tongji1
select '2016-10-10', 'newip', newip
from (
    select count(distinct dc.cip) as newip
    from dataclear dc
    where dc.reporttime = '2016-10-10'
      and not exists (
          select 1
          from dataclear h
          where h.cip = dc.cip
            and datediff(h.reporttime, '2016-10-10') < 0
      )
) as newip_tabx;

insert into tongji1
select '2016-10-10', 'newcust', newcust
from (
    select count(distinct dc.uvid) as newcust
    from dataclear dc
    where dc.reporttime = '2016-10-10'
      and not exists (
          select 1
          from dataclear h
          where h.uvid = dc.uvid
            and datediff(h.reporttime, '2016-10-10') < 0
      )
) as newcust_tabx;


-- Stage avgtime (mean session duration) and avgdeep (mean pages/session).
insert into tongji1
select '2016-10-10', 'avgtime', avgtime
from (
    select round(avg(usetime), 4) as avgtime
    from (
        select max(sstime) - min(sstime) as usetime
        from dataclear
        where reporttime = '2016-10-10'
        group by ssid
    ) as avgtime_tab
) as avgtime_tabx;

insert into tongji1
select '2016-10-10', 'avgdeep', avgdeep
from (
    select round(avg(ssdeep), 4) as avgdeep
    from (
        select count(distinct urlname) as ssdeep
        from dataclear
        where reporttime = '2016-10-10'
        group by ssid
    ) as avgdeep_tab
) as avgdeep_tabx;




-- Wide final table: one row per day with every metric as its own column.
create table tongji (
    reporttime string,
    pv         int,
    uv         int,
    vv         int,
    br         double,
    newip      int,
    newcust    int,
    avgtime    double,
    avgdeep    double
)
row format delimited fields terminated by '|';


-- Pivot the key/value rows of tongji1 into one wide row: each subquery
-- returns exactly one value for the day, and the comma join is an 8-way
-- cross join of single-row derived tables.
insert into tongji
select
    '2016-10-10',
    pv_q.pv,
    uv_q.uv,
    vv_q.vv,
    br_q.br,
    newip_q.newip,
    newcust_q.newcust,
    avgtime_q.avgtime,
    avgdeep_q.avgdeep
from
    (select fvalue as pv from tongji1 where reporttime = '2016-10-10' and fname = 'pv') as pv_q,
    (select fvalue as uv from tongji1 where reporttime = '2016-10-10' and fname = 'uv') as uv_q,
    (select fvalue as vv from tongji1 where reporttime = '2016-10-10' and fname = 'vv') as vv_q,
    (select fvalue as br from tongji1 where reporttime = '2016-10-10' and fname = 'br') as br_q,
    (select fvalue as newip from tongji1 where reporttime = '2016-10-10' and fname = 'newip') as newip_q,
    (select fvalue as newcust from tongji1 where reporttime = '2016-10-10' and fname = 'newcust') as newcust_q,
    (select fvalue as avgtime from tongji1 where reporttime = '2016-10-10' and fname = 'avgtime') as avgtime_q,
    (select fvalue as avgdeep from tongji1 where reporttime = '2016-10-10' and fname = 'avgdeep') as avgdeep_q;


--insert into tongji  select  '2016-10-10', pv_tabx.pv, uv_tabx.uv, vv_tabx.vv, br_tabx.br, newip_tabx.newip, newcust_tabx.newcust, avgtime_tabx.avgtime, avgdeep_tabx.avgdeep from (select fvalue as pv from tongji1 where reporttime='2016-10-10' and fname ='pv') as pv_tabx, (select fvalue as uv from tongji1 where reporttime='2016-10-10' and fname ='uv') as uv_tabx, (select fvalue as vv from tongji1 where reporttime='2016-10-10' and fname ='vv') as vv_tabx, (select fvalue as br from tongji1 where reporttime='2016-10-10' and fname ='br') as br_tabx, (select fvalue as newip from tongji1 where reporttime='2016-10-10' and fname ='newip') as newip_tabx, (select fvalue as newcust from tongji1 where reporttime='2016-10-10' and fname ='newcust') as newcust_tabx, (select fvalue as avgtime from tongji1 where reporttime='2016-10-10' and fname ='avgtime') as avgtime_tabx, (select fvalue as avgdeep from tongji1 where reporttime='2016-10-10' and fname ='avgdeep') as avgdeep_tabx; 




方案2:
-- Same wide final table as in 方案1: one row per day, one column per metric.
create table tongji (
    reporttime string,
    pv         int,
    uv         int,
    vv         int,
    br         double,
    newip      int,
    newcust    int,
    avgtime    double,
    avgdeep    double
)
row format delimited fields terminated by '|';


-- 方案2 insert, rewritten with its bugs fixed:
--  * the uv / vv subqueries had no column alias, so uv_tabx.uv / vv_tabx.vv
--    could not resolve (Hive auto-names the unaliased column _c0);
--  * br_tabx selected only the total session count, not the bounce rate --
--    it now computes bounced-sessions / total-sessions as in 方案1;
--  * NOT IN replaced with NULL-safe NOT EXISTS;
--  * the statement ended in a dangling comma with no terminating ';'.
insert into tongji
select
    '2016-10-10',
    pv_tabx.pv,
    uv_tabx.uv,
    vv_tabx.vv,
    br_tabx.br,
    newip_tabx.newip,
    newcust_tabx.newcust,
    avgtime_tabx.avgtime,
    avgdeep_tabx.avgdeep
from
    (select count(*) as pv from dataclear where reporttime = '2016-10-10') as pv_tabx,
    (select count(distinct uvid) as uv from dataclear where reporttime = '2016-10-10') as uv_tabx,
    (select count(distinct ssid) as vv from dataclear where reporttime = '2016-10-10') as vv_tabx,
    (select round(bounced.brsc / total.sc, 4) as br
     from (select count(distinct ssid) as sc
           from dataclear
           where reporttime = '2016-10-10') as total,
          (select count(*) as brsc
           from (select ssid
                 from dataclear
                 where reporttime = '2016-10-10'
                 group by ssid
                 having count(*) = 1) as one_page) as bounced) as br_tabx,
    (select count(distinct dc.cip) as newip
     from dataclear dc
     where dc.reporttime = '2016-10-10'
       and not exists (select 1
                       from dataclear h
                       where h.cip = dc.cip
                         and datediff(h.reporttime, '2016-10-10') < 0)) as newip_tabx,
    (select count(distinct dc.uvid) as newcust
     from dataclear dc
     where dc.reporttime = '2016-10-10'
       and not exists (select 1
                       from dataclear h
                       where h.uvid = dc.uvid
                         and datediff(h.reporttime, '2016-10-10') < 0)) as newcust_tabx,
    (select round(avg(usetime), 4) as avgtime
     from (select max(sstime) - min(sstime) as usetime
           from dataclear
           where reporttime = '2016-10-10'
           group by ssid) as avgtime_tab) as avgtime_tabx,
    (select round(avg(ssdeep), 4) as avgdeep
     from (select count(distinct urlname) as ssdeep
           from dataclear
           where reporttime = '2016-10-10'
           group by ssid) as avgdeep_tab) as avgdeep_tabx;


6.开发定时脚本
====hive中的变量使用 和 文件执行=================
hive定义变量
./hive --define key=value

./hive -d key=value
hive使用变量
${hivevar:key} 
或 
${key}
hive调用hql脚本
./hive -e "HQL"

./hive -f HQL文件
===========================================


为了自动化执行,将如上的hql改造,将其中变化的内容用占位符替代 每次启动时 动态指定
-- Parameterized daily roll-up, meant for `hive -f tongji.hql -d rtime=...`.
-- Fixes vs. the original one-liner: the statement is now terminated with ';'
-- (required when executed from an .hql script file), and NOT IN is replaced
-- with NULL-safe NOT EXISTS for the newip / newcust subqueries.
insert overwrite table tongji
select
    '${rtime}',
    tab1.pv,
    tab2.uv,
    tab3.vv,
    tab4.br,
    tab5.newip,
    tab6.newcust,
    tab7.avgtime,
    tab8.avgdeep
from
    (select count(*) as pv from dataclear where reportTime = '${rtime}') as tab1,
    (select count(distinct uvid) as uv from dataclear where reportTime = '${rtime}') as tab2,
    (select count(distinct ssid) as vv from dataclear where reportTime = '${rtime}') as tab3,
    (select round(br_a_tab.br_count / br_b_tab.vv_count, 4) as br
     from (select count(*) as br_count
           from (select ssid
                 from dataclear
                 where reportTime = '${rtime}'
                 group by ssid
                 having count(*) = 1) as br_tab) as br_a_tab,
          (select count(distinct ssid) as vv_count
           from dataclear
           where reportTime = '${rtime}') as br_b_tab) as tab4,
    (select count(distinct dc.cip) as newip
     from dataclear dc
     where dc.reportTime = '${rtime}'
       and not exists (select 1
                       from dataclear h
                       where h.cip = dc.cip
                         and datediff(h.reportTime, '${rtime}') < 0)) as tab5,
    (select count(distinct dc.uvid) as newcust
     from dataclear dc
     where dc.reportTime = '${rtime}'
       and not exists (select 1
                       from dataclear h
                       where h.uvid = dc.uvid
                         and datediff(h.reportTime, '${rtime}') < 0)) as tab6,
    (select round(avg(usetime), 4) as avgtime
     from (select max(sstime) - min(sstime) as usetime
           from dataclear
           where reportTime = '${rtime}'
           group by ssid) as avgtime_tab) as tab7,
    (select round(avg(deep), 2) as avgdeep
     from (select count(distinct urlname) as deep
           from dataclear
           where reportTime = '${rtime}'
           group by ssid) as avgdeep_tab) as tab8;


再在启动时通过以下方式动态指定变量值:
./hive -f /root/work/tongji.hql -d rtime='2016-05-18'


增加定时任务:
/root/work/apache-hive-1.2.0-bin/bin/hive -f /root/work/flux.hql -d rtime=$(date +%Y-%m-%d)    #注意:-d 定义的变量名必须与 hql 脚本中的占位符(上文为 ${rtime})一致


7.使用sqoop导出数据到mysql
create database fluxdb;
use fluxdb;
-- MySQL target table for the sqoop export below.
-- Fix: sqoop maps the '|'-delimited export fields to columns BY POSITION,
-- so the column order here must match the Hive table `tongji` exactly
-- (reporttime, pv, uv, vv, br, newip, newcust, avgtime, avgdeep). The
-- original declared avgtime before newcust, silently loading the session
-- duration into newcust and the customer count into avgtime.
create table tongji_1(
reportTime date,
pv int,
uv int,
vv int,
br double,
newip int,
newcust int,
avgtime double,
avgdeep double
); 


sqoop export --connect jdbc:mysql://192.168.242.101:3306/fluxdb --username root --password root --export-dir '/user/hive/warehouse/fluxdb.db/tongji' --table tongji_1 -m 1 --fields-terminated-by '|'

猜你喜欢

转载自blog.csdn.net/vitaair/article/details/80220624