Hive E-commerce Traffic Log Analysis, Part 2: PV Analysis

Step 1: Requirements analysis


Which fields are needed (time: each day and each hour; id, url, guid, trackTime)

Partition by day/hour

PV (count of records)

UV (count of distinct guid)

Step 2: Implementation steps


Create a Hive table whose column delimiter matches the source files

Load the data into the Hive table

Write HiveQL to run the statistics and put the results into another Hive table (data cleansing)

Export that Hive table's data to MySQL with Sqoop

The web application reads this table from MySQL

Expected result


Date      Hour      PV      UV

Step 3: Implementation


# Create the source table (note: when connecting through beeline, the username and password are the Linux account's)

  create database if not exists track_log;

  use track_log;

  create table if not exists yhd_source(

  id              string,

  url             string,

  referer         string,

  keyword         string,

  type            string,

  guid            string,

  pageId          string,

  moduleId        string,

  linkId          string,

  attachedInfo    string,

  sessionId       string,

  trackerU        string,

  trackerType     string,

  ip              string,

  trackerSrc      string,

  cookie          string,

  orderCode       string,

  trackTime       string,

  endUserId       string,

  firstLink       string,

  sessionViewNo   string,

  productId       string,

  curMerchantId   string,

  provinceId      string,

  cityId          string,

  fee             string,

  edmActivity     string,

  edmEmail        string,

  edmJobId        string,

  ieVersion       string,

  platform        string,

  internalKeyword string,

  resultSum       string,

  currentPage     string,

  linkPosition    string,

  buttonPosition  string

  )row format delimited fields terminated by '\t'

  stored as textfile;

  load data local inpath '/home/liuwl/opt/datas/2015082818' into table yhd_source;

  load data local inpath '/home/liuwl/opt/datas/2015082819' into table yhd_source;
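
# Optional: a quick sanity check that both hourly files loaded (the exact count depends on the sample data)

  select count(*) from yhd_source;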

# Create the cleansing table

  create table if not exists yhd_clean(

  id string,

  url string,

  guid string,

  date string,

  hour string)

  row format delimited fields terminated by '\t';

  insert into table yhd_clean select id,url,guid,substring(trackTime,9,2) date,substring(trackTime,12,2) hour from yhd_source;

  select id,date,hour from yhd_clean limit 5;
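
# How the substring offsets line up (Hive's substring is 1-based): assuming trackTime values look like '2015-08-28 18:52:21', positions 9-10 hold the day and positions 12-13 the hour

  select substring(trackTime,9,2), substring(trackTime,12,2) from yhd_source limit 1;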

# Rebuild as a partitioned table (static partitioning)

  create table if not exists yhd_part1(

  id string,

  url string,

  guid string

  ) partitioned by (date string,hour string)

  row format delimited fields terminated by '\t';

  insert into table yhd_part1 partition (date='28',hour='18') select id,url,guid from yhd_clean where date='28' and hour='18';

  insert into table yhd_part1 partition (date='28',hour='19') select id,url,guid from yhd_clean where date='28' and hour='19';

  select id,date,hour from yhd_part1 where date='28' and hour='18' limit 10;
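
# To confirm the partitions were registered in the metastore, show partitions is a quick check (expected: date=28/hour=18 and date=28/hour=19)

  show partitions yhd_part1;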

# Using dynamic partitioning requires changing a couple of parameters

   set hive.exec.dynamic.partition=true;

   set hive.exec.dynamic.partition.mode=nonstrict;

  create table if not exists yhd_part2(

  id string,

  url string,

  guid string

  ) partitioned by (date string,hour string)

  row format delimited fields terminated by '\t';

# Dynamic partitioning matches on the partition fields: the trailing columns of the select list fill date and hour, which is why yhd_clean's column order (id, url, guid, date, hour) works with select *

  insert into table yhd_part2 partition (date,hour) select * from yhd_clean;

  select id,date,hour from yhd_part2 where date='28' and hour='18' limit 10;
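
# Same check for the dynamic version: the single insert above should have produced both hourly partitions

  show partitions yhd_part2;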

# Implement the requirements

  PV: select date,hour,count(url) PV from yhd_part1 group by date,hour;

  0: jdbc:hive2://hadoop09-linux-01.ibeifeng.co> select date,hour,count(url) PV from yhd_part1 group by date,hour;

  +-------+-------+--------+--+

  | date  | hour  |   pv   |

  +-------+-------+--------+--+

  | 28    | 18    | 64972  |

  | 28    | 19    | 61162  |

  +-------+-------+--------+--+

  UV: select date,hour,count(distinct(guid)) UV from yhd_part1 group by date,hour;

  0: jdbc:hive2://hadoop09-linux-01.ibeifeng.co> select date,hour,count(distinct(guid)) UV from yhd_part1 group by date,hour;

  +-------+-------+--------+--+

  | date  | hour  |   uv   |

  +-------+-------+--------+--+

  | 28    | 18    | 23938  |

  | 28    | 19    | 22330  |

  +-------+-------+--------+--+
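
# On large logs, count(distinct guid) can be memory- and skew-heavy; a two-stage rewrite that deduplicates first is a common alternative and should return the same numbers

  select date,hour,count(*) UV from (select distinct date,hour,guid from yhd_part1) t group by date,hour;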

# Combine both into a log_result table

  create table if not exists log_result as select date,hour,count(url) PV,count(distinct(guid)) UV from yhd_part1 group by date,hour;

  select date,hour,pv,uv from log_result;

  0: jdbc:hive2://hadoop09-linux-01.ibeifeng.co> select date,hour,pv,uv from log_result;    

  +-------+-------+--------+--------+--+

  | date  | hour  |   pv   |   uv   |

  +-------+-------+--------+--------+--+

  | 28    | 18    | 64972  | 23938  |

  | 28    | 19    | 61162  | 22330  |

  +-------+-------+--------+--------+--+
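
# The HDFS directory used as --export-dir below can be read from the table metadata (see the Location field in the output)

  desc formatted log_result;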

# Export the result table to MySQL with Sqoop

# Create the database and table in MySQL

  create database if not exists track_result;

  use track_result;

  create table if not exists log_track_result(

  date varchar(10) not null,

  hour varchar(10) not null,

  pv varchar(10) not null,

  uv varchar(10) not null,

  primary key(date,hour)

  );

# Use sqoop export to load the log_track_result table (log_result was created by CTAS without a delimiter clause, so its files use Hive's default '\001' field separator, which is why --input-fields-terminated-by '\001' is passed)

  bin/sqoop export \

  --connect jdbc:mysql://hadoop09-linux-01.ibeifeng.com:3306/track_result \

  --username root \

  --password root \

  --table log_track_result \

  --export-dir /user/hive/warehouse/track_log.db/log_result \

  --num-mappers 1 \

  --input-fields-terminated-by '\001'

# Query in MySQL to verify

  select * from log_track_result;

  mysql> select * from log_track_result;

  +------+------+-------+-------+

  | date | hour | pv    | uv    |

  +------+------+-------+-------+

  | 28   | 18   | 64972 | 23938 |

  | 28   | 19   | 61162 | 22330 |

  +------+------+-------+-------+

  2 rows in set (0.00 sec)  

Reposted from blog.csdn.net/onway_goahead/article/details/85408031