3.淘宝购买行为项目——使用Hive+Superset实现数据可视化

1. 热卖品牌Top10数据可视化

1.1 HiveSQL的编写

编写HiveSql

-- Top 10 best-selling brands: per brand, count the log rows that carry a
-- non-null item_id, then keep the 10 brands with the highest count.
-- (The section is a Top-10 report, so the limit is 10.)
SELECT
    brand_id,
    COUNT(item_id) AS sale_num
FROM to_user_log
WHERE brand_id IS NOT NULL
GROUP BY brand_id
DISTRIBUTE BY sale_num
SORT BY sale_num DESC
LIMIT 10;

-- Hive result table for the top-10 best-selling brands.
-- Stored as comma-delimited text, one record per line.
-- The count column is named sale_num (it holds a quantity, not a name).
CREATE TABLE IF NOT EXISTS tm_hot_sale_brand (
    brand_id INT COMMENT "品牌id",
    sale_num INT COMMENT "销售数量",
    date_day STRING COMMENT "添加日期"
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n";

-- Store the top-10 best-selling brands in tm_hot_sale_brand.
-- '20300101' is the fixed load date written to the third column.
INSERT INTO TABLE tm_hot_sale_brand
SELECT
    brand_id,
    COUNT(item_id) AS sale_num,
    '20300101'
FROM to_user_log
WHERE brand_id IS NOT NULL
GROUP BY brand_id
DISTRIBUTE BY sale_num
SORT BY sale_num DESC
LIMIT 10;

在这里插入图片描述

1.2 Hive数据库同步到MySQL数据库

创建MySQL表
在这里插入图片描述

编写sqoop导出数据脚本

[root@node3 ~]# cat export_tm_hot_sale_brand.txt 
# Sqoop options file: export the data under --export-dir into the MySQL table tm_hot_sale_brand.
export
# JDBC URL of the target MySQL database
--connect
jdbc:mysql://node1:3306/taobao
--username
root
# NOTE(review): plain-text password; Sqoop also offers -P and --password-file
--password
123456
# use a single map task for the export
-m
1
# target table and the target columns to populate, in file-field order
--table
tm_hot_sale_brand
--columns
brand_id,sale_num,date_day
# presumably the warehouse directory of the Hive table of the same name
--export-dir
/user/hive_remote/warehouse/taobao.db/tm_hot_sale_brand

使用sqoop导出

[root@node3 ~]# sqoop --options-file export_tm_hot_sale_brand.txt 

在这里插入图片描述

1.3 superset绘制饼图

添加数据集到superset中
在这里插入图片描述
对数据进行设计
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
准备添加图
在这里插入图片描述
选择条件
在这里插入图片描述
在这里插入图片描述
保存可视化结果
在这里插入图片描述
在这里插入图片描述

2.购物达人Top10数据可视化

2.1 Hive SQL编写

-- Top 10 shoppers: per user, count the to_user_log rows with a non-null
-- item_id and keep the 10 users with the highest count.
SELECT
    user_id,
    COUNT(item_id) AS buy_num,
    '20300101'
FROM to_user_log
WHERE user_id IS NOT NULL
GROUP BY user_id
DISTRIBUTE BY buy_num
SORT BY buy_num DESC
LIMIT 10;

-- Hive result table for the top-10 shoppers.
-- date_day is STRING, like the other result tables in this document,
-- because the load date is written as the string literal '20300101'.
CREATE TABLE IF NOT EXISTS tm_shopper_master (
    user_id INT COMMENT "用户id",
    buy_num INT COMMENT "购买数量",
    date_day STRING COMMENT "添加日期"
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n";

-- Store the top-10 shoppers in tm_shopper_master.
-- '20300101' is the fixed load date written to the third column.
INSERT INTO TABLE tm_shopper_master
SELECT
    user_id,
    COUNT(item_id) AS buy_num,
    '20300101'
FROM to_user_log
WHERE user_id IS NOT NULL
GROUP BY user_id
DISTRIBUTE BY buy_num
SORT BY buy_num DESC
LIMIT 10;

在这里插入图片描述

2.2 Hive数据库同步到MySQL数据库

mysql中创建表
在这里插入图片描述

编写sqoop脚本导出数据并执行

[root@node3 ~]# cat export_tm_shopper_master.txt 
# Sqoop options file: export the data under --export-dir into the MySQL table tm_shopper_master.
export
# JDBC URL of the target MySQL database
--connect
jdbc:mysql://node1:3306/taobao
--username
root
# NOTE(review): plain-text password; Sqoop also offers -P and --password-file
--password
123456
# target table and the target columns to populate, in file-field order
--table
tm_shopper_master
--columns
user_id,buy_num,date_day
# presumably the warehouse directory of the Hive table of the same name
--export-dir
/user/hive_remote/warehouse/taobao.db/tm_shopper_master
[root@node3 ~]# sqoop --options-file export_tm_shopper_master.txt 

2.3 superset绘制漏斗图

导入数据表
在这里插入图片描述
编辑表
在这里插入图片描述
绘制图像
在这里插入图片描述
在这里插入图片描述
保存到可视化面板中
在这里插入图片描述

3.回购商品Top50数据可视化

3.1 HiveSQL的编写

在某个品牌下,同一用户购买多次称之为回购:对用户id和品牌id分组后,统计每组中的购买记录数(count(item_id)),再按该次数降序排序即可。

-- Repurchase top 50: per (user, brand) pair, count the to_user_log rows with
-- a non-null item_id and keep the 50 pairs with the highest count.
SELECT
    user_id,
    brand_id,
    COUNT(item_id) AS buy_num,
    '20300101'
FROM to_user_log
WHERE user_id IS NOT NULL
  AND brand_id IS NOT NULL
GROUP BY
    user_id,
    brand_id
DISTRIBUTE BY buy_num
SORT BY buy_num DESC
LIMIT 50;


-- Hive result table for the repurchase top 50.
-- Stored as comma-delimited text, one record per line.
CREATE TABLE IF NOT EXISTS tm_buy_back (
    user_id  INT    COMMENT "用户id",
    brand_id INT    COMMENT "品牌id",
    buy_num  INT    COMMENT "购买数量",
    date_day STRING COMMENT "添加日期"
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n";

-- Store the repurchase top 50 in tm_buy_back.
-- '20300101' is the fixed load date written to the last column.
INSERT INTO TABLE tm_buy_back
SELECT
    user_id,
    brand_id,
    COUNT(item_id) AS buy_num,
    '20300101'
FROM to_user_log
WHERE user_id IS NOT NULL
  AND brand_id IS NOT NULL
GROUP BY
    user_id,
    brand_id
DISTRIBUTE BY buy_num
SORT BY buy_num DESC
LIMIT 50;

在这里插入图片描述

3.2 Hive数据库同步到MySQL数据库

创建MySQL数据表

-- MySQL target table for the repurchase top 50.
-- The name must be tm_buy_back: that is the table the Sqoop export writes to.
CREATE TABLE IF NOT EXISTS tm_buy_back(
  user_id INT COMMENT "用户id",
  brand_id INT COMMENT "品牌id",
  buy_num INT COMMENT "购买数量",
  date_day VARCHAR(10) COMMENT "添加日期"
);

编写脚本,使用Sqoop同步数据库

[root@node3 ~]# cat export_tm_buy_back.txt
# Sqoop options file: export the data under --export-dir into the MySQL table tm_buy_back.
export
# JDBC URL of the target MySQL database
--connect
jdbc:mysql://node1:3306/taobao
--username
root
# NOTE(review): plain-text password; Sqoop also offers -P and --password-file
--password
123456
# use a single map task for the export
-m
1
# target table and the target columns to populate, in file-field order
--table
tm_buy_back
--columns
user_id,brand_id,buy_num,date_day
# presumably the warehouse directory of the Hive table of the same name
--export-dir
/user/hive_remote/warehouse/taobao.db/tm_buy_back
[root@node3 ~]# sqoop --options-file export_tm_buy_back.txt 

在这里插入图片描述

3.3 superset绘制表格

在superset添加并设置表
在这里插入图片描述

绘制table表
在这里插入图片描述
保存到可视化面板中
在这里插入图片描述

superset自定义显示颜色
在这里插入图片描述

4.各年龄段购物数量数据可视化

4.1 HiveSQL的编写

-- Purchase count per age range.
-- age_range can be NULL in to_user_info; NULL is reported as -1.
-- Only users that have at least one matching log row are counted, so the
-- join is written as an inner join.
SELECT
    IF(u.age_range IS NULL, -1, u.age_range) AS age_range,
    COUNT(item_id) AS buy_num
FROM to_user_info u
INNER JOIN to_user_log g
    ON u.id = g.user_id
GROUP BY u.age_range
DISTRIBUTE BY buy_num
SORT BY buy_num DESC;

-- Hive result table for purchase counts per age range.
-- date_day is STRING to match the string literal that is inserted and the
-- VARCHAR(10) column of the MySQL table this data is exported to.
CREATE TABLE IF NOT EXISTS tm_age_range_buy (
    age_range INT COMMENT "用户年龄段",
    buy_num INT COMMENT "购买数量",
    date_day STRING COMMENT "添加日期"
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n";

-- Store the purchase count per age range in tm_age_range_buy.
-- age_range can be NULL in to_user_info; NULL is stored as -1.
-- Only users with at least one matching log row contribute, hence the
-- inner join. '20300101' is the fixed load date.
INSERT INTO TABLE tm_age_range_buy
SELECT
    IF(u.age_range IS NULL, -1, u.age_range) AS age_range,
    COUNT(item_id) AS buy_num,
    '20300101'
FROM to_user_info u
INNER JOIN to_user_log g
    ON u.id = g.user_id
GROUP BY u.age_range
DISTRIBUTE BY buy_num
SORT BY buy_num DESC;

4.2 Hive数据库同步到MySQL数据库

-- MySQL target table that receives the per-age-range purchase counts.
CREATE TABLE IF NOT EXISTS tm_age_range_buy (
    age_range INT         COMMENT "年龄段",
    buy_num   INT         COMMENT "购买数量",
    date_day  VARCHAR(10) COMMENT "添加日期"
);

编写脚本,使用Sqoop同步数据库

[root@node3 ~]# cat export_tm_age_range_buy.txt 
# Sqoop options file: export the data under --export-dir into the MySQL table tm_age_range_buy.
export
# JDBC URL of the target MySQL database
--connect
jdbc:mysql://node1:3306/taobao
--username
root
# NOTE(review): plain-text password; Sqoop also offers -P and --password-file
--password
123456
# use a single map task for the export
-m
1
# target table and the target columns to populate, in file-field order
--table
tm_age_range_buy
--columns
age_range,buy_num,date_day
# presumably the warehouse directory of the Hive table of the same name
--export-dir
/user/hive_remote/warehouse/taobao.db/tm_age_range_buy
[root@node3 ~]# sqoop --options-file export_tm_age_range_buy.txt 

4.3 superset绘制热力图

在这里插入图片描述
发现此时的y轴坐标顺序混乱,不直观
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

5.网站购物行为与性别关系数据可视化

5.1 HiveSQL的编写

性别数据:0 表示女、1 表示男、2 表示保密;原始数据为空(null)时统一记为 3(未知)。

-- Purchase count per gender.
-- A NULL gender is reported as 3; the first column is aliased so the result
-- has a usable column name. Only users with at least one matching log row
-- are counted, so the join is an inner join.
-- The statement is terminated with ';' so the DDL after it can run in the
-- same script.
SELECT
    IF(u.gender IS NULL, 3, u.gender) AS gender,
    COUNT(item_id) AS buy_num,
    '20300101'
FROM to_user_info u
INNER JOIN to_user_log g
    ON u.id = g.user_id
GROUP BY u.gender
DISTRIBUTE BY buy_num
SORT BY buy_num;

-- Hive result table for purchase counts per gender.
-- The date column is named date_day, matching the MySQL table of the same
-- name and the other result tables. The statement ends with ';' so it can be
-- followed by further statements.
CREATE TABLE IF NOT EXISTS tm_gender_buy (
    gender INT COMMENT "性别",
    buy_num INT COMMENT "购买数量",
    date_day STRING COMMENT "添加日期"
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n";

-- Store the purchase count per gender in tm_gender_buy.
-- A NULL gender is stored as 3. Only users with at least one matching log
-- row contribute, hence the inner join. '20300101' is the fixed load date.
-- The statement is terminated with ';'.
INSERT INTO TABLE tm_gender_buy
SELECT
    IF(u.gender IS NULL, 3, u.gender) AS gender,
    COUNT(item_id) AS buy_num,
    '20300101'
FROM to_user_info u
INNER JOIN to_user_log g
    ON u.id = g.user_id
GROUP BY u.gender
DISTRIBUTE BY buy_num
SORT BY buy_num;

在这里插入图片描述

5.2 Hive数据库同步到MySQL数据库

-- MySQL target table that receives the per-gender purchase counts.
CREATE TABLE IF NOT EXISTS tm_gender_buy (
    gender   INT         COMMENT "性别 0表示女, 1表示男, 2表示保密, 3表示未知",
    buy_num  INT         COMMENT "购买数量",
    date_day VARCHAR(10) COMMENT "添加日期"
);

同步数据

[root@node3 ~]# cat export_tm_gender_buy.txt 
# Sqoop options file: export the data under --export-dir into the MySQL table tm_gender_buy.
export
# JDBC URL of the target MySQL database
--connect
jdbc:mysql://node1:3306/taobao
--username
root
# NOTE(review): plain-text password; Sqoop also offers -P and --password-file
--password
123456
# use a single map task for the export
-m
1
--table
tm_gender_buy
# --columns names columns of the MySQL target table, whose date column is date_day
--columns
gender,buy_num,date_day
# presumably the warehouse directory of the Hive table of the same name
--export-dir
/user/hive_remote/warehouse/taobao.db/tm_gender_buy
[root@node3 ~]# sqoop --options-file export_tm_gender_buy.txt 

在这里插入图片描述

5.3 superset绘制饼图

在这里插入图片描述

在这里插入图片描述
在这里插入图片描述

6.品牌内热销商品Top3数据可视化

6.1 HiveSQL的编写

求各个品牌的商品销量最高的Top3

-- Top 3 best-selling items within each brand.
--   item_sales : row count (non-null user_id) per (brand_id, item_id)
--   ranked     : numbers the items of each brand by descending sale_num
-- The outer query keeps positions 1 to 3 of every brand.
SELECT
    brand_id,
    item_id,
    sale_num,
    rank
FROM (
    SELECT
        brand_id,
        item_id,
        sale_num,
        ROW_NUMBER() OVER (PARTITION BY brand_id ORDER BY sale_num DESC) AS rank
    FROM (
        SELECT
            brand_id,
            item_id,
            COUNT(user_id) AS sale_num
        FROM to_user_log
        WHERE brand_id IS NOT NULL
          AND item_id IS NOT NULL
        GROUP BY
            brand_id,
            item_id
        -- CLUSTER BY brand_id is shorthand for
        -- DISTRIBUTE BY brand_id + SORT BY brand_id (ascending)
        CLUSTER BY brand_id
    ) item_sales
) ranked
WHERE rank <= 3;

-- Hive result table for the top-3 items per brand.
-- The date column is named date_day (as in the MySQL table of the same name)
-- and its comment typo is corrected.
CREATE TABLE IF NOT EXISTS tm_brand_item_rank
(
    brand_id INT comment "品牌id",
    item_id  INT comment "商品id",
    sale_num INT comment "销售数量",
    rank     INT comment "商品销量排名",
    date_day STRING comment "添加日期"
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n";

-- Store the top-3 items per brand in tm_brand_item_rank.
--   tba : row count (non-null user_id) per (brand_id, item_id)
--   tbb : numbers the items of each brand by descending sale_num
-- The load date is '20300101', the same value every other result table in
-- this document uses (the original '20030101' had two digits transposed).
FROM (
         SELECT brand_id,
                item_id,
                sale_num,
                ROW_NUMBER() OVER (PARTITION BY brand_id ORDER BY sale_num DESC) rank
         FROM (
                  SELECT brand_id, item_id, COUNT(user_id) sale_num
                  FROM to_user_log
                  WHERE brand_id IS NOT NULL
                    AND item_id IS NOT NULL
                  GROUP BY brand_id, item_id
                  -- CLUSTER BY brand_id is shorthand for
                  -- DISTRIBUTE BY brand_id + SORT BY brand_id (ascending)
                  CLUSTER BY brand_id
              ) tba
) tbb
INSERT INTO tm_brand_item_rank
SELECT brand_id, item_id, sale_num, rank, '20300101'
WHERE rank <= 3;

在这里插入图片描述

6.2 Hive数据库同步到MySQL数据库

# MySQL target table for the top-3 items per brand.
# `rank` is back-quoted because RANK is a reserved word in MySQL 8.0;
# the quoted form is also valid on older versions.
create table if not exists tm_brand_item_rank(
    brand_id int comment '品牌id',
    item_id int comment '商品id',
    sale_num int comment '销售数量',
    `rank` int comment '销量排名',
    date_day varchar(10) comment '添加日期'
);
[root@node3 ~]# cat export_tm_brand_item_rank.txt 
# Sqoop options file: export the data under --export-dir into the MySQL table tm_brand_item_rank.
export
# JDBC URL of the target MySQL database
--connect
jdbc:mysql://node1:3306/taobao
--username
root
# NOTE(review): plain-text password; Sqoop also offers -P and --password-file
--password
123456
# single map task; the documented short option is -m (long form: --num-mappers)
-m
1
# target table and the target columns to populate, in file-field order
--table
tm_brand_item_rank
--columns
brand_id,item_id,sale_num,rank,date_day
# presumably the warehouse directory of the Hive table of the same name
--export-dir
/user/hive_remote/warehouse/taobao.db/tm_brand_item_rank
[root@node3 ~]# sqoop --options-file export_tm_brand_item_rank.txt 

在这里插入图片描述

6.3 superset绘制table图表

在这里插入图片描述
在这里插入图片描述

7.购物记录时间拓宽为年月日数据可视化

7.1 HiveSQL的编写——将数据拓宽为年月日

-- Preview (100 rows): split time_stamp into year / month / day columns.
-- The year is the constant 2030; month = floor(time_stamp / 100) and
-- day = time_stamp % 100.
SELECT
    user_id,
    cat_id,
    brand_id,
    item_id,
    seller_id,
    2030 AS time_year,
    FLOOR(time_stamp / 100) AS time_month,
    time_stamp % 100 AS time_day
FROM to_user_log
WHERE time_stamp IS NOT NULL
LIMIT 100;

-- Intermediate Hive table holding the log rows with the date split into
-- year / month / day. Stored as comma-delimited text, one record per line.
CREATE TABLE IF NOT EXISTS temp_user_log (
    user_id    INT COMMENT "买家id",
    cat_id     INT COMMENT "分类id",
    brand_id   INT COMMENT "品牌id",
    item_id    INT COMMENT "产品id",
    seller_id  INT COMMENT "卖家id",
    time_year  INT COMMENT "年",
    time_month INT COMMENT "月",
    time_day   INT COMMENT "日"
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n";

-- Rebuild the intermediate table: overwrite temp_user_log with every log row
-- that has a time_stamp, split into year (constant 2030), month and day.
-- The statement is terminated with ';'.
INSERT OVERWRITE TABLE temp_user_log
SELECT
    user_id,
    cat_id,
    brand_id,
    item_id,
    seller_id,
    2030 AS time_year,
    FLOOR(time_stamp / 100) AS time_month,
    time_stamp % 100 AS time_day
FROM to_user_log
WHERE time_stamp IS NOT NULL;

在这里插入图片描述

7.2 HiveSQL编写——拓宽周数和星期几

-- Extending the purchase records with week-of-year and day-of-week:
-- scratch checks of the functions used for that. Each statement is
-- terminated with ';' so all four can run as one script.
SELECT WEEKOFYEAR('2030-01-01');
SELECT PMOD(DATEDIFF("2030-01-01", "1970-01-01") - 3, 7);
SELECT CONCAT('aa', 100, 5.5);
SELECT UNIX_TIMESTAMP('2030-01-01', 'yyyy-MM-dd');

-- Preview (50 rows) computed from the intermediate table temp_user_log:
-- builds a 'yyyy-MM-dd' string from year/month/day (zero-padding single-digit
-- month and day) and derives the epoch seconds and the week of the year.
-- The last column is aliased week_year, the name used by the result table.
SELECT
    user_id, cat_id, brand_id, item_id, seller_id, time_year, time_month, time_day,
    UNIX_TIMESTAMP(
        CONCAT(time_year, "-",
               IF(time_month > 9, time_month, CONCAT("0", time_month)), "-",
               IF(time_day > 9, time_day, CONCAT("0", time_day))),
        'yyyy-MM-dd') AS time_stamp,
    WEEKOFYEAR(
        CONCAT(time_year, "-",
               IF(time_month > 9, time_month, CONCAT("0", time_month)), "-",
               IF(time_day > 9, time_day, CONCAT("0", time_day)))) AS week_year
FROM temp_user_log
LIMIT 50;

-- Hive result table: log rows widened with year/month/day, epoch seconds,
-- week of year and day of week. Comma-delimited text, one record per line.
CREATE TABLE IF NOT EXISTS td_userlog_year_month_day_week (
    user_id    INT    COMMENT "买家id",
    cat_id     INT    COMMENT "分类id",
    brand_id   INT    COMMENT "品牌id",
    item_id    INT    COMMENT "产品id",
    seller_id  INT    COMMENT "卖家id",
    time_year  INT    COMMENT "年",
    time_month INT    COMMENT "月",
    time_day   INT    COMMENT "日",
    time_stamp BIGINT COMMENT "时间戳 单位秒",
    week_year  INT    COMMENT "一年中的第几周",
    week_day   INT    COMMENT "星期几"
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n";

-- Fill the result table from temp_user_log.
-- The inner query builds the 'yyyy-MM-dd' string once (zero-padding
-- single-digit month and day); the outer query derives epoch seconds,
-- week of year and day of week from it.
INSERT INTO TABLE td_userlog_year_month_day_week
SELECT
    user_id,
    cat_id,
    brand_id,
    item_id,
    seller_id,
    time_year,
    time_month,
    time_day,
    UNIX_TIMESTAMP(date_str, 'yyyy-MM-dd') AS time_stamp,
    WEEKOFYEAR(date_str) AS week_year,
    -- 1970-01-01 was a Thursday, so (days since then - 3) mod 7 gives
    -- 0 = Sunday, 1 = Monday, ..., 6 = Saturday
    PMOD(DATEDIFF(date_str, "1970-01-01") - 3, 7) AS week_day
FROM (
    SELECT
        user_id,
        cat_id,
        brand_id,
        item_id,
        seller_id,
        time_year,
        time_month,
        time_day,
        CONCAT(time_year, "-",
               IF(time_month > 9, time_month, CONCAT("0", time_month)), "-",
               IF(time_day > 9, time_day, CONCAT("0", time_day))) AS date_str
    FROM temp_user_log
) dated;

7.3 创建MySQL表存储

-- MySQL target table for the widened log rows (same columns as the Hive
-- result table td_userlog_year_month_day_week).
CREATE TABLE IF NOT EXISTS td_userlog_year_month_day_week(
    user_id INT COMMENT "买家id",
    cat_id INT COMMENT "分类id",
    brand_id INT COMMENT "品牌id",
    item_id INT COMMENT "产品id",
    seller_id INT COMMENT "卖家id",
    time_year INT COMMENT "年",
    time_month INT COMMENT "月",
    time_day INT COMMENT "日",
    time_stamp BIGINT COMMENT "时间戳 单位秒",
    week_year INT COMMENT "一年中的第几个星期",
    week_day INT COMMENT "星期几"
);

猜你喜欢

转载自blog.csdn.net/m0_63953077/article/details/130789572