2020.10.14课堂笔记(项目:新零售电商数据仓库系统项目)

相关项目文档:
链接:https://pan.baidu.com/s/1Wk9EfxkDTHdzshGBi6AGJQ
提取码:1124

安装部署文档的书写标准:

1.安装前的资料准备(软件、硬件、其他):
开发环境 生产环境
2.安装中的步骤:
步骤要明确、详细,路径全部使用绝对路径;也可以使用变量,如:SUBJECT_HOME
傻瓜式操作,写脚本一键安装
3.安装过程中可能出现的异常处理(回滚)
4.完成安装之后的测试:
5.完成整个部署后的环境清理:
删除脚本、生成的测试文件、给实施人员看的文档不应该留在生产环境上

数据来源技术:

网页埋点/日志:logstash/flume等日志收集工具
数据库信息:sqoop等数据抽取工具
各种形式文件(csv、js):可以放置hdfs或直接load进hive表

将原始数据导入MySQL数据库:

3.新零售数仓项目\02-项目数据\01-原始数据\snbap_ods.sql

source /root/day02/snbap_ods.sql

通过sqoop将MySQL里的表导入到hive中:

# Export the MySQL table `user` to the plain HDFS directory /data/snbap_ods/user with 10 mappers.
bin/sqoop-import --connect jdbc:mysql://hadoop100:3306/snbap_ods --username root --password ok --table user --target-dir /data/snbap_ods/user --m 10

# Import each source table into the Hive database snbap_ods (same table name), 3 mappers each,
# in the same order as the individual commands.
for tbl in user user_extend user_pc_click_log user_app_click_log us_order order_item order_delivery; do
  bin/sqoop-import --connect jdbc:mysql://hadoop100:3306/snbap_ods --username root --password ok --table "$tbl" --hive-import --hive-table "snbap_ods.$tbl" --m 3
done

补充:导入user_addr

# Supplementary import: MySQL table user_addr -> Hive table user_addr in database snbap_ods,
# using --create-hive-table and 3 mappers.
./sqoop import \
  --connect jdbc:mysql://hadoop100:3306/snbap_ods \
  --username root \
  --password ok \
  --table user_addr \
  --hive-import \
  --create-hive-table \
  --hive-database snbap_ods \
  --hive-table user_addr \
  --m 3

重点表字段罗列如下:

在这里插入图片描述在这里插入图片描述在这里插入图片描述

-- Create the DW-layer database; `if not exists` keeps the script re-runnable.
create database if not exists snbap_dw;

-- Wide user profile table built by inner-joining user with user_extend on user_id.
-- The source tables are qualified with snbap_ods so the statement does not depend on
-- the session's current database, and `user` is back-quoted because USER is a
-- reserved keyword in newer Hive releases.
create table snbap_dw.user_basic as
select u.user_id,user_name,u.user_gender,user_birthday,user_age,
constellation,province,city,city_level,e_mail,op_mail,mobile,
num_seg_mobile,op_Mobile,register_time,login_ip,login_source,
request_user,total_score,used_score,is_blacklist,is_married,
education,monthly_income,profession,is_pregnant_woman,is_have_children,
is_have_car,phone_brand,phone_brand_level,phone_cnt,change_phone_cnt,
is_maja,majia_account_cnt,loyal_model,shopping_type_model,weight,height
from snbap_ods.`user` u
join snbap_ods.user_extend e on u.user_id=e.user_id;

如果表位置错误可以采用如下方法:

-- Recovery when user_basic was created in snbap_ods: copy it into snbap_dw,
-- then drop the misplaced table.
create table snbap_dw.user_basic as
select *
from snbap_ods.user_basic;

drop table snbap_ods.user_basic;

最近一次的pc端用户访问情况

-- Most recent PC visit per user.
-- rn = 1 keeps the row(s) with the latest visit_time for each user (ties are kept).
-- cnt is a running count of page_id ordered by ascending visit_time, so on the
-- latest row it covers all of the user's rows.
create table snbap_dw.user_last_pc_click as
with ranked as (
  select
    user_id,
    session_id,
    cookie_id,
    visit_os,
    browser_name,
    visit_time,
    rank() over (partition by user_id order by visit_time desc) rn,
    count(page_id) over (partition by user_id order by visit_time) cnt
  from snbap_ods.user_pc_click_log
)
select distinct *
from ranked
where rn = 1;

第一次的pc端用户访问情况

-- Earliest PC visit per user.
-- rn = 1 keeps the row(s) with the earliest visit_time for each user (ties are kept).
-- cnt is a running count of page_id ordered by ascending visit_time, so on the
-- earliest row it only covers rows sharing that first visit_time.
create table snbap_dw.user_first_pc_click as
with ranked as (
  select
    user_id,
    session_id,
    cookie_id,
    visit_os,
    browser_name,
    visit_time,
    rank() over (partition by user_id order by visit_time) rn,
    count(page_id) over (partition by user_id order by visit_time) cnt
  from snbap_ods.user_pc_click_log
)
select distinct *
from ranked
where rn = 1;

pc连续7天访问次数(跑任务的日期的前7天)

unix_timestamp() 得到当前时间戳
unix_timestamp(string date)
带日期参数调用 unix_timestamp() 时,返回从 '1970-01-01 00:00:00' 到指定日期的秒数;参数格式须为 'yyyy-MM-dd HH:mm:ss'。
unix_timestamp(string date, string pattern)
指定输入时间的格式,返回自1970年起的秒数:unix_timestamp('2009-03-20', 'yyyy-MM-dd') = 1237532400

-- PC page views per user in the 7 days up to the job date ('2020-02-11').
-- date_sub() accepts the 'yyyy-MM-dd' string directly. The earlier
-- unix_timestamp('2020-02-11') call used the default 'yyyy-MM-dd HH:mm:ss' pattern,
-- which a date-only string does not match, so the lower bound was not a valid date
-- and the filter could not select the intended range.
-- user_id is returned so each count can be attributed to its user.
select user_id,count(page_id) pv_cnt
from snbap_ods.user_pc_click_log
where visit_time between date_sub('2020-02-11',7) and '2020-02-11'
group by user_id;

或者:

-- PC page views per user in the 7 days up to the day the query runs.
-- user_id is returned so each count can be attributed to its user, and the statement
-- is terminated so it can be run as part of a script.
select user_id,count(page_id) pv_cnt
from snbap_ods.user_pc_click_log
where visit_time between date_sub(current_date(),7) and current_date()
group by user_id;

用户访问次数季度统计表

-- Site-wide PC visit totals for the trailing 7/15/30/60/90 days (one output row).
-- All five windows end at current_date() and lie inside the 90-day window, so a single
-- scan with conditional counts replaces the five separate scans and the comma cross join.
-- NOTE(review): the table name is unqualified, so it is created in the current database;
-- a later statement in these notes creates snbap_dw.user_pc_visit_total with a different
-- shape - confirm which database this one is meant for.
create table user_pc_visit_total as
select
  count(case when visit_time between date_sub(current_date(),7) and current_date() then visit_time end) cnt1,
  count(case when visit_time between date_sub(current_date(),15) and current_date() then visit_time end) cnt2,
  count(case when visit_time between date_sub(current_date(),30) and current_date() then visit_time end) cnt3,
  count(case when visit_time between date_sub(current_date(),60) and current_date() then visit_time end) cnt4,
  count(visit_time) cnt5
from snbap_dw.user_pc_click_partition
where visit_time between date_sub(current_date(),90) and current_date();

创建分区表user_pc_click_partition

-- DW-layer PC click log, partitioned by visit_date.
create table snbap_dw.user_pc_click_partition (
  log_id       bigint,
  user_id      bigint,
  session_id   string,
  cookie_id    string,
  visit_time   string,
  visit_url    string,
  visit_os     string,
  browser_name string,
  visit_ip     string,
  province     string,
  city         string,
  page_id      int,
  goods_id     bigint,
  shop_id      bigint
)
partitioned by (visit_date string);

-- Rename column page_id to pv; the type stays int.
alter table snbap_dw.user_pc_click_partition change page_id pv int;

开启动态分区,非严格模式,设置分区参数上限:

-- Enable dynamic partitioning for the insert that follows.
-- Fixes two typos: the property is hive.exec.dynamic.partition (not hive.ecec...),
-- and the documented mode value is nonstrict (not nostrict).
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- Upper limits on dynamic partitions: per node and in total.
set hive.exec.max.dynamic.partitions.pernode=10000;
set hive.exec.max.dynamic.partitions=50000;

插入数据:

-- Load the ODS PC click log into the partitioned DW table.
-- pv = number of non-null page_id values sharing the same (user_id, visit_time).
-- The last select expression, date(visit_time), feeds the dynamic partition visit_date.
insert into snbap_dw.user_pc_click_partition partition (visit_date)
select
  log_id, user_id, session_id, cookie_id,
  visit_time, visit_url, visit_os, browser_name,
  visit_ip, province, city,
  count(page_id) over (partition by user_id, visit_time) pv,
  goods_id, shop_id,
  date(visit_time)
from snbap_ods.user_pc_click_log;

查看数据pv,但是这里的pv值是没有聚合的,pv=n代表有n条记录。

-- Spot check: rows whose pv equals 3.
select user_id, pv
from snbap_dw.user_pc_click_partition
where pv = 3;

同理创建app端的分区表:user_app_click_partition

-- DW-layer app click log, partitioned by log_date.
create table snbap_dw.user_app_click_partition (
  user_id      bigint,
  imei         string,
  log_time     string,
  visit_os     string,
  os_version   string,
  app_name     string,
  app_version  string,
  device_token string,
  visit_ip     string,
  pv           int,
  province     string,
  city         string
)
partitioned by (log_date string);

设置动态分区相关参数:

-- Enable dynamic partitioning for the insert that follows.
-- Fixes two typos: the property is hive.exec.dynamic.partition (not hive.ecec...),
-- and the documented mode value is nonstrict (not nostrict).
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- Upper limits on dynamic partitions: per node and in total.
set hive.exec.max.dynamic.partitions.pernode=10000;
set hive.exec.max.dynamic.partitions=50000;
-- Run the job with 15 reduce tasks.
set mapred.reduce.tasks = 15;

向表中插入数据:

-- Load the ODS app click log into the partitioned DW table.
-- pv = number of rows sharing the same (user_id, log_time).
-- The last select expression, date(log_time), feeds the dynamic partition log_date.
insert into snbap_dw.user_app_click_partition partition (log_date)
select
  user_id, imei, log_time, visit_os,
  os_version, app_name, app_version, device_token,
  visit_ip,
  count(1) over (partition by user_id, log_time) pv,
  province, city,
  date(log_time)
from snbap_ods.user_app_click_log;

连续不同天数的访问次数

-- PC visit counts per user for the trailing 7/15/30/60/90 days.
-- All five windows end at current_date() and lie inside the 90-day window, so one scan
-- with conditional counts is enough. The former chain of full joins compared every CTE
-- with r1.user_id only; for a user absent from r1 that key is NULL, the later joins
-- could not match, and such a user came out as several partial rows.
-- Output columns are unchanged: user_id, week, half, mon, mon2, season.
create table snbap_dw.user_pc_visit_total as
select
  user_id,
  count(case when visit_time between date_sub(current_date(),7) and current_date() then visit_time end) week,
  count(case when visit_time between date_sub(current_date(),15) and current_date() then visit_time end) half,
  count(case when visit_time between date_sub(current_date(),30) and current_date() then visit_time end) mon,
  count(case when visit_time between date_sub(current_date(),60) and current_date() then visit_time end) mon2,
  count(visit_time) season
from snbap_dw.user_pc_click_partition
where visit_time between date_sub(current_date(),90) and current_date()
group by user_id;

近30天不同时间段的pv访问量

-- PC page views per user over the last 30 days (filtered on partition column visit_date),
-- broken down by hour-of-day bucket, plus distinct visit_ip and cookie_id counts.
create table snbap_dw.user_pv_30_total as
select
  user_id,
  count(pv)                                                                  pv_30_total,
  sum(case when hour(visit_time) between 0  and 5  then 1 else 0 end)        pv_30_0_5,
  sum(case when hour(visit_time) between 6  and 7  then 1 else 0 end)        pv_30_6_7,
  sum(case when hour(visit_time) between 8  and 9  then 1 else 0 end)        pv_30_8_9,
  sum(case when hour(visit_time) between 10 and 11 then 1 else 0 end)        pv_30_10_11,
  sum(case when hour(visit_time) between 12 and 13 then 1 else 0 end)        pv_30_12_13,
  sum(case when hour(visit_time) between 14 and 15 then 1 else 0 end)        pv_30_14_15,
  sum(case when hour(visit_time) between 16 and 17 then 1 else 0 end)        pv_30_16_17,
  sum(case when hour(visit_time) between 18 and 19 then 1 else 0 end)        pv_30_18_19,
  sum(case when hour(visit_time) between 20 and 21 then 1 else 0 end)        pv_30_20_21,
  sum(case when hour(visit_time) between 22 and 23 then 1 else 0 end)        pv_30_22_23,
  count(distinct visit_ip)                                                   visit_ip_30_cnt,
  count(distinct cookie_id)                                                  cookie_30_cnt
from snbap_dw.user_pc_click_partition
where visit_date between date_sub(current_date(),30) and current_date()
group by user_id;

30天访问最多的ip

-- Most frequently used visit_ip per user over the last 30 days (by visit_date).
-- row_number() with visit_ip as tie-breaker keeps exactly one row per user. dense_rank()
-- returned every tied value, which multiplies rows when this table is joined by user_id.
create table snbap_dw.user_visit_30_popular_ip as
with ip_usage as (
  select user_id,visit_ip,count(visit_ip) ip_cnt
  from snbap_dw.user_pc_click_partition
  where visit_date between date_sub(current_date(),30) and current_date()
  group by user_id,visit_ip
),
ranked as (
  select user_id,visit_ip,
    row_number() over (partition by user_id order by ip_cnt desc, visit_ip) rn
  from ip_usage
)
select user_id,visit_ip
from ranked
where rn=1;

30天使用最多的cookie_id

-- Most frequently used cookie_id per user over the last 30 days (by visit_date).
-- row_number() with cookie_id as tie-breaker keeps exactly one row per user. dense_rank()
-- returned every tied value, which multiplies rows when this table is joined by user_id.
create table snbap_dw.user_30_cookie_popular_id as
with cookie_usage as (
  select user_id,cookie_id,count(cookie_id) cookie_cnt
  from snbap_dw.user_pc_click_partition
  where visit_date between date_sub(current_date(),30) and current_date()
  group by user_id,cookie_id
),
ranked as (
  select user_id,cookie_id,
    row_number() over (partition by user_id order by cookie_cnt desc, cookie_id) rn
  from cookie_usage
)
select user_id,cookie_id
from ranked
where rn=1;

30天使用最多的浏览器

-- Most frequently used browser_name per user over the last 30 days (by visit_date).
-- row_number() with browser_name as tie-breaker keeps exactly one row per user. dense_rank()
-- returned every tied value, which multiplies rows when this table is joined by user_id.
create table snbap_dw.user_30_popula_browser as
with browser_usage as (
  select user_id,browser_name,count(browser_name) browser_cnt
  from snbap_dw.user_pc_click_partition
  where visit_date between date_sub(current_date(),30) and current_date()
  group by user_id,browser_name
),
ranked as (
  select user_id,browser_name,
    row_number() over (partition by user_id order by browser_cnt desc, browser_name) rn
  from browser_usage
)
select user_id,browser_name
from ranked
where rn=1;

30天使用最多的操作系统

-- Most frequently used visit_os per user over the last 30 days (by visit_date).
-- row_number() with visit_os as tie-breaker keeps exactly one row per user. dense_rank()
-- returned every tied value, which multiplies rows when this table is joined by user_id.
create table snbap_dw.user_visit_30_popular_os as
with os_usage as (
  select user_id,visit_os,count(visit_os) os_cnt
  from snbap_dw.user_pc_click_partition
  where visit_date between date_sub(current_date(),30) and current_date()
  group by user_id,visit_os
),
ranked as (
  select user_id,visit_os,
    row_number() over (partition by user_id order by os_cnt desc, visit_os) rn
  from os_usage
)
select user_id,visit_os
from ranked
where rn=1;

创建数据库:snbap_dm

-- Create the DM-layer database; `if not exists` keeps the script re-runnable.
create database if not exists snbap_dm;

PC端 指标 综合

-- Consolidated PC-side metrics, one wide table keyed by user_id.
-- Driving table is user_last_pc_click (lc), inner-joined to user_first_pc_click (fc);
-- the window counts (vt), 30-day pv breakdown (pv) and the four "most popular" tables
-- (vi, vo, b, ci) are left-joined, so their columns are NULL for users without rows there.
-- vt.mon is selected twice, as visit_month_cnt and as pc_30_cnt.
-- avg_30_total is pv_30_total divided by 30, rounded to 2 decimals.
create table snbap_dm.user_pc_visit as
select lc.user_id, lc.visit_time last_visit,lc.session_id last_session,
lc.cookie_id last_cookie,lc.cnt last_pv,lc.browser_name last_browser,
lc.visit_os last_os,fc.visit_time first_visit,fc.session_id first_session,
fc.cookie_id first_cookie,fc.cnt first_pv,fc.browser_name first_browser,
fc.visit_os first_os,vt.week visit_week_cnt,vt.half visit_2weeks_cnt,vt.mon visit_month_cnt,
vt.mon2 visit_mon2_cnt,vt.season visit_season_cnt,vt.mon pc_30_cnt,
pv.pv_30_total,round(pv.pv_30_total/30,2) avg_30_total,
pv.pv_30_0_5,pv.pv_30_6_7,pv.pv_30_8_9,pv.pv_30_10_11,
pv.pv_30_12_13,pv.pv_30_14_15,pv.pv_30_16_17,pv.pv_30_18_19,pv.pv_30_20_21,
pv.pv_30_22_23,pv.visit_ip_30_cnt,vi.visit_ip popular_ip,pv.cookie_30_cnt,
ci.cookie_id popular_cookie,b.browser_name popular_browser,vo.visit_os popular_vo
from
snbap_dw.user_last_pc_click lc join
snbap_dw.user_first_pc_click fc on lc.user_id=fc.user_id left join
snbap_dw.user_pc_visit_total vt on lc.user_id=vt.user_id left join
snbap_dw.user_pv_30_total pv on lc.user_id=pv.user_id left join
snbap_dw.user_visit_30_popular_ip vi on lc.user_id=vi.user_id left join
snbap_dw.user_visit_30_popular_os vo on lc.user_id=vo.user_id left join
snbap_dw.user_30_popula_browser b on lc.user_id=b.user_id left join
snbap_dw.user_30_cookie_popular_id ci on lc.user_id=ci.user_id

app端:

最近一次app访问日期,app名称,app操作系统

-- Most recent app visit per user: log time, app name and operating system.
-- rn = 1 keeps the row(s) with the latest log_time for each user (ties are kept).
create table snbap_dw.user_last_app_click as
with ranked as (
  select
    user_id,
    log_time,
    app_name,
    visit_os,
    rank() over (partition by user_id order by log_time desc) rn
  from snbap_ods.user_app_click_log
)
select distinct *
from ranked
where rn = 1;

第一次app访问日期,app名称,app操作系统,app访问ip

-- Earliest app visit per user: log time, app name, operating system and visit IP.
-- rn = 1 keeps the row(s) with the earliest log_time for each user (ties are kept).
create table snbap_dw.user_first_app_click as
with ranked as (
  select
    user_id,
    log_time,
    app_name,
    visit_os,
    visit_ip,
    rank() over (partition by user_id order by log_time) rn
  from snbap_ods.user_app_click_log
)
select distinct *
from ranked
where rn = 1;

连续不同天数的访问次数

-- App visit counts per user for the trailing 7/15/30/60/90 days.
-- All five windows end at current_date() and lie inside the 90-day window, so one scan
-- with conditional counts is enough. The former chain of full joins compared every CTE
-- with r1.user_id only; for a user absent from r1 that key is NULL, the later joins
-- could not match, and such a user came out as several partial rows.
-- Output columns are unchanged: user_id, week, half, mon, mon2, season.
create table snbap_dw.user_app_visit_total as
select
  user_id,
  count(case when log_time between date_sub(current_date(),7) and current_date() then log_time end) week,
  count(case when log_time between date_sub(current_date(),15) and current_date() then log_time end) half,
  count(case when log_time between date_sub(current_date(),30) and current_date() then log_time end) mon,
  count(case when log_time between date_sub(current_date(),60) and current_date() then log_time end) mon2,
  count(log_time) season
from snbap_dw.user_app_click_partition
where log_time between date_sub(current_date(),90) and current_date()
group by user_id;

app近30天不同时间段的pv数量

-- App page views per user over the last 30 days (filtered on partition column log_date),
-- broken down by hour-of-day bucket.
create table snbap_dw.user_app_pv_30_total as
select
  user_id,
  count(pv)                                                                pv_30_total,
  sum(case when hour(log_time) between 0  and 5  then 1 else 0 end)        pv_30_0_5,
  sum(case when hour(log_time) between 6  and 7  then 1 else 0 end)        pv_30_6_7,
  sum(case when hour(log_time) between 8  and 9  then 1 else 0 end)        pv_30_8_9,
  sum(case when hour(log_time) between 10 and 11 then 1 else 0 end)        pv_30_10_11,
  sum(case when hour(log_time) between 12 and 13 then 1 else 0 end)        pv_30_12_13,
  sum(case when hour(log_time) between 14 and 15 then 1 else 0 end)        pv_30_14_15,
  sum(case when hour(log_time) between 16 and 17 then 1 else 0 end)        pv_30_16_17,
  sum(case when hour(log_time) between 18 and 19 then 1 else 0 end)        pv_30_18_19,
  sum(case when hour(log_time) between 20 and 21 then 1 else 0 end)        pv_30_20_21,
  sum(case when hour(log_time) between 22 and 23 then 1 else 0 end)        pv_30_22_23
from snbap_dw.user_app_click_partition
where log_date between date_sub(current_date(),30) and current_date()
group by user_id;

app端 指标 综合

-- Consolidated app-side metrics keyed by user_id.
-- user_last_app_click (lc) is inner-joined to user_first_app_click (fc); the window
-- counts (vt) and the 30-day hourly breakdown (pv) are left-joined.
create table snbap_dm.user_app_visit as
select
  lc.user_id,
  lc.log_time  last_visit,
  lc.app_name  last_appname,
  lc.visit_os  last_visitos,
  fc.log_time  first_visit,
  fc.app_name  first_appname,
  fc.visit_os  first_visitos,
  fc.visit_ip,
  vt.week,
  vt.half,
  vt.mon,
  vt.mon2,
  vt.season,
  pv.pv_30_0_5,
  pv.pv_30_6_7,
  pv.pv_30_8_9,
  pv.pv_30_10_11,
  pv.pv_30_12_13,
  pv.pv_30_14_15,
  pv.pv_30_16_17,
  pv.pv_30_18_19,
  pv.pv_30_20_21,
  pv.pv_30_22_23
from snbap_dw.user_last_app_click lc
join snbap_dw.user_first_app_click fc on lc.user_id = fc.user_id
left join snbap_dw.user_app_visit_total vt on lc.user_id = vt.user_id
left join snbap_dw.user_app_pv_30_total pv on lc.user_id = pv.user_id;

最近一次访问的ip 城市 省份

-- Combined (PC + app) metrics: IP, city and province of each user's most recent visit.
-- PC rows and app rows are stacked with union all (app log_time is exposed as visit_time),
-- then the row(s) with the latest visit_time per user are kept.
create table snbap_dw.user_last_app_pc_visit as
with all_visits as (
  select user_id, visit_time, visit_ip, city, province
  from snbap_dw.user_pc_click_partition
  union all
  select user_id, log_time visit_time, visit_ip, city, province
  from snbap_dw.user_app_click_partition
),
ranked as (
  select
    user_id,
    visit_ip,
    city,
    province,
    visit_time,
    rank() over (partition by user_id order by visit_time desc) rn
  from all_visits
)
select distinct user_id, visit_ip, city, province
from ranked
where rn = 1;

第一次访问的ip 城市 省份

-- Combined (PC + app) metrics: IP, city and province of each user's first visit.
-- PC rows and app rows are stacked with union all (app log_time is exposed as visit_time),
-- then the row(s) with the earliest visit_time per user are kept.
create table snbap_dw.user_first_app_pc_visit as
with all_visits as (
  select user_id, visit_time, visit_ip, city, province
  from snbap_dw.user_pc_click_partition
  union all
  select user_id, log_time visit_time, visit_ip, city, province
  from snbap_dw.user_app_click_partition
),
ranked as (
  select
    user_id,
    visit_ip,
    city,
    province,
    visit_time,
    rank() over (partition by user_id order by visit_time) rn
  from all_visits
)
select distinct user_id, visit_ip, city, province
from ranked
where rn = 1;

order_type 0线下订单 1线上订单
order_status 0 已成交 1已收货 2已取消
pay_type 0现金支付 1第三方支付 2银行卡/网银支付
pay_status 0未支付 1已支付

-- Generate test data. order_type: 0 offline order, 1 online order. order_status: 0 completed, 1 received, 2 cancelled.
-- pay_type: 0 cash, 1 third-party payment, 2 bank card / online banking. pay_status: 0 unpaid, 1 paid.
-- Each statement re-inserts every existing us_order row (same order_no, user_id, user_name) with
-- random money/type/status values, so each run doubles the table.
-- This first insert fills update_time with the current timestamp and leaves order_date unset.
insert into snbap_ods.us_order(order_no,user_id,user_name,order_money,order_type,order_status,pay_status,pay_type,update_time) 
select order_no,user_id,user_name,round(rand()*100,2),round(rand()),round(rand()*2),round(rand()),round(rand()*2),current_timestamp() 
from snbap_ods.us_order;

-- Second pass: same generation, but fills order_date instead of update_time.
insert into snbap_ods.us_order(order_no,user_id,user_name,order_money,order_type,order_status,pay_status,pay_type,order_date) 
select order_no,user_id,user_name,round(rand()*100,2),round(rand()),round(rand()*2),round(rand()),round(rand()*2),current_timestamp() 
from snbap_ods.us_order;

查看数据的记录条数:

-- Row counts per value of each generated code column.
-- Each statement is now terminated with a semicolon; without terminators the four
-- selects run together as one invalid statement when executed as a script.
select count(1),order_type from snbap_ods.us_order group by order_type;
select count(1),order_status from snbap_ods.us_order group by order_status;
select count(1),pay_type from snbap_ods.us_order group by pay_type;
select count(1),pay_status from snbap_ods.us_order group by pay_status;

-- Remove the existing snbap_dw.user_order_info table.
drop table snbap_dw.user_order_info;

首单,末单时间及距今时间

-- Per-user order summary built from snbap_ods.us_order:
--   * first/last order date and the number of days from each to current_date();
--   * order count and order money for the last 30/60/90 days, once excluding
--     order_status = 2 (order_cnt_N / order_money_N) and once over all orders (*_all);
--   * max/min order money, lifetime count and money excluding order_status = 2,
--     lifetime money over all orders;
--   * last_retreat_time = latest update_time among orders with order_status = 2.
create table snbap_dw.user_order_info as
select user_id,min(order_date) first_order_date,max(order_date) last_order_date,
datediff(current_date(),min(order_date)) first_order_days,
datediff(current_date(),max(order_date)) last_order_days,
sum(case when order_status<>2 and order_date between date_sub(current_date(),30) and current_date() then 1 else 0 end) order_cnt_30,
sum(case when order_status<>2 and order_date between date_sub(current_date(),30) and current_date() then order_money else 0 end) order_money_30,
sum(case when order_status<>2 and order_date between date_sub(current_date(),60) and current_date() then 1 else 0 end) order_cnt_60,
sum(case when order_status<>2 and order_date between date_sub(current_date(),60) and current_date() then order_money else 0 end) order_money_60,
sum(case when order_status<>2 and order_date between date_sub(current_date(),90) and current_date() then 1 else 0 end) order_cnt_90,
sum(case when order_status<>2 and order_date between date_sub(current_date(),90) and current_date() then order_money else 0 end) order_money_90,
sum(case when order_date between date_sub(current_date(),30) and current_date() then 1 else 0 end) order_cnt_30_all,
sum(case when order_date between date_sub(current_date(),30) and current_date() then order_money else 0 end) order_money_30_all,
sum(case when order_date between date_sub(current_date(),60) and current_date() then 1 else 0 end) order_cnt_60_all,
sum(case when order_date between date_sub(current_date(),60) and current_date() then order_money else 0 end) order_money_60_all,
sum(case when order_date between date_sub(current_date(),90) and current_date() then 1 else 0 end) order_cnt_90_all,
sum(case when order_date between date_sub(current_date(),90) and current_date() then order_money else 0 end) order_money_90_all,
max(order_money) max_order_money,
min(order_money) min_order_money,
sum(case when order_status<>2 then 1 else 0 end) total_order_cnt,
sum(case when order_status<>2 then order_money else 0 end) total_order_money,
sum(order_money) total_order_money_all,
max(case when order_status=2 then update_time else null end) last_retreat_time
from snbap_ods.us_order
group by user_id

退货商品数量,金额

-- Returned goods per user: quantity (sum of goods_amount) and money (sum of cost_price),
-- counting only rows where order_status = 2.
-- NOTE(review): us_order and order_item are joined on user_id, so every order of a user is
-- paired with every item row of that user; this looks like an unintended fan-out. If
-- order_item carries an order_id column (schema not visible here), joining on it is
-- presumably what was meant - confirm against the table definition.
create table snbap_dw.dwd_user_retreat as
select u.user_id,
sum(case when order_status=2 then goods_amount else 0 end) retret_goods_num,
sum(case when order_status=2 then cost_price else 0 end) retret_goods_money
from snbap_ods.us_order u join snbap_ods.order_item i on u.user_id=i.user_id group by u.user_id

最常用的收货地址

-- Most frequently used delivery address per user.
-- row_number() with address as tie-breaker keeps exactly one row per user. rank() returned
-- every tied address, which multiplies rows when this table is joined by user_id.
create table snbap_dw.dwd_user_address as
with addr_usage as (
  select user_id,address,count(1) addr_cnt
  from snbap_ods.order_delivery d
  join snbap_ods.us_order u on d.order_id=u.order_id
  group by user_id,address
),
ranked as (
  select user_id,address,
    row_number() over (partition by user_id order by addr_cnt desc, address) rn
  from addr_usage
)
select user_id,address popular_address
from ranked
where rn=1;

最常用的支付方式

-- Most frequently used pay_type per user.
-- row_number() with pay_type as tie-breaker keeps exactly one row per user. rank() returned
-- every tied pay_type, which multiplies rows when this table is joined by user_id.
create table snbap_dw.dwd_user_popular_pay as
with pay_usage as (
  select user_id,pay_type,count(1) pay_cnt
  from snbap_ods.us_order
  group by user_id,pay_type
),
ranked as (
  select user_id,pay_type,
    row_number() over (partition by user_id order by pay_cnt desc, pay_type) rn
  from pay_usage
)
select user_id,pay_type popular_pay_type
from ranked
where rn=1;

最终

-- Final order-metrics table: every column of user_order_info plus the most popular
-- address, most popular pay type and the returned-goods figures.
-- All three joins are inner joins on user_id, so a user missing from any of the
-- three side tables does not appear in the result.
create table snbap_dm.user_order_info_final as
select
  u.*,
  popular_address,
  popular_pay_type,
  retret_goods_num,
  retret_goods_money
from snbap_dw.user_order_info u
join snbap_dw.dwd_user_address a     on u.user_id = a.user_id
join snbap_dw.dwd_user_popular_pay p on u.user_id = p.user_id
join snbap_dw.dwd_user_retreat r     on u.user_id = r.user_id;

查询结果表

-- Inspect the final order-metrics table.
select * from snbap_dm.user_order_info_final;

用户下单分布

-- Order distribution totals (single output row) over user_addr joined to us_order
-- (on user_id) and order_delivery (on order_id):
--   * user_order_flag 1 / 2 / 3 are counted into school_total / company_total / home_total;
--   * orders are bucketed by hour(order_date) into five time-of-day totals.
-- NOTE(review): hour() yields 0-23, so the upper bound 24 of the last bucket never matches.
create table snbap_dw.user_order_falg_time_total as
select
  sum(case user_order_flag when 1 then 1 else 0 end)                  school_total,
  sum(case user_order_flag when 2 then 1 else 0 end)                  company_total,
  sum(case user_order_flag when 3 then 1 else 0 end)                  home_total,
  sum(case when hour(order_date) between 0  and 5  then 1 else 0 end) early_morning_total,
  sum(case when hour(order_date) between 6  and 12 then 1 else 0 end) morning_total,
  sum(case when hour(order_date) between 13 and 15 then 1 else 0 end) noon_total,
  sum(case when hour(order_date) between 16 and 20 then 1 else 0 end) afternoon_total,
  sum(case when hour(order_date) between 21 and 24 then 1 else 0 end) evening_total
from snbap_ods.user_addr s
join snbap_ods.us_order u       on s.user_id = u.user_id
join snbap_ods.order_delivery d on u.order_id = d.order_id;

查看Linux系统的空间使用情况:df -h

[root@hadoop100 tmp]# df -h
Filesystem               Size  Used Avail Use% Mounted on
/dev/mapper/centos-root   17G   16G  1.4G  93% /
devtmpfs                 1.9G     0  1.9G   0% /dev
tmpfs                    1.9G     0  1.9G   0% /dev/shm
tmpfs                    1.9G   12M  1.9G   1% /run
tmpfs                    1.9G     0  1.9G   0% /sys/fs/cgroup
/dev/sda1               1014M  146M  869M  15% /boot
tmpfs                    378M     0  378M   0% /run/user/0

查看某一级目录下文件/文件夹占用的空间大小,du -h -x --max-depth=1
--max-depth=1 表示深度为1:只统计当前目录下的第一级子目录,不再展开更深层的子目录

[root@hadoop100 tmp]# du -h -x --max-depth=1
999M    ./dfs
7.3G    ./nm-local-dir
68M     ./mapred
8.3G    .
[root@hadoop100 /]# cd /
[root@hadoop100 /]# du -h -x --max-depth=1
31M     ./etc
459M    ./root
835M    ./var
11M     ./tmp
1.6G    ./usr
112K    ./home
0       ./media
540K    ./mnt
8.6G    ./opt
0       ./srv
12G     .

YARN NodeManager 的本地目录(nm-local-dir)位于 tmp 目录下,其中的 usercache 文件夹按用户存放缓存,root 表示具体用户

/opt/hadoop/tmp/nm-local-dir/usercache/root

猜你喜欢

转载自blog.csdn.net/m0_48758256/article/details/109056448