Report Design and Application in a Data Warehouse Pipeline

Copyright notice: this is the author's original article; do not repost without permission. https://blog.csdn.net/shuimofengyang/article/details/84523670

1. Background: first use sqoop to pull a full extract from MySQL, then pull daily increments and merge them into the full table; together these form the ODS layer. Business processing over ODS produces the DWD layer, and the aggregated results land in the DM layer; finally, sqoop exports the report back to MySQL.
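The stages below are independent shell scripts. A minimal driver sketch for chaining them daily is shown here, assuming the stage scripts are saved under the hypothetical names ods_incr.sh, dwd_build.sh, dm_report.sh and dm_export.sh:

#!/bin/bash
# Minimal pipeline driver (sketch). The script names are placeholders for the
# stage scripts listed in this article.
set -e                                           # stop at the first failed stage

BIZ_DATE=${1:-$(date -d yesterday +'%Y-%m-%d')}  # default: process yesterday

bash ods_incr.sh  "$BIZ_DATE"   # ODS: incremental sqoop pull, merged into the full table
bash dwd_build.sh "$BIZ_DATE"   # DWD: business processing in Hive
bash dm_report.sh "$BIZ_DATE"   # DM:  assemble the report table
bash dm_export.sh "$BIZ_DATE"   # export: sqoop the report back to MySQL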

ODS layer:

#!/bin/bash

# ********************************************************************************
# Program name:    online_tab_user_order
# Description:     full sqoop extract of the MySQL table online_tab_user_order into Hive
# Input params:
#
# Input resources:
# Output resources:
#
# Intermediate resources:
# Author:          csq
# Created:
# Version notes:
# Modified by:
# Modified on:
# Reason:
# Version notes:
#
# ********************************************************************************


VC_DBLIST='10.68.21.92,3306,hue,"xxx",user_online'
VC_DBLIST1='10.68.21.92,3306,hue,"xxx",user_online_other'

HIVE_SERVER='10.68.25.198:10000'
export HADOOP_USER_NAME=hue   # your Hue page account (i.e. the Hadoop cluster account)
dblist=${VC_DBLIST}
dbhost=`echo "${dblist}" |awk -F, '{print $1}'`
dbport=`echo "${dblist}" |awk -F, '{print $2}'`
dbuser=`echo "${dblist}" |awk -F, '{print $3}'`
dbpw=`echo "${dblist}" |awk -F, '{print $4}'`
dbname=`echo "${dblist}" |awk -F, '{print $5}'`

if [ $# -eq 0 ];
  then
    p_in_time_str=`date -d today +'%Y-%m-%d'' 00:00:00'`
    p_in_time_end=$p_in_time_str
    
  elif [ $# -eq 1 ];
    then
      p_in_time_str=$1
      p_in_time_end=$1 
  elif [ $# -eq 2 ];
    then 
      p_in_time_str=$1
      p_in_time_end=$2
  else 
    p_in_time_str=$1
    p_in_time_end=$2
fi
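The argument handling above is shared by every script in this pipeline: no arguments means "today", one argument is a single day, two arguments are a date range. Illustrative invocations, assuming the script is saved as ods_full.sh (a hypothetical name):

bash ods_full.sh                          # process today
bash ods_full.sh 2018-11-20               # process one day
bash ods_full.sh 2018-11-20 2018-11-25    # process a range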

vc_stat_st=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`
vc_stat_start=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`
vc_stat_ed=`date -d "$p_in_time_end next day" +'%Y-%m-%d'' 00:00:00'`

vi_load_st=`date -d "$p_in_time_str" +'%Y%m%d'`
vi_load_ed=`date -d "$p_in_time_end" +'%Y%m%d'`
vc_load_st=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`

vi_stat_st=`date -d "$p_in_time_str 1 day ago" +'%Y-%m-%d'' 00:00:00'`
vi_stat_ed=`date -d "$p_in_time_end" +'%Y-%m-%d'' 00:00:00'`
vi_stat=`date -d "$p_in_time_str  1 day ago" +'%Y%m%d'`

vi_part_drop=`date -d "$p_in_time_str 1080 day ago" +'%Y%m%d'`

echo $p_in_time_str","$p_in_time_end","$vc_stat_st","$vc_stat_ed","$vi_stat_st","$vi_stat_ed

sqoop_time=`date -d today +'%Y-%m-%d'`
qt=`date -d "2 days ago" +'%Y-%m-%d'' 24:00:00'`
ye=`date -d yesterday +'%Y-%m-%d'' 24:00:00'`

{ # Create the table first. This is optional -- sqoop can map the MySQL schema to
  # Hive by itself -- but pre-creating the table lets you choose the column types.
    beeline -u jdbc:hive2://${HIVE_SERVER} -n ${HADOOP_USER_NAME}  -e "
    drop table online_ods.online_all_tab_user_order;
    create table online_ods.online_all_tab_user_order(
     order_id string,
  brand_name string,
  channel_name string,
  county_name string,
  create_date string,
  customer_type_name string,
  customer_type_name_lv1 string,
  des_branch_name string,
  name_city string,
  name_province string,
  price string,
  product_mode_name string,
  product_name string,
  product_spec_name string,
  product_type_name string,
  quantity string,
  report_date string,
  salesmoney string,
  shop_id string,
  shop_name string,
  shopper_addr string,
  shopper_name string,
  shopper_phone string,
  subcompany_name string,
  user_id string,
  coupons1 string,
  coupons2 string,
  coupons3 string,
  confirm_date string,
  work_create_date string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' LINES TERMINATED BY '\n' STORED AS TEXTFILE;"
} && { # The MySQL side is sharded into ten tables, so each shard is pulled in a loop.
for table in tab_user_order_0 tab_user_order_1 tab_user_order_2 tab_user_order_3 tab_user_order_4 tab_user_order_5 tab_user_order_6 tab_user_order_7 tab_user_order_8 tab_user_order_9
do
{
# Notes: a "broken" command usually means more than one space between the -- options.
# If the table appears under the warehouse directory but not in the Hue page, you are
# probably missing --hive-import. The \$CONDITIONS keyword must never be omitted.
# --target-dir: sqoop first lands the data in HDFS and then loads it into Hive; this
# is that temporary HDFS staging location. If you point it at the warehouse directory
# itself, you will find that only the last shard table's data survives.
sudo -u hdfs sqoop import \
  --connect jdbc:mysql://${dbhost}:${dbport}/${dbname} \
  --username ${dbuser} --password ${dbpw} \
  --hive-drop-import-delims \
  --lines-terminated-by '\n' --fields-terminated-by '\001' \
  --null-string '\\N' --null-non-string '\\N' \
  --hive-import --num-mappers 1 \
  --query "SELECT * FROM $table WHERE \$CONDITIONS" \
  --target-dir /tmp/hive-root/ --delete-target-dir \
  --hive-table online_ods.online_all_tab_user_order


time=`date "+%Y-%m-%d %H:%M:%S"`
echo $table $time "is done"
echo "--------------------------finish----------------------------------"
}

done
}
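After the full load, it is worth checking that all ten shards actually arrived before building anything on top; a minimal sanity check, reusing the connection variables above:

# Compare this count against the sum of the ten MySQL shard counts.
beeline -u jdbc:hive2://${HIVE_SERVER} -n ${HADOOP_USER_NAME} \
  -e "SELECT count(*) FROM online_ods.online_all_tab_user_order;"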

# Incremental extraction

#!/bin/bash

# ********************************************************************************
# Program name:    online_tab_user_order
# Description:     incremental sqoop extract of the MySQL table online_tab_user_order into Hive
# Input params:
#
# Input resources:
# Output resources:
#
# Intermediate resources:
# Author:          csq
# Created:
# Version notes:
# Modified by:
# Modified on:
# Reason:
# Version notes:
#
# ********************************************************************************
# Incremental extract of the MySQL table online_tab_user_order into Hive via sqoop

VC_DBLIST='10.68.21.92,3306,hue,"xxx",user_online'
VC_DBLIST1='10.68.21.92,3306,hue,"xxxx",user_online_other'
export HADOOP_USER_NAME=
HIVE_SERVER='10.68.25.198:10000'

dblist=${VC_DBLIST}
dbhost=`echo "${dblist}" |awk -F, '{print $1}'`
dbport=`echo "${dblist}" |awk -F, '{print $2}'`
dbuser=`echo "${dblist}" |awk -F, '{print $3}'`
dbpw=`echo "${dblist}" |awk -F, '{print $4}'`
dbname=`echo "${dblist}" |awk -F, '{print $5}'`

if [ $# -eq 0 ];
  then
    p_in_time_str=`date -d today +'%Y-%m-%d'' 00:00:00'`
    p_in_time_end=$p_in_time_str
    
  elif [ $# -eq 1 ];
    then
      p_in_time_str=$1
      p_in_time_end=$1 
  elif [ $# -eq 2 ];
    then 
      p_in_time_str=$1
      p_in_time_end=$2
  else 
    p_in_time_str=$1
    p_in_time_end=$2
fi

vc_stat_st=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`
vc_stat_start=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`
vc_stat_ed=`date -d "$p_in_time_end next day" +'%Y-%m-%d'' 00:00:00'`

vi_load_st=`date -d "$p_in_time_str" +'%Y%m%d'`
vi_load_ed=`date -d "$p_in_time_end" +'%Y%m%d'`
vc_load_st=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`

vi_stat_st=`date -d "$p_in_time_str 1 day ago" +'%Y-%m-%d'' 00:00:00'`
vi_stat_ed=`date -d "$p_in_time_end" +'%Y-%m-%d'' 00:00:00'`
vi_stat=`date -d "$p_in_time_str  1 day ago" +'%Y%m%d'`

vi_part_drop=`date -d "$p_in_time_str 1080 day ago" +'%Y%m%d'`

echo $p_in_time_str","$p_in_time_end","$vc_stat_st","$vc_stat_ed","$vi_stat_st","$vi_stat_ed

sqoop_time=`date -d today +'%Y-%m-%d'`
qt=`date -d "2 days ago" +'%Y-%m-%d'' 24:00:00'`
ye=`date -d yesterday +'%Y-%m-%d'' 24:00:00'`

{
    beeline -u jdbc:hive2://${HIVE_SERVER} -n ${HADOOP_USER_NAME}  -e "
    drop table online_ods.online_tab_user_order; -- drop first: a fresh increment is pulled every day
    create table online_ods.online_tab_user_order(
     order_id string,
  brand_name string,
  channel_name string,
  county_name string,
  create_date string,
  customer_type_name string,
  customer_type_name_lv1 string,
  des_branch_name string,
  name_city string,
  name_province string,
  price string,
  product_mode_name string,
  product_name string,
  product_spec_name string,
  product_type_name string,
  quantity string,
  report_date string,
  salesmoney string,
  shop_id string,
  shop_name string,
  shopper_addr string,
  shopper_name string,
  shopper_phone string,
  subcompany_name string,
  user_id string,
  coupons1 string,
  coupons2 string,
  coupons3 string,
  confirm_date string,
  work_create_date string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' LINES TERMINATED BY '\n' STORED AS TEXTFILE;"
} && {
for table in tab_user_order_0 tab_user_order_1 tab_user_order_2 tab_user_order_3 tab_user_order_4 tab_user_order_5 tab_user_order_6 tab_user_order_7 tab_user_order_8 tab_user_order_9
do
{
sudo -u hdfs sqoop import \
  --connect jdbc:mysql://${dbhost}:${dbport}/${dbname} \
  --username ${dbuser} --password ${dbpw} \
  --hive-drop-import-delims \
  --lines-terminated-by '\n' --fields-terminated-by '\001' \
  --null-string '\\N' --null-non-string '\\N' \
  --hive-import --num-mappers 1 \
  --query "SELECT * FROM $table WHERE work_create_date > '${qt}' AND work_create_date <= '${ye}' AND \$CONDITIONS" \
  --target-dir /tmp/hive-root/ --delete-target-dir \
  --hive-table online_ods.online_tab_user_order


time=`date "+%Y-%m-%d %H:%M:%S"`
echo $table $time "is done"
echo "--------------------------finish----------------------------------"
}

done
} && {
    beeline -u jdbc:hive2://${HIVE_SERVER} -n ${HADOOP_USER_NAME}  -e "
        set hive.exec.dynamic.partition=true;  
        set hive.exec.dynamic.partition.mode=nonstrict;
        set hive.optimize.sort.dynamic.partition=false;
        set hive.exec.max.dynamic.partitions.pernode=1000;
        set hive.exec.max.dynamic.partitions=10000;            
INSERT into TABLE online_ods.online_all_tab_user_order
SELECT  
  order_id ,
  brand_name ,
  channel_name ,
  county_name ,
  create_date ,
  customer_type_name ,
  customer_type_name_lv1 ,
  des_branch_name ,
  name_city ,
  name_province ,
  price ,
  product_mode_name ,
  product_name ,
  product_spec_name ,
  product_type_name ,
  quantity ,
  report_date ,
  salesmoney ,
  shop_id ,
  shop_name ,
  shopper_addr ,
  shopper_name ,
  shopper_phone ,
  subcompany_name ,
  user_id ,
  coupons1 ,
  coupons2 ,
  coupons3 ,
  confirm_date ,
  work_create_date 
FROM online_ods.online_tab_user_order;"
}
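Here the increment is cut by hand, with work_create_date bounded by ${qt} and ${ye}. sqoop also has a built-in incremental mode that tracks the high-water mark itself; a sketch of that alternative for a single shard (parameters illustrative, not what this pipeline uses):

# Illustrative only: sqoop's append mode picks up new rows by --check-column.
# In practice --last-value comes from a saved sqoop job, not a hard-coded literal.
sudo -u hdfs sqoop import \
  --connect jdbc:mysql://${dbhost}:${dbport}/${dbname} \
  --username ${dbuser} --password ${dbpw} --num-mappers 1 \
  --table tab_user_order_0 \
  --incremental append --check-column work_create_date --last-value "${qt}" \
  --target-dir /tmp/hive-root/tab_user_order_0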

DWD layer:

#!/bin/bash

# ********************************************************************************
# Program name:    intent users - promotional-campaign user count
# Description:     in the Tlink user-asset management application, the number of
#                  intent users proactively added by sales reps to date
#                  (source: user data center)
# Input params:
#
# Input resources:
# Output resources:
#
# Intermediate resources:
# Author:          csq
# Created:
# Version notes:
# Modified by:
# Modified on:
# Reason:
# Version notes:
#
# ********************************************************************************
# Builds the DWD-layer result table in Hive from the ODS tables

VC_DBLIST='10.68.25.201,3306,admin,"123$#sadY23",user_online'
VC_DBLIST1='10.68.25.201,3306,admin,"123$#sadY23",user_online_other'
HIVE_SERVER='10.68.25.198:10000'
export HADOOP_USER_NAME=chensiqing

dblist=${VC_DBLIST}
dbhost=`echo "${dblist}" |awk -F, '{print $1}'`
dbport=`echo "${dblist}" |awk -F, '{print $2}'`
dbuser=`echo "${dblist}" |awk -F, '{print $3}'`
dbpw=`echo "${dblist}" |awk -F, '{print $4}'`
dbname=`echo "${dblist}" |awk -F, '{print $5}'`

if [ $# -eq 0 ];
  then
    p_in_time_str=`date -d today +'%Y-%m-%d'' 00:00:00'`
    p_in_time_end=$p_in_time_str
    
  elif [ $# -eq 1 ];
    then
      p_in_time_str=$1
      p_in_time_end=$1 
  elif [ $# -eq 2 ];
    then 
      p_in_time_str=$1
      p_in_time_end=$2
  else 
    p_in_time_str=$1
    p_in_time_end=$2
fi

vc_stat_st=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`
vc_stat_start=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`
vc_stat_ed=`date -d "$p_in_time_end next day" +'%Y-%m-%d'' 00:00:00'`

vi_load_st=`date -d "$p_in_time_str" +'%Y%m%d'`
vi_load_ed=`date -d "$p_in_time_end" +'%Y%m%d'`
vc_load_st=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`

vi_stat_st=`date -d "$p_in_time_str 1 day ago" +'%Y-%m-%d'' 00:00:00'`
vi_stat_ed=`date -d "$p_in_time_end" +'%Y-%m-%d'' 00:00:00'`
vi_stat=`date -d "$p_in_time_str  1 day ago" +'%Y%m%d'`

vi_part_drop=`date -d "$p_in_time_str 1080 day ago" +'%Y%m%d'`

echo $p_in_time_str","$p_in_time_end","$vc_stat_st","$vc_stat_ed","$vi_stat_st","$vi_stat_ed

createDate=`date -d today +'%Y-%m-%d'`
qt=`date -d "2 days ago" +'%Y-%m-%d'' 24:00:00'`
ye=`date -d yesterday +'%Y-%m-%d'' 24:00:00'`

{
    beeline -u jdbc:hive2://${HIVE_SERVER} -n ${HADOOP_USER_NAME}  -e "
    drop table online_dw.actionUserResult_tab;
create TABLE online_dw.actionUserResult_tab(
 subcompany_name string,
 number int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' LINES TERMINATED BY '\n' STORED AS TEXTFILE;

INSERT into TABLE online_dw.actionUserResult_tab 
select w.sn,count(w.intention_id)  from
(
SELECT   distinct s.subcompany_name sn,s.user_id ,s.user_name,ia.intention_id
FROM online_ods.online_all_crm_user_shop s
left JOIN 
        (
            select i.user_id,i.intention_id FROM online_ods.online_all_tab_user_intention i
            where i.intention_source ='2' and i.user_stage in ('0','2')
        ) ia on ia.user_id=s.user_id
WHERE s.subcompany_name LIKE '%分公司%'

) w
group by w.sn;"

time=`date "+%Y-%m-%d %H:%M:%S"`
echo $time "table online_dw.actionUserResult_tab drop/create/insert is done"
echo "--------------------------finish----------------------------------"
}
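Before wiring this table into the report, a quick illustrative check that the per-subcompany counts look plausible:

beeline -u jdbc:hive2://${HIVE_SERVER} -n ${HADOOP_USER_NAME} \
  -e "SELECT * FROM online_dw.actionUserResult_tab ORDER BY number DESC LIMIT 10;"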

# Combine all the result tables into the final report

#!/bin/bash

# ********************************************************************************
# Program name:    repeat-purchase users - cumulative purchases
# Description:     number of users with two or more orders to date
# Input params:
#
# Input resources:
# Output resources:
#
# Intermediate resources:
# Author:          csq
# Created:
# Version notes:
# Modified by:
# Modified on:
# Reason:
# Version notes:
#
# ********************************************************************************
# Merges the DW result tables into the DM-layer report table in Hive

VC_DBLIST='10.68.25.201,3306,admin,"XXXX",user_online'
VC_DBLIST1='10.68.25.201,3306,admin,"XXXXX",user_online_other'
HIVE_SERVER='10.68.25.198:10000'
export HADOOP_USER_NAME=

dblist=${VC_DBLIST}
dbhost=`echo "${dblist}" |awk -F, '{print $1}'`
dbport=`echo "${dblist}" |awk -F, '{print $2}'`
dbuser=`echo "${dblist}" |awk -F, '{print $3}'`
dbpw=`echo "${dblist}" |awk -F, '{print $4}'`
dbname=`echo "${dblist}" |awk -F, '{print $5}'`

if [ $# -eq 0 ];
  then
    p_in_time_str=`date -d today +'%Y-%m-%d'' 00:00:00'`
    p_in_time_end=$p_in_time_str
    
  elif [ $# -eq 1 ];
    then
      p_in_time_str=$1
      p_in_time_end=$1 
  elif [ $# -eq 2 ];
    then 
      p_in_time_str=$1
      p_in_time_end=$2
  else 
    p_in_time_str=$1
    p_in_time_end=$2
fi

vc_stat_st=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`
vc_stat_start=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`
vc_stat_ed=`date -d "$p_in_time_end next day" +'%Y-%m-%d'' 00:00:00'`

vi_load_st=`date -d "$p_in_time_str" +'%Y%m%d'`
vi_load_ed=`date -d "$p_in_time_end" +'%Y%m%d'`
vc_load_st=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`

vi_stat_st=`date -d "$p_in_time_str 1 day ago" +'%Y-%m-%d'' 00:00:00'`
vi_stat_ed=`date -d "$p_in_time_end" +'%Y-%m-%d'' 00:00:00'`
vi_stat=`date -d "$p_in_time_str  1 day ago" +'%Y%m%d'`

vi_part_drop=`date -d "$p_in_time_str 1080 day ago" +'%Y%m%d'`

echo $p_in_time_str","$p_in_time_end","$vc_stat_st","$vc_stat_ed","$vi_stat_st","$vi_stat_ed

createDate=`date -d today +'%Y-%m-%d'`
qt=`date -d "2 days ago" +'%Y-%m-%d'' 24:00:00'`
ye=`date -d yesterday +'%Y-%m-%d'' 24:00:00'`

{
    beeline -u jdbc:hive2://${HIVE_SERVER} -n ${HADOOP_USER_NAME}  -e "
    drop table online_dm.Report;
create TABLE online_dm.Report(
 subcompany_name string,
 sum_add_user int,
 sum_week_add_user int,
 sum_action_add_user int,
 sum_week_action_add_user int,
 sum_ordered_add_user int,
 sum_week_ordered_add_user int,
 sum_transform_add_user int,
 sum_week_transform_add_user int,
 sum_old_order_add_user int,
 sum_week_old_order_user int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' LINES TERMINATED BY '\n' STORED AS TEXTFILE;


-- Join every result table on a.subcompany_name. Chaining the key through the
-- previous left join (b on a, c on b, ...) would propagate NULL keys and drop
-- rows as soon as any one table misses a subcompany.
INSERT into TABLE online_dm.Report
SELECT a.subcompany_name,a.number,b.number,c.number,d.number,e.number,f.number,g.number,h.number,i.number,j.number
from 
online_dw.adduserresult_tab a
left join 
online_dw.addweekuserresult_tab b on a.subcompany_name=b.subcompany_name
left join 
 online_dw.actionuserresult_tab c on a.subcompany_name=c.subcompany_name
left join 
 online_dw.actionweekuserresult_tab d on a.subcompany_name=d.subcompany_name
left join 
 online_dw.orderedadduserresult_tab e on a.subcompany_name=e.subcompany_name
left join 
 online_dw.orderedaddweekuserresult_tab f on a.subcompany_name=f.subcompany_name
left join 
online_dw.transformadduserresult_tab g on a.subcompany_name=g.subcompany_name
left join 
online_dw.transformweekadduserresult_tab h on a.subcompany_name=h.subcompany_name
left join 
online_dw.oldorderedthantwo i on a.subcompany_name=i.subcompany_name
left join 
online_dw.oldweekorderedthantwo j on a.subcompany_name=j.subcompany_name ;"

time=`date "+%Y-%m-%d %H:%M:%S"`
echo $time "online_dm.Report drop/create/insert is done"
echo "--------------------------finish----------------------------------"
}
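Because the report is assembled with left joins, any subcompany missing from one of the DW tables shows up as NULL columns; a quick illustrative check after the load:

beeline -u jdbc:hive2://${HIVE_SERVER} -n ${HADOOP_USER_NAME} \
  -e "SELECT count(*) FROM online_dm.Report WHERE sum_week_old_order_user IS NULL;"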

DM layer (export the report to MySQL):

#!/bin/bash

# ********************************************************************************
# Program name:    export the report to MySQL
# Description:     sqoop-export the DM-layer report table from Hive to MySQL
# Input params:
#
# Input resources:
# Output resources:
#
# Intermediate resources:
# Author:          csq
# Created:
# Version notes:
# Modified by:
# Modified on:
# Reason:
# Version notes:
#
# ********************************************************************************


VC_DBLIST='10.68.25.201,3306,admin,"xxxx",user_online'
VC_DBLIST1='10.68.25.201,3306,admin,"xxxxx",user_online_other'
HIVE_SERVER='10.68.25.198:10000'

export HADOOP_USER_NAME=

dblist=${VC_DBLIST}
dbhost=`echo "${dblist}" |awk -F, '{print $1}'`
dbport=`echo "${dblist}" |awk -F, '{print $2}'`
dbuser=`echo "${dblist}" |awk -F, '{print $3}'`
dbpw=`echo "${dblist}" |awk -F, '{print $4}'`
dbname=`echo "${dblist}" |awk -F, '{print $5}'`

if [ $# -eq 0 ];
  then
    p_in_time_str=`date -d today +'%Y-%m-%d'' 00:00:00'`
    p_in_time_end=$p_in_time_str
    
  elif [ $# -eq 1 ];
    then
      p_in_time_str=$1
      p_in_time_end=$1 
  elif [ $# -eq 2 ];  
    then 
      p_in_time_str=$1
      p_in_time_end=$2
  else 
    p_in_time_str=$1
    p_in_time_end=$2
fi

vc_stat_st=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`
vc_stat_start=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`
vc_stat_ed=`date -d "$p_in_time_end next day" +'%Y-%m-%d'' 00:00:00'`

vi_load_st=`date -d "$p_in_time_str" +'%Y%m%d'`
vi_load_ed=`date -d "$p_in_time_end" +'%Y%m%d'`
vc_load_st=`date -d "$p_in_time_str" +'%Y-%m-%d'' 00:00:00'`

vi_stat_st=`date -d "$p_in_time_str 1 day ago" +'%Y-%m-%d'' 00:00:00'`
vi_stat_ed=`date -d "$p_in_time_end" +'%Y-%m-%d'' 00:00:00'`
vi_stat=`date -d "$p_in_time_str  1 day ago" +'%Y%m%d'`

vi_part_drop=`date -d "$p_in_time_str 1080 day ago" +'%Y%m%d'`

echo $p_in_time_str","$p_in_time_end","$vc_stat_st","$vc_stat_ed","$vi_stat_st","$vi_stat_ed

sqoop_time=`date -d today +'%Y-%m-%d'`
qt=`date -d "2 days ago" +'%Y-%m-%d'' 24:00:00'`
ye=`date -d yesterday +'%Y-%m-%d'' 24:00:00'`

{
# Export the generated report to MySQL with sqoop. Append
# ?useUnicode=true&characterEncoding=utf-8 to the JDBC URL, or Chinese text comes
# out garbled, and create the target table in MySQL beforehand. Also TRUNCATE the
# MySQL table before every export -- sqoop export appends, it cannot overwrite.
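# Hypothetical truncate step (not in the original script): clear the target table
# so repeated runs do not accumulate duplicate rows. Assumes the mysql client is
# installed on this host.
mysql -h ${dbhost} -P ${dbport} -u ${dbuser} -p${dbpw} ${dbname} -e "TRUNCATE TABLE report;"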
sudo -u hdfs sqoop export \
  --connect "jdbc:mysql://${dbhost}:${dbport}/${dbname}?useUnicode=true&characterEncoding=utf-8" \
  --username ${dbuser} --password ${dbpw} --num-mappers 1 \
  --export-dir /user/hive/warehouse/online_dm.db/report --table report \
  --input-fields-terminated-by '\001' \
  --input-null-string '\\N' --input-null-non-string '\\N'
time=`date "+%Y-%m-%d %H:%M:%S"`
echo " report at " $time "is done"
echo "--------------------------finish----------------------------------"

}
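To run the whole chain every day, the stage scripts can be driven from cron; an illustrative crontab entry (paths and script names are placeholders):

# Daily at 02:00: ODS increment, DWD build, DM report, MySQL export; a failed
# stage short-circuits the rest.
0 2 * * * cd /opt/etl && bash ods_incr.sh && bash dwd_build.sh && bash dm_report.sh && bash dm_export.sh >> /var/log/etl/report.log 2>&1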
