Business data collection of big data projects (2)

Business data collection

Platform model building

Insert picture description here

1. Business collection

Business data: the data generated by the enterprise's core business is stored in a MySQL database, and
that MySQL data needs to be collected into HDFS.

plan selection

1. Data transmission: sqoop

Advantage:

  • 1. Sqoop fits this business scenario: it is designed for transferring data in both directions between an RDBMS and HDFS
  • 2. Batch-processing scenario: this is a non-real-time project in which each day's data is imported the following day, so streaming is not needed. Sqoop imports data into HDFS quickly by launching a map-only MapReduce job.
  • 3. Open source and free

2. Data import method

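1. Full load
Store a complete copy of the data every day; suitable for tables that are small and whose rows are updated or modified.
2. Incremental load
Store only each day's newly added data; suitable for large tables that only ever receive inserts.
3. New and changed
Store each day's new and changed rows, i.e. the rows whose creation time or operation time falls on the current day.
4. Special strategy
Special dimension tables (for example objective-world dimensions such as the date dimension and the region dimension).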

Insert picture description here

2. Data import

Question 1: Null value storage problem

Hive stores NULL as the literal "\N" in its underlying files, whereas MySQL stores NULL as a true NULL. To keep the data consistent on both sides, use the two parameters --input-null-string and --input-null-non-string when exporting data, and use --null-string and --null-non-string when importing data.

Import script:

#!/bin/bash

# 1. Resolve the import date: take the second command-line argument when it is
#    non-empty, otherwise default to yesterday's date formatted as YYYY-MM-DD.
datestr="${2:-$(date -d '-1 day' +%Y-%m-%d)}"

# Shared Sqoop import wrapper holding the options common to every table.
# Globals:
#   datestr (read) - date used as the last component of the HDFS target path
# Arguments:
#   $1 - free-form SQL query; must contain the literal $CONDITIONS token
#   $2 - table name, used as the directory under /gmall on HDFS
#   $3 - accepted but unused (callers pass a key column name)
# Returns:
#   1 when the query, the table name or datestr is empty; otherwise the exit
#   status of the sqoop command.
# NOTE(review): the database password is hard-coded and passed on the command
# line; consider sqoop's --password-file option instead.
import_data() {
  local query=$1
  local table=$2

  # --delete-target-dir is passed below, so refuse to build a target path with
  # an empty component instead of pointing sqoop at a parent directory.
  if [[ -z "$query" || -z "$table" || -z "$datestr" ]]; then
    echo "import_data: query, table name and datestr must all be non-empty" >&2
    return 1
  fi

  /opt/module/sqoop/bin/sqoop import \
    --connect jdbc:mysql://hadoop102:3306/gmall \
    --username root \
    --password root123 \
    --delete-target-dir \
    --num-mappers 1 \
    --query "$query" \
    --target-dir "hdfs://hadoop102:8020/gmall/${table}/${datestr}" \
    --compress \
    --compression-codec lzop \
    --null-string '\\N' \
    --null-non-string '\\N' \
    --fields-terminated-by ,
}
#--compress \
#--compression-codec lzop \
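# When compressing with lzop, an index must be generated afterwards. Format: hadoop jar <jar location> <fully-qualified class name> <location of the compressed files>
# hadoop jar /opt/module/hadoop/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer hdfs://hadoop102:8020/gmall/$2/$datestr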


# Full-snapshot imports

# Imports every row of base_dic (no date filter).
import_base_dic() {
  local sql="select * from base_dic where \$CONDITIONS"
  import_data "$sql" base_dic id
}

# Imports every row of base_trademark (no date filter).
import_base_trademark() {
  local sql="select * from base_trademark where \$CONDITIONS"
  import_data "$sql" base_trademark id
}

# Imports every row of base_category3 (no date filter).
import_base_category3() {
  local sql="select * from base_category3 where \$CONDITIONS"
  import_data "$sql" base_category3 id
}

# Imports every row of base_category2 (no date filter).
import_base_category2() {
  local sql="select * from base_category2 where \$CONDITIONS"
  import_data "$sql" base_category2 id
}

# Imports every row of base_category1 (no date filter).
import_base_category1() {
  local sql="select * from base_category1 where \$CONDITIONS"
  import_data "$sql" base_category1 id
}

# Imports every row of activity_rule (no date filter).
import_activity_rule() {
  local sql="select * from activity_rule where \$CONDITIONS"
  import_data "$sql" activity_rule id
}

# Imports the activity_info rows whose create_time date is on or before $datestr.
import_activity_info() {
  local sql="select * from activity_info where DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" activity_info id
}

# Imports the activity_sku rows whose create_time date is on or before $datestr.
import_activity_sku() {
  local sql="select * from activity_sku where DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" activity_sku id
}

# Imports every row of cart_info (no date filter).
import_cart_info() {
  local sql="select * from cart_info where \$CONDITIONS"
  import_data "$sql" cart_info id
}

# Imports the favor_info rows whose create_time date is on or before $datestr.
import_favor_info() {
  local sql="select * from favor_info where DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" favor_info id
}

# Imports the coupon_info rows whose create_time date is on or before $datestr.
import_coupon_info() {
  local sql="select * from coupon_info where DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" coupon_info id
}

# Imports every row of spu_info (no date filter).
import_spu_info() {
  local sql="select * from spu_info where \$CONDITIONS"
  import_data "$sql" spu_info id
}

# Imports the sku_info rows whose create_time date is on or before $datestr.
import_sku_info() {
  local sql="select * from sku_info where DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" sku_info id
}


# Incremental imports (newly added rows)

# Imports the order_refund_info rows whose create_time date equals $datestr.
import_order_refund_info() {
  local sql="select * from order_refund_info where DATE_FORMAT(create_time,'%Y-%m-%d')='$datestr' and \$CONDITIONS"
  import_data "$sql" order_refund_info id
}

# Imports the order_refund_info rows whose create_time date is on or before
# $datestr.
import_order_refund_info_all() {
  local sql="select * from order_refund_info where DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" order_refund_info id
}

# Imports the order_status_log rows whose operate_time date equals $datestr.
import_order_status_log() {
  local sql="select * from order_status_log where DATE_FORMAT(operate_time,'%Y-%m-%d')='$datestr' and \$CONDITIONS"
  import_data "$sql" order_status_log id
}

# Imports the order_status_log rows whose operate_time date is on or before
# $datestr.
import_order_status_log_all() {
  local sql="select * from order_status_log where DATE_FORMAT(operate_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" order_status_log id
}

# Imports the payment_info rows whose payment_time date equals $datestr.
import_payment_info() {
  local sql="select * from payment_info where DATE_FORMAT(payment_time,'%Y-%m-%d')='$datestr' and \$CONDITIONS"
  import_data "$sql" payment_info id
}

# Imports the payment_info rows whose payment_time date is on or before
# $datestr.
import_payment_info_all() {
  local sql="select * from payment_info where DATE_FORMAT(payment_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" payment_info id
}

# Imports the order_detail rows whose create_time date equals $datestr.
import_order_detail() {
  local sql="select * from order_detail where DATE_FORMAT(create_time,'%Y-%m-%d')='$datestr' and \$CONDITIONS"
  import_data "$sql" order_detail id
}

# Imports the order_detail rows whose create_time date is on or before
# $datestr.
import_order_detail_all() {
  local sql="select * from order_detail where DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" order_detail id
}

# Imports the activity_order rows whose create_time date equals $datestr.
import_activity_order() {
  local sql="select * from activity_order where DATE_FORMAT(create_time,'%Y-%m-%d')='$datestr' and \$CONDITIONS"
  import_data "$sql" activity_order id
}

# Imports the activity_order rows whose create_time date is on or before
# $datestr.
import_activity_order_all() {
  local sql="select * from activity_order where DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" activity_order id
}

# Imports the comment_info rows whose create_time date equals $datestr.
import_comment_info() {
  local sql="select * from comment_info where DATE_FORMAT(create_time,'%Y-%m-%d')='$datestr' and \$CONDITIONS"
  import_data "$sql" comment_info id
}

# Imports the comment_info rows whose create_time date is on or before
# $datestr.
import_comment_info_all() {
  local sql="select * from comment_info where DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' and \$CONDITIONS"
  import_data "$sql" comment_info id
}


# New-and-changed imports

# Imports the user_info rows whose create_time or operate_time date equals
# $datestr.
# The date predicates are parenthesised because SQL's AND binds tighter than
# OR; without the parentheses $CONDITIONS would only constrain the last
# predicate instead of the whole filter.
import_user_info() {
  local sql="select * from user_info where (DATE_FORMAT(create_time,'%Y-%m-%d')='$datestr' or DATE_FORMAT(operate_time,'%Y-%m-%d')='$datestr') and \$CONDITIONS"
  import_data "$sql" user_info id
}

# Imports the user_info rows whose create_time or operate_time date is on or
# before $datestr.
# The date predicates are parenthesised because SQL's AND binds tighter than
# OR; without the parentheses $CONDITIONS would only constrain the last
# predicate instead of the whole filter.
import_user_info_all() {
  local sql="select * from user_info where (DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' or DATE_FORMAT(operate_time,'%Y-%m-%d')<='$datestr') and \$CONDITIONS"
  import_data "$sql" user_info id
}

# Imports the coupon_use rows whose get_time, using_time, used_time or
# expire_time date equals $datestr.
# The date predicates are parenthesised because SQL's AND binds tighter than
# OR; without the parentheses $CONDITIONS would only constrain the last
# predicate instead of the whole filter.
import_coupon_use() {
  local sql="select * from coupon_use where (DATE_FORMAT(get_time,'%Y-%m-%d')='$datestr' or DATE_FORMAT(using_time,'%Y-%m-%d')='$datestr' or DATE_FORMAT(used_time,'%Y-%m-%d')='$datestr' or DATE_FORMAT(expire_time,'%Y-%m-%d')='$datestr') and \$CONDITIONS"
  import_data "$sql" coupon_use id
}

# Imports the coupon_use rows whose get_time, using_time, used_time or
# expire_time date is on or before $datestr.
# The date predicates are parenthesised because SQL's AND binds tighter than
# OR; without the parentheses $CONDITIONS would only constrain the last
# predicate instead of the whole filter.
# NOTE(review): this query previously filtered on operate_time where the daily
# import_coupon_use filters on using_time; it now uses using_time to match.
# Confirm against the coupon_use table schema.
import_coupon_use_all() {
  local sql="select * from coupon_use where (DATE_FORMAT(get_time,'%Y-%m-%d')<='$datestr' or DATE_FORMAT(using_time,'%Y-%m-%d')<='$datestr' or DATE_FORMAT(used_time,'%Y-%m-%d')<='$datestr' or DATE_FORMAT(expire_time,'%Y-%m-%d')<='$datestr') and \$CONDITIONS"
  import_data "$sql" coupon_use id
}

# Imports the order_info rows whose create_time, operate_time or expire_time
# date equals $datestr.
# The date predicates are parenthesised because SQL's AND binds tighter than
# OR; without the parentheses $CONDITIONS would only constrain the last
# predicate instead of the whole filter.
import_order_info() {
  local sql="select * from order_info where (DATE_FORMAT(create_time,'%Y-%m-%d')='$datestr' or DATE_FORMAT(operate_time,'%Y-%m-%d')='$datestr' or DATE_FORMAT(expire_time,'%Y-%m-%d')='$datestr') and \$CONDITIONS"
  import_data "$sql" order_info id
}

# Imports the order_info rows whose create_time, operate_time or expire_time
# date is on or before $datestr.
# The date predicates are parenthesised because SQL's AND binds tighter than
# OR; without the parentheses $CONDITIONS would only constrain the last
# predicate instead of the whole filter.
import_order_info_all() {
  local sql="select * from order_info where (DATE_FORMAT(create_time,'%Y-%m-%d')<='$datestr' or DATE_FORMAT(operate_time,'%Y-%m-%d')<='$datestr' or DATE_FORMAT(expire_time,'%Y-%m-%d')<='$datestr') and \$CONDITIONS"
  import_data "$sql" order_info id
}

# Special imports

# Imports every row of base_region (no date filter).
import_base_region() {
  local sql="select * from base_region where \$CONDITIONS"
  import_data "$sql" base_region id
}

# Imports every row of base_province (no date filter).
import_base_province() {
  local sql="select * from base_province where \$CONDITIONS"
  import_data "$sql" base_province id
}

# 2. Choose the import strategy from the first command-line argument:
#      first        - initial load: the full-snapshot tables, the *_all variants
#                     of the incremental and new-and-changed tables, and the
#                     special (province/region) tables
#      second       - every later load: the full-snapshot tables plus the
#                     per-day variants of the incremental and new-and-changed
#                     tables
#      <table name> - import that single table
#    Any other value prints an error on stderr and exits with status 1, so a
#    mistyped argument is not reported as a successful run.
case "$1" in
  first)
    # Full-snapshot tables.
    import_base_dic
    import_base_trademark
    import_base_category3
    import_base_category2
    import_base_category1
    import_activity_rule
    import_activity_info
    import_activity_sku
    import_cart_info
    import_favor_info
    import_coupon_info
    import_sku_info
    import_spu_info

    # Incremental tables, complete history.
    import_order_refund_info_all
    import_order_status_log_all
    import_payment_info_all
    import_order_detail_all
    import_activity_order_all
    import_comment_info_all

    # New-and-changed tables, complete history.
    import_coupon_use_all
    import_user_info_all
    import_order_info_all

    # Special tables.
    import_base_province
    import_base_region
    ;;

  second)
    # Full-snapshot tables.
    import_base_dic
    import_base_trademark
    import_base_category3
    import_base_category2
    import_base_category1
    import_activity_rule
    import_activity_info
    import_activity_sku
    import_cart_info
    import_favor_info
    import_coupon_info
    import_sku_info
    import_spu_info

    # Incremental tables, rows for $datestr only.
    import_order_refund_info
    import_order_status_log
    import_payment_info
    import_order_detail
    import_activity_order
    import_comment_info

    # New-and-changed tables, rows for $datestr only.
    import_coupon_use
    import_user_info
    import_order_info
    ;;

  ######################### Import by table name #########################
  # Full-snapshot tables: code dictionary, trademark, category levels 1-3,
  # activity rule, activity, activity SKU, cart, favorites, coupon, SKU, SPU.
  base_dic | base_trademark | base_category1 | base_category2 | base_category3 \
    | activity_rule | activity_info | activity_sku | cart_info | favor_info \
    | coupon_info | sku_info | spu_info)
    "import_$1"
    ;;

  # Incremental tables: order refund, order status log, payment, order detail,
  # activity-order relation, product comment.
  order_refund_info | order_status_log | payment_info | order_detail \
    | activity_order | comment_info)
    "import_$1"
    ;;

  # New-and-changed tables: coupon usage, user, order.
  coupon_use | user_info | order_info)
    "import_$1"
    ;;

  # Special tables: province, region.
  base_province | base_region)
    "import_$1"
    ;;

  *)
    echo "表名不存在或参数输入有误" >&2
    exit 1
    ;;
esac

Guess you like

Origin blog.csdn.net/qq_38705144/article/details/114059969