# Real-time data warehouse project: data collection and the ODS layer
Configure canal to collect MySQL data in real time
One, enable binlog in MySQL
- Modify the MySQL configuration file (Linux: /etc/my.cnf, Windows: my.ini)
log-bin=mysql-bin # enable binlog
binlog-format=ROW # use ROW mode
binlog-do-db=dwshow # dwshow is the name of the database
binlog-format can be set to statement, row, or mixed; the differences are:
mode | difference |
---|---|
statement | Records the SQL statements of write operations; saves space, but may cause data inconsistency |
row | Records the change of every row after each operation; takes up much more space |
mixed | Logs in statement mode by default and switches to row mode for statements that may cause inconsistency (e.g. those containing UUID() or UDF calls); in extreme cases inconsistency can still occur, and it is inconvenient for binlog monitoring |
- Restart MySQL and create the canal user
systemctl restart mysqld
# After MySQL restarts, check the directory below for mysql-bin.***** files
cd /var/lib/mysql
GRANT SELECT, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'canal'@'%' IDENTIFIED BY 'canal';
Two, install and configure canal to collect data into Kafka
- Download the canal installation package and extract it to the installation directory.
Download: https://github.com/alibaba/canal/releases
Copy the downloaded canal.deployer-1.1.4.tar.gz to the Linux machine and extract it (the path can be adjusted as needed)
[root@linux123 ~]# mkdir /opt/modules/canal
[root@linux123 mysql]# tar -zxf canal.deployer-1.1.4.tar.gz -C /opt/modules/canal
- Modify conf/canal.properties to configure the ZooKeeper and Kafka addresses
# ZooKeeper address
canal.zkServers = linux121:2181,linux123:2181
# tcp, kafka, RocketMQ
canal.serverMode = kafka
# Kafka address
canal.mq.servers = linux121:9092,linux123:9092
- Modify conf/example/instance.properties to configure the MySQL host, user name, password, and the target Kafka topic
# host where the MySQL database is located
canal.instance.master.address = linux123:3306
# username/password, the database user and password
canal.instance.dbUsername = canal
canal.instance.dbPassword = canal
# mq config, the corresponding Kafka topic:
canal.mq.topic=test
- Start canal
sh bin/startup.sh
- Stop canal
sh bin/stop.sh
Three, start a Kafka consumer to verify the collected data
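After updating a row in the dwshow database, a JSON message should arrive on the test topic; the kafka-console-consumer.sh script that ships with Kafka is enough for this check. Below is a minimal verification sketch in Scala, assuming the plain kafka-clients library (2.x or later) is on the classpath; the object name and group id are hypothetical, and the broker list and topic follow the configuration above.
package myUtils
import java.time.Duration
import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer
object VerifyCanalTopic {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.setProperty("bootstrap.servers", "linux121:9092,linux123:9092")
    props.setProperty("group.id", "canal-verify") // throwaway group id, only used for this check
    props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.setProperty("auto.offset.reset", "earliest")
    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Collections.singletonList("test"))
    // print every canal message; each value is a JSON string with database, table, type and data fields
    while (true) {
      val records = consumer.poll(Duration.ofMillis(1000))
      records.forEach(r => println(r.value()))
    }
  }
}
If the messages show up here, the same topic can be consumed by the Flink job in the next step.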
ODS layer: process the data and import it into HBase
1. Flink collects Kafka data
- Write a utility class that returns a Kafka consumer to be used as the Flink data source; you need to set the server addresses, the key and value deserializers, the consumer group id, and the offset from which consumption starts
package myUtils
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
class SourceKafka {
  // Build a FlinkKafkaConsumer for the given topic; values are read as plain strings
  def getKafkaSource(topicName: String): FlinkKafkaConsumer[String] = {
    val props = new Properties()
    props.setProperty("bootstrap.servers", "linux121:9092,linux122:9092,linux123:9092")
    props.setProperty("group.id", "consumer-group")
    props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    // start from the latest offset when there is no committed offset
    props.setProperty("auto.offset.reset", "latest")
    new FlinkKafkaConsumer[String](topicName, new SimpleStringSchema(), props)
  }
}
- Get data from Kafka and write it to HBase: read the JSON-formatted canal messages from Kafka and parse them with Alibaba's fastjson
package ods
import java.util
import com.alibaba.fastjson.{JSON, JSONObject}
import models.TableObject
import myUtils.SourceKafka
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.api.scala._
object KafkaToHbase {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val kafkaConsumer: FlinkKafkaConsumer[String] = new SourceKafka().getKafkaSource("test")
    kafkaConsumer.setStartFromLatest()
    val sourceStream: DataStream[String] = env.addSource(kafkaConsumer) // the implicit conversions from org.apache.flink.api.scala._ are needed here
    val mappedStream: DataStream[util.ArrayList[TableObject]] = sourceStream.map(x => {
      val jsonObj: JSONObject = JSON.parseObject(x)
      val database: AnyRef = jsonObj.get("database")
      val table: AnyRef = jsonObj.get("table")
      val typeInfo: AnyRef = jsonObj.get("type")
      val objects = new util.ArrayList[TableObject]()
      // every element of the "data" array becomes one TableObject
      jsonObj.getJSONArray("data").forEach(x => {
        println(database.toString + "...." + table.toString + ".." + typeInfo.toString + "..." + x.toString)
        objects.add(TableObject(database.toString, table.toString, typeInfo.toString, x.toString))
      })
      objects
    })
    mappedStream.addSink(new SinkHbase)
    env.execute()
  }
}
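The TableObject type comes from a models package that is not shown here. A minimal sketch, assuming it is a plain case class whose field names are inferred from the accesses in the sink below (x.dataBase, x.tableName, x.typeInfo, x.dataInfo); the real definition may differ.
package models
// hypothetical definition inferred from how the fields are used
case class TableObject(dataBase: String, tableName: String, typeInfo: String, dataInfo: String)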
- Write Flink's HBase sink. First, write the HBase connection helper:
package myUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory}
class ConnHBase {
  // Create an HBase connection; the ZooKeeper quorum carries the HBase cluster addresses
  def connToHabse: Connection = {
    val conf: Configuration = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "linux121,linux122,linux123")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 30000)
    conf.setInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 30000)
    val connection: Connection = ConnectionFactory.createConnection(conf)
    connection
  }
}
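The sink in the next step writes to the HBase tables lagou_trade_orders and lagou_area with column family f1, so those tables must already exist. A minimal sketch for creating them with the HBase Admin API, assuming an HBase 2.x client (the object name is hypothetical; with a 1.x client you would use HTableDescriptor and HColumnDescriptor instead):
package myUtils
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{Admin, ColumnFamilyDescriptorBuilder, TableDescriptorBuilder}
object CreateOdsTables {
  def main(args: Array[String]): Unit = {
    val connection = new ConnHBase().connToHabse
    val admin: Admin = connection.getAdmin
    // create each ODS table with a single column family "f1" if it does not exist yet
    Seq("lagou_trade_orders", "lagou_area").foreach { name =>
      val table = TableName.valueOf(name)
      if (!admin.tableExists(table)) {
        val descriptor = TableDescriptorBuilder.newBuilder(table)
          .setColumnFamily(ColumnFamilyDescriptorBuilder.of("f1"))
          .build()
        admin.createTable(descriptor)
      }
    }
    admin.close()
    connection.close()
  }
}
The same tables can of course be created from the hbase shell instead; only the table names and the f1 column family matter to the sink.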
Then extend RichSinkFunction to write the Flink-to-HBase sink:
package ods
import models.{AreaInfo, DataInfo, TableObject}
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
import java.util
import com.alibaba.fastjson.JSON
import myUtils.ConnHBase
import org.apache.flink.configuration.Configuration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{Connection, Delete, Put, Table}
class SinkHbase extends RichSinkFunction[util.ArrayList[TableObject]] {
  var connection: Connection = _
  var hbtable: Table = _
  override def invoke(value: util.ArrayList[TableObject], context: SinkFunction.Context[_]): Unit = {
    value.forEach(x => {
      println(x.toString)
      val database: String = x.dataBase
      val tableName: String = x.tableName
      val typeInfo: String = x.typeInfo
      hbtable = connection.getTable(TableName.valueOf(tableName))
      // trade order records: insert and update both upsert the row, delete removes it
      if (database.equalsIgnoreCase("dwshow") && tableName.equalsIgnoreCase("lagou_trade_orders")) {
        val info: DataInfo = JSON.parseObject(x.dataInfo, classOf[DataInfo])
        if (typeInfo.equalsIgnoreCase("insert")) {
          insertTradeOrders(hbtable, info)
        } else if (typeInfo.equalsIgnoreCase("update")) {
          insertTradeOrders(hbtable, info)
        } else if (typeInfo.equalsIgnoreCase("delete")) {
          deleteTradeOrders(hbtable, info)
        }
      }
      // area dimension records
      if (database.equalsIgnoreCase("dwshow") && tableName.equalsIgnoreCase("lagou_area")) {
        val info: AreaInfo = JSON.parseObject(x.dataInfo, classOf[AreaInfo])
        if (typeInfo.equalsIgnoreCase("insert")) {
          insertArea(hbtable, info)
        } else if (typeInfo.equalsIgnoreCase("update")) {
          insertArea(hbtable, info)
        } else if (typeInfo.equalsIgnoreCase("delete")) {
          deleteArea(hbtable, info)
        }
      }
    })
  }
  override def open(parameters: Configuration): Unit = {
    connection = new ConnHBase().connToHabse
  }
  override def close(): Unit = {
    if (hbtable != null) {
      hbtable.close()
    }
    if (connection != null) {
      connection.close()
    }
  }
  // write (or overwrite) one order row; the order id is used as the HBase row key
  def insertTradeOrders(hbTable: Table, dataInfo: DataInfo): Unit = {
    val put = new Put(dataInfo.orderId.getBytes)
    put.addColumn("f1".getBytes, "modifiedTime".getBytes, dataInfo.modifiedTime.getBytes)
    put.addColumn("f1".getBytes, "orderNo".getBytes, dataInfo.orderNo.getBytes)
    put.addColumn("f1".getBytes, "isPay".getBytes, dataInfo.isPay.getBytes)
    put.addColumn("f1".getBytes, "tradeSrc".getBytes, dataInfo.tradeSrc.getBytes)
    put.addColumn("f1".getBytes, "payTime".getBytes, dataInfo.payTime.getBytes)
    put.addColumn("f1".getBytes, "productMoney".getBytes, dataInfo.productMoney.getBytes)
    put.addColumn("f1".getBytes, "totalMoney".getBytes, dataInfo.totalMoney.getBytes)
    put.addColumn("f1".getBytes, "dataFlag".getBytes, dataInfo.dataFlag.getBytes)
    put.addColumn("f1".getBytes, "userId".getBytes, dataInfo.userId.getBytes)
    put.addColumn("f1".getBytes, "areaId".getBytes, dataInfo.areaId.getBytes)
    put.addColumn("f1".getBytes, "createTime".getBytes, dataInfo.createTime.getBytes)
    put.addColumn("f1".getBytes, "payMethod".getBytes, dataInfo.payMethod.getBytes)
    put.addColumn("f1".getBytes, "isRefund".getBytes, dataInfo.isRefund.getBytes)
    put.addColumn("f1".getBytes, "tradeType".getBytes, dataInfo.tradeType.getBytes)
    put.addColumn("f1".getBytes, "status".getBytes, dataInfo.status.getBytes)
    hbTable.put(put)
  }
  // delete one order row by row key
  def deleteTradeOrders(hbtable: Table, dataInfo: DataInfo): Unit = {
    val delete = new Delete(dataInfo.orderId.getBytes)
    hbtable.delete(delete)
  }
  // write (or overwrite) one area row; the area id is used as the HBase row key
  def insertArea(hbTable: Table, areaInfo: AreaInfo): Unit = {
    val put = new Put(areaInfo.id.getBytes)
    put.addColumn("f1".getBytes, "name".getBytes, areaInfo.name.getBytes)
    put.addColumn("f1".getBytes, "pid".getBytes, areaInfo.pid.getBytes)
    put.addColumn("f1".getBytes, "sname".getBytes, areaInfo.sname.getBytes)
    put.addColumn("f1".getBytes, "level".getBytes, areaInfo.level.getBytes)
    put.addColumn("f1".getBytes, "citycode".getBytes, areaInfo.citycode.getBytes)
    put.addColumn("f1".getBytes, "yzcode".getBytes, areaInfo.yzcode.getBytes)
    put.addColumn("f1".getBytes, "mername".getBytes, areaInfo.mername.getBytes)
    put.addColumn("f1".getBytes, "Lng".getBytes, areaInfo.Lng.getBytes)
    put.addColumn("f1".getBytes, "Lat".getBytes, areaInfo.Lat.getBytes)
    put.addColumn("f1".getBytes, "pinyin".getBytes, areaInfo.pinyin.getBytes)
    hbTable.put(put)
  }
  // delete one area row by row key
  def deleteArea(hbTable: Table, areaInfo: AreaInfo): Unit = {
    val delete = new Delete(areaInfo.id.getBytes)
    hbTable.delete(delete)
  }
}
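The DataInfo and AreaInfo types also live in the models package and are not shown above. A minimal sketch, assuming plain case classes with String fields named after the columns written by the sink; the real project may use regular classes with JavaBean-style getters/setters so that fastjson can populate them, and the field order and types may differ.
package models
// hypothetical definitions; field names are taken from the put.addColumn calls above
case class DataInfo(orderId: String, modifiedTime: String, orderNo: String, isPay: String,
                    tradeSrc: String, payTime: String, productMoney: String, totalMoney: String,
                    dataFlag: String, userId: String, areaId: String, createTime: String,
                    payMethod: String, isRefund: String, tradeType: String, status: String)
case class AreaInfo(id: String, name: String, pid: String, sname: String, level: String,
                    citycode: String, yzcode: String, mername: String, Lng: String, Lat: String,
                    pinyin: String)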