Real-time data warehouse project (2): Building the DIM-layer regional dimension table

Building the DIM-layer regional dimension table

1. Analysis of the approach

  1. Import the lagou_area table from MySQL into the lagou_area table in HBase
  2. Build the data source and read the contents of lagou_area from HBase (the source and the sink below both reuse a small ConnHBase connection helper, sketched after this list)
  3. Use Flink operator transformations to get a Flink Table
  4. Write the SQL
  5. Convert the table back into a data stream
  6. Use a custom sink to write the data to HBase
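
Both the custom HBaseReader and HBaseWriter below get their HBase Connection from a small ConnHBase helper in the myUtils package, which this post does not show. A minimal sketch of such a helper, assuming the standard HBase client API and with placeholder ZooKeeper hosts that you would replace with your own cluster addresses:

package myUtils

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory}

// Sketch only: the real helper is not shown in the original post.
class ConnHBase {
  def connToHabse: Connection = {
    val conf: Configuration = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "zk-host1,zk-host2,zk-host3") // placeholder hosts
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    ConnectionFactory.createConnection(conf)
  }
}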

2. Coding implementation

  1. Create the case classes
package dim

// Target dimension record: the area, its city, and its province
case class DimArea(
                    areaId: Int,
                    aname: String,
                    cid: Int,
                    city: String,
                    proId: Int,
                    province: String
                  )

// Raw record from lagou_area: id, name, and parent id (pid)
case class AreaDetail(id: Int, name: String, pid: Int)

  2. Custom HBaseReader to read the lagou_area data
package dim

import java.util

import myUtils.ConnHBase
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}
import org.apache.hadoop.hbase.{Cell, TableName}
import org.apache.hadoop.hbase.client.{Connection, Result, ResultScanner, Scan, Table}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.JavaConverters._

// Question from the original post: will this read the HBase data incrementally?
// No: run() performs a single full scan and then returns, so the source reads
// the table once and finishes; rows written later are not picked up.
class HBaseReader extends RichSourceFunction[(String, String)] {

  private var conn: Connection = _
  private var table: Table = _
  private var scan: Scan = _


  override def run(sourceContext: SourceFunction.SourceContext[(String, String)]): Unit = {
    val rs: ResultScanner = table.getScanner(scan)
    val iterator: util.Iterator[Result] = rs.iterator()

    while (iterator.hasNext) {
      val result: Result = iterator.next()
      val rowKey: String = Bytes.toString(result.getRow)
      // Join all cell values of the row into one "-"-separated string
      val buffer = new StringBuffer()
      for (cell: Cell <- result.listCells().asScala) {
        val value: String = Bytes.toString(cell.getValueArray, cell.getValueOffset, cell.getValueLength)
        buffer.append(value).append("-")
      }
      // Drop the trailing "-" and emit (rowKey, joined values)
      val valueString: String = buffer.replace(buffer.length() - 1, buffer.length(), "").toString
      sourceContext.collect((rowKey, valueString))
    }
  }

  override def cancel(): Unit = {
  }

  override def open(parameters: Configuration): Unit = {
    conn = new ConnHBase().connToHabse
    table = conn.getTable(TableName.valueOf("lagou_area"))
    scan = new Scan()
    scan.addFamily("f1".getBytes())
  }

  override def close(): Unit = {
    try {
      if (table != null) {
        table.close()
      }
      if (conn != null) {
        conn.close()
      }
    } catch {
      case e: Exception => println(e.getMessage)
    }
  }
}

  3. Custom HBaseWriter to write the dim_lagou_area data
package dim

import myUtils.ConnHBase
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{Connection, Put, Table}

class HBaseWriter extends RichSinkFunction[DimArea] {

  var connection: Connection = _
  var hTable: Table = _

  override def open(parameters: Configuration): Unit = {
    connection = new ConnHBase().connToHabse
    hTable = connection.getTable(TableName.valueOf("dim_lagou_area"))
  }

  override def close(): Unit = {
    if (hTable != null) {
      hTable.close()
    }
    if (connection != null) {
      connection.close()
    }
  }

  override def invoke(value: DimArea, context: SinkFunction.Context[_]): Unit = {
    insertDimArea(hTable, value)
  }



  def insertDimArea(table: Table, value: DimArea): Unit = {
    val areaId: String = value.areaId.toString
    val aname: String = value.aname
    val cid: String = value.cid.toString
    val city: String = value.city
    val proId: String = value.proId.toString
    val province: String = value.province

    // The row key is the area id; all attributes go into column family f1
    val put = new Put(areaId.getBytes())
    put.addColumn("f1".getBytes(), "aname".getBytes(), aname.getBytes())
    put.addColumn("f1".getBytes(), "cid".getBytes(), cid.getBytes())
    put.addColumn("f1".getBytes(), "city".getBytes(), city.getBytes())
    put.addColumn("f1".getBytes(), "proId".getBytes(), proId.getBytes())
    put.addColumn("f1".getBytes(), "province".getBytes(), province.getBytes())
    table.put(put)
  }
}
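
The sink assumes that the target table dim_lagou_area with column family f1 already exists in HBase. If it does not, it can be created once up front, for example from the HBase shell or with a small one-off program like the sketch below (assuming the HBase 2.x client API; on 1.x the HTableDescriptor/HColumnDescriptor classes would be used instead):

package dim

import myUtils.ConnHBase
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{Admin, ColumnFamilyDescriptorBuilder, Connection, TableDescriptorBuilder}

// One-off helper (sketch): create dim_lagou_area with column family f1 if it is missing.
object CreateDimTable {
  def main(args: Array[String]): Unit = {
    val conn: Connection = new ConnHBase().connToHabse
    val admin: Admin = conn.getAdmin
    val tableName: TableName = TableName.valueOf("dim_lagou_area")
    if (!admin.tableExists(tableName)) {
      val desc = TableDescriptorBuilder.newBuilder(tableName)
        .setColumnFamily(ColumnFamilyDescriptorBuilder.of("f1"))
        .build()
      admin.createTable(desc)
    }
    admin.close()
    conn.close()
  }
}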

  4. Use the Flink Table API to process the data.
    Idea:
    1. Initialize the environment and get the StreamExecutionEnvironment
    2. Obtain and parse the stream data
    3. Create a temporary table based on the stream data
    4. Write the conversion SQL
    5. Convert the Flink table back into stream data
    6. Use the custom sink to write into the HBase table
package dim

import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._
import org.apache.flink.table.api.Table
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.flink.types.Row
object AreaDetailInfo {

  def main(args: Array[String]): Unit = {
    // Stream environment with a checkpoint every 5 s in exactly-once mode
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.enableCheckpointing(5000)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)

    val data: DataStream[(String, String)] = env.addSource(new HBaseReader)

    //data.print()

    // The reader emits (rowKey, "-"-joined cell values); in this table layout the
    // joined values carry the area name at position 5 and the parent id at position 6
    val dataStream: DataStream[AreaDetail] = data.map(x => {
      val id: Int = x._1.toInt
      val datas: Array[String] = x._2.split("-")
      val name: String = datas(5).trim
      val pid: Int = datas(6).trim.toInt
      AreaDetail(id, name, pid)
    })

    val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env)

    tableEnv.createTemporaryView("lagou_area",dataStream)


    // Self-join the hierarchy twice: each area joins its parent (the city),
    // and that city joins its own parent (the province)
    val sql: String =
      """
        |select a.id as areaId, a.name as aname, a.pid as cid, b.name as city, c.id as proid, c.name as province
        |from lagou_area as a
        |inner join lagou_area as b on a.pid = b.id
        |inner join lagou_area as c on b.pid = c.id
        |""".stripMargin

    val areaTable: Table = tableEnv.sqlQuery(sql)


    // toRetractStream emits (flag, row): the flag is true for accumulate (insert)
    // messages and false for retract messages; for this append-only inner join it
    // is always true, so only the row itself is kept here
    val resultStream: DataStream[DimArea] = tableEnv.toRetractStream[DimArea](areaTable).map(x => {
      val row: DimArea = x._2
      row
    })

    resultStream.print()

    resultStream.addSink(new HBaseWriter)
    env.execute("dimareabuild")



  }

}
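
One note on the conversion above: toRetractStream carries a Boolean flag per record (true for accumulate, false for retract). With this append-only inner join the flag is always true, which is what the original comment points out, so mapping straight to x._2 is safe. If the query were later changed to one that produces retractions, filtering on the flag keeps retracted rows out of the dimension table. A sketch of that defensive variant:

    // Keep only accumulate messages before writing to HBase
    val resultStream: DataStream[DimArea] = tableEnv
      .toRetractStream[DimArea](areaTable)
      .filter(_._1)   // true = accumulate, false = retract
      .map(_._2)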

Origin blog.csdn.net/weixin_38813363/article/details/114144119