Spark Streaming: Saving Kafka Message Offsets in HBase (Tested and Working)

Overview

Kafka message offsets are stored in HBase: each Spark Streaming batch starts consuming from the offsets saved by the previous batch, and the stored offsets are updated only after the batch's processing logic has completed. Because offsets are committed by hand after processing rather than auto-committed by the consumer, an abnormal exit can no longer lose messages; at worst the last batch is reprocessed (at-least-once semantics).
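
In outline, every batch follows the same read-process-commit cycle (a sketch only; process stands in for the business logic, while stream, topics and table are wired up in the utility class and example below):

stream.foreachRDD { rdd =>
  // 1. Remember where this batch ends.
  val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // 2. Run the processing logic for the batch.
  process(rdd)
  // 3. Only then persist the new offsets to HBase.
  OffsetHbaseUtil.storeOffSet(ranges, topics, table)
}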

Utility class

import java.util.HashMap

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get, Put, Table}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies, OffsetRange}
object OffsetHbaseUtil {

  /** Get an HBase connection from the given ZooKeeper quorum. */
  def getConnection(zkhosts_hbase: String) = {
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", zkhosts_hbase)
    ConnectionFactory.createConnection(hbaseConf)
  }

  /**
   * Create the direct stream, resuming from the offsets stored in HBase
   * when any exist, otherwise starting according to auto.offset.reset.
   */
  def createStreamingContextHbase(ssc: StreamingContext,
                                  topics: Array[String],
                                  kafkaParams: Map[String, Object],
                                  table: Table): InputDStream[ConsumerRecord[String, String]] = {
    val offSets = getOffset(topics, table)
    if (offSets.nonEmpty) {
      KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offSets))
    } else {
      KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
    }
  }

  /**
   * Read the previously stored offsets for the given topics from HBase.
   */
  def getOffset(topics: Array[String], table: Table) = {
    val fromOffSets = scala.collection.mutable.LinkedHashMap[TopicPartition, Long]()
    for (topic <- topics) {
      val get = new Get(s"${topic}_offset".getBytes)
      val result = table.get(get)
      if (result.getRow != null) {
        val topicBytes = result.getValue("topicinfo".getBytes, "topic".getBytes)
        val partitionBytes = result.getValue("topicinfo".getBytes, "partition".getBytes)
        // Parse only when both cells are present.
        if (topicBytes != null && partitionBytes != null) {
          val top = new String(topicBytes)
          // Cell value format: "partition|untilOffset,partition|untilOffset,..."
          for (entry <- new String(partitionBytes).split(",")) {
            val pt = entry.split("\\|")
            fromOffSets.put(new TopicPartition(top, pt(0).toInt), pt(1).toLong)
          }
        }
      }
    }
    fromOffSets
  }

  /**
   * Persist the latest offset of each topic partition back to HBase.
   */
  def storeOffSet(ranges: Array[OffsetRange], topics: Array[String], table: Table) = {
    import scala.collection.JavaConverters._
    // Collect one "partition|untilOffset" list per topic.
    val map = new HashMap[String, String]
    for (r <- ranges) {
      val entry = s"${r.partition}|${r.untilOffset}"
      if (map.get(r.topic) != null) map.put(r.topic, map.get(r.topic) + "," + entry)
      else map.put(r.topic, entry)
    }
    // Write one row per topic, keyed "<topic>_offset".
    for ((topic, partitions) <- map.asScala) {
      val put = new Put(s"${topic}_offset".getBytes)
      put.addColumn("topicinfo".getBytes, "topic".getBytes, topic.getBytes)
      put.addColumn("topicinfo".getBytes, "partition".getBytes, partitions.getBytes)
      table.put(put)
    }
  }
}
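
For reference, a minimal round-trip sketch of the two sides of the utility (assumptions: HBase reachable through a local ZooKeeper at localhost:2181, and an existing kafka_offSet table with column family topicinfo; the topic name and offsets are made up):

import org.apache.hadoop.hbase.TableName
import org.apache.spark.streaming.kafka010.OffsetRange

object OffsetHbaseUtilDemo {
  def main(args: Array[String]): Unit = {
    // Assumed quorum and table name; adjust to your environment.
    val connection = OffsetHbaseUtil.getConnection("localhost:2181")
    val table = connection.getTable(TableName.valueOf("kafka_offSet"))

    // Pretend a batch just finished at these offsets.
    val ranges = Array(
      OffsetRange("EventReportTopic", 0, fromOffset = 0L, untilOffset = 100L),
      OffsetRange("EventReportTopic", 1, fromOffset = 0L, untilOffset = 98L))
    OffsetHbaseUtil.storeOffSet(ranges, Array("EventReportTopic"), table)

    // Should print something like:
    // Map(EventReportTopic-0 -> 100, EventReportTopic-1 -> 98)
    println(OffsetHbaseUtil.getOffset(Array("EventReportTopic"), table))

    table.close()
    connection.close()
  }
}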

Example

import org.apache.hadoop.hbase.TableName
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.HasOffsetRanges

object Event2CKStreamJob {

  def compute(spark: SparkSession, args: Array[String]): Unit = {

    // Validate the number of submitted arguments
    if (args.length < 15) {
      System.err.println(
        """Usage:
          |  <master> <jobId>
          |  <ckHost> <ckPort> <dbName> <ckUser> <ckPasswd>
          |  <zkQuorum> <bootstrapServers> <consumerGroupID> <topics_input>
          |  <batchDuration> <autoOffsetReset> <maxRatePerPartition> <htable>
        """.stripMargin)
      sys.exit(1)
    }

    // Unpack the positional arguments
    println("params=>" + args.mkString(" "))
    val Array(master, jobId,
    ckHost,
    ckPort,
    dbName,
    ckUser,
    ckPasswd,
    zkQuorum,
    bootstrapServers,
    consumerGroupID,
    topics_input,
    batchDuration,
    autoOffsetReset,
    maxRatePerPartition,
    htable
    ) = args

    println(zkQuorum)

    // NOTE: to take effect, these must be supplied before the SparkContext is
    // created (e.g. via spark-submit --conf); mutating the conf of a running
    // context does nothing.
    spark.sparkContext.getConf.set("spark.streaming.kafka.maxRatePerPartition", maxRatePerPartition)
    spark.sparkContext.getConf.set("spark.streaming.stopGracefullyOnShutdown", "true")

    val ssc = new StreamingContext(spark.sparkContext, Seconds(batchDuration.toLong))

    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers,
      ConsumerConfig.GROUP_ID_CONFIG -> consumerGroupID,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      "auto.offset.reset" -> autoOffsetReset,
      "session.timeout.ms" -> "30000",
      "heartbeat.interval.ms" -> "10000",
      "fetch.max.wait.ms" -> "3000",
      "fetch.min.bytes" -> "4194304",
      "max.poll.interval.ms" -> "300000",
      // Auto-commit must stay off: offsets are committed to HBase by hand.
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    kafkaParams.foreach { case (k, v) => println(s"$k:$v") }


    val topicsSet = topics_input.split(",").toSet
    val topics = topicsSet.toArray
    val topic = topics(0)

    var kafkaStreams: InputDStream[ConsumerRecord[String, String]] = null
//    kafkaStreams = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent,
//      ConsumerStrategies.Subscribe(topicsSet, kafkaParams))


    val table = OffsetHbaseUtil.getConnection(zkQuorum).getTable(TableName.valueOf(htable))
    kafkaStreams = OffsetHbaseUtil.createStreamingContextHbase(ssc, topics, kafkaParams, table)

    kafkaStreams.foreachRDD((rdd, batchTime) => {
      // Grab this batch's offset ranges before any transformation.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      import spark.implicits._
      val filterDS = getFilterDS(spark, rdd.map(_.value()).toDS())

      val startEventDS = getStartEventDS(spark, filterDS)
      val msgReadEventDS = getMsgReadEventDS(spark, filterDS)
      val msgAckEventDS = getMsgAckEventDS(spark, filterDS)

      val jdbcUrl = s"jdbc:clickhouse://$ckHost:$ckPort/$dbName"
      val prop = getCKJdbcProperties(ckUser, ckPasswd)

      startEventDS.write
        .mode(SaveMode.Append)
        .jdbc(jdbcUrl, "event_start", prop)

      msgReadEventDS.write
        .mode(SaveMode.Append)
        .jdbc(jdbcUrl, "event_msg_read", prop)

      msgAckEventDS.write
        .mode(SaveMode.Append)
        .jdbc(jdbcUrl, "event_msg_ack", prop)

      // Store the new offsets only after all three writes have succeeded.
      OffsetHbaseUtil.storeOffSet(offsetRanges, topics, table)
    })

    ssc.start()
    ssc.awaitTermination()
  }

  // getFilterDS, getStartEventDS, getMsgReadEventDS, getMsgAckEventDS and
  // getCKJdbcProperties are application-specific helpers omitted from the post.
}

HBase table contents

hbase(main):002:0> scan 'kafka_offSet'
ROW                                  COLUMN+CELL
 EventReportTopic_offset        column=topicinfo:partition, timestamp=1675072501550, value=0|473609798,1|473518108,2|472762027
 EventReportTopic_offset        column=topicinfo:topic, timestamp=1675072501550, value=EventReportTopic
 PubMsgTopic_offset             column=topicinfo:partition, timestamp=1675072500591, value=0|276,1|276,2|266
 PubMsgTopic_offset             column=topicinfo:topic, timestamp=1675072500591, value=PubMsgTopic
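
This layout comes from storeOffSet: one row per topic keyed <topic>_offset, with every partition|untilOffset pair joined by commas in topicinfo:partition. The table needs only that single column family; assuming it does not exist yet, it can be created in the HBase shell:

hbase(main):001:0> create 'kafka_offSet', 'topicinfo'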

References

https://blog.cloudera.com/offset-management-for-apache-kafka-with-apache-spark-streaming/

Reposted from blog.csdn.net/qq_16038125/article/details/128806688