kafka offset

package cn.analysys.stream.state

import java.nio.ByteBuffer

import cn.analysys.meta.MetaMapInfo
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import cn.analysys.third.hllc.HLLCounter
import cn.analysys.utils.{BridgeUtils, ParseUtils}
import org.apache.spark.streaming.State
import org.apache.spark.streaming.dstream.DStream


object StateProcess {
  val M = 10
  val TOPN = 20
  val MINUTECOMBINE = 300000
  val TIMEOUTMINUTE = 1440
  val OUTPUT = 100
  // Running count per key; the HLLCounter slot stays null so the output shape matches the HLLC functions below.
  val funcLongAdd = (key: String, tempId: Option[Long], state: State[Long]) => {
    val sum = state.getOption.getOrElse(0L) + tempId.getOrElse(1L)
    val hllc: HLLCounter = null
    val output = (key, (sum, hllc))
    state.update(sum)
    output
  }
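
  // A hedged wiring sketch (not in the original listing): `pvDstream` is an assumed
  // DStream[(String, Long)] of per-key increments built elsewhere in the job.
  // import org.apache.spark.streaming.{Minutes, StateSpec}
  // val pvState = pvDstream.mapWithState(
  //   StateSpec.function(funcLongAdd).timeout(Minutes(TIMEOUTMINUTE)))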

  def isNewKey(key: String, hllc: HLLCounter): Boolean = {
    var isNew = false
    val begin = hllc.getCountEstimate
    hllc.add(key)
    val end = hllc.getCountEstimate
    if (end > begin) isNew = true
    isNew
  }


  // Deduplication by hashed (appKey, deviceId, sessionId): the loadPage record is passed through
  // only the first time its key is seen; otherwise null is returned so it can be filtered downstream.
  val appKeyDeviceIdessionIDDistinct = (key: Int, loadPage: Option[LoadPageClassType], state: State[HLLCounter]) => {
    val hllc = state.getOption().getOrElse(new HLLCounter(M))
    var page: LoadPageClassType = null
    val begin = hllc.getCountEstimate
    hllc.add(key)
    val end = hllc.getCountEstimate
    if (end > begin) page = loadPage.getOrElse(null)
    val output = page
    state.update(hllc)
    output
  }

  // To cut down the data handled downstream, keys whose tempId was already counted (the estimate did not grow) are emitted with an empty key so they can be filtered out.
  val funcHllcAdd = (key: String, tempId: Option[String], state: State[HLLCounter]) => {
    val hllc = state.getOption().getOrElse(new HLLCounter(M))
    val begin = hllc.getCountEstimate
    hllc.add(tempId.getOrElse(""))
    val end = hllc.getCountEstimate
    val output = (if (end > begin) key else "", (hllc.getCountEstimate, hllc))
    state.update(hllc)
    output
  }
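
  // A hedged sketch of how funcHllcAdd could be applied, assuming `tempIdDstream` is a
  // DStream[(String, String)] of (metric key, tempId) pairs built elsewhere in the job.
  // Records whose tempId was already counted come back with an empty key (see the comment
  // above), so they can be dropped before any further processing:
  // import org.apache.spark.streaming.{Minutes, StateSpec}
  // val uvState = tempIdDstream.mapWithState(
  //   StateSpec.function(funcHllcAdd).timeout(Minutes(TIMEOUTMINUTE)))
  // val newKeysOnly = uvState.filter(_._1.nonEmpty)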

  // Every KV emitted here is a String key paired with a Long count or an HLLCounter.
  // Convention for the output key (split on MetaMapInfo.SEG): two segments are stored as a K:V pair,
  // three segments as a K:K:V structure (a hash keyed by the first two segments).
  // KVType: MetaMapInfo.KEYTYPEKV or MetaMapInfo.KEYTYPEKKV.
  // To limit Redis pressure: 1. two-segment keys are written directly, keeping the maximum value per key;
  // 2. three-segment keys are sorted on the third field, the top 20 are kept together with the sum across
  //    all dimensions, and then written to the store.
  def longDstreamSortAndOutPut(resultDstreamOrg: DStream[(String, (Long, HLLCounter))], KVType: String,
                               isHLLC: Boolean, sorted: Boolean = false) = {
    // debug
    // if (isHLLC)
    //   resultDstream.foreachRDD(rRdd => rRdd.take(5).foreach(x => println(s" key : ${x._1} hllc value ${x._2._2.getCountEstimate} ")))
    // else
    //   resultDstream.foreachRDD(rRdd => rRdd.take(5).foreach(x => println(s" key : ${x._1} long value ${x._2._1}")))
    val resultDstream = resultDstreamOrg.filter(result => result._2._1 > OUTPUT)
    resultDstream.foreachRDD(resultRdd => {
      val comparedTopRdd = resultRdd.reduceByKey((a, b) => if (a._1 > b._1) a else b) // mapWithState emits one record per input record, so keep only the maximum value per key.
      if (MetaMapInfo.KEYTYPEKV.equals(KVType)) {
        // Two-segment keys are written directly.
        comparedTopRdd.foreachPartition(partition => {
          BridgeUtils.initRedis(MetaMapInfo.URL, MetaMapInfo.TABLENAME, MetaMapInfo.USERNAME, MetaMapInfo.PASSWORD)
          if (isHLLC) partition.foreach(kv => BridgeUtils.putRedis(kv._1, getByteFromHllc(kv._2._2)))
          else partition.foreach(kv => BridgeUtils.putRedis(kv._1, kv._2._1))
        })
      } else {
        // Three-segment keys: these need to be sorted.
        val dimensionRdd = comparedTopRdd.filter(
          kv => kv._1.contains(MetaMapInfo.DIMSEG) && kv._1.split(MetaMapInfo.DIMSEG).size == 2)
          .map(kv => {
            val dims = kv._1.split(MetaMapInfo.DIMSEG)
            (dims(0), (dims(1), kv._2))
          })
        // Top 20: sorting required.
        if (sorted) {
          dimensionRdd.groupByKey().map(keyValueCollect => {
            val key = keyValueCollect._1
            val valueCollection = keyValueCollect._2
            val sortedKeyValue = valueCollection.toArray.sortWith(_._2._1 > _._2._1)
            (key, if (sortedKeyValue.size > TOPN) sortedKeyValue.take(TOPN) else sortedKeyValue)
          }).foreachPartition(partition => {
            BridgeUtils.initRedis(MetaMapInfo.URL, MetaMapInfo.TABLENAME, MetaMapInfo.USERNAME, MetaMapInfo.PASSWORD)
            partition.foreach(kvCollection => {
              val key = kvCollection._1
              // val sumValue = kvCollection._2
              val dimValue = kvCollection._2
              // BridgeUtils.putRedis(key, MetaMapInfo.ALLDIMENSIONCOUNT, sumValue)
              if (isHLLC) dimValue.foreach(kv => if (kv._2._2 != null) BridgeUtils.putRedis(key, kv._1, getByteFromHllc(kv._2._2)))
              else dimValue.foreach(kv => BridgeUtils.putRedis(key, kv._1, kv._2._1))
            })
          })
        }
        else {
          dimensionRdd.foreachPartition(partition => {
            BridgeUtils.initRedis(MetaMapInfo.URL, MetaMapInfo.TABLENAME, MetaMapInfo.USERNAME, MetaMapInfo.PASSWORD)
            partition.foreach(kvCollection => {
              val key = kvCollection._1
              val dimValue = kvCollection._2
              if (isHLLC && dimValue._2._2 != null) BridgeUtils.putRedis(key, dimValue._1, getByteFromHllc(dimValue._2._2))
              else BridgeUtils.putRedis(key, dimValue._1, dimValue._2._1)
            }
            )
          })
        }
      }
    })
  }
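
  // A hedged usage sketch: `uvState` stands for a DStream[(String, (Long, HLLCounter))]
  // produced by one of the mapWithState functions above.
  // longDstreamSortAndOutPut(uvState, MetaMapInfo.KEYTYPEKV, isHLLC = true)                  // two-segment keys, plain K:V writes
  // longDstreamSortAndOutPut(uvState, MetaMapInfo.KEYTYPEKKV, isHLLC = false, sorted = true) // three-segment keys, TOPN per dimension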


  def getByteFromHllc(hllc: HLLCounter): Array[Byte] = {
    val out1 = ByteBuffer.allocate(hllc.maxLength())
    hllc.writeRegisters(out1)
    out1.array()
  }

  def getHllcFromByte(bytes: Array[Byte], compressMode: Int): HLLCounter = {
    val hllCounter = new HLLCounter(compressMode)
    hllCounter.readRegisters(ByteBuffer.wrap(bytes))
    hllCounter
  }
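
  // A hedged round-trip sketch: the value passed as `compressMode` must match the precision the
  // counter was originally created with (M in this object); `someHllc` is a placeholder.
  // val bytes = getByteFromHllc(someHllc)
  // val restored = getHllcFromByte(bytes, M)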


  def createContext(brokers: String, topics: String, batchseconds: Int, checkpointDirectory: String
                    , DataType: String, calDate: String, offset: String): StreamingContext = {

    println(s" createContext \n  " +
      s" calbrokersDate : ${brokers} \n  " +
      s" topics : ${topics} \n  " +
      s" batchseconds : ${batchseconds}  \n " +
      s" checkpointDirectory : ${checkpointDirectory}  \n " +
      s" DataType : ${DataType}  \n " +
      s" calDate : ${calDate}  \n " +
      s" offset : ${offset}   ")

    val sparkConf = new SparkConf().setAppName("ArkStreamApp")
      .set("spark.streaming.backpressure.enabled","true")
      .set("spark.streaming.kafka.maxRatePerPartition","5")

    val ssc = new StreamingContext(sparkConf, Seconds(batchseconds))
    ssc.checkpoint(checkpointDirectory)
    val topicsSet = topics.split(",").toSet
    var kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers
      , "auto.offset.reset" -> "largest"
    )
    if ("smallest".equals(offset)) kafkaParams = kafkaParams.updated("auto.offset.reset", "smallest")


    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    var offsetRanges = Array.empty[OffsetRange]

    messages.transform(r => {
      offsetRanges = r.asInstanceOf[HasOffsetRanges].offsetRanges
      for (o <- offsetRanges) {
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
      }
      r
    }).count().print()
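
    // A hedged sketch, not part of the original job: the 0.8 direct stream does not commit
    // offsets automatically, so the captured ranges could be persisted per batch, e.g. by
    // reusing BridgeUtils (the key format below is illustrative only):
    // messages.foreachRDD { rdd =>
    //   val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    //   BridgeUtils.initRedis(MetaMapInfo.URL, MetaMapInfo.TABLENAME, MetaMapInfo.USERNAME, MetaMapInfo.PASSWORD)
    //   ranges.foreach(o => BridgeUtils.putRedis(s"offset:${o.topic}:${o.partition}", o.untilOffset))
    // }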


//    val dataInfoStream = messages.flatMap(
//      record => ParseUtils.parseArray(record._2, DataType)
//    ).filter(record => record.dataType.equals(MetaMapInfo.PINFO)) // only process pinfo data
//      .map(record => {
//      var mapData = record.dataMap
//      mapData = ParseUtils.eventTimeProcess(mapData) // event time handling
//      mapData = ParseUtils.appIDProcess(mapData) // appID handling
//      mapData = ParseUtils.sourceProcess(mapData) // source type handling
//      mapData
//    }).filter(dataMap =>
//      (dataMap.contains(MetaMapInfo.H5MAP("ApplicationKey"))
//        && dataMap(MetaMapInfo.H5MAP("ApplicationKey")) != null
//        && !(dataMap(MetaMapInfo.H5MAP("ApplicationKey")).isEmpty)
//        && dataMap("APPID") != null
//        && !dataMap("APPID").isEmpty
//        && dataMap("APPID") != "-1"
//        && dataMap("APPID").equals("100024")
//        && (!(dataMap("eventTime").isEmpty))
//        && dataMap("eventTime").contains(calDate)))



    // dataInfoStream.foreachRDD(r => r.take(100).foreach(record => println(s" dataInfoStream2 ${record("eventTime")} ")))
    // dataInfoStream.count().print
    // Business types:
    // 1. UV-style metrics: COUNT(DISTINCT tempId)
    //val UV = "UV"
    //val REGION_PROVINCE = "PROVINCE"
    //val REGION_CITY = "CITY"
    //val DEVICE_TYPE = "DEVICE-TYPE "
    //DstreamUVProcess.uvDStreamProcess(dataInfoStream)
    //DstreamUVProcess.uvDStreamProcessDim(dataInfoStream)
    // 2. Page-view (PV) statistics
    //val PAGE_COUNT = "PAGE-COUNT"
    //DstreamPagePvProcess.pagePVDStreamProcess(dataInfoStream)
    // 3. Landing page related statistics
    //val SOURCE_TYPE = "SOURCE-TYPE"
    //val SOURCE_AD = "SOURCE-AD"
    //val SOURCE_KEYWORD = "SOURCE-KEYWORD"
    //val LOAD_PAGE_INCOUNT = "LOAD-PAGE-INCOUNT"
    //DstreamLoadPageProcess.loadPageDStreamProcess(dataInfoStream)
    ssc
  }

}
