構造化ストリーミングでの複数のストリーミング集約のサポート

構造化ストリーミングでは、データセットに対して集計操作を繰り返し実行すると、次のエラーが報告される場合があります。

Multiple streaming aggregations are not supported with streaming DataFrames/Datasets
Multiple mapGroupsWithStates are not supported on a streaming DataFrames/Datasets
Mixing mapGroupsWithStates and flatMapGroupsWithStates are not supported on a streaming DataFrames/Datasets

上記のエラーが発生したため、長い間、Sparkは複数の集約チェーンをまったくサポートしていないと思っていました。しかし、ソースコードを調べたところ、最終的に次の記述を見つけました。

Multiple flatMapGroupsWithStates are not supported when they are not all in append mode or the output mode is not append on a streaming DataFrames/Datasets

これを見て、複数のflatMapGroupsWithStateを集約のために連結する場合、すべてが追加（Append）モードであれば許可されることがわかりました。つまり、複数のflatMapGroupsWithStateを追加出力モードで組み合わせれば、連続した集計操作を実装できるということです。

コードを直接見てください:

object PaasPvUv {

  // Thread-safe parser for the "yyyy-MM-dd HH:mm:ss" event-time strings.
  // FastDateFormat, unlike SimpleDateFormat, is safe to share across threads.
  val dateFormat: FastDateFormat = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")

  /**
   * Entry point. Reads `;`-separated lines from a socket source, parses them
   * into `PaasInput` events, and chains TWO `flatMapGroupsWithState` stages:
   *   1. per (app_id, aid) distinct-uid / distinct-session counting, and
   *   2. a per-app_id roll-up of those results.
   * Both stages and the sink run in Append output mode — which is what makes
   * chaining multiple stateful operators legal in Structured Streaming.
   */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .registerKryoClasses(Array(classOf[AidUvState], classOf[Roaring64Bitmap]))

    val spark = SparkSession.builder
      .appName("StructuredStreamingTest")
      .master("local[*]")
      .config(conf)
      // FIX: the original listing had lost the config key on this call — only the
      // value survived. The provider-class setting is the standard way to plug in
      // a custom (here RocksDB-backed) state store; confirm the key against the
      // original project if possible.
      .config("spark.sql.streaming.stateStore.providerClass",
        "com.neusoft.distinct.RocksDbStateStoreProvider")
      .config("spark.sql.shuffle.partitions", "1")
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    // Create DataFrame representing the stream of input lines from the socket at 192.168.1.171:9999
    val lines = spark.readStream
      .format("socket")
      .option("host", "192.168.1.171")
      .option("port", 9999)
      .load()

    import spark.implicits._
//    implicit val stateEncoder = org.apache.spark.sql.Encoders.bean(classOf[AidUvState])
//    implicit val resultEncoder = org.apache.spark.sql.Encoders.bean(classOf[AidUvResult])

    val query = lines.as[String].map(line => {
      println("line===>", line)
      // Expected layout: app_id;aid;uid;session;event-time
      val arr = line.split(";")
      PaasInput(arr(0), arr(1), arr(2), arr(3), this.formatTime2TimeStamp(arr(4)))
    }).as[PaasInput]
      // A watermark is mandatory here: EventTimeTimeout needs it to advance.
      .withWatermark("inputtime", "1 minutes")
      .groupByKey(line=>(line.app_id, line.aid))
      .flatMapGroupsWithState(OutputMode.Append(), GroupStateTimeout.EventTimeTimeout())(updateAcrossEvents2)
      // A watermark is mandatory here as well, for the second stateful stage.
      .withWatermark("inputtime", "1 minutes")
      .groupByKey(_.app_id)
      .flatMapGroupsWithState(OutputMode.Append(), GroupStateTimeout.EventTimeTimeout())(updateAppIdAcrossEvent2)
      .writeStream
      .outputMode(OutputMode.Append())
      .queryName("Aid_Uv_Count")
      .format("console")
      .start()

    query.awaitTermination()

  }

  /**
   * Stage-2 state function: rolls per-aid `AidUvResult`s up into a running
   * per-app_id UV/PV total held in `AppIdState`.
   *
   * On event-time timeout the state is removed and a final record
   * (last flag = true) is emitted. Otherwise the batch's counts are folded
   * into the running totals, the timeout is pushed 5 seconds past the newest
   * event time seen, and an interim record (last flag = false) is emitted.
   */
  def updateAppIdAcrossEvent2(appId: String, events: Iterator[AidUvResult], oldState: GroupState[AppIdState]): Iterator[AppIdResult] = {
    var state = if(oldState.exists) oldState.get else AppIdState(appId, new Timestamp(0L), 0L, 0L)
    if (oldState.hasTimedOut) {
      val state = oldState.get
      oldState.remove()
      Iterator(AppIdResult(appId, state.uv, state.pv, state.inputtime, true))
    } else {
      // Track the maximum event time to anchor the next timeout.
      var timemax: Long = state.inputtime.getTime
      for (event <- events) {
        timemax = math.max(timemax, event.inputtime.getTime)

        // Accumulate this result's uv/pv into the running state.
        state = state.copy(app_id = event.app_id, inputtime = new Timestamp(timemax), uv = state.uv+event.uv, pv=state.pv+event.pv)
      }
      oldState.update(state)
      // NOTE(review): with a brand-new state and an empty iterator, timemax is 0
      // and the timeout lands in the past — confirm this path cannot occur.
      oldState.setTimeoutTimestamp(timemax, "5 seconds")
      val ret = AppIdResult(app_id = appId, state.uv, state.pv, new Timestamp(timemax), false)
      println("ret:", ret)
      Iterator(ret)
    }
  }

  /**
   * Stage-1 state function: counts distinct uids (UV) and distinct sessions
   * per (app_id, aid) key, deduplicating via membership lists in `AidUvState`.
   *
   * On event-time timeout the state is removed and a final record
   * (last flag = true) is emitted; otherwise lists are grown with any unseen
   * uid/session, the timeout is pushed 5 seconds past the newest event time,
   * and an interim record (last flag = false) is emitted.
   *
   * NOTE(review): `contains` + `:+` on a list is O(n) per event, so the cost
   * grows with distinct-key cardinality; a Set (or the registered
   * Roaring64Bitmap) in the state class would be cheaper — left unchanged
   * because AidUvState is declared elsewhere.
   */
  def updateAcrossEvents2(groupKey: (String, String), events: Iterator[PaasInput], oldState: GroupState[AidUvState]): Iterator[AidUvResult] = {
    var state = if(oldState.exists) oldState.get else AidUvState(groupKey._1, groupKey._2, new Timestamp(0L), new mutable.MutableList[String], new mutable.MutableList[String])

    if (oldState.hasTimedOut) {
      val state = oldState.get
      oldState.remove()
      Iterator(AidUvResult(groupKey._1, groupKey._2, state.uids.size, state.sessions.size, state.inputtime, true))
    } else {
      var timemax: Long = state.inputtime.getTime
      for (event <- events) {
        timemax = math.max(timemax, event.inputtime.getTime)
        // Append uid/session only when unseen, so list sizes equal distinct counts.
        if(!state.uids.contains(event.uid)){
          state = state.copy(app_id = event.app_id, aid = event.aid, inputtime = new Timestamp(timemax), uids = state.uids :+ event.uid, state.sessions)
        }
        if(!state.sessions.contains(event.s)){
          state = state.copy(app_id = event.app_id, aid = event.aid, inputtime = new Timestamp(timemax), state.uids, sessions = state.sessions :+ event.s)
        }
        // Always refresh the event time, even when neither list grew.
        state = state.copy(app_id = event.app_id, aid = event.aid, inputtime = new Timestamp(timemax), state.uids, state.sessions)
      }
      oldState.update(state)
      // NOTE(review): same empty-iterator caveat as in updateAppIdAcrossEvent2.
      oldState.setTimeoutTimestamp(timemax, "5 seconds")
      val ret = AidUvResult(app_id = groupKey._1, aid = groupKey._2, state.uids.size, state.sessions.size, new Timestamp(timemax), false)
      println("ret:", ret)
      Iterator(ret)
    }
  }

  /**
   * Parses a "yyyy-MM-dd HH:mm:ss" string into a java.sql.Timestamp using the
   * shared thread-safe formatter. Throws ParseException on malformed input.
   */
  def formatTime2TimeStamp(timeStr: String): Timestamp ={
    val milliSeconds = dateFormat.parse(timeStr).getTime
    val timestamp = new Timestamp(milliSeconds)

    timestamp
  }

}

次に、次のデータを入力します。

9320432d;lss_0bc36d14;9320432d_4444444441520999402417;9320432d_444444444;2021-03-15 16:19:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:20:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:21:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402417;9320432d_444444444;2021-03-15 16:22:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:23:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:24:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:25:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:26:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:26:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:27:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:28:00

出力は次のとおりです。

上記が、構造化ストリーミングにおける連続的な集約チェーン（追加モード）の完全な実装です。今後、同様の例外が報告された場合は、ソースコードを調べることで新しい解決策を見つけられる可能性があります。

おすすめ

転載: blog.csdn.net/qq_32323239/article/details/114988109