構造化ストリーミングでは、データセットに対して集計操作を繰り返し実行すると、次のエラーが報告される場合があります。
Multiple streaming aggregations are not supported with streaming DataFrames/Datasets
Multiple mapGroupsWithStates are not supported on a streaming DataFrames/Datasets
Mixing mapGroupsWithStates and flatMapGroupsWithStates are not supported on a streaming DataFrames/Datasets
上記のエラーが発生したため、長い間、Sparkは複数の集約のチェーンをまったくサポートしていないと思っていました。しかし昨日、ソースコードを調べたところ、最終的に次の一文を見つけました。
Multiple flatMapGroupsWithStates are not supported when they are not all in append mode or the output mode is not append on a streaming DataFrames/Datasets
これを見て、複数のflatMapGroupsWithStateを連結して集約する場合、それらがすべてAppendモードであればよいことがわかりました。つまり、複数のflatMapGroupsWithStateをAppend出力モードで使うことで、連続した集計操作を実装できるということです。
コードを直接見てください:
object PaasPvUv {

  // Parser for the "yyyy-MM-dd HH:mm:ss" event-time field of each input record.
  // FastDateFormat is thread-safe, unlike java.text.SimpleDateFormat.
  val dateFormat: FastDateFormat = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")

  /**
   * Entry point. Reads ';'-separated records from a socket source and chains TWO
   * stateful aggregations on one streaming Dataset:
   *   1. per (app_id, aid): UV / session counts  ([[updateAcrossEvents2]])
   *   2. per app_id:        UV / PV roll-up      ([[updateAppIdAcrossEvent2]])
   * Chaining two flatMapGroupsWithState operators is only allowed when every
   * operator AND the sink use Append output mode, which is what this job does.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .registerKryoClasses(Array(classOf[AidUvState], classOf[Roaring64Bitmap]))

    val spark = SparkSession.builder
      .appName("StructuredStreamingTest")
      .master("local[*]")
      .config(conf)
      // FIX: the config KEY was missing in the original, leaving the provider-class
      // string dangling (a syntax error). The value is a StateStoreProvider
      // implementation, so it belongs to this setting.
      .config("spark.sql.streaming.stateStore.providerClass",
        "com.neusoft.distinct.RocksDbStateStoreProvider")
      .config("spark.sql.shuffle.partitions", "1")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // DataFrame representing the stream of input lines from the socket connection.
    val lines = spark.readStream
      .format("socket")
      .option("host", "192.168.1.171")
      .option("port", 9999)
      .load()

    import spark.implicits._

    val query = lines.as[String]
      .map { line =>
        println("line===>", line)
        // Record layout: app_id;aid;uid;session;event-time
        // NOTE(review): a malformed line (< 5 fields) will fail the query here.
        val arr = line.split(";")
        PaasInput(arr(0), arr(1), arr(2), arr(3), this.formatTime2TimeStamp(arr(4)))
      }.as[PaasInput]
      // A watermark on the event-time column is mandatory before a stateful
      // operator that uses EventTimeTimeout.
      .withWatermark("inputtime", "1 minutes")
      .groupByKey(line => (line.app_id, line.aid))
      .flatMapGroupsWithState(OutputMode.Append(), GroupStateTimeout.EventTimeTimeout())(updateAcrossEvents2)
      // A watermark is required again on the intermediate result before the
      // second stateful stage.
      .withWatermark("inputtime", "1 minutes")
      .groupByKey(_.app_id)
      .flatMapGroupsWithState(OutputMode.Append(), GroupStateTimeout.EventTimeTimeout())(updateAppIdAcrossEvent2)
      .writeStream
      // Append is the only output mode compatible with chained
      // flatMapGroupsWithState operators.
      .outputMode(OutputMode.Append())
      .queryName("Aid_Uv_Count")
      .format("console")
      .start()

    query.awaitTermination()
  }

  /**
   * State function for the second stage: accumulates UV/PV totals per app_id
   * from the first stage's per-(app_id, aid) results.
   *
   * @param appId    group key
   * @param events   new intermediate results for this key in the current trigger
   * @param oldState managed per-key state; removed on event-time timeout
   * @return one [[AppIdResult]] per invocation; the `true` flag marks the final
   *         result emitted when the key times out
   */
  def updateAppIdAcrossEvent2(appId: String, events: Iterator[AidUvResult], oldState: GroupState[AppIdState]): Iterator[AppIdResult] = {
    if (oldState.hasTimedOut) {
      // Watermark passed the timeout timestamp: emit the final result and drop state.
      val state = oldState.get
      oldState.remove()
      Iterator(AppIdResult(appId, state.uv, state.pv, state.inputtime, true))
    } else {
      var state = if (oldState.exists) oldState.get else AppIdState(appId, new Timestamp(0L), 0L, 0L)
      // Track the max event time seen so far; it drives the timeout below.
      var timemax: Long = state.inputtime.getTime
      for (event <- events) {
        timemax = math.max(timemax, event.inputtime.getTime)
        state = state.copy(
          app_id = event.app_id,
          inputtime = new Timestamp(timemax),
          uv = state.uv + event.uv,
          pv = state.pv + event.pv)
      }
      oldState.update(state)
      // Key times out 5 seconds (event time) after its newest event.
      // NOTE(review): with no prior state and no events, timemax is 0 and this
      // sets a timeout in the past — confirm Spark tolerates that here.
      oldState.setTimeoutTimestamp(timemax, "5 seconds")
      val ret = AppIdResult(app_id = appId, state.uv, state.pv, new Timestamp(timemax), false)
      println("ret:", ret)
      Iterator(ret)
    }
  }

  /**
   * State function for the first stage: per (app_id, aid) key, collects the
   * distinct uids and sessions seen, emitting their running counts.
   *
   * @param groupKey (app_id, aid)
   * @param events   new raw records for this key in the current trigger
   * @param oldState managed per-key state; removed on event-time timeout
   * @return one [[AidUvResult]] per invocation; the `true` flag marks the final
   *         result emitted when the key times out
   */
  def updateAcrossEvents2(groupKey: (String, String), events: Iterator[PaasInput], oldState: GroupState[AidUvState]): Iterator[AidUvResult] = {
    if (oldState.hasTimedOut) {
      // Watermark passed the timeout timestamp: emit the final counts and drop state.
      val state = oldState.get
      oldState.remove()
      Iterator(AidUvResult(groupKey._1, groupKey._2, state.uids.size, state.sessions.size, state.inputtime, true))
    } else {
      var state =
        if (oldState.exists) oldState.get
        else AidUvState(groupKey._1, groupKey._2, new Timestamp(0L),
          new mutable.MutableList[String], new mutable.MutableList[String])
      var timemax: Long = state.inputtime.getTime
      for (event <- events) {
        timemax = math.max(timemax, event.inputtime.getTime)
        // FIX: the original issued up to three successive copy() calls per event;
        // fused into one equivalent copy. `contains` on these lists is O(n) —
        // fine for small cardinalities, a Set/bitmap would scale better.
        val uids =
          if (state.uids.contains(event.uid)) state.uids else state.uids :+ event.uid
        val sessions =
          if (state.sessions.contains(event.s)) state.sessions else state.sessions :+ event.s
        state = state.copy(
          app_id = event.app_id,
          aid = event.aid,
          inputtime = new Timestamp(timemax),
          uids = uids,
          sessions = sessions)
      }
      oldState.update(state)
      // Key times out 5 seconds (event time) after its newest event.
      oldState.setTimeoutTimestamp(timemax, "5 seconds")
      val ret = AidUvResult(app_id = groupKey._1, aid = groupKey._2,
        state.uids.size, state.sessions.size, new Timestamp(timemax), false)
      println("ret:", ret)
      Iterator(ret)
    }
  }

  /**
   * Parses a "yyyy-MM-dd HH:mm:ss" string into a [[Timestamp]].
   * @throws java.text.ParseException if the string does not match the pattern
   */
  def formatTime2TimeStamp(timeStr: String): Timestamp =
    new Timestamp(dateFormat.parse(timeStr).getTime)
}
次に、次のデータを入力します。
9320432d;lss_0bc36d14;9320432d_4444444441520999402417;9320432d_444444444;2021-03-15 16:19:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:20:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:21:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402417;9320432d_444444444;2021-03-15 16:22:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:23:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:24:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:25:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:26:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:26:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:27:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:28:00
出力は次のとおりです（コンソール出力のキャプチャはここでは省略しています）。
上記は、完全な構造化ストリーミングの下での連続的な集約チェーンの実現ですが、追加モードです。将来、異常な情報が報告された場合は、ソースコードを探す方が良いでしょうし、新しい解決策を見つけることが可能です。