Apache Spark Structured Streaming and watermarks
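The tests below are assumed to run inside a ScalaTest suite that exposes a SparkSession named sparkSession (the logInfo calls in the last test are assumed to come from a logging trait mixed into the suite). As an assumption, not part of the original listing, the surrounding context looks roughly like this:

// Assumed test-suite context for the snippets below (hypothetical, adjust to your project layout)
import java.sql.Timestamp
import java.util.Date

import org.apache.spark.sql.{AnalysisException, ForeachWriter, Row, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions.{count, window}
import org.apache.spark.sql.streaming.Trigger

val sparkSession = SparkSession.builder()
  .appName("Structured Streaming watermark tests").master("local[2]").getOrCreate()
// Brings in the $"column" syntax and the implicit encoders required by MemoryStream
import sparkSession.implicits._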

"watermark" should "discard late data and accept 1 late but within watermark with window aggregation" in {
   val testKey = "watermark-window-test"
   val inputStream = new MemoryStream[(Timestamp, String)]( 1 , sparkSession.sqlContext)
   val now = 5000 L
   val aggregatedStream = inputStream.toDS().toDF( "created" , "name" )
     .withWatermark( "created" , "2 seconds" )
     .groupBy(window($ "created" , "2 seconds" )).count()
 
  val query = aggregatedStream.writeStream.outputMode("update")
    .foreach(new ForeachWriter[Row]() {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(processedRow: Row): Unit = {
        val window = processedRow.get(0)
        val rowRepresentation = s"${window.toString} -> ${processedRow.getAs[Long]("count")}"
        InMemoryKeyedStore.addValue(testKey, rowRepresentation)
      }
      override def close(errorOrNull: Throwable): Unit = {}
    }).start()
 
  new Thread(new Runnable() {
    override def run(): Unit = {
      inputStream.addData((new Timestamp(now), "a1"), (new Timestamp(now), "a2"),
        (new Timestamp(now - 4000L), "b1"))
      while (!query.isActive) {
        // wait for the query to activate
      }
      // The watermark is computed as MAX(event_time) - delay threshold, i.e.:
      // 5000 - 2000 = 3000
      // Thus, among the values sent below, only "a3" should be accepted because it's within the watermark
      Thread.sleep(7000)
      val timeOutOfWatermark = 1000L
      inputStream.addData((new Timestamp(timeOutOfWatermark), "b2"), (new Timestamp(timeOutOfWatermark), "b3"),
        (new Timestamp(timeOutOfWatermark), "b4"), (new Timestamp(now), "a3"))
    }
  }).start()
 
  query.awaitTermination(25000)

  val readValues = InMemoryKeyedStore.getValues(testKey)
  println(s"All data=${readValues}")
  // As you can notice, the count for the 0-2 window wasn't updated with the 3 late rows (b2, b3 and b4)
  // because they fall behind the watermark.
  // Please see how this behavior changes in the next test, where the watermark is defined as 10 seconds.
  readValues should have size 3
  readValues should contain allOf("[1970-01-01 01:00:00.0,1970-01-01 01:00:02.0] -> 1",
    "[1970-01-01 01:00:04.0,1970-01-01 01:00:06.0] -> 2",
    "[1970-01-01 01:00:04.0,1970-01-01 01:00:06.0] -> 3")
}
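The InMemoryKeyedStore used by the ForeachWriter above is not shown in the listing. A minimal, hypothetical sketch of such a helper, a process-local and thread-safe map from a test key to the values written by the sink, could look like this:

import scala.collection.mutable

// Hypothetical stand-in for the InMemoryKeyedStore helper referenced by the tests.
object InMemoryKeyedStore {
  private val values = new mutable.HashMap[String, Vector[String]]()

  def addValue(key: String, value: String): Unit = synchronized {
    // append the sink output under the given test key
    values.update(key, values.getOrElse(key, Vector.empty) :+ value)
  }

  def getValues(key: String): Seq[String] = synchronized {
    values.getOrElse(key, Vector.empty)
  }
}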
 
"late data but within watermark" should "be aggregated in correct windows" in {
   val testKey = "watermark-window-test-accepted-data"
   val inputStream = new MemoryStream[(Timestamp, String)]( 1 , sparkSession.sqlContext)
   val now = 5000 L
   val aggregatedStream = inputStream.toDS().toDF( "created" , "name" )
     .withWatermark( "created" , "10 seconds" )
     .groupBy(window($ "created" , "2 seconds" )).count()
 
  val query = aggregatedStream.writeStream.outputMode("update")
    .foreach(new ForeachWriter[Row]() {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(processedRow: Row): Unit = {
        val window = processedRow.get(0)
        val rowRepresentation = s"${window.toString} -> ${processedRow.getAs[Long]("count")}"
        InMemoryKeyedStore.addValue(testKey, rowRepresentation)
      }
      override def close(errorOrNull: Throwable): Unit = {}
    }).start()
 
  new Thread(new Runnable() {
    override def run(): Unit = {
      inputStream.addData((new Timestamp(now), "a1"), (new Timestamp(now), "a2"),
        (new Timestamp(now - 4000L), "b1"))
      while (!query.isActive) {
        // wait for the query to activate
      }
      // The watermark is computed as MAX(event_time) - delay threshold, i.e.:
      // 5000 - 10000 = -5000 (effectively 0)
      // Thus all of the values sent below, including the late b2, b3 and b4, stay within the watermark
      // and should be accepted
      Thread.sleep(7000)
      val timeOutOfWatermark = 1000L
      inputStream.addData((new Timestamp(timeOutOfWatermark), "b2"), (new Timestamp(timeOutOfWatermark), "b3"),
        (new Timestamp(timeOutOfWatermark), "b4"), (new Timestamp(now), "a3"))
    }
  }).start()
 
  query.awaitTermination(25000)

  val readValues = InMemoryKeyedStore.getValues(testKey)
  // Unlike in the previous test, the count for the 0-2 window was updated with the 3 late rows
  // (b2, b3 and b4) because the 10-second watermark still covers them.
  readValues should have size 4
  readValues should contain allOf("[1970-01-01 01:00:00.0,1970-01-01 01:00:02.0] -> 1",
    "[1970-01-01 01:00:00.0,1970-01-01 01:00:02.0] -> 4",
    "[1970-01-01 01:00:04.0,1970-01-01 01:00:06.0] -> 2",
    "[1970-01-01 01:00:04.0,1970-01-01 01:00:06.0] -> 3")
}
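The "MAX(event_time) - delay" reasoning in the comments can also be checked at runtime, because each StreamingQueryProgress exposes the current event-time watermark. A small sketch, assuming a running StreamingQuery named query as in the tests above:

// Sketch: inspect the event-time watermark computed by the engine after a trigger.
// lastProgress may be null before the first completed trigger.
val progress = query.lastProgress
if (progress != null) {
  // eventTime is a Map[String, String]; the "watermark" entry holds the current watermark as an
  // ISO-8601 timestamp, e.g. "1970-01-01T00:00:03.000Z" after the first batch of the first test.
  println(s"Current watermark: ${progress.eventTime.get("watermark")}")
}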
 
"watermark after aggregation" should "not be allowed in append mode" in {
   val inputStream = new MemoryStream[(Timestamp, String)]( 1 , sparkSession.sqlContext)
   inputStream.addData(( new Timestamp(System.currentTimeMillis()), "a" ),
     ( new Timestamp(System.currentTimeMillis()), "b" ))
   val aggregatedStream = inputStream.toDS().toDF( "created" , "name" )
     .groupBy( "created" ).count()
     .withWatermark( "created" , "10 seconds" )
 
   val exception = intercept[AnalysisException] {
     val query = aggregatedStream.writeStream.outputMode( "append" )
       .foreach( new NoopForeachWriter[Row]()).start()
     query.awaitTermination( 3000 )
   }
 
   exception.message should include ( "Append output mode not supported when there are streaming aggregations on streaming " +
     "DataFrames/DataSets without watermark" )
}
 
"watermark applied on different field than the aggregation in append mode" should "make the processing fail" in {
   val inputStream = new MemoryStream[(Timestamp, String)]( 1 , sparkSession.sqlContext)
   inputStream.addData(( new Timestamp(System.currentTimeMillis()), "a" ),
     ( new Timestamp(System.currentTimeMillis()), "b" ))
   val aggregatedStream = inputStream.toDS().toDF( "created" , "name" )
     .withWatermark( "created" , "10 seconds" )
     .agg(count( "name" ))
 
   val exception = intercept[AnalysisException] {
     // It fails only for append mode. If the watermark is not applied on grouped column it means no more
     // no less that this group is never finished and then that it would never output anything
     val query = aggregatedStream.writeStream.outputMode( "append" )
       .foreach( new NoopForeachWriter[Row]()).start()
     query.awaitTermination( 3000 )
   }
 
   exception.message should include ( "Append output mode not supported when there are streaming aggregations on streaming " +
     "DataFrames/DataSets without watermark" )
}
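Both tests above rely on a NoopForeachWriter that isn't shown in the listing. It is assumed to be a trivial ForeachWriter that accepts every row and discards it, since only the query analysis (and the resulting AnalysisException) matters here. A minimal sketch:

// Hypothetical stand-in for the NoopForeachWriter used in the append-mode tests above.
class NoopForeachWriter[T] extends ForeachWriter[T] {
  override def open(partitionId: Long, version: Long): Boolean = true
  override def process(value: T): Unit = {}
  override def close(errorOrNull: Throwable): Unit = {}
}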
 
"watermark in append mode" should "emit the results only after watermark expiration" in {
   // Accordingly to the Spark's documentation
   // (https://spark.apache.org/docs/2.2.1/structured-streaming-programming-guide.html#handling-late-data-and-watermarking)
   // in the append mode the partial results are not updated. Only the final result is computed
   // and emitted to the result table:
   // ```
   // Similar to the Update Mode earlier, the engine maintains intermediate counts for each window.
   // However, the partial counts are not updated to the Result Table and not written to sink.
   // The engine waits for “10 mins” for late date to be counted, then drops intermediate state of a
   // window < watermark, and appends the final counts to the Result Table/sink.
   // ```
   // But please notice that in the append output mode the results
   // are emitted once the watermark passed. Here the results for 00:00:10 will be
   // emitted only when the watermark will pass to 00:00:19 (00:00:10 is now before the watermark), i.e. after updating
   // the watermark for the record (24000L, 5).
   val testKey = "watermark-append-mode"
   val inputStream = new MemoryStream[(Timestamp, Int)]( 1 , sparkSession.sqlContext)
   val now = 5000 L
   val aggregatedStream = inputStream.toDS().toDF( "created" , "name" )
     .withWatermark( "created" , "5 second" )
     .groupBy( "created" )
     .count()
 
  val query = aggregatedStream.writeStream.trigger(Trigger.ProcessingTime("2 seconds")).outputMode("append")
    .foreach(new ForeachWriter[Row]() {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(processedRow: Row): Unit = {
        val rowRepresentation = s"${processedRow.getAs[Timestamp]("created")} -> ${processedRow.getAs[Long]("count")}"
        InMemoryKeyedStore.addValue(testKey, rowRepresentation)
        println(s"Processing ${rowRepresentation} at ${new Date(System.currentTimeMillis())}")
      }
      override def close(errorOrNull: Throwable): Unit = {}
    }).start()
 
  new Thread(new Runnable() {
    override def run(): Unit = {
      // send the first batch - max event time = 10 seconds, so the watermark will be 5 seconds
      inputStream.addData((new Timestamp(now + 5000), 1), (new Timestamp(now + 5000), 2), (new Timestamp(now + 5000), 3),
        (new Timestamp(now + 5000), 4))
      while (!query.isActive) {
        // wait for the query to activate
      }
      Thread.sleep(4000)
      inputStream.addData((new Timestamp(now), 6), (new Timestamp(6000), 7))
      Thread.sleep(4000)
      inputStream.addData((new Timestamp(24000L), 8)) // only to update the watermark
      Thread.sleep(4000)
      inputStream.addData((new Timestamp(4000L), 9))
      Thread.sleep(4000)
      inputStream.addData((new Timestamp(now), 10))
    }
  }).start()
 
  query.awaitTermination(45000)

  val readValues = InMemoryKeyedStore.getValues(testKey)
  readValues should have size 2
  readValues should contain allOf("1970-01-01 01:00:10.0 -> 4", "1970-01-01 01:00:06.0 -> 1")
}
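For comparison, the same append-mode behavior with the more common window() aggregation from the programming guide can be sketched as follows (the console sink, the stream id and the column names are assumptions made only for illustration). A window is written to the sink only once the watermark moves past the window's end:

// Sketch: append mode with a windowed aggregation. Each 2-second window is appended to the
// sink only after the watermark (MAX(event_time) - 5 seconds) passes the window's end.
val windowedInput = new MemoryStream[(Timestamp, String)](100, sparkSession.sqlContext)
val windowedCounts = windowedInput.toDS().toDF("created", "name")
  .withWatermark("created", "5 seconds")
  .groupBy(window($"created", "2 seconds"))
  .count()

val consoleQuery = windowedCounts.writeStream
  .outputMode("append")
  .format("console")
  .trigger(Trigger.ProcessingTime("2 seconds"))
  .start()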
 
 
"the watermark" should "be used in aggregations others than windowing" in {
   val testKey = "watermark-count-aggregation"
   val inputStream = new MemoryStream[(Timestamp, Int)]( 1 , sparkSession.sqlContext)
   val now = 5000 L
   val aggregatedStream = inputStream.toDS().toDF( "created" , "number" )
     .withWatermark( "created" , "1 second" )
     .groupBy( "created" )
     .count()
 
  val query = aggregatedStream.writeStream/*.trigger(Trigger.ProcessingTime("2 seconds"))*/.outputMode("update")
    .foreach(new ForeachWriter[Row]() {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(processedRow: Row): Unit = {
        val rowRepresentation = s"${processedRow.getAs[Timestamp]("created")} -> ${processedRow.getAs[Long]("count")}"
        InMemoryKeyedStore.addValue(testKey, rowRepresentation)
      }
      override def close(errorOrNull: Throwable): Unit = {}
    }).start()
 
  new Thread(new Runnable() {
    override def run(): Unit = {
      // send the first batch - max event time = 10 seconds, so the watermark will be 9 seconds
      inputStream.addData((new Timestamp(now + 5000), 1), (new Timestamp(now + 5000), 2), (new Timestamp(now + 5000), 3),
        (new Timestamp(now + 5000), 4))
      while (!query.isActive) {
        // wait for the query to activate
      }
      // Here the rows #5 and #6 are ignored because of the watermark.
      // On the other hand, the row #7 will be taken into account.
      // To see what happens without the watermark, you can comment out the .withWatermark(...) line above;
      // without it, the counter for *now* is also updated.
      Thread.sleep(10000)
      inputStream.addData((new Timestamp(4000L), 5))
      inputStream.addData((new Timestamp(now), 6), (new Timestamp(11000), 7))
    }
  }).start()
 
  query.awaitTermination(45000)

  val readValues = InMemoryKeyedStore.getValues(testKey)
  println(s"All data=${readValues}")
  // As you can notice, the counters for the rows #5 (4000) and #6 (5000) weren't updated because
  // their event times fall behind the watermark (9 seconds); only the row #7 (11000) was accepted.
  readValues should have size 2
  readValues should contain allOf("1970-01-01 01:00:10.0 -> 4", "1970-01-01 01:00:11.0 -> 1")
}
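As a side note, this state cleanup only applies to the update and append output modes: in complete mode the engine must re-emit every group on each trigger, so the watermark cannot be used to drop aggregation state. A sketch of the same aggregation in complete mode (the console sink and the stream id are illustrative assumptions):

// Sketch: the same count aggregation in complete output mode. The watermark is still tracked,
// but no aggregation state is evicted because complete mode must keep and re-emit every group.
val completeInput = new MemoryStream[(Timestamp, Int)](101, sparkSession.sqlContext)
val completeModeQuery = completeInput.toDS().toDF("created", "number")
  .withWatermark("created", "1 second")
  .groupBy("created")
  .count()
  .writeStream
  .outputMode("complete")
  .format("console")
  .start()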
 
"the data arriving after the watermark and the state older than the watermark" should "not be discarded correctly" in {
   val testKey = "watermark-deduplicate"
   val testKeyLastProgress = "watermark-deduplicate-last-progress"
   val inputStream = new MemoryStream[(Timestamp, Int)]( 1 , sparkSession.sqlContext)
   val now = 5000 L
   val aggregatedStream = inputStream.toDS().toDF( "created" , "number" )
     .withWatermark( "created" , "6 seconds" )
     .dropDuplicates( "number" , "created" )
 
  val query = aggregatedStream.writeStream.trigger(Trigger.ProcessingTime("2 seconds")).outputMode("update")
    .foreach(new ForeachWriter[Row]() {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(processedRow: Row): Unit = {
        val rowRepresentation =
          s"${processedRow.getAs[Timestamp]("created").toString} -> ${processedRow.getAs[Int]("number")}"
        InMemoryKeyedStore.addValue(testKey, rowRepresentation)
        println(s"processing ${rowRepresentation}")
      }
      override def close(errorOrNull: Throwable): Unit = {}
    }).start()
 
  new Thread(new Runnable() {
    override def run(): Unit = {
      // Events sent first - they should be correctly deduplicated, i.e. (5000, 1), (5000, 2) and (10000, 2)
      // should be kept.
      // As you can observe, the deduplication occurs on the pair (event time, value).
      inputStream.addData((new Timestamp(now), 1), (new Timestamp(now), 2),
        (new Timestamp(now), 1), (new Timestamp(now + 5000), 2))
      while (!query.isActive) {
        // wait for the query to activate
      }
      logInfo(s"Query was activated, sleep for 11 seconds before sending new data. Current timestamp " +
        s"is ${System.currentTimeMillis()}")
      Thread.sleep(11000)
      logInfo(s"Awaken at ${System.currentTimeMillis()} where the query status is ${query.lastProgress.json}")
      // In the logs we can observe the following entry:
      // ```
      // Filtering state store on: (created#5-T6000ms <= 4000000)
      // (org.apache.spark.sql.execution.streaming.StreamingDeduplicateExec:54)
      // ```
      // As you can correctly deduce, among the entries below:
      // - 1, 2 and 3 will be filtered
      // - 4 will be accepted
      // Moreover, the 4 will be used to compute the new watermark. Later in the logs we can observe the following:
      // ```
      // Filtering state store on: (created#5-T6000ms <= 6000000)
      // (org.apache.spark.sql.execution.streaming.StreamingDeduplicateExec:54)
      // ```
      inputStream.addData((new Timestamp(now), 1), (new Timestamp(now - 1000), 2),
        (new Timestamp(now - 3000), 3), (new Timestamp(now + 7000), 4))
      Thread.sleep(9000)
      // Here the value 1 is behind the watermark, so it's automatically discarded
      InMemoryKeyedStore.addValue(testKeyLastProgress, query.lastProgress.json)
      inputStream.addData((new Timestamp(now), 1))
      Thread.sleep(7000)
      inputStream.addData((new Timestamp(now), 1))
      InMemoryKeyedStore.addValue(testKeyLastProgress, query.lastProgress.json)
    }
  }).start()
 
  query.awaitTermination(55000)

  val accumulatedValues = InMemoryKeyedStore.getValues(testKey)
  accumulatedValues should have size 4
  accumulatedValues should contain allOf("1970-01-01 01:00:10.0 -> 2", "1970-01-01 01:00:05.0 -> 1",
    "1970-01-01 01:00:05.0 -> 2", "1970-01-01 01:00:12.0 -> 4")
  val seenProgresses = InMemoryKeyedStore.getValues(testKeyLastProgress)
  // This progress represents the moment where all 4 rows are still considered as within the watermark
  val initialProgress = seenProgresses(0)
  initialProgress should include("\"stateOperators\":[{\"numRowsTotal\":4,\"numRowsUpdated\":0}]")
  initialProgress should include("\"eventTime\":{\"watermark\":\"1970-01-01T00:00:04.000Z\"}")
  // This progress represents the moment where 2 rows (1970-01-01 01:00:05.0 -> 1 and 1970-01-01 01:00:05.0 -> 2)
  // were removed from the state store because of the watermark expiration
  val progressWithRemovedStates = seenProgresses(1)
  progressWithRemovedStates should include("\"stateOperators\":[{\"numRowsTotal\":2,\"numRowsUpdated\":0}]")
  progressWithRemovedStates should include("\"eventTime\":{\"watermark\":\"1970-01-01T00:00:06.000Z\"}")
}
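The deduplication above is bounded by the watermark: without withWatermark, dropDuplicates keeps every (number, created) pair ever seen in the state store. A minimal sketch of the two variants described in the programming guide (the stream id and column names are illustrative assumptions):

// Sketch: streaming deduplication with and without a watermark.
val dedupInput = new MemoryStream[(Timestamp, Int)](102, sparkSession.sqlContext)
val events = dedupInput.toDS().toDF("created", "number")

// Without a watermark, every (number, created) pair ever seen is kept in the state store,
// so the state grows without bound.
val dedupUnbounded = events.dropDuplicates("number", "created")

// With a watermark, state older than the watermark (here 6 seconds behind the max event time)
// is eventually removed, which is what the stateOperators metrics above show.
val dedupBounded = events
  .withWatermark("created", "6 seconds")
  .dropDuplicates("number", "created")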

ref:http://www.waitingforcode.com/apache-spark-structured-streaming/apache-spark-structured-streaming-watermarks/read


Reposted from blog.csdn.net/zhouyan8603/article/details/80311494