Apache Spark Structured Streaming and watermarks
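The tests below are assumed to run inside a ScalaTest suite that exposes a SparkSession named sparkSession (the logInfo calls in the last test are assumed to come from a logging trait mixed into the suite). As an assumption, not part of the original listing, the surrounding context looks roughly like this:

// Assumed test-suite context for the snippets below (hypothetical, adjust to your project layout)
import java.sql.Timestamp
import java.util.Date

import org.apache.spark.sql.{AnalysisException, ForeachWriter, Row, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions.{count, window}
import org.apache.spark.sql.streaming.Trigger

val sparkSession = SparkSession.builder()
  .appName("Structured Streaming watermark tests").master("local[2]").getOrCreate()
// Brings in the $"column" syntax and the implicit encoders required by MemoryStream
import sparkSession.implicits._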

"watermark" should "discard late data and accept 1 late but within watermark with window aggregation" in {
   val testKey = "watermark-window-test"
   val inputStream = new MemoryStream[(Timestamp, String)]( 1 , sparkSession.sqlContext)
   val now = 5000 L
   val aggregatedStream = inputStream.toDS().toDF( "created" , "name" )
     .withWatermark( "created" , "2 seconds" )
     .groupBy(window($ "created" , "2 seconds" )).count()
 
  val query = aggregatedStream.writeStream.outputMode("update")
    .foreach(new ForeachWriter[Row]() {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(processedRow: Row): Unit = {
        val window = processedRow.get(0)
        val rowRepresentation = s"${window.toString} -> ${processedRow.getAs[Long]("count")}"
        InMemoryKeyedStore.addValue(testKey, rowRepresentation)
      }
      override def close(errorOrNull: Throwable): Unit = {}
    }).start()
 
  new Thread(new Runnable() {
    override def run(): Unit = {
      inputStream.addData((new Timestamp(now), "a1"), (new Timestamp(now), "a2"),
        (new Timestamp(now - 4000L), "b1"))
      while (!query.isActive) {
        // wait for the query to activate
      }
      // The watermark is computed as MAX(event_time) - delay threshold, i.e.:
      // 5000 - 2000 = 3000
      // Thus, among the values sent below, only "a3" should be accepted because it's within the watermark
      Thread.sleep(7000)
      val timeOutOfWatermark = 1000L
      inputStream.addData((new Timestamp(timeOutOfWatermark), "b2"), (new Timestamp(timeOutOfWatermark), "b3"),
        (new Timestamp(timeOutOfWatermark), "b4"), (new Timestamp(now), "a3"))
    }
  }).start()
 
  query.awaitTermination(25000)

  val readValues = InMemoryKeyedStore.getValues(testKey)
  println(s"All data=${readValues}")
  // As you can notice, the count for the 0-2 window wasn't updated with the 3 late rows (b2, b3 and b4)
  // because they fall behind the watermark.
  // Please see how this behavior changes in the next test, where the watermark is defined as 10 seconds.
  readValues should have size 3
  readValues should contain allOf("[1970-01-01 01:00:00.0,1970-01-01 01:00:02.0] -> 1",
    "[1970-01-01 01:00:04.0,1970-01-01 01:00:06.0] -> 2",
    "[1970-01-01 01:00:04.0,1970-01-01 01:00:06.0] -> 3")
}
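The InMemoryKeyedStore used by the ForeachWriter above is not shown in the listing. A minimal, hypothetical sketch of such a helper, a process-local and thread-safe map from a test key to the values written by the sink, could look like this:

import scala.collection.mutable

// Hypothetical stand-in for the InMemoryKeyedStore helper referenced by the tests.
object InMemoryKeyedStore {
  private val values = new mutable.HashMap[String, Vector[String]]()

  def addValue(key: String, value: String): Unit = synchronized {
    // append the sink output under the given test key
    values.update(key, values.getOrElse(key, Vector.empty) :+ value)
  }

  def getValues(key: String): Seq[String] = synchronized {
    values.getOrElse(key, Vector.empty)
  }
}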
 
"late data but within watermark" should "be aggregated in correct windows" in {
   val testKey = "watermark-window-test-accepted-data"
   val inputStream = new MemoryStream[(Timestamp, String)]( 1 , sparkSession.sqlContext)
   val now = 5000 L
   val aggregatedStream = inputStream.toDS().toDF( "created" , "name" )
     .withWatermark( "created" , "10 seconds" )
     .groupBy(window($ "created" , "2 seconds" )).count()
 
  val query = aggregatedStream.writeStream.outputMode("update")
    .foreach(new ForeachWriter[Row]() {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(processedRow: Row): Unit = {
        val window = processedRow.get(0)
        val rowRepresentation = s"${window.toString} -> ${processedRow.getAs[Long]("count")}"
        InMemoryKeyedStore.addValue(testKey, rowRepresentation)
      }
      override def close(errorOrNull: Throwable): Unit = {}
    }).start()
 
  new Thread(new Runnable() {
    override def run(): Unit = {
      inputStream.addData((new Timestamp(now), "a1"), (new Timestamp(now), "a2"),
        (new Timestamp(now - 4000L), "b1"))
      while (!query.isActive) {
        // wait for the query to activate
      }
      // The watermark is computed as MAX(event_time) - delay threshold, i.e.:
      // 5000 - 10000 = -5000 (effectively 0)
      // Thus all of the values sent below, including the late b2, b3 and b4, stay within the watermark
      // and should be accepted
      Thread.sleep(7000)
      val timeOutOfWatermark = 1000L
      inputStream.addData((new Timestamp(timeOutOfWatermark), "b2"), (new Timestamp(timeOutOfWatermark), "b3"),
        (new Timestamp(timeOutOfWatermark), "b4"), (new Timestamp(now), "a3"))
    }
  }).start()
 
  query.awaitTermination(25000)

  val readValues = InMemoryKeyedStore.getValues(testKey)
  // Unlike in the previous test, the count for the 0-2 window was updated with the 3 late rows
  // (b2, b3 and b4) because the 10-second watermark still covers them.
  readValues should have size 4
  readValues should contain allOf("[1970-01-01 01:00:00.0,1970-01-01 01:00:02.0] -> 1",
    "[1970-01-01 01:00:00.0,1970-01-01 01:00:02.0] -> 4",
    "[1970-01-01 01:00:04.0,1970-01-01 01:00:06.0] -> 2",
    "[1970-01-01 01:00:04.0,1970-01-01 01:00:06.0] -> 3")
}
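The "MAX(event_time) - delay" reasoning in the comments can also be checked at runtime, because each StreamingQueryProgress exposes the current event-time watermark. A small sketch, assuming a running StreamingQuery named query as in the tests above:

// Sketch: inspect the event-time watermark computed by the engine after a trigger.
// lastProgress may be null before the first completed trigger.
val progress = query.lastProgress
if (progress != null) {
  // eventTime is a Map[String, String]; the "watermark" entry holds the current watermark as an
  // ISO-8601 timestamp, e.g. "1970-01-01T00:00:03.000Z" after the first batch of the first test.
  println(s"Current watermark: ${progress.eventTime.get("watermark")}")
}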
 
"watermark after aggregation" should "not be allowed in append mode" in {
   val inputStream = new MemoryStream[(Timestamp, String)]( 1 , sparkSession.sqlContext)
   inputStream.addData(( new Timestamp(System.currentTimeMillis()), "a" ),
     ( new Timestamp(System.currentTimeMillis()), "b" ))
   val aggregatedStream = inputStream.toDS().toDF( "created" , "name" )
     .groupBy( "created" ).count()
     .withWatermark( "created" , "10 seconds" )
 
   val exception = intercept[AnalysisException] {
     val query = aggregatedStream.writeStream.outputMode( "append" )
       .foreach( new NoopForeachWriter[Row]()).start()
     query.awaitTermination( 3000 )
   }
 
   exception.message should include ( "Append output mode not supported when there are streaming aggregations on streaming " +
     "DataFrames/DataSets without watermark" )
}
 
"watermark applied on different field than the aggregation in append mode" should "make the processing fail" in {
   val inputStream = new MemoryStream[(Timestamp, String)]( 1 , sparkSession.sqlContext)
   inputStream.addData(( new Timestamp(System.currentTimeMillis()), "a" ),
     ( new Timestamp(System.currentTimeMillis()), "b" ))
   val aggregatedStream = inputStream.toDS().toDF( "created" , "name" )
     .withWatermark( "created" , "10 seconds" )
     .agg(count( "name" ))
 
   val exception = intercept[AnalysisException] {
     // It fails only for append mode. If the watermark is not applied on grouped column it means no more
     // no less that this group is never finished and then that it would never output anything
     val query = aggregatedStream.writeStream.outputMode( "append" )
       .foreach( new NoopForeachWriter[Row]()).start()
     query.awaitTermination( 3000 )
   }
 
   exception.message should include ( "Append output mode not supported when there are streaming aggregations on streaming " +
     "DataFrames/DataSets without watermark" )
}
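Both tests above rely on a NoopForeachWriter that isn't shown in the listing. It is assumed to be a trivial ForeachWriter that accepts every row and discards it, since only the query analysis (and the resulting AnalysisException) matters here. A minimal sketch:

// Hypothetical stand-in for the NoopForeachWriter used in the append-mode tests above.
class NoopForeachWriter[T] extends ForeachWriter[T] {
  override def open(partitionId: Long, version: Long): Boolean = true
  override def process(value: T): Unit = {}
  override def close(errorOrNull: Throwable): Unit = {}
}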
 
"watermark in append mode" should "emit the results only after watermark expiration" in {
   // Accordingly to the Spark's documentation
   // (https://spark.apache.org/docs/2.2.1/structured-streaming-programming-guide.html#handling-late-data-and-watermarking)
   // in the append mode the partial results are not updated. Only the final result is computed
   // and emitted to the result table:
   // ```
   // Similar to the Update Mode earlier, the engine maintains intermediate counts for each window.
   // However, the partial counts are not updated to the Result Table and not written to sink.
   // The engine waits for “10 mins” for late date to be counted, then drops intermediate state of a
   // window < watermark, and appends the final counts to the Result Table/sink.
   // ```
   // But please notice that in the append output mode the results
   // are emitted once the watermark passed. Here the results for 00:00:10 will be
   // emitted only when the watermark will pass to 00:00:19 (00:00:10 is now before the watermark), i.e. after updating
   // the watermark for the record (24000L, 5).
   val testKey = "watermark-append-mode"
   val inputStream = new MemoryStream[(Timestamp, Int)]( 1 , sparkSession.sqlContext)
   val now = 5000 L
   val aggregatedStream = inputStream.toDS().toDF( "created" , "name" )
     .withWatermark( "created" , "5 second" )
     .groupBy( "created" )
     .count()
 
  val query = aggregatedStream.writeStream.trigger(Trigger.ProcessingTime("2 seconds")).outputMode("append")
    .foreach(new ForeachWriter[Row]() {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(processedRow: Row): Unit = {
        val rowRepresentation = s"${processedRow.getAs[Timestamp]("created")} -> ${processedRow.getAs[Long]("count")}"
        InMemoryKeyedStore.addValue(testKey, rowRepresentation)
        println(s"Processing ${rowRepresentation} at ${new Date(System.currentTimeMillis())}")
      }
      override def close(errorOrNull: Throwable): Unit = {}
    }).start()
 
  new Thread(new Runnable() {
    override def run(): Unit = {
      // send the first batch - max event time = 10 seconds, so the watermark will be 5 seconds
      inputStream.addData((new Timestamp(now + 5000), 1), (new Timestamp(now + 5000), 2), (new Timestamp(now + 5000), 3),
        (new Timestamp(now + 5000), 4))
      while (!query.isActive) {
        // wait for the query to activate
      }
      Thread.sleep(4000)
      inputStream.addData((new Timestamp(now), 6), (new Timestamp(6000), 7))
      Thread.sleep(4000)
      inputStream.addData((new Timestamp(24000L), 8)) // only to update the watermark
      Thread.sleep(4000)
      inputStream.addData((new Timestamp(4000L), 9))
      Thread.sleep(4000)
      inputStream.addData((new Timestamp(now), 10))
    }
  }).start()
 
  query.awaitTermination(45000)

  val readValues = InMemoryKeyedStore.getValues(testKey)
  readValues should have size 2
  readValues should contain allOf("1970-01-01 01:00:10.0 -> 4", "1970-01-01 01:00:06.0 -> 1")
}
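For comparison, the same append-mode behavior with the more common window() aggregation from the programming guide can be sketched as follows (the console sink, the stream id and the column names are assumptions made only for illustration). A window is written to the sink only once the watermark moves past the window's end:

// Sketch: append mode with a windowed aggregation. Each 2-second window is appended to the
// sink only after the watermark (MAX(event_time) - 5 seconds) passes the window's end.
val windowedInput = new MemoryStream[(Timestamp, String)](100, sparkSession.sqlContext)
val windowedCounts = windowedInput.toDS().toDF("created", "name")
  .withWatermark("created", "5 seconds")
  .groupBy(window($"created", "2 seconds"))
  .count()

val consoleQuery = windowedCounts.writeStream
  .outputMode("append")
  .format("console")
  .trigger(Trigger.ProcessingTime("2 seconds"))
  .start()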
 
 
"the watermark" should "be used in aggregations others than windowing" in {
   val testKey = "watermark-count-aggregation"
   val inputStream = new MemoryStream[(Timestamp, Int)]( 1 , sparkSession.sqlContext)
   val now = 5000 L
   val aggregatedStream = inputStream.toDS().toDF( "created" , "number" )
     .withWatermark( "created" , "1 second" )
     .groupBy( "created" )
     .count()
 
  val query = aggregatedStream.writeStream/*.trigger(Trigger.ProcessingTime("2 seconds"))*/.outputMode("update")
    .foreach(new ForeachWriter[Row]() {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(processedRow: Row): Unit = {
        val rowRepresentation = s"${processedRow.getAs[Timestamp]("created")} -> ${processedRow.getAs[Long]("count")}"
        InMemoryKeyedStore.addValue(testKey, rowRepresentation)
      }
      override def close(errorOrNull: Throwable): Unit = {}
    }).start()
 
  new Thread(new Runnable() {
    override def run(): Unit = {
      // send the first batch - max event time = 10 seconds, so the watermark will be 9 seconds
      inputStream.addData((new Timestamp(now + 5000), 1), (new Timestamp(now + 5000), 2), (new Timestamp(now + 5000), 3),
        (new Timestamp(now + 5000), 4))
      while (!query.isActive) {
        // wait for the query to activate
      }
      // Here the rows #5 and #6 are ignored because of the watermark.
      // On the other hand, the row #7 will be taken into account.
      // To see what happens without the watermark, you can comment out the .withWatermark(...) line above;
      // without it, the counter for *now* is also updated.
      Thread.sleep(10000)
      inputStream.addData((new Timestamp(4000L), 5))
      inputStream.addData((new Timestamp(now), 6), (new Timestamp(11000), 7))
    }
  }).start()
 
  query.awaitTermination(45000)

  val readValues = InMemoryKeyedStore.getValues(testKey)
  println(s"All data=${readValues}")
  // As you can notice, the counters for the rows #5 (4000) and #6 (5000) weren't updated because
  // their event times fall behind the watermark (9 seconds); only the row #7 (11000) was accepted.
  readValues should have size 2
  readValues should contain allOf("1970-01-01 01:00:10.0 -> 4", "1970-01-01 01:00:11.0 -> 1")
}
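As a side note, this state cleanup only applies to the update and append output modes: in complete mode the engine must re-emit every group on each trigger, so the watermark cannot be used to drop aggregation state. A sketch of the same aggregation in complete mode (the console sink and the stream id are illustrative assumptions):

// Sketch: the same count aggregation in complete output mode. The watermark is still tracked,
// but no aggregation state is evicted because complete mode must keep and re-emit every group.
val completeInput = new MemoryStream[(Timestamp, Int)](101, sparkSession.sqlContext)
val completeModeQuery = completeInput.toDS().toDF("created", "number")
  .withWatermark("created", "1 second")
  .groupBy("created")
  .count()
  .writeStream
  .outputMode("complete")
  .format("console")
  .start()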
 
"the data arriving after the watermark and the state older than the watermark" should "not be discarded correctly" in {
   val testKey = "watermark-deduplicate"
   val testKeyLastProgress = "watermark-deduplicate-last-progress"
   val inputStream = new MemoryStream[(Timestamp, Int)]( 1 , sparkSession.sqlContext)
   val now = 5000 L
   val aggregatedStream = inputStream.toDS().toDF( "created" , "number" )
     .withWatermark( "created" , "6 seconds" )
     .dropDuplicates( "number" , "created" )
 
  val query = aggregatedStream.writeStream.trigger(Trigger.ProcessingTime("2 seconds")).outputMode("update")
    .foreach(new ForeachWriter[Row]() {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(processedRow: Row): Unit = {
        val rowRepresentation =
          s"${processedRow.getAs[Timestamp]("created").toString} -> ${processedRow.getAs[Int]("number")}"
        InMemoryKeyedStore.addValue(testKey, rowRepresentation)
        println(s"processing ${rowRepresentation}")
      }
      override def close(errorOrNull: Throwable): Unit = {}
    }).start()
 
  new Thread(new Runnable() {
    override def run(): Unit = {
      // Events sent first - they should be correctly deduplicated, i.e. (5000, 1), (5000, 2) and (10000, 2)
      // should be kept.
      // As you can observe, the deduplication occurs on the pair (event time, value).
      inputStream.addData((new Timestamp(now), 1), (new Timestamp(now), 2),
        (new Timestamp(now), 1), (new Timestamp(now + 5000), 2))
      while (!query.isActive) {
        // wait for the query to activate
      }
      logInfo(s"Query was activated, sleep for 11 seconds before sending new data. Current timestamp " +
        s"is ${System.currentTimeMillis()}")
      Thread.sleep(11000)
      logInfo(s"Awaken at ${System.currentTimeMillis()} where the query status is ${query.lastProgress.json}")
      // In the logs we can observe the following entry:
      // ```
      // Filtering state store on: (created#5-T6000ms <= 4000000)
      // (org.apache.spark.sql.execution.streaming.StreamingDeduplicateExec:54)
      // ```
      // As you can correctly deduce, among the entries below:
      // - 1, 2 and 3 will be filtered
      // - 4 will be accepted
      // Moreover, the 4 will be used to compute the new watermark. Later in the logs we can observe the following:
      // ```
      // Filtering state store on: (created#5-T6000ms <= 6000000)
      // (org.apache.spark.sql.execution.streaming.StreamingDeduplicateExec:54)
      // ```
      inputStream.addData((new Timestamp(now), 1), (new Timestamp(now - 1000), 2),
        (new Timestamp(now - 3000), 3), (new Timestamp(now + 7000), 4))
      Thread.sleep(9000)
      // Here the value 1 is behind the watermark, so it's automatically discarded
      InMemoryKeyedStore.addValue(testKeyLastProgress, query.lastProgress.json)
      inputStream.addData((new Timestamp(now), 1))
      Thread.sleep(7000)
      inputStream.addData((new Timestamp(now), 1))
      InMemoryKeyedStore.addValue(testKeyLastProgress, query.lastProgress.json)
    }
  }).start()
 
  query.awaitTermination(55000)

  val accumulatedValues = InMemoryKeyedStore.getValues(testKey)
  accumulatedValues should have size 4
  accumulatedValues should contain allOf("1970-01-01 01:00:10.0 -> 2", "1970-01-01 01:00:05.0 -> 1",
    "1970-01-01 01:00:05.0 -> 2", "1970-01-01 01:00:12.0 -> 4")
  val seenProgresses = InMemoryKeyedStore.getValues(testKeyLastProgress)
  // This progress represents the moment where all 4 rows are still considered as within the watermark
  val initialProgress = seenProgresses(0)
  initialProgress should include("\"stateOperators\":[{\"numRowsTotal\":4,\"numRowsUpdated\":0}]")
  initialProgress should include("\"eventTime\":{\"watermark\":\"1970-01-01T00:00:04.000Z\"}")
  // This progress represents the moment where 2 rows (1970-01-01 01:00:05.0 -> 1 and 1970-01-01 01:00:05.0 -> 2)
  // were removed from the state store because of the watermark expiration
  val progressWithRemovedStates = seenProgresses(1)
  progressWithRemovedStates should include("\"stateOperators\":[{\"numRowsTotal\":2,\"numRowsUpdated\":0}]")
  progressWithRemovedStates should include("\"eventTime\":{\"watermark\":\"1970-01-01T00:00:06.000Z\"}")
}
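The deduplication above is bounded by the watermark: without withWatermark, dropDuplicates keeps every (number, created) pair ever seen in the state store. A minimal sketch of the two variants described in the programming guide (the stream id and column names are illustrative assumptions):

// Sketch: streaming deduplication with and without a watermark.
val dedupInput = new MemoryStream[(Timestamp, Int)](102, sparkSession.sqlContext)
val events = dedupInput.toDS().toDF("created", "number")

// Without a watermark, every (number, created) pair ever seen is kept in the state store,
// so the state grows without bound.
val dedupUnbounded = events.dropDuplicates("number", "created")

// With a watermark, state older than the watermark (here 6 seconds behind the max event time)
// is eventually removed, which is what the stateOperators metrics above show.
val dedupBounded = events
  .withWatermark("created", "6 seconds")
  .dropDuplicates("number", "created")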

ref:http://www.waitingforcode.com/apache-spark-structured-streaming/apache-spark-structured-streaming-watermarks/read


Reposted from blog.csdn.net/zhouyan8603/article/details/80311494