// 当 executor 宕机时，可以从 checkpoint 目录恢复元数据以及该 RDD 已经计算好的结果等。
// 对于 Spark Streaming，当整个 driver 程序关闭后再次重新运行时，由于仍使用同一个 checkpoint 目录，driver 会从该目录恢复数据。可以运行两次验证：第二次运行时打印的时间戳紧跟着上一次运行结束时的时间戳。
/**
 * Word-count-over-window Spark Streaming example with checkpoint recovery.
 *
 * On first run the context is built fresh by `createStreamingContext`; on a
 * restart with the same checkpoint directory, `StreamingContext.getOrCreate`
 * rebuilds the whole DStream graph (including window state) from the checkpoint.
 *
 * @param args optional; args(0) overrides the checkpoint directory
 *             (defaults to "/checkpoint" for backward compatibility).
 */
def main(args: Array[String]): Unit = {
  // Generalized: the checkpoint directory was hard-coded to the filesystem
  // root; allow the caller to supply it while keeping the original default.
  val checkpointDir = if (args.nonEmpty) args(0) else "/checkpoint"

  // Factory invoked by getOrCreate ONLY when no checkpoint data exists yet.
  // On recovery this function is NOT called — the graph comes from the checkpoint.
  def createStreamingContext(): StreamingContext = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkStreamTest")
    // 3-second batch interval.
    val streamingContext = new StreamingContext(conf, Milliseconds(3000))
    streamingContext.checkpoint(checkpointDir)

    // Per-batch word counts from a socket text source on localhost:9999.
    val dStream: ReceiverInputDStream[String] = streamingContext.socketTextStream("localhost", 9999)
    val wordToSumStream: DStream[(String, Int)] = dStream
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)

    // Sliding window of 15 s, sliding every 3 s — both must be multiples of
    // the batch interval, which they are (3000 ms).
    val windowedCounts: DStream[(String, Int)] =
      wordToSumStream.reduceByKeyAndWindow((left: Int, right: Int) => left + right, Milliseconds(15000), Milliseconds(3000))
    windowedCounts.print()
    streamingContext
  }

  // Recover from the checkpoint directory if present, otherwise create fresh.
  val streamingContext = StreamingContext.getOrCreate(checkpointDir, createStreamingContext)
  streamingContext.start()
  streamingContext.awaitTermination()
}