Generating cumulative data that includes historical records
1. Use the updateStateByKey operator to accumulate historical records
Use streamingContext.checkpoint to cache the historical data
2. Code implementation
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

object sparkStreaming02 {

  /**
   * Accumulates the historical result for each key.
   * Takes two parameters:
   *   inputSum  - the Seq of new values for the key in the current batch
   *   resultSum - an Option holding the historical total kept for the key
   * Returns the updated total wrapped in an Option.
   */
  def updateFunc(inputSum: Seq[Int], resultSum: Option[Int]): Option[Int] = {
    val finalResult = inputSum.sum + resultSum.getOrElse(0)
    Option(finalResult)
  }

  def main(args: Array[String]): Unit = {
    val sparkContext = new SparkContext(new SparkConf().setAppName("hdfsStreaming").setMaster("local[4]"))
    sparkContext.setLogLevel("WARN")
    val streamingContext = new StreamingContext(sparkContext, Seconds(5))
    val streaming = streamingContext.textFileStream("hdfs://node01:8020/sparkStreaming")
    // Save the historical state under a checkpoint directory
    streamingContext.checkpoint("./check_point")
    // Word count that carries its totals across batches via updateStateByKey
    val finalkey: DStream[(String, Int)] = streaming.flatMap(_.split(" ")).map((_, 1)).updateStateByKey(updateFunc)
    // Print the accumulated result of each batch
    finalkey.print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
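To make the semantics of updateFunc concrete, here is a minimal, hypothetical trace for a single key across two batches (the values are invented for illustration):

// First batch: the key appears three times and has no saved state yet.
updateFunc(Seq(1, 1, 1), None)   // => Some(3)
// Second batch: the key appears twice more; the checkpointed state is 3.
updateFunc(Seq(1, 1), Some(3))   // => Some(5)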
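Since the accumulated state lives in the checkpoint directory, the driver can also be rebuilt from it after a restart. The sketch below is an illustration under assumptions, not part of the original program: it reuses the ./check_point path, the same imports, and the updateFunc above, and wraps the context construction in a hypothetical createContext factory passed to the standard StreamingContext.getOrCreate API.

// Hypothetical factory: only invoked when ./check_point holds no valid checkpoint.
def createContext(): StreamingContext = {
  val sc = new SparkContext(new SparkConf().setAppName("hdfsStreaming").setMaster("local[4]"))
  val ssc = new StreamingContext(sc, Seconds(5))
  ssc.checkpoint("./check_point")
  // The DStream graph must be defined inside the factory so it can be restored.
  ssc.textFileStream("hdfs://node01:8020/sparkStreaming")
    .flatMap(_.split(" "))
    .map((_, 1))
    .updateStateByKey(updateFunc)
    .print()
  ssc
}

// Restores the context (and its accumulated state) when a checkpoint exists.
val recoveredContext = StreamingContext.getOrCreate("./check_point", createContext _)
recoveredContext.start()
recoveredContext.awaitTermination()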