SparkStreaming WordCount case (cumulative statistics), Part 2
1. Case introduction
Use the netcat tool to continuously send data to port 9999; SparkStreaming reads the data from that port and keeps a cumulative count of how many times each distinct word appears.
2. netcat setup
2.1 Install netcat on the virtual machine
[root@hadoop1 spark]# yum install -y nc
2.2 Start the program and send data
[root @ hadoop1 spark] # nc -lk 9999
3. Code implementation
3.1 Maven dependency
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.1.1</version>
</dependency>
3.2 Scala code
package com.it.sparkStreaming
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
object SparkStreamingState {

  /**
   * Reads lines from a socket (hadoop1:9999), splits them into words, and
   * maintains a cumulative per-word count across batches via
   * `updateStateByKey`. The running state is persisted through checkpointing.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Create the StreamingContext with a 5-second batch interval.
    val sparkConf = new SparkConf().setAppName("SparkStreamingState").setMaster("local[*]")
    val ssc: StreamingContext = new StreamingContext(sparkConf, Seconds(5))

    // updateStateByKey requires a checkpoint directory to store the state
    // that is carried over between batches.
    ssc.checkpoint("cp")

    // Input stream: one String per line received on the socket.
    val receiver: ReceiverInputDStream[String] = ssc.socketTextStream("hadoop1", 9999)

    // Split each line into words and pair every word with a count of 1.
    val wordOne: DStream[(String, Int)] = receiver.flatMap(_.split(" ")).map((_, 1))

    // Cumulative count: for each key, add this batch's occurrences
    // (newValues.sum) to the running total from previous batches.
    val result: DStream[(String, Int)] = wordOne.updateStateByKey(
      (newValues: Seq[Int], state: Option[Int]) => Some(newValues.sum + state.getOrElse(0))
    )
    // NOTE: the original code applied an extra reduceByKey(_ + _) here, but
    // updateStateByKey already emits exactly one record per key per batch,
    // so that additional shuffle was redundant and has been removed; the
    // printed output is unchanged.
    result.print()

    // Start the streaming job and block until it is terminated.
    ssc.start()
    ssc.awaitTermination()
  }
}