SparkStreaming的模拟双流join操作详解

简介: 这里所说的“双流Join”，是指从多个数据源实时消费数据并合并处理：例如从多个TCP Socket接收数据，使用DStream#union函数将接收到的数据流合并为一条流，再对每批次数据进行词频统计。需要注意，union只是把多条流的数据拼接在一起，并不是按key进行关联的DStream#join。

代码详解:

import org.apache.spark.streaming.dstream.{
    
    DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{
    
    Seconds, StreamingContext}
import org.apache.spark.{
    
    SparkConf, SparkContext}

/**
 * Stateful word count over two socket text streams that are merged into one.
 *
 * @author liu a fu
 * @date 2021/1/23 0023
 * @version 1.0
 * @DESC
 *     1 - Build the StreamingContext (5-second batches, checkpointing enabled)
 *     2 - Read text lines from node1:9999 with socketTextStream
 *     3 - Read text lines from node1:9998 with socketTextStream
 *     4 - Merge both sources with DStream#union (a concatenation, not a keyed join)
 *     5 - Count words with flatMap, map and updateStateByKey
 *     6 - Print the running totals for every batch
 */
object _01twoSourceJoin {

  /**
   * State update function handed to `updateStateByKey`.
   *
   * @param currentValue counts emitted for one key in the current batch
   * @param historyValue total accumulated for that key so far, if any
   * @return the new running total (current batch sum plus the previous total, 0 when absent)
   */
  def updateFunc(currentValue: Seq[Int], historyValue: Option[Int]): Option[Int] =
    Some(currentValue.sum + historyValue.getOrElse(0))

  /**
   * Entry point: wires up both socket sources, runs the streaming word count and
   * blocks until the streaming context terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1 - Prepare the streaming environment.
    val conf: SparkConf = new SparkConf()
      .setAppName(this.getClass.getSimpleName.stripSuffix("$"))
      .setMaster("local[6]")
    val sc = new SparkContext(conf)
    val scc = new StreamingContext(sc, Seconds(5))

    // updateStateByKey keeps state across batches, which needs a checkpoint directory.
    scc.checkpoint("data/checkpoint/check002")

    try {
      // 2 / 3 - One receiver per socket source.
      val valueDS1: ReceiverInputDStream[String] = scc.socketTextStream("node1", 9999)
      val valueDS2: ReceiverInputDStream[String] = scc.socketTextStream("node1", 9998)

      // 4 - Merge the two sources into a single stream.
      val unionDS: DStream[String] = valueDS1.union(valueDS2)

      // 5 - Running word count across batches.
      val result: DStream[(String, Int)] = unionDS
        .flatMap(_.split("\\s+"))
        // split yields "" for blank lines and for lines with leading whitespace;
        // drop those so the empty string is not counted as a word.
        .filter(_.nonEmpty)
        .map((_, 1))
        .updateStateByKey(updateFunc)

      // 6 - Print the result of every batch.
      result.print()

      // Start receiving data and block until the context is stopped or a job fails.
      scc.start()
      scc.awaitTermination()
    } finally {
      // Always release the receivers and the SparkContext, even when
      // awaitTermination rethrows a failure from the streaming job.
      scc.stop(stopSparkContext = true, stopGracefully = true)
    }
  }

}

在这里插入图片描述

SparkStreaming的模拟双流join操作详解

猜你喜欢