Word frequency counting with Spark Streaming and Spark SQL (Scala)

This is the integration example from the official Spark documentation, provided for reference.
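The example depends only on Spark's SQL and Streaming modules. A minimal build.sbt sketch follows; the Spark and Scala versions are assumptions, so match them to your cluster:

// build.sbt (version numbers are assumptions, adjust to your environment)
scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql"       % "2.4.0",
  "org.apache.spark" %% "spark-streaming" % "2.4.0"
)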

package com.fyy.spark.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

/**
  * @Title: SqlAndStreaming
  * @ProjectName SparkStreamingProject
  * @Description: Word frequency counting with Spark Streaming integrated with Spark SQL
  * @author fanyanyan
  */
object SqlAndStreaming {
  def main(args: Array[String]): Unit = {
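    // Run locally on all available cores; batches are formed every 5 seconds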
    val sparkConf = new SparkConf().setAppName("SqlAndStreaming").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

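    // Receive text lines over a TCP socket and split each line into words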
    val lines = ssc.socketTextStream("01.server.bd", 6666)
    val words = lines.flatMap(_.split(" "))

    // Convert each RDD of the words DStream to a DataFrame and run a SQL query on it
    words.foreachRDD { (rdd: RDD[String], time: Time) =>
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._

      // Convert the RDD[String] to an RDD of the case class, then to a DataFrame
      val wordsDataFrame = rdd.map(w => Record(w)).toDF()

      // Register the DataFrame as a temporary view
      wordsDataFrame.createOrReplaceTempView("words")

      // Count the words with a SQL query
      val wordCountsDataFrame =
        spark.sql("select word, count(*) as num from words group by word")
      println(s"=========分隔符=========")
      wordCountsDataFrame.show()
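      // Equivalent aggregation via the DataFrame API instead of SQL
      // (a sketch, not part of the original example):
      //   wordsDataFrame.groupBy("word").count().show()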
    }

    ssc.start()
    ssc.awaitTermination()
  }

  /** Case class for converting RDD to DataFrame */
  case class Record(word: String)


  /** Lazily instantiated singleton instance of SparkSession */
  object SparkSessionSingleton {

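    // @transient keeps the cached session out of serialized closures;
    // getInstance recreates it lazily where it is actually used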
    @transient private var instance: SparkSession = _

    def getInstance(sparkConf: SparkConf): SparkSession = {
      if (instance == null) {
        instance = SparkSession
          .builder
          .config(sparkConf)
          .getOrCreate()
      }
      instance
    }
  }

}
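To try the example, start a socket source on the host the code reads from before submitting the job, for instance with netcat (flag syntax may differ by platform):

nc -lk 6666

Then type lines such as "hello spark hello streaming" into the netcat session. Every 5 seconds the job prints the batch time as a separator followed by the output of show(), a two-column (word, num) table, something like:

========= 1552400000000 ms =========
+---------+---+
|     word|num|
+---------+---+
|    hello|  2|
|    spark|  1|
|streaming|  1|
+---------+---+

Note that the counts are per batch: each 5-second batch is counted independently, since no state is carried across batches.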

Reposted from blog.csdn.net/adayan_2015/article/details/88422038