Spark -- Completing WordCount with Spark SQL - ★★★★★

package cn.hanjiaxiaozhi.sql

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/**
 * Author hanjiaxiaozhi
 * Date 2020/7/25 9:22
 * Desc Complete WordCount using Spark SQL
 */
object WordCount {

  def main(args: Array[String]): Unit = {
    //1. Prepare the Spark SQL execution environment: a SparkSession
    val spark: SparkSession = SparkSession.builder().appName("sql").master("local[*]").getOrCreate()
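    //local[*] runs Spark in-process on all available cores; getOrCreate() reuses an existing session if one is already running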
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._
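    //The implicits provide the Encoders needed for typed Dataset operations, the $"col" column syntax used below, and toDF/toDS conversions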

    //2. Read the data into a DataFrame/Dataset
    //Option A: read with sc.textFile("path"), which returns an RDD, then convert it to a DataFrame/Dataset
    //Option B: use SparkSession's own read methods, which return a DataFrame/Dataset directly
    val df: DataFrame = spark.read.text("D:\\data\\words.txt")
    val ds: Dataset[String] = spark.read.textFile("D:\\data\\words.txt")
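    //A hedged sketch of Option A (commented out; rdd and dsFromRdd are illustrative names):
    //  val rdd: org.apache.spark.rdd.RDD[String] = sc.textFile("D:\\data\\words.txt")
    //  val dsFromRdd: Dataset[String] = rdd.toDS() //toDS comes from spark.implicits._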

    //3. Process the data
    //Split each line into words on spaces
    //Neither of these compiles, because a DataFrame is an untyped Dataset[Row] -- its elements are Rows, not Strings:
    //df.flatMap((line: String) => line.split(" ")) //the function expects a String, but the elements are Rows
    //df.flatMap(_.split(" "))                      //the compiler cannot know that _ is a String
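    //A hedged sketch of how the DataFrame route could still work, assuming the single "value"
    //column that spark.read.text produces: unwrap each Row before splitting (illustrative only):
    //  val wordsFromDF: Dataset[String] = df.flatMap(row => row.getString(0).split(" "))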
    //wordDS: a Dataset of individual words
    val wordDS: Dataset[String] = ds.flatMap(_.split(" "))
    wordDS.show(false)
    wordDS.printSchema()
    /*
    +-----+
    |value|
    +-----+
    |hello|
    |me   |
    |you  |
      ....
    root
     |-- value: string (nullable = true)
    */

    //4. Produce the WordCount result
    //TODO 1 DSL style
    wordDS.groupBy("value")
      .count()
      .orderBy($"count".desc)
      .show(false)
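    /*
    Shape of the DSL result (the counts depend on the actual contents of words.txt):
    +-----+-----+
    |value|count|
    +-----+-----+
    |hello|...  |
      ....
    */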

    //TODO 2 SQL style
    wordDS.createOrReplaceTempView("t_word")
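    //The temp view is session-scoped: it exists only for the lifetime of this SparkSession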
    val sql: String =
      """
        |select value as word,count(*) as counts
        |from t_word
        |group by value
        |order by counts desc
        |""".stripMargin
    spark.sql(sql).show(false)

    spark.stop() //release resources when done
  }
}
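Both styles go through the same Catalyst optimizer and produce the same result, so the choice between the DSL and SQL versions here is purely a matter of readability.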


Reposted from blog.csdn.net/qq_46893497/article/details/113926569