[Spark][spark_sql]#3_SparkSQL API

SparkSession


import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkSessionApp {

  def main(args: Array[String]): Unit = {

    // Entry point for DataFrame/Dataset programming
    val spark: SparkSession = SparkSession.builder()
      .master("local").appName("SparkSessionApp").getOrCreate()

    // Read the input file
    val df: DataFrame = spark.read.text("file:///Users/eric/Desktop/coding385/sparksql-train/data/input.txt")

    // TODO... business logic goes here, built with the DataFrame/Dataset API
    df.printSchema()
    df.show()
    spark.stop()
  }
}
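
For reference, spark.read.text always produces a DataFrame with a single string column named value, so printSchema() prints:

root
 |-- value: string (nullable = true)

and show() renders each line of input.txt as one row in that value column.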

DataFrame

import spark.implicits._


val people: DataFrame = spark.read.json("file:///data/people.json")


// TODO... the DF has two columns; keep only name ==> select name from people
people.select("name").show()
people.select($"name").show()

// TODO... select * from people where age > 21
people.filter($"age" > 21).show()
people.filter("age > 21").show()

// TODO... select age, count(1) from people group by age
people.groupBy("age").count().show()

// TODO... select name, age+10 from people
people.select($"name", ($"age" + 10).as("new_age")).show()


// TODO... run the same queries through SQL instead of the DataFrame API
people.createOrReplaceTempView("people")
spark.sql("select name from people where age > 21").show()

Dataset


import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object DatasetApp {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().master("local").appName("DatasetApp").getOrCreate()
    import spark.implicits._

    val ds: Dataset[Person] = Seq(Person("PK","30")).toDS()
    ds.show()

    val primitiveDS: Dataset[Int] = Seq(1,2,3).toDS()
    primitiveDS.map(x => x+1).collect().foreach(println)

    val peopleDF: DataFrame = spark.read.json("file:///data/people.json")
    val peopleDS: Dataset[Person] = peopleDF.as[Person]
    peopleDS.show(false)


    peopleDF.select("anme").show()     // 是在运行期报错
    peopleDS.map(x => x.name).show()  //编译期报错

    spark.stop()
  }

  case class Person(name: String, age: String)

}
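
In Spark 2.x, DataFrame is just a type alias for Dataset[Row], so the two views convert freely. A minimal sketch, reusing peopleDS and the spark.implicits._ import from the code above (the val names are illustrative):

val backToDF: DataFrame = peopleDS.toDF()                      // drop the static type: Dataset[Person] => DataFrame
val withAge: Dataset[Person] = peopleDS.filter(_.age != null)  // typed filter, stays a Dataset[Person]
withAge.map(_.age.toInt + 1).show()                            // Dataset[Int], encoder supplied by spark.implicits._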

InteroperatingRDD


import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object InteroperatingRDDApp {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().master("local").appName("InteroperatingRDDApp").getOrCreate()

    runInferSchema(spark)

    runProgrammaticSchema(spark)

    spark.stop()
  }

  /**
    * Approach 2: build the schema programmatically
    */
  def runProgrammaticSchema(spark:SparkSession): Unit = {

    // step 1: convert the raw RDD[String] into an RDD[Row]
    val peopleRDD: RDD[String] = spark.sparkContext.textFile("file:///data/people.txt")
    val peopleRowRDD: RDD[Row] = peopleRDD.map(_.split(","))  // RDD[Array[String]]
      .map(x => Row(x(0), x(1).trim.toInt))
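
    // NOTE (assumption): people.txt holds one "name, age" pair per line, as in the
    // sample file shipped in Spark's examples directory:
    //   Michael, 29
    //   Andy, 30
    //   Justin, 19
    // The trim above strips the space after the comma before toInt runs.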

    // step 2: define the schema as a StructType
    val struct =
      StructType(
        StructField("name", StringType, nullable = true) ::
        StructField("age", IntegerType, nullable = false) :: Nil)

    // step 3: combine the RDD[Row] with the schema to get a DataFrame
    val peopleDF: DataFrame = spark.createDataFrame(peopleRowRDD, struct)

    peopleDF.show()
  }

  /**
    * Approach 1: schema inference via reflection
    * 1) define a case class
    * 2) map each line of the RDD into an instance of that case class
    */
  def runInferSchema(spark: SparkSession): Unit = {
    import spark.implicits._

    val peopleRDD: RDD[String] = spark.sparkContext.textFile("file:///data/people.txt")

    // TODO... RDD => DF
    val peopleDF: DataFrame = peopleRDD.map(_.split(","))  // RDD[Array[String]]
      .map(x => People(x(0), x(1).trim.toInt))             // RDD[People]
      .toDF()
    //peopleDF.show(false)

    peopleDF.createOrReplaceTempView("people")
    val queryDF: DataFrame = spark.sql("select name,age from people where age between 19 and 29")
    //queryDF.show()

    //queryDF.map(x => "Name:" + x(0)).show()  // access a column by index
    queryDF.map(x => "Name:" + x.getAs[String]("name")).show()  // access a column by field name
  }

  case class People(name:String, age:Int)
}
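
The conversion also runs in the opposite direction: any DataFrame exposes its rows as an RDD[Row] via .rdd. A minimal sketch, reusing peopleDF from either method above (rowRDD is an illustrative name):

val rowRDD: RDD[Row] = peopleDF.rdd
rowRDD.map(row => row.getAs[String]("name") + ": " + row.getAs[Int]("age")).foreach(println)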
