import spark.implicits._
// Spark SQL DataFrame basics (spark-shell style; assumes an active SparkSession `spark`
// and `spark.implicits._` in scope for the $"col" interpolator).
val people: DataFrame = spark.read.json("file:///data/people.json")

// Project a single column — equivalent to: select name from people.
// Both the string-name and Column ($"...") forms are shown.
people.select("name").show()
people.select($"name").show()

// Row filtering — equivalent to: select * from people where age > 21.
// `where` is a documented alias of `filter`; both Column and SQL-string predicates work.
people.where($"age" > 21).show()
people.where("age > 21").show()

// Aggregation — equivalent to: select age, count(1) from people group by age.
people.groupBy("age").count().show()

// Derived column — equivalent to: select name, age + 10 as new_age from people.
people.select($"name", ($"age" + 10).alias("new_age")).show()

// Same query through raw SQL: register the DataFrame as a temp view first.
people.createOrReplaceTempView("people")
spark.sql("select name from people where age > 21").show()
// ===================== Dataset example =====================
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
object DatasetApp {

  /**
   * Record type matching people.json.
   *
   * `age` is Long (not String): Spark's JSON reader infers numeric columns as
   * bigint, and `as[Person]` refuses to up-cast bigint to string at runtime
   * ("Cannot up cast `age` from bigint to string"). Long is the safe up-cast
   * target and matches the official Spark example.
   */
  case class Person(name: String, age: Long)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("DatasetApp").getOrCreate()
    import spark.implicits._

    // Build a typed Dataset from an in-memory case-class sequence.
    val ds: Dataset[Person] = Seq(Person("PK", 30L)).toDS()
    ds.show()

    // Datasets also work over primitives.
    val primitiveDS: Dataset[Int] = Seq(1, 2, 3).toDS()
    primitiveDS.map(x => x + 1).collect().foreach(println)

    // DataFrame -> typed Dataset conversion via an implicit Encoder[Person].
    val peopleDF: DataFrame = spark.read.json("file:///data/people.json")
    val peopleDS: Dataset[Person] = peopleDF.as[Person]
    peopleDS.show(false)

    // Untyped API: the misspelled column "anme" only fails at RUNTIME
    // (AnalysisException when the plan is resolved).
    peopleDF.select("anme").show()
    // Typed API: a misspelled field here would fail at COMPILE time.
    peopleDS.map(x => x.name).show()
    spark.stop()
  }
}
// ===================== RDD interoperability example =====================
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
object InteroperatingRDDApp {

  /** Schema case class used by the reflection-based (inferred-schema) conversion. */
  case class People(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    // appName fixed: the original said "DatasetApp" (copy-paste from the other example).
    val spark = SparkSession.builder().master("local")
      .appName("InteroperatingRDDApp").getOrCreate()
    runInferSchema(spark)
    runProgrammaticSchema(spark)
    spark.stop()
  }

  /**
   * Approach 1: infer the schema via reflection.
   * 1) declare a case class; 2) map each text line into an instance; 3) call toDF().
   */
  def runInferSchema(spark: SparkSession): Unit = {
    // Needed for .toDF() and for the Encoder[String] used by queryDF.map below.
    import spark.implicits._
    val peopleRDD: RDD[String] = spark.sparkContext.textFile("file:///data/people.txt")

    // RDD[String] -> RDD[People] -> DataFrame
    val peopleDF: DataFrame = peopleRDD.map(_.split(","))
      .map(x => People(x(0), x(1).trim.toInt))
      .toDF()

    peopleDF.createOrReplaceTempView("people")
    val queryDF: DataFrame = spark.sql("select name,age from people where age between 19 and 29")
    // Access columns by field name (robust to column reordering) rather than index.
    queryDF.map(x => "Name:" + x.getAs[String]("name")).show()
  }

  /**
   * Approach 2: programmatic schema — build an RDD[Row] and attach a
   * hand-written StructType.
   */
  def runProgrammaticSchema(spark: SparkSession): Unit = {
    // step 1: raw text -> RDD[Row]
    val peopleRDD: RDD[String] = spark.sparkContext.textFile("file:///data/people.txt")
    val peopleRowRDD: RDD[Row] = peopleRDD.map(_.split(","))
      .map(x => Row(x(0), x(1).trim.toInt))

    // step 2: the schema describing each Row
    val struct =
      StructType(
        StructField("name", StringType, nullable = true) ::
        StructField("age", IntegerType, nullable = false) :: Nil)

    // step 3: combine data and schema into a DataFrame
    val peopleDF: DataFrame = spark.createDataFrame(peopleRowRDD, struct)
    peopleDF.show()
    // (Removed a trailing dangling `peopleRowRDD` expression: the method returns Unit,
    // so the value was silently discarded — a "pure expression does nothing" warning.)
  }
}