1.添加依赖
在 IntelliJ IDEA 项目的 pom.xml 中添加以下依赖。
<!--spark sql依赖,注意版本号-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.12</artifactId>
<version>3.0.0</version>
</dependency>
2.案例代码
package com.zf.bigdata.spark.sql
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{
DataFrame, Dataset, Row, SparkSession}
/**
 * Basic Spark SQL demo: creating a SparkSession, loading a DataFrame from
 * JSON, and converting between RDD, DataFrame and Dataset.
 */
object Spark01_SparkSql_Basic {

  def main(args: Array[String]): Unit = {
    // Spark configuration: run locally on all available cores.
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSql")
    // SparkSession is the entry point for the DataFrame / Dataset API.
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    // Fix: stop the session in `finally` so it is released even if any of the
    // operations below throw (e.g. `datas/user.json` is missing).
    try {
      // DataFrame
      val df: DataFrame = spark.read.json("datas/user.json")
      //df.show()

      // DataFrame => SQL
      //df.createOrReplaceTempView("user")
      //spark.sql("select * from user").show()
      //spark.sql("select age from user").show()
      //spark.sql("select avg(age) from user").show()

      // DataFrame => DSL
      // Conversions (e.g. extracting column values with $"...") require the
      // session's implicit conversions to be in scope; note that `spark` here
      // is the SparkSession value, not a package name.
      import spark.implicits._
      //df.select("age","username").show()
      //df.select($"age"+1).show()
      //df.select('age+1).show()

      // DataSet
      //val seq = Seq(1,2,3,4)
      //val ds: Dataset[Int] = seq.toDS()
      //ds.show()

      // RDD <=> DataFrame
      val rdd = spark.sparkContext.makeRDD(List((1, "张三", 10), (2, "李四", 20)))
      val df1: DataFrame = rdd.toDF("id", "name", "age")
      val rdd1: RDD[Row] = df1.rdd

      // DataFrame <=> DataSet
      val ds: Dataset[User] = df1.as[User]
      val df2: DataFrame = ds.toDF()

      // RDD <=> DataSet
      val ds1: Dataset[User] = rdd.map {
        case (id, name, age) => User(id, name = name, age = age)
      }.toDS()
      val rdd2: RDD[User] = ds1.rdd
    } finally {
      // Always release the session and its underlying SparkContext.
      spark.stop()
    }
  }

  /** Row model used by the RDD/DataFrame/Dataset conversion examples. */
  case class User(id: Int, name: String, age: Int)
}