Basic implementation structure
basic environment
In real-world SparkSQL development, most projects are developed using IntelliJ IDEA.
add dependencies
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.12</artifactId>
<version>3.0.0</version>
</dependency>
Basic implementation structure
def main(args: Array[String]): Unit = {
  // Build the SparkSQL runtime configuration: local mode using all cores.
  val conf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
  // Create the SparkSession entry point; .config() supplies the configuration object.
  val session = SparkSession.builder().config(conf).getOrCreate()
  // ... application logic goes here ...
  // Release the environment when done.
  session.close()
}
Code
// NOTE(review): the object name "sprakSQL01" looks like a typo for "sparkSQL01";
// renaming would change the public identifier, so it is left as-is.
object sprakSQL01 {
def main(args: Array[String]): Unit = {
// Create the SparkSQL runtime environment: local mode using all cores.
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
// Create the SparkSession object; .config() supplies the configuration object.
val spark = SparkSession.builder().config(sparkConf).getOrCreate()
// --- logic starts here ---
// DataFrame: load a JSON file (Windows-style relative path) into a DataFrame.
val df: DataFrame = spark.read.json("datas\\user.json")
// Display the content (disabled).
//df.show()
// SQL style: query through a registered view.
df.createOrReplaceTempView("user") // register a temporary view named "user"
spark.sql("select * from user").show()
spark.sql("select avg(age) from user").show()
// DSL style: query through DataFrame methods.
df.select("age", "username").show()
// Column-expression conversions on a DataFrame require the session's
// implicit conversions to be in scope.
import spark.implicits._
df.select($"age" + 1).show()
df.select('age + 1).show()
// DataSet
// A DataFrame is in fact a Dataset of a specific generic type (Dataset[Row]).
val seq = Seq(1, 2, 3, 4)
val ds: Dataset[Int] = seq.toDS()
ds.show()
// rdd <=> DataFrame
val rdd = spark.sparkContext.makeRDD(List((1, "张三", 30), (2, "李四", 20)))
val df1: DataFrame = rdd.toDF("id", "name", "age") // supply a name for each column
val rdd1: RDD[Row] = df1.rdd
rdd1.collect().foreach(println)
// DataFrame <=> DataSet
val ds1: Dataset[User] = df1.as[User]
val df2: DataFrame = ds1.toDF()
// rdd <=> DataSet
val ds2: Dataset[User] = rdd.map {
case (id, name, age) => {
User(id, name, age)
}
}.toDS()
val userRDD: RDD[User] = ds2.rdd
// Close the environment.
spark.close()
}
// Row type backing the typed Dataset conversions above.
case class User(id: Int, name: String, age: Int)
}
user-defined function
Users can register custom functions through spark.udf to extend SQL queries with user-defined behavior.
UDF
def main(args: Array[String]): Unit = {
  // Create the SparkSQL runtime environment: local mode using all cores.
  val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
  // Create the SparkSession object; .config() supplies the configuration object.
  val spark = SparkSession.builder().config(sparkConf).getOrCreate()
  import spark.implicits._
  val df: DataFrame = spark.read.json("datas\\user.json")
  df.createOrReplaceTempView("user") // register a temporary view
  // Register a UDF named "defName" that prefixes every username with "name:".
  spark.udf.register("defName",(name:String) =>{
    "name:" + name
  })
  // Use the UDF from SQL.
  spark.sql("select defName(username) , age from user").show()
  // FIX: the session was never closed (resource leak); close the environment
  // like every other example in this file does.
  spark.close()
}
UDAF
Both the strongly typed Dataset and the weakly typed DataFrame provide built-in aggregation functions, such as count(), countDistinct(), avg(), max(), min(). In addition, users can define their own aggregate functions. A user-defined weakly typed aggregate function is implemented by inheriting from UserDefinedAggregateFunction. Since Spark 3.0, UserDefinedAggregateFunction has been deprecated; the strongly typed aggregate function Aggregator should be used instead.
UDAF weak class function implementation
object sparkSQL03 {
  def main(args: Array[String]): Unit = {
    // Create the SparkSQL runtime environment: local mode using all cores.
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    // Create the SparkSession object; .config() supplies the configuration object.
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    import spark.implicits._
    val df: DataFrame = spark.read.json("datas\\user.json")
    df.createOrReplaceTempView("user") // register a view
    // Register the custom aggregate function under the SQL name "avgAge".
    spark.udf.register("avgAge", new MyAvgUDAF)
    spark.sql("select avgAge(age) from user").show() // use the custom function
  }

  // Custom aggregate function: computes the average age.
  // Implemented by extending UserDefinedAggregateFunction (deprecated since
  // Spark 3.0; shown here to illustrate the weakly typed API) and overriding
  // its abstract methods.
  class MyAvgUDAF extends UserDefinedAggregateFunction{
    // Structure of the input data (IN): a single Long column "age".
    override def inputSchema: StructType ={
      StructType(
        Array(
          StructField("age",LongType)
        )
      )
    }
    // Structure of the aggregation buffer: running total and row count.
    override def bufferSchema: StructType = {
      StructType(
        Array(
          StructField("total",LongType),
          StructField("count",LongType)
        )
      )
    }
    // Result type of the function (OUT).
    // FIX: was LongType combined with integer division in evaluate(), which
    // truncated the average (e.g. ages 30 and 21 produced 25 instead of 25.5).
    // DoubleType matches the strongly typed Aggregator version of this example.
    override def dataType: DataType = DoubleType
    // The function is deterministic: the same input always yields the same result.
    override def deterministic: Boolean = true
    // Initialize the buffer: zero total, zero count.
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer.update(0,0L)
      buffer.update(1,0L)
    }
    // Fold one input row into the buffer.
    // NOTE(review): null ages are not guarded against here — TODO confirm the
    // input data never contains null age values.
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer.update(0,buffer.getLong(0)+input.getLong(0))
      buffer.update(1,buffer.getLong(1)+1)
    }
    // Merge two partial buffers (e.g. produced by different partitions).
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1.update(0,buffer1.getLong(0) + buffer2.getLong(0))
      buffer1.update(1,buffer1.getLong(1) + buffer2.getLong(1))
    }
    // Compute the final average.
    // FIX: convert to Double before dividing to avoid Long integer division.
    override def evaluate(buffer: Row): Any = {
      buffer.getLong(0).toDouble/buffer.getLong(1)
    }
  }
}
Implementation of UDAF strongly typed functions
//自定义聚合函数类
//继承 Aggregator[IN,BUF,OUT]
//重写方法
// Mutable aggregation buffer: running sum of ages and row count.
case class Buff( var sum:Long, var cnt:Long )
// Custom aggregate function (strongly typed): computes the average age.
// Extends Aggregator[IN, BUF, OUT] and implements its abstract methods.
class MyAvgUDAF extends Aggregator[Long, Buff, Double]{
  // Zero value for the aggregation: an empty buffer.
  def zero: Buff = Buff(0,0)
  // Fold one input value into the buffer.
  def reduce(b: Buff, a: Long): Buff = {
    b.sum += a
    b.cnt += 1
    b
  }
  // FIX: removed a stray "21" line (a page-number artifact from the copied
  // text) that sat between reduce() and merge() and broke compilation.
  // Merge two partial buffers (e.g. produced by different partitions).
  def merge(b1: Buff, b2: Buff): Buff = {
    b1.sum += b2.sum
    b1.cnt += b2.cnt
    b1
  }
  // Final result: the average as a Double (Double division avoids truncation).
  def finish(reduction: Buff): Double = {
    reduction.sum.toDouble/reduction.cnt
  }
  // Encoders for the buffer and output types, required by the Aggregator API.
  def bufferEncoder: Encoder[Buff] = Encoders.product
  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}