SparkSQL in IDEA: environment, UDF, and UDAF

Basic environment
    Add dependencies
    Basic implementation structure
    Code
User-defined functions
    UDF
    UDAF


Basic environment

In real-world development, SparkSQL programs are mostly written in IDEA rather than in the spark-shell.

Add dependencies

<dependency>
 <groupId>org.apache.spark</groupId>
 <artifactId>spark-sql_2.12</artifactId>
 <version>3.0.0</version>
</dependency>
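
For an sbt-based project, the equivalent dependency would be the following (assuming the same Spark 3.0.0 / Scala 2.12 versions as the Maven coordinates above):

libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.0.0"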

 Basic implementation structure

def main(args: Array[String]): Unit = {
    // Create the SparkSQL runtime configuration
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    // Create the SparkSession object; .config() takes the configuration object
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()

    // Business logic goes here

    // Close the environment
    spark.close()
  }

Code

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object sparkSQL01 {
  def main(args: Array[String]): Unit = {
    // Create the SparkSQL runtime configuration
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    // Create the SparkSession object; .config() takes the configuration object
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()

    // Business logic
    // DataFrame
    val df: DataFrame = spark.read.json("datas\\user.json")
    // Show the contents
    //df.show()

    // SQL style
    df.createOrReplaceTempView("user") // create a temporary view
    spark.sql("select * from user").show()
    spark.sql("select avg(age) from user").show()

    // DSL style
    df.select("age", "username").show()
    // When a DataFrame expression needs conversions, import the implicit conversion rules
    import spark.implicits._
    df.select($"age" + 1).show()
    df.select('age + 1).show()

    // DataSet
    // A DataFrame is in fact a Dataset with a specific type parameter: Dataset[Row]
    val seq = Seq(1, 2, 3, 4)
    val ds: Dataset[Int] = seq.toDS()
    ds.show()

    // RDD <=> DataFrame
    val rdd = spark.sparkContext.makeRDD(List((1, "张三", 30), (2, "李四", 20)))

    val df1: DataFrame = rdd.toDF("id", "name", "age") // pass the name of each column
    val rdd1: RDD[Row] = df1.rdd
    rdd1.collect().foreach(println)

    // DataFrame <=> DataSet
    val ds1: Dataset[User] = df1.as[User]
    val df2: DataFrame = ds1.toDF()

    // RDD <=> DataSet
    val ds2: Dataset[User] = rdd.map {
      case (id, name, age) => {
        User(id, name, age)
      }
    }.toDS()

    val userRDD: RDD[User] = ds2.rdd

    // Close the environment
    spark.close()
  }
  case class User(id: Int, name: String, age: Int)
}
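
The code above reads datas\\user.json, which is not shown in the original post. Spark's JSON reader expects one JSON object per line (JSON Lines), so a hypothetical file matching the username and age columns used by the queries could look like this:

{"username": "zhangsan", "age": 30}
{"username": "lisi", "age": 20}
{"username": "wangwu", "age": 40}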

User-defined functions

Users can register custom functions through spark.udf and then call them inside SQL statements.

 UDF

def main(args: Array[String]): Unit = {
    // Create the SparkSQL runtime configuration
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    // Create the SparkSession object; .config() takes the configuration object
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    import spark.implicits._

    val df: DataFrame = spark.read.json("datas\\user.json")
    df.createOrReplaceTempView("user") // create a temporary view
    spark.udf.register("defName", (name: String) => { // register the user-defined function
      "name:" + name
    })
    spark.sql("select defName(username), age from user").show() // call the UDF in SQL
  }
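
The same function can also be used without SQL registration by wrapping it with org.apache.spark.sql.functions.udf and applying it in the DSL style. A minimal sketch (prefixName is just an illustrative name, reusing df and the spark.implicits._ import from the snippet above):

import org.apache.spark.sql.functions.udf

// Wrap the Scala function as a column expression for DSL-style queries
val prefixName = udf((name: String) => "name:" + name)
df.select(prefixName($"username"), $"age").show()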

 UDAF

Both the strongly typed Dataset and the weakly typed DataFrame provide built-in aggregate functions such as count(), countDistinct(), avg(), max(), and min(). Users can also define their own aggregate functions. A weakly typed user-defined aggregate function is implemented by extending UserDefinedAggregateFunction. Since Spark 3.0, UserDefinedAggregateFunction has been deprecated, and the strongly typed Aggregator can be used uniformly instead.

  Weakly typed UDAF implementation (UserDefinedAggregateFunction)

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType}

object sparkSQL03 {
  def main(args: Array[String]): Unit = {
    // Create the SparkSQL runtime configuration
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    // Create the SparkSession object; .config() takes the configuration object
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    import spark.implicits._

    val df: DataFrame = spark.read.json("datas\\user.json")
    df.createOrReplaceTempView("user") // create a temporary view
    spark.udf.register("avgAge", new MyAvgUDAF)
    spark.sql("select avgAge(age) from user").show() // call the custom aggregate function
  }

  // Custom aggregate function class: computes the average age
  // Extend UserDefinedAggregateFunction and override its methods
  class MyAvgUDAF extends UserDefinedAggregateFunction {
    // Structure of the input data (IN)
    override def inputSchema: StructType = {
      StructType(
        Array(
          StructField("age", LongType)
        )
      )
    }
    // Structure of the aggregation buffer
    override def bufferSchema: StructType = {
      StructType(
        Array(
          StructField("total", LongType),
          StructField("count", LongType)
        )
      )
    }
    // Result type of the function (OUT)
    override def dataType: DataType = LongType

    // Whether the function is deterministic (same input always produces the same output)
    override def deterministic: Boolean = true

    // Initialize the buffer
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer.update(0, 0L)
      buffer.update(1, 0L)
    }

    // Update the buffer with an input row
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer.update(0, buffer.getLong(0) + input.getLong(0))
      buffer.update(1, buffer.getLong(1) + 1)
    }

    // Merge two buffers (from different partitions)
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1.update(0, buffer1.getLong(0) + buffer2.getLong(0))
      buffer1.update(1, buffer1.getLong(1) + buffer2.getLong(1))
    }

    // Compute the average (integer division, since dataType is LongType)
    override def evaluate(buffer: Row): Any = {
      buffer.getLong(0) / buffer.getLong(1)
    }
  }
}

  Strongly typed UDAF implementation (Aggregator)

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator

// Custom aggregate function class
// Extend Aggregator[IN, BUF, OUT] and override its methods
// Buffer holds the running total of ages and the row count
case class Buff(var sum: Long, var cnt: Long)

class MyAvgUDAF extends Aggregator[Long, Buff, Double] {
  // Initial (zero) value of the buffer
  override def zero: Buff = Buff(0, 0)

  // Fold one input value into the buffer
  override def reduce(b: Buff, a: Long): Buff = {
    b.sum += a
    b.cnt += 1
    b
  }

  // Merge two buffers (from different partitions)
  override def merge(b1: Buff, b2: Buff): Buff = {
    b1.sum += b2.sum
    b1.cnt += b2.cnt
    b1
  }

  // Compute the final result from the buffer
  override def finish(reduction: Buff): Double = {
    reduction.sum.toDouble / reduction.cnt
  }

  // Encoder for the buffer type
  override def bufferEncoder: Encoder[Buff] = Encoders.product

  // Encoder for the output type
  override def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
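
The strongly typed Aggregator cannot be registered directly the way the weakly typed class was. In Spark 3.0+ it is first converted to an untyped UDF with org.apache.spark.sql.functions.udaf. A minimal sketch, reusing the spark session and the user view from the earlier examples:

import org.apache.spark.sql.functions

// Wrap the strongly typed Aggregator as an ordinary UDF so SQL can call it
spark.udf.register("avgAge", functions.udaf(new MyAvgUDAF))
spark.sql("select avgAge(age) from user").show()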

Origin: blog.csdn.net/dafsq/article/details/129581431