// Many people do not know how to get started with Spark SQL. Below is a worked
// example to help you remember how Spark SQL is written.
package com.sparksql
import org.apache.log4j.{
Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{
DataFrame, SparkSession}
/** One employee record parsed from a CSV line of `employee.txt`:
  * id, name, sex, and sales balance. `final` because case classes
  * should not be extended.
  */
final case class Employee(id: String, name: String, sex: String, balance: Int)
/** Spark SQL demo: loads employee CSV data, registers a temp view, and runs
  * aggregate and window-function (Top-N) queries over it.
  */
object SparkSQLCase1 {

  /** Entry point.
    *
    * @param args optional: args(0) overrides the input file path
    *             (defaults to the original hard-coded "C:\\employee.txt").
    */
  def main(args: Array[String]): Unit = {
    // Build (or reuse) the SparkSession that drives all SQL below.
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("DataFrameFromStuctType")
      .getOrCreate()
    // Silence Spark's verbose INFO logging.
    Logger.getLogger("org").setLevel(Level.ERROR)

    // Allow the input path to be supplied on the command line; keep the
    // original hard-coded path as the default for backward compatibility.
    val inputPath = if (args.nonEmpty) args(0) else "C:\\employee.txt"
    val lineRDD: RDD[String] = spark.sparkContext.textFile(inputPath)
    // Parse each CSV line into an Employee; assumes 4 comma-separated
    // fields with a numeric 4th field — TODO confirm input format.
    val employeeRDD: RDD[Employee] = lineRDD.map { line =>
      val fields: Array[String] = line.split(",")
      Employee(fields(0), fields(1), fields(2), fields(3).trim.toInt)
    }

    // This import is required to get .toDF() on an RDD; `spark` is the
    // session object whose implicits are being imported.
    import spark.implicits._
    val employeeDF: DataFrame = employeeRDD.toDF()
    // Register the DataFrame under a table name usable in SQL text.
    employeeDF.createOrReplaceTempView("employee")

    // 1. Employee(s) with the highest overall balance (kept disabled, as in
    //    the original).
    //spark.sql("select * from employee where balance=(select max(balance) mb from employee)").show()

    // 2. Highest balance per sex.
    //    BUG FIX: the original aliased the `sex` column as `balance`
    //    ("sex balance"), mislabeling the output column; alias removed.
    spark.sql("select sex, max(balance) mb from employee group by sex").show()

    // 3. Top-3 balances per sex.
    //    Window (analytic) functions are the idiomatic way to compute Top N:
    //    partition by sex, order by balance descending, number the rows with
    //    row_number(), then keep row numbers <= 3.
    spark.sql("select * from (select *,row_number() over (partition by sex order by balance desc) rk from employee) tmp " +
      " where rk<=3 ").show()

    // Other ranking window functions, shown for comparison:
    // row_number (unique sequence), rank (gaps after ties),
    // dense_rank (no gaps), ntile (bucket assignment).
    spark.sql("select *,row_number() over (partition by sex order by balance desc) rk from employee ").show()
    spark.sql("select *,rank() over (partition by sex order by balance desc) rk from employee ").show()
    spark.sql("select *,dense_rank() over (partition by sex order by balance desc) rk from employee ").show()
    spark.sql("select *,ntile(2) over (partition by sex order by balance desc) rk from employee ").show()

    // FIX: release the session's resources — the original never stopped it.
    spark.stop()
  }
}