Example of using Spark SQL window functions.

Goal: find the top-scoring student in each course.

package sparkRdd_practice
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
  * Demonstrates grouped top-N queries with Spark SQL window functions
  * (row_number / rank / dense_rank over a partition).
  *
  * @author 黄仁议 <[email protected]>
  * @version V1.0
  * @since 1.0
  * 2019/6/4 17:46
  */
// Schema holder for a (subject, name) record. NOTE(review): it is never used
// below — the DataFrame schema is inferred from the JSON file; presumably it
// was intended for an RDD-to-DataFrame conversion. Consider removing it.
case class Student(subject:String,name:String)
object WindowDemo {
  // Grouped top-N with window functions:
  //   row_number() / rank() / dense_rank() ... over (partition by ... order by ...)
  // Here we partition by course and keep the row ranked first by score.
  def main(args: Array[String]): Unit = {

    val sparkSession = SparkSession.builder()
      .appName("windowDemo")
      .master("local[2]")
      .getOrCreate()

    // Input path may be supplied as the first program argument; falls back to
    // the original hard-coded Windows path for backward compatibility.
    val inputPath = args.headOption.getOrElse("d:\\data\\score.json")

    // read.json already returns a DataFrame — the redundant .toDF() was removed.
    val frame: DataFrame = sparkSession.read.json(inputPath)
    // Register a temporary view so the data can be queried with SQL.
    frame.createOrReplaceTempView("t_student")

    // Deduplicate to one row per (course, name, score). The counts column is
    // carried along but not used by the ranking query below.
    val dataFrame1: DataFrame = sparkSession.sql(
      "SELECT course,name,score,count(*) counts from t_student GROUP BY course,name,score")
    dataFrame1.createOrReplaceTempView("t_student_count")

    // For scores (30,20,10,10) the three ranking functions would yield:
    //   row_number -> 1,2,3,4   rank -> 1,2,3,3   dense_rank -> 1,2,3,3
    // row_number() guarantees exactly one winner per course even on ties.
    // "rank < 2" keeps only the first-ranked row of each partition.
    sparkSession.sql(
      "SELECT * FROM (SELECT *,row_number() over(partition by course order by score desc) rank FROM " +
        " t_student_count) t_temp where rank < 2").show()

    sparkSession.stop()
  }

}

Source: reposted from blog.csdn.net/weixin_43562705/article/details/91364961