Task: analyze the movie rating data with Spark SQL to find the 10 movies with the highest average score, considering only movies that have been rated more than 200 times.
Data Display
code
package org.example.SQL
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
// Movie rating data analysis
object sql_Movie {

  /**
   * Computes the top-10 movies by average score, restricted to movies that
   * received more than 200 ratings, using Spark SQL over a local MovieLens-style
   * tab-separated ratings file.
   *
   * Fixes vs. the original:
   *  - `order by avgs desc`: the original sorted ascending, which returned the
   *    *lowest*-rated movies instead of the top 10.
   *  - `val sql` instead of `var` (the string is never reassigned).
   *  - proper `|` margins so `.stripMargin` actually strips the indentation.
   *  - `spark.stop()` to release the session's resources on exit.
   */
  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO logging; show errors only.
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark: SparkSession = SparkSession.builder().appName("sparksql").master("local[*]")
      // Small local dataset — 4 shuffle partitions is plenty and speeds up the job.
      .config("spark.sql.shuffle.partitions", 4)
      .getOrCreate()
    import spark.implicits._

    // Each line is tab-separated; assumes the layout
    // "userid<TAB>movieid<TAB>score<TAB>..." so field 1 is the movie id and
    // field 2 is the score (MovieLens 100k format — confirm against the data file).
    val ds: Dataset[String] = spark.read.textFile("data/text/rating_100k.data")
    val movies: DataFrame = ds.map(line => {
      val arr: Array[String] = line.split("\t")
      (arr(1), arr(2).toInt) // (movieid, score)
    }).toDF("movieid", "score")
    movies.printSchema()
    movies.show()

    // Top-10 average scores among movies rated more than 200 times.
    movies.createOrReplaceTempView("movie")
    val sql =
      """
        |select movieid, count(*) as counts, avg(score) as avgs
        |from movie
        |group by movieid
        |having counts > 200
        |order by avgs desc
        |limit 10
        |""".stripMargin
    spark.sql(sql).show()

    spark.stop()
  }
}
Data preview (first 20 rows of the parsed DataFrame):
+-------+-----+
|movieid|score|
+-------+-----+
| 242| 3|
| 302| 3|
| 377| 1|
| 51| 2|
| 346| 1|
| 474| 4|
| 265| 2|
| 465| 5|
| 451| 3|
| 86| 3|
| 257| 2|
| 1014| 5|
| 222| 5|
| 40| 3|
| 29| 3|
| 785| 3|
| 387| 5|
| 274| 2|
| 1042| 4|
| 1184| 2|
+-------+-----+
Result output (note: this run used `order by avgs` ascending, so it actually shows the ten *lowest* averages; with `order by avgs desc` the true top 10 would be listed):
+-------+------+------------------+
|movieid|counts| avgs|
+-------+------+------------------+
| 678| 219| 2.808219178082192|
| 235| 217| 2.847926267281106|
| 323| 240| 2.933333333333333|
| 289| 259|2.9806949806949805|
| 546| 254| 3.031496062992126|
| 322| 218|3.0871559633027523|
| 245| 240|3.1083333333333334|
| 748| 316|3.1234177215189876|
| 597| 206| 3.150485436893204|
| 294| 485| 3.156701030927835|
+-------+------+------------------+