Hands-on with Spark DataFrames

Another core feature in Spark is DataFrame, which facilitates working with structured data. The example is still based on the data in the previous blog.

We request the following data:

1. View the rating records of the user with id 338;

2. Save the result in csv format;

3. The user id with the most movie ratings;

4. The movie id and title with the most user ratings;

5. The youngest and oldest users who rated the movies;

6. Movies rated five stars by users aged 25 to 30;

7. The most popular movies (highest average rating);

The code is shown below:

 

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark. {SparkConf, SparkContext}


/**
  * For more information, please refer to: http://www.iteblog.com/archives/1566#DataFrame-4
  *
  */
/**
  * MovieLens-style statistics computed two ways — with Spark SQL queries and
  * with the equivalent DataFrame API calls — over three input files:
  * `/data/ratings.data` (userId::movieId::rating),
  * `/data/user.data`    (id|age|gender),
  * `/data/movies.data`  (id::title::releaseDate).
  */
object MoviesDataStatistics {

  /** One rating record: which user gave which movie what score. */
  case class Ratings(userId: Int, movieId: Int, rating: Double)

  /** One movie record. */
  case class Movies(id: Int, movieTitle: String, releaseDate: String)

  /** One user record. */
  case class Users(id: Int, age: Int, gender: String)

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("MoviesDataStatistics")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Required for the RDD -> DataFrame conversions (.toDF()) below.
    import sqlContext.implicits._

    // Load the rating records; fields are separated by "::".
    val ratingsDF: DataFrame = sc.textFile("/data/ratings.data")
      .map(x => x.split("::"))
      .map(line => Ratings(line(0).toInt, line(1).toInt, line(2).toDouble))
      .toDF()
    ratingsDF.registerTempTable("ratings")

    // 1. Rating records of user 338 — SQL version and DataFrame version.
    println("sql for 338 rating info is : ")
    sqlContext.sql("select * from ratings where userId = 338").show()
    println("dataframe 338 rating info is : ")
    ratingsDF.filter(ratingsDF("userId").equalTo(338)).show()

    // Load the user records; '|' must be escaped in the split regex.
    val userDataDF = sc.textFile("/data/user.data")
      .map(x => x.split("[|]"))
      .map(line => Users(line(0).toInt, line(1).toInt, line(2)))
      .toDF()
    userDataDF.registerTempTable("users")
    sqlContext.sql("select * from users where id = 338").show()
    userDataDF.filter(userDataDF("id").equalTo(338)).show()

    // Load the movie records; fields are separated by "::".
    val movieDF = sc.textFile("/data/movies.data")
      .map(x => x.split("::"))
      .map(line => Movies(line(0).toInt, line(1), line(2)))
      .toDF()
    movieDF.registerTempTable("movies")
    sqlContext.sql("select * from movies where id = 1").show()
    movieDF.filter(movieDF("id").equalTo(1)).show()

    // User 338's ratings joined with movie titles, highest score first.
    sqlContext.sql("select r.userId,m.movieTitle,r.rating from movies m inner join ratings r on m.id = r.movieId and r.userId = 338 order by r.rating desc ").show()
    val resultDF = movieDF.join(ratingsDF.filter(ratingsDF("userId").equalTo(338)), movieDF("id").equalTo(ratingsDF("movieId")))
      .sort(ratingsDF("rating").desc).select("userId", "movieTitle", "rating")
    resultDF.collect().foreach(println)

    // Aggregate functions (max, min, avg) used by the queries below.
    import org.apache.spark.sql.functions._

    // 2. Save the result in csv format (needs the spark-csv package on the classpath).
    //val saveOptions = Map("header" -> "true", "path" -> "/data/rat_movie.csv")
    //resultDF.write.format("com.databricks.spark.csv").mode(SaveMode.Overwrite).options(saveOptions).save()

    // 3. The user id with the most ratings.
    sqlContext.sql("select userId,count(*) as count from ratings group by userId order by count desc ").show(1)
    val userIdCountDF = ratingsDF.groupBy("userId").count()
    userIdCountDF.join(userIdCountDF.agg(max("count").alias("max_count")), $"count".equalTo($"max_count"))
      .select("userId").show(1)

    // 4. Movie id and title with the most ratings.
    val movieIDGroupDF = ratingsDF.groupBy("movieId").count()
    // Join directly on count == max_count instead of a cartesian join + filter.
    val movieCountDF = movieIDGroupDF.join(movieIDGroupDF.agg(max("count").alias("max_count")), $"count".equalTo($"max_count"))
    // Star Wars is the movie with the most user ratings.
    movieCountDF.join(movieDF, $"movieId".equalTo($"id")).select("movieId", "movieTitle", "releaseDate").show()

    // 5. Youngest and oldest users who rated a movie
    //    (oldest is 73, youngest is 7 on this data set).
    // Column.isin expects literal values, not columns, so compare against the
    // aggregated min/max ages with an explicit disjunction.
    ratingsDF.join(userDataDF, ratingsDF("userId").equalTo(userDataDF("id")))
      .agg(min($"age").alias("min_age"), max($"age").alias("max_age"))
      .join(userDataDF, $"age".equalTo($"min_age").or($"age".equalTo($"max_age")))
      .select("id", "age", "gender").show(2)

    // 6. Five-star movies rated by users aged 25 to 30.
    // Join movies on movieId == id; the original joined on rating == 5, which
    // paired every 5-star rating row with EVERY movie (a cartesian product).
    userDataDF.filter($"age".between(25, 30))
      .join(ratingsDF, $"id".equalTo($"userId"))
      .select("userId", "movieId", "rating")
      .filter($"rating".equalTo(5))
      .join(movieDF, $"movieId".equalTo($"id"))
      .select("movieId", "movieTitle").show(10)

    // 7. Ten movies with the highest average rating.
    ratingsDF.groupBy("movieId").agg(avg("rating").alias("avg_rate"))
      .sort($"avg_rate".desc).limit(10)
      .join(movieDF, $"movieId".equalTo($"id"))
      .select("movieTitle").show(false)
  }
}

 Summarize:

 

1. `import sqlContext.implicits._` is required before converting an RDD to a DataFrame with `.toDF()`;

2. When using DataFrame aggregate functions, you need `import org.apache.spark.sql.functions._`;

3. The functions of DF are very powerful, and the basic functions must be mastered;

4. Personally, I find the DataFrame API more expressive than raw SQL.

refer to:

https://www.ibm.com/developerworks/cn/opensource/os-cn-spark-practice3/

https://spark.apache.org/docs/1.6.2/api/java/org/apache/spark/sql/GroupedData.html

Guess you like

Origin http://10.200.1.11:23101/article/api/json?id=327058753&siteId=291194637