创建推荐引擎

假设有一个在线电影网站,公司希望运用大数据分析推荐引擎,增加会员观看电影次数。

ALS算法:

ALS算法是基于模型的推荐算法。其基本思想是对稀疏矩阵进行模型分解,评估出缺失项的值,以此来得到一个基本的训练模型。然后依照此模型可以针对新的用户和物品数据进行评估。ALS是采用交替的最小二乘法来算出缺失项的。交替的最小二乘法是在最小二乘法的基础上发展而来的。

根据用户对产品项目的评分方式,可分为:

数据文件:

显式评分:

  •   网站上用户对某个产品进行评分,如1~5颗星。

隐式评分:

  •   网站不会请用户对产品进行评分,但是会记录用户是否点选了某个产品。

创建Recommend项目:

1.创建Recommend.scala文件

2.导入链接库

import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }

3.业务逻辑代码

// Console loop: dispatches recommendation requests until the user enters "3".
def recommend(model: MatrixFactorizationModel, movieTitle: Map[Int, String]) = {
  var selection = ""
  while (selection != "3") {
    print("请选择要推荐类型  1.针对用户推荐电影 2.针对电影推荐感兴趣的用户 3.离开?")
    selection = readLine()
    selection match {
      case "1" =>
        print("请输入用户id?")
        val userId = readLine()
        RecommendMovies(model, movieTitle, userId.toInt)
      case "2" =>
        print("请输入电影的 id?")
        val movieId = readLine()
        RecommendUsers(model, movieTitle, movieId.toInt)
      case _ => // "3" ends the loop; any other input just re-prompts
    }
  }
}

main程序代码分为3部分:

  • 数据准备阶段
  • 训练阶段
  • 推荐阶段

4.SetLogger设置不显示log信息

  // Silence log4j and the Spark console progress bar so that only the
  // program's own println output reaches the console.
  def SetLogger = {
    System.setProperty("spark.ui.showConsoleProgress", "false")
    Seq("org", "com").foreach(name => Logger.getLogger(name).setLevel(Level.OFF))
    Logger.getRootLogger().setLevel(Level.OFF)
  }

5.创建PrepareData()函数

 // Loads the MovieLens ratings and builds the movieId -> title lookup table.
 // Returns (ratings RDD, movie title map).
 def PrepareData(): (RDD[Rating], Map[Int, String]) = {
   val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
   // ALS's iterative training grows the RDD lineage quickly; an explicit
   // checkpoint directory truncates it and avoids a StackOverflowError.
   sc.setCheckpointDir("checkpoint")
   // 1. user ratings: u.data is tab-separated userId, movieId, rating, ts
   print("开始读取用户评分数据中...")
   val rawUserData = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.data")
   val ratingsRDD = rawUserData
     .map(_.split("\t").take(3))
     .map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
   println("共计:" + ratingsRDD.count.toString() + "条ratings")
   // 2. movieId -> title map: u.item is pipe-separated, first two fields used
   print("开始读取电影数据中...")
   val itemRDD = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.item")
   val movieTitle = itemRDD
     .map(_.split("\\|").take(2))
     .map(fields => (fields(0).toInt, fields(1)))
     .collect()
     .toMap
   // 3. dataset statistics
   val numRatings = ratingsRDD.count()
   val numUsers = ratingsRDD.map(_.user).distinct().count()
   val numMovies = ratingsRDD.map(_.product).distinct().count()
   println("共计:ratings: " + numRatings + " User " + numUsers + " Movie " + numMovies)
   (ratingsRDD, movieTitle)
 }

6.recommend推荐程序代码

    // Console loop: dispatches recommendation requests until the user enters "3".
    def recommend(model: MatrixFactorizationModel, movieTitle: Map[Int, String]) = {
      var selection = ""
      while (selection != "3") {
        print("请选择要推荐类型  1.针对用户推荐电影 2.针对电影推荐感兴趣的用户 3.离开?")
        selection = readLine()
        selection match {
          case "1" =>
            print("请输入用户id?")
            val userId = readLine()
            RecommendMovies(model, movieTitle, userId.toInt)
          case "2" =>
            print("请输入电影的 id?")
            val movieId = readLine()
            RecommendUsers(model, movieTitle, movieId.toInt)
          case _ => // "3" ends the loop; any other input just re-prompts
        }
      }
    }

7.main函数

  // Entry point: prepare data, train the ALS model, then serve
  // interactive recommendations from the console.
  def main(args: Array[String]) {
    SetLogger // quiet Spark/log4j output before anything else runs
    println("==========数据准备阶段===============")
    val (ratings, movieTitle) = PrepareData()
    println("==========训练阶段===============")
    print("开始使用 " + ratings.count() + "条评比数据进行训练模型... ")
    // rank = 20 latent factors, 15 iterations, regularization lambda = 0.1
    val trainedModel = ALS.train(ratings, 20, 15, 0.1)
    println("训练完成!")
    println("==========推荐阶段===============")
    recommend(trainedModel, movieTitle)
    println("完成")
  }

 8.Recommend.scala全部代码

import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
 // Interactive movie recommender: trains an ALS model on the MovieLens
 // data, then serves user->movies and movie->users recommendations from
 // the console.
 object Recommend {
   def main(args: Array[String]) {
     SetLogger // quiet Spark/log4j output before anything else runs
     println("==========数据准备阶段===============")
     val (ratings, movieTitle) = PrepareData()
     println("==========训练阶段===============")
     print("开始使用 " + ratings.count() + "条评比数据进行训练模型... ")
     // rank = 20 latent factors, 15 iterations, regularization lambda = 0.1
     val trainedModel = ALS.train(ratings, 20, 15, 0.1)
     println("训练完成!")
     println("==========推荐阶段===============")
     recommend(trainedModel, movieTitle)
     println("完成")
   }

   // Console loop: dispatches recommendation requests until "3" is entered.
   def recommend(model: MatrixFactorizationModel, movieTitle: Map[Int, String]) = {
     var selection = ""
     while (selection != "3") {
       print("请选择要推荐类型  1.针对用户推荐电影 2.针对电影推荐感兴趣的用户 3.离开?")
       selection = readLine()
       selection match {
         case "1" =>
           print("请输入用户id?")
           val userId = readLine()
           RecommendMovies(model, movieTitle, userId.toInt)
         case "2" =>
           print("请输入电影的 id?")
           val movieId = readLine()
           RecommendUsers(model, movieTitle, movieId.toInt)
         case _ => // "3" ends the loop; any other input just re-prompts
       }
     }
   }

   // Prints the top-10 movies recommended for the given user.
   def RecommendMovies(model: MatrixFactorizationModel, movieTitle: Map[Int, String], inputUserID: Int) = {
     val topMovies = model.recommendProducts(inputUserID, 10)
     println("针对用户id" + inputUserID + "推荐下列电影:")
     for ((r, idx) <- topMovies.zipWithIndex)
       println((idx + 1).toString() + "." + movieTitle(r.product) + "评分:" + r.rating.toString())
   }

   // Prints the top-10 users predicted to be interested in the given movie.
   def RecommendUsers(model: MatrixFactorizationModel, movieTitle: Map[Int, String], inputMovieID: Int) = {
     val topUsers = model.recommendUsers(inputMovieID, 10)
     println("针对电影 id" + inputMovieID + "电影名:" + movieTitle(inputMovieID) + "推荐下列用户id:")
     for ((r, idx) <- topUsers.zipWithIndex)
       println((idx + 1).toString + "用户id:" + r.user + "   评分:" + r.rating)
   }

   // Silence log4j and the Spark console progress bar.
   def SetLogger = {
     System.setProperty("spark.ui.showConsoleProgress", "false")
     Seq("org", "com").foreach(name => Logger.getLogger(name).setLevel(Level.OFF))
     Logger.getRootLogger().setLevel(Level.OFF)
   }

   // Loads the ratings RDD and the movieId -> title lookup table.
   def PrepareData(): (RDD[Rating], Map[Int, String]) = {
     val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
     // ALS's iterative training grows the RDD lineage quickly; an explicit
     // checkpoint directory truncates it and avoids a StackOverflowError.
     sc.setCheckpointDir("checkpoint")
     // 1. user ratings: u.data is tab-separated userId, movieId, rating, ts
     print("开始读取用户评分数据中...")
     val rawUserData = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.data")
     val ratingsRDD = rawUserData
       .map(_.split("\t").take(3))
       .map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
     println("共计:" + ratingsRDD.count.toString() + "条ratings")
     // 2. movieId -> title map: u.item is pipe-separated, first two fields used
     print("开始读取电影数据中...")
     val itemRDD = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.item")
     val movieTitle = itemRDD
       .map(_.split("\\|").take(2))
       .map(fields => (fields(0).toInt, fields(1)))
       .collect()
       .toMap
     // 3. dataset statistics
     val numRatings = ratingsRDD.count()
     val numUsers = ratingsRDD.map(_.user).distinct().count()
     val numMovies = ratingsRDD.map(_.product).distinct().count()
     println("共计:ratings: " + numRatings + " User " + numUsers + " Movie " + numMovies)
     (ratingsRDD, movieTitle)
   }
 }

9.运行 Recommend.scala

10.运行界面

11.针对用户推荐电影

12.针对电影推荐给感兴趣的人

注意:

 如果不加 sc.setCheckpointDir("checkpoint"),则会栈溢出 stackoverflow。

Spark在迭代计算的过程中,会导致lineage剧烈变长,所需的栈空间也急剧上升,最终爆栈。

这类问题解决方法如下:

在代码中加入 sc.setCheckpointDir(path),显式指明checkpoint路径,问题便可得到解决。

参考链接:https://blog.csdn.net/asdfghjkl1993/article/details/78626439

13.创建AlsEvaluation.scala调校推荐引擎参数

分为三个阶段

  • 数据准备阶段
  • 训练评估阶段
  • 测试阶段

14.创建PrepareData()数据准备

// Loads the MovieLens ratings, prints dataset statistics, and randomly
// splits the ratings into train/validation/test (80% / 10% / 10%).
// Returns (trainData, validationData, testData).
def PrepareData(): (RDD[Rating], RDD[Rating], RDD[Rating]) = {

    val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
    // FIX: this line was missing from the snippet. ALS's iterative training
    // grows the RDD lineage quickly; an explicit checkpoint directory
    // truncates it and avoids a StackOverflowError during training.
    sc.setCheckpointDir("checkpoint")
    //----------------------1. load user rating data-------------
    print("开始读取用户评分数据...")
    val DataDir = "data"
    val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)

    // u.data is tab-separated: userId, movieId, rating, timestamp (dropped)
    val rawRatings = rawUserData.map(_.split("\t").take(3))

    val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    println("共计:" + ratingsRDD.count.toString() + "条ratings")

    //----------------------2. build movieId -> title map-------------
    print("开始读取电影数据...")
    val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
    // NOTE: movieTitle is built but not returned here; kept for parity with
    // the Recommend program's PrepareData.
    val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collect().toMap
    //----------------------3. print record counts-------------
    val numRatings = ratingsRDD.count()
    val numUsers = ratingsRDD.map(_.user).distinct().count()
    val numMovies = ratingsRDD.map(_.product).distinct().count()
    println("共计:ratings: " + numRatings + " User " + numUsers + " Movie " + numMovies)
    //----------------------4. random 80/10/10 split-------------
    println("将数据分为")
    val Array(trainData, validationData, testData) = ratingsRDD.randomSplit(Array(0.8, 0.1, 0.1))

    println("  trainData:" + trainData.count() + "  validationData:" + validationData.count() + "  testData:" + testData.count())
    return (trainData, validationData, testData)
  }

15.进行训练评估

  // Runs the three single-parameter sweeps (rank, numIterations, lambda),
  // then cross-evaluates all combinations and returns the best model found.
  def trainValidation(trainData: RDD[Rating], validationData: RDD[Rating]): MatrixFactorizationModel = {
    println("-----评估 rank参数使用 ---------")
    evaluateParameter(trainData, validationData, "rank", Array(5, 10, 15, 20, 50, 100), Array(10), Array(0.1))
    println("-----评估 numIterations ---------")
    evaluateParameter(trainData, validationData, "numIterations", Array(10), Array(5, 10, 15, 20, 25), Array(0.1))
    println("-----评估 lambda ---------")
    evaluateParameter(trainData, validationData, "lambda", Array(10), Array(10), Array(0.05, 0.1, 1, 5, 10.0))
    println("-----所有参数交叉评估找出最好的参数组合---------")
    // The cross-evaluation's winning model is this function's result.
    evaluateAllParameter(trainData, validationData, Array(5, 10, 15, 20, 25), Array(5, 10, 15, 20, 25), Array(0.05, 0.1, 1, 5, 10.0))
  }

  // Sweeps one hyper-parameter (the others held fixed by single-element
  // arrays) and plots validation RMSE (bars) plus training time (line)
  // for every tried value.
  def evaluateParameter(trainData: RDD[Rating], validationData: RDD[Rating],
                        evaluateParameter: String, rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]) =
    {
      val rmseDataset = new DefaultCategoryDataset()
      val timeDataset = new DefaultCategoryDataset()
      for {
        rank <- rankArray
        numIterations <- numIterationsArray
        lambda <- lambdaArray
      } {
        val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
        // Category label = the value of whichever parameter is being swept.
        val label = evaluateParameter match {
          case "rank"          => rank.toString()
          case "numIterations" => numIterations.toString()
          case "lambda"        => lambda.toString()
        }
        rmseDataset.addValue(rmse, evaluateParameter, label)
        timeDataset.addValue(time, "Time", label)
      }
      Chart.plotBarLineChart("ALS evaluations " + evaluateParameter, evaluateParameter, "RMSE", 0.58, 5, "Time", rmseDataset, timeDataset)
    }

16.Chart.plotBarLineChart绘制出柱形图与折线图

import org.jfree.chart._
import org.jfree.data.xy._
import org.jfree.data.category.DefaultCategoryDataset
import org.jfree.chart.axis.NumberAxis
import org.jfree.chart.axis._
import java.awt.Color
import org.jfree.chart.renderer.category.LineAndShapeRenderer;
import org.jfree.chart.plot.DatasetRenderingOrder;
import org.jfree.chart.labels.StandardCategoryToolTipGenerator;
import java.awt.BasicStroke

// Utility for plotting an RMSE bar chart overlaid with a time line chart
// (used to visualize ALS parameter-tuning results).
object Chart {
  // Title: window title; xLabel: category-axis label;
  // yBarLabel/yBarMin/yBarMax: bar (RMSE) axis label and fixed range;
  // yLineLabel: secondary (time) axis label;
  // dataBarChart/dataLineChart: the two datasets to overlay.
  def plotBarLineChart(Title: String, xLabel: String, yBarLabel: String, yBarMin: Double, yBarMax: Double, yLineLabel: String, dataBarChart : DefaultCategoryDataset, dataLineChart: DefaultCategoryDataset): Unit = {

    // Build the bar chart.
    val chart = ChartFactory
         .createBarChart(  
        "", // chart title (left empty; the frame title is shown instead)
        xLabel, // X-axis label
        yBarLabel, // Y-axis label for the bars
        dataBarChart , // bar-chart dataset
        org.jfree.chart.plot.PlotOrientation.VERTICAL,// vertical bars
        true, // include legend
        true, // show tooltips
        false // no URL generator
        );
    // Grab the plot so a second dataset and axis can be attached.
    val plot = chart.getCategoryPlot();
    plot.setBackgroundPaint(new Color(0xEE, 0xEE, 0xFF));
    plot.setDomainAxisLocation(AxisLocation.BOTTOM_OR_RIGHT);
    plot.setDataset(1, dataLineChart); plot.mapDatasetToRangeAxis(1, 1)
    // Primary (bar) y-axis: fixed range, automatic tick units.
    val vn = plot.getRangeAxis(); vn.setRange(yBarMin, yBarMax);  vn.setAutoTickUnitSelection(true)
    // Secondary (line) y-axis.
    val axis2 = new NumberAxis(yLineLabel); plot.setRangeAxis(1, axis2);
    val renderer2 = new LineAndShapeRenderer()
    renderer2.setToolTipGenerator(new StandardCategoryToolTipGenerator());
    // Render bars first, then the line, so the line is not hidden behind bars.
    plot.setRenderer(1, renderer2);plot.setDatasetRenderingOrder(DatasetRenderingOrder.FORWARD);
    // Show the chart in its own window.
    val frame = new ChartFrame(Title,chart); frame.setSize(500, 500);
    frame.pack(); frame.setVisible(true)
  }
}

17.trainModel训练模型

 // Trains one ALS model with the given hyper-parameters and evaluates it on
 // the validation set. Returns (validation RMSE, elapsed wall time in seconds).
 def trainModel(trainData: RDD[Rating], validationData: RDD[Rating], rank: Int, iterations: Int, lambda: Double): (Double, Double) = {
    val startTime = new DateTime()
    val model = ALS.train(trainData, rank, iterations, lambda)
    val endTime = new DateTime()
    val Rmse = computeRMSE(model, validationData)
    val duration = new Duration(startTime, endTime)
    // FIX: iterations is an Int; format it with %d — the original %.2f float
    // specifier does not fit an integer iteration count.
    println(f"训练参数:rank:$rank%3d,iterations:$iterations%d ,lambda = $lambda%.2f 结果 Rmse=$Rmse%.2f" + "训练需要时间" + duration.getMillis + "毫秒")
    (Rmse, duration.getStandardSeconds)
  }

 18.计算RMSE

RMSE是用来计算推荐系统对用户喜好的预测与用户实际喜好的误差平均值,通常RMSE越小代表误差越小,即代表预测值与真实值越接近,准确度越高。

  // Root-mean-square error of the model's predictions against the actual
  // ratings in RatingRDD.
  def computeRMSE(model: MatrixFactorizationModel, RatingRDD: RDD[Rating]): Double = {
    val num = RatingRDD.count()
    // Key predictions and actual ratings by (user, product) so they join.
    val predictions = model
      .predict(RatingRDD.map(r => (r.user, r.product)))
      .map(p => ((p.user, p.product), p.rating))
    val actuals = RatingRDD.map(r => ((r.user, r.product), r.rating))
    val squaredErrorSum = predictions
      .join(actuals)
      .values
      .map { case (predicted, actual) => (predicted - actual) * (predicted - actual) }
      .reduce(_ + _)
    math.sqrt(squaredErrorSum / num)
  }

 19.evaluateAllParameter找出最佳的参数组合

  // Trains a model for every (rank, numIterations, lambda) combination and
  // returns a model retrained with the combination that minimizes RMSE.
  def evaluateAllParameter(trainData: RDD[Rating], validationData: RDD[Rating],
                           rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]): MatrixFactorizationModel =
    {
      val evaluations =
        for {
          rank <- rankArray
          numIterations <- numIterationsArray
          lambda <- lambdaArray
        } yield {
          val (rmse, _) = trainModel(trainData, validationData, rank, numIterations, lambda)
          (rank, numIterations, lambda, rmse)
        }
      // Pick the parameter combination with the smallest validation RMSE.
      val best = evaluations.minBy(_._4)
      println("最佳model参数:rank:" + best._1 + ",iterations:" + best._2 + "lambda" + best._3 + ",结果rmse = " + best._4)
      ALS.train(trainData, best._1, best._2, best._3)
    }

 我们希望找出rank、numIterations、lambda,交叉评估找出最好的参数组合。

20.AlsEvaluation全部代码

import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
import org.joda.time.format._
import org.joda.time._
import org.joda.time.Duration
import org.jfree.data.category.DefaultCategoryDataset
import org.apache.spark.mllib.regression.LabeledPoint

// Hyper-parameter tuning for the ALS movie-recommendation model.
// Workflow: prepare data -> sweep rank / numIterations / lambda on a
// validation set (with RMSE/time charts) -> cross-evaluate all
// combinations -> report the best model's RMSE on a held-out test set.
object AlsEvaluation {

  def main(args: Array[String]) {
    // Silence Spark/log4j output so only progress messages are printed.
    SetLogger
    println("==========数据准备阶段===============")
    val (trainData, validationData, testData) = PrepareData()
    // Cache the three splits: every trainModel() call reuses them.
    trainData.persist(); validationData.persist(); testData.persist()
    println("==========训练验证阶段===============")
    val bestModel = trainValidation(trainData, validationData)
    println("==========测试阶段===============")
    val testRmse = computeRMSE(bestModel, testData)
    println("使用testData测试bestModel," + "结果rmse = " + testRmse)
    trainData.unpersist(); validationData.unpersist(); testData.unpersist()
  }

  // Sweeps each hyper-parameter individually (plotting RMSE and time),
  // then cross-evaluates all combinations and returns the best model.
  def trainValidation(trainData: RDD[Rating], validationData: RDD[Rating]): MatrixFactorizationModel = {
    println("-----评估 rank参数使用 ---------")
    evaluateParameter(trainData, validationData, "rank", Array(5, 10, 15, 20, 50, 100), Array(10), Array(0.1))
    println("-----评估 numIterations ---------")
    evaluateParameter(trainData, validationData, "numIterations", Array(10), Array(5, 10, 15, 20, 25), Array(0.1))
    println("-----评估 lambda ---------")
    evaluateParameter(trainData, validationData, "lambda", Array(10), Array(10), Array(0.05, 0.1, 1, 5, 10.0))
    println("-----所有参数交叉评估找出最好的参数组合---------")
    val bestModel = evaluateAllParameter(trainData, validationData, Array(5, 10, 15, 20, 25), Array(5, 10, 15, 20, 25), Array(0.05, 0.1, 1, 5, 10.0))
    return (bestModel)
  }

  // Sweeps one hyper-parameter (the others held fixed) and plots validation
  // RMSE (bars) plus training time (line) for every tried value.
  def evaluateParameter(trainData: RDD[Rating], validationData: RDD[Rating],
                        evaluateParameter: String, rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]) =
    {
      val dataBarChart = new DefaultCategoryDataset()
      val dataLineChart = new DefaultCategoryDataset()
      for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) {
        val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
        // Category label = the value of whichever parameter is being swept.
        val parameterData =
          evaluateParameter match {
            case "rank"          => rank;
            case "numIterations" => numIterations;
            case "lambda"        => lambda
          }
        dataBarChart.addValue(rmse, evaluateParameter, parameterData.toString())
        dataLineChart.addValue(time, "Time", parameterData.toString())
      }
      Chart.plotBarLineChart("ALS evaluations " + evaluateParameter, evaluateParameter, "RMSE", 0.58, 5, "Time", dataBarChart, dataLineChart)
    }

  // Cross-evaluates every (rank, numIterations, lambda) combination and
  // returns a model retrained with the lowest-RMSE combination.
  def evaluateAllParameter(trainData: RDD[Rating], validationData: RDD[Rating],
                           rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]): MatrixFactorizationModel =
    {
      val evaluations =
        for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) yield {
          val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
          (rank, numIterations, lambda, rmse)
        }
      // Ascending sort by RMSE: the first entry is the best combination.
      val Eval = (evaluations.sortBy(_._4))
      val BestEval = Eval(0)
      println("最佳model参数:rank:" + BestEval._1 + ",iterations:" + BestEval._2 + "lambda" + BestEval._3 + ",结果rmse = " + BestEval._4)
      val bestModel = ALS.train(trainData, BestEval._1, BestEval._2, BestEval._3)
      (bestModel)
    }

  // Loads the MovieLens ratings, prints dataset statistics, and randomly
  // splits them into train/validation/test (80%/10%/10%).
  def PrepareData(): (RDD[Rating], RDD[Rating], RDD[Rating]) = {

    val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
    // ALS's iterative training grows the RDD lineage quickly; an explicit
    // checkpoint directory truncates it and avoids a StackOverflowError.
    sc.setCheckpointDir("checkpoint")
    //----------------------1. load user rating data-------------
    print("开始读取用户评分数据...")
    val DataDir = "data"
    val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)

    // u.data is tab-separated: userId, movieId, rating, timestamp (dropped)
    val rawRatings = rawUserData.map(_.split("\t").take(3))

    val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    println("共计:" + ratingsRDD.count.toString() + "条ratings")

    //----------------------2. build movieId -> title map-------------
    print("开始读取电影数据...")
    val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
    // NOTE: movieTitle is built but not returned; kept for parity with the
    // Recommend program's PrepareData.
    val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collect().toMap
    //----------------------3. print record counts-------------
    val numRatings = ratingsRDD.count()
    val numUsers = ratingsRDD.map(_.user).distinct().count()
    val numMovies = ratingsRDD.map(_.product).distinct().count()
    println("共计:ratings: " + numRatings + " User " + numUsers + " Movie " + numMovies)
    //----------------------4. random 80/10/10 split-------------
    println("将数据分为")
    val Array(trainData, validationData, testData) = ratingsRDD.randomSplit(Array(0.8, 0.1, 0.1))

    println("  trainData:" + trainData.count() + "  validationData:" + validationData.count() + "  testData:" + testData.count())
    return (trainData, validationData, testData)
  }

  // Trains one ALS model and measures its validation RMSE and wall time.
  // Returns (RMSE, elapsed seconds).
  def trainModel(trainData: RDD[Rating], validationData: RDD[Rating], rank: Int, iterations: Int, lambda: Double): (Double, Double) = {
    val startTime = new DateTime()
    val model = ALS.train(trainData, rank, iterations, lambda)
    val endTime = new DateTime()
    val Rmse = computeRMSE(model, validationData)
    val duration = new Duration(startTime, endTime)
    // FIX: iterations is an Int; format it with %d — the original %.2f float
    // specifier does not fit an integer iteration count.
    println(f"训练参数:rank:$rank%3d,iterations:$iterations%d ,lambda = $lambda%.2f 结果 Rmse=$Rmse%.2f" + "训练需要时间" + duration.getMillis + "毫秒")
    (Rmse, duration.getStandardSeconds)
  }

  // Root-mean-square error of the model's predictions on RatingRDD.
  def computeRMSE(model: MatrixFactorizationModel, RatingRDD: RDD[Rating]): Double = {

    val num = RatingRDD.count()
    val predictedRDD = model.predict(RatingRDD.map(r => (r.user, r.product)))
    // Join predictions with actual ratings on (user, product).
    val predictedAndRatings =
      predictedRDD.map(p => ((p.user, p.product), p.rating))
        .join(RatingRDD.map(r => ((r.user, r.product), r.rating)))
        .values
    math.sqrt(predictedAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / num)
  }

  // Silence log4j and the Spark console progress bar.
  def SetLogger = {
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("com").setLevel(Level.OFF)
    System.setProperty("spark.ui.showConsoleProgress", "false")
    Logger.getRootLogger().setLevel(Level.OFF);
  }

}

21.运行AlsEvaluation

柱状图代表RMSE,折线图代表时间。

评估rank参数的结果图

评估numIterations 

 

评估lambda

 

经过训练,所有参数交叉评估找出最好的参数组合

 

 22.修改Recommend.scala为最佳参数组合

猜你喜欢

转载自blog.csdn.net/Draven__/article/details/90212980