Advanced Analytics with Spark: Music Recommendations (ALS)


0. Introduction

  • Source: Advanced Analytics with Spark (《Spark高级数据分析》)
  • The book's GitHub repository: https://github.com/sryza/aas
  • Summary: use Spark's ALS algorithm to recommend suitable artists to music listeners
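As brief background (a summary of standard ALS, not a quote from the book): ALS factorizes the sparse user-artist play-count matrix A into two low-rank factor matrices, A ≈ X Yᵀ, where row xᵤ of X holds user u's latent factors and row yᵢ of Y holds artist i's. It alternates between solving for X with Y fixed and for Y with X fixed (each step is an ordinary least-squares problem), and the predicted preference of user u for artist i is the dot product xᵤ · yᵢ. With setImplicitPrefs(true), play counts are treated as confidence weights rather than explicit ratings, following the implicit-feedback formulation of Hu, Koren, and Volinsky.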

1. Data Preparation

  • Mapping of artist id to artist name: artist_data.txt
// Read the raw data
// 2 columns: artistid artist_name
val path1 = "./profiledata_06-May-2005/artist_data.txt"
val rawArtistData = spark.read.textFile(path1)
// Preprocess the data
def transformArtistData(rawArtistData: Dataset[String]): DataFrame = {
   import rawArtistData.sparkSession.implicits._

   rawArtistData.flatMap(line => {
     val (id, name) = line.span(_ != '\t')
     try {
       if (name.nonEmpty)
         Some((id.toInt, name.trim))
       else
         None
     } catch {
       case _: Exception => None
     }
   }).toDF("id", "name")
}
val artistIdDF = transformArtistData(rawArtistData)  
  • Artists and their aliases: artist_alias.txt
// Read the raw data
// 2 columns: badid, goodid
val path2 = "./profiledata_06-May-2005/artist_alias.txt"
val rawAliasData = spark.read.textFile(path2)
// Preprocess the data
def transformAliasData(rawAliasData: Dataset[String]): Dataset[(Int, Int)] = {
  import rawAliasData.sparkSession.implicits._

  rawAliasData.flatMap(line => {
    try {
      // Skip malformed lines (e.g. a missing tab) as well as non-numeric ids
      val Array(artist, alias) = line.split('\t')
      if (artist.nonEmpty)
        Some((artist.toInt, alias.toInt))
      else
        None
    } catch {
      case _: Exception => None
    }
  })
}
val artistAlias = transformAliasData(rawAliasData).collect().toMap
  • User-artist interaction data (how many times each user played each artist): user_artist_data.txt
// Raw data
// 3 columns: userid artistid playcount
val path0 = "./profiledata_06-May-2005/user_artist_data.txt"
val rawUserArtistData = spark.read.textFile(path0)
// Preprocess the data
def transformUserArtistData(spark: SparkSession, rawUserArtistDS: Dataset[String], artistAlias: Map[Int, Int]): DataFrame = {
  import spark.implicits._

  val bArtistAlias = spark.sparkContext.broadcast(artistAlias)

  rawUserArtistDS.map(line => {
    val Array(userId, artistId, count) = line.split(' ').map(_.toInt)
    // Map a variant artist id to its canonical id; keep the original id if there is no alias
    val finalArtistId = bArtistAlias.value.getOrElse(artistId, artistId)
    (userId, finalArtistId, count)
  }).toDF("user", "artist", "count")
}
val allDF = transformUserArtistData(spark, rawUserArtistData, artistAlias)
  • Split into training and test sets
val Array(trainDF, testDF) = allDF.randomSplit(Array(0.9, 0.1))
// Cache the training set; the iterative ALS computation will reuse it many times
trainDF.persist()

2. Training the ALS Model

  • Build the ALS model and start training
// Build the model
val als = new ALS()
  .setSeed(Random.nextLong())
  .setImplicitPrefs(true)
  .setRank(30)
  .setRegParam(0.0001)
  .setAlpha(1.0)
  .setMaxIter(5)
  .setUserCol("user")
  .setItemCol("artist")
  .setRatingCol("count")
  .setPredictionCol("prediction")

// Train the model
val model = als.fit(trainDF)

// Release the cached training data
trainDF.unpersist()
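
To sanity-check the fit (an added aside, not in the original post), you can inspect the learned factor matrices; ALSModel exposes them as DataFrames with id and features columns:
// Peek at a few rows of the learned latent-factor matrices
model.userFactors.show(3, truncate = false)
model.itemFactors.show(3, truncate = false)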

3. Recommending Artists to Users

  • Prepare the users
val someUsers = testDF.select("user").as[Int].take(10).distinct
  • Recommend artists
// Method that builds the recommendations
def recommend(model: ALSModel, userId: Int, howMany: Int, artistIdDF: DataFrame): DataFrame = {
  import artistIdDF.sparkSession.implicits._

  // Pair every known artist with user userId for scoring
  val toRecommend = model.itemFactors
    .select($"id".as("artist"))
    .withColumn("user", lit(userId))

  // Keep the top-scoring artists
  val topRecommendations = model.transform(toRecommend)
    .select("artist", "prediction")
    .orderBy($"prediction".desc)
    .limit(howMany)

  // Collect the ids of the artists to recommend
  val recommendedArtistIds = topRecommendations.select("artist").as[Int].collect()

  artistIdDF.filter($"id" isin (recommendedArtistIds: _*))
}

// Generate the recommendations
someUsers.map { user =>
  // Recommend for this user
  val recommendDF = recommend(model, user, 5, artistIdDF)
  val strings = recommendDF.map(_.mkString("|")).collect()

  (user, strings.toBuffer)
}.foreach(println)
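
As an alternative (not used in the original post), Spark 2.3+ ships a built-in top-N API on ALSModel, recommendForUserSubset, which avoids the manual scoring done in recommend above; a minimal sketch against the same model:
// Built-in top-N recommendations for a subset of users (Spark 2.3+)
val userSubset = testDF.select("user").distinct().limit(10)
// Result columns: user, recommendations = array of (artist, rating) structs
model.recommendForUserSubset(userSubset, 5).show(truncate = false)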

4. Grid Search with AUC Scoring

  • Grid search
// Use a for-comprehension to generate the different hyperparameter configurations
for (rank <- Seq(5, 30);
     regParam <- Seq(4.0, 0.0001);
     alpha <- Seq(1.0, 40.0))
  yield {
    // Build and train the model
    ……
    // Return the result
  }
  • AUC scoring (this part of the code comes from the book's GitHub repository)
    Note: for background on understanding AUC, a separate reference on ROC curves is helpful.
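    In short (a summary of what the code below computes, not a quote from the book): for each user, every held-out "positive" artist is paired with randomly sampled "negative" artists the user never played; the per-user AUC is the fraction of pairs in which the model scores the positive above the negative, and the function returns the mean over all users:

    AUC(u) = correct(u) / total(u),    meanAUC = (1 / |U|) * Σᵤ AUC(u)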
/**
  * Compute the AUC score
  *
  * @param positiveData    held-out test data (the "positives")
  * @param bAllArtistIDs   all artist ids (broadcast)
  * @param predictFunction typically model.transform
  * @return a score between 0 and 1
  */
def areaUnderCurve(positiveData: DataFrame,
                   bAllArtistIDs: Broadcast[Array[Int]],
                   predictFunction: DataFrame => DataFrame): Double = {
  import positiveData.sparkSession.implicits._

  // What this actually computes is AUC, per user. The result is actually something
  // that might be called "mean AUC".

  // Take held-out data as the "positive".
  // Make predictions for each of them, including a numeric score
  val positivePredictions = predictFunction(positiveData.select("user", "artist")).
    withColumnRenamed("prediction", "positivePrediction")

  // BinaryClassificationMetrics.areaUnderROC is not used here since there are really lots of
  // small AUC problems, and it would be inefficient, when a direct computation is available.

  // Create a set of "negative" products for each user. These are randomly chosen
  // from among all of the other artists, excluding those that are "positive" for the user.
  val negativeData = positiveData.select("user", "artist").as[(Int, Int)].
    groupByKey { case (user, _) => user }.
    flatMapGroups { case (userID, userIDAndPosArtistIDs) =>
      val random = new Random()
      val posItemIDSet = userIDAndPosArtistIDs.map { case (_, artist) => artist }.toSet
      val negative = new ArrayBuffer[Int]()
      val allArtistIDs = bAllArtistIDs.value
      var i = 0
      // Make at most one pass over all artists to avoid an infinite loop.
      // Also stop when number of negative equals positive set size
      while (i < allArtistIDs.length && negative.size < posItemIDSet.size) {
        val artistID = allArtistIDs(random.nextInt(allArtistIDs.length))
        // Only add new distinct IDs
        if (!posItemIDSet.contains(artistID)) {
          negative += artistID
        }
        i += 1
      }
      // Return the set with user ID added back
      negative.map(artistID => (userID, artistID))
    }.toDF("user", "artist")

  // Make predictions on the rest:
  val negativePredictions = predictFunction(negativeData).
    withColumnRenamed("prediction", "negativePrediction")

  // Join positive predictions to negative predictions by user, only.
  // This will result in a row for every possible pairing of positive and negative
  // predictions within each user.
  val joinedPredictions = positivePredictions.join(negativePredictions, "user").
    select("user", "positivePrediction", "negativePrediction").cache()

  // Count the number of pairs per user
  val allCounts = joinedPredictions.
    groupBy("user").agg(count(lit("1")).as("total")).
    select("user", "total")
  // Count the number of correctly ordered pairs per user
  val correctCounts = joinedPredictions.
    filter($"positivePrediction" > $"negativePrediction").
    groupBy("user").agg(count("user").as("correct")).
    select("user", "correct")

  // Combine these, compute their ratio, and average over all users
  val meanAUC = allCounts.join(correctCounts, Seq("user"), "left_outer").
    select($"user", (coalesce($"correct", lit(0)) / $"total").as("auc")).
    agg(mean("auc")).
    as[Double].first()

  joinedPredictions.unpersist()

  meanAUC
}
  • Combining grid search with AUC scoring
// All artist ids, used for AUC scoring
val allArtistIds = allDF.select("artist").as[Int].distinct().collect()
val bAllArtistIds = spark.sparkContext.broadcast(allArtistIds)
 
// Grid search
val evaluations =
  // Use a for-comprehension to generate the different hyperparameter configurations
  for (rank <- Seq(5, 30);
       regParam <- Seq(4.0, 0.0001);
       alpha <- Seq(1.0, 40.0))
    yield {
      // Build the model with this configuration
      val als = new ALS()
        .setSeed(Random.nextLong())
        .setImplicitPrefs(true)
        .setRank(rank)
        .setRegParam(regParam)
        .setAlpha(alpha)
        .setMaxIter(5)
        .setUserCol("user")
        .setItemCol("artist")
        .setRatingCol("count")
        .setPredictionCol("prediction")
      // Train the model
      val model = als.fit(trainDF)

      // Compute the AUC score
      val auc = areaUnderCurve(testDF, bAllArtistIds, model.transform)

      // Release resources
      model.userFactors.unpersist()
      model.itemFactors.unpersist()

      (auc, (rank, regParam, alpha))
    }

evaluations.sorted.reverse.foreach(println)
  • Sample results
(0.9039124436650243,(30,1.0E-4,1.0))
(0.9034269912559532,(5,1.0E-4,1.0))
(0.9032449249724098,(30,1.0E-4,40.0))
(0.9028574761056848,(30,4.0,1.0))
(0.9019663966459797,(5,1.0E-4,40.0))
(0.9017698705975027,(30,4.0,40.0))
(0.9015351771563618,(5,4.0,40.0))
(0.9011632951254114,(5,4.0,1.0))
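
The best configuration here is rank = 30, regParam = 1.0E-4, alpha = 1.0, which matches the hyperparameters already used in section 2. As a closing sketch (an addition mirroring the code above, not from the original post), the final model would be retrained with those values:
// Retrain on the training set with the winning hyperparameters
val bestModel = new ALS()
  .setSeed(Random.nextLong())
  .setImplicitPrefs(true)
  .setRank(30)
  .setRegParam(0.0001)
  .setAlpha(1.0)
  .setMaxIter(5)
  .setUserCol("user")
  .setItemCol("artist")
  .setRatingCol("count")
  .setPredictionCol("prediction")
  .fit(trainDF)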

5. Complete Code

package com.skey.analytics.ch03

import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

import scala.collection.mutable.ArrayBuffer
import scala.util.Random

/**
  * Chapter 3 - Music recommendation
  *
  * @author ALion
  * @version 2019/2/24 10:53
  */
object Recommender {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("RecommenderApp")
    val spark = new SparkSession.Builder()
      .config(conf)
      .enableHiveSupport()
      .getOrCreate()

    import spark.implicits._

    //artist_data.txt
    //2 columns: artistid artist_name
    val path1 = "./profiledata_06-May-2005/artist_data.txt"
    val rawArtistData = spark.read.textFile(path1)
    val artistIdDF = transformArtistData(rawArtistData)

    //artist_alias.txt
    //2 columns: badid, goodid
    val path2 = "./profiledata_06-May-2005/artist_alias.txt"
    val rawAliasData = spark.read.textFile(path2)
    val artistAlias = transformAliasData(rawAliasData).collect().toMap

    //user_artist_data.txt
    //3 columns: userid artistid playcount
    val path0 = "./profiledata_06-May-2005/user_artist_data.txt"
    val rawUserArtistData = spark.read.textFile(path0)

    val allDF = transformUserArtistData(spark, rawUserArtistData, artistAlias)
    allDF.persist()

    // Split into training and test sets
    val Array(trainDF, testDF) = allDF.randomSplit(Array(0.9, 0.1))
    trainDF.persist()

    //    // Build the model
    //    val als = new ALS()
    //      .setSeed(Random.nextLong())
    //      .setImplicitPrefs(true)
    //      .setRank(30)
    //      .setRegParam(0.0001)
    //      .setAlpha(1.0)
    //      .setMaxIter(5)
    //      .setUserCol("user")
    //      .setItemCol("artist")
    //      .setRatingCol("count")
    //      .setPredictionCol("prediction")
    //
    //    // Train the model
    //    val model = als.fit(trainDF)
    //
    //    // Release the cached training data
    //    trainDF.unpersist()
    //
    //    // Generate the recommendations
    //
    //    // Prepare the users to recommend for
    //    val someUsers = testDF.select("user").as[Int].take(10).distinct
    //
    //    someUsers.map { user =>
    //      // Recommend for this user
    //      val recommendDF = recommend(model, user, 5, artistIdDF)
    //      val strings = recommendDF.map(_.mkString("|")).collect()
    //
    //      (user, strings.toBuffer)
    //    }.foreach(println)

    // All artist ids, used for AUC scoring
    val allArtistIds = allDF.select("artist").as[Int].distinct().collect()
    val bAllArtistIds = spark.sparkContext.broadcast(allArtistIds)

    // Grid search
    val evaluations =
    // Use a for-comprehension to generate the different hyperparameter configurations
      for (rank <- Seq(5, 30);
           regParam <- Seq(4.0, 0.0001);
           alpha <- Seq(1.0, 40.0))
        yield {
          // Build the model with this configuration
          val als = new ALS()
            .setSeed(Random.nextLong())
            .setImplicitPrefs(true)
            .setRank(rank)
            .setRegParam(regParam)
            .setAlpha(alpha)
            .setMaxIter(5)
            .setUserCol("user")
            .setItemCol("artist")
            .setRatingCol("count")
            .setPredictionCol("prediction")

          val model = als.fit(trainDF)

          val auc = areaUnderCurve(testDF, bAllArtistIds, model.transform)

          // Release resources
          model.userFactors.unpersist()
          model.itemFactors.unpersist()

          (auc, (rank, regParam, alpha))
        }

    evaluations.sorted.reverse.foreach(println)
    //(0.9039124436650243,(30,1.0E-4,1.0))
    //(0.9034269912559532,(5,1.0E-4,1.0))
    //(0.9032449249724098,(30,1.0E-4,40.0))
    //(0.9028574761056848,(30,4.0,1.0))
    //(0.9019663966459797,(5,1.0E-4,40.0))
    //(0.9017698705975027,(30,4.0,40.0))
    //(0.9015351771563618,(5,4.0,40.0))
    //(0.9011632951254114,(5,4.0,1.0))

    spark.stop()
  }

  /**
    * Merge the data into one overall dataset
    *
    * @param spark           SparkSession
    * @param rawUserArtistDS user-artist interaction dataset
    * @param artistAlias     artist alias ids, used to canonicalize artist ids
    * @return
    */
  def transformUserArtistData(spark: SparkSession, rawUserArtistDS: Dataset[String], artistAlias: Map[Int, Int]): DataFrame = {
    import spark.implicits._

    val bArtistAlias = spark.sparkContext.broadcast(artistAlias)

    rawUserArtistDS.map(line => {
      val Array(userId, artistId, count) = line.split(' ').map(_.toInt)
      val finalArtistId = bArtistAlias.value.getOrElse(artistId, artistId)
      (userId, finalArtistId, count)
    }).toDF("user", "artist", "count")
  }

  def transformArtistData(rawArtistData: Dataset[String]): DataFrame = {
    import rawArtistData.sparkSession.implicits._

    rawArtistData.flatMap(line => {
      val (id, name) = line.span(_ != '\t')
      try {
        if (name.nonEmpty)
          Some((id.toInt, name.trim))
        else
          None
      } catch {
        case _: Exception => None
      }
    }).toDF("id", "name")
  }

  def transformAliasData(rawAliasData: Dataset[String]): Dataset[(Int, Int)] = {
    import rawAliasData.sparkSession.implicits._

    rawAliasData.flatMap(line => {
      try {
        // Skip malformed lines (e.g. a missing tab) as well as non-numeric ids
        val Array(artist, alias) = line.split('\t')
        if (artist.nonEmpty)
          Some((artist.toInt, alias.toInt))
        else
          None
      } catch {
        case _: Exception => None
      }
    })
  }

  /**
    * Recommend artists for a given user
    *
    * @param model      the trained ALS model
    * @param userId     the user id
    * @param howMany    how many artists to recommend
    * @param artistIdDF mapping of artist id to artist name
    * @return
    */
  def recommend(model: ALSModel, userId: Int, howMany: Int, artistIdDF: DataFrame): DataFrame = {
    import artistIdDF.sparkSession.implicits._

    val toRecommend = model.itemFactors
      .select($"id".as("artist"))
      .withColumn("user", lit(userId))

    val topRecommendations = model.transform(toRecommend)
      .select("artist", "prediction")
      .orderBy($"prediction".desc)
      .limit(howMany)

    // Collect the ids of the artists to recommend
    val recommendedArtistIds = topRecommendations.select("artist").as[Int].collect()

    artistIdDF.filter($"id" isin (recommendedArtistIds: _*))
  }

  /**
    * Compute the AUC score
    *
    * @param positiveData    held-out test data (the "positives")
    * @param bAllArtistIDs   all artist ids (broadcast)
    * @param predictFunction typically model.transform
    * @return a score between 0 and 1
    */
  def areaUnderCurve(positiveData: DataFrame,
                     bAllArtistIDs: Broadcast[Array[Int]],
                     predictFunction: DataFrame => DataFrame): Double = {
    import positiveData.sparkSession.implicits._

    // What this actually computes is AUC, per user. The result is actually something
    // that might be called "mean AUC".

    // Take held-out data as the "positive".
    // Make predictions for each of them, including a numeric score
    val positivePredictions = predictFunction(positiveData.select("user", "artist")).
      withColumnRenamed("prediction", "positivePrediction")

    // BinaryClassificationMetrics.areaUnderROC is not used here since there are really lots of
    // small AUC problems, and it would be inefficient, when a direct computation is available.

    // Create a set of "negative" products for each user. These are randomly chosen
    // from among all of the other artists, excluding those that are "positive" for the user.
    val negativeData = positiveData.select("user", "artist").as[(Int, Int)].
      groupByKey { case (user, _) => user }.
      flatMapGroups { case (userID, userIDAndPosArtistIDs) =>
        val random = new Random()
        val posItemIDSet = userIDAndPosArtistIDs.map { case (_, artist) => artist }.toSet
        val negative = new ArrayBuffer[Int]()
        val allArtistIDs = bAllArtistIDs.value
        var i = 0
        // Make at most one pass over all artists to avoid an infinite loop.
        // Also stop when number of negative equals positive set size
        while (i < allArtistIDs.length && negative.size < posItemIDSet.size) {
          val artistID = allArtistIDs(random.nextInt(allArtistIDs.length))
          // Only add new distinct IDs
          if (!posItemIDSet.contains(artistID)) {
            negative += artistID
          }
          i += 1
        }
        // Return the set with user ID added back
        negative.map(artistID => (userID, artistID))
      }.toDF("user", "artist")

    // Make predictions on the rest:
    val negativePredictions = predictFunction(negativeData).
      withColumnRenamed("prediction", "negativePrediction")

    // Join positive predictions to negative predictions by user, only.
    // This will result in a row for every possible pairing of positive and negative
    // predictions within each user.
    val joinedPredictions = positivePredictions.join(negativePredictions, "user").
      select("user", "positivePrediction", "negativePrediction").cache()

    // Count the number of pairs per user
    val allCounts = joinedPredictions.
      groupBy("user").agg(count(lit("1")).as("total")).
      select("user", "total")
    // Count the number of correctly ordered pairs per user
    val correctCounts = joinedPredictions.
      filter($"positivePrediction" > $"negativePrediction").
      groupBy("user").agg(count("user").as("correct")).
      select("user", "correct")

    // Combine these, compute their ratio, and average over all users
    val meanAUC = allCounts.join(correctCounts, Seq("user"), "left_outer").
      select($"user", (coalesce($"correct", lit(0)) / $"total").as("auc")).
      agg(mean("auc")).
      as[Double].first()

    joinedPredictions.unpersist()

    meanAUC
  }

}

Reposted from blog.csdn.net/alionsss/article/details/90554974