Contents:
1. Case analysis
2. Code examples
I. Case Analysis
Project: find the hottest products (the top-ranked items) on JD's (京东) search platform.
Metadata fields (one log record per line): date, userID, item, city, device
Overall approach: use Spark SQL and Spark Core together.
Step 1: raw ETL. Filter the raw data to produce the target data set; in real enterprise scenarios the filter conditions can be very complex (broadcast them), and the filtering is done with RDD operators such as filter;
Step 2: run queries with the specified conditions against the filtered target data; the query conditions can also be very complex (broadcast them), again using the RDD filter operator (a short sketch of steps 1 and 2 follows);
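A minimal sketch of steps 1 and 2, assuming the tab-separated log produced by the generator in section II and a hypothetical filter condition (device "iphone"); the names used here (sc, rawLogs, targetLogs) are illustrative and are reused in the later sketches:
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("TheHottest").setMaster("local[4]"))
// Step 1: raw ETL, load the log and keep only well-formed records
val rawLogs = sc.textFile("/home/hadoop/IdeaScala/SparkSQLUserlogsHottest.log")
val cleanLogs = rawLogs.filter(_.split("\t").length == 5)
// Step 2: broadcast the (possibly complex) query condition and filter again
val deviceFilter = sc.broadcast("iphone")
val targetLogs = cleanLogs.filter(_.contains(deviceFilter.value))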
Step 3: since items are divided into categories, before producing the final result we first compute the UV for each item (the PV of users' item visits could be computed the same way). To compute the UV per item we must build a key-value RDD, for example keyed as (date#item, userID), so that groupByKey can be applied; after calling groupByKey we de-duplicate the users and compute the UV of every item for every day, giving results of the form (date#item, UV) (see the sketch below);
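Continuing the sketch above, step 3 builds (date#item, userID) pairs, groups them by key and de-duplicates the users to obtain the daily UV per item (field positions follow the date, userID, item, city, device layout):
// Step 3: UV = number of distinct users per (date, item)
val dateItemUser = targetLogs.map { line =>
  val fields = line.split("\t")
  (fields(0) + "#" + fields(2), fields(1)) // key: date#item, value: userID
}
val uvPerDateItem = dateItemUser.groupByKey().map { case (dateItem, users) =>
  (dateItem, users.toSet.size) // de-duplicated user count = UV
}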
Step 4: use the window function row_number to pick the daily top-5 items by UV: row_number() OVER (PARTITION BY date ORDER BY UV DESC) rank. This produces a DataFrame whose rows hold date, item and UV (a Spark SQL sketch follows);
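A sketch of step 4, assuming an existing SparkSession named spark; the (date, item, uv) records are turned into a DataFrame and ranked per day with row_number (view and column names are illustrative):
import spark.implicits._

val uvDF = uvPerDateItem.map { case (dateItem, uv) =>
  val Array(date, item) = dateItem.split("#")
  (date, item, uv)
}.toDF("date", "item", "uv")
uvDF.createOrReplaceTempView("item_uv")

// rank the items of each day by UV, highest first
val rankedDF = spark.sql(
  """SELECT date, item, uv,
    |       row_number() OVER (PARTITION BY date ORDER BY uv DESC) rank
    |FROM item_uv""".stripMargin)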
Step 5: convert the DataFrame back to an RDD, group by date and extract each day's top-5 hottest search items;
Step 6: swap key and value, then call sortByKey to rank by click heat;
Step 7: swap key and value back again, giving target data in the (date#item, UV) format (a sketch of steps 5 to 7 follows);
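A sketch of steps 5 to 7, still using the names from the sketches above: keep the daily top 5 from the window result, swap key and value so that sortByKey orders by heat, then swap back to the (date#item, UV) target format:
// Step 5: keep the daily top-5 rows and go back to an RDD
val top5RDD = rankedDF.where("rank <= 5").rdd.map { row =>
  (row.getAs[String]("date") + "#" + row.getAs[String]("item"), row.getAs[Int]("uv"))
}
// Step 6: swap to (uv, date#item) and sort by heat, descending
val sortedByHeat = top5RDD.map(_.swap).sortByKey(ascending = false)
// Step 7: swap back to the (date#item, uv) target format
val result = sortedByHeat.map(_.swap)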
Step 8: write the results from the RDD directly into a production DB such as MySQL, and use Java EE or similar server-side technology to visualize them, so that marketing staff, warehouse scheduling systems, courier systems and warehouse decision makers can use the data to create value (a JDBC write sketch follows below);
The results can also be stored in Hive, with Java EE applications accessing Hive through JDBC;
They can also be stored in Spark SQL and accessed from Java EE applications through the Thrift server;
And for events such as Double 11 (双十一), Redis is usually the first choice, since it allows response times comparable to a flash-sale (seckill) system.
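For step 8, a minimal sketch of writing the result into MySQL with plain JDBC inside foreachPartition; the connection URL, the credentials and the hot_items table are assumptions, and the MySQL JDBC driver must be on the classpath (writing to Hive, the Thrift server or Redis would follow the same pattern):
import java.sql.DriverManager

result.foreachPartition { partition =>
  // one connection per partition, not per record (hypothetical database and table)
  val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "user", "password")
  val stmt = conn.prepareStatement("INSERT INTO hot_items (date_item, uv) VALUES (?, ?)")
  partition.foreach { case (dateItem, uv) =>
    stmt.setString(1, dateItem)
    stmt.setInt(2, uv)
    stmt.executeUpdate()
  }
  stmt.close()
  conn.close()
}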
II. Code Examples
1. Code that generates the log data
package SparkSQL
import java.io.{File, FileWriter, PrintWriter}
import java.text.SimpleDateFormat
import java.util.{Calendar, Random}
import java.text.ParseException
/**
* FileName: SparkSQLUserLogsManually
* Author: hadoop
* Email: [email protected]
* Date: 18-11-24 9:48 PM
* Description: generate simulated user search logs for the hottest-item case
*
*/
object SparkSQLUserLogsManually {
def main (args: Array[String]): Unit = {
// Generate 10,000 log records
val numberItems = 10000
// Directory where the log file is written
val pathPath = "/home/hadoop/IdeaScala/"
ganerateUserLogs(numberItems,pathPath)
}
/**
* Write the generated log data to the given file.
* @param pathPath directory where the data is stored
* @param fileName name of the log file
* @param strUserLog the log content to write
*/
def writeLog (pathPath: String, fileName: String, strUserLog: String) = {
var fw:FileWriter = null
var out:PrintWriter = null
try{
val writeFile = new File(pathPath+fileName)
if(!writeFile.exists()){
writeFile.createNewFile()
}else{
writeFile.delete()
}
fw = new FileWriter(writeFile,true)
out = new PrintWriter(fw)
out.print(strUserLog)
}catch{
case e:Exception=>e.printStackTrace()
}finally {
try{
if (out!=null)
out.close
if (fw != null)
fw.close
}catch{
case e:Exception=>e.printStackTrace()
}
}
}
/**
* Return a date string `step` days away from `value` (or from today when value is null).
* @param value base date string; may be null
* @param formate date format pattern, e.g. "yyyy-MM-dd"
* @param step number of days to add (a negative step goes back in time)
* @return the formatted date string
*/
def getCountDate (value: String, formate: String, step: Int) = {
val sdf = new SimpleDateFormat(formate)
val cal = Calendar.getInstance()
if(value != null){
try{
cal.setTime(sdf.parse(value))
}catch{
case e:ParseException=>e.printStackTrace()
}
}
cal.add(Calendar.DAY_OF_MONTH,step)
sdf.format(cal.getTime)
}
def ganerateUserID () = {
val random = new Random
val userID = Array[String](
"98415b9c-f3d4-45c3-bc7f-dce3126c6c0b", "7371b4bd-8535-461f-a5e2-c4814b2151e1",
"49852bfa-a662-4060-bf68-0dddde5feea1", "8768f089-f736-4346-a83d-e23fe05b0ecd",
"a76ff021-049c-4a1a-8372-02f9c51261d5", "8d5dc011-cbe2-4332-99cd-a1848ddfd65d",
"a2bccbdf-f0e9-489c-8513-011644cb5cf7", "89c79413-a7d1-462c-ab07-01f0835696f7",
"8d525daa-3697-455e-8f02-ab086cda7851", "c6f57c89-9871-4a92-9cbe-a2d76cd79cd0",
"19951134-97e1-4f62-8d5c-134077d1f955", "3202a063-4ebf-4f3f-a4b7-5e542307d726",
"40a0d872-45cc-46bc-b257-64ad898df281", "b891a528-4b5e-4ba7-949c-2a32cb5a75ec",
"0d46d52b-75a2-4df2-b363-43874c9503a2", "c1e4b8cf-0116-46bf-8dc9-55eb074ad315",
"6fd24ac6-1bb0-4ea6-a084-52cc22e9be42", "5f8780af-93e8-4907-9794-f8c960e87d34",
"692b1947-8b2e-45e4-8051-0319b7f0e438", "dde46f46-ff48-4763-9c50-377834ce7137")
userID(random.nextInt(20))
}
def ganerateItemID()={
val random = new Random
val itemID = Array("小米","休闲鞋","洗衣机","显示器","显卡","洗衣液","行车记录仪")
itemID(random.nextInt(7))
}
def ganerateCityID()={
val random = new Random
val CityNames = Array("上海", "北京", "深圳", "广州", "纽约", "伦敦", "东京", "首尔", "莫斯科", "巴黎")
CityNames(random.nextInt(10))
}
def ganerateDevice()={
val random = new Random
val Devices = Array("android","iphone","ipad","PC")
Devices(random.nextInt(4))
}
def ganerateUserLogs(numberItems: Int, pathPath: String): Unit = {
val userLogBuffer = new StringBuffer()
val fileName = "SparkSQLUserlogsHottest.log"
val formate = "yyyy-MM-dd"
for (i <- 0 until numberItems){
val date = getCountDate(null,formate,-1)
val userID = ganerateUserID()
val itemID = ganerateItemID()
val cityID = ganerateCityID()
val device = ganerateDevice()
userLogBuffer.append(date+"\t"
+userID +"\t"+itemID+"\t"+cityID+"\t"+device+"\n")
}
println(userLogBuffer.toString)
// write the whole buffer once, instead of deleting and rewriting the file on every iteration
writeLog(pathPath,fileName,userLogBuffer.toString)
}
}
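Each generated line is tab-separated in the order date, userID, item, city, device; a record looks roughly like the following (the values are random, so this line is only illustrative):
2018-11-23	98415b9c-f3d4-45c3-bc7f-dce3126c6c0b	洗衣机	上海	iphone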
2. Analyzing the log (ranking each user's most-clicked items)
package SparkSQL
import java.util
import scala.collection.JavaConverters._
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
object SparkSQLUserLogsHottest {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("SparkSQLUserLogsHottest").setMaster("local[4]")
val sc = new SparkContext(conf)
val spark = SparkSession.builder().appName("SparkSQLUserLogsHottest").master("local[4]").config("spark.sql.warehouse.dir", "/home/hadoop/IdeaScala/LearningBigdata/spark-warehouse").getOrCreate()
val path = "/home/hadoop/IdeaScala/SparkSQLUserlogsHottest.log"
val file = sc.textFile(path)
val devicebd = "iphone"
val broadcastdevice = sc.broadcast(devicebd)
val lines = file.filter(line=>{
line.contains(broadcastdevice.value)
})
val listRow = lines.collect()
for(row <- listRow){
println(row)
}
val pairs = lines.map(line=>{
val splited = line.split("\t")
val one = 1
// key: date#item#userID, value: 1
val dateItemUserID = splited(0)+"#"+splited(2)+"#"+splited(1)
(dateItemUserID,one)
})
val pairRow = pairs.collect()
for(pair <- pairRow){
println(pair)
}
val reduceedPairs = pairs.reduceByKey(_+_)
val reduceedRow = reduceedPairs.collect()
val peopleInformation:util.ArrayList[String] = new util.ArrayList[String]()
for(eachRow <- reduceedRow) {
println(eachRow)
val rowSplited = eachRow._1.split("#")
val userID = rowSplited(2)
val itemID = rowSplited(1)
val dateID = rowSplited(0)
val jsonzip = "{\"Date\":\"" + dateID + "\", \"UserID\":\"" + userID + "\", \"Username\":\"" + userID + "\", \"Item\":\"" + itemID + "\", \"count\":" + eachRow._2 + "}"
peopleInformation.add(jsonzip)
}
for (row <- peopleInformation.toArray()){
println(row)
}
// build an RDD[String] of JSON records (convert the Java list to a Scala Seq first)
val peopleInformationRDD = sc.parallelize(peopleInformation.asScala)
// pass the RDD of JSON strings itself, not its toString, to spark.read.json
val peopleInformationDS = spark.read.json(peopleInformationRDD)
peopleInformationDS.createOrReplaceTempView("peopleInformations")
val sqlText =
"""SELECT UserID, Item, count
|FROM (
|  SELECT UserID, Item, count,
|         row_number() OVER (PARTITION BY UserID ORDER BY count DESC) rank
|  FROM peopleInformations
|) sub_peopleInformations
|WHERE rank <= 3""".stripMargin
val excellentNameAgeDS = spark.sql(sqlText)
excellentNameAgeDS.show()
excellentNameAgeDS.write.format("json").save("Result")
}
}