0904 - Top 3 Products per Region

Requirement 6: Top 3 Products per Region

6.1 Requirement Overview

Compute the top 3 hottest products in each region. The popularity metric is the number of times a product was clicked; in the user_visit_action table, click_product_id identifies the clicked product.

  • Tables used
  1. User visit action table (user_visit_action)
  2. Product information table (product_info)
  3. City information table
  • Result data: one row per (area, product) in the top 3, written to the area_top3_product table
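The schemas were shown as screenshots in the original post. The two input-side case classes used by the code in 6.4 can be reconstructed from how they are used; a minimal sketch (field sets are assumptions inferred from the snippets below):

    // Reconstructed from the snippets in 6.4; exact definitions are assumptions.
    case class CityClickProduct(city_id: Long, click_product_id: Long)
    case class CityAreaInfo(city_id: Long, city_name: String, area: String)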

6.2 Flow at a Glance

A flow diagram in the original post summarizes the pipeline; the steps are spelled out in 6.3.

6.3 Detailed Flow

  1. From the user visit action table, take the city id and the clicked product id, keyed by city id for the join with the city information
  2. After the join, reshape the records and fix each field's type
  3. Create the base table (city_id, city_name, area, product_id)
  4. GROUP BY area, product_id to get each region's per-product click total, aggregating the city info with custom functions
  5. Enrich the table with additional fields (product status, area level, product name)
  6. Window function to take the top 3 per group
  7. Wrap in a case class
  8. Write to the database
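Stitched together, the driver for this requirement roughly chains the snippets shown in 6.4; an outline (method, view, and table names follow those snippets):

    // Step 1: source RDDs
    val cityId2PidRDD = getCityAndProductInfo(sparkSession, taskParams)
    val cityId2AreaInfoRDD = getCityAreaInfo(sparkSession)
    // Step 2: join -> temp view tmp_area_basic_info
    // Step 3: register UDF/UDAF, group by -> temp view tmp_area_click_count
    // Step 4: join product_info -> temp view tmp_area_count_product_info
    // Step 5: row_number() window -> top3ProductDF
    // Steps 6-7: map rows to AreaTop3Product and append to the database via JDBC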


6.4 Code Implementation

6.4.1 Fetching the Data

        // 1. Fetch the data
        // RDD[(cityId, pid)]
        val cityId2PidRDD: RDD[(Long, Long)] = getCityAndProductInfo(sparkSession, taskParams)
        // RDD[(cityId, CityAreaInfo)]
        val cityId2AreaInfoRDD = getCityAreaInfo(sparkSession)
    def getCityAndProductInfo(sparkSession: SparkSession, taskParams: JSONObject): RDD[(Long, Long)] = {
        val startDate = ParamUtils.getParam(taskParams, Constants.PARAM_START_DATE)
        val endDate = ParamUtils.getParam(taskParams, Constants.PARAM_END_DATE)

        // Only fetch actions that contain a click:
        // each row returned represents one click event.
        val sql = "select city_id, click_product_id from user_visit_action where date>='" + startDate +
            "' and date<='" + endDate + "' and click_product_id != -1"

        import sparkSession.implicits._
        sparkSession.sql(sql)
            .as[CityClickProduct]
            .rdd
            .map(ccp => (ccp.city_id, ccp.click_product_id))
    }
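One caveat about the query above: the WHERE clause is assembled by string concatenation, which breaks if a parameter ever contains a quote. The same filter can also be written with the DataFrame API (a sketch, assuming user_visit_action is available as a table):

    import sparkSession.implicits._
    // Same predicate as the SQL string: date range plus "a click happened".
    val clickActions = sparkSession.table("user_visit_action")
        .where($"date".between(startDate, endDate) && $"click_product_id" =!= -1)
        .select($"city_id", $"click_product_id")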

    // Hardcoded city dimension: (cityId, cityName, area).
    def getCityAreaInfo(sparkSession: SparkSession): RDD[(Long, CityAreaInfo)] = {
        val cityAreaInfoArray = Array(
            (0L, "北京", "华北"),
            (1L, "上海", "华东"),
            (2L, "南京", "华东"),
            (3L, "广州", "华南"),
            (4L, "三亚", "华南"),
            (5L, "武汉", "华中"),
            (6L, "长沙", "华中"),
            (7L, "西安", "西北"),
            (8L, "成都", "西南"),
            (9L, "哈尔滨", "东北")
        )
        val cityId2AreaInfoRDD: RDD[(Long, CityAreaInfo)] = sparkSession.sparkContext
            .makeRDD(cityAreaInfoArray)
            .map {
                case (cityId, cityName, area) =>
                    (cityId, CityAreaInfo(cityId, cityName, area))
            }
        cityId2AreaInfoRDD
    }
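The city dimension is hardcoded here for the demo. Since 6.1 lists a city information table, a production job would read it instead; a sketch assuming that table is named city_info with columns city_id, city_name, and area (names are assumptions):

    def getCityAreaInfoFromTable(sparkSession: SparkSession): RDD[(Long, CityAreaInfo)] = {
        sparkSession.sql("select city_id, city_name, area from city_info")
            .rdd
            .map { row =>
                val cityId = row.getAs[Long]("city_id")
                (cityId, CityAreaInfo(cityId, row.getAs[String]("city_name"), row.getAs[String]("area")))
            }
    }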

6.4.2 Join into the Base Table

        // 2. Join to build the base table
        val areaPidBasicInfoRDD: RDD[(Long, String, String, Long)] = cityId2PidRDD
            .join(cityId2AreaInfoRDD)
            .map {
                case (cityId, (pid, areaInfo)) =>
                    (cityId, areaInfo.city_name, areaInfo.area, pid)
            }
        import sparkSession.implicits._
        areaPidBasicInfoRDD.toDF("city_id", "city_name", "area", "pid")
            .createOrReplaceTempView("tmp_area_basic_info")

6.4.3 Click Totals per Region and Product, with City Info Aggregated

        // 3. group by: per-region, per-product click totals, with the city info aggregated in
        //    DataFrame: createOrReplaceTempView("tmp_area_click_count")
        // 3.1 Custom UDF concat_long_string: concatenate city_id and city_name
        sparkSession.udf.register("concat_long_string", (v1: Long, v2: String, split: String) => {
            v1 + split + v2
        })
        // 3.2 Custom UDAF group_concat_distinct: aggregate the city info within a region
        sparkSession.udf.register("group_concat_distinct", new GroupConcatDistinct)
        // 3.3 group by
        val sql = "select area, pid, count(*) click_count, " +
            "group_concat_distinct(concat_long_string(city_id, city_name, ':')) city_infos " +
            "from tmp_area_basic_info group by area, pid"
        sparkSession.sql(sql).createOrReplaceTempView("tmp_area_click_count")
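Incidentally, the built-in concat_ws could replace the custom concat_long_string UDF for the id:name pairing; it is only the distinct, comma-joined aggregation that needs the custom UDAF below:

    // Equivalent grouping query using the built-in concat_ws instead of the UDF.
    val sqlAlt = "select area, pid, count(*) click_count, " +
        "group_concat_distinct(concat_ws(':', city_id, city_name)) city_infos " +
        "from tmp_area_basic_info group by area, pid"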
  • The UDAF:
package com.lz.area

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, StringType, StructType}

class GroupConcatDistinct extends UserDefinedAggregateFunction {
    override def inputSchema: StructType = {
        new StructType().add("cityInfo", StringType)
    }

    override def bufferSchema: StructType = {
        new StructType().add("bufferCityInfo", StringType)
    }

    override def dataType: DataType = StringType

    override def deterministic: Boolean = true

    override def initialize(buffer: MutableAggregationBuffer): Unit = {
        buffer(0) = ""
    }

    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
        var bufferCityInfo = buffer.getString(0)
        val cityInfo = input.getString(0)

        // Substring-based dedup: assumes no "cityId:cityName" token is a
        // substring of another, which holds for this city set.
        if (!bufferCityInfo.contains(cityInfo)) {
            if ("".equals(bufferCityInfo)) {
                bufferCityInfo += cityInfo
            } else {
                bufferCityInfo += "," + cityInfo
            }
            buffer.update(0, bufferCityInfo)
        }
    }

    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
        // bufferCityInfo1: cityId1:cityName1, cityId2:cityName2
        var bufferCityInfo1 = buffer1.getString(0)
        // bufferCityInfo2: cityId1:cityName1, cityId2:cityName2
        val bufferCityInfo2 = buffer2.getString(0)

        for (cityInfo <- bufferCityInfo2.split(",")) {
            if (!bufferCityInfo1.contains(cityInfo)) {
                if ("".equals(bufferCityInfo1)) {
                    bufferCityInfo1 += cityInfo
                } else {
                    bufferCityInfo1 += "," + cityInfo
                }
            }
        }
        buffer1.update(0, bufferCityInfo1)
    }

    override def evaluate(buffer: Row): Any = {
        buffer.getString(0)
    }
}
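GroupConcatDistinct extends UserDefinedAggregateFunction, which works on Spark 2.x but has been deprecated since Spark 3.0 in favor of the typed Aggregator registered through functions.udaf. A sketch of the same aggregation for Spark 3.x (an alternative, not the original author's code):

    import org.apache.spark.sql.{Encoder, Encoders, functions}
    import org.apache.spark.sql.expressions.Aggregator

    // Collect distinct "cityId:cityName" tokens, then join them with commas.
    object GroupConcatDistinctAgg extends Aggregator[String, Set[String], String] {
        override def zero: Set[String] = Set.empty
        override def reduce(buf: Set[String], cityInfo: String): Set[String] = buf + cityInfo
        override def merge(b1: Set[String], b2: Set[String]): Set[String] = b1 ++ b2
        override def finish(buf: Set[String]): String = buf.mkString(",")
        override def bufferEncoder: Encoder[Set[String]] = Encoders.kryo[Set[String]]
        override def outputEncoder: Encoder[String] = Encoders.STRING
    }

    // Registration then becomes:
    // sparkSession.udf.register("group_concat_distinct", functions.udaf(GroupConcatDistinctAgg))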

6.4.4 Adding More Fields

        // 4. Enrich tmp_area_click_count with additional fields
        // tmp_area_click_count: area, city_infos, pid, click_count   (tacc)
        // product_info: product_id, product_name, extend_info        (pi)
        // JSONObject.fromObject comes from net.sf.json
        sparkSession.udf.register("get_json_field", (json: String, field: String) => {
            val jsonObject = JSONObject.fromObject(json)
            jsonObject.getString(field)
        })
        val sql2 = "select tacc.area, tacc.city_infos, tacc.pid, pi.product_name," +
            "if(get_json_field(pi.extend_info, 'product_status')='0','Self','Third Party') product_status," +
            "tacc.click_count " +
            "from tmp_area_click_count tacc join product_info pi on tacc.pid = pi.product_id"

        // Run the query once and reuse the DataFrame instead of executing sql2 twice.
        val areaCountProductDF = sparkSession.sql(sql2)
        areaCountProductDF.createOrReplaceTempView("tmp_area_count_product_info")
        areaCountProductDF.show()
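If extend_info is a plain JSON string such as {"product_status":"0"}, Spark SQL's built-in get_json_object can do the same extraction without a custom UDF (note the JSON-path syntax); a sketch:

    // Built-in alternative to the get_json_field UDF.
    val sqlAlt2 = "select tacc.area, tacc.city_infos, tacc.pid, pi.product_name," +
        "if(get_json_object(pi.extend_info, '$.product_status')='0','Self','Third Party') product_status," +
        "tacc.click_count " +
        "from tmp_area_click_count tacc join product_info pi on tacc.pid = pi.product_id"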

6.4.5 Getting the Top 3

        // 5. Get the top 3 per area
        val sql3 = "select area, " +
            "CASE " +
            "WHEN area='华北' OR area='华东' THEN 'A_Level' " +
            "WHEN area='华中' OR area='华南' THEN 'B_Level' " +
            "WHEN area='西南' OR area='西北' THEN 'C_Level' " +
            "ELSE 'D_Level' " +
            "END area_level," +
            "city_infos, pid, product_name, product_status, click_count from (" +
            "select area, city_infos, pid, product_name, product_status, click_count, " +
            "row_number() over(PARTITION BY area ORDER BY click_count DESC) rank " +
            "from tmp_area_count_product_info) t where rank <=3"
        val top3ProductDF: DataFrame = sparkSession.sql(sql3)
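row_number() assigns a unique 1..n within each area, ordered by click_count descending, so ties are broken arbitrarily; rank() or dense_rank() would keep tied products together instead. The same window can also be expressed with the DataFrame API (a sketch):

    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions.row_number
    import sparkSession.implicits._

    // Partition by area, order by clicks descending, keep ranks 1..3.
    val byAreaClicks = Window.partitionBy("area").orderBy($"click_count".desc)
    val top3Alt = sparkSession.table("tmp_area_count_product_info")
        .withColumn("rank", row_number().over(byAreaClicks))
        .where($"rank" <= 3)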

6.4.6 Wrapping in a Case Class

        // 6. Wrap each row in a case class
        val top3ProductRDD: RDD[AreaTop3Product] = top3ProductDF.rdd.map { row =>
                AreaTop3Product(
                    taskId
                    , row.getAs[String]("area")
                    , row.getAs[String]("area_level")
                    , row.getAs[Long]("pid")
                    , row.getAs[String]("city_infos")
                    , row.getAs[Long]("click_count")
                    , row.getAs[String]("product_name")
                    , row.getAs[String]("product_status")
                )
        }
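AreaTop3Product is defined elsewhere in the project; judging from the getAs calls above, it presumably has this shape (an assumption reconstructed from usage, not the original definition):

    // Reconstructed from usage; the real definition and field order may differ.
    case class AreaTop3Product(taskid: String,
                               area: String,
                               area_level: String,
                               product_id: Long,
                               city_infos: String,
                               click_count: Long,
                               product_name: String,
                               product_status: String)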

6.4.7 Writing to the Database

        // 7. Write the results to the database via JDBC
        top3ProductRDD.toDF().write
            .format("jdbc")
            .option("url", ConfigurationManager.config.getString(Constants.JDBC_URL))
            .option("dbtable", "area_top3_product")
            .option("user", ConfigurationManager.config.getString(Constants.JDBC_USER))
            .option("password", ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
            .mode(SaveMode.Append)
            .save()
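SaveMode.Append appends to area_top3_product (Spark's JDBC writer creates the table when it does not exist, but defining it up front keeps control over the column types). A plausible MySQL DDL with columns mirroring the case class fields (types are assumptions):

    -- Hypothetical DDL for the result table; column types are assumptions.
    CREATE TABLE IF NOT EXISTS area_top3_product (
        taskid         VARCHAR(255),
        area           VARCHAR(255),
        area_level     VARCHAR(255),
        product_id     BIGINT,
        city_infos     VARCHAR(255),
        click_count    BIGINT,
        product_name   VARCHAR(255),
        product_status VARCHAR(255)
    );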

Summary

  1. UDF
  2. UDAF
  3. IF
  4. CASE…WHEN
  5. Window functions

Source: blog.csdn.net/qq_35199832/article/details/103711946