46.Spark大型电商项目-用户访问session分析-top10热门品类之join品类与点击下单支付次数

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/someby/article/details/88750675

目录

代码

UserVisitSessionSpark.java


本篇文章将记录用户访问session分析-top10热门品类之join品类与点击下单支付次数。

代码

UserVisitSessionSpark.java

    /**
     * Collects every category id touched by the filtered sessions and joins each id with its
     * click, order and pay counts.
     *
     * <p>The joined result is currently only assigned to a local variable; nothing is returned
     * or persisted by this method yet.
     *
     * @param filteredSessionid2AggrInfoRDD pair RDD keyed by session id; only its keys are used,
     *                                      to restrict the action rows to the filtered sessions
     * @param session2actionRDD             pair RDD keyed by session id whose values are the
     *                                      action detail rows
     */
    private static void getTop10Category(JavaPairRDD<String, String> filteredSessionid2AggrInfoRDD, JavaPairRDD<String, Row> session2actionRDD) {

        // Inner join keeps only the detail rows whose session id is present in the filtered RDD;
        // the aggregated-info string is dropped and the detail Row is kept.
        JavaPairRDD<String, Row> sessionid2detailRDD = filteredSessionid2AggrInfoRDD
                .join(session2actionRDD)
                .mapToPair(
                        new PairFunction<Tuple2<String, Tuple2<String, Row>>, String, Row>() {

                            private static final long serialVersionUID = 1L;

                            @Override
                            public Tuple2<String, Row> call(Tuple2<String, Tuple2<String, Row>> tuple) throws Exception {
                                return new Tuple2<String, Row>(tuple._1, tuple._2._2);
                            }
                        });

        /*
         * Step 1: collect every category id the sessions visited.
         * "Visited" means the category was clicked, ordered or paid for.
         */
        JavaPairRDD<Long, Long> categoryidRDD = sessionid2detailRDD.flatMapToPair(
                new PairFlatMapFunction<Tuple2<String, Row>, Long, Long>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Iterator<Tuple2<Long, Long>> call(Tuple2<String, Row> tuple) throws Exception {
                        Row row = tuple._2;
                        List<Tuple2<Long, Long>> list = new ArrayList<>();

                        // Column 6 holds the clicked category id. Row.getLong throws on a null
                        // column, so check for null before reading it.
                        if (!row.isNullAt(6)) {
                            long clickCategoryId = row.getLong(6);
                            // NOTE(review): 10240 is excluded exactly as in the original code;
                            // presumably a placeholder meaning "no click category" - confirm.
                            long maxid = 10240L;
                            if (clickCategoryId != maxid) {
                                list.add(new Tuple2<Long, Long>(clickCategoryId, clickCategoryId));
                            }
                        }

                        // Columns 8 and 10 hold comma-separated ordered / paid category ids.
                        addCategoryIds(list, row.getString(8));
                        addCategoryIds(list, row.getString(10));

                        return list.iterator();
                    }

                    /**
                     * Parses a comma-separated id list and appends one (id, id) pair per id.
                     * A null list and blank tokens are skipped instead of failing the task.
                     */
                    private void addCategoryIds(List<Tuple2<Long, Long>> out, String categoryIds) {
                        if (categoryIds == null) {
                            return;
                        }
                        for (String token : categoryIds.split(",")) {
                            String trimmed = token.trim();
                            if (trimmed.isEmpty()) {
                                continue;
                            }
                            Long categoryId = Long.valueOf(trimmed);
                            out.add(new Tuple2<Long, Long>(categoryId, categoryId));
                        }
                    }
                }
        );

        // De-duplicate: every action row emits its category ids again, and without this the
        // left outer joins below would output one identical row per occurrence of a category.
        categoryidRDD = categoryidRDD.distinct();

        /*
         * Step 2: compute the click, order and pay counts of each category.
         * Each helper receives the same session detail RDD and returns (categoryId, count).
         */

        // Click count per category.
        JavaPairRDD<Long, Long> clickCategoryId2CountRDD =
                getClickCategoryId2CountRDD(sessionid2detailRDD);
        // Order count per category.
        JavaPairRDD<Long, Long> orderCategoryId2CountRDD =
                getOrderCategoryId2CountRDD(sessionid2detailRDD);
        // Pay count per category.
        JavaPairRDD<Long, Long> payCategoryId2CountRDD =
                getPayCategoryId2CountRDD(sessionid2detailRDD);

        /*
         * Step 3: join each category with its click, order and pay counts.
         *
         * categoryidRDD holds every category visited by the qualifying sessions, while each of
         * the three count RDDs may miss some categories (for example a category that was only
         * clicked but never ordered or paid). A plain join would drop those categories, so the
         * join is a left outer join: the category is kept and the missing count becomes 0.
         */
        JavaPairRDD<Long, String> categoryid2countRDD = joinCategoryAndData(
                categoryidRDD, clickCategoryId2CountRDD, orderCategoryId2CountRDD,
                payCategoryId2CountRDD);


    }

    /**
     * Left-outer-joins the category ids with their click, order and pay counts and encodes the
     * result as one string per category.
     *
     * <p>The value has the form
     * {@code <FIELD_CATEGORY_ID>=<id>|<FIELD_CLICK_COUNT>=<n>|<FIELD_ORDER_COUNT>=<n>|<FIELD_PAY_COUNT>=<n>},
     * where the field names come from {@code Constants}. A category missing from a count RDD
     * gets a count of 0 for that field.
     *
     * @param categoryidRDD            category ids, keyed by category id
     * @param clickCategoryId2CountRDD click count per category id
     * @param orderCategoryId2CountRDD order count per category id
     * @param payCategoryId2CountRDD   pay count per category id
     * @return pair RDD of category id to the encoded count string
     */
    private static JavaPairRDD<Long,String> joinCategoryAndData(
            JavaPairRDD<Long,Long> categoryidRDD,
            JavaPairRDD<Long,Long> clickCategoryId2CountRDD,
            JavaPairRDD<Long,Long> orderCategoryId2CountRDD,
            JavaPairRDD<Long,Long> payCategoryId2CountRDD
    ){
        // With a left outer join the right-hand side may have no value for a key, which is why
        // the second tuple element is an Optional<Long>.
        JavaPairRDD<Long, Tuple2<Long, Optional<Long>>> clickJoinedRDD =
                categoryidRDD.leftOuterJoin(clickCategoryId2CountRDD);

        JavaPairRDD<Long, String> category2InfoRDD = clickJoinedRDD.mapToPair(
                new PairFunction<Tuple2<Long, Tuple2<Long, Optional<Long>>>, Long, String>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<Long, String> call(Tuple2<Long, Tuple2<Long, Optional<Long>>> tuple) throws Exception {
                        long categoryId = tuple._1;
                        Optional<Long> optional = tuple._2._2;
                        long clickCount = 0L;
                        if (optional.isPresent()) {
                            clickCount = optional.get();
                        }
                        String value = Constants.FIELD_CATEGORY_ID + "=" + categoryId + "|"
                                + Constants.FIELD_CLICK_COUNT + "=" + clickCount;
                        return new Tuple2<Long, String>(categoryId, value);
                    }
                }
        );

        category2InfoRDD = appendCategoryCount(
                category2InfoRDD, orderCategoryId2CountRDD, Constants.FIELD_ORDER_COUNT);
        category2InfoRDD = appendCategoryCount(
                category2InfoRDD, payCategoryId2CountRDD, Constants.FIELD_PAY_COUNT);

        return category2InfoRDD;

    }

    /**
     * Left-outer-joins {@code category2InfoRDD} with a count RDD and appends
     * {@code |<fieldName>=<count>} to each value, using 0 when the category has no count.
     *
     * @param category2InfoRDD  category id to the string built so far
     * @param category2CountRDD category id to the count being appended
     * @param fieldName         field name written in front of the count
     * @return category id to the extended string
     */
    private static JavaPairRDD<Long, String> appendCategoryCount(
            JavaPairRDD<Long, String> category2InfoRDD,
            JavaPairRDD<Long, Long> category2CountRDD,
            final String fieldName) {
        return category2InfoRDD.leftOuterJoin(category2CountRDD).mapToPair(
                new PairFunction<Tuple2<Long, Tuple2<String, Optional<Long>>>, Long, String>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<Long, String> call(
                            Tuple2<Long, Tuple2<String, Optional<Long>>> tuple)
                            throws Exception {
                        long categoryId = tuple._1;
                        String value = tuple._2._1;

                        Optional<Long> optional = tuple._2._2;
                        long count = 0L;
                        if (optional.isPresent()) {
                            count = optional.get();
                        }

                        value = value + "|" + fieldName + "=" + count;

                        return new Tuple2<Long, String>(categoryId, value);
                    }

                });
    }

猜你喜欢

转载自blog.csdn.net/someby/article/details/88750675
今日推荐