Spark 项目应用 — Top N 搜索热词统计

本文通过 Spark 读取 HDFS 上的搜索日志,使用广播变量进行条件过滤,借助 Spark SQL 的开窗函数统计每日 Top 3 热点搜索词,并将最终结果保存到 Hive 表中。

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

/**
 * 每日top3热点搜索词统计案例
 * @author Administrator
 *
 */
/**
 * Daily top-3 hot search keyword statistics.
 *
 * <p>Pipeline: read raw search logs from HDFS, filter them against a broadcast
 * query-parameter map, compute the distinct-user count (UV) per (date, keyword),
 * rank keywords per day with a Spark SQL window function (row_number), keep the
 * top 3 per day, sort the days by their total top-3 UV (descending), and save
 * the result into the Hive table {@code daily_top3_keyword_uv}.
 *
 * <p>Expected log format, tab-separated:
 * {@code date \t user \t keyword \t city \t platform \t version}.
 *
 * @author Administrator
 */
public class DailyTop3Keyword {

   @SuppressWarnings("deprecation")
   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("DailyTop3Keyword");
      JavaSparkContext sc = new JavaSparkContext(conf);
      HiveContext sqlContext = new HiveContext(sc.sc());

      // Mocked query conditions. In a real project these would typically be
      // inserted into a MySQL table by an admin (J2EE) platform and loaded here
      // via Spring + an ORM framework such as MyBatis.
      Map<String, List<String>> queryParamMap = new HashMap<String, List<String>>();
      queryParamMap.put("city", Arrays.asList("beijing"));
      queryParamMap.put("platform", Arrays.asList("android"));
      queryParamMap.put("version", Arrays.asList("1.0", "1.2", "1.5", "2.0"));

      // Broadcast the query parameters so that each worker node holds a single
      // read-only copy instead of shipping the map with every task.
      final Broadcast<Map<String, List<String>>> queryParamMapBroadcast =
            sc.broadcast(queryParamMap);

      // Raw log lines from HDFS.
      JavaRDD<String> rawRDD = sc.textFile("hdfs://spark1:9000/spark-study/keyword.txt");

      // Keep only the lines that satisfy every non-empty query condition.
      JavaRDD<String> filterRDD = rawRDD.filter(new Function<String, Boolean>() {

         private static final long serialVersionUID = 1L;

         @Override
         public Boolean call(String log) throws Exception {
            // Split the raw log to extract city, platform and version.
            String[] logSplited = log.split("\t");

            // Drop malformed lines instead of crashing the job with an
            // ArrayIndexOutOfBoundsException (the original code assumed 6 fields).
            if (logSplited.length < 6) {
               return false;
            }

            String city = logSplited[3];
            String platform = logSplited[4];
            String version = logSplited[5];

            // For each condition: if it is set (non-null, non-empty) and the log's
            // value is not in the allowed list, discard the line. A line is kept
            // only when it matches every condition that is set.
            Map<String, List<String>> params = queryParamMapBroadcast.value();

            List<String> cities = params.get("city");
            if (cities != null && !cities.isEmpty() && !cities.contains(city)) {
               return false;
            }

            List<String> platforms = params.get("platform");
            if (platforms != null && !platforms.isEmpty() && !platforms.contains(platform)) {
               return false;
            }

            List<String> versions = params.get("version");
            if (versions != null && !versions.isEmpty() && !versions.contains(version)) {
               return false;
            }

            return true;
         }

      });

      // Map each filtered line to (date_keyword, user), e.g. ("2017-09-01_abc", "jhp").
      JavaPairRDD<String, String> dateKeywordUserRDD = filterRDD.mapToPair(

            new PairFunction<String, String, String>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Tuple2<String, String> call(String log) throws Exception {
                  String[] logSplited = log.split("\t");

                  String date = logSplited[0];
                  String user = logSplited[1];
                  String keyword = logSplited[2];

                  return new Tuple2<String, String>(date + "_" + keyword, user);
               }

            });

      // Group: for each (date, keyword), all users who searched it (duplicates kept).
      JavaPairRDD<String, Iterable<String>> dateKeywordUsersRDD = dateKeywordUserRDD.groupByKey();

      // Deduplicate the users of each (date, keyword) to obtain its UV.
      JavaPairRDD<String, Long> dateKeywordUvRDD = dateKeywordUsersRDD.mapToPair(

            new PairFunction<Tuple2<String, Iterable<String>>, String, Long>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Tuple2<String, Long> call(
                     Tuple2<String, Iterable<String>> dateKeywordUsers) throws Exception {
                  String dateKeyword = dateKeywordUsers._1;

                  // HashSet gives O(n) deduplication; the original
                  // List.contains() loop was accidentally O(n^2).
                  Set<String> distinctUsers = new HashSet<String>();
                  for (String user : dateKeywordUsers._2) {
                     distinctUsers.add(user);
                  }

                  long uv = distinctUsers.size();

                  return new Tuple2<String, Long>(dateKeyword, uv);
               }

            });

      // Convert the per-day per-keyword UV data into Rows for a DataFrame.
      JavaRDD<Row> dateKeywordUvRowRDD = dateKeywordUvRDD.map(

            new Function<Tuple2<String, Long>, Row>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Row call(Tuple2<String, Long> dateKeywordUv) throws Exception {
                  // Limit the split to 2 so keywords that themselves contain '_'
                  // are not truncated (the date part never contains '_').
                  String[] parts = dateKeywordUv._1.split("_", 2);
                  String date = parts[0];
                  String keyword = parts[1];
                  long uv = dateKeywordUv._2;
                  return RowFactory.create(date, keyword, uv);
               }

            });

      List<StructField> structFields = Arrays.asList(
            DataTypes.createStructField("date", DataTypes.StringType, true),
            DataTypes.createStructField("keyword", DataTypes.StringType, true),
            DataTypes.createStructField("uv", DataTypes.LongType, true));

      StructType structType = DataTypes.createStructType(structFields);

      DataFrame dateKeywordUvDF = sqlContext.createDataFrame(dateKeywordUvRowRDD, structType);

      // Use a Spark SQL window function to rank keywords by UV within each day
      // and keep the top 3 per day.
      dateKeywordUvDF.registerTempTable("daily_keyword_uv");

      DataFrame dailyTop3KeywordDF = sqlContext.sql(""
            + "SELECT date,keyword,uv "
            + "FROM ("
               + "SELECT "
                  + "date,"
                  + "keyword,"
                  + "uv,"
                  + "row_number() OVER (PARTITION BY date ORDER BY uv DESC) rank "
               + "FROM daily_keyword_uv"
            + ") tmp "
            + "WHERE rank<=3");

      // Back to an RDD to compute, per day, the total UV of its top-3 keywords.
      JavaRDD<Row> dailyTop3KeywordRDD = dailyTop3KeywordDF.javaRDD();

      // (date, "keyword_uv") pairs.
      JavaPairRDD<String, String> top3DateKeywordUvRDD = dailyTop3KeywordRDD.mapToPair(
            new PairFunction<Row, String, String>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Tuple2<String, String> call(Row row)
                     throws Exception {
                  String date = String.valueOf(row.get(0));
                  String keyword = String.valueOf(row.get(1));
                  Long uv = Long.valueOf(String.valueOf(row.get(2)));
                  return new Tuple2<String, String>(date, keyword + "_" + uv);
               }

            });

      JavaPairRDD<String, Iterable<String>> top3DateKeywordsRDD = top3DateKeywordUvRDD.groupByKey();

      // (totalUv, "date,kw1_uv1,kw2_uv2,...") so we can sort days by total UV.
      // NOTE(review): keywords containing ',' would corrupt this encoding — the
      // original code has the same limitation; confirm against the log source.
      JavaPairRDD<Long, String> uvDateKeywordsRDD = top3DateKeywordsRDD.mapToPair(
            new PairFunction<Tuple2<String, Iterable<String>>, Long, String>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Tuple2<Long, String> call(
                     Tuple2<String, Iterable<String>> tuple)
                     throws Exception {
                  String date = tuple._1;

                  Long totalUv = 0L;
                  StringBuilder dateKeywords = new StringBuilder(date);

                  Iterator<String> keywordUvIterator = tuple._2.iterator();
                  while (keywordUvIterator.hasNext()) {
                     String keywordUv = keywordUvIterator.next();

                     // Take everything after the LAST '_' as the uv, so that
                     // keywords containing '_' are handled correctly.
                     Long uv = Long.valueOf(
                           keywordUv.substring(keywordUv.lastIndexOf('_') + 1));
                     totalUv += uv;

                     dateKeywords.append(",").append(keywordUv);
                  }

                  return new Tuple2<Long, String>(totalUv, dateKeywords.toString());
               }

            });

      // Sort days by their total top-3 UV, descending.
      JavaPairRDD<Long, String> sortedUvDateKeywordsRDD = uvDateKeywordsRDD.sortByKey(false);

      // Expand each day's encoded string back into (date, keyword, uv) rows.
      JavaRDD<Row> sortedRowRDD = sortedUvDateKeywordsRDD.flatMap(

            new FlatMapFunction<Tuple2<Long, String>, Row>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Iterable<Row> call(Tuple2<Long, String> tuple)
                     throws Exception {
                  String[] dateKeywordsSplited = tuple._2.split(",");

                  String date = dateKeywordsSplited[0];

                  // Iterate over however many keywords this day actually has:
                  // the original code hard-coded indices 1..3 and threw
                  // ArrayIndexOutOfBoundsException for days with fewer than 3
                  // distinct keywords.
                  List<Row> rows = new ArrayList<Row>();
                  for (int i = 1; i < dateKeywordsSplited.length; i++) {
                     String keywordUv = dateKeywordsSplited[i];
                     int sep = keywordUv.lastIndexOf('_');
                     rows.add(RowFactory.create(date,
                           keywordUv.substring(0, sep),
                           Long.valueOf(keywordUv.substring(sep + 1))));
                  }

                  return rows;
               }

            });

      // Convert the final data to a DataFrame and persist it into Hive.
      DataFrame finalDF = sqlContext.createDataFrame(sortedRowRDD, structType);

      finalDF.saveAsTable("daily_top3_keyword_uv");

      sc.close();
   }

}

原文转载自:blog.csdn.net/qq_18603599/article/details/79953676