1. File data
Spark,100
Hadoop,62
Flink,77
Kafka,91
Hadoop,93
Spark,78
Hadoop,69
Spark,98
Hadoop,62
Spark,99
Hadoop,61
Spark,70
Hadoop,75
Spark,88
Hadoop,68
Spark,90
Hadoop,61
2. Scala code:
package topN

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GroupTop {
  def main(args: Array[String]): Unit = {
    // Check the number of arguments
    if (args.length < 2) {
      println(
        """
          |topN.GroupTop <inputPath> <outputPath>
          |<inputPath>  input directory
          |<outputPath> output directory
        """.stripMargin)
      System.exit(0)
    }

    // Read the arguments
    val Array(inputPath, outputPath) = args

    // Initialize the Spark entry point
    val conf = new SparkConf()
    conf.setAppName(s"${this.getClass.getSimpleName}")
    conf.setMaster("local")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    // Compute the top N
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile(inputPath)

    // Split each line into a Tuple2 of (name, score)
    val tupleRDD: RDD[(String, Int)] = lines.map(line => {
      val fields = line.split(",")
      (fields(0), fields(1).toInt)
    })

    // Group by key
    val groupRDD: RDD[(String, Iterable[Int])] = tupleRDD.groupByKey()

    // Sort each group's values in descending order and keep the top 5
    val groupSort: RDD[(String, List[Int])] = groupRDD.map(grouped => {
      (grouped._1, grouped._2.toList.sortWith(_ > _).take(5))
    })

    // Print the result
    groupSort.sortByKey().collect().foreach(pair => {
      println(pair._1 + ":")
      pair._2.foreach(s => println(s + "\t"))
    })

    sc.stop()
  }
}
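Note that the program accepts an outputPath argument but never writes to it; the result only goes to the console. A minimal sketch of persisting the grouped result instead, assuming outputPath does not already exist (Spark refuses to overwrite an existing directory):

// Write one "name: score score ..." line per key to outputPath
groupSort
  .sortByKey()
  .map { case (name, scores) => name + ": " + scores.mkString(" ") }
  .saveAsTextFile(outputPath)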
Run result:
Flink:
77
Hadoop:
93
75
69
68
62
Kafka:
91
Spark:
100
99
98
90
88
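One caveat: groupByKey materializes every value for a key in memory before the sort, which can blow up on skewed keys. A minimal sketch of one common alternative, aggregateByKey, which keeps only the running top N per key on each partition (topN = 5 to match the take(5) above; tupleRDD is the pair RDD built in the code):

val topN = 5
val topPerKey: RDD[(String, List[Int])] = tupleRDD.aggregateByKey(List.empty[Int])(
  (acc, v) => (v :: acc).sorted(Ordering[Int].reverse).take(topN), // fold one score into the running top N
  (a, b) => (a ++ b).sorted(Ordering[Int].reverse).take(topN)      // merge per-partition top-N lists
)
topPerKey.sortByKey().collect().foreach { case (name, scores) =>
  println(name + ": " + scores.mkString(" "))
}

This bounds per-key memory to N elements instead of the full group, at the cost of a small sort on every fold.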
3. Java code:
package topN;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Iterator;

public class GroupTopN {
    public static void main(String[] args) {
        // Check the number of arguments
        if (args.length < 2) {
            System.out.println("topN.GroupTopN needs two parameters: <inputPath> <outputPath>\n"
                    + "<inputPath>  input path\n"
                    + "<outputPath> output path");
            System.exit(0);
        }

        // Read the arguments
        String inputPath = args[0];
        String outputPath = args[1];

        SparkConf conf = new SparkConf().setAppName("topN.GroupTopN").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile(inputPath);

        // Split each line into a JavaPairRDD of (name, score) -- anonymous inner class
        JavaPairRDD<String, Integer> cs = lines.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                String[] fields = s.split(",");
                return new Tuple2<>(fields[0], Integer.valueOf(fields[1]));
            }
        });
        // The same mapping as a lambda expression:
        // JavaPairRDD<String, Integer> cs = lines.mapToPair(
        //         s -> new Tuple2<>(s.split(",")[0], Integer.valueOf(s.split(",")[1])));

        // Group by key
        JavaPairRDD<String, Iterable<Integer>> csPairsRDD = cs.groupByKey();

        // Sort by key (sortByKey defaults to ascending)
        JavaPairRDD<String, Iterable<Integer>> sortByKey = csPairsRDD.sortByKey();

        // Walk each group and keep the top 3 via a fixed-size insertion sort
        sortByKey.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            @Override
            public void call(Tuple2<String, Iterable<Integer>> csPair) throws Exception {
                String name = csPair._1();
                Iterator<Integer> ite = csPair._2().iterator();
                Integer[] res = new Integer[3];
                while (ite.hasNext()) {
                    Integer score = ite.next();
                    for (int i = 0; i < 3; i++) {
                        if (res[i] == null) {
                            res[i] = score;
                            break;
                        } else if (res[i] < score) {
                            // Shift smaller entries one slot right to make room
                            for (int j = 2; j > i; j--) {
                                res[j] = res[j - 1];
                            }
                            res[i] = score;
                            break;
                        }
                    }
                }
                System.out.println(name + ":");
                for (int i = 0; i < res.length; i++) {
                    System.out.println(res[i] + "\t");
                }
                System.out.println();
            }
        });

        sc.close();
    }
}
Run result (groups with fewer than 3 scores print null for the empty slots):
Flink:
77
null
null

Hadoop:
93
75
69

Kafka:
91
null
null

Spark:
100
99
98
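The hand-rolled insertion loop in the Java version is easy to get wrong; the element shift is the subtle part. For comparison, a minimal Scala sketch of the same top-K idea with a bounded min-heap (topK is a hypothetical helper, not part of either program above):

import scala.collection.mutable

def topK(scores: Iterator[Int], k: Int = 3): List[Int] = {
  // Reverse ordering turns PriorityQueue into a min-heap: the smallest kept
  // score sits at the head and is the one evicted when the heap grows past k.
  val heap = mutable.PriorityQueue.empty[Int](Ordering[Int].reverse)
  scores.foreach { s =>
    heap.enqueue(s)
    if (heap.size > k) heap.dequeue()
  }
  heap.toList.sorted(Ordering[Int].reverse) // descending
}

// e.g. topK(Iterator(62, 93, 69, 62, 61, 75, 68, 61)) == List(93, 75, 69)

The heap version does O(log k) work per score instead of scanning the array, and it never loses an element during the shift.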