Spark: topN

针对一串数字进行 Top N 的处理（输入文件 in/top.txt，每行一个整数，内容如下）：

3
5
6
7
1
4
5
6
9
0
3

package com.core.demo;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.omg.PortableInterceptor.INACTIVE;
import scala.Int;
import scala.Tuple2;
import java.util.*;

/**
 * Prints the largest {@link #TOP_N} integers found in {@code in/top.txt}.
 *
 * <p>The input file is expected to hold one integer per line. Blank lines are
 * skipped and surrounding whitespace is ignored; any other non-numeric line
 * still fails the job with a {@link NumberFormatException}.
 */
public class TestTop3 {
    /** How many of the largest values are reported. */
    private static final int TOP_N = 3;

    /**
     * Runs the job on a local Spark master and prints the result, largest value first,
     * one value per line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("TestTop3");
        conf.setMaster("local");

        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            jsc.setLogLevel("ERROR");

            JavaRDD<String> lines = jsc.textFile("in/top.txt");

            // A trailing empty line is common in text files and would otherwise
            // make Integer.valueOf throw.
            JavaRDD<String> nonBlankLines = lines.filter(new Function<String, Boolean>() {
                private static final long serialVersionUID = 1L;
                @Override
                public Boolean call(String line) throws Exception {
                    return !line.trim().isEmpty();
                }
            });

            JavaRDD<Integer> numbers = nonBlankLines.map(new Function<String, Integer>() {
                private static final long serialVersionUID = 1L;
                @Override
                public Integer call(String line) throws Exception {
                    return Integer.valueOf(line.trim());
                }
            });

            // top(n) returns the n largest elements in descending order without
            // sorting the whole data set first.
            List<Integer> largest = numbers.top(TOP_N);

            for (Integer num : largest) {
                System.out.println(num);
            }
        } finally {
            // Release the context even when the job fails.
            jsc.stop();
        }
    }
}

针对 pair 类型进行分组 Top N 的处理（输入文件 in/score.txt，每行为“班级 分数”，取每个班级分数最高的前 3 名，内容如下）：

class1 90
class2 56
class1 87
class1 76
class2 88
class1 95
class1 74
class2 87
class2 67
class2 77

package com.core.demo;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Int;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

/**
 * Prints, for every class name in {@code in/score.txt}, its highest {@link #TOP_N} scores.
 *
 * <p>Each input line is expected to be {@code <className> <score>} separated by
 * whitespace. Blank lines are skipped. A class with fewer than {@link #TOP_N}
 * scores yields only the scores it has.
 */
public class GroupTop3 {
    /** How many of the highest scores are kept per class. */
    private static final int TOP_N = 3;

    /**
     * Runs the job on a local Spark master and prints each class followed by its
     * top scores in descending order.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("GroupTop3");
        conf.setMaster("local");

        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            jsc.setLogLevel("ERROR");

            JavaRDD<String> lines = jsc.textFile("in/score.txt");

            // A blank line has no score field and would make the parser below throw.
            JavaRDD<String> nonBlankLines = lines.filter(new Function<String, Boolean>() {
                private static final long serialVersionUID = 1L;
                @Override
                public Boolean call(String line) throws Exception {
                    return !line.trim().isEmpty();
                }
            });

            // Parse each line into (className, score) and group the scores by class.
            JavaPairRDD<String, Iterable<Integer>> groupRDD = nonBlankLines.mapToPair(new PairFunction<String, String, Integer>() {
                private static final long serialVersionUID = 1L;
                @Override
                public Tuple2<String, Integer> call(String line) throws Exception {
                    // Split once, on any run of whitespace, so repeated spaces or tabs are tolerated.
                    String[] fields = line.trim().split("\\s+");
                    return new Tuple2<String, Integer>(fields[0], Integer.valueOf(fields[1]));
                }
            }).groupByKey();

            // Reduce every group to its highest scores.
            JavaPairRDD<String, Iterable<Integer>> pairRDD = groupRDD.mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
                private static final long serialVersionUID = 1L;
                @Override
                public Tuple2<String, Iterable<Integer>> call(Tuple2<String, Iterable<Integer>> group) throws Exception {
                    return new Tuple2<String, Iterable<Integer>>(group._1, topN(group._2, TOP_N));
                }
            });

            pairRDD.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
                private static final long serialVersionUID = 1L;
                @Override
                public void call(Tuple2<String, Iterable<Integer>> v) throws Exception {
                    System.out.println("class: " + v._1);
                    for (Integer score : v._2) {
                        System.out.println(score);
                    }
                    System.out.println("============================");
                }
            });
        } finally {
            // Release the context even when the job fails.
            jsc.stop();
        }
    }

    /**
     * Selects the {@code n} largest values by insertion into a small sorted buffer.
     *
     * @param scores values to scan; must not contain {@code null}
     * @param n      maximum number of values to keep; must not be negative
     * @return up to {@code n} values in descending order; shorter than {@code n}
     *         when {@code scores} has fewer elements, never padded with {@code null}
     */
    private static Iterable<Integer> topN(Iterable<Integer> scores, int n) {
        Integer[] top = new Integer[n];
        int filled = 0;

        for (Integer score : scores) {
            // Find the first slot whose value is strictly smaller than the new score;
            // equal scores keep their arrival order.
            int pos = 0;
            while (pos < filled && score <= top[pos]) {
                pos++;
            }
            if (pos == n) {
                continue; // not larger than anything kept and the buffer is full
            }

            // Shift the tail one slot to the right, dropping the smallest value
            // when the buffer is already full.
            int last = Math.min(filled, n - 1);
            for (int j = last; j > pos; j--) {
                top[j] = top[j - 1];
            }
            top[pos] = score;
            if (filled < n) {
                filled++;
            }
        }

        // Trim unused slots so callers never see null entries.
        return Arrays.asList(Arrays.copyOf(top, filled));
    }
}

猜你喜欢

转载自blog.csdn.net/dec_sun/article/details/90648927