TopN——GroupTopN

TopN:

package spark.core;

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
 * Global Top-N example: reads integers (one per line), sorts them
 * descending, and prints the largest {@code topN} values.
 */
public class TopN {

    /** Number of top elements to print when no count is given on the command line. */
    private static final int DEFAULT_TOP_N = 3;

    public static void main(String[] args) {
        // Optional first argument overrides the top-N count; defaults to 3,
        // matching the original hard-coded behavior.
        final int topN = args.length > 0 ? Integer.parseInt(args[0]) : DEFAULT_TOP_N;

        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("job_name");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        
        JavaRDD<String> textFile = sc.textFile("E:/operator/number.txt");
        // Map each input line to an (Integer, String) pair so the numeric
        // value can be used as the sort key.
        JavaPairRDD<Integer,String> mapToPair = textFile.mapToPair(new PairFunction<String, Integer, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, String> call(String value) throws Exception {
                // trim() guards against stray whitespace that would make
                // Integer.valueOf throw NumberFormatException.
                return new Tuple2<Integer, String>(Integer.valueOf(value.trim()), value);
            }
        });
        
        // Sort descending by the numeric key, then drop the key and take the
        // first topN original strings.
        JavaPairRDD<Integer,String> sortByKey = mapToPair.sortByKey(false);
        List<String> take = sortByKey.map(new Function<Tuple2<Integer,String>, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public String call(Tuple2<Integer, String> result) throws Exception {
                return result._2;
            }
        }).take(topN);
        
        for (String string : take) {
            System.out.println(string);
        }
        
        sc.close();
    }

}

分组TopN(在worker端排序):

package spark.core;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Grouped Top-N example (sorting on the worker side): reads "name score"
 * lines, groups scores by name, and prints each name with its scores sorted
 * descending.
 */
public class GroupTopN {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("job_name");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        
        JavaRDD<String> textFile = sc.textFile("E:/operator/persion-score.txt");
        // Parse each "name score" line into a (name, score) pair.
        JavaPairRDD<String,Integer> mapToPair = textFile.mapToPair(new PairFunction<String, String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String line) throws Exception {
                String[] split = line.split(" ");
                return new Tuple2<String, Integer>(split[0],Integer.valueOf(split[1]));
            }
        });
        
        // Group all scores under their name, then sort each group descending
        // inside the worker-side transformation.
        JavaPairRDD<String,Iterable<Integer>> groupByKey = mapToPair.groupByKey();
        JavaPairRDD<String,Iterable<Integer>> mapToPair2 = groupByKey.mapToPair(new PairFunction<Tuple2<String,Iterable<Integer>>, String, Iterable<Integer>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Iterable<Integer>> call(
                    Tuple2<String, Iterable<Integer>> tuple) throws Exception {
                // Copy the grouped scores into a list so they can be sorted
                // in place (the Iterable itself is not sortable).
                List<Integer> scores = new ArrayList<Integer>();
                for (Integer score : tuple._2) {
                    scores.add(score);
                }
                // Sort descending. Integer.compare avoids the integer-overflow
                // bug of the subtraction idiom -(o1 - o2).
                Collections.sort(scores, new Comparator<Integer>() {

                    @Override
                    public int compare(Integer o1, Integer o2) {
                        return Integer.compare(o2, o1);
                    }
                });
                
                // To keep only the top N, copy the prefix into a new list,
                // e.g. new ArrayList<Integer>(scores.subList(0, 2)) —
                // copying matters because subList is a view backed by the
                // original list and is not itself serializable.
                return new Tuple2<String, Iterable<Integer>>(tuple._1, scores);
            }
        });
        
        // Print each (name, sorted scores) pair.
        mapToPair2.foreach(new VoidFunction<Tuple2<String,Iterable<Integer>>>() {
            
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Iterable<Integer>> tulple) throws Exception {
                System.out.println(tulple);
            }
        });
        
        sc.close();
    }

}

分组TopN②(在driver端调用spark排序):

繁琐操作(一般不用)

package spark.core;

import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Grouped Top-N, driver-side variant: collects all distinct keys to the
 * driver, then launches one full Spark filter/sort pipeline PER KEY. This is
 * shown as an anti-pattern — with k keys it runs k separate Spark jobs over
 * the same grouped data, which is why the surrounding text says it is
 * generally not used.
 */
// NOTE(review): class name has a typo ("dirver" -> "driver"); kept as-is
// because renaming the public class would also require renaming the file.
public class GroupTopN_dirver {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("job_name");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        
        JavaRDD<String> textFile = sc.textFile("E:/operator/persion-score.txt");
        // Parse each "name score" line into a (name, score) pair.
        JavaPairRDD<String,Integer> mapToPair = textFile.mapToPair(new PairFunction<String, String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String line) throws Exception {
                String[] split = line.split(" ");
                return new Tuple2<String, Integer>(split[0],Integer.valueOf(split[1]));
            }
        });
        
        JavaPairRDD<String,Iterable<Integer>> groupByKey = mapToPair.groupByKey();
        
        // Collect all distinct keys back to the driver; `final` so the list
        // can be captured by the anonymous-class closures below.
        final List<String> keys = groupByKey.map(new Function<Tuple2<String,Iterable<Integer>>, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public String call(Tuple2<String, Iterable<Integer>> tuple)
                    throws Exception {
                return tuple._1;
            }
        }).collect();
        
        // One full Spark pipeline per key — the loop index is copied into an
        // effectively-final local so it can be captured by the closures.
        for (int i = 0; i < keys.size(); i++) {
             final int index = i; 

        // Keep only the group for the current key, then flatten its scores
        // into a plain JavaRDD<Integer>.
        JavaRDD<Integer> flatMap = groupByKey.filter(new Function<Tuple2<String,Iterable<Integer>>, Boolean>() {
            
            private static final long serialVersionUID = 1L;

            @Override
            public Boolean call(Tuple2<String, Iterable<Integer>> tuple)
                    throws Exception {
                return tuple._1.equals(keys.get(index));
            }
        }).flatMap(new FlatMapFunction<Tuple2<String,Iterable<Integer>>, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<Integer> call(Tuple2<String, Iterable<Integer>> tuple)
                    throws Exception {
                return tuple._2.iterator();
            }
        });
        
        // Re-pair as (score, key) so sortByKey can sort on the score.
        JavaPairRDD<Integer, String> kv = flatMap.mapToPair(new PairFunction<Integer, Integer, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, String> call(Integer v) throws Exception {
                return new Tuple2<Integer, String>(v,keys.get(index));
            }
        });
        
        // Sort descending by score, then swap back to (key, score).
        JavaPairRDD<Integer,String> sortByKey = kv.sortByKey(false);
        JavaPairRDD<String,Integer> mapToPair2 = sortByKey.mapToPair(new PairFunction<Tuple2<Integer,String>, String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> tuple)
                    throws Exception {
                return new Tuple2<String, Integer>(tuple._2, tuple._1);
            }
        });
        
        // Regroup (now in sorted order) and print each (key, scores) pair.
        JavaPairRDD<String,Iterable<Integer>> groupByKey2 = mapToPair2.groupByKey();
        groupByKey2.foreach(new VoidFunction<Tuple2<String,Iterable<Integer>>>() {
            
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Iterable<Integer>> result) throws Exception {
                System.out.println(result);
            }
        });
        }
        
        sc.close();
    }

}

猜你喜欢

转载自www.cnblogs.com/ibigjy/p/10310548.html