针对一串数字进行 Top N（取前 3 个最大值）的处理
3
5
6
7
1
4
5
6
9
0
3
package com.core.demo;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.omg.PortableInterceptor.INACTIVE;
import scala.Int;
import scala.Tuple2;
import java.util.*;
/**
 * Reads one integer per line from {@code in/top.txt} and prints the three
 * largest values in descending order.
 *
 * <p>Runs locally ({@code local} master); log level forced to ERROR to keep
 * the console output readable.
 */
public class TestTop3 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("WordCountSort");
        conf.setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        jsc.setLogLevel("ERROR");

        JavaRDD<String> lines = jsc.textFile("in/top.txt");

        // Key every line by its numeric value so sortByKey can order the numbers.
        JavaPairRDD<Integer, String> pairs =
                lines.mapToPair(line -> new Tuple2<>(Integer.valueOf(line), line));

        // false => descending sort on the integer key.
        JavaPairRDD<Integer, String> sorted = pairs.sortByKey(false);

        // Project each pair down to its key; the println is a debug trace of
        // which elements the lazy pipeline actually evaluates.
        JavaRDD<Integer> keys = sorted.map(pair -> {
            System.out.println("map: " + pair._1);
            return pair._1;
        });

        // take(3) on the descending RDD yields the top 3 values.
        for (Integer num : keys.take(3)) {
            System.out.println(num);
        }
        jsc.stop();
    }
}
针对 pair 类型进行 top N 的处理
class1 90
class2 56
class1 87
class1 76
class2 88
class1 95
class1 74
class2 87
class2 67
class2 77
package com.core.demo;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Int;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
 * Reads {@code in/score.txt}, where each line is {@code "<className> <score>"},
 * groups scores by class name, and prints the top 3 scores of each class in
 * descending order.
 *
 * <p>Runs locally ({@code local} master); log level forced to ERROR.
 */
public class GroupTop3 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("WordCountSort");
        conf.setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        jsc.setLogLevel("ERROR");

        JavaRDD<String> lines = jsc.textFile("in/score.txt");

        // Parse "class score" lines into (class, score) pairs, then group by class.
        JavaPairRDD<String, Iterable<Integer>> groupRDD = lines.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                // Split once instead of twice per line.
                String[] parts = s.split(" ");
                return new Tuple2<String, Integer>(parts[0], Integer.valueOf(parts[1]));
            }
        }).groupByKey();

        // For each class, keep only its three highest scores, descending.
        JavaPairRDD<String, Iterable<Integer>> top3RDD = groupRDD.mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Iterable<Integer>> call(Tuple2<String, Iterable<Integer>> group) throws Exception {
                // Fixed-size insertion: top3 always holds the largest scores seen
                // so far, in descending order; unused slots stay null.
                Integer[] top3 = new Integer[3];
                for (Integer score : group._2) {
                    for (int i = 0; i < top3.length; i++) {
                        if (top3[i] == null) {
                            // Empty slot: every earlier slot is >= score, so place it here.
                            top3[i] = score;
                            break;
                        } else if (score > top3[i]) {
                            // Shift smaller entries down one slot to make room;
                            // the last entry falls off the end.
                            for (int j = top3.length - 1; j > i; j--) {
                                top3[j] = top3[j - 1];
                            }
                            top3[i] = score;
                            break;
                        }
                    }
                }
                // Drop unused slots so classes with fewer than 3 scores
                // don't emit literal nulls.
                List<Integer> result = new ArrayList<Integer>(top3.length);
                for (Integer v : top3) {
                    if (v != null) {
                        result.add(v);
                    }
                }
                return new Tuple2<String, Iterable<Integer>>(group._1, result);
            }
        });

        // Print each class followed by its top scores and a separator line.
        top3RDD.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            private static final long serialVersionUID = 1L;
            @Override
            public void call(Tuple2<String, Iterable<Integer>> entry) throws Exception {
                System.out.println("class: " + entry._1);
                for (Integer score : entry._2) {
                    System.out.println(score);
                }
                System.out.println("============================");
            }
        });
        jsc.stop();
    }
}