1.源数据（保存为当前目录下的 words 文件，即代码中读取的 ./words）
hello word
hello java
hello python
hello waad
hello word
hello java
hello python
hello waad
hello word
hello java
hello python
hello waad
hello word
hello java
hello python
hello waad
hello word
hello java
hello python
hello waad
2.Java版
package com.bjsxt.com;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import scala.actors.threadpool.Arrays;
/**
 * Word-count example written against the Spark Java RDD API.
 *
 * <p>Reads the text file {@code ./words}, splits every line on a single
 * space, counts how many times each token occurs and prints every
 * {@code (word, count)} pair to standard output.
 */
public class Test {

    /**
     * Program entry point.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("test").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            JavaRDD<String> line = jsc.textFile("./words");

            // One input line -> many tokens.
            JavaRDD<String> rdd1 = line.flatMap(new FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> call(String s) throws Exception {
                    // Fully qualified on purpose: the simple name "Arrays" is bound by the
                    // file's imports to scala.actors.threadpool.Arrays, not to the JDK class.
                    // java.util.Arrays.asList gives a properly typed List<String>.
                    return java.util.Arrays.asList(s.split(" "));
                }
            });

            // Pair each token with an initial count of 1.
            JavaPairRDD<String, Integer> rdd2 = rdd1.mapToPair(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String w) throws Exception {
                    return new Tuple2<String, Integer>(w, 1);
                }
            });

            // Sum the counts of identical tokens.
            JavaPairRDD<String, Integer> result = rdd2.reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer v1, Integer v2) throws Exception {
                    return v1 + v2;
                }
            });

            // Print every (word, count) tuple on its own line.
            result.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> arg0) throws Exception {
                    System.out.println(arg0);
                }
            });
        } finally {
            // Always release the Spark context, even when a stage above fails.
            jsc.stop();
        }
    }
}
3.Scala版
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Word-count example written against the Spark Scala RDD API.
 *
 * Reads the text file `./words`, splits every line on a single space,
 * counts how many times each token occurs, sorts the pairs by count in
 * descending order and prints each `(word, count)` pair to standard output.
 */
object SparkWord {

  /**
   * Program entry point.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("test")
      .setMaster("local")
    val sc = new SparkContext(conf)

    try {
      val counts = sc.textFile("./words")
        .flatMap(_.split(" "))       // one line -> many tokens
        .map(word => (word, 1))      // pair each token with an initial count
        .reduceByKey(_ + _)          // sum the counts of identical tokens

      // Largest counts first.
      val sorted = counts.sortBy(_._2, ascending = false)

      // println (not print) so every pair lands on its own line instead of
      // being concatenated into one unbroken string.
      // NOTE(review): foreach runs per partition; the printed order matches the
      // sort order only while the job runs with a single partition — confirm
      // before running this on a multi-partition or cluster setup.
      sorted.foreach(println)
    } finally {
      // Always release the Spark context, even when a stage above fails.
      sc.stop()
    }
  }
}