The difference: map operates on each element of an RDD and its result is strictly one-to-one (exactly one output element per input element), while flatMap also operates on each element of the RDD but its result can be one-to-one or one-to-many; the outputs are then flattened into a single result RDD.
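A minimal sketch of the contrast, run in local mode; the sample data and class name here are made up for illustration (Spark 2.x Java API assumed, where flatMap's function returns an Iterator):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;

public class MapVsFlatMap {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("mapVsFlatMap").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a,b", "c"));

        // map: exactly one output element per input element,
        // so the result still has 2 elements (each a String[]).
        JavaRDD<String[]> mapped = lines.map(s -> s.split(","));
        System.out.println("map count: " + mapped.count());        // 2

        // flatMap: each input element may expand into many outputs,
        // flattened into one RDD of 3 elements: a, b, c.
        JavaRDD<String> flattened = lines.flatMap(s -> Arrays.asList(s.split(",")).iterator());
        System.out.println("flatMap count: " + flattened.count()); // 3

        sc.close();
    }
}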
For example, in the introductory Spark word-count case, splitting lines into words must use flatMap, because the split produces more elements than the input; map will not work there, since it always emits exactly one output per input:
package com.lilei.rdd;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class WordCount {
    public static void main(String[] args) {
        // For local testing, set a master explicitly:
        // SparkConf conf = new SparkConf()
        //         .setAppName("javaWC").setMaster("local[2]");
        SparkConf conf = new SparkConf()
                .setAppName("javaWC");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Read the input from HDFS; each RDD element is one line of text.
        JavaRDD<String> lines = sc.textFile("hdfs://192.168.xx.xxx:9000/wcdemo/wcdemo.txt");

        // flatMap: one line can produce many words, so the output RDD
        // has more elements than the input (map could not do this).
        JavaRDD<String> words = lines.flatMap(
                new FlatMapFunction<String, String>() {
                    @Override
                    public Iterator<String> call(String s) throws Exception {
                        return Arrays.asList(s.split(",")).iterator();
                    }
                }
        );

        // Map each word to a (word, 1) pair.
        JavaPairRDD<String, Long> ones = words.mapToPair(
                new PairFunction<String, String, Long>() {
                    @Override
                    public Tuple2<String, Long> call(String s) throws Exception {
                        return new Tuple2<>(s, 1L);
                    }
                }
        );

        // Sum the 1s per word.
        JavaPairRDD<String, Long> counts = ones.reduceByKey(
                new Function2<Long, Long, Long>() {
                    @Override
                    public Long call(Long v1, Long v2) throws Exception {
                        return v1 + v2;
                    }
                }
        );

        // Bring the results back to the driver and print them.
        List<Tuple2<String, Long>> collect = counts.collect();
        for (Tuple2<?, ?> tuple : collect) {
            System.out.println(tuple._1() + ": " + tuple._2());
        }
        sc.close();
    }
}
Running the demo prints each word and its count, one "word: count" pair per line.
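For reference, the same job can be written more compactly with Java 8 lambdas, since Spark's Java function interfaces (FlatMapFunction, PairFunction, Function2) are functional interfaces. A minimal sketch under the same assumptions as above (Spark 2.x, the placeholder HDFS path reused from the example):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class WordCountLambda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("javaWC");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("hdfs://192.168.xx.xxx:9000/wcdemo/wcdemo.txt");

        JavaPairRDD<String, Long> counts = lines
                .flatMap(line -> Arrays.asList(line.split(",")).iterator()) // one line -> many words
                .mapToPair(word -> new Tuple2<>(word, 1L))                  // word -> (word, 1)
                .reduceByKey((v1, v2) -> v1 + v2);                          // sum counts per word

        counts.collect().forEach(t -> System.out.println(t._1() + ": " + t._2()));
        sc.close();
    }
}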