SparkRDD之combineByKey

combineByKey 是 Spark 中比较核心的高级函数,groupByKey、reduceByKey 等其他键值对聚合函数在底层都是基于它实现的。combineByKey 作用在键值对 RDD 上,按键对 RDD 中的值进行合并。

Java 示例:

package com.cb.spark.sparkrdd;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;

/**
 * Minimal demonstration of {@code JavaPairRDD.combineByKey}.
 *
 * <p>Nine words are zipped with nine integer keys (1 or 2) across three
 * partitions, and the words are then collected into one {@code List<String>}
 * per key using the three callbacks that {@code combineByKey} requires:
 * createCombiner, mergeValue and mergeCombiners.
 */
public class CombineByKeyExample {
	/**
	 * Runs the example on a local Spark master and prints each
	 * {@code (key, [words...])} pair to standard output.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("CombineByKey").setMaster("local");

		// try-with-resources: JavaSparkContext is Closeable and close() stops the
		// context, so it is released even when one of the Spark calls below throws.
		try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
			List<String> words = Arrays.asList(
					"dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee");
			List<Integer> keys = Arrays.asList(1, 1, 2, 2, 2, 1, 2, 2, 2);

			// zip pairs elements positionally; both RDDs hold 9 elements in 3 partitions.
			JavaRDD<String> wordRDD = jsc.parallelize(words, 3);
			JavaRDD<Integer> keyRDD = jsc.parallelize(keys, 3);
			JavaPairRDD<Integer, String> keyedWords = keyRDD.zip(wordRDD);

			// createCombiner: String -> List<String>. Invoked for the first value seen
			// for a given key inside a partition; starts that key's accumulator list.
			Function<String, List<String>> createCombiner = value -> {
				List<String> combined = new ArrayList<>();
				combined.add(value);
				return combined;
			};

			// mergeValue: (List<String>, String) -> List<String>. Appends each further
			// value of the same key in the same partition to the list produced above.
			Function2<List<String>, String, List<String>> mergeValue = (combined, value) -> {
				combined.add(value);
				return combined;
			};

			// mergeCombiners: (List<String>, List<String>) -> List<String>. Merges the
			// per-partition lists of one key into the final list for that key.
			Function2<List<String>, List<String>, List<String>> mergeCombiners = (left, right) -> {
				left.addAll(right);
				return left;
			};

			JavaPairRDD<Integer, List<String>> wordsByKey =
					keyedWords.combineByKey(createCombiner, mergeValue, mergeCombiners);

			// Output reported by the original author:
			// (1,[dog, cat, turkey])
			// (2,[gnu, salmon, rabbit, wolf, bear, bee])
			wordsByKey.foreach(x -> System.out.print(x + " "));
		}
	}
}

猜你喜欢

转载自blog.csdn.net/u013230189/article/details/81698724