Operator_combineByKey

package com.bjsxt.spark.transformations;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class Operator_combineByKey {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("combineByKey");
        JavaSparkContext sc = new JavaSparkContext(conf);

        /**
         * Input: zhangsan -> 18, 19, 21; lisi -> 20, 23; wangwu -> 22,
         * parallelized into 2 partitions.
         */
        JavaPairRDD<String, Integer> parallelizePairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, Integer>("zhangsan", 18),
                new Tuple2<String, Integer>("zhangsan", 19),
                new Tuple2<String, Integer>("lisi", 20),
                new Tuple2<String, Integer>("zhangsan", 21),
                new Tuple2<String, Integer>("wangwu", 22),
                new Tuple2<String, Integer>("lisi", 23)
        ), 2);
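
        // With 2 slices, the 6-element list is split positionally:
        // partition 0 holds (zhangsan,18), (zhangsan,19), (lisi,20);
        // partition 1 holds (zhangsan,21), (wangwu,22), (lisi,23)
        // (this matches the per-partition traces in the combineByKey comments below).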
        // Print each element together with its partition index; collect() triggers the job.
        parallelizePairs.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<Tuple2<String, Integer>>, Iterator<Tuple2<String, Integer>>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<Tuple2<String, Integer>> call(Integer index,
                    Iterator<Tuple2<String, Integer>> iter) throws Exception {
                List<Tuple2<String, Integer>> list = new ArrayList<Tuple2<String, Integer>>();
                while (iter.hasNext()) {
                    Tuple2<String, Integer> next = iter.next();
                    System.out.println("partition index = " + index + ", value = " + next);
                    list.add(next);
                }
                return list.iterator();
            }

        }, true).collect();

        System.out.println("****************");
        JavaPairRDD<String, String> combineByKey = parallelizePairs.combineByKey(
                new Function<Integer, String>() {

                    /**
                     * createCombiner: the first value seen for a key in the current partition
                     * is passed in as v1; the result is effectively the initial combined value
                     * for that key in this partition.
                     */
                    private static final long serialVersionUID = 1L;

                    @Override
                    public String call(Integer v1) throws Exception {
                        return v1 + "~";
                    }
                }, new Function2<String, Integer, String>() {

                    /**
                     * mergeValue: merges the values of the same key within one partition.
                     * The first argument of call is the combined value so far (initially the
                     * createCombiner result), the second is the next value for that key; the
                     * return value becomes the first argument for the value after that, and so on.
                     *
                     * partition 0:
                     *   (zhangsan,18) ---- 18~
                     *   (zhangsan,19)
                     *   (lisi,20)     ---- 20~
                     *   result of partition 0: (zhangsan,18~!19)
                     *                          (lisi,20~)
                     * partition 1:
                     *   (zhangsan,21) ---- 21~
                     *   (wangwu,22)   ---- 22~
                     *   (lisi,23)     ---- 23~
                     *   result of partition 1: (zhangsan,21~)
                     *                          (wangwu,22~)
                     *                          (lisi,23~)
                     */
                    private static final long serialVersionUID = 1L;

                    @Override
                    public String call(String s, Integer v) throws Exception {
                        return s + "!" + v;
                    }
                }, new Function2<String, String, String>() {

                    /**
                     * mergeCombiners: merges the combined values of the same key from
                     * different partitions.
                     *
                     * partition 0 result: (zhangsan,18~!19)
                     *                     (lisi,20~)
                     * partition 1 result: (zhangsan,21~)
                     *                     (wangwu,22~)
                     *                     (lisi,23~)
                     *
                     * merged result:      (zhangsan,18~!19#21~)
                     *                     (lisi,20~#23~)
                     *                     (wangwu,22~)
                     */
                    private static final long serialVersionUID = 1L;

                    @Override
                    public String call(String s1, String s2) throws Exception {
                        return s1 + "#" + s2;
                    }
                });
        // Print each (key, combined string) pair.
        combineByKey.foreach(new VoidFunction<Tuple2<String, String>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, String> tuple) throws Exception {
                System.out.println(tuple);
            }
        });
        sc.stop();
    }
}
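
Since Function, Function2 and VoidFunction are single-method interfaces, the same job can be written more compactly with Java 8 lambdas. The sketch below is not from the original post; it assumes Java 8+ and the same Spark Java API used above, and the class name Operator_combineByKeyLambda is made up for illustration.

package com.bjsxt.spark.transformations;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class Operator_combineByKeyLambda {   // hypothetical class name, for illustration only
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("combineByKey-lambda");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, Integer>("zhangsan", 18),
                new Tuple2<String, Integer>("zhangsan", 19),
                new Tuple2<String, Integer>("lisi", 20),
                new Tuple2<String, Integer>("zhangsan", 21),
                new Tuple2<String, Integer>("wangwu", 22),
                new Tuple2<String, Integer>("lisi", 23)
        ), 2);

        JavaPairRDD<String, String> combined = pairs.combineByKey(
                v -> v + "~",                // createCombiner
                (c, v) -> c + "!" + v,       // mergeValue (within a partition)
                (c1, c2) -> c1 + "#" + c2);  // mergeCombiners (across partitions)

        combined.foreach(tuple -> System.out.println(tuple));
        sc.stop();
    }
}

Per the per-partition traces in the comments above, both versions print the same combined strings for this input: (zhangsan,18~!19#21~), (lisi,20~#23~) and (wangwu,22~).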

Reposted from blog.csdn.net/weixin_40398774/article/details/80723136