// Operators demonstrated in this file: join, cogroup, cartesian, coalesce, repartition
package com.paic.Spark;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.*;
/**
* Created by Xlucas on 2018/4/8.
*/
public class SparkDemo3 {
    /**
     * Demonstrates pair-RDD operators on a local Spark context:
     * {@code join}, {@code cogroup}, {@code cartesian}, {@code coalesce}
     * and {@code repartition}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args){
        SparkConf conf=new SparkConf();
        conf.setAppName("SparkDemo3");
        conf.setMaster("local[2]");
        JavaSparkContext sc=new JavaSparkContext(conf);
        try {
            sc.setLogLevel("ERROR");
            // A pair RDD is built from a List of Tuple2; see the signature:
            // def parallelizePairs[K, V](list : java.util.List[scala.Tuple2[K, V]])
            List<Tuple2<String,String>> tuple=new ArrayList<>();
            tuple.add(new Tuple2<>("1","xlucas"));
            tuple.add(new Tuple2<>("2","xlucas2"));
            tuple.add(new Tuple2<>("3","xlucas3"));
            JavaPairRDD<String,String> rdd3=sc.parallelizePairs(tuple);
            List<Tuple2<String,String>> tuple1=new ArrayList<>();
            tuple1.add(new Tuple2<>("1","xlucas"));
            tuple1.add(new Tuple2<>("2","xlucas2"));
            tuple1.add(new Tuple2<>("4","xlucas4"));
            JavaPairRDD<String,String> rdd4=sc.parallelizePairs(tuple1);
            // join requires pair RDDs; it matches elements by key and returns
            // JavaPairRDD[K, Tuple2[V, W]].
            // Expected output: [(2,(xlucas2,xlucas2)), (1,(xlucas,xlucas))]
            System.out.println(rdd3.join(rdd4).collect());
            // cogroup behaves like a full outer join and returns a pair RDD of
            // JavaPairRDD[K, Tuple2[Iterable[V], Iterable[W]]].
            // Expected output:
            // [(4,([],[xlucas4])), (2,([xlucas2],[xlucas2])), (3,([xlucas3],[])), (1,([xlucas],[xlucas]))]
            System.out.println(rdd3.cogroup(rdd4).collect());
            // cartesian returns the Cartesian product as a JavaPairRDD[T, U].
            System.out.println(rdd3.cartesian(rdd4).collect());
            // FIX: coalesce/repartition are lazy transformations that return a NEW
            // RDD. The original code discarded the results, so both calls were
            // no-ops. Capture the returned RDDs and print their partition counts
            // so the effect is observable.
            // Note: coalesce(n) without shuffle can only DECREASE the partition
            // count; asking for more partitions than the RDD has is a no-op.
            JavaPairRDD<String,String> coalesced = rdd4.coalesce(5);
            System.out.println("after coalesce(5): " + coalesced.getNumPartitions() + " partitions");
            // repartition(n) is coalesce(n, shuffle = true): it can increase the
            // partition count, at the cost of a full shuffle over the network.
            JavaPairRDD<String,String> repartitioned = rdd4.repartition(5);
            System.out.println("after repartition(5): " + repartitioned.getNumPartitions() + " partitions");
        } finally {
            // Always release the local Spark context, even if a job fails.
            sc.stop();
        }
    }
}