package com.ws.spark
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object JoinTest {

  /**
   * Demonstrates how `join` on pair RDDs relates to `cogroup`.
   *
   * `cogroup` gathers, for each key, the values from both RDDs into a pair of
   * iterables: (key, (valuesFromLeft, valuesFromRight)). `join` is built on
   * top of `cogroup` and emits the Cartesian product of the two iterables for
   * every key present in BOTH RDDs (inner-join semantics): if the left values
   * are (1, 2) and the right values are (2, 3), the join yields
   * (1,2), (1,3), (2,2), (2,3). Keys missing from either side produce nothing.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("JoinTest")
    val sc = new SparkContext(conf)

    val data1 = List(("1", "baby"), ("1", "baby"), ("2", "dog"), ("3", "fire"), ("4", "mouse"))
    val data2 = List(("5", "rabbit"), ("1", "baby"), ("2", "dog"), ("3", "fire"), ("5", "rabbit"))
    val rdd1 = sc.parallelize(data1)
    val rdd2 = sc.parallelize(data2)

    // join = cogroup + per-key Cartesian product of the two value iterables.
    val resultData: RDD[(String, (String, String))] = rdd1.join(rdd2)

    // cogroup keeps every key from either side; an absent side shows up as an
    // empty iterable (e.g. key "4" exists only in rdd1, key "5" only in rdd2).
    val cogrouped: RDD[(String, (Iterable[String], Iterable[String]))] = rdd1.cogroup(rdd2)

    // Collect to the driver before printing: `for (i <- rdd)` desugars to
    // RDD.foreach, whose println would run on the executors and be invisible
    // on the driver when deployed to a real cluster.
    for ((key, (leftValues, rightValues)) <- cogrouped.collect()) {
      println((key, (leftValues, rightValues)))
      // Reproduce what join does for this key: pair every left value with
      // every right value.
      for (v1 <- leftValues; v2 <- rightValues) {
        println(s"($v1,$v2)")
      }
    }

    println(resultData.collect.toBuffer)
    sc.stop()
  }
}
// Sample output from a local[*] run, kept for reference. Commented out:
// raw console output after the closing brace is not valid Scala and broke
// compilation of this file.
//
// (5,(CompactBuffer(),CompactBuffer(rabbit, rabbit)))
// (2,(CompactBuffer(dog),CompactBuffer(dog)))
// (3,(CompactBuffer(fire),CompactBuffer(fire)))
// (4,(CompactBuffer(mouse),CompactBuffer()))
// (1,(CompactBuffer(baby, baby),CompactBuffer(baby)))
// (baby,baby)
// (dog,dog)
// (fire,fire)
// (baby,baby)
// ArrayBuffer((1,(baby,baby)), (1,(baby,baby)), (2,(dog,dog)), (3,(fire,fire)))