A join/cogroup example

package com.ws.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object JoinTest {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("JoinTest")

    val sc = new SparkContext(conf)

    val data1 = List(("1", "baby"), ("1", "baby"), ("2", "dog"), ("3", "fire"), ("4", "mouse"))
    val data2 = List(("5", "rabbit"), ("1", "baby"), ("2", "dog"), ("3", "fire"), ("5", "rabbit"))

    val RDD1 = sc.parallelize(data1)
    val RDD2 = sc.parallelize(data2)

    // Under the hood, the join operator is built on cogroup: for each key it
    // computes the Cartesian product of the two sides' matched values.
    val resultData: RDD[(String, (String, String))] = RDD1.join(RDD2)

    // cogroup: for each key appearing in either RDD, collect that key's values
    // from both sides into two iterators -- the first holds the values from the
    // first RDD, the second holds the values from the second RDD -- and emit a
    // tuple (key, (values1, values2)), where the key is unchanged.
    // join then pairs every element of values1 with every element of values2:
    // if values1 is (1, 2) and values2 is (2, 3), the join result for that key
    // is (1,2), (1,3), (2,2), (2,3).
    // If a key is missing from either side, it produces no join output at all.
    val value: RDD[(String, (Iterable[String], Iterable[String]))] = RDD1.cogroup(RDD2)

    // Iterate the cogroup result; for each key, print the grouped tuple and then
    // the Cartesian product of the two value iterators, which reproduces join.
    // (println inside an RDD action runs on the executors; fine in local mode.)
    for (i <- value) {
      println(i)
      for (j <- i._2._1; k <- i._2._2) {
        println(s"($j,$k)")
      }
    }

    // Collect and print the join result for comparison.
    println(resultData.collect.toBuffer)

    sc.stop()
  }

}

Sample output (ordering may vary between runs):

(5,(CompactBuffer(),CompactBuffer(rabbit, rabbit)))
(2,(CompactBuffer(dog),CompactBuffer(dog)))
(3,(CompactBuffer(fire),CompactBuffer(fire)))
(4,(CompactBuffer(mouse),CompactBuffer()))
(1,(CompactBuffer(baby, baby),CompactBuffer(baby)))
(baby,baby)
(dog,dog)
(fire,fire)
(baby,baby)
ArrayBuffer((1,(baby,baby)), (1,(baby,baby)), (2,(dog,dog)), (3,(fire,fire)))
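The comments above say that join is built on cogroup plus a per-key Cartesian product. A minimal sketch of that idea (an illustration of the technique, not Spark's actual source; joinViaCogroup is a hypothetical helper name):

import org.apache.spark.rdd.RDD
import scala.reflect.ClassTag

// Sketch: express join in terms of cogroup + flatMapValues.
// For each key, pair every left value with every right value; a key missing
// from either side yields an empty iterator and therefore no output pairs.
def joinViaCogroup[K: ClassTag, V: ClassTag, W](left: RDD[(K, V)],
                                                right: RDD[(K, W)]): RDD[(K, (V, W))] =
  left.cogroup(right).flatMapValues { case (vs, ws) =>
    for (v <- vs; w <- ws) yield (v, w)
  }

Calling joinViaCogroup(RDD1, RDD2) on the RDDs above would produce the same pairs as RDD1.join(RDD2).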

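As noted in the comments, a plain join silently drops keys that exist on only one side (keys "4" and "5" above). Spark's standard outer-join operators keep them by wrapping the possibly-missing side in Option; a brief sketch using the same RDD1 and RDD2:

// leftOuterJoin keeps every key from RDD1; the right value is None where
// RDD2 has no match, e.g. (4,(mouse,None)).
val leftJoined: RDD[(String, (String, Option[String]))] = RDD1.leftOuterJoin(RDD2)

// fullOuterJoin keeps keys from both sides, wrapping both values in Option.
val fullJoined: RDD[(String, (Option[String], Option[String]))] = RDD1.fullOuterJoin(RDD2)

println(leftJoined.collect.toBuffer)
println(fullJoined.collect.toBuffer)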
Reposted from blog.csdn.net/bb23417274/article/details/84728651