1. 复习List的操作
list操作:
val ls1 = List(1)
val ls2 = List(2)
// 追加一个元素
ls1 :+ 2 这个结果为:res15: List[Int] = List(1, 2)
// 追加一个集合
ls1 ++ ls2 这个结果为:res16: List[Int] = List(1, 2)
// 连接两个 List(::: 只能用于 List,结果与 ++ 相同)
ls1 ::: ls2 这个结果为:res17: List[Int] = List(1, 2)
2. combineByKey
val rdd1 = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","wolf","bear","bee"),3)
val rdd2 = sc.parallelize(List(1,2,1,2,1,2,1,2),3)
scala> val rdd3 = rdd2.zip(rdd1).collect
rdd3: Array[(Int, String)] = Array((1,dog), (2,cat), (1,gnu), (2,salmon), (1,rabbit), (2,wolf), (1,bear), (2,bee))
scala> val rdd3 = rdd2.zip(rdd1)
rdd3: org.apache.spark.rdd.RDD[(Int, String)] = ZippedPartitionsRDD2[5] at zip at <console>:27
scala> rdd3.combineByKey(List(_),(a:List[String],b:String)=> a :+ b,(x:List[String],y:List[String])=> x ::: y)
res21: org.apache.spark.rdd.RDD[(Int, List[String])] = ShuffledRDD[6] at combineByKey at <console>:26
scala> res21.collect
res22: Array[(Int, List[String])] = Array((1,List(gnu, rabbit, dog, bear)), (2,List(cat, wolf, bee, salmon)))
说明: (a:List[String],b:String)=> a :+ b 作用是什么?