Description
- De-duplicates the elements in the RDD and returns a new RDD.
- By default, the new RDD has the same number of partitions as the old RDD.
- This operation triggers a shuffle.
Function signature
Code
// Demonstrates RDD.distinct: prints each partition's contents before and
// after de-duplication. distinct performs a shuffle; here it also changes
// the partition count from 2 to 3.
val conf: SparkConf = new SparkConf().setAppName(this.getClass.getName).setMaster("local[*]")
val sc = new SparkContext(conf)

// Source data intentionally contains duplicates (9, 7, 3), split across 2 partitions.
val rdd: RDD[Int] = sc.makeRDD(List(1, 3, 5, 7, 9, 9, 7, 3), 2)

// Prints each partition's index and elements.
// NOTE: `datas` is an Iterator and may only be traversed once — mkString
// consumes it. The original code returned the already-consumed iterator,
// which would make the resulting RDD's partitions empty; we materialize
// the elements to a List first and return a fresh iterator instead.
def printPartitions(target: RDD[Int]): Unit =
  target.mapPartitionsWithIndex { (index, datas) =>
    val elems = datas.toList
    println(index + "----->" + elems.mkString(","))
    elems.iterator
  }.collect()

println("-----------------去重前-------------------")
printPartitions(rdd)

println("-----------------去重后-------------------")
// distinct(3): de-duplicate and repartition into 3 partitions (shuffle).
val newRDD: RDD[Int] = rdd.distinct(3)
printPartitions(newRDD)

sc.stop()