一、aggregateByKey

/**
     * Transformation: aggregateByKey
     * Applies to pair RDDs (RDD[(K, V)]).
     *
     * Signature:
     *   aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U, combOp: (U, U) => U)
     *
     * An aggregation by key, in the same family as reduceByKey and foldByKey:
     *   - Within each partition, the values of every key are folded with seqOp,
     *     starting from zeroValue.
     *   - After all partitions are done, the per-partition results that share a key
     *     are merged with combOp. zeroValue is NOT used in this second phase.
     *
     * zeroValue: initial accumulator for the in-partition fold
     * seqOp:     in-partition aggregation logic
     * combOp:    cross-partition merge logic
     */
    @Test def aggregateByKeyTest(): Unit = {
        // 1. Prepare the data: six key/value pairs spread over 3 partitions.
        val records = Array(("贝吉塔", 3), ("贝吉塔", 4), ("卡卡罗特", 5), ("樱木花道", 6), ("卡卡罗特", 7), ("贝吉塔", 8))
        val pairs: RDD[(String, Int)] = sc.parallelize(records, 3)

        // 2. Aggregate: per partition take max(5, values...) for each key,
        //    then sum those per-partition maxima across partitions.
        val aggregated: RDD[(String, Int)] = pairs.aggregateByKey(5)(
            (acc, value) => Math.max(acc, value),
            (left, right) => left + right
        )

        aggregated.foreach(println)
    }

二、combineByKey

/**
     * Transformation: combineByKey
     * Applies to pair RDDs (RDD[(K, V)]).
     *
     * Signature:
     *   combineByKey[C](createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C)
     *
     *   - createCombiner:
     *     Called the first time a key is seen while walking through a partition;
     *     it builds the initial value of the custom accumulator from that first value.
     *
     *   - mergeValue:
     *     Called when the key has already been seen in the current partition (so an
     *     accumulator exists); folds the next value into that accumulator.
     *
     *   - mergeCombiners:
     *     The same key may appear in several partitions. Once every partition has been
     *     aggregated, the accumulators of equal keys are merged with this function.
     */
    @Test def combineByKeyTest(): Unit = {
        // 1. Prepare the data: (subject, score) pairs in 2 partitions.
        val scores: RDD[(String, Int)] = sc.parallelize(
            Array(("chinese", 90), ("math", 97), ("math", 90), ("english", 87), ("math", 88), ("chinese", 98)),
            2
        )

        // 2. For every subject compute (sum of scores, number of scores).
        val createCombiner: Int => (Int, Int) =
            score => (score, 1)
        val mergeValue: ((Int, Int), Int) => (Int, Int) = {
            case ((sum, count), score) => (sum + score, count + 1)
        }
        val mergeCombiners: ((Int, Int), (Int, Int)) => (Int, Int) = {
            case ((sumA, countA), (sumB, countB)) => (sumA + sumB, countA + countB)
        }
        val totals: RDD[(String, (Int, Int))] =
            scores.combineByKey(createCombiner, mergeValue, mergeCombiners)

        // 3. Print the result.
        totals.foreach(println)
    }

三、sortByKey

/**
     * Transformation: sortByKey
     * Applies to pair RDDs (RDD[(K, V)]).
     * Sorts the records by key.
     *
     *   - ascending:     true => ascending order, false => descending order
     *   - numPartitions: number of partitions of the sorted RDD; defaults to the
     *                    partition count of the source RDD
     */
    @Test def sortByKeyTest(): Unit = {
        // 1. Prepare the data: key every word by its length.
        val words = Array("jerry", "tomtomtom", "hank", "jaifei", "snoopy")
        val byLength: RDD[(Int, String)] = sc.parallelize(words, 3).keyBy(word => word.length)

        // 2. Sort by key, descending, into a single partition.
        val sortedDesc: RDD[(Int, String)] = byLength.sortByKey(ascending = false, numPartitions = 1)
        sortedDesc.foreach(println)
    }

四、sortBy

/**
     * Transformation: sortBy
     * Sorts the elements of an RDD by a key computed from each element.
     *
     *   - f:             function that extracts the sort key from an element
     *   - ascending:     true => ascending order, false => descending order
     *   - numPartitions: number of partitions of the sorted RDD; defaults to the
     *                    partition count of the source RDD
     */
    @Test def sortByTest(): Unit = {
        // 1. Prepare the data.
        val rdd: RDD[String] = sc.parallelize(Array("tom", "jerry", "snoppy", "hank", "kitty"))

        // 2. Sort by string length, ascending, into a single partition so that
        //    foreach visits the elements in sorted order.
        val rdd2: RDD[String] = rdd.sortBy(_.length, ascending = true, 1)
        rdd2.foreach(println)

        // 3. Sort (word, length) pairs by length, descending (the key is negated).
        val rdd3: RDD[(String, Int)] = rdd.map(s => (s, s.length))
        val res: RDD[(String, Int)] = rdd3.sortBy(-_._2)
        // This sort keeps the default partition count, so foreach(println) could run the
        // partitions in any order and hide the sort. Collect to the driver first:
        // collect() returns the partitions in order, which preserves the global ordering.
        res.collect().foreach(println)
    }

五、union

/**
     * Transformation: union
     * Concatenates two RDDs; duplicates are NOT removed.
     * The resulting RDD has as many partitions as both inputs together
     * (here 2 + 3).
     */
    @Test def unionTest(): Unit = {
        val left: RDD[Int] = sc.parallelize(Array(1, 2, 3, 3, 4, 5, 5), 2)
        val right: RDD[Int] = sc.parallelize(Array(2, 3, 3, 4, 4, 5, 6), 3)

        // Union through the named method (not printed; kept to show the spelling).
        val viaUnion: RDD[Int] = left union right
        // viaUnion.foreach(print)
        // println(viaUnion.getNumPartitions)

        // Same operation through the `++` operator alias.
        val viaOperator: RDD[Int] = left ++ right
        viaOperator.foreach(println)
        println(viaOperator.getNumPartitions)
    }

六、distinct

/**
     * Transformation: distinct
     * Removes duplicate elements from an RDD.
     * An optional argument sets the number of partitions of the result.
     */
    @Test def distinctTest(): Unit = {
        val numbers: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6))

        // Variant without arguments:
        // val unique: RDD[Int] = numbers.distinct()
        // unique.foreach(println)

        // De-duplicate and place the result into 2 partitions.
        val uniqueInTwo: RDD[Int] = numbers.distinct(numPartitions = 2)
        println(uniqueInTwo.getNumPartitions)
        uniqueInTwo.foreach(println)
    }

七、intersection

/**
     * Transformation: intersection
     * Computes the intersection of two RDDs, i.e. the elements present in both.
     */
    @Test def intersectionTest(): Unit = {
        val first: RDD[Int] = sc.parallelize(Array(1, 2, 3, 3, 4, 5, 6))
        val second: RDD[Int] = sc.parallelize(Array(2, 3, 3, 4, 7, 8, 9))

        // Elements common to both RDDs.
        val common: RDD[Int] = first intersection second

        common.foreach(println)
    }

八、join

    /**
     * Transformation: join (and its outer variants)
     * Applies to pair RDDs (RDD[(K, V)]).
     * Similar to a SQL join: records of the two RDDs are connected by key, and for
     * each key every left value is paired with every right value (a Cartesian
     * product per key).
     *
     * In the outer variants the side that may be missing is wrapped in Option.
     */
    @Test def joinTest(): Unit = {
        // 1. Prepare the data: both sides are keyed by string length.
        val names: RDD[(Int, String)] =
            sc.parallelize(Array("Tom", "Jerry", "诸葛亮亮", "上官婉儿")).keyBy(name => name.length)
        val subjects: RDD[(Int, String)] =
            sc.parallelize(Array("chinese", "math", "ABCD")).keyBy(subject => subject.length)

        // 2. Inner join: only keys present on both sides.
        val inner: RDD[(Int, (String, String))] = names join subjects
        inner.foreach(println)

        // 3. Left outer join: every left record; the right side is optional.
        val leftOuter: RDD[(Int, (String, Option[String]))] = names leftOuterJoin subjects
        leftOuter.foreach(println)

        // 4. Right outer join: every right record; the left side is optional.
        val rightOuter: RDD[(Int, (Option[String], String))] = names rightOuterJoin subjects
        rightOuter.foreach(println)

        // 5. Full outer join: every record of both sides; both sides are optional.
        val fullOuter: RDD[(Int, (Option[String], Option[String]))] = names fullOuterJoin subjects
        fullOuter.foreach(println)
    }

九、repartition

    /**
     * Transformation: repartition
     * Redistributes an RDD over a new number of partitions
     * (repartition is the better fit for INCREASING the partition count).
     */
    @Test def repartitionTest(): Unit = {
        // 1. Build an RDD from a collection (2 partitions) and print "partition => element".
        val source: RDD[Int] = sc.parallelize(1 to 20, 2)
        source
            .mapPartitionsWithIndex((index, elements) => elements.map(value => s"$index => $value"))
            .foreach(println)

        // 2. Repartition into 4 partitions and print the new layout.
        val widened: RDD[Int] = source.repartition(4)
        widened
            .mapPartitionsWithIndex((index, elements) => elements.map(value => s"$index => $value"))
            .foreach(println)
    }

十、coalesce

    /**
     * Transformation: coalesce
     * Redistributes an RDD over a new number of partitions.
     *
     * repartition is implemented on top of coalesce:
     *   - Increasing the partition count requires a shuffle; without one the
     *     count cannot grow.
     *   - Decreasing the partition count works with or without a shuffle.
     *
     * repartition:
     *   - always forces a shuffle through coalesce, so it suits increasing partitions
     * coalesce:
     *   - lets the caller choose whether to shuffle (default: no shuffle), so it suits
     *     decreasing partitions
     */
    @Test def coalesceTest(): Unit = {
        // 1. Build an RDD from a collection (4 partitions) and print "partition => element".
        val source: RDD[Int] = sc.parallelize(1 to 20, 4)
        source
            .mapPartitionsWithIndex((index, elements) => elements.map(value => s"$index => $value"))
            .foreach(println)

        // 2. Shrink to 2 partitions, explicitly requesting a shuffle, and print the new layout.
        val narrowed: RDD[Int] = source.coalesce(2, shuffle = true)
        narrowed
            .mapPartitionsWithIndex((index, elements) => elements.map(value => s"$index => $value"))
            .foreach(println)
    }

十一、cogroup

    /**
     * Transformation: cogroup
     * Applies to pair RDDs (RDD[(K, V)]).
     * Groups the data of several RDDs by key: for every key, the values from each
     * participating RDD are gathered into their own collection.
     */
    @Test def cogroupTest(): Unit = {
        val rdd1: RDD[(String, Int)] = sc.parallelize(Array(("chinese", 90), ("chinese", 87), ("chinese", 100), ("math", 99), ("math", 80)))
        val rdd2: RDD[(String, Int)] = sc.parallelize(Array(("chinese", 89), ("chinese", 76), ("math", 100), ("math", 76), ("math", 57)))

        // Cogroup one RDD with one other RDD: one Iterable per input RDD.
        val res1: RDD[(String, (Iterable[Int], Iterable[Int]))] = rdd1.cogroup(rdd2)
        res1.foreach(println)

        // Cogroup with three other RDDs: four Iterables per key.
        val res2: RDD[(String, (Iterable[Int], Iterable[Int], Iterable[Int], Iterable[Int]))] = rdd1.cogroup(rdd2, rdd2, rdd2)
        // Transformations are lazy: without an action this cogroup would never run.
        res2.foreach(println)
    }

十二、sample

    /**
     * Transformation: sample
     * Draws a random sample from an RDD.
     *
     *   - withReplacement: whether an element may be picked more than once
     *   - fraction:        expected size of the sample as a fraction of the RDD's size
     *
     * No seed is passed here, so the sampled elements can differ between runs.
     */
    @Test def sampleTest(): Unit = {
        // 1. Prepare an RDD holding 1..1000.
        val population: RDD[Int] = sc.parallelize(1 to 1000)

        // 2. Sample roughly 10% of the elements, without replacement.
        val sampled: RDD[Int] = population.sample(withReplacement = false, fraction = 0.1)

        sampled.foreach(println)
    }

RDD编程-RDD算子的使用（二）

一、aggregateByKey

二、combineByKey

三、sortByKey

四、sortBy

五、union

六、distinct

七、intersection

八、join

九、repartition

十、coalesce

十一、cogroup

十二、sample

Guess you like