Spark Advanced Operators (Part 1)

import org.apache.spark.{SparkConf, SparkContext}

/**
  * @author zoujc
  * @date 2018/11/1
  */
object SparkRDDTest1 {
   def main(args: Array[String]): Unit = {
      val conf = new SparkConf().setAppName("SparkRDDTest1").setMaster("local[2]")
      val sc = new SparkContext(conf)

      // Explicitly split the data into two partitions
      val rdd1 = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7), 2)
      // Define a function that tags each value with the index (ID) of the partition it lives in
      val func1 = (index: Int, iter: Iterator[Int]) => {
         iter.toList.map(x => s"[partID: $index,val: $x]").iterator
      }

      // Inspect the contents of each partition
      val res1 = rdd1.mapPartitionsWithIndex(func1)
      //    println(res1.collect().toBuffer)
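      //    expected layout: partition 0 holds 1, 2, 3 and partition 1 holds 4, 5, 6, 7,
      //    because parallelize splits the 7 elements into two roughly equal index ranges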

      // Aggregate rdd1 with an initial value: sum inside each partition first, then sum the partial results globally
      val res2 = rdd1.aggregate(0)(_ + _, _ + _)
      //    println(res2)
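      //    partition 0: 0 + 1 + 2 + 3 = 6, partition 1: 0 + 4 + 5 + 6 + 7 = 22, global: 0 + 6 + 22 = 28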

      // Find the maximum of each partition, then sum those maxima
      val res3 = rdd1.aggregate(0)(math.max(_, _),(_ + _))
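      //    max of partition 0 is 3, max of partition 1 is 7, so res3 = 0 + 3 + 7 = 10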
      // Every partition starts from the initial value 10, and 10 is applied once more in the global combine, i.e. 3 times in total
      val res4 = rdd1.aggregate(10)(_ + _, _ + _)
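      //    partition 0: 10 + 1 + 2 + 3 = 16, partition 1: 10 + 4 + 5 + 6 + 7 = 32, global: 10 + 16 + 32 = 58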
      // Count how many elements in the list compare greater than or equal to "e" and how many compare less than "e"
      val rdd2 = sc.parallelize(List("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"))
      val (biggerthane, lessthane) = rdd2.aggregate((0, 0))(
         (ee, str) => {
            var biggere = ee._1
            var lesse = ee._2
            if (str.compareTo("e") >= 0) biggere = ee._1 + 1
            else if (str.compareTo("e") < 0) lesse = ee._2 + 1
            (biggere, lesse)
         },
         (x, y) => (x._1 + y._1, x._2 + y._2)
      )
      //    println((biggerthane,lessthane))
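      //    "e" itself satisfies compareTo("e") >= 0, so the result is (6, 4):
      //    e, f, g, h, i, j on one side and a, b, c, d on the other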

      // Difference between aggregate and aggregateByKey: the former operates on a plain sequence of values,
      // the latter on (key, value) pairs.
      // Prototype (from Spark's PairRDDFunctions):
      //    def aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U,
      //       combOp: (U, U) => U): RDD[(K, U)] = self.withScope {
      //       aggregateByKey(zeroValue, defaultPartitioner(self))(seqOp, combOp)
      //    }

      //combineByKey
//    def combineByKey[C](
//       createCombiner: V => C,
//       mergeValue: (C, V) => C,
//       mergeCombiners: (C, C) => C): RDD[(K, C)] = self.withScope {
//       combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners)(null)
//    }
      // The source above makes it clear that aggregateByKey is implemented on top of combineByKey:
      // the seqOp function plays the role of mergeValue, combOp plays the role of mergeCombiners,
      // and cleanedSeqOp(createZero(), v) is the createCombiner, i.e. the seqOp that was passed in,
      // just with zeroValue as one of its arguments.
      // So when createCombiner and mergeValue would do the same thing, aggregateByKey is the better fit.

      val rdd3 = sc.parallelize(List("a","b","c","d","e","f"),2)
      val res5 = rdd3.aggregate("|")(_ + _, _ + _)
//    println(res5)
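      //    partition 0 gives "|abc", partition 1 gives "|def", and the global combine prepends "|" again,
      //    so res5 is "||abc|def" or "||def|abc" (the order of the partition results is not deterministic)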

      val rdd4 = sc.parallelize(List("12","23","345","4567"),2)
      // Two partitions: within each partition keep the maximum string length (as a string), then concatenate the partition results
      val res6 = rdd4.aggregate("")((x,y) => math.max(x.length,y.length).toString, (x,y) => x + y)
//    println(res6)   24
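      //    partition 0 ("12", "23") reduces to "2", partition 1 ("345", "4567") reduces to "4",
      //    so res6 is "24" or "42" depending on which partition result is combined first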

      val rdd5 = sc.parallelize(List("12", "23", "345", ""), 2)
      val res7 = rdd5.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)
//    println(res7)   "10" or "01"
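      //    partition 0 ("12", "23"): min(0, 2) = 0 -> "0", then min(1, 2) = 1 -> "1"
      //    partition 1 ("345", ""):  min(0, 3) = 0 -> "0", then min(1, 0) = 0 -> "0"
      //    hence "10" or "01"; the trailing empty string pins the second partition at length 0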

      val rdd6 = sc.parallelize(List("12", "23", "", "345"), 2)
      val res8 = rdd6.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)
//    println(res8)   11
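      //    partition 0 ("12", "23") again yields "1"; partition 1 ("", "345"): min(0, 0) = 0 -> "0",
      //    then min(1, 3) = 1 -> "1", so both partitions end at "1" and res8 is "11"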

      // aggregateByKey also works per partition (locally) first and then combines the partial results globally.
      val pairRDD = sc.parallelize(List(("cat",2), ("cat",5), ("mouse", 4), ("cat", 12), ("dog", 12), ("mouse", 2)),2)

      def func2(index: Int, iter: Iterator[(String,Int)]): Iterator[String] ={
         iter.toList.map(x => s"[PartID: $index, val: $x]").iterator
      }

      println(pairRDD.mapPartitionsWithIndex(func2).collect().toBuffer)
      // For each key, take the largest count within each partition, then add the per-partition maxima
      val res9 = pairRDD.aggregateByKey(0)(math.max(_, _), _ + _)
//    println(res9.collect().toBuffer)
//    ArrayBuffer((dog,12), (cat,17), (mouse,6))
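      //    cat: max(0, 2, 5) = 5 in partition 0 and max(0, 12) = 12 in partition 1, so 5 + 12 = 17
      //    mouse: 4 + 2 = 6; dog: 12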

      // With an initial value of 10, any per-partition maximum below 10 is lifted to 10
      val res10 = pairRDD.aggregateByKey(10)(math.max(_, _), _ + _)
//    println(res10.collect().toBuffer)
//    ArrayBuffer((dog,12), (cat,22), (mouse,20))
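      //    the zeroValue 10 only takes part in the per-partition seqOp, not in the global combine,
      //    so cat: 10 + 12 = 22, mouse: 10 + 10 = 20, while dog (present in a single partition) stays at 12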

      /**
        * pairRDD.aggregateByKey(0)(_ + _, _ + _).collect and pairRDD.reduceByKey(_ + _).collect
        * return the same result; under the hood both end up calling the same method: combineByKey.
        */
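      // A minimal sketch (not part of the original post) of the equivalence described above,
      // expressed directly with combineByKey; the three lambdas mirror res9's
      // aggregateByKey(0)(math.max(_, _), _ + _).
      val res11 = pairRDD.combineByKey(
         (v: Int) => math.max(0, v),           // createCombiner: the seqOp applied to (zeroValue, v)
         (c: Int, v: Int) => math.max(c, v),   // mergeValue: the seqOp
         (c1: Int, c2: Int) => c1 + c2)        // mergeCombiners: the combOp
//    println(res11.collect().toBuffer) should match res9: ArrayBuffer((dog,12), (cat,17), (mouse,6))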
   }
}

Reposted from blog.csdn.net/weixin_38799368/article/details/83620387