import org.apache.spark.{SparkConf, SparkContext}
/**
* @author zoujc
* @date 2018/11/1
*/
/**
 * Demonstrates Spark's `aggregate` / `aggregateByKey` operators on small local RDDs.
 *
 * Runs entirely in local mode (`local[2]`) with two partitions, so the
 * partition-local (seqOp) vs. global (combOp) phases — and how the zero value
 * is applied once per partition AND once in the final combine — are visible
 * in the results noted in the comments.
 */
object SparkRDDTest1 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkRDDTest1").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Create an RDD with exactly two partitions.
    val rdd1 = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7), 2)

    // Tag every element with the ID of the partition it lives in,
    // so partition boundaries can be inspected.
    val func1 = (index: Int, iter: Iterator[Int]) => {
      iter.map(x => s"[partID: $index,val: $x]")
    }

    // Inspect the contents of each partition.
    val res1 = rdd1.mapPartitionsWithIndex(func1)
    // println(res1.collect().toBuffer)

    // aggregate with zero value 0: sum within each partition, then sum the partial sums.
    val res2 = rdd1.aggregate(0)(_ + _, _ + _)
    // println(res2) 28

    // Take the maximum of each partition, then sum the two maxima.
    val res3 = rdd1.aggregate(0)(math.max(_, _), _ + _)
    // println(res3) 10 (= max 3 of part 0 + max 7 of part 1)

    // Zero value 10 is applied once per partition AND once in the final
    // combine, i.e. it is added three times here: 28 + 3 * 10 = 58.
    val res4 = rdd1.aggregate(10)(_ + _, _ + _)
    // println(res4) 58

    // Count how many elements compare >= "e" and how many compare < "e".
    val rdd2 = sc.parallelize(List("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"))
    val (biggerthane, lessthane) = rdd2.aggregate((0, 0))(
      (acc, str) =>
        // seqOp: bump exactly one of the two counters per element.
        if (str.compareTo("e") >= 0) (acc._1 + 1, acc._2)
        else (acc._1, acc._2 + 1),
      // combOp: add the per-partition counter pairs component-wise.
      (x, y) => (x._1 + y._1, x._2 + y._2)
    )
    // println((biggerthane, lessthane)) (6,4)

    // aggregate vs. aggregateByKey: the former works on a plain sequence,
    // the latter on (K, V) pairs, aggregating per key.
    // Prototype (from Spark's PairRDDFunctions):
    // def aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U,
    //     combOp: (U, U) => U): RDD[(K, U)] = self.withScope {
    //   aggregateByKey(zeroValue, defaultPartitioner(self))(seqOp, combOp)
    // }
    // combineByKey:
    // def combineByKey[C](
    //     createCombiner: V => C,
    //     mergeValue: (C, V) => C,
    //     mergeCombiners: (C, C) => C): RDD[(K, C)] = self.withScope {
    //   combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners)(null)
    // }
    // From the source above, aggregateByKey is implemented via combineByKey:
    // seqOp plays the role of mergeValue, combOp plays mergeCombiners, and
    // cleanedSeqOp(createZero(), v) is createCombiner — i.e. the seqOp itself,
    // just seeded with the zeroValue.
    // Therefore, when createCombiner and mergeValue would be the same
    // operation, aggregateByKey is the better fit.

    // String concatenation: the zero value "|" is prepended once per
    // partition and once in the final combine.
    val rdd3 = sc.parallelize(List("a", "b", "c", "d", "e", "f"), 2)
    val res5 = rdd3.aggregate("|")(_ + _, _ + _)
    // println(res5) e.g. "||abc|def"

    // Two partitions: per partition, fold strings into the length of the
    // longest one (as a string), then concatenate the partition results.
    // Combine order is non-deterministic, so "24" or "42" are both possible.
    val rdd4 = sc.parallelize(List("12", "23", "345", "4567"), 2)
    val res6 = rdd4.aggregate("")((x, y) => math.max(x.length, y.length).toString, (x, y) => x + y)
    // println(res6) "24" or "42"

    // Same with min: fixed — this must use rdd5 (previously it re-used rdd4,
    // leaving rdd5 dead). The empty zero value "" makes the first min in each
    // partition 0, so the result is "10" or "01".
    val rdd5 = sc.parallelize(List("12", "23", "345", ""), 2)
    val res7 = rdd5.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)
    // println(res7) "10" or "01"

    // Moving the empty string changes which partition yields "0" first.
    val rdd6 = sc.parallelize(List("12", "23", "", "345"), 2)
    val res8 = rdd6.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)
    // println(res8) "11" or "10" depending on combine order

    // aggregateByKey: partition-local aggregation per key, then global merge per key.
    val pairRDD = sc.parallelize(
      List(("cat", 2), ("cat", 5), ("mouse", 4), ("cat", 12), ("dog", 12), ("mouse", 2)), 2)

    // Partition-inspection helper for (String, Int) pairs.
    def func2(index: Int, iter: Iterator[(String, Int)]): Iterator[String] = {
      iter.map(x => s"[PartID: $index, val: $x]")
    }
    // println(pairRDD.mapPartitionsWithIndex(func2).collect().toBuffer)

    // Per key: take the max within each partition, then sum across partitions.
    val res9 = pairRDD.aggregateByKey(0)(math.max(_, _), _ + _)
    // println(res9.collect().toBuffer)
    // ArrayBuffer((dog,12), (cat,17), (mouse,6))

    // Zero value 10: any per-partition max below 10 is lifted to 10.
    val res10 = pairRDD.aggregateByKey(10)(math.max(_, _), _ + _)
    // println(res10.collect().toBuffer)
    // ArrayBuffer((dog,12), (cat,22), (mouse,20))

    /**
     * pairRDD.aggregateByKey(0)(_ + _, _ + _).collect and
     * pairRDD.reduceByKey(_ + _).collect produce the same result — both are
     * ultimately implemented on top of the same method: combineByKey.
     */

    sc.stop()
  }
}
// Spark advanced operators (part 1)
// Reposted from blog.csdn.net/weixin_38799368/article/details/83620387