// Copyright notice: original article by the author; reproduction without permission is prohibited.
// Source: https://blog.csdn.net/GUANYAQI1996/article/details/78631979
/**
 * Demo of common Spark RDD transformations and actions against a local master.
 *
 * Each method is self-contained; `main` enables one of them at a time.
 * Input paths (e.g. "/user-logs-large.txt") are whitespace-separated log lines;
 * assumed layout per line: user, action, ip — TODO confirm against the data file.
 */
object RddTest {

  /** Entry point: builds a local SparkContext, runs one demo, then stops it. */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("rdd test")
    val sc = new SparkContext(conf)
    //mapTest(sc)
    //val file= sc.textFile("/user-logs-large.txt")
    //file.flatMap(_.split("\\s")).foreach(println)
    //distinctTest(sc)
    // filterTest(sc)
    // keyByTest(sc)
    //sortByTest(sc)
    // rePartitionTest(sc)
    groupBy(sc)
    sc.stop()
  }

  /** Demonstrates map, mapPartitions and flatMap. */
  def mapTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt")

    // Element-wise transform: each log line -> (user, action) pair.
    val mapResult = file.map { line =>
      val info = line.split("\\s")
      (info(0), info(1))
    }
    //mapResult.take(10).foreach(println)

    // Per-partition transform. FIX: the original reused one partition-wide
    // `var info` across iterations of the lazy iterator; use a per-element
    // `val` so no mutable state is shared between yielded elements.
    val mapPartitionResult = file.mapPartitions { part =>
      for (line <- part) yield {
        val info = line.split("\\s")
        (info(0), info(1))
      }
    }
    //mapPartitionResult.take(10).foreach(println)

    // Expand each "new_tweet" record into two "login"-style records;
    // every other record passes through unchanged.
    val flatMapResult = file.flatMap { line =>
      val info = line.split("\\s")
      info(0) match {
        case "new_tweet" => for (i <- 1 to 2) yield s"${info(0)} login ${info(2)}"
        case _           => Array(line)
      }
    }
    //flatMapResult.take(10).foreach(println)
  }

  /** Deduplicates user names (first field) with distinct(). */
  def distinctTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt", 3)
    // NOTE(review): splits on "\t" while every sibling method splits on "\s";
    // left unchanged, but verify the file's actual delimiter.
    val userRdd = file.map(_.split("\\t")(0)).distinct()
    userRdd.foreach(println)
  }

  /** Keeps only lines whose second field (action) is "login", then counts them. */
  def filterTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt", 3)
    val loginFilter = file.filter(_.split("\\s")(1) == "login")
    loginFilter.foreach(println)
    println(loginFilter.count())
  }

  /** keyBy: the key is derived from the line, the value is the original line. */
  def keyByTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt", 3)
    val userActionType = file.keyBy { line =>
      val info = line.split("\\s")
      s"${info(0)}-----${info(1)}"
    }
    userActionType.foreach(println)
  }

  /** sortBy: orders lines by the numeric value of their second field. */
  def sortByTest(sc: SparkContext) = {
    val file = sc.textFile("/spark/part-00001")
    // NOTE(review): .toInt throws on non-numeric fields — assumes clean input.
    val sortByResult = file.sortBy(line => line.split("\\s+")(1).toInt)
    sortByResult.foreach(println)
  }

  /** top-N: takeOrdered returns the N smallest, top returns the N largest. */
  def topNTest(sc: SparkContext) = {
    val list = List(1, 2, 5, 11, 545, 22, 12, 55)
    val rdd = sc.parallelize(list, 2)
    val takeOrdered = rdd.takeOrdered(3) // ascending by default
    takeOrdered.foreach(println)
    val topN = rdd.top(3) // descending by default
    topN.foreach(println)
  }

  /** Repartitioning: repartition (wide dependency) vs coalesce (narrow dependency). */
  def rePartitionTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt")
    val result = file.repartition(5)

    file.foreachPartition { part =>
      println(s"fileRdd partition, records in this partition: ${part.size}")
    }

    // repartition: wide dependency (full shuffle).
    result.foreachPartition { part =>
      println(s"resultRdd partition, records in this partition: ${part.count(_ => true)}")
    }

    // coalesce: narrow dependency (no shuffle when shrinking partition count).
    val coalResult = file.coalesce(3)
    coalResult.foreachPartition { part =>
      println(s"coalResultRdd:${part.size}")
    }
  }

  /** groupBy on the first field (user), then per-user login counting. */
  def groupBy(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt")
    val groupBy = file.groupBy(line => line.split("\\s")(0))

    groupBy.foreachPartition { part =>
      println(s"GroupByRdd partition, records in this partition: ${part.size}")
    }
    groupBy.foreach { case (user, lines) =>
      println(s"GroupByRdd record, key=$user, records in value collection: ${lines.size}")
    }

    // Count each user's logins. FIX: replaced the manual `var sum` loop with
    // an equivalent, idiomatic count over the grouped lines.
    groupBy.foreach { case (user, lines) =>
      val loginTimes = lines.count(_.split("\\s")(1) == "login")
      println(s"user:$user,logintimes:$loginTimes")
    }
  }

  /** Aggregations: reduce / fold for a sum, aggregate to join elements into a string. */
  def aggSumTest(sc: SparkContext) = {
    val list = List(3, 5, 8, 9, 12, 55)
    val rdd = sc.parallelize(list, 3)

    // Sum via reduce.
    val reduceSum = rdd.reduce(_ + _)

    // Sum via fold (zero element 0). FIX: renamed misspelled `foldRedult`
    // and its printed label to `foldResult`.
    val foldResult = rdd.fold(0)(_ + _)

    // aggregate: concatenate all elements into one comma-separated string.
    // seqOp merges a value into a partition-local string; combOp merges
    // partition results, skipping empty accumulators.
    val aggregateResult = rdd.aggregate("")(
      (acc, v) => if (acc == "") v.toString else s"$acc,$v",
      (a, b)   => if (a == "") b else s"$a,$b"
    )

    println(s"reduceResult: $reduceSum")
    println(s"foldResult: $foldResult")
    println(s"aggregateResult: $aggregateResult")
  }

  /** Caching: persist the RDD so both actions reuse it instead of re-reading. */
  def persistTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt")
    // FIX: cache() is exactly persist(StorageLevel.MEMORY_ONLY); the original
    // called both back-to-back, which is redundant — one call suffices.
    file.persist(StorageLevel.MEMORY_ONLY)
    // Count distinct users (field 0).
    file.map(_.split("\\s")(0)).distinct().count
    // Count distinct IPs (field 2).
    file.map(_.split("\\s")(2)).distinct().count
  }
}