import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkConf}

object Ex2_Computations {
  //
  // utilities for printing out a dependency tree
  //
  private def showDep[T](r: RDD[T], depth: Int) : Unit = {
    println("".padTo(depth, ' ') + "RDD id=" + r.id)
    r.dependencies.foreach(dep => {
      showDep(dep.rdd, depth + 1)
    })
  }

  def showDep[T](r: RDD[T]) : Unit = {
    showDep(r, 0)
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Ex2_Computations").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // set up a simple computation
    val numbers = sc.parallelize(1 to 10, 4)
    val bigger = numbers.map(n => n * 100)
    val biggerStill = bigger.map(n => n + 1)

    println("Debug string for the RDD 'biggerStill'")
    println(biggerStill.toDebugString)

    val s = biggerStill.reduce(_ + _)
    println("sum = " + s)

    println("IDs of the various RDDs")
    println("numbers: id=" + numbers.id)
    println("bigger: id=" + bigger.id)
    println("biggerStill: id=" + biggerStill.id)

    println("dependencies working back from RDD 'biggerStill'")
    showDep(biggerStill)

    val moreNumbers = bigger ++ biggerStill
    println("The RDD 'moreNumbers' has more complex dependencies")
    println(moreNumbers.toDebugString)
    println("moreNumbers: id=" + moreNumbers.id)
    showDep(moreNumbers)

    moreNumbers.cache()
    // things in the cache can be lost, so the dependency tree is not discarded
    println("cached it: the dependencies don't change")
    println(moreNumbers.toDebugString)
    showDep(moreNumbers)

    println("has RDD 'moreNumbers' been checkpointed? : " + moreNumbers.isCheckpointed)

    // set moreNumbers up to be checkpointed
    sc.setCheckpointDir("/tmp/sparkcps")
    moreNumbers.checkpoint()
    // it will only happen after we force the values to be computed
    println("NOW has it been checkpointed? : " + moreNumbers.isCheckpointed)
    moreNumbers.count()
    println("NOW has it been checkpointed? : " + moreNumbers.isCheckpointed)
    println(moreNumbers.toDebugString)
    showDep(moreNumbers)

    // again, calculations are not done until strictly necessary
    println("this shouldn't throw an exception")
    val thisWillBlowUp = numbers map {
      case (7) => { throw new Exception }
      case (n) => n
    }

    // notice it didn't blow up yet even though there's a 7
    println("the exception should get thrown now")
    try {
      println(thisWillBlowUp.count())
    } catch {
      case (e: Exception) => println("Yep, it blew up now")
    }
  }
}
3. Optimized code with comments
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Ex2_Computations_FIX {
  //
  // utilities for printing out a dependency tree
  //
  // private def showDep[T](r: RDD[T], depth: Int) : Unit = {
  //   println("".padTo(depth, ' ') + "RDD id=" + r.id)
  //   r.dependencies.foreach(dep => {
  //     showDep(dep.rdd, depth + 1)
  //   })
  // }
  // def showDep[T](r: RDD[T]) : Unit = {
  //   showDep(r, 0)
  // }
  // Recursively walks the dependency tree.
  // When a default value is needed, write it like this; unlike Java, there is
  // no need to overload the method.
  private def showDep[T](r: RDD[T], depth: Int = 0) : Unit = {
    // println("".padTo(depth, ' ') + "RDD id=" + r.id)
    // Builds a string of `depth` repeated spaces.
    // The form below is recommended as the easier one to understand.
    println(" " * depth + "RDD id=" + r.id)
    r.dependencies.foreach(dep => {
      showDep(dep.rdd, depth + 1)
    })
  }
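  // With the default argument, call sites can simply write showDep(someRDD),
  // or override the depth explicitly, e.g. showDep(someRDD, 2).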
  def main(args: Array[String]) {
    // The conf setup here is short, so it is fine to write it on one line as below.
    val conf = new SparkConf().setAppName("Ex2_Computations").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // For variables holding Spark data such as RDDs and DataFrames, append a short
    // type suffix (RDD, DF, etc.) to the name. This distinguishes ordinary local
    // data from Spark's distributed data at every call site and keeps the logic
    // clear. Explicitly annotating the type (numbers: RDD[Int]) without the suffix
    // only improves readability on that one line; later call sites can still confuse.

    // set up a simple computation
    val numberRDD = sc.parallelize(1 to 10, 4)
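    // the second argument (4) is the number of partitions the data is split into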
    // numberRDD is used several times below, so cache it
    numberRDD.persist()
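    // persist() with no argument uses the default MEMORY_ONLY storage level; an
    // explicit level can also be passed, e.g. (with
    // import org.apache.spark.storage.StorageLevel in scope):
    // numberRDD.persist(StorageLevel.MEMORY_AND_DISK)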
    // val bigger = numberRDD.map(n => n * 100)
    // val biggerStill = bigger.map(n => n + 1)
    // The shorthand below is only a recommendation, but once you are familiar
    // with Scala syntax it is strongly encouraged.
    val biggerRDD = numberRDD.map(_ * 100)
    val biggerStillRDD = biggerRDD.map(_ + 1)
    // biggerStillRDD is used several times below, so cache it
    biggerStillRDD.persist()

    println("Debug string for the RDD 'biggerStill'")
    println(biggerStillRDD.toDebugString)

    // Variable names should be easy to understand; avoid single letters if possible.
    // val s = biggerStillRDD.reduce(_ + _)
    val sum = biggerStillRDD.reduce(_ + _)
    println("sum = " + sum)

    println("IDs of the various RDDs")
    println("numberRDD: id=" + numberRDD.id)
    println("bigger: id=" + biggerRDD.id)
    println("biggerStill: id=" + biggerStillRDD.id)

    println("dependencies working back from RDD 'biggerStill'")
    showDep(biggerStillRDD)

    val moreNumberRDD = biggerRDD ++ biggerStillRDD
    println("The RDD 'moreNumbers' has more complex dependencies")
    println(moreNumberRDD.toDebugString)
    println("moreNumbers: id=" + moreNumberRDD.id)
    showDep(moreNumberRDD)

    moreNumberRDD.cache()
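    // cache() is shorthand for persist() with the default MEMORY_ONLY storage level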
    // things in the cache can be lost, so the dependency tree is not discarded
    println("cached it: the dependencies don't change")
    println(moreNumberRDD.toDebugString)
    showDep(moreNumberRDD)

    println("has RDD 'moreNumbers' been checkpointed? : " + moreNumberRDD.isCheckpointed)

    // set moreNumbers up to be checkpointed
    sc.setCheckpointDir("/tmp/sparkcps")
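    // a local path like /tmp works in local mode; on a cluster the checkpoint
    // directory must be on HDFS (or another shared, fault-tolerant filesystem)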
    moreNumberRDD.checkpoint()
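    // the RDD API docs recommend persisting an RDD before checkpointing it, since
    // writing the checkpoint file otherwise recomputes the lineage from scratch;
    // moreNumberRDD was cached above, so that is already covered here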
    // it will only happen after we force the values to be computed
    println("NOW has it been checkpointed? : " + moreNumberRDD.isCheckpointed)
    moreNumberRDD.count()
    println("NOW has it been checkpointed? : " + moreNumberRDD.isCheckpointed)
    println(moreNumberRDD.toDebugString)
    showDep(moreNumberRDD)

    // again, calculations are not done until strictly necessary
    println("this shouldn't throw an exception")
    // val thisWillBlowUp = numberRDD map {
    //   case (7) => { throw new Exception }
    //   case (n) => n
    // }
    // `numberRDD map { ... }` is the more functional, infix style, but some IDEs
    // fail to give hints for it; beginners should stick with numberRDD.map.
    val thisWillBlowUp = numberRDD.map {
      // the original pattern cases were more verbose than necessary
      case 7 => throw new Exception
      case n => n
    }

    // notice it didn't blow up yet even though there's a 7
    println("the exception should get thrown now")
    try {
      println(thisWillBlowUp.count())
    } catch {
      // case (e: Exception) => println("Yep, it blew up now")
      // the original was more verbose than necessary
      case _: Exception => println("Yep, it blew up now")
    }
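    // the persisted RDDs are no longer needed from here on; unpersist() (part of
    // the standard RDD API) releases their cached partitions
    numberRDD.unpersist()
    biggerStillRDD.unpersist()
    moreNumberRDD.unpersist()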
    // Normally a Spark application shuts down on its own, but it is best practice
    // to stop the context manually as soon as you are done with it.
    sc.stop()
    // Work that is purely local can be done after the manual shutdown;
    // this avoids tying up cluster resources for no reason.
  }
}