import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkConf}

object Ex2_Computations {
  //
  // utilities for printing out a dependency tree
  //
  private def showDep[T](r: RDD[T], depth: Int) : Unit = {
    println("".padTo(depth, ' ') + "RDD id=" + r.id)
    r.dependencies.foreach(dep => {
      showDep(dep.rdd, depth + 1)
    })
  }

  def showDep[T](r: RDD[T]) : Unit = {
    showDep(r, 0)
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Ex2_Computations").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // set up a simple computation
    val numbers = sc.parallelize(1 to 10, 4)
    val bigger = numbers.map(n => n * 100)
    val biggerStill = bigger.map(n => n + 1)

    println("Debug string for the RDD 'biggerStill'")
    println(biggerStill.toDebugString)

    val s = biggerStill.reduce(_ + _)
    println("sum = " + s)

    println("IDs of the various RDDs")
    println("numbers: id=" + numbers.id)
    println("bigger: id=" + bigger.id)
    println("biggerStill: id=" + biggerStill.id)

    println("dependencies working back from RDD 'biggerStill'")
    showDep(biggerStill)

    val moreNumbers = bigger ++ biggerStill
    println("The RDD 'moreNumbers' has more complex dependencies")
    println(moreNumbers.toDebugString)
    println("moreNumbers: id=" + moreNumbers.id)
    showDep(moreNumbers)

    moreNumbers.cache()
    // things in the cache can be lost, so the dependency tree is not discarded
    println("cached it: the dependencies don't change")
    println(moreNumbers.toDebugString)
    showDep(moreNumbers)

    println("has RDD 'moreNumbers' been checkpointed? : " + moreNumbers.isCheckpointed)

    // set moreNumbers up to be checkpointed
    sc.setCheckpointDir("/tmp/sparkcps")
    moreNumbers.checkpoint()
    // it will only happen after we force the values to be computed
    println("NOW has it been checkpointed? : " + moreNumbers.isCheckpointed)
    moreNumbers.count()
    println("NOW has it been checkpointed? : " + moreNumbers.isCheckpointed)
    println(moreNumbers.toDebugString)
    showDep(moreNumbers)

    // again, calculations are not done until strictly necessary
    println("this shouldn't throw an exception")
    val thisWillBlowUp = numbers map {
      case (7) => { throw new Exception }
      case (n) => n
    }

    // notice it didn't blow up yet even though there's a 7
    println("the exception should get thrown now")
    try {
      println(thisWillBlowUp.count())
    } catch {
      case (e: Exception) => println("Yep, it blew up now")
    }
  }
}
3. Optimized code with comments
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Ex2_Computations_FIX {
  //
  // utilities for printing out a dependency tree
  //
  // private def showDep[T](r: RDD[T], depth: Int) : Unit = {
  //   println("".padTo(depth, ' ') + "RDD id=" + r.id)
  //   r.dependencies.foreach(dep => {
  //     showDep(dep.rdd, depth + 1)
  //   })
  // }
  // def showDep[T](r: RDD[T]) : Unit = {
  //   showDep(r, 0)
  // }
  // Recursively walks the dependency tree.
  // When a default value is needed, write it like this; unlike Java, there is
  // no need to overload the method.
  private def showDep[T](r: RDD[T], depth: Int = 0) : Unit = {
    // println("".padTo(depth, ' ') + "RDD id=" + r.id)
    // Builds a string of `depth` repeated spaces.
    // The form below is recommended as the easier one to understand.
    println(" " * depth + "RDD id=" + r.id)
    r.dependencies.foreach(dep => {
      showDep(dep.rdd, depth + 1)
    })
  }
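  // With the default argument, call sites can simply write showDep(someRDD),
  // or override the depth explicitly, e.g. showDep(someRDD, 2).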
  def main(args: Array[String]) {
    // The conf setup here is short, so it is fine to write it on one line as below.
    val conf = new SparkConf().setAppName("Ex2_Computations").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // For variables holding Spark data such as RDDs and DataFrames, append a short
    // type suffix (RDD, DF, etc.) to the name. This distinguishes ordinary local
    // data from Spark's distributed data at every call site and keeps the logic
    // clear. Explicitly annotating the type (numbers: RDD[Int]) without the suffix
    // only improves readability on that one line; later call sites can still confuse.

    // set up a simple computation
    val numberRDD = sc.parallelize(1 to 10, 4)
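    // the second argument (4) is the number of partitions the data is split into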
    // numberRDD is used several times below, so cache it
    numberRDD.persist()
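    // persist() with no argument uses the default MEMORY_ONLY storage level; an
    // explicit level can also be passed, e.g. (with
    // import org.apache.spark.storage.StorageLevel in scope):
    // numberRDD.persist(StorageLevel.MEMORY_AND_DISK)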
    // val bigger = numberRDD.map(n => n * 100)
    // val biggerStill = bigger.map(n => n + 1)
    // The shorthand below is only a recommendation, but once you are familiar
    // with Scala syntax it is strongly encouraged.
    val biggerRDD = numberRDD.map(_ * 100)
    val biggerStillRDD = biggerRDD.map(_ + 1)
    // biggerStillRDD is used several times below, so cache it
    biggerStillRDD.persist()

    println("Debug string for the RDD 'biggerStill'")
    println(biggerStillRDD.toDebugString)

    // Variable names should be easy to understand; avoid single letters if possible.
    // val s = biggerStillRDD.reduce(_ + _)
    val sum = biggerStillRDD.reduce(_ + _)
    println("sum = " + sum)

    println("IDs of the various RDDs")
    println("numberRDD: id=" + numberRDD.id)
    println("bigger: id=" + biggerRDD.id)
    println("biggerStill: id=" + biggerStillRDD.id)

    println("dependencies working back from RDD 'biggerStill'")
    showDep(biggerStillRDD)

    val moreNumberRDD = biggerRDD ++ biggerStillRDD
    println("The RDD 'moreNumbers' has more complex dependencies")
    println(moreNumberRDD.toDebugString)
    println("moreNumbers: id=" + moreNumberRDD.id)
    showDep(moreNumberRDD)

    moreNumberRDD.cache()
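    // cache() is shorthand for persist() with the default MEMORY_ONLY storage level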
    // things in the cache can be lost, so the dependency tree is not discarded
    println("cached it: the dependencies don't change")
    println(moreNumberRDD.toDebugString)
    showDep(moreNumberRDD)

    println("has RDD 'moreNumbers' been checkpointed? : " + moreNumberRDD.isCheckpointed)

    // set moreNumbers up to be checkpointed
    sc.setCheckpointDir("/tmp/sparkcps")
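    // a local path like /tmp works in local mode; on a cluster the checkpoint
    // directory must be on HDFS (or another shared, fault-tolerant filesystem)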
    moreNumberRDD.checkpoint()
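    // the RDD API docs recommend persisting an RDD before checkpointing it, since
    // writing the checkpoint file otherwise recomputes the lineage from scratch;
    // moreNumberRDD was cached above, so that is already covered here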
    // it will only happen after we force the values to be computed
    println("NOW has it been checkpointed? : " + moreNumberRDD.isCheckpointed)
    moreNumberRDD.count()
    println("NOW has it been checkpointed? : " + moreNumberRDD.isCheckpointed)
    println(moreNumberRDD.toDebugString)
    showDep(moreNumberRDD)

    // again, calculations are not done until strictly necessary
    println("this shouldn't throw an exception")
    // val thisWillBlowUp = numberRDD map {
    //   case (7) => { throw new Exception }
    //   case (n) => n
    // }
    // `numberRDD map { ... }` is the more functional, infix style, but some IDEs
    // fail to give hints for it; beginners should stick with numberRDD.map.
    val thisWillBlowUp = numberRDD.map {
      // the original pattern cases were more verbose than necessary
      case 7 => throw new Exception
      case n => n
    }

    // notice it didn't blow up yet even though there's a 7
    println("the exception should get thrown now")
    try {
      println(thisWillBlowUp.count())
    } catch {
      // case (e: Exception) => println("Yep, it blew up now")
      // the original was more verbose than necessary
      case _: Exception => println("Yep, it blew up now")
    }
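    // the persisted RDDs are no longer needed from here on; unpersist() (part of
    // the standard RDD API) releases their cached partitions
    numberRDD.unpersist()
    biggerStillRDD.unpersist()
    moreNumberRDD.unpersist()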
    // Normally a Spark application shuts down on its own, but it is best practice
    // to stop the context manually as soon as you are done with it.
    sc.stop()
    // Work that is purely local can be done after the manual shutdown;
    // this avoids tying up cluster resources for no reason.
  }
}