Spark代码可读性与性能优化——示例三

Spark代码可读性与性能优化——示例三

1. 内容点大纲

  • 无意义代码清除提示
  • 数据本地获取提示
  • unpersist解除缓存优化提示
  • 使用匹配模式的提示
  • 代码简写提示
    *注意:和前面文章内容重复的不再做提示,已直接修改

2. 原代码(来自GitHub)

import scala.collection.Iterator

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkException, SparkContext, SparkConf}

import scala.collection.mutable.ListBuffer

// Original example (kept as-is for comparison with the optimized version below).
// Demonstrates combining RDDs: subtract, union (++), distinct, intersection,
// cartesian, zipWithIndex, zip, and zipPartitions.
// NOTE(review): issues in this version, addressed by the FIX object further down:
//  - runs in local[4], so foreach(println) happens to print locally; on a real
//    cluster those printlns run on executors and never reach the driver console
//  - frequently reused RDDs (letters, vowels, ...) are never persisted
//  - sc.stop() is never called before main returns
object Ex3_CombiningRDDs {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Ex3_CombiningRDDs").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // put some data in an RDD
    val letters = sc.parallelize('a' to 'z', 8)

    // another RDD of the same type
    val vowels = sc.parallelize(Seq('a', 'e', 'i', 'o', 'u'), 4)

    // subtract one from another, getting yet another RDD of the same type
    val consonants = letters.subtract(vowels)
    println("There are " + consonants.count() + " consonants")

    val vowelsNotLetters = vowels.subtract(letters)
    println("There are " + vowelsNotLetters.count() + " vowels that aren't letters")

    // union
    val lettersAgain = consonants ++ vowels
    println("There really are " + lettersAgain.count() + " letters")

    // union with duplicates, removed
    val tooManyVowels = vowels ++ vowels
    println("There aren't really " + tooManyVowels.count() + " vowels")
    val justVowels = tooManyVowels.distinct()
    println("There are actually " + justVowels.count() + " vowels")

    // subtraction with duplicates
    // NOTE(review): subtract removes ALL occurrences of matching keys, so
    // (vowels ++ vowels).subtract(vowels) is empty, not one copy of the vowels
    val what = tooManyVowels.subtract(vowels)
    println("There are actually " + what.count() + " whats")

    // intersection
    val earlyLetters = sc.parallelize('a' to 'l', 2)
    val earlyVowels = earlyLetters.intersection(vowels)
    println("The early vowels:")
    earlyVowels.foreach(println)

    // RDD of a different type
    val numbers = sc.parallelize(1 to 2, 2)

    // cartesian product
    val cp = vowels.cartesian(numbers)
    println("Product has " + cp.count() + " elements")

    // index the letters
    // NOTE(review): foreach is an action executed on the executors; to see the
    // output on the driver, collect()/take() first (see the FIX version)
    val indexed = letters.zipWithIndex()
    println("indexed letters")
    indexed foreach {
      case (c, i) => println(i + ":  " + c)
    }

    // another RDD, same size and partitioning as letters
    val twentySix = sc.parallelize(101 to 126, 8)

    // zip the letters and numbers
    val differentlyIndexed = letters.zip(twentySix)
    differentlyIndexed foreach {
      case (c, i) => println(i + ":  " + c)
    }

    // we can't do this if the two RDDs don't have the same partitioning --
    // this is to remind us that it would be enormously costly in terms
    // of communication, so, as we'll see in later examples, we have to
    // fix the partitioning ourselves
    val twentySixBadPart = sc.parallelize(101 to 126, 3)
    val cantGet = letters.zip(twentySixBadPart)
    try {
      cantGet foreach {
        case (c, i) => println(i + ":  " + c)
      }
    } catch {
      case iae: IllegalArgumentException =>
        println("Exception caught: " + iae.getMessage)
    }

    // the zipped RDDs also need to have the same number of elements
    val unequalCount = earlyLetters.zip(numbers)
    try {
      unequalCount foreach {
        case (c, i) => println(i + ":  " + c)
      }
    }
    catch {
      // NOTE(review): the local val t duplicates se.getMessage and is unused
      case se: SparkException => {
        val t = se.getMessage
        println("Exception caught: " + se.getMessage)
      }
    }

    // zipPartitions gives us more control, so we can deal with weird cases
    // BUT the result may be surprising because each PARTITION also has
    // unequal numbers of elements, and the function 'zipFunc' gets
    // applied once per partition!
    // also notice the amount of type annotation to make the Scala compiler
    // happy -- it's an interesting exercise to remove some of them and read
    // the complaints

    // Pairs elements from both iterators; when one runs out, pads with 0
    // (missing number) or ' ' (missing letter). Applied once per partition.
    def zipFunc(lIter: Iterator[Char], nIter: Iterator[Int]) :
      Iterator[(Char, Int)] = {
      val res = new ListBuffer[(Char, Int)]
      while (lIter.hasNext || nIter.hasNext) {
        if (lIter.hasNext && nIter.hasNext) {
          // easy case
          res += ((lIter.next(), nIter.next()))
        } else if (lIter.hasNext) {
          res += ((lIter.next(), 0))
        } else if (nIter.hasNext) {
          res += ((' ', nIter.next()))
        }
      }
      res.iterator
    }

    val unequalOK = earlyLetters.zipPartitions(numbers)(zipFunc)
    println("this may not be what you expected with unequal length RDDs")
    unequalOK foreach {
      case (c, i) => println(i + ":  " + c)
    }
  }
}

3. 优化后的代码+注释

import org.apache.spark.{SparkConf, SparkContext, SparkException}

import scala.collection.Iterator
import scala.collection.mutable.ListBuffer

// Optimized version of Ex3_CombiningRDDs:
//  - distributed datasets carry an RDD suffix
//  - RDDs reused more than once are persisted, and unpersisted after last use
//  - data is collect()ed to the driver before printing (foreach is an action
//    that runs on the executors; on a cluster its printlns never reach the
//    driver console — it only "works" here because master is local[4])
//  - zipFunc uses pattern matching for clarity
//  - sc.stop() is called before exiting
object Ex3_CombiningRDDs_FIX {
  def main(args: Array[String]): Unit = {
    // Points covered in earlier articles of this series, not repeated inline:
    // 1. keep line lengths reasonable
    // 2. suffix distributed datasets with RDD / DF
    // 3. cache datasets that are used more than once
    // 4. prefer a functional style

    val conf = new SparkConf().setAppName("Ex3_CombiningRDDs").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // put some data in an RDD; reused many times below, so cache it
    val letterRDD = sc.parallelize('a' to 'z', 8)
    letterRDD.persist()

    // another RDD of the same type; also reused, so cache it
    val vowelRDD = sc.parallelize(Seq('a', 'e', 'i', 'o', 'u'), 4)
    vowelRDD.persist()

    // subtract one from another, getting yet another RDD of the same type
    val consonantsRDD = letterRDD.subtract(vowelRDD)
    consonantsRDD.persist()

    // FIX: a careless rename had corrupted the user-facing strings below
    // ("consonantsRDD", "vowelRDD that aren't letterRDD", ...); restored
    println("There are " + consonantsRDD.count() + " consonants")

    val vowelsNotLetterRDD = vowelRDD.subtract(letterRDD)
    println("There are " + vowelsNotLetterRDD.count() + " vowels that aren't letters")

    // union
    val lettersAgainRDD = consonantsRDD ++ vowelRDD
    println("There really are " + lettersAgainRDD.count() + " letters")
    // release the cache once an RDD is no longer needed
    consonantsRDD.unpersist()

    // union with duplicates, then removed
    val tooManyVowelRDD = vowelRDD ++ vowelRDD
    tooManyVowelRDD.persist()
    println("There aren't really " + tooManyVowelRDD.count() + " vowels")
    val justVowelRDD = tooManyVowelRDD.distinct()
    println("There are actually " + justVowelRDD.count() + " vowels")

    // subtraction with duplicates: subtract removes ALL matching elements,
    // so the result is empty rather than one copy of the vowels
    val whatRDD = tooManyVowelRDD.subtract(vowelRDD)
    println("There are actually " + whatRDD.count() + " whats")
    tooManyVowelRDD.unpersist()

    // intersection
    // FIX: earlyLetterRDD is used three times (intersection, zip,
    // zipPartitions) but was never persisted — cache it per rule 3 above
    val earlyLetterRDD = sc.parallelize('a' to 'l', 2)
    earlyLetterRDD.persist()
    val earlyVowelRDD = earlyLetterRDD.intersection(vowelRDD)
    println("The early vowels:")
    // FIX: collect before printing, consistent with the rest of this version
    earlyVowelRDD.collect().foreach(println)

    // RDD of a different type; used three times below, so cache it
    val numberRDD = sc.parallelize(1 to 2, 2)
    numberRDD.persist()

    // cartesian product
    val cpRDD = vowelRDD.cartesian(numberRDD)
    println("Product has " + cpRDD.count() + " elements")
    vowelRDD.unpersist()

    // index the letters; bring the result to the driver before printing
    val indexedRDD = letterRDD.zipWithIndex()
    println("indexed letters")
    indexedRDD.collect().foreach {
      case (letter, index) => println(index + ":  " + letter)
    }

    // another RDD, same size and partitioning as letterRDD
    val twentySixRDD = sc.parallelize(101 to 126, 8)

    // zip the letters and numbers; swap puts the number first when printing
    val differentlyIndexedRDD = letterRDD.zip(twentySixRDD)
    differentlyIndexedRDD.collect().foreach(charIdx => println(charIdx.swap))

    // we can't do this if the two RDDs don't have the same partitioning --
    // this is to remind us that it would be enormously costly in terms
    // of communication, so, as we'll see in later examples, we have to
    // fix the partitioning ourselves
    val twentySixBadPartRDD = sc.parallelize(101 to 126, 3)
    val cantGetRDD = letterRDD.zip(twentySixBadPartRDD)
    try {
      cantGetRDD.collect().foreach(charIdx => println(charIdx.swap))
    } catch {
      case iae: IllegalArgumentException =>
        println("Exception caught: " + iae.getMessage)
    }
    letterRDD.unpersist()

    // the zipped RDDs also need to have the same number of elements
    val unequalCountRDD = earlyLetterRDD.zip(numberRDD)
    try {
      unequalCountRDD.collect().foreach(charIdx => println(charIdx.swap))
    } catch {
      // the original bound an unused local to se.getMessage — removed
      case se: SparkException =>
        println("Exception caught: " + se.getMessage)
    }

    // zipPartitions gives us more control, so we can deal with weird cases
    // BUT the result may be surprising because each PARTITION also has
    // unequal numbers of elements, and the function 'zipFunc' gets
    // applied once per partition!

    // Pairs elements from both iterators within one partition; when one side
    // runs out, pads with 0 (missing number) or ' ' (missing letter).
    // A pattern match on the two hasNext flags is easier to follow than the
    // original if/else-if chain.
    def zipFunc(lIter: Iterator[Char], nIter: Iterator[Int]): Iterator[(Char, Int)] = {
      val res = new ListBuffer[(Char, Int)]
      while (lIter.hasNext || nIter.hasNext) {
        (lIter.hasNext, nIter.hasNext) match {
          case (true, true)   => res += ((lIter.next(), nIter.next()))
          case (true, false)  => res += ((lIter.next(), 0))
          case (false, true)  => res += ((' ', nIter.next()))
          case (false, false) => // unreachable: the loop guard requires one side
        }
      }
      res.iterator
    }

    val unequalOKRDD = earlyLetterRDD.zipPartitions(numberRDD)(zipFunc)
    // FIX: string had been corrupted to "whatRDD you expected"; restored
    println("this may not be what you expected with unequal length RDDs")
    unequalOKRDD.collect().foreach(charIdx => println(charIdx.swap))

    // release the remaining caches and shut down cleanly
    earlyLetterRDD.unpersist()
    numberRDD.unpersist()
    sc.stop()
  }
}
发布了128 篇原创文章 · 获赞 45 · 访问量 15万+

猜你喜欢

转载自blog.csdn.net/alionsss/article/details/89167973
今日推荐