【Spark #26】Spark Code Analysis

Components initialized in SparkEnv

	org.apache.spark.shuffle.sort.SortShuffleManager                   // shuffleManager
	org.apache.spark.MapOutputTrackerMaster@25e45d                      // mapOutputTracker
	org.apache.spark.shuffle.ShuffleMemoryManager                       // shuffleMemoryManager
	org.apache.spark.network.netty.NettyBlockTransferService@148d5b2    // blockTransferService
	org.apache.spark.serializer.JavaSerializer@dc42ab                   // serializer, closureSerializer
	org.apache.spark.storage.BlockManager@16d5aa8                       // blockManager
	org.apache.spark.storage.BlockManagerMaster@a62840                  // blockManagerMaster
	org.apache.spark.CacheManager@1ac9928                               // cacheManager
	org.apache.spark.HttpFileServer@131d67                              // httpFileServer
	org.apache.spark.metrics.MetricsSystem@516ac3                       // metricsSystem
	org.apache.spark.broadcast.BroadcastManager@f8008d                  // broadcastManager
	C:\Users\hadoop\AppData\Local\Temp\spark-7f0f46d9-28d0-4e8d-94d0-9a8f8f589d14   // sparkFilesDir
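
At the end of SparkEnv's factory method these components are assembled and passed to the SparkEnv constructor: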


    new SparkEnv(
      executorId,
      actorSystem,
      serializer,
      closureSerializer,
      cacheManager,
      mapOutputTracker,
      shuffleManager,
      broadcastManager,
      blockTransferService,
      blockManager,
      securityManager,
      httpFileServer,
      sparkFilesDir,
      metricsSystem,
      shuffleMemoryManager,
      conf)
  }
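
For reference, once the environment is up, any of these components can be reached through the SparkEnv singleton. A minimal sketch (illustrative only; the fields mirror the constructor arguments above, and exact availability varies across Spark versions):

    import org.apache.spark.SparkEnv

    // Fetch the environment created for the current JVM (driver or executor).
    val env = SparkEnv.get
    println(env.shuffleManager.getClass.getName)   // org.apache.spark.shuffle.sort.SortShuffleManager
    println(env.serializer.getClass.getName)       // org.apache.spark.serializer.JavaSerializer (also the closureSerializer here)
    println(env.blockManager.getClass.getName)     // org.apache.spark.storage.BlockManager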

The source code under analysis:

package spark.examples

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import org.apache.spark.SparkContext._

object SparkWordCount {
  def main(args: Array[String]) {
    System.setProperty("hadoop.home.dir", "E:\\devsoftware\\hadoop-2.5.2\\hadoop-2.5.2")
    val conf = new SparkConf()
    conf.setAppName("SparkWordCount")
    conf.setMaster("local")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile("file:///D:/word.in")
    println("rdd1:" + rdd1.toDebugString)
    val rdd2 = rdd1.flatMap(_.split(" "))
    println("rdd2:" + rdd2.toDebugString)
    val rdd3 = rdd2.map((_, 1))
    println("rdd3:" + rdd3.toDebugString)
    val rdd4 = rdd3.reduceByKey(_ + _)
    println("rdd4:" + rdd4.toDebugString)
    rdd4.saveAsTextFile("file:///D:/wordout" + System.currentTimeMillis())
    sc.stop()
  }
}

The RDD dependency graphs printed by toDebugString are:

RDD1

rdd1:(1) file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
 |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []

RDD2

rdd2:(1) FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
 |  file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
 |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []

RDD3

rdd3:(1) MappedRDD[3] at map at SparkWordCount.scala:19 []
 |  FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
 |  file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
 |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []

 

RDD4

rdd4:(1) ShuffledRDD[4] at reduceByKey at SparkWordCount.scala:21 []
 +-(1) MappedRDD[3] at map at SparkWordCount.scala:19 []
    |  FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
    |  file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
    |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []
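
The "+-(1)" line in rdd4's graph marks the shuffle boundary introduced by reduceByKey: rdd4 is a ShuffledRDD whose dependency on rdd3 is a ShuffleDependency, while the flatMap/map steps above it are narrow (one-to-one) dependencies and therefore stay in the same stage. A small sketch to verify this from the driver, reusing the rdd3/rdd4 values from the program above:

    import org.apache.spark.{OneToOneDependency, ShuffleDependency}

    // rdd4 = rdd3.reduceByKey(_ + _): its parent dependency is a shuffle, which is
    // what forces the DAGScheduler to split the job into two stages.
    rdd4.dependencies.foreach {
      case d: ShuffleDependency[_, _, _] => println(s"shuffle dependency on RDD ${d.rdd.id}")
      case d: OneToOneDependency[_]      => println(s"narrow dependency on RDD ${d.rdd.id}")
      case d                             => println(s"other dependency: $d")
    }
    // rdd3's parent dependency is narrow, so map/flatMap run inside a single stage.
    println(rdd3.dependencies.map(_.getClass.getSimpleName).mkString(", "))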
 
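Following saveAsTextFile down the call chain eventually lands in PairRDDFunctions, where each task writes its partition through a Hadoop OutputFormat; the snippet below is the writeToFile closure (taken from Spark 1.x's saveAsHadoopDataset):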
 val writeToFile = (context: TaskContext, iter: Iterator[(K, V)]) => {
      val config = wrappedConf.value
      // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
      // around by taking a mod. We expect that no task will be attempted 2 billion times.
      val attemptNumber = (context.attemptId % Int.MaxValue).toInt

      val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context, config)

      writer.setup(context.stageId, context.partitionId, attemptNumber)
      writer.open()
      try {
        var recordsWritten = 0L
        while (iter.hasNext) {
          val record = iter.next()
          writer.write(record._1.asInstanceOf[AnyRef], record._2.asInstanceOf[AnyRef])

          // Update bytes written metric every few records
          maybeUpdateOutputMetrics(bytesWrittenCallback, outputMetrics, recordsWritten)
          recordsWritten += 1
        }
      } finally {
        writer.close()
      }
      writer.commit()
      bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
    }
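
The closure is then shipped to the executors and invoked once per partition via SparkContext.runJob(self, writeToFile); after all tasks have finished, the driver finalizes the Hadoop output with writer.commitJob() (Spark 1.x naming, details vary across versions).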
 

Reposted from bit1129.iteye.com/blog/2178238