The Components Initialized by SparkEnv
When SparkContext starts up, SparkEnv assembles the runtime services that the rest of the engine depends on. A local run produces instances such as the following (the @hash suffixes vary per run):

```
org.apache.spark.shuffle.sort.SortShuffleManager                 // shuffleManager
org.apache.spark.shuffle.ShuffleMemoryManager                    // shuffleMemoryManager
org.apache.spark.MapOutputTrackerMaster@25e45d                   // mapOutputTracker
org.apache.spark.serializer.JavaSerializer@dc42ab                // serializer, closureSerializer
org.apache.spark.storage.BlockManager@16d5aa8                    // blockManager
org.apache.spark.storage.BlockManagerMaster@a62840               // blockManagerMaster
org.apache.spark.network.netty.NettyBlockTransferService@148d5b2 // blockTransferService
org.apache.spark.CacheManager@1ac9928                            // cacheManager
org.apache.spark.HttpFileServer@131d67                           // httpFileServer
org.apache.spark.metrics.MetricsSystem@516ac3                    // metricsSystem
org.apache.spark.broadcast.BroadcastManager@f8008d               // broadcastManager
C:\Users\hadoop\AppData\Local\Temp\spark-7f0f46d9-28d0-4e8d-94d0-9a8f8f589d14  // sparkFilesDir
```

All of these are handed to the SparkEnv constructor:

```scala
new SparkEnv(
  executorId,
  actorSystem,
  serializer,
  closureSerializer,
  cacheManager,
  mapOutputTracker,
  shuffleManager,
  broadcastManager,
  blockTransferService,
  blockManager,
  securityManager,
  httpFileServer,
  sparkFilesDir,
  metricsSystem,
  shuffleMemoryManager,
  conf)
```
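The resulting object becomes the environment singleton for the current JVM. As a minimal sketch (assuming the Spark 1.x API, where SparkEnv.get exposes these services as public fields), a few of them can be printed from the driver:

```scala
import org.apache.spark.{SparkConf, SparkContext, SparkEnv}

object SparkEnvInspect {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("SparkEnvInspect").setMaster("local"))

    // SparkEnv.get returns the environment created for this JVM (the driver here).
    val env = SparkEnv.get
    println(env.shuffleManager)    // SortShuffleManager, the default sort-based shuffle
    println(env.serializer)        // JavaSerializer unless spark.serializer overrides it
    println(env.blockManager)     // manages cached blocks and shuffle data
    println(env.mapOutputTracker) // MapOutputTrackerMaster on the driver side

    sc.stop()
  }
}
```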
The source code under analysis:
```scala
package spark.examples

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object SparkWordCount {
  def main(args: Array[String]) {
    System.setProperty("hadoop.home.dir", "E:\\devsoftware\\hadoop-2.5.2\\hadoop-2.5.2")
    val conf = new SparkConf()
    conf.setAppName("SparkWordCount")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    val rdd1 = sc.textFile("file:///D:/word.in")
    println(rdd1.toDebugString)
    val rdd2 = rdd1.flatMap(_.split(" "))
    println("rdd2:" + rdd2.toDebugString)
    val rdd3 = rdd2.map((_, 1))
    println("rdd3:" + rdd3.toDebugString)
    val rdd4 = rdd3.reduceByKey(_ + _)
    println("rdd4:" + rdd4.toDebugString)
    rdd4.saveAsTextFile("file:///D:/wordout" + System.currentTimeMillis())
    sc.stop()
  }
}
```
The RDD dependency graph printed by toDebugString:
```
RDD1
rdd1: (1) file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
 |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []

RDD2
rdd2: (1) FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
 |  file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
 |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []

RDD3
rdd3: (1) MappedRDD[3] at map at SparkWordCount.scala:19 []
 |  FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
 |  file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
 |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []

RDD4
rdd4: (1) ShuffledRDD[4] at reduceByKey at SparkWordCount.scala:21 []
 +-(1) MappedRDD[3] at map at SparkWordCount.scala:19 []
    |  FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
    |  file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
    |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []
```
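The `+-(1)` line in rdd4's graph marks the shuffle introduced by reduceByKey: everything below it belongs to the map-side stage, while ShuffledRDD[4] begins a new stage. As a hedged sketch (printLineage is a hypothetical helper, not part of Spark), the same structure can be recovered from RDD.dependencies:

```scala
import org.apache.spark.rdd.RDD

// Hypothetical helper: recursively print an RDD and the dependency type
// linking it to each parent. A ShuffleDependency corresponds to the "+-"
// stage boundary that toDebugString draws.
def printLineage(rdd: RDD[_], indent: String = ""): Unit = {
  println(indent + rdd)
  rdd.dependencies.foreach { dep =>
    println(indent + "  via " + dep.getClass.getSimpleName)
    printLineage(dep.rdd, indent + "    ")
  }
}

printLineage(rdd4)  // OneToOneDependency links back to HadoopRDD[0]; one ShuffleDependency at the top
```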
The saveAsTextFile action eventually reaches PairRDDFunctions.saveAsHadoopDataset, whose writeToFile closure is what each task executes to write its partition:

```scala
val writeToFile = (context: TaskContext, iter: Iterator[(K, V)]) => {
  val config = wrappedConf.value
  // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
  // around by taking a mod. We expect that no task will be attempted 2 billion times.
  val attemptNumber = (context.attemptId % Int.MaxValue).toInt

  val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context, config)

  writer.setup(context.stageId, context.partitionId, attemptNumber)
  writer.open()
  try {
    var recordsWritten = 0L
    while (iter.hasNext) {
      val record = iter.next()
      writer.write(record._1.asInstanceOf[AnyRef], record._2.asInstanceOf[AnyRef])

      // Update bytes written metric every few records
      maybeUpdateOutputMetrics(bytesWrittenCallback, outputMetrics, recordsWritten)
      recordsWritten += 1
    }
  } finally {
    writer.close()
  }
  writer.commit()
  bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
}
```
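For orientation, the route from the action to this closure in Spark 1.x looks roughly like the following (a simplified sketch of the call chain; the ClassTag plumbing the real saveAsTextFile carries is omitted):

```scala
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.TextOutputFormat

// RDD.saveAsTextFile, simplified: wrap every element in a Hadoop
// (NullWritable, Text) pair and delegate to the Hadoop write path.
def saveAsTextFile(path: String) {
  this.map(x => (NullWritable.get(), new Text(x.toString)))
    .saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path)
}
// saveAsHadoopFile builds a JobConf for the output format, then calls
// saveAsHadoopDataset, which submits writeToFile as the task body via
// self.context.runJob -- one invocation per partition.
```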