Spark任务提交全流程的源码的类调用时序图
本篇博客主要是Spark任务提交到执行的全流程中的第二部分:从在本地执行代码碰到Action算子进行runJob到最后Task提交到Executor上执行。
1、RDD类中,Action算子触发任务提交
1、这里RDD调用算子生成新的RDD的时候,会把前一个RDD传入到下一个RDD的构造函数,作为成员变量保存。
2、当为Action算子的时候会调用SparkContext.runJob()去提交任务。并且在runJob()方法中调用了DAGScheduler.runJob()去做Stage切分操作。
/* 当用户编写的spark的代码碰到Action算子的时候,就开始提交任务,下面以foreachPartition算子为例 */
// Walk-through of how an action operator on an RDD ends up in DAGScheduler.runJob.
// NOTE(review): the nested `call(...) { ... }` blocks below do not look like literal Scala; each block
// appears to show the body of the method invoked on the line that opens it.
RDD.scala {
// Auxiliary RDD constructor: builds the RDD from its single parent's context and a
// OneToOneDependency on that parent.
def this(@transient oneParent: RDD[_]) =
this(oneParent.context, List(new OneToOneDependency(oneParent)))
// foreachPartition is an action operator: it cleans the user function and calls sc.runJob.
def foreachPartition(f: Iterator[T] => Unit): Unit = withScope {
val cleanF = sc.clean(f)
// sc is the SparkContext instance.
sc.runJob(this, (iter: Iterator[T]) => cleanF(iter)) {
// Start running the job over every partition of the RDD. Per the original author's note this path is
// only reached from action operators (see the actions implemented in the RDD class).
runJob(rdd, func, 0 until rdd.partitions.length){
// Clean the closure so that it is ready to be serialized and shipped with the job.
val cleanedFunc = clean(func)
runJob(rdd, (ctx: TaskContext, it: Iterator[T]) => cleanedFunc(it), partitions){
// One slot per requested partition; the result handler passed below stores each partition's result here.
val results = new Array[U](partitions.size)
runJob[T, U](rdd, func, partitions, (index, res) => results(index) = res){
val callSite = getCallSite
val cleanedFunc = clean(func)
logInfo("Starting job: " + callSite.shortForm)
// Optionally log the RDD lineage when spark.logLineage is enabled (defaults to false here).
if (conf.getBoolean("spark.logLineage", false)) {
logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
}
// Hand the job over to the DAGScheduler, which splits it into stages and submits the tasks.
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
progressBar.foreach(_.finishAll())
rdd.doCheckpoint()
}
}
}
}
}
}
2、DAGScheduler类
1、DAGScheduler这个类里面的代码非常多,这里主要关注的是:SparkContext.runJob()中调用了DAGScheduler.runJob(),由此开始进行Stage划分,并进行Task的提交。
2、划分Stage是利用DFS从后往前遍历,每遇到一个ShuffleDependency就创建一个Stage。入口是创建最后一个Stage(ResultStage):在创建它之前,先从后往前遍历得到所有尚未创建Stage的ShuffleDependency并存储到一个栈中,然后遍历整个栈依次从前往后创建ShuffleMapStage,最后才创建ResultStage。
3、提交Stage需要从前往后提交,先提交祖先Stage然后再提交子Stage。并且根据Stage中分区数的数量生成对应的Task数组,然后封装到TaskSet中,然后通过TaskScheduler去提交Task给Executor执行。
// TODO DAGScheduler主要负责DAG中的Stage的划分
DAGScheduler.scala {
/**
 * Submits an action job to the scheduler and waits for it to finish.
 *
 * @param rdd target RDD to run tasks on
 * @param func function to run on each partition of the RDD
 * @param partitions indexes of the partitions to compute
 * @param callSite where in the user program this job was called
 * @param resultHandler callback invoked with (partition index, result)
 * @param properties scheduler properties to attach to the job
 *
 * On failure the exception from the job is rethrown with the caller's stack trace appended.
 */
def runJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): Unit = {
// Start time in nanoseconds, used only for the "took %f s" log lines below.
val start = System.nanoTime
// Submit the action job to the scheduler.
// NOTE(review): the `{ ... }` block that follows appears to show the body of submitJob inline rather
// than being literal Scala; `jobId` is not defined anywhere in this excerpt.
val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties) {
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
// Post a JobSubmitted message to the DAGScheduler's event loop.
// Per the original author's note, the underlying eventQueue is a blocking queue drained asynchronously
// by a thread owned by eventProcessLoop — not visible in this excerpt, confirm against EventLoop.
eventProcessLoop.post(JobSubmitted(
jobId, rdd, func2, partitions.toArray, callSite, waiter,
SerializationUtils.clone(properties)))
waiter
}
// Wait for the job's completion future with no timeout (Duration.Inf).
ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)
waiter.completionFuture.value.get match {
case scala.util.Success(_) =>
logInfo("Job %d finished: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
case scala.util.Failure(exception) =>
logInfo("Job %d failed: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
// SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
val callerStackTrace = Thread.currentThread().getStackTrace.tail
exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
throw exception
}
}
/**
 * Event loop named "dag-scheduler-event-loop" that forwards DAGSchedulerEvent messages
 * to the given DAGScheduler.
 */
private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler)
  extends EventLoop[DAGSchedulerEvent]("dag-scheduler-event-loop") with Logging {

  /** Invoked for every received event; the handling is wrapped in a timer measurement. */
  override def onReceive(event: DAGSchedulerEvent): Unit = {
    val timing = timer.time()
    try doOnReceive(event)
    finally timing.stop() // always stop the timer, even when the handler throws
  }

  /** Dispatches a message taken from the event queue to the matching DAGScheduler handler. */
  private def doOnReceive(event: DAGSchedulerEvent): Unit = {
    event match {
      // A submitted job: this is where the real stage splitting starts.
      case JobSubmitted(id, finalRdd, taskFunc, parts, site, jobListener, props) =>
        dagScheduler.handleJobSubmitted(id, finalRdd, taskFunc, parts, site, jobListener, props)
    }
  }
}
// TODO 这里面的代码很重要, 里面涉及到真正的Stage切分和Task提交操作
/**
 * Handles a JobSubmitted event: builds the final ResultStage for the job, wraps it in an
 * ActiveJob and submits the stage.
 *
 * @param jobId id of the submitted job
 * @param finalRDD last RDD of the job's lineage (the one the action was called on)
 * @param func function run on each partition of `finalRDD`
 * @param partitions indexes of the partitions to compute
 * @param callSite where in the user program the job was triggered
 * @param listener listener notified about the job's progress
 * @param properties scheduler properties attached to the job
 */
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties): Unit = {
  // Build the final stage. `finalStage` is declared here as a local value; the original
  // snippet assigned to it without ever declaring it.
  val finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
  // Create the job object for the final stage.
  // NOTE(review): this excerpt never registers `job` anywhere (e.g. in a jobId -> ActiveJob
  // map); the full scheduler presumably does so before submitting — confirm upstream.
  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  // Submit the final stage.
  submitStage(finalStage)
}
/**
 * Creates the ResultStage (the last stage of the job) for the given RDD, creating its
 * parent stages first, and records it in the stage bookkeeping maps.
 *
 * @param rdd the last RDD of the whole DAG
 * @param func function run on each partition
 * @param partitions indexes of the partitions to compute
 * @param jobId id of the job the stage belongs to
 * @param callSite where in the user program the job was triggered
 * @return the newly created ResultStage
 */
private def createResultStage(
    rdd: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    jobId: Int,
    callSite: CallSite): ResultStage = {
  // Barrier-stage checks run before any stage is created.
  checkBarrierStageWithDynamicAllocation(rdd)
  checkBarrierStageWithNumSlots(rdd)
  val distinctPartitionCount = partitions.toSet.size
  checkBarrierStageWithRDDChainPattern(rdd, distinctPartitionCount)

  // Parent stages are resolved before this stage takes its id.
  val parentStages = getOrCreateParentStages(rdd, jobId)
  val stageId = nextStageId.getAndIncrement()
  val resultStage =
    new ResultStage(stageId, rdd, func, partitions, parentStages, jobId, callSite)

  // Remember the stageId -> stage mapping and tie the stage to the job.
  stageIdToStage.update(stageId, resultStage)
  updateJobIdStageIdMaps(jobId, resultStage)
  resultStage
}
// TODO 这里的逻辑是从最后一个RDD向前遍历当碰到一个ShuffleDependency就创建一个ShuffleMapStage
// TODO Stage里面包含了Prev指针,其中包含了之前的NarrowDependency的RDD
/**
 * Returns the parent stages of `rdd`: one ShuffleMapStage (looked up or created) for each
 * shuffle dependency returned by getShuffleDependencies(rdd).
 */
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
  // Nearest shuffle dependencies reachable from this RDD.
  val directShuffleDeps = getShuffleDependencies(rdd)
  // Turn each of them into its ShuffleMapStage.
  val parentStages =
    for (dep <- directShuffleDeps) yield getOrCreateShuffleMapStage(dep, firstJobId)
  parentStages.toList
}
// TODO 只会返回直接的和当前RDD相连的第一个ShuffleDependency
/**
 * Collects the shuffle dependencies nearest to `rdd`: the traversal walks backwards through
 * non-shuffle dependencies and stops at the first shuffle dependency on each path.
 */
private[scheduler] def getShuffleDependencies(
    rdd: RDD[_]): HashSet[ShuffleDependency[_, _, _]] = {
  val shuffleDeps = new HashSet[ShuffleDependency[_, _, _]]
  val seen = new HashSet[RDD[_]]
  // Explicit stack: depth-first walk from the given RDD towards its ancestors.
  val pending = new ArrayStack[RDD[_]]
  pending.push(rdd)
  while (pending.nonEmpty) {
    val current = pending.pop()
    if (!seen.contains(current)) {
      seen += current
      for (dep <- current.dependencies) {
        dep match {
          // A shuffle dependency ends this path: record it and do not look past it.
          case shuffleDep: ShuffleDependency[_, _, _] =>
            shuffleDeps += shuffleDep
          // Any other dependency: keep walking through its parent RDD.
          case other =>
            pending.push(other.rdd)
        }
      }
    }
  }
  shuffleDeps
}
// TODO 如果shuffleIdToMapStage中存在ShuffleMapStage, 则获取一个ShuffleMapStage。否则,如果ShuffleMapStage还不存在
// TODO 该方法将创建ShuffleMapStage, 此外还将创建任何丢失(还没有开始创建的)的祖先的ShuffleMapStage
// TODO 当第一次调用该方法的时候会创建所有的Stage出来,当第二次调用的时候所有Stage都存在了所以直接返回了
/**
 * Returns the ShuffleMapStage registered in shuffleIdToMapStage for `shuffleDep`, if any.
 * Otherwise creates stages for every missing ancestor shuffle dependency and then the
 * stage for `shuffleDep` itself.
 */
private def getOrCreateShuffleMapStage(
    shuffleDep: ShuffleDependency[_, _, _],
    firstJobId: Int): ShuffleMapStage = {
  // shuffleIdToMapStage maps a shuffle id to its stage; reuse the stage when it is there.
  shuffleIdToMapStage.get(shuffleDep.shuffleId).getOrElse {
    // The missing ancestors come back as a stack that was filled walking backwards, so
    // iterating it creates the stages from the earliest ancestor forwards.
    val missingAncestors = getMissingAncestorShuffleDependencies(shuffleDep.rdd)
    for (ancestorDep <- missingAncestors) {
      // Re-check on every iteration: creating a stage for an earlier dependency may already
      // have registered this one in the meantime (see SPARK-13902).
      if (!shuffleIdToMapStage.contains(ancestorDep.shuffleId)) {
        createShuffleMapStage(ancestorDep, firstJobId)
      }
    }
    // The dependency passed in is not part of its own ancestors, so create its stage last.
    createShuffleMapStage(shuffleDep, firstJobId)
  }
}
// TODO 查找尚未在shuffleIdToMapStage中注册的祖先ShuffleMapStage, 并返回一个包含所有祖先ShuffleDependency的栈,最早的在最前面
/**
 * Finds the ancestor shuffle dependencies of `rdd` that are not yet registered in
 * shuffleIdToMapStage and returns them as a stack.
 */
private def getMissingAncestorShuffleDependencies(
    rdd: RDD[_]): ArrayStack[ShuffleDependency[_, _, _]] = {
  // Unregistered ancestor shuffle dependencies found so far.
  val ancestors = new ArrayStack[ShuffleDependency[_, _, _]]
  // RDDs that have already been expanded.
  val seen = new HashSet[RDD[_]]
  // A hand-maintained stack replaces recursion so that long lineages cannot cause a
  // StackOverflowError.
  val pending = new ArrayStack[RDD[_]]
  pending.push(rdd)
  while (pending.nonEmpty) {
    val current = pending.pop()
    if (!seen.contains(current)) {
      seen += current
      // Look at the nearest shuffle dependencies of the current RDD.
      for (dep <- getShuffleDependencies(current)) {
        val alreadyRegistered = shuffleIdToMapStage.contains(dep.shuffleId)
        // A registered dependency is skipped together with everything behind it.
        if (!alreadyRegistered) {
          ancestors.push(dep)
          pending.push(dep.rdd)
        }
      }
    }
  }
  ancestors
}
// TODO 创建一个ShuffleMapStage,它生成给定的shuffle依赖项的partition。如果以前运行的Stage生成了相同的shuffle数据
// TODO 此函数将复制前一次shuffle仍然可用的输出位置,以避免不必要的重新生成数据。
/**
 * Creates the ShuffleMapStage for the given shuffle dependency, records it in
 * stageIdToStage and shuffleIdToMapStage, and registers the shuffle with the
 * MapOutputTracker when the tracker does not know it yet.
 *
 * @param shuffleDep shuffle dependency whose map side this stage computes
 * @param jobId id of the job that triggers the creation
 * @return the newly created stage
 */
def createShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _], jobId: Int): ShuffleMapStage = {
  val mapSideRdd = shuffleDep.rdd
  checkBarrierStageWithDynamicAllocation(mapSideRdd)
  checkBarrierStageWithNumSlots(mapSideRdd)
  checkBarrierStageWithRDDChainPattern(mapSideRdd, mapSideRdd.getNumPartitions)

  val taskCount = mapSideRdd.partitions.length
  // Stages this stage depends on; resolved before this stage takes its id.
  val parentStages = getOrCreateParentStages(mapSideRdd, jobId)
  val stageId = nextStageId.getAndIncrement()
  val mapStage = new ShuffleMapStage(
    stageId, mapSideRdd, taskCount, parentStages, jobId, mapSideRdd.creationSite,
    shuffleDep, mapOutputTracker)

  // Every created stage is indexed both by stage id and by shuffle id.
  stageIdToStage.update(stageId, mapStage)
  shuffleIdToMapStage.update(shuffleDep.shuffleId, mapStage)
  updateJobIdStageIdMaps(jobId, mapStage)

  val shuffleAlreadyRegistered = mapOutputTracker.containsShuffle(shuffleDep.shuffleId)
  if (!shuffleAlreadyRegistered) {
    // Registration happens here instead of in the RDD constructor because the number of
    // partitions is not known at construction time.
    logInfo("Registering RDD " + mapSideRdd.id + " (" + mapSideRdd.getCreationSite + ")")
    mapOutputTracker.registerShuffle(shuffleDep.shuffleId, mapSideRdd.partitions.length)
  }
  mapStage
}
// TODO 由于提交的时候传入的是finalStage,所以需要递归地先提交祖先Stage, 从前往后提交
/**
 * Submits a stage, first recursively submitting any missing parent stages. The caller
 * passes the final stage, so ancestors end up submitted before their children.
 */
private def submitStage(stage: Stage) {
  activeJobForStage(stage) match {
    case None =>
      // No active job owns this stage: abort it.
      abortStage(stage, "No active job for stage " + stage.id, None)

    case Some(activeJobId) =>
      logDebug("submitStage(" + stage + ")")
      // Only act on a stage that is not already waiting, running or failed.
      val alreadyTracked = waitingStages(stage) || runningStages(stage) || failedStages(stage)
      if (!alreadyTracked) {
        val missingParents = getMissingParentStages(stage).sortBy(_.id)
        logDebug("missing: " + missingParents)
        if (missingParents.nonEmpty) {
          // Submit the missing parents first, then park this stage in the waiting set.
          missingParents.foreach(submitStage)
          waitingStages += stage
        } else {
          // Nothing is missing upstream, so this stage's tasks can be submitted now.
          logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
          submitMissingTasks(stage, activeJobId)
        }
      }
  }
}
// TODO 找出当前提交任务的直接相关的父Stage
/**
 * Finds the parent ShuffleMapStages of `stage` that are not available yet, walking back
 * from the stage's RDD through narrow dependencies.
 */
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missingStages = new HashSet[Stage]
  val seenRdds = new HashSet[RDD[_]]
  // A hand-maintained stack replaces recursion so that long lineages cannot cause a
  // StackOverflowError.
  val pending = new ArrayStack[RDD[_]]

  // Expands one RDD: records unavailable map stages and queues narrow parents.
  def visit(rdd: RDD[_]): Unit = {
    if (!seenRdds.contains(rdd)) {
      seenRdds += rdd
      // Parents are only inspected when some partition of this RDD has no cache location.
      if (getCacheLocs(rdd).contains(Nil)) {
        rdd.dependencies.foreach {
          case shufDep: ShuffleDependency[_, _, _] =>
            val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)
            if (!mapStage.isAvailable) {
              missingStages += mapStage
            }
          case narrowDep: NarrowDependency[_] =>
            pending.push(narrowDep.rdd)
        }
      }
    }
  }

  pending.push(stage.rdd)
  while (pending.nonEmpty) {
    visit(pending.pop())
  }
  missingStages.toList
}
// TODO 当Stage的祖先Stage有效时该方法被调用,我们现在可以提交它的Task
/**
 * Builds and submits the tasks for the partitions of `stage` that still need computing.
 * Per the original author's note, this is called once the stage's parent stages are available.
 *
 * Steps visible below: mark the stage as running, work out preferred locations per partition,
 * start a new stage attempt, serialize and broadcast the task binary, create one task per
 * missing partition and hand them to the TaskScheduler as a TaskSet. Any failure along the way
 * aborts the stage, removes it from runningStages and returns early.
 *
 * @param stage stage whose missing tasks are submitted
 * @param jobId id of the active job used to look up scheduling properties
 */
private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")
// First figure out the indexes of partition ids to compute.
val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()
// Use the scheduling pool, job group, description, etc. from an ActiveJob associated
// with this Stage
val properties = jobIdToActiveJob(jobId).properties
// Put the stage into the set of running stages.
runningStages += stage
// SparkListenerStageSubmitted should be posted before testing whether tasks are
// serializable. If tasks are not serializable, a SparkListenerStageCompleted event
// will be posted, which should always come after a corresponding SparkListenerStageSubmitted
// event.
// Initialise the stage's state with the output commit coordinator.
stage match {
case s: ShuffleMapStage =>
outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
case s: ResultStage =>
outputCommitCoordinator.stageStart(
stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
}
// Preferred locations for each partition id that has to be computed.
val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
case s: ResultStage =>
partitionsToCompute.map { id =>
// For a result stage `id` indexes into s.partitions, which holds the RDD partition number.
val p = s.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
} catch {
case NonFatal(e) =>
// Still create the attempt and post the submitted event before aborting the stage.
stage.makeNewStageAttempt(partitionsToCompute.size)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
runningStages -= stage
return
}
stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
// If there are tasks to execute, record the submission time of the stage. Otherwise,
// post the event without the submission time, which indicates that this stage was
// skipped.
if (partitionsToCompute.nonEmpty) {
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
}
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
// TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
var partitions: Array[Partition] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
var taskBinaryBytes: Array[Byte] = null
// taskBinaryBytes and partitions are both effected by the checkpoint status. We need
// this synchronization in case another concurrent job is checkpointing this RDD, so we get a
// consistent view of both variables.
RDDCheckpointData.synchronized {
taskBinaryBytes = stage match {
case stage: ShuffleMapStage =>
JavaUtils.bufferToArray(
closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
case stage: ResultStage =>
JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
}
partitions = stage.rdd.partitions
}
// Broadcast the serialized bytes: (rdd, shuffleDep) for a ShuffleMapStage, (rdd, func) for a ResultStage.
taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// In the case of a failure during serialization, abort the stage.
case e: NotSerializableException =>
abortStage(stage, "Task not serializable: " + e.toString, Some(e))
runningStages -= stage
// Abort execution
return
case e: Throwable =>
abortStage(stage, s"Task serialization failed: $e\n${Utils.exceptionString(e)}", Some(e))
runningStages -= stage
// Abort execution
return
}
// Build one task object per partition to compute.
val tasks: Seq[Task[_]] = try {
val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
stage match {
case stage: ShuffleMapStage =>
stage.pendingPartitions.clear()
// One ShuffleMapTask is created for every partition that has to be computed.
partitionsToCompute.map { id =>
val locs = taskIdToLocations(id)
val part = partitions(id)
stage.pendingPartitions += id
// ShuffleMapStage: each partition becomes one ShuffleMapTask.
new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
Option(sc.applicationId), sc.applicationAttemptId, stage.rdd.isBarrier())
}
case stage: ResultStage =>
partitionsToCompute.map { id =>
val p: Int = stage.partitions(id)
val part = partitions(p)
val locs = taskIdToLocations(id)
// ResultStage: each partition becomes one ResultTask.
new ResultTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, id, properties, serializedTaskMetrics,
Option(jobId), Option(sc.applicationId), sc.applicationAttemptId,
stage.rdd.isBarrier())
}
}
} catch {
case NonFatal(e) =>
abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
runningStages -= stage
return
}
if (tasks.size > 0) {
logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
// Hand the tasks to the TaskScheduler: they are converted to an array and wrapped in a TaskSet.
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
} else {
// No task was generated for this stage.
// Because we posted SparkListenerStageSubmitted earlier, we should mark
// the stage as completed here in case there are no tasks to run
markStageAsFinished(stage, None)
stage match {
case stage: ShuffleMapStage =>
logDebug(s"Stage ${stage} is actually done; " +
s"(available: ${stage.isAvailable}," +
s"available outputs: ${stage.numAvailableOutputs}," +
s"partitions: ${stage.numPartitions})")
markMapStageJobsAsFinished(stage)
case stage : ResultStage =>
logDebug(s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})")
}
// Submit the child stages of this stage that are sitting in the waiting set.
submitWaitingChildStages(stage)
}
}
}