[spark] DAGScheduler submit stage source code analysis

After the DAGScheduler has divided the stages (see [spark] DAGScheduler divides the stage source code analysis), it submits them through submitStage(finalStage):

 private def submitStage(stage: Stage) {
    val jobId = activeJobForStage(stage)
    if (jobId.isDefined) {
      logDebug("submitStage(" + stage + ")")
      if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
        // Get the parent stages that have not finished computing. A stage counts as finished
        // when _numAvailableOutputs == numPartitions, i.e. the number of available outputs
        // equals the number of partitions.
        // Sort by stage id in ascending order, because upstream stages have smaller ids.
        val missing = getMissingParentStages(stage).sortBy(_.id)
        logDebug("missing: " + missing)
        if (missing.isEmpty) {
          logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
          submitMissingTasks(stage, jobId.get) // If the stage has no dependencies, or all of them are ready, submit its tasks.
        } else {
          // If there are unfinished parent stages, submit them recursively
          // and mark the current stage as waiting until its parents have finished.
          for (parent <- missing) {
            submitStage(parent)
          }
          waitingStages += stage
        }
      }
    } else {
      abortStage(stage, "No active job for stage " + stage.id, None)
    }
  }
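
For intuition, here is a minimal user-level job (a sketch of my own, not from the original article; the object name, app name and local master are assumptions) whose action produces a ResultStage with one ShuffleMapStage parent. When submitStage(finalStage) runs, getMissingParentStages returns the ShuffleMapStage, which is submitted first, while the ResultStage is parked in waitingStages until the map outputs are available:

import org.apache.spark.{SparkConf, SparkContext}

object TwoStageJobDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("two-stage-demo").setMaster("local[2]"))
    val counts = sc.parallelize(Seq("a", "b", "a"))
      .map(word => (word, 1))
      .reduceByKey(_ + _) // shuffle boundary -> parent ShuffleMapStage
      .collect()          // action -> ResultStage, i.e. the finalStage passed to submitStage
    println(counts.toSeq)
    sc.stop()
  }
}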

Take a look at the implementation of getMissingParentStages:

private def getMissingParentStages(stage: Stage): List[Stage] = {
    val missing = new HashSet[Stage] // parent stages that are not yet available
    val visited = new HashSet[RDD[_]] // RDDs that have already been visited
    // We are manually maintaining a stack here to prevent StackOverflowError
    // caused by recursively visiting
    val waitingForVisit = new Stack[RDD[_]] // RDDs waiting to be visited
    def visit(rdd: RDD[_]) {
      if (!visited(rdd)) {
        visited += rdd
        // First check whether the RDD has any uncached partitions; if every partition is
        // cached, there is no need to compute the parent stages at all.
        // Otherwise, walk the RDD's dependencies: for a shuffle (wide) dependency, get the
        // corresponding ShuffleMapStage and check whether it is available, i.e. whether its
        // number of available outputs equals the number of partitions of its final RDD.
        // If not, some partitions are still uncomputed, so add that stage to missing;
        // for a narrow dependency, keep traversing upwards.
        val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
        if (rddHasUncachedPartitions) {
          for (dep <- rdd.dependencies) {
            dep match {
              case shufDep: ShuffleDependency[_, _, _] =>
                val mapStage = getShuffleMapStage(shufDep, stage.firstJobId)
                if (!mapStage.isAvailable) {
                  missing += mapStage
                }
              case narrowDep: NarrowDependency[_] =>
                waitingForVisit.push(narrowDep.rdd)
            }
          }
        }
      }
    }
    waitingForVisit.push(stage.rdd)
    while (waitingForVisit.nonEmpty) {
      visit(waitingForVisit.pop())
    }
    missing.toList
  }
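
Note the implementation detail called out in the comment: the traversal uses an explicit Stack instead of recursion, so a very long lineage cannot overflow the JVM call stack. A stripped-down standalone sketch of the same pattern (my own illustration; Node and its parents field are hypothetical stand-ins for RDD and rdd.dependencies):

import scala.collection.mutable.{HashSet, Stack}

object IterativeDfsDemo {
  // Hypothetical stand-in for an RDD and its parent dependencies
  case class Node(id: Int, parents: List[Node])

  def collectAncestors(root: Node): List[Node] = {
    val visited = new HashSet[Node]
    val waitingForVisit = new Stack[Node]
    waitingForVisit.push(root)
    while (waitingForVisit.nonEmpty) {
      val node = waitingForVisit.pop()
      if (!visited(node)) {
        visited += node
        node.parents.foreach(waitingForVisit.push) // no recursive call, so no StackOverflowError
      }
    }
    visited.toList
  }

  def main(args: Array[String]): Unit = {
    // a <- b <- c ; an arbitrarily deep chain would be handled the same way
    val a = Node(1, Nil)
    val b = Node(2, List(a))
    val c = Node(3, List(b))
    println(collectAncestors(c).map(_.id).sorted) // List(1, 2, 3)
  }
}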

If the current stage has no dependencies, or all of its dependencies are ready, the tasks are submitted through submitMissingTasks. Let's look at its implementation:

private def submitMissingTasks(stage: Stage, jobId: Int) {
    stage.pendingPartitions.clear()
    // Get the partitions that need to be computed
    val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()
    val properties = jobIdToActiveJob(jobId).properties
    runningStages += stage  // mark the stage as running
    ......
    // Get the preferred (best) locations for each task
    val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
      stage match {
        case s: ShuffleMapStage =>
          partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
        case s: ResultStage =>
          val job = s.activeJob.get
          partitionsToCompute.map { id =>
            val p = s.partitions(id)
            (id, getPreferredLocs(stage.rdd, p))
          }.toMap
      }
    } catch {
        ...
    }

    stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
    listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

    var taskBinary: Broadcast[Array[Byte]] = null
    try {
      val taskBinaryBytes: Array[Byte] = stage match {
        case stage: ShuffleMapStage =>
          JavaUtils.bufferToArray(
            closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
        case stage: ResultStage =>
          JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
      }
      taskBinary = sc.broadcast(taskBinaryBytes)
    } catch {
       ...
    }
    val tasks: Seq[Task[_]] = try {
      stage match {
        case stage: ShuffleMapStage =>
          partitionsToCompute.map { id =>
            val locs = taskIdToLocations(id)
            val part = stage.rdd.partitions(id)
            new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, stage.latestInfo.taskMetrics, properties)
          }
        case stage: ResultStage =>
          val job = stage.activeJob.get
          partitionsToCompute.map { id =>
            val p: Int = stage.partitions(id)
            val part = stage.rdd.partitions(p)
            val locs = taskIdToLocations(id)
            new ResultTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, id, properties, stage.latestInfo.taskMetrics)
          }
      }
    } catch {
      ...
    }
    if (tasks.size > 0) {
      stage.pendingPartitions ++= tasks.map(_.partitionId)
      taskScheduler.submitTasks(new TaskSet(
        tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
      stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
    } else {  
        ...
    }
  }

Each step will be explained in detail below:
stage.findMissingPartitions gets the partitions that still need to be computed. The two stage types implement it differently:

//ShuffleMapStage
//A partition needs to be computed if it has no entry in outputLocs; partitions that have been computed are recorded in outputLocs.
override def findMissingPartitions(): Seq[Int] = {
    val missing = (0 until numPartitions).filter(id => outputLocs(id).isEmpty)
    assert(missing.size == numPartitions - _numAvailableOutputs,
      s"${missing.size} missing, expected ${numPartitions - _numAvailableOutputs}")
    missing
  }

//ResultStage
//Partitions that have already been computed are marked as finished in the active job.
 override def findMissingPartitions(): Seq[Int] = {
    val job = activeJob.get
    (0 until job.numPartitions).filter(id => !job.finished(id))
  }
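
As a toy illustration of the ShuffleMapStage case (my own sketch, not Spark source; outputLocs is modelled here as a plain array of location lists), the partitions whose entry is still empty are exactly the ones that must be computed:

object MissingPartitionsDemo {
  def main(args: Array[String]): Unit = {
    val numPartitions = 4
    // outputLocs(i) holds the registered map-output locations for partition i;
    // partitions 1 and 3 have produced no output yet
    val outputLocs: Array[List[String]] = Array(List("exec-1"), Nil, List("exec-2"), Nil)
    val missing = (0 until numPartitions).filter(id => outputLocs(id).isEmpty)
    println(missing) // Vector(1, 3)
  }
}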

taskIdToLocations obtains the preferred (best) locations for each task, mainly through the getPreferredLocs method, which delegates to getPreferredLocsInternal:

private def getPreferredLocsInternal(
      rdd: RDD[_],
      partition: Int,
      visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
    // If the partition has already been visited, no need to re-visit.
    // This avoids exponential path exploration.  SPARK-695
    if (!visited.add((rdd, partition))) {
      // Nil has already been returned for previously visited partitions.
      return Nil
    }
    // If the partition is cached, return the cache locations
    val cached = getCacheLocs(rdd)(partition)
    if (cached.nonEmpty) {
      return cached
    }
    // If the RDD has some placement preferences (as is the case for input RDDs), get those
    val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
    if (rddPrefs.nonEmpty) {
      return rddPrefs.map(TaskLocation(_))
    }
    // If the RDD has narrow dependencies, pick the first partition of the first narrow dependency
    // that has any placement preferences. Ideally we would choose based on transfer sizes,
    // but this will do for now.
    rdd.dependencies.foreach {
      case n: NarrowDependency[_] =>
        for (inPart <- n.getParents(partition)) {
          val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
          if (locs != Nil) {
            return locs
          }
        }
      case _ =>
    }
    Nil
  }
  • In the getCacheLocs method, cacheLocs maintains the location information (TaskLocation instances) for each partition of the RDD. If the partition's locations are found in cacheLocs, they are returned directly. If not: when the RDD's storage level is NONE, Nil is cached and returned; otherwise the blockManagers holding the partition's blocks are looked up through blockManagerMaster, wrapped as ExecutorCacheTaskLocation instances, and stored in cacheLocs.
  • rdd.preferredLocations first tries to obtain the partition's locations from the checkpoint; if none are found, it falls back to the RDD's own getPreferredLocations(split) method, which each RDD implements differently. HadoopRDD, for example, derives the location of the current partition from its Hadoop InputSplit.
  • If neither of the above yields a location, the preferred locations of the parent RDD's partitions are searched recursively. Note that this recursion only follows narrow dependencies. A simplified sketch of this resolution order follows this list.
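
A standalone sketch of that resolution order (my own simplification; Part and its fields are hypothetical stand-ins, not Spark types):

object PreferredLocsDemo {
  // Hypothetical model of a partition: cached block locations, the RDD's own placement
  // preferences, and the corresponding partitions of narrow-dependency parents
  case class Part(cached: Seq[String], rddPrefs: Seq[String], narrowParents: Seq[Part])

  def preferredLocs(p: Part): Seq[String] = {
    if (p.cached.nonEmpty) return p.cached       // 1. cacheLocs hit
    if (p.rddPrefs.nonEmpty) return p.rddPrefs   // 2. rdd.preferredLocations (checkpoint / InputSplit)
    for (parent <- p.narrowParents) {            // 3. recurse, but only through narrow dependencies
      val locs = preferredLocs(parent)
      if (locs.nonEmpty) return locs
    }
    Nil
  }

  def main(args: Array[String]): Unit = {
    val hadoopLike = Part(cached = Nil, rddPrefs = Seq("host-a", "host-b"), narrowParents = Nil)
    val mapped     = Part(cached = Nil, rddPrefs = Nil, narrowParents = Seq(hadoopLike))
    println(preferredLocs(mapped)) // List(host-a, host-b)
  }
}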

After the task locations have been obtained, different serialized binaries are broadcast to the executors depending on the stage type: for a ShuffleMapStage, the stage's final RDD and its shuffleDep are serialized and broadcast; for a ResultStage, the final RDD and the stage's func are serialized and broadcast. In other words, the actual execution logic of the tasks is serialized into taskBinary and broadcast to every executor.

 var taskBinary: Broadcast[Array[Byte]] = null
    try {
      // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
      // For ResultTask, serialize and broadcast (rdd, func).
      val taskBinaryBytes: Array[Byte] = stage match {
        case stage: ShuffleMapStage =>
          JavaUtils.bufferToArray(
            closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
        case stage: ResultStage =>
          JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
      }

      taskBinary = sc.broadcast(taskBinaryBytes)
    } catch {
       ...
    }
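
The point of the broadcast is that the (potentially large) serialized execution logic is shipped to each executor once per stage instead of once per task. A user-level sketch of the same idea (my own example; assumptions: a local master and plain Java serialization in place of Spark's internal closureSerializer):

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import org.apache.spark.{SparkConf, SparkContext}

object BroadcastBinaryDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("broadcast-binary-demo").setMaster("local[2]"))

    // Serialize a payload to bytes once on the driver
    val payload: Map[String, Int] = Map("a" -> 1, "b" -> 2)
    val bos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(bos)
    oos.writeObject(payload)
    oos.close()

    // Broadcast the bytes; every task reads the same broadcast value
    val binary = sc.broadcast(bos.toByteArray)

    val result = sc.parallelize(Seq("a", "b", "a"), 2).map { key =>
      val ois = new ObjectInputStream(new ByteArrayInputStream(binary.value))
      val lookup = ois.readObject().asInstanceOf[Map[String, Int]]
      lookup(key)
    }.collect()

    println(result.toSeq) // Seq(1, 2, 1)
    sc.stop()
  }
}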

Different types of tasks are created depending on the stage type: one task per partition to compute, each carrying the location information of its target partition. Finally, all tasks are packaged into a TaskSet and submitted.

stage match {
        case stage: ShuffleMapStage =>
          partitionsToCompute.map { id =>
            val locs = taskIdToLocations(id)
            val part = stage.rdd.partitions(id)
            new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, stage.latestInfo.taskMetrics, properties)
          }
        case stage: ResultStage =>
          val job = stage.activeJob.get
          partitionsToCompute.map { id =>
            val p: Int = stage.partitions(id)
            val part = stage.rdd.partitions(p)
            val locs = taskIdToLocations(id)
            new ResultTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, id, properties, stage.latestInfo.taskMetrics)
          }
      }

taskScheduler.submitTasks(new TaskSet(
        tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))

At this point the DAGScheduler has finished dividing the stages and has handed them to the TaskScheduler in the form of TaskSets. From here on, the TaskScheduler manages and schedules the tasks, and the subsequent task launch sequence begins.
