当发生shuffle时,sc.runJob-->DAGScheduler.runJob-->submitStage(),提交stage时,
会首先判断是否有未运行的父stage,如果没有,则调用submitMissingTasks提交stage
如果有则调用submitStage()先提交父stage
1.sc.runJob
当有action算子时,会调用sc.runJob方法,在源码中搜索sc.runJob可以看到多处action算子最终都调用了它
我们去SparkContext看下runJob方法:
/**
 * Run `func` over every partition of `rdd` and collect one result per partition.
 * `func` only receives the partition's iterator (no TaskContext).
 */
def runJob[T, U: ClassManifest](rdd: RDD[T], func: Iterator[T] => U): Array[U] = {
  // Cover all splits; allowLocal = false forces normal scheduling.
  val everyPartition = 0 until rdd.splits.size
  runJob(rdd, func, everyPartition, false)
}
/**
 * Run `func` over every partition of `rdd`; `func` additionally receives the
 * TaskContext of the running task.
 */
def runJob[T, U: ClassManifest](rdd: RDD[T], func: (TaskContext, Iterator[T]) => U): Array[U] = {
  // Cover all splits; allowLocal = false forces normal scheduling.
  val everyPartition = 0 until rdd.splits.size
  runJob(rdd, func, everyPartition, false)
}
/**
 * Run a context-free `func` on a chosen subset of partitions.
 * Adapts `func` to the (TaskContext, Iterator) shape and delegates to the
 * final overload.
 */
def runJob[T, U: ClassManifest](
  rdd: RDD[T],
  func: Iterator[T] => U,
  partitions: Seq[Int],
  allowLocal: Boolean
): Array[U] = {
  // The TaskContext is accepted and discarded by the wrapper.
  val withContext = (context: TaskContext, iter: Iterator[T]) => func(iter)
  runJob(rdd, withContext, partitions, allowLocal)
}
/**
 * The overload every other `runJob` funnels into: logs, times the job with a
 * monotonic clock, and hands the work to the configured scheduler.
 *
 * @return one result of `func` per requested partition
 */
def runJob[T, U: ClassManifest](
  rdd: RDD[T],
  func: (TaskContext, Iterator[T]) => U,
  partitions: Seq[Int],
  allowLocal: Boolean
): Array[U] = {
  logInfo("Starting job...")
  // nanoTime is monotonic and high-resolution, so it is suitable for elapsed-time measurement.
  val startedAt = System.nanoTime
  val result = scheduler.runJob(rdd, func, partitions, allowLocal)
  val elapsedSeconds = (System.nanoTime - startedAt) / 1e9
  logInfo("Job finished in " + elapsedSeconds + " s")
  result
}
可以看到有四个重载的方法,但最终都会调用第四个,从第四个可以看到调用了scheduler.runJob方法
/**
 * Choose a scheduler from the `master` string:
 * "local", "local[n]" and "local[n,m]" run in-process; anything else is
 * treated as a Mesos master URL.
 */
private var scheduler: Scheduler = {
  // Extractor patterns: the captured groups are the thread count and, for the
  // second pattern, the per-task retry limit.
  val localWithThreads = """local\[([0-9]+)\]""".r
  val localWithRetries = """local\[([0-9]+),([0-9]+)\]""".r
  master match {
    case "local" =>
      // Single worker thread, no task retries (maxFailures = 0).
      new LocalScheduler(1, 0)
    case localWithThreads(threads) =>
      // "local[n]": n worker threads, still no retries.
      new LocalScheduler(threads.toInt, 0)
    case localWithRetries(threads, maxFailures) =>
      // "local[n,m]": n worker threads, each task may fail up to m times.
      new LocalScheduler(threads.toInt, maxFailures.toInt)
    case _ =>
      // Not a local master string: load the native library and schedule on Mesos.
      MesosNativeLibrary.load()
      new MesosScheduler(this, master, frameworkName)
  }
}
2.DAGScheduler.runJob
因为MesosScheduler和LocalScheduler都继承了DAGScheduler,直接去看DAGScheduler的runJob
代码有点多,就贴重要的出来
// DAGScheduler's runJob (excerpt): the "...." lines elide setup of the
// scheduler's bookkeeping state; the visible call submits the final stage.
override def runJob[T, U](
finalRdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
allowLocal: Boolean)
(implicit m: ClassManifest[U]): Array[U] = {
// All scheduling state is mutated under a single lock.
lock.synchronized {
....定义的变量,数组,集合
// Submit the last stage of the job; its missing parents are submitted recursively.
submitStage(finalStage)
....
}
接下来看submitStage()方法,
该方法主要先判断waiting(HashSet存储等待运行的stage)和running(HashSet存储正在运行的stage)是否包含该stage,
如果都不包含,则调用getMissingParentStages获取该stage还未运行的父stage,
如果没有未运行的父stage,调用submitMissingTasks提交stage,并将stage加入running列表
如果有未运行的父stage,先提交父stage运行,并将stage加入waiting
/**
 * Submit `stage` for execution. A stage already queued in `waiting` or present
 * in `running` is ignored. Otherwise, if every parent stage has produced its
 * output the stage's tasks are submitted immediately; if not, each missing
 * parent is submitted first and this stage is parked in `waiting`.
 */
def submitStage(stage: Stage) {
  val alreadyTracked = waiting(stage) || running(stage)
  if (!alreadyTracked) {
    val missingParents = getMissingParentStages(stage)
    if (missingParents.isEmpty) {
      // No unfinished parents: this stage can run right away.
      logInfo("Submitting " + stage + ", which has no missing parents")
      submitMissingTasks(stage)
      running += stage
    } else {
      // Submit each unfinished parent recursively, then wait for them to finish.
      missingParents.foreach(submitStage)
      waiting += stage
    }
  }
}
接下来先看getMissingParentStages方法,该方法主要用于获取stage未运行的父stage,划分stage
visit()方法首先遍历未被划分stage的rdd的依赖,
如果是shuffle依赖,调用getShuffleMapStage获取父stage,
如果是窄依赖,继续调用visit()方法,直到发生shuffle
可以看出stage与父stage划分的依据就是是否发生shuffle
/**
 * Walk the dependency graph of `stage`'s RDD and collect the parent stages
 * whose output is not yet available. A stage boundary is introduced at each
 * shuffle dependency; narrow dependencies stay within the current stage.
 */
def getMissingParentStages(stage: Stage): List[Stage] = {
val missing = new HashSet[Stage]// parent stages that still need to run
val visited = new HashSet[RDD[_]]// RDDs already examined, so each is visited once
def visit(rdd: RDD[_]) {
if (!visited(rdd)) {// skip RDDs we have already classified
visited += rdd// mark as seen before recursing
val locs = getCacheLocs(rdd)
for (p <- 0 until rdd.splits.size) {// check every partition of this RDD
if (locs(p) == Nil) {// only partitions with no cached copy need recomputation
for (dep <- rdd.dependencies) {// inspect each dependency
dep match {
case shufDep: ShuffleDependency[_,_,_] =>// shuffle dependency => stage boundary
val stage = getShuffleMapStage(shufDep)// look up or create the parent shuffle stage
if (!stage.isAvailable) {
// NOTE(review): isAvailable is claimed to be true when the stage has no parents
// and no shuffle dependency — confirm against Stage.isAvailable's definition.
missing += stage// parent's output is missing; it must run before this stage
}
case narrowDep: NarrowDependency[_] =>// narrow dependency => same stage
visit(narrowDep.rdd)// keep walking upward within the current stage
}
}
}
}
}
}
visit(stage.rdd)
missing.toList
}
接下来看getShuffleMapStage()方法,该方法是发生shuffle时获取父stage,
shuffleToMapStage是HashMap存储(shuffleId,stage)键值对,首先取key为该shuffleId的值
有值,直接返回取到的stage
无值,新建Stage并将该stage加入shuffleToMapStage中
/**
 * Return the map stage for a shuffle dependency. The first time a shuffleId is
 * seen a new stage is created and registered in `shuffleToMapStage`; later
 * lookups return the cached stage.
 */
def getShuffleMapStage(shuf: ShuffleDependency[_,_,_]): Stage = {
  shuffleToMapStage.get(shuf.shuffleId) match {
    case Some(existing) =>
      existing
    case None =>
      // Unknown shuffleId: build the parent stage and remember it for next time.
      val created = newStage(shuf.rdd, Some(shuf))
      shuffleToMapStage(shuf.shuffleId) = created
      created
  }
}