schedule
/**
 * Schedule the currently available resources among waiting drivers and apps.
 *
 * Does nothing unless this master's recovery state is ALIVE. Waiting drivers are
 * placed first (drivers take strict precedence over executors), after which
 * executors are started for the waiting applications.
 */
private def schedule(): Unit = {
  // Only a master whose state is ALIVE performs any scheduling.
  if (state == RecoveryState.ALIVE) {
    // Collect the registered workers that are ALIVE and randomise their order.
    val aliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
    val aliveCount = aliveWorkers.size
    // Position of the next worker to try; it is shared across drivers so that each
    // driver resumes from where the previous one stopped (round-robin).
    var cursor = 0
    // Iterate over a copy, because placed drivers are removed from waitingDrivers.
    waitingDrivers.toList.foreach { driver =>
      var placed = false
      var remaining = aliveCount
      // Try each alive worker at most once for this driver, stopping once it is placed.
      while (remaining > 0 && !placed) {
        val candidate = aliveWorkers(cursor)
        remaining -= 1
        // The worker must have enough free memory and enough free cores for the driver.
        val fits =
          candidate.memoryFree >= driver.desc.mem && candidate.coresFree >= driver.desc.cores
        if (fits) {
          launchDriver(candidate, driver)
          waitingDrivers -= driver
          placed = true
        }
        // Advance to the next worker, wrapping around at the end.
        cursor = (cursor + 1) % aliveCount
      }
    }
    startExecutorsOnWorkers()
  }
}
shuffle
将数据读入ArrayBuffer,然后从最后一个位置向前遍历:对每个位置a,在[0, a](含两端)范围内取一个随机下标b,将位置a与位置b的数据交换(即Fisher–Yates洗牌算法),从而把所有位置的数据随机打乱。
/**
 * Returns a new collection of the same type as `xs` holding the same elements in a
 * randomly permuted order.
 *
 * The elements are copied into a buffer, which is then permuted in place by walking
 * from the last position down to the second one and swapping each position with a
 * randomly chosen position at or before it.
 *
 * @param xs the collection to shuffle
 * @param bf builder factory used to rebuild a collection of the original type
 * @return the shuffled collection
 */
def shuffle[T, CC[X] <: TraversableOnce[X]](xs: CC[T])(implicit bf: CanBuildFrom[CC[T], T, CC[T]]): CC[T] = {
  val buf = new ArrayBuffer[T]
  buf ++= xs

  // `upper` is the size of the not-yet-fixed prefix; its last slot is filled each round.
  var upper = buf.length
  while (upper >= 2) {
    val pick = nextInt(upper)
    val last = upper - 1
    // Swap the element at `last` with the randomly picked one.
    val held = buf(last)
    buf(last) = buf(pick)
    buf(pick) = held
    upper -= 1
  }

  val builder = bf(xs)
  builder ++= buf
  builder.result()
}
launchDriver
/**
 * Assign `driver` to `worker` and ask the worker to start it.
 *
 * @param worker the worker chosen to host the driver
 * @param driver the driver to launch
 */
private def launchDriver(worker: WorkerInfo, driver: DriverInfo): Unit = {
  logInfo(s"Launching driver ${driver.id} on worker ${worker.id}")
  // Link the two objects to each other: the worker records the driver, and the
  // driver remembers which worker it runs on.
  worker.addDriver(driver)
  driver.worker = Some(worker)
  // Send the LaunchDriver message so that the worker starts the driver.
  val workerEndpoint = worker.endpoint
  workerEndpoint.send(LaunchDriver(driver.id, driver.desc))
  driver.state = DriverState.RUNNING
}
addDriver
/**
 * Register `driver` under its id and add its memory and core requirements to this
 * object's usage counters.
 *
 * @param driver the driver being added
 */
def addDriver(driver: DriverInfo): Unit = {
  val required = driver.desc
  drivers.update(driver.id, driver)
  memoryUsed += required.mem
  coresUsed += required.cores
}
startExecutorsOnWorkers
/**
 * Schedule and launch executors on workers for the waiting applications.
 *
 * This is a simple FIFO scheduler: it tries to fit the first app in the queue,
 * then the second one, and so on.
 */
private def startExecutorsOnWorkers(): Unit = {
  waitingApps.foreach { app =>
    val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1)
    // If fewer cores are left than one executor needs, nothing is allocated for this app.
    if (app.coresLeft >= coresPerExecutor) {
      // Keep only ALIVE workers with enough free memory and cores for one executor,
      // ordered by free cores, largest first.
      val candidates = workers.toArray
        .filter { w =>
          w.state == WorkerState.ALIVE &&
            w.memoryFree >= app.desc.memoryPerExecutorMB &&
            w.coresFree >= coresPerExecutor
        }
        .sortBy(_.coresFree)
        .reverse
      // Number of cores granted on each candidate worker, indexed like `candidates`.
      val coresPerWorker = scheduleExecutorsOnWorkers(app, candidates, spreadOutApps)

      // Now that the number of cores per worker is decided, allocate them:
      // only workers that were granted at least one core get executors.
      candidates.indices.foreach { idx =>
        val granted = coresPerWorker(idx)
        if (granted > 0) {
          allocateWorkerResourceToExecutors(
            app, granted, app.desc.coresPerExecutor, candidates(idx))
        }
      }
    }
  }
}
scheduleExecutorsOnWorkers的作用是计算在每一个可用的worker上为该application分配多少个core(返回值是与usableWorkers按下标一一对应的数组),从而间接决定在哪些worker上启动几个executor
/**
 * Decide how many cores to assign to `app` on each of the usable workers.
 *
 * Cores are handed out in units of `minCoresPerExecutor`. When `spreadOutApps` is true,
 * each pass over the free workers hands out at most one unit per worker; otherwise a
 * worker keeps receiving units until it can take no more before the next one is tried.
 *
 * @param app the application to schedule executors for
 * @param usableWorkers the workers that may receive executors
 * @param spreadOutApps whether to hand out one unit per worker per pass instead of
 *                      filling one worker before moving on
 * @return an array, indexed like `usableWorkers`, of the number of cores assigned on
 *         each worker
 */
private def scheduleExecutorsOnWorkers(
app: ApplicationInfo,
usableWorkers: Array[WorkerInfo],
spreadOutApps: Boolean): Array[Int] = {
// Configured number of cores per executor; may be unset.
val coresPerExecutor = app.desc.coresPerExecutor
// Size of one allocation step: the configured cores per executor, or 1 when unset.
val minCoresPerExecutor = coresPerExecutor.getOrElse(1)
// When cores per executor is not set, the app gets at most one executor on each worker.
val oneExecutorPerWorker = coresPerExecutor.isEmpty
// Memory each executor of this application uses.
val memoryPerExecutor = app.desc.memoryPerExecutorMB
// Number of usable workers.
val numUsable = usableWorkers.length
// Cores assigned so far on each worker.
val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker
// Executors assigned so far on each worker.
val assignedExecutors = new Array[Int](numUsable) // Number of new executors on each worker
// Cores still to hand out: the smaller of the cores the app has left and the total
// free cores across the usable workers.
var coresToAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)
/** Return whether the specified worker can launch an executor for this app. */
def canLaunchExecutor(pos: Int): Boolean = {
// There must be at least one full allocation step left to hand out.
val keepScheduling = coresToAssign >= minCoresPerExecutor
// The worker's free cores, minus what was already assigned here, must cover one more step.
val enoughCores = usableWorkers(pos).coresFree - assignedCores(pos) >= minCoresPerExecutor
// If multiple executors per worker are allowed, every step launches a new executor.
// Otherwise a new executor is launched only if this worker has none assigned yet;
// later steps just add cores to that single executor.
val launchingNewExecutor = !oneExecutorPerWorker || assignedExecutors(pos) == 0
// A new executor additionally needs memory and must respect the executor limit.
if (launchingNewExecutor) {
// Memory already claimed on this worker by executors assigned in this call.
val assignedMemory = assignedExecutors(pos) * memoryPerExecutor
// The remaining free memory must be enough for one more executor.
val enoughMemory = usableWorkers(pos).memoryFree - assignedMemory >= memoryPerExecutor
// Newly assigned executors plus the app's existing executors must stay below its limit.
val underLimit = assignedExecutors.sum + app.executors.size < app.executorLimit
keepScheduling && enoughCores && enoughMemory && underLimit
} else {
// We're adding cores to an existing executor, so no need
// to check memory and executor limits
keepScheduling && enoughCores
}
}
// Keep launching executors until no more workers can accommodate any
// more executors, or if we have reached this application's limits
// Indices of the workers that can currently take an executor.
var freeWorkers = (0 until numUsable).filter(canLaunchExecutor)
// Hand out allocation steps across the free workers.
while (freeWorkers.nonEmpty) {
freeWorkers.foreach { pos =>
// Whether to keep allocating on this same worker; cleared after one step
// when spreadOutApps is set.
var keepScheduling = true
// Allocate while this worker can still take another step.
while (keepScheduling && canLaunchExecutor(pos)) {
// Move one step from the cores still to assign over to this worker.
coresToAssign -= minCoresPerExecutor
assignedCores(pos) += minCoresPerExecutor
// If we are launching one executor per worker, then every iteration assigns 1 core
// to the executor. Otherwise, every iteration assigns cores to a new executor.
if (oneExecutorPerWorker) {
assignedExecutors(pos) = 1
} else {
assignedExecutors(pos) += 1
}
// With spreadOutApps, stop after one step on this worker so the next free worker
// gets its turn. Without it, keep allocating here until this worker can no longer
// launch an executor.
if (spreadOutApps) {
keepScheduling = false
}
}
}
// Re-check the free workers for the enclosing loop: once nothing more can be
// assigned, canLaunchExecutor is false for all of them, the list is empty and
// the loop ends.
freeWorkers = freeWorkers.filter(canLaunchExecutor)
}
// The number of cores assigned on each worker.
assignedCores
}
allocateWorkerResourceToExecutors
如果指定了每个executor的core的数量,我们就把分配给该worker的core平均分给若干个executor(每个executor恰好得到coresPerExecutor个core),不留余数。否则,我们只启动一个executor,它将获取此worker上所有已分配的core。
/**
 * Allocate a worker's assigned cores to one or more executors and launch them.
 *
 * If the number of cores per executor is specified, the assigned cores are divided
 * among executors of that size (integer division). Otherwise a single executor is
 * launched that takes all the cores assigned on this worker.
 *
 * @param app the application the executors belong to
 * @param assignedCores number of cores assigned to the app on this worker
 * @param coresPerExecutor number of cores per executor, if specified
 * @param worker the worker to launch the executors on
 */
private def allocateWorkerResourceToExecutors(
    app: ApplicationInfo,
    assignedCores: Int,
    coresPerExecutor: Option[Int],
    worker: WorkerInfo): Unit = {
  // (number of executors to launch on this worker, cores given to each of them)
  val (executorCount, coresEach) = coresPerExecutor match {
    case Some(perExecutor) => (assignedCores / perExecutor, perExecutor)
    case None => (1, assignedCores)
  }
  (1 to executorCount).foreach { _ =>
    // Register a new executor with the app, then launch it on the worker.
    val executor = app.addExecutor(worker, coresEach)
    launchExecutor(worker, executor)
    app.state = ApplicationState.RUNNING
  }
}
launchExecutor
/**
 * Record `exec` on `worker`, ask the worker to launch it, and tell the application's
 * driver that the executor was added.
 *
 * @param worker the worker that will run the executor
 * @param exec description of the executor to launch
 */
private def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = {
  logInfo(s"Launching executor ${exec.fullId} on worker ${worker.id}")
  worker.addExecutor(exec)
  val owner = exec.application
  // Ask the worker to start the executor.
  worker.endpoint.send(
    LaunchExecutor(masterUrl, owner.id, exec.id, owner.desc, exec.cores, exec.memory))
  // Tell the application's driver about the new executor.
  val added = ExecutorAdded(exec.id, worker.id, worker.hostPort, exec.cores, exec.memory)
  owner.driver.send(added)
}
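注:只有以standalone cluster模式(spark-submit --deploy-mode cluster)提交的时候,才会向master注册driver并由master调度;standalone client模式会在提交应用的本地机器上直接启动driver,而不会来注册driver,就更不可能让master调度driver。yarn-client和yarn-cluster模式不经过standalone的master,driver的启动由YARN负责。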