spark2.11-2.3源码5_资源调度schedule(driver和worker的分配启动)

schedule

private def schedule(): Unit = {
    // Scheduling is performed only while this master's recovery state is ALIVE;
    // in any other state the call does nothing.
    if (state == RecoveryState.ALIVE) {
      // Drivers take strict precedence over executors.
      // Take the registered workers whose state is ALIVE and put them in random order.
      val aliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
      val aliveCount = aliveWorkers.size
      // Round-robin cursor shared by all drivers: each driver starts probing at the
      // worker following the last one that was examined.
      var cursor = 0
      // Iterate over a snapshot, because launched drivers are removed from waitingDrivers.
      waitingDrivers.toList.foreach { driver =>
        var placed = false
        var visited = 0
        // Probe each alive worker at most once until the driver has been placed.
        while (!placed && visited < aliveCount) {
          val candidate = aliveWorkers(cursor)
          visited += 1
          // The worker must have enough free memory and free cores for this driver.
          val fits =
            candidate.memoryFree >= driver.desc.mem && candidate.coresFree >= driver.desc.cores
          if (fits) {
            launchDriver(candidate, driver)
            waitingDrivers -= driver
            placed = true
          }
          // Advance to the next worker, wrapping around at the end.
          cursor = (cursor + 1) % aliveCount
        }
      }
      // Once drivers have been handled, allocate executors for waiting applications.
      startExecutorsOnWorkers()
    }
  }

shuffle
先把数据复制到ArrayBuffer，然后从最后一个位置a开始向前遍历到位置1，每次取一个随机下标b（0 ≤ b ≤ a），交换位置a与位置b的数据，从而把所有元素随机打乱（即Fisher–Yates洗牌算法）。

 /** Returns a new collection of the same type as `xs` containing its elements in random order.
   *
   * The elements are copied into a buffer and shuffled in place: walking from the last
   * position down to position 1, each position is swapped with a randomly chosen
   * position at or below it.
   *
   * @param xs the collection to shuffle
   * @param bf builder factory used to rebuild a collection of the input's type
   * @return the shuffled collection
   */
 def shuffle[T, CC[X] <: TraversableOnce[X]](xs: CC[T])(implicit bf: CanBuildFrom[CC[T], T, CC[T]]): CC[T] = {
        val items = new ArrayBuffer[T] ++= xs

        // `remaining` is the size of the not-yet-fixed prefix; its last slot is filled next.
        var remaining = items.length
        while (remaining >= 2) {
          val pick = nextInt(remaining)
          val last = remaining - 1
          // Swap the picked element into the last slot of the prefix.
          val held = items(last)
          items(last) = items(pick)
          items(pick) = held
          remaining -= 1
        }

        val builder = bf(xs)
        builder ++= items
        builder.result()
      }

launchDriver

private def launchDriver(worker: WorkerInfo, driver: DriverInfo): Unit = {
    logInfo(s"Launching driver ${driver.id} on worker ${worker.id}")
    // Link the two records to each other: register the driver on the worker
    // (via worker.addDriver) and remember the worker on the driver.
    worker.addDriver(driver)
    driver.worker = Some(worker)
    // Send the worker a LaunchDriver message carrying the driver id and description.
    val launchMessage = LaunchDriver(driver.id, driver.desc)
    worker.endpoint.send(launchMessage)
    // Mark the driver as RUNNING once the launch message has been sent.
    driver.state = DriverState.RUNNING
  }

addDriver

def addDriver(driver: DriverInfo): Unit = {
    val desc = driver.desc
    // Index the driver by its id.
    drivers.update(driver.id, driver)
    // Account for the memory and cores the driver requests.
    memoryUsed += desc.mem
    coresUsed += desc.cores
  }

startExecutorsOnWorkers

private def startExecutorsOnWorkers(): Unit = {
    // Right now this is a very simple FIFO scheduler: try to fit the first app in the
    // queue, then the second app, and so on.
    waitingApps.foreach { app =>
      val minCores = app.desc.coresPerExecutor.getOrElse(1)
      // If the app has fewer cores left than one executor needs, nothing is allocated for it.
      if (app.coresLeft >= minCores) {
        // Keep only ALIVE workers with enough free memory and cores for one executor,
        // ordered by free cores from most to least.
        val usableWorkers = workers.toArray
          .filter { w =>
            w.state == WorkerState.ALIVE &&
              w.memoryFree >= app.desc.memoryPerExecutorMB &&
              w.coresFree >= minCores
          }
          .sortBy(_.coresFree)
          .reverse
        // Cores granted per worker, index-aligned with usableWorkers.
        val coresPerWorker = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)

        // The per-worker core counts are decided; now create the executors on every
        // worker that was granted at least one core.
        usableWorkers.indices.foreach { idx =>
          val granted = coresPerWorker(idx)
          if (granted > 0) {
            allocateWorkerResourceToExecutors(
              app, granted, app.desc.coresPerExecutor, usableWorkers(idx))
          }
        }
      }
    }
  }

scheduleExecutorsOnWorkers的作用是计算应该在每个可用worker上为该application分配多少个core（返回的数组与usableWorkers按下标一一对应），之后再根据每个worker分到的core数决定在其上启动几个executor

/**
 * Decide how many cores of each usable worker to give to the application.
 *
 * Cores are handed out in steps of `minCoresPerExecutor`. When `spreadOutApps` is true a
 * worker receives one step per pass over the free workers; otherwise a worker keeps
 * receiving steps until it can take no more before the next worker is considered.
 *
 * @param app the application to allocate cores for
 * @param usableWorkers candidate workers
 * @param spreadOutApps whether to hand out one step per worker per pass
 * @return the number of cores assigned on each worker, index-aligned with `usableWorkers`
 */
private def scheduleExecutorsOnWorkers(
      app: ApplicationInfo,
      usableWorkers: Array[WorkerInfo],
      spreadOutApps: Boolean): Array[Int] = {
    // Cores requested per executor, if the application specified a value
    val coresPerExecutor = app.desc.coresPerExecutor
    // Size of one allocation step: the per-executor core count, or 1 when it is not set
    val minCoresPerExecutor = coresPerExecutor.getOrElse(1)
    // When no per-executor core count is set, each worker gets at most one new executor
    // and later steps only add cores to that executor
    val oneExecutorPerWorker = coresPerExecutor.isEmpty
    // Memory required by each executor of this application
    val memoryPerExecutor = app.desc.memoryPerExecutorMB
    // Number of candidate workers
    val numUsable = usableWorkers.length
    // Cores assigned so far on each worker
    val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker
    // New executors assigned so far on each worker
    val assignedExecutors = new Array[Int](numUsable) // Number of new executors on each worker
    // Cores still to hand out: the smaller of the cores the app still wants and the
    // total free cores across the usable workers
    var coresToAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)

    /** Return whether the specified worker can launch an executor for this app. */
    def canLaunchExecutor(pos: Int): Boolean = {
      // There must be at least one full step of cores left to hand out
      val keepScheduling = coresToAssign >= minCoresPerExecutor
      // The worker must still have one full step of free cores after what was already assigned
      val enoughCores = usableWorkers(pos).coresFree - assignedCores(pos) >= minCoresPerExecutor

      // A new executor is launched if multiple executors per worker are allowed, or if this
      // worker has not been given an executor yet; otherwise the step only adds cores to
      // the worker's single executor
      val launchingNewExecutor = !oneExecutorPerWorker || assignedExecutors(pos) == 0
      if (launchingNewExecutor) {
        // Memory already reserved on this worker by executors assigned in this call
        val assignedMemory = assignedExecutors(pos) * memoryPerExecutor
        // The worker's remaining free memory must cover one more executor
        val enoughMemory = usableWorkers(pos).memoryFree - assignedMemory >= memoryPerExecutor
        // Newly assigned executors plus the app's existing executors must stay below
        // the app's executor limit
        val underLimit = assignedExecutors.sum + app.executors.size < app.executorLimit
        keepScheduling && enoughCores && enoughMemory && underLimit
      } else {
        // We're adding cores to an existing executor, so no need
        // to check memory and executor limits
        keepScheduling && enoughCores
      }
    }

    // Keep launching executors until no more workers can accommodate any
    // more executors, or if we have reached this application's limits
    // Indices of the workers that can currently accept another step
    var freeWorkers = (0 until numUsable).filter(canLaunchExecutor)
    // Hand out cores on the free workers, pass by pass
    while (freeWorkers.nonEmpty) {
      freeWorkers.foreach { pos =>
        // Cleared after one step when spreadOutApps is set, so the next worker gets a turn
        var keepScheduling = true
        // Re-check the worker before every step, since the counters change as we go
        while (keepScheduling && canLaunchExecutor(pos)) {
          // Record the step against both the remaining budget and this worker
          coresToAssign -= minCoresPerExecutor
          assignedCores(pos) += minCoresPerExecutor

          // If we are launching one executor per worker, then every iteration assigns 1 core
          // to the executor. Otherwise, every iteration assigns cores to a new executor.
          if (oneExecutorPerWorker) {
            assignedExecutors(pos) = 1
          } else {
            assignedExecutors(pos) += 1
          }
          // With spreadOutApps, stop after a single step on this worker so that the cores
          // are distributed across as many workers as possible. Without it, keep assigning
          // on this worker until canLaunchExecutor returns false.
      if (spreadOutApps) {
            keepScheduling = false
          }
        }
      }
      // Drop the workers that can no longer accept a step; once none remain the outer
      // loop terminates
      freeWorkers = freeWorkers.filter(canLaunchExecutor)
    }
    // Cores to allocate on each worker
    assignedCores
  }

allocateWorkerResourceToExecutors

如果指定了每个executor的core数量（coresPerExecutor），就把分配给该worker的core按这个数量平均分给若干个executor，除不尽的余数不会被分配；否则只启动一个executor，由它占用该worker上分配到的全部core。

 /**
  * Create and launch the executors for one worker.
  *
  * If the per-executor core count is set, `assignedCores / coresPerExecutor` executors of
  * that size are launched (integer division, so any remainder is not used). Otherwise a
  * single executor receives all of `assignedCores`.
  *
  * @param app the application the executors belong to
  * @param assignedCores number of cores granted to the application on this worker
  * @param coresPerExecutor cores per executor, if specified
  * @param worker the worker on which to launch the executors
  */
 private def allocateWorkerResourceToExecutors(
          app: ApplicationInfo,
          assignedCores: Int,
          coresPerExecutor: Option[Int],
          worker: WorkerInfo): Unit = {
        // How many executors to start on this worker, and how many cores each one gets.
        val (executorCount, coresEach) = coresPerExecutor match {
          case Some(perExecutor) => (assignedCores / perExecutor, perExecutor)
          case None => (1, assignedCores)
        }
        var started = 0
        while (started < executorCount) {
          // Register a new executor with the application, then launch it on the worker.
          val exec = app.addExecutor(worker, coresEach)
          launchExecutor(worker, exec)
          app.state = ApplicationState.RUNNING
          started += 1
        }
      }

launchExecutor

private def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = {
    logInfo(s"Launching executor ${exec.fullId} on worker ${worker.id}")
    // Record the executor on the worker before any message is sent.
    worker.addExecutor(exec)
    val app = exec.application
    // Send the worker a LaunchExecutor message describing the executor.
    val launchMessage =
      LaunchExecutor(masterUrl, app.id, exec.id, app.desc, exec.cores, exec.memory)
    worker.endpoint.send(launchMessage)
    // Send the application's driver an ExecutorAdded message for the new executor.
    val addedMessage =
      ExecutorAdded(exec.id, worker.id, worker.hostPort, exec.cores, exec.memory)
    app.driver.send(addedMessage)
  }

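注：只有以standalone cluster模式（spark-submit --deploy-mode cluster）提交时，driver才会注册到master，并由master调度到某个worker上启动；standalone client模式下，driver直接在提交应用的本地进程中启动，不会向master注册driver，也就不需要master来调度driver。yarn-client和yarn-cluster模式的资源调度由YARN的ResourceManager负责，不经过这里的standalone master。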

spark2.11-2.3源码5_资源调度schedule(driver和worker的分配启动)

猜你喜欢