Spark 2.11-2.3 Source Code 5: Resource Scheduling schedule (allocating and launching drivers and executors on workers)

schedule

private def schedule(): Unit = {
    // If the master is not ALIVE, return immediately; a standby master never schedules
    if (state != RecoveryState.ALIVE) {
      return
    }
    // Drivers take strict precedence over executors
    // Take all previously registered workers, keep only those in ALIVE state, and shuffle them randomly
    val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
    // Number of alive workers after the shuffle
    val numWorkersAlive = shuffledAliveWorkers.size
    var curPos = 0
    // Driver scheduling: workers are assigned to waiting drivers round-robin. For each driver we
    // start from the worker after the one that received the previous driver and keep going until
    // every alive worker has been visited.
    // Iterate over waitingDrivers
    for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers

      var launched = false
      var numWorkersVisited = 0
      // Keep looking while there are alive workers we have not visited yet and the driver is not launched
      while (numWorkersVisited < numWorkersAlive && !launched) {
        val worker = shuffledAliveWorkers(curPos)
        numWorkersVisited += 1
        // The worker's free memory must be >= the memory the driver needs,
        // and its free cores must be >= the cores the driver needs
        if (worker.memoryFree >= driver.desc.mem && worker.coresFree >= driver.desc.cores) {
          // Launch the driver and remove it from waitingDrivers
          launchDriver(worker, driver)
          waitingDrivers -= driver
          launched = true
        }
        // Move on to the next worker
        curPos = (curPos + 1) % numWorkersAlive
      }
    }
    startExecutorsOnWorkers()
  }
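
To see the round-robin placement in isolation, here is a minimal sketch (not the Master code itself): the Res case class, the placeDrivers helper and the sample memory/core numbers are made up for illustration, but the cursor logic mirrors the loop above.

// Minimal round-robin driver placement sketch with made-up data (not Spark source code)
case class Res(id: String, var memFree: Int, var coresFree: Int)

def placeDrivers(workers: Seq[Res], drivers: Seq[(Int, Int)]): Unit = {
  var curPos = 0
  for ((mem, cores) <- drivers) {
    var launched = false
    var visited = 0
    while (visited < workers.size && !launched) {
      val w = workers(curPos)
      visited += 1
      if (w.memFree >= mem && w.coresFree >= cores) {
        w.memFree -= mem; w.coresFree -= cores       // "launch" the driver on this worker
        println(s"driver($mem MB, $cores cores) -> ${w.id}")
        launched = true
      }
      curPos = (curPos + 1) % workers.size           // always advance the cursor
    }
    if (!launched) println(s"driver($mem MB, $cores cores) stays in waitingDrivers")
  }
}

placeDrivers(
  Seq(Res("worker-1", 1024, 2), Res("worker-2", 4096, 4)),
  Seq((2048, 1), (512, 1)))
// driver(2048 MB, 1 cores) -> worker-2
// driver(512 MB, 1 cores) -> worker-1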

shuffle
Random.shuffle copies the input into an ArrayBuffer and then performs a Fisher-Yates shuffle: walking from the last position down, it picks a random index k in [0, n) and swaps the element at position n - 1 with the element at position k, so every ordering is equally likely.

def shuffle[T, CC[X] <: TraversableOnce[X]](xs: CC[T])(implicit bf: CanBuildFrom[CC[T], T, CC[T]]): CC[T] = {
    val buf = new ArrayBuffer[T] ++= xs
    def swap(i1: Int, i2: Int) {
      val tmp = buf(i1)
      buf(i1) = buf(i2)
      buf(i2) = tmp
    }

    for (n <- buf.length to 2 by -1) {
      val k = nextInt(n)
      swap(n - 1, k)
    }

    (bf(xs) ++= buf).result()
  }
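
A quick usage example (the printed order is just one possible outcome, since the result is random):

import scala.util.Random

val workers = Seq("worker-1", "worker-2", "worker-3", "worker-4")
// Returns a new shuffled collection; the input sequence is not mutated
val shuffled = Random.shuffle(workers)
println(shuffled)   // e.g. List(worker-3, worker-1, worker-4, worker-2)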

launchDriver

private def launchDriver(worker: WorkerInfo, driver: DriverInfo) {
    logInfo("Launching driver " + driver.id + " on worker " + worker.id)
    // Add the driver to the worker's cache and the worker to the driver's cache, so they reference each other.
    // addDriver also adds the driver's memory and cores to the worker's used memory and cores; see the addDriver source below.
    worker.addDriver(driver)
    driver.worker = Some(worker)
    // Send a LaunchDriver message so the worker actually starts the driver
    worker.endpoint.send(LaunchDriver(driver.id, driver.desc))
    driver.state = DriverState.RUNNING
  }

addDriver

def addDriver(driver: DriverInfo) {
    drivers(driver.id) = driver
    memoryUsed += driver.desc.mem
    coresUsed += driver.desc.cores
  }

startExecutorsOnWorkers

private def startExecutorsOnWorkers(): Unit = {
    // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
    // in the queue, then the second app, etc.
    for (app <- waitingApps) {
      val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1)
      // If the cores left is less than the coresPerExecutor, the cores left will not be allocated
      // Only schedule this application if it still needs at least one executor's worth of cores
      if (app.coresLeft >= coresPerExecutor) {
        // Filter out workers that don't have enough resources to launch an executor
        // Keep only ALIVE workers whose free memory is >= the app's memory per executor and whose
        // free cores are >= coresPerExecutor, sorted by free cores in descending order
        val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
          .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB &&
            worker.coresFree >= coresPerExecutor)
          .sortBy(_.coresFree).reverse
        // Number of cores to assign on each usable worker
        val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)

        // Now that we've decided how many cores to allocate on each worker, let's allocate them
        // Walk over the usable workers and allocate executors on every worker that was assigned cores
        for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) {
          // Turn this worker's assigned cores into executors
          allocateWorkerResourceToExecutors(
            app, assignedCores(pos), app.desc.coresPerExecutor, usableWorkers(pos))
        }
      }
    }
  }

scheduleExecutorsOnWorkers computes how many cores, and therefore how many executors, should be started on each worker.

private def scheduleExecutorsOnWorkers(
      app: ApplicationInfo,
      usableWorkers: Array[WorkerInfo],
      spreadOutApps: Boolean): Array[Int] = {
    // Number of cores requested per executor, if configured
    val coresPerExecutor = app.desc.coresPerExecutor
    // Minimum number of cores per executor (an executor may end up with a multiple of this)
    val minCoresPerExecutor = coresPerExecutor.getOrElse(1)
    // If coresPerExecutor is not set, this application launches at most one executor per worker,
    // and that single executor keeps absorbing additional cores
    val oneExecutorPerWorker = coresPerExecutor.isEmpty
    // Memory each executor of this application uses
    val memoryPerExecutor = app.desc.memoryPerExecutorMB
    // Number of usable workers
    val numUsable = usableWorkers.length
    // Cores assigned on each worker by this scheduling pass
    val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker
    // Executors assigned on each worker by this scheduling pass
    val assignedExecutors = new Array[Int](numUsable) // Number of new executors on each worker
    // Cores this pass can actually assign: the smaller of the app's remaining unassigned cores
    // and the total free cores across all usable workers
    var coresToAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)

    /** Return whether the specified worker can launch an executor for this app. */
    def canLaunchExecutor(pos: Int): Boolean = {
      // There must still be at least one executor's worth of cores left to assign
      val keepScheduling = coresToAssign >= minCoresPerExecutor
      // This worker must still have enough free cores
      val enoughCores = usableWorkers(pos).coresFree - assignedCores(pos) >= minCoresPerExecutor

      // If multiple executors per worker are allowed, we can always launch a new executor.
      // Otherwise there is a single executor per worker, so a "launch" only happens once;
      // after that we only add cores to that one executor.
      val launchingNewExecutor = !oneExecutorPerWorker || assignedExecutors(pos) == 0
      // Launching a new executor
      if (launchingNewExecutor) {
        // Memory already assigned on this worker in this pass
        val assignedMemory = assignedExecutors(pos) * memoryPerExecutor
        // The worker's free memory minus what was already assigned must cover one more executor
        val enoughMemory = usableWorkers(pos).memoryFree - assignedMemory >= memoryPerExecutor
        // Newly assigned executors plus the app's existing executors must stay under its executor limit
        val underLimit = assignedExecutors.sum + app.executors.size < app.executorLimit
        keepScheduling && enoughCores && enoughMemory && underLimit
      } else {
        // We're adding cores to an existing executor, so no need
        // to check memory and executor limits
        keepScheduling && enoughCores
      }
    }

    // Keep launching executors until no more workers can accommodate any
    // more executors, or if we have reached this application's limits
    // Workers that can currently launch an executor for this app
    var freeWorkers = (0 until numUsable).filter(canLaunchExecutor)
    // Assign executors on the free workers
    while (freeWorkers.nonEmpty) {
      freeWorkers.foreach { pos =>
        // Controls whether we keep scheduling on this same worker
        var keepScheduling = true
        // Keep going while this worker can still take another executor (or more cores)
        while (keepScheduling && canLaunchExecutor(pos)) {
          // Update the cores still left to assign and the cores assigned on this worker
          coresToAssign -= minCoresPerExecutor
          assignedCores(pos) += minCoresPerExecutor

          // If we are launching one executor per worker, then every iteration assigns 1 core
          // to the executor. Otherwise, every iteration assigns cores to a new executor.
          if (oneExecutorPerWorker) {
            assignedExecutors(pos) = 1
          } else {
            assignedExecutors(pos) += 1
          }
          // With spreadOutApps we want executors spread across as many workers as possible, so we
          // stop scheduling on this worker after one round and move on to the next one.
          // Without spreadOutApps we keep assigning on this worker until its resources run out,
          // and only then move on to the next worker.
          if (spreadOutApps) {
            keepScheduling = false
          }
        }
      }
      // Re-filter: once nothing more can be assigned, canLaunchExecutor returns false for every
      // worker, freeWorkers becomes empty and the outer while loop ends
      freeWorkers = freeWorkers.filter(canLaunchExecutor)
    }
    // Return how many cores each worker should be assigned
    assignedCores
  }
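
The effect of spreadOutApps is easiest to see with a small, simplified sketch: it keeps only the core bookkeeping (the memory and executorLimit checks are omitted), and the worker capacities and app demand below are made-up example numbers, not values from the Spark source.

// Simplified sketch of scheduleExecutorsOnWorkers (cores only; assumed example data)
def assign(coresFree: Array[Int], appCoresLeft: Int,
           minCoresPerExecutor: Int, spreadOut: Boolean): Array[Int] = {
  val assigned = new Array[Int](coresFree.length)
  var coresToAssign = math.min(appCoresLeft, coresFree.sum)
  def canLaunch(pos: Int): Boolean =
    coresToAssign >= minCoresPerExecutor &&
      coresFree(pos) - assigned(pos) >= minCoresPerExecutor
  var free = coresFree.indices.filter(canLaunch)
  while (free.nonEmpty) {
    free.foreach { pos =>
      var keepScheduling = true
      while (keepScheduling && canLaunch(pos)) {
        coresToAssign -= minCoresPerExecutor
        assigned(pos) += minCoresPerExecutor
        if (spreadOut) keepScheduling = false   // move on to the next worker
      }
    }
    free = free.filter(canLaunch)
  }
  assigned
}

// Three workers with 8 free cores each, an app that still needs 10 cores:
println(assign(Array(8, 8, 8), 10, 1, spreadOut = true).toList)   // List(4, 3, 3)
println(assign(Array(8, 8, 8), 10, 1, spreadOut = false).toList)  // List(8, 2, 0)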

allocateWorkerResourceToExecutors

If coresPerExecutor is specified, the cores assigned on this worker are divided evenly among executors, discarding any remainder. Otherwise a single executor is launched that grabs all of the cores assigned on this worker.

private def allocateWorkerResourceToExecutors(
      app: ApplicationInfo,
      // Cores assigned to this worker
      assignedCores: Int,
      // Cores each executor should get, if configured
      coresPerExecutor: Option[Int],
      worker: WorkerInfo): Unit = {
    // Number of executors to launch on this worker
    val numExecutors = coresPerExecutor.map { assignedCores / _ }.getOrElse(1)
    // Cores to give to each executor
    val coresToAssign = coresPerExecutor.getOrElse(assignedCores)
    // Launch numExecutors executors on this worker
    for (i <- 1 to numExecutors) {
      // Register a new executor with the application
      val exec = app.addExecutor(worker, coresToAssign)
      // Launch the executor on the worker
      launchExecutor(worker, exec)
      app.state = ApplicationState.RUNNING
    }
  }
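
A small arithmetic example with assumed numbers makes the split concrete; suppose this worker was assigned 7 cores:

val assignedCores = 7                          // example: cores assigned on this worker
val withCoresPerExec: Option[Int] = Some(2)    // e.g. spark.executor.cores = 2
val withoutCoresPerExec: Option[Int] = None    // spark.executor.cores not set

// Some(2): 7 / 2 = 3 executors with 2 cores each; the leftover core stays unassigned
val execsA = withCoresPerExec.map(assignedCores / _).getOrElse(1)     // 3
val coresA = withCoresPerExec.getOrElse(assignedCores)                // 2

// None: a single executor that takes all 7 assigned cores
val execsB = withoutCoresPerExec.map(assignedCores / _).getOrElse(1)  // 1
val coresB = withoutCoresPerExec.getOrElse(assignedCores)             // 7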

launchExecutor

private def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = {
    logInfo("Launching executor " + exec.fullId + " on worker " + worker.id)
    // Record the executor on the worker and account for its memory and cores
    worker.addExecutor(exec)
    // Tell the worker to actually start the executor process
    worker.endpoint.send(LaunchExecutor(masterUrl,
      exec.application.id, exec.id, exec.application.desc, exec.cores, exec.memory))
    // Notify the application's driver that a new executor has been added
    exec.application.driver.send(
      ExecutorAdded(exec.id, worker.id, worker.hostPort, exec.cores, exec.memory))
  }

Note: the Master only schedules a driver when the application is submitted in standalone cluster deploy mode, because only then is the driver registered with the Master. In standalone client mode the driver is started directly on the submitting machine and is never registered, so the Master never schedules it; the YARN modes do not go through this standalone Master at all.

Reposted from blog.csdn.net/u011607686/article/details/86534417