Kubernetes Scheduler Source Code Study (Part 2): Core Scheduling Implementation

This article studies the source code of Kubernetes v1.22.4.

4. Core Scheduling Implementation

1) Scheduler Run Flow

Once a newly created Pod has been added to the activeQ queue, another goroutine can pop the element at the top of the heap and carry out the actual scheduling work. After the scheduler starts, it calls sched.scheduleOne; the code is as follows:

// pkg/scheduler/scheduler.go
func (sched *Scheduler) scheduleOne(ctx context.Context) {
	// Fetch the next pod to schedule
	podInfo := sched.NextPod()
	// pod could be nil when schedulerQueue is closed
	if podInfo == nil || podInfo.Pod == nil {
		return
	}
	pod := podInfo.Pod
	// Pick the framework (profile) for this pod
	fwk, err := sched.frameworkForPod(pod)
	if err != nil {
		// This shouldn't happen, because we only accept for scheduling the pods
		// which specify a scheduler name that matches one of the profiles.
		klog.ErrorS(err, "Error occurred")
		return
	}
	// Skip scheduling the pod in certain cases
	if sched.skipPodSchedule(fwk, pod) {
		return
	}

	klog.V(3).InfoS("Attempting to schedule pod", "pod", klog.KObj(pod))

	// Synchronously attempt to find a fit for the pod.
	start := time.Now()
	state := framework.NewCycleState()
	state.SetRecordPluginMetrics(rand.Intn(100) < pluginMetricsSamplePercent)
	// Initialize an empty podsToActivate struct, which will be filled up by plugins or stay empty.
	podsToActivate := framework.NewPodsToActivate()
	state.Write(framework.PodsToActivateKey, podsToActivate)

	schedulingCycleCtx, cancel := context.WithCancel(ctx)
	defer cancel()
	// This is where the actual scheduling happens
	scheduleResult, err := sched.Algorithm.Schedule(schedulingCycleCtx, sched.Extenders, fwk, state, pod)
	if err != nil {
		// Schedule() may have failed because the pod would not fit on any host, so we try to
		// preempt, with the expectation that the next time the pod is tried for scheduling it
		// will fit due to the preemption. It is also possible that a different pod will schedule
		// into the resources that were preempted, but this is harmless.
		nominatedNode := ""
		if fitError, ok := err.(*framework.FitError); ok {
			if !fwk.HasPostFilterPlugins() {
				klog.V(3).InfoS("No PostFilter plugins are registered, so no preemption will be performed")
			} else {
				// Run PostFilter plugins to try to make the pod schedulable in a future scheduling cycle.
				result, status := fwk.RunPostFilterPlugins(ctx, state, pod, fitError.Diagnosis.NodeToStatusMap)
				if status.Code() == framework.Error {
					klog.ErrorS(nil, "Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", status)
				} else {
					klog.V(5).InfoS("Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", status)
				}
				if status.IsSuccess() && result != nil {
					nominatedNode = result.NominatedNodeName
				}
			}
			// Pod did not fit anywhere, so it is counted as a failure. If preemption
			// succeeds, the pod should get counted as a success the next time we try to
			// schedule it. (hopefully)
			metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
		} else if err == ErrNoNodesAvailable {
			// No nodes available is counted as unschedulable rather than an error.
			metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
		} else {
			klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod))
			metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
		}
		// recordSchedulingFailure records the scheduling-failure event and moves the pod
		// to the unschedulable queue
		sched.recordSchedulingFailure(fwk, podInfo, err, v1.PodReasonUnschedulable, nominatedNode)
		return
	}
	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start))
	// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
	// This allows us to keep scheduling without waiting on binding to occur.
	assumedPodInfo := podInfo.DeepCopy()
	assumedPod := assumedPodInfo.Pod
	// assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost,
	// i.e. it sets the pod's NodeName field and updates the scheduler cache
	err = sched.assume(assumedPod, scheduleResult.SuggestedHost)
	if err != nil {
		metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
		// This is most probably result of a BUG in retrying logic.
		// We report an error here so that pod scheduling can be retried.
		// This relies on the fact that Error will check if the pod has been bound
		// to a node and if so will not add it back to the unscheduled pods queue
		// (otherwise this would cause an infinite loop).
		sched.recordSchedulingFailure(fwk, assumedPodInfo, err, SchedulerError, "")
		return
	}

	// Run the Reserve method of reserve plugins.
	if sts := fwk.RunReservePluginsReserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
		metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
		// trigger un-reserve to clean up state associated with the reserved Pod
		fwk.RunReservePluginsUnreserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
		if forgetErr := sched.SchedulerCache.ForgetPod(assumedPod); forgetErr != nil {
			klog.ErrorS(forgetErr, "scheduler cache ForgetPod failed")
		}
		sched.recordSchedulingFailure(fwk, assumedPodInfo, sts.AsError(), SchedulerError, "")
		return
	}

	// Run "permit" plugins.
	runPermitStatus := fwk.RunPermitPlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
	if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
		var reason string
		if runPermitStatus.IsUnschedulable() {
			metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
			reason = v1.PodReasonUnschedulable
		} else {
			metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
			reason = SchedulerError
		}
		// One of the plugins returned status different than success or wait,
		// so trigger the Unreserve method of the reserve plugins
		fwk.RunReservePluginsUnreserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
		// Remove the assumed pod from the cache
		if forgetErr := sched.SchedulerCache.ForgetPod(assumedPod); forgetErr != nil {
			klog.ErrorS(forgetErr, "scheduler cache ForgetPod failed")
		}
		sched.recordSchedulingFailure(fwk, assumedPodInfo, runPermitStatus.AsError(), reason, "")
		return
	}

	// At the end of a successful scheduling cycle, pop and move up Pods if needed.
	if len(podsToActivate.Map) != 0 {
		sched.SchedulingQueue.Activate(podsToActivate.Map)
		// Clear the entries after activation.
		podsToActivate.Map = make(map[string]*v1.Pod)
	}

	// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
	go func() {
		bindingCycleCtx, cancel := context.WithCancel(ctx)
		defer cancel()
		metrics.SchedulerGoroutines.WithLabelValues(metrics.Binding).Inc()
		defer metrics.SchedulerGoroutines.WithLabelValues(metrics.Binding).Dec()

		// First call WaitOnPermit, which works together with the Permit extension point
		// above to implement delayed (gated) scheduling
		waitOnPermitStatus := fwk.WaitOnPermit(bindingCycleCtx, assumedPod)
		if !waitOnPermitStatus.IsSuccess() {
			var reason string
			if waitOnPermitStatus.IsUnschedulable() {
				metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
				reason = v1.PodReasonUnschedulable
			} else {
				metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
				reason = SchedulerError
			}
			// trigger un-reserve plugins to clean up state associated with the reserved Pod
			fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
			if forgetErr := sched.SchedulerCache.ForgetPod(assumedPod); forgetErr != nil {
				klog.ErrorS(forgetErr, "scheduler cache ForgetPod failed")
			}
			sched.recordSchedulingFailure(fwk, assumedPodInfo, waitOnPermitStatus.AsError(), reason, "")
			return
		}

		// Run "prebind" plugins.
		preBindStatus := fwk.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
		if !preBindStatus.IsSuccess() {
			metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
			// trigger un-reserve plugins to clean up state associated with the reserved Pod
			fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
			if forgetErr := sched.SchedulerCache.ForgetPod(assumedPod); forgetErr != nil {
				klog.ErrorS(forgetErr, "scheduler cache ForgetPod failed")
			}
			sched.recordSchedulingFailure(fwk, assumedPodInfo, preBindStatus.AsError(), SchedulerError, "")
			return
		}

		// Call bind to perform the actual binding
		err := sched.bind(bindingCycleCtx, fwk, assumedPod, scheduleResult.SuggestedHost, state)
		if err != nil {
			metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
			// trigger un-reserve plugins to clean up state associated with the reserved Pod
			fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
			if err := sched.SchedulerCache.ForgetPod(assumedPod); err != nil {
				klog.ErrorS(err, "scheduler cache ForgetPod failed")
			}
			sched.recordSchedulingFailure(fwk, assumedPodInfo, fmt.Errorf("binding rejected: %w", err), SchedulerError, "")
		} else {
			// Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2.
			if klog.V(2).Enabled() {
				klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
			}
			metrics.PodScheduled(fwk.ProfileName(), metrics.SinceInSeconds(start))
			metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
			metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(podInfo)).Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))

			// Run "postbind" plugins after a successful bind.
			fwk.RunPostBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)

			// At the end of a successful binding cycle, move up Pods if needed.
			if len(podsToActivate.Map) != 0 {
				sched.SchedulingQueue.Activate(podsToActivate.Map)
				// Unlike the logic in scheduling cycle, we don't bother deleting the entries
				// as `podsToActivate.Map` is no longer consumed.
			}
		}
	}()
}

This function implements the core logic of the Kubernetes scheduler. Its flow is:

  1. Call sched.NextPod() to get the highest-priority pending Pod from activeQ. The call blocks: while activeQ holds no Pod objects, sched.NextPod() waits
  2. Call sched.Algorithm.Schedule() to run the filtering (predicates) and scoring (priorities) algorithms and pick a suitable node for the Pod
  3. Call sched.assume() to pre-bind the Pod: set its NodeName field and update the scheduler cache
  4. Call fwk.RunReservePluginsReserve() to run the Reserve() method of the Reserve plugins
  5. Call fwk.RunPermitPlugins() to run the Permit plugins
  6. Call fwk.RunPreBindPlugins() to run the PreBind plugins
  7. Call sched.bind() to perform the actual binding: the API server is asked, asynchronously from the binding goroutine, to carry out the final binding and persist it to etcd (see the sketch after this list)
  8. After a successful bind, call fwk.RunPostBindPlugins() to run the PostBind plugins
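
As an aside, the binding issued in step 7 (through sched.bind() and, by default, the DefaultBinder plugin) boils down to posting a v1.Binding object to the pod's binding subresource; the API server then sets spec.nodeName and persists it to etcd. Below is a minimal illustrative sketch with client-go, assuming a clientset is at hand; it is simplified, not the scheduler's exact code:

package sketch

import (
	"context"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// bindPodToNode sketches what the default Bind plugin ultimately does:
// create a v1.Binding whose Target references the chosen node and post it
// to the pods/binding subresource.
func bindPodToNode(ctx context.Context, clientset kubernetes.Interface, pod *v1.Pod, nodeName string) error {
	binding := &v1.Binding{
		ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name, UID: pod.UID},
		Target:     v1.ObjectReference{Kind: "Node", Name: nodeName},
	}
	// Pods(...).Bind issues the request against the pods/binding subresource.
	return clientset.CoreV1().Pods(pod.Namespace).Bind(ctx, binding, metav1.CreateOptions{})
}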

2) Preparation Before Scheduling

At the very beginning, scheduleOne() calls sched.NextPod() to get the next Pod to schedule, which simply pops an element off the activeQ queue. NextPod is set to internalqueue.MakeNextPodFunc(podQueue) when the Scheduler object is created:

// pkg/scheduler/internal/queue/scheduling_queue.go
// MakeNextPodFunc returns a function that retrieves the next pod from the given scheduling queue
func MakeNextPodFunc(queue SchedulingQueue) func() *framework.QueuedPodInfo {
	return func() *framework.QueuedPodInfo {
		podInfo, err := queue.Pop()
		if err == nil {
			klog.V(4).InfoS("About to try and schedule pod", "pod", klog.KObj(podInfo.Pod))
			return podInfo
		}
		klog.ErrorS(err, "Error while retrieving next pod from scheduling queue")
		return nil
	}
}

This calls the priority queue's Pop() method to pop a Pod off the queue for scheduling:

// pkg/scheduler/internal/queue/scheduling_queue.go
// Pop removes the head of activeQ and returns it. If activeQ is empty, it blocks and waits until a new item is enqueued
func (p *PriorityQueue) Pop() (*framework.QueuedPodInfo, error) {
	p.lock.Lock()
	defer p.lock.Unlock()
	for p.activeQ.Len() == 0 {
		// When the queue is empty, invocation of Pop() is blocked until new item is enqueued.
		// When Close() is called, the p.closed is set and the condition is broadcast,
		// which causes this loop to continue and return from the Pop().
		if p.closed {
			return nil, fmt.Errorf(queueClosed)
		}
		p.cond.Wait()
	}
	obj, err := p.activeQ.Pop()
	if err != nil {
		return nil, err
	}
	pInfo := obj.(*framework.QueuedPodInfo)
	pInfo.Attempts++
	// Bump the scheduling cycle counter
	p.schedulingCycle++
	return pInfo, err
}
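
Pop() relies on the classic sync.Cond pattern: Wait() releases the lock and blocks until an enqueue or Close() broadcasts. The following is a minimal, self-contained sketch of that pattern (an illustrative blockingQueue type, not the scheduler's PriorityQueue) showing why Pop blocks on an empty queue and wakes up on Push or Close:

package sketch

import (
	"errors"
	"sync"
)

// blockingQueue is a stripped-down illustration of the activeQ pattern:
// Pop blocks while the queue is empty, Push wakes blocked callers up,
// and Close wakes everyone so blocked Pop calls can return an error.
type blockingQueue struct {
	lock   sync.Mutex
	cond   *sync.Cond
	items  []string
	closed bool
}

func newBlockingQueue() *blockingQueue {
	q := &blockingQueue{}
	q.cond = sync.NewCond(&q.lock)
	return q
}

func (q *blockingQueue) Push(item string) {
	q.lock.Lock()
	defer q.lock.Unlock()
	q.items = append(q.items, item)
	q.cond.Broadcast() // like PriorityQueue, wake up blocked Pop callers
}

func (q *blockingQueue) Close() {
	q.lock.Lock()
	defer q.lock.Unlock()
	q.closed = true
	q.cond.Broadcast()
}

func (q *blockingQueue) Pop() (string, error) {
	q.lock.Lock()
	defer q.lock.Unlock()
	for len(q.items) == 0 {
		if q.closed {
			return "", errors.New("queue is closed")
		}
		q.cond.Wait() // releases the lock while waiting, reacquires it on wake-up
	}
	item := q.items[0]
	q.items = q.items[1:]
	return item, nil
}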

3) Executing the Scheduling Algorithm

scheduleOne() calls sched.Algorithm.Schedule() to run the filtering and scoring algorithms. The method is implemented as follows:

// pkg/scheduler/generic_scheduler.go
func (g *genericScheduler) Schedule(ctx context.Context, extenders []framework.Extender, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
	trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name})
	defer trace.LogIfLong(100 * time.Millisecond)

	// 1. Snapshot the node information; a fresh snapshot is taken every time a pod is scheduled
	if err := g.snapshot(); err != nil {
		return result, err
	}
	trace.Step("Snapshotting scheduler cache and node infos done")

	if g.nodeInfoSnapshot.NumNodes() == 0 {
		return result, ErrNoNodesAvailable
	}

	// 2. Predicates phase: find all nodes that satisfy the scheduling constraints and filter out the rest
	feasibleNodes, diagnosis, err := g.findNodesThatFitPod(ctx, extenders, fwk, state, pod)
	if err != nil {
		return result, err
	}
	trace.Step("Computing predicates done")

	// 3. If no node is feasible after filtering, return directly with a FitError
	if len(feasibleNodes) == 0 {
		return result, &framework.FitError{
			Pod:         pod,
			NumAllNodes: g.nodeInfoSnapshot.NumNodes(),
			Diagnosis:   diagnosis,
		}
	}

	// When only one node after predicate, just use it.
	// 4. If only one node is left after filtering, use it directly
	if len(feasibleNodes) == 1 {
		return ScheduleResult{
			SuggestedHost:  feasibleNodes[0].Name,
			EvaluatedNodes: 1 + len(diagnosis.NodeToStatusMap),
			FeasibleNodes:  1,
		}, nil
	}

	// 5. Priorities phase: run the scoring algorithms and obtain the scored node list
	priorityList, err := prioritizeNodes(ctx, extenders, fwk, state, pod, feasibleNodes)
	if err != nil {
		return result, err
	}

	// 6. Select the node with the highest score
	host, err := g.selectHost(priorityList)
	trace.Step("Prioritizing done")

	return ScheduleResult{
		SuggestedHost:  host,
		EvaluatedNodes: len(feasibleNodes) + len(diagnosis.NodeToStatusMap),
		FeasibleNodes:  len(feasibleNodes),
	}, err
}

The flow of the Schedule() method is shown in the figure below:

(Figure: Schedule() method flow)

4) Filtering (Predicates) Algorithm

The filtering algorithm filters all Nodes in the cluster and picks out the Nodes on which the current Pod can run. The core of the filtering phase is findNodesThatFitPod(), whose result is handed to the scoring phase. The main logic of the filtering algorithm is shown in the figure below:

(Figure: main flow of the filtering algorithm)

The findNodesThatFitPod() method is implemented as follows:

// pkg/scheduler/generic_scheduler.go
// Filters the nodes to find the ones that fit the pod based on the prefilter plugins, filter plugins and extenders
func (g *genericScheduler) findNodesThatFitPod(ctx context.Context, extenders []framework.Extender, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) ([]*v1.Node, framework.Diagnosis, error) {
	diagnosis := framework.Diagnosis{
		NodeToStatusMap:      make(framework.NodeToStatusMap),
		UnschedulablePlugins: sets.NewString(),
	}

	// Run "prefilter" plugins.
	// Pre-process the pod's information, or check conditions that the cluster or the pod must satisfy
	s := fwk.RunPreFilterPlugins(ctx, state, pod)
	allNodes, err := g.nodeInfoSnapshot.NodeInfos().List()
	if err != nil {
		return nil, diagnosis, err
	}
	if !s.IsSuccess() {
		if !s.IsUnschedulable() {
			return nil, diagnosis, s.AsError()
		}
		// All nodes will have the same status. Some non trivial refactoring is
		// needed to avoid this copy.
		for _, n := range allNodes {
			diagnosis.NodeToStatusMap[n.Node().Name] = s
		}
		// Status satisfying IsUnschedulable() gets injected into diagnosis.UnschedulablePlugins.
		diagnosis.UnschedulablePlugins.Insert(s.FailedPlugin())
		return nil, diagnosis, nil
	}

	// "NominatedNodeName" can potentially be set in a previous scheduling cycle as a result of preemption.
	// This node is likely the only candidate that will fit the pod, and hence we try it first before iterating over all nodes.
	if len(pod.Status.NominatedNodeName) > 0 && feature.DefaultFeatureGate.Enabled(features.PreferNominatedNode) {
		feasibleNodes, err := g.evaluateNominatedNode(ctx, extenders, pod, fwk, state, diagnosis)
		if err != nil {
			klog.ErrorS(err, "Evaluation failed on nominated node", "pod", klog.KObj(pod), "node", pod.Status.NominatedNodeName)
		}
		// Nominated node passes all the filters, scheduler is good to assign this node to the pod.
		if len(feasibleNodes) != 0 {
			return feasibleNodes, diagnosis, nil
		}
	}
	// Find the nodes that pass the filter plugins
	feasibleNodes, err := g.findNodesThatPassFilters(ctx, fwk, state, pod, diagnosis, allNodes)
	if err != nil {
		return nil, diagnosis, err
	}

	// Find the nodes that also pass the extenders
	feasibleNodes, err = findNodesThatPassExtenders(extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
	if err != nil {
		return nil, diagnosis, err
	}
	return feasibleNodes, diagnosis, nil
}

The logic of findNodesThatFitPod() is:

  1. Call fwk.RunPreFilterPlugins() to run the PreFilter plugins for pre-processing
  2. Call findNodesThatPassFilters() to find the nodes that pass the Filter plugins
  3. Call findNodesThatPassExtenders() to find the nodes that also pass the extenders

The RunPreFilterPlugins() method is implemented as follows:

// pkg/scheduler/framework/runtime/framework.go
func (f *frameworkImpl) RunPreFilterPlugins(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (status *framework.Status) {
	startTime := time.Now()
	defer func() {
		metrics.FrameworkExtensionPointDuration.WithLabelValues(preFilter, status.Code().String(), f.profileName).Observe(metrics.SinceInSeconds(startTime))
	}()
	// Run all PreFilter plugins
	for _, pl := range f.preFilterPlugins {
		status = f.runPreFilterPlugin(ctx, pl, state, pod)
		if !status.IsSuccess() {
			status.SetFailedPlugin(pl.Name())
			if status.IsUnschedulable() {
				return status
			}
			return framework.AsStatus(fmt.Errorf("running PreFilter plugin %q: %w", pl.Name(), status.AsError())).WithFailedPlugin(pl.Name())
		}
	}

	return nil
}

func (f *frameworkImpl) runPreFilterPlugin(ctx context.Context, pl framework.PreFilterPlugin, state *framework.CycleState, pod *v1.Pod) *framework.Status {
	if !state.ShouldRecordPluginMetrics() {
		return pl.PreFilter(ctx, state, pod)
	}
	startTime := time.Now()
	// Call the plugin's PreFilter function
	status := pl.PreFilter(ctx, state, pod)
	f.metricsRecorder.observePluginDurationAsync(preFilter, pl.Name(), status, metrics.SinceInSeconds(startTime))
	return status
}

The findNodesThatPassFilters() method is implemented as follows:

// pkg/scheduler/generic_scheduler.go
func (g *genericScheduler) findNodesThatPassFilters(
	ctx context.Context,
	fwk framework.Framework,
	state *framework.CycleState,
	pod *v1.Pod,
	diagnosis framework.Diagnosis,
	nodes []*framework.NodeInfo) ([]*v1.Node, error) {
	// Decide how many nodes take part in scheduling based on the cluster size
	numNodesToFind := g.numFeasibleNodesToFind(int32(len(nodes)))

	// Create feasible list with enough space to avoid growing it
	// and allow assigning.
	feasibleNodes := make([]*v1.Node, numNodesToFind)

	if !fwk.HasFilterPlugins() {
		length := len(nodes)
		for i := range feasibleNodes {
			feasibleNodes[i] = nodes[(g.nextStartNodeIndex+i)%length].Node()
		}
		g.nextStartNodeIndex = (g.nextStartNodeIndex + len(feasibleNodes)) % length
		return feasibleNodes, nil
	}

	errCh := parallelize.NewErrorChannel()
	var statusesLock sync.Mutex
	var feasibleNodesLen int32
	ctx, cancel := context.WithCancel(ctx)
	checkNode := func(i int) {
		// We check the nodes starting from where we left off in the previous scheduling cycle,
		// this is to make sure all nodes have the same chance of being examined across pods.
		nodeInfo := nodes[(g.nextStartNodeIndex+i)%len(nodes)]
		// Run the filter plugins
		status := fwk.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo)
		if status.Code() == framework.Error {
			errCh.SendErrorWithCancel(status.AsError(), cancel)
			return
		}
		// If the node fits, put it into the feasibleNodes list
		if status.IsSuccess() {
			length := atomic.AddInt32(&feasibleNodesLen, 1)
			if length > numNodesToFind {
				cancel()
				atomic.AddInt32(&feasibleNodesLen, -1)
			} else {
				feasibleNodes[length-1] = nodeInfo.Node()
			}
		} else {
			statusesLock.Lock()
			diagnosis.NodeToStatusMap[nodeInfo.Node().Name] = status
			diagnosis.UnschedulablePlugins.Insert(status.FailedPlugin())
			statusesLock.Unlock()
		}
	}

	beginCheckNode := time.Now()
	statusCode := framework.Success
	defer func() {
		// We record Filter extension point latency here instead of in framework.go because framework.RunFilterPlugins
		// function is called for each node, whereas we want to have an overall latency for all nodes per scheduling cycle.
		// Note that this latency also includes latency for `addNominatedPods`, which calls framework.RunPreFilterAddPod.
		metrics.FrameworkExtensionPointDuration.WithLabelValues(runtime.Filter, statusCode.String(), fwk.ProfileName()).Observe(metrics.SinceInSeconds(beginCheckNode))
	}()

	// Stops searching for more nodes once the configured number of feasible nodes
	// are found.
	// Start multiple goroutines that look for feasible nodes in parallel
	fwk.Parallelizer().Until(ctx, len(nodes), checkNode)
	processedNodes := int(feasibleNodesLen) + len(diagnosis.NodeToStatusMap)
	// Remember where the next cycle should start looking for nodes
	g.nextStartNodeIndex = (g.nextStartNodeIndex + processedNodes) % len(nodes)
	// Trim the result to the nodes actually found
	feasibleNodes = feasibleNodes[:feasibleNodesLen]
	if err := errCh.ReceiveError(); err != nil {
		statusCode = framework.Error
		return nil, err
	}
	return feasibleNodes, nil
}

The logic of findNodesThatPassFilters() is:

  1. Call numFeasibleNodesToFind() to decide, based on the cluster size, how many nodes take part in this scheduling cycle

  2. The checkNode() function checks a single node. When checking a node it:

    1) starts from where the previous scheduling cycle left off, so that all nodes get an equal chance of being examined across Pods

    2) calls fwk.RunFilterPluginsWithNominatedPods() to run the Filter plugins

    3) puts the node into the feasibleNodes list if it fits

  3. Call fwk.Parallelizer().Until(ctx, len(nodes), checkNode), which by default runs 16 goroutines in parallel to look for feasible nodes (see the sketch after this list)
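
The Parallelizer in step 3 is a thin wrapper around client-go's workqueue.ParallelizeUntil with the scheduler's configured parallelism (16 by default). The snippet below is a simplified, self-contained sketch of that pattern rather than the real implementation (which additionally chunks the index space): a fixed number of worker goroutines drain an index channel and stop early when the context is canceled, for example once enough feasible nodes have been found.

package sketch

import (
	"context"
	"sync"
)

// parallelizeUntil is a simplified version of the pattern behind
// fwk.Parallelizer().Until: `workers` goroutines pull piece indices from a
// channel and call doWorkPiece(i) until all pieces are done or ctx is canceled.
func parallelizeUntil(ctx context.Context, workers, pieces int, doWorkPiece func(i int)) {
	pieceCh := make(chan int, pieces)
	for i := 0; i < pieces; i++ {
		pieceCh <- i
	}
	close(pieceCh)

	var wg sync.WaitGroup
	for w := 0; w < workers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for i := range pieceCh {
				select {
				case <-ctx.Done(): // e.g. enough feasible nodes were already found
					return
				default:
					doWorkPiece(i)
				}
			}
		}()
	}
	wg.Wait()
}

// Usage analogous to findNodesThatPassFilters:
//   parallelizeUntil(ctx, 16, len(nodes), checkNode)
// where checkNode(i) filters nodes[(nextStartNodeIndex+i)%len(nodes)].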

1) Determining how many nodes take part in scheduling

numFeasibleNodesToFind() decides, based on the number of nodes in the cluster, how many nodes take part in scheduling. The logic is as follows:

  • If the number of nodes is below minFeasibleNodesToFind (default 100), all nodes take part in scheduling

  • percentageOfNodesToScore is the percentage of cluster nodes that take part in each scheduling cycle, in the range 1 to 100. If the cluster has more than 100 nodes, this value is used to work out how many nodes to consider

    For example, in a 5000-node cluster with percentageOfNodesToScore set to 10, each scheduling cycle considers 500 nodes. Without this cap, every Pod scheduled in a 5000-node cluster would have to run the filtering process against all 5000 nodes, which is very expensive

  • If the computed number of nodes is smaller than minFeasibleNodesToFind, minFeasibleNodesToFind is returned

The code is as follows:

// pkg/scheduler/generic_scheduler.go
func (g *genericScheduler) numFeasibleNodesToFind(numAllNodes int32) (numNodes int32) {
	// If the cluster has fewer than minFeasibleNodesToFind (100) nodes, all nodes take part in scheduling.
	// percentageOfNodesToScore is the percentage of cluster nodes considered per scheduling cycle, in the range 1-100
	if numAllNodes < minFeasibleNodesToFind || g.percentageOfNodesToScore >= 100 {
		return numAllNodes
	}

	adaptivePercentage := g.percentageOfNodesToScore
	// With more than 100 nodes and no explicit percentageOfNodesToScore, compute an adaptive percentage
	if adaptivePercentage <= 0 {
		basePercentageOfNodesToScore := int32(50)
		adaptivePercentage = basePercentageOfNodesToScore - numAllNodes/125
		if adaptivePercentage < minFeasibleNodesPercentageToFind {
			adaptivePercentage = minFeasibleNodesPercentageToFind
		}
	}

	// If numAllNodes is 5000 and adaptivePercentage is 10, then numNodes = 5000*10/100 = 500
	numNodes = numAllNodes * adaptivePercentage / 100
	// numNodes must not be too small: never below minFeasibleNodesToFind
	if numNodes < minFeasibleNodesToFind {
		return minFeasibleNodesToFind
	}

	return numNodes
}
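
To make the arithmetic concrete, here is a small standalone sketch (illustrative code, not from the scheduler) that mirrors the formula above, using minFeasibleNodesToFind = 100 as stated above and minFeasibleNodesPercentageToFind = 5 (the value defined alongside it in generic_scheduler.go), and prints the sample size for a few cluster sizes when percentageOfNodesToScore is left unset:

package main

import "fmt"

const (
	minFeasibleNodesToFind           = 100
	minFeasibleNodesPercentageToFind = 5
)

// numFeasibleNodesToFind mirrors the formula above for illustration only.
func numFeasibleNodesToFind(numAllNodes, percentageOfNodesToScore int32) int32 {
	if numAllNodes < minFeasibleNodesToFind || percentageOfNodesToScore >= 100 {
		return numAllNodes
	}
	adaptivePercentage := percentageOfNodesToScore
	if adaptivePercentage <= 0 {
		adaptivePercentage = 50 - numAllNodes/125
		if adaptivePercentage < minFeasibleNodesPercentageToFind {
			adaptivePercentage = minFeasibleNodesPercentageToFind
		}
	}
	numNodes := numAllNodes * adaptivePercentage / 100
	if numNodes < minFeasibleNodesToFind {
		return minFeasibleNodesToFind
	}
	return numNodes
}

func main() {
	for _, n := range []int32{100, 1000, 5000, 10000} {
		// 100 -> 100, 1000 -> 420, 5000 -> 500, 10000 -> 500 (hits the 5% floor)
		fmt.Printf("cluster=%d nodes, sample=%d nodes\n", n, numFeasibleNodesToFind(n, 0))
	}
}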
2) Parallelized second-pass node filtering

The parallel sampling is driven by workqueue's ParallelizeUntil(), which starts N goroutines to sample nodes in parallel and uses a Context to coordinate their exit. The node-selection rule is defined by the checkNode() function, which calls RunFilterPluginsWithNominatedPods() to pick out suitable nodes.

Why are nodes filtered twice?

In Kubernetes, Pods that have already been through the scheduler's preemption logic are kept in the SchedulingQueue. These Pods (NominatedPods) may eventually run on their proposed Node through a later scheduling cycle, or they may never end up running there for various reasons (for example, because of the preemption mechanism). To reduce later scheduling conflicts, the filtering phase takes these Pods into account: if a Node satisfies the current Pod's filters with those Pods present, the NominatedPods are removed and the filters are run a second time.

Suppose the current Pod's affinity rules depend on a NominatedPod; since a NominatedPod is not guaranteed to actually be scheduled onto that node, the filters are run a second time without the NominatedPods.

The RunFilterPluginsWithNominatedPods() method is implemented as follows:

// pkg/scheduler/framework/runtime/framework.go
func (f *frameworkImpl) RunFilterPluginsWithNominatedPods(ctx context.Context, state *framework.CycleState, pod *v1.Pod, info *framework.NodeInfo) *framework.Status {
	var status *framework.Status

	// podsAdded records whether there are nominated pods; without them, the second pass is unnecessary
	podsAdded := false
	// We run filters twice in some cases. If the node has greater or equal priority
	// nominated pods, we run them when those pods are added to PreFilter state and nodeInfo.
	// If all filters succeed in this pass, we run them again when these
	// nominated pods are not added. This second pass is necessary because some
	// filters such as inter-pod affinity may not pass without the nominated pods.
	// If there are no nominated pods for the node or if the first run of the
	// filters fail, we don't run the second pass.
	// We consider only equal or higher priority pods in the first pass, because
	// those are the current "pod" must yield to them and not take a space opened
	// for running them. It is ok if the current "pod" take resources freed for
	// lower priority pods.
	// Requiring that the new pod is schedulable in both circumstances ensures that
	// we are making a conservative decision: filters like resources and inter-pod
	// anti-affinity are more likely to fail when the nominated pods are treated
	// as running, while filters like pod affinity are more likely to fail when
	// the nominated pods are treated as not running. We can't just assume the
	// nominated pods are running because they are not running right now and in fact,
	// they may end up getting scheduled to a different node.
	for i := 0; i < 2; i++ {
		stateToUse := state
		nodeInfoToUse := info
		if i == 0 {
			var err error
			// Look for NominatedPods with priority >= the current pod and add them to nodeInfoToUse
			podsAdded, stateToUse, nodeInfoToUse, err = addNominatedPods(ctx, f, pod, state, info)
			// If the first pass fails, the second pass is not run
			if err != nil {
				return framework.AsStatus(err)
			}
		} else if !podsAdded || !status.IsSuccess() {
			break
		}

		// Run the filters to check whether the pod can run on this node
		statusMap := f.RunFilterPlugins(ctx, stateToUse, pod, nodeInfoToUse)
		status = statusMap.Merge()
		if !status.IsSuccess() && !status.IsUnschedulable() {
			return status
		}
	}

	return status
}

5) Scoring (Priorities) Algorithm

The scoring algorithm first obtains the scored node list via prioritizeNodes(), then picks the highest-scoring node with selectHost() and returns the result.

1) Getting the scored node list

prioritizeNodes() ranks the nodes by running the score plugins, which return a score for every node from RunScorePlugins(). The scores from all plugins plus the extender scores are added together to form each node's score. The overall flow is shown in the figure below:

(Figure: prioritizeNodes scoring flow)

The code is as follows:

// pkg/scheduler/generic_scheduler.go
func prioritizeNodes(
	ctx context.Context,
	extenders []framework.Extender,
	fwk framework.Framework,
	state *framework.CycleState,
	pod *v1.Pod,
	nodes []*v1.Node,
) (framework.NodeScoreList, error) {
	// If no priority configs are provided, then all nodes will have a score of one.
	// This is required to generate the priority list in the required format
	if len(extenders) == 0 && !fwk.HasScorePlugins() {
		result := make(framework.NodeScoreList, 0, len(nodes))
		for i := range nodes {
			result = append(result, framework.NodeScore{
				Name:  nodes[i].Name,
				Score: 1,
			})
		}
		return result, nil
	}

	// Run PreScore plugins to prepare the scoring data.
	preScoreStatus := fwk.RunPreScorePlugins(ctx, state, pod, nodes)
	if !preScoreStatus.IsSuccess() {
		return nil, preScoreStatus.AsError()
	}

	// Run the Score plugins to score each node.
	// scoresMap has type map[string][]NodeScore: the key is the plugin name,
	// the value is that plugin's scores for all nodes
	scoresMap, scoreStatus := fwk.RunScorePlugins(ctx, state, pod, nodes)
	if !scoreStatus.IsSuccess() {
		return nil, scoreStatus.AsError()
	}

	if klog.V(10).Enabled() {
		for plugin, nodeScoreList := range scoresMap {
			for _, nodeScore := range nodeScoreList {
				klog.InfoS("Plugin scored node for pod", "pod", klog.KObj(pod), "plugin", plugin, "node", nodeScore.Name, "score", nodeScore.Score)
			}
		}
	}

	// Summarize all scores.
	// result aggregates all scores per node
	result := make(framework.NodeScoreList, 0, len(nodes))

	// Aggregate the scores by node
	for i := range nodes {
		result = append(result, framework.NodeScore{Name: nodes[i].Name, Score: 0})
		for j := range scoresMap {
			// Each plugin's (already weighted) score for node i is accumulated
			result[i].Score += scoresMap[j][i].Score
		}
	}

	// If extenders are configured, they also score the nodes and their scores are added to result
	if len(extenders) != 0 && nodes != nil {
		// Extenders are called concurrently, so a mutex protects the node scores while they are written
		var mu sync.Mutex
		var wg sync.WaitGroup
		// combinedScores maps node name to node score
		combinedScores := make(map[string]int64, len(nodes))
		for i := range extenders {
			if !extenders[i].IsInterested(pod) {
				continue
			}
			// Start a goroutine per extender to score all nodes
			wg.Add(1)
			go func(extIndex int) {
				metrics.SchedulerGoroutines.WithLabelValues(metrics.PrioritizingExtender).Inc()
				defer func() {
					metrics.SchedulerGoroutines.WithLabelValues(metrics.PrioritizingExtender).Dec()
					wg.Done()
				}()
				// Call the extender to score the nodes
				prioritizedList, weight, err := extenders[extIndex].Prioritize(pod, nodes)
				if err != nil {
					// Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities
					return
				}
				mu.Lock()
				for i := range *prioritizedList {
					host, score := (*prioritizedList)[i].Host, (*prioritizedList)[i].Score
					if klog.V(10).Enabled() {
						klog.InfoS("Extender scored node for pod", "pod", klog.KObj(pod), "extender", extenders[extIndex].Name(), "node", host, "score", score)
					}
					// The extender's weight is returned by Prioritize(), but it is configured by hand.
					// The combined score is the sum over extenders of score * weight per node
					combinedScores[host] += score * weight
				}
				mu.Unlock()
			}(i)
		}
		// wait for all go routines to finish
		wg.Wait()
		for i := range result {
			// MaxExtenderPriority may diverge from the max priority used in the scheduler and defined by MaxNodeScore,
			// therefore we need to scale the score returned by extenders to the score range used by the scheduler.
			// A node's final score is the sum of all score plugin scores plus the (rescaled) extender scores
			result[i].Score += combinedScores[result[i].Name] * (framework.MaxNodeScore / extenderv1.MaxExtenderPriority)
		}
	}

	if klog.V(10).Enabled() {
		for i := range result {
			klog.InfoS("Calculated node's final score for pod", "pod", klog.KObj(pod), "node", result[i].Name, "score", result[i].Score)
		}
	}
	return result, nil
}
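
To make the aggregation concrete: framework.MaxNodeScore is 100 and extenderv1.MaxExtenderPriority is 10, so the combined extender score is effectively multiplied by 10 before being added to the plugin total. The following standalone snippet walks through the arithmetic with made-up numbers (the plugin and extender scores here are assumptions for illustration only):

package main

import "fmt"

func main() {
	// Assume two score plugins, already weighted and normalized by the framework
	// to the 0-100 range (hypothetical values for a single node).
	pluginScores := []int64{70, 55}

	// Assume one extender that returned score 8 (0-10 range) with weight 2.
	extenderScore, extenderWeight := int64(8), int64(2)

	const maxNodeScore, maxExtenderPriority int64 = 100, 10

	total := int64(0)
	for _, s := range pluginScores {
		total += s // plugin scores are summed per node
	}
	// Extender scores are weighted, then rescaled from the 0-10 range to 0-100.
	total += extenderScore * extenderWeight * (maxNodeScore / maxExtenderPriority)

	fmt.Println("final node score:", total) // 70 + 55 + 8*2*10 = 285
}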
2) Selecting the highest-scoring node

The priorityList slice holds each Node's name and score. Finally, selectHost() picks the highest-scoring Node, and the Pod is bound and scheduled to it. The code is as follows:

// pkg/scheduler/generic_scheduler.go
func (g *genericScheduler) selectHost(nodeScoreList framework.NodeScoreList) (string, error) {
	if len(nodeScoreList) == 0 {
		return "", fmt.Errorf("empty priorityList")
	}
	maxScore := nodeScoreList[0].Score
	selected := nodeScoreList[0].Name
	// Count how many nodes share the current max score
	cntOfMaxScore := 1
	for _, ns := range nodeScoreList[1:] {
		if ns.Score > maxScore {
			maxScore = ns.Score
			selected = ns.Name
			cntOfMaxScore = 1
		} else if ns.Score == maxScore {
			// Same score: bump the counter
			cntOfMaxScore++
			if rand.Intn(cntOfMaxScore) == 0 {
				// Replace the candidate with probability of 1/cntOfMaxScore
				selected = ns.Name
			}
		}
	}
	return selected, nil
}
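
The "replace the candidate with probability 1/cntOfMaxScore" step is reservoir sampling: when several nodes share the top score, each of them ends up selected with equal probability. A small standalone simulation (illustrative code, not from the scheduler) that demonstrates the uniform tie-breaking:

package main

import (
	"fmt"
	"math/rand"
)

// pickTied mimics selectHost's tie-breaking over a list of equally scored names.
func pickTied(names []string) string {
	selected := names[0]
	cnt := 1
	for _, name := range names[1:] {
		cnt++
		if rand.Intn(cnt) == 0 { // keep the newcomer with probability 1/cnt
			selected = name
		}
	}
	return selected
}

func main() {
	counts := map[string]int{}
	nodes := []string{"node-a", "node-b", "node-c"}
	for i := 0; i < 30000; i++ {
		counts[pickTied(nodes)]++
	}
	// Each node should be chosen roughly 10000 times (about 1/3 of the runs).
	fmt.Println(counts)
}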

6) Summary

The scheduler's overall run flow and a summary of the core scheduling implementation are shown in the figure below:

(Figure: scheduler run flow and core scheduling implementation summary)



Reposted from blog.csdn.net/qq_40378034/article/details/129971275