引言：我们知道，操作系统负责高效地管理和分配系统资源。系统中有许多进程都需要申请使用这些资源（尤其是CPU），那么谁先谁后呢？这就需要制定一套规则，合理地为每个进程分配系统资源。Linux内核的调度算法，就是实现这套规则的方法。

一、调度策略概述

Linux进程调度是以优先级调度为基础的，即优先执行优先级最高的进程。

我们将执行的这些进程分为两类，一类是实时进程，一类是非实时进程。
　　实时进程调度策略：SCHED_RR,SCHED_FIFO.
　　非实时进程调度策略：SCHED_OTHER
系统中所有进程的优先级范围是 0 ~ MAX_PRIO-1（0-139），数值越低优先级越高。其中实时进程的优先级范围是 0 ~ MAX_RT_PRIO-1（0-99），非实时进程的优先级范围是 MAX_RT_PRIO ~ MAX_PRIO-1（100-139），因此实时进程总是优先于非实时进程执行。

SCHED_FIFO：以先入先出方式调度的实时进程，没有时间片的概念。该进程一旦获得CPU便一直运行，直到它主动放弃CPU（阻塞、让出或退出），或者被优先级更高的实时进程抢占。

SCHED_RR:通过时间片轮转的方式调度的实时进程。进程在运行完指定的时间片后会被放到同优先级队列的尾部，然后重新调度；但是如果没有其他优先级不低于它的可运行进程，它还会继续运行。通俗地讲就是执行一个时间片比一下优先级，然后优先级最高的再接着执行一个时间片。

SCHED_OTHER:普通进程，基于优先级进行调度

SCHED_FIFO 与 SCHED_RR 的区别是:
当进程的调度策略为前者时,当前实时进程将一直占用 CPU,直至它主动放弃 CPU(阻塞、让出或退出),除非有更紧迫的、优先级更高的实时进程需要运行,它才会被抢占;当进程的调度策略为后者时,它与其它同优先级的实时进程按时间片轮转的方式共同使用 CPU,用完时间片后被放到运行队列尾部。

二、进程调度算法

schedule()函数是完成进程调度的主要函数，它确定下一个应该占有cpu并运行的进程，且完成进程切换的工作。其主要工作流程如下：

首先调用preempt_disable()函数，关闭内核抢占，因为此时要对内核的一些重要数据结构进行操作，所以必须要将内核抢占关闭。

调用sched_find_first_bit()函数，在优先级位图中找到最高优先级的非空进程队列的下标idx，那么array->queue[idx]链表的第一个节点（queue->next）所对应的进程即为下一个要运行的进程next。

调用context_switch()函数执行进程切换，它主要调用2个函数，一个是switch_mm,完成新旧进程间虚拟内存映射间的切换，另一个是switch_to(),保存和恢复堆栈信息和处理器的寄存器。

调度函数Schedule源码解读如下：

/*
 * schedule() - pick the next task to run on this CPU and switch to it.
 *
 * Outline, as visible in this function:
 *   1. Warn if called from atomic context (unless the task is dead/zombie).
 *   2. Disable preemption, compute how long 'prev' has run (capped at
 *      NS_MAX_SLEEP_AVG) and take the runqueue lock.
 *   3. Unless entered via kernel preemption, update prev according to its
 *      state (possibly removing it from the runqueue).
 *   4. Choose 'next': the idle task if nothing is runnable, otherwise the
 *      first task of the highest-priority non-empty list in the active
 *      array (swapping active/expired first if the active array is empty).
 *   5. Update prev's sleep_avg/timestamp accounting and, if next differs
 *      from prev, perform the context switch.
 *   6. Repeat from step 2 while TIF_NEED_RESCHED is set.
 */
asmlinkage void schedule(void)
{
    
    
	task_t *prev, *next;	/* prev: task being switched out; next: task being switched in */
	runqueue_t *rq;
	prio_array_t *array;	/* priority array the next task is picked from */
	struct list_head *queue;
	unsigned long long now;
	unsigned long run_time;
	int idx;
	/*
	 * Test if we are atomic.  Since do_exit() needs to call into
	 * schedule() atomically, we ignore that path for now.
	 * Otherwise, whine if we are scheduling when we should not be.
	 */
	if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
    
    
		if (unlikely(in_atomic())) {
    
    
			printk(KERN_ERR "bad: scheduling while atomic!\n");
			dump_stack();
		}
	}

need_resched:
	/* Disable kernel preemption (this does not disable interrupts). */
	preempt_disable();
	prev = current;	
	rq = this_rq();

	release_kernel_lock(prev);
	/* Time prev has run since its last timestamp, capped at NS_MAX_SLEEP_AVG. */
	now = sched_clock();
	if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
		run_time = now - prev->timestamp;
	else
		run_time = NS_MAX_SLEEP_AVG;

	/*
	 * Tasks with interactive credits get charged less run_time
	 * at high sleep_avg to delay them losing their interactive
	 * status
	 */
	if (HIGH_CREDIT(prev))
		run_time /= (CURRENT_BONUS(prev) ? : 1);	/* divide by the bonus, or by 1 if the bonus is 0 */

	spin_lock_irq(&rq->lock);

	/*
	 * if entering off of a kernel preemption go straight
	 * to picking the next task.
	 */
	if (unlikely(preempt_count() & PREEMPT_ACTIVE))
		goto pick_next_task;

	switch (prev->state) {
    
    
	case TASK_INTERRUPTIBLE:
		/* A pending signal puts prev back to TASK_RUNNING; it is not deactivated. */
		if (unlikely(signal_pending(prev))) {
    
    
			prev->state = TASK_RUNNING;
			break;
		}
		/* fall through: no signal pending, handle like every other non-running state */
	default:
		/* prev is not runnable: take it off the runqueue and bump nvcsw. */
		deactivate_task(prev, rq);
		prev->nvcsw++;
		break;
	case TASK_RUNNING:
		/* prev is still runnable and stays queued; only nivcsw is bumped. */
		prev->nivcsw++;
	}
pick_next_task:
	/* Nothing runnable: on SMP try load_balance() first, otherwise run the idle task. */
	if (unlikely(!rq->nr_running)) {
    
    
#ifdef CONFIG_SMP
		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
		if (rq->nr_running)
			goto pick_next_task;
#endif
		next = rq->idle;
		rq->expired_timestamp = 0;
		goto switch_tasks;
	}
	/*
	 * When the active array has no tasks left, the active and expired
	 * pointers are simply exchanged below; this pointer swap is the core
	 * of the O(1) scheduling algorithm.
	 */
	array = rq->active;
	if (unlikely(!array->nr_active)) {
    
    
		/*
		 * Switch the active and expired arrays.
		 */
		rq->active = rq->expired;
		rq->expired = array;
		array = rq->active;
		rq->expired_timestamp = 0;
	}
	/*
	 * sched_find_first_bit() locates the first set bit of the bitmap;
	 * idx selects the matching list in array->queue, and the first entry
	 * of that list becomes 'next'.
	 */
	idx = sched_find_first_bit(array->bitmap);
	queue = array->queue + idx;
	next = list_entry(queue->next, task_t, run_list);

	/*
	 * next->activated > 0: take the time elapsed since next->timestamp
	 * (scaled by ON_RUNQUEUE_WEIGHT percent when activated == 1), then
	 * requeue the task around recalc_task_prio().
	 */
	if (next->activated > 0) {
    
    
		unsigned long long delta = now - next->timestamp;

		if (next->activated == 1)
			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

		array = next->array;
		dequeue_task(next, array);
		recalc_task_prio(next, next->timestamp + delta);
		enqueue_task(next, array);
	}
	next->activated = 0;
switch_tasks:
	prefetch(next);
	clear_tsk_need_resched(prev);
	RCU_qsctr(task_cpu(prev))++;

	/* Charge the time just run against prev's sleep_avg, clamping at 0. */
	prev->sleep_avg -= run_time;
	if ((long)prev->sleep_avg <= 0){
    
    
		prev->sleep_avg = 0;
		/* Lose one interactive credit unless the task is already HIGH_CREDIT or LOW_CREDIT. */
		if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
			prev->interactive_credit--;
	}
	prev->timestamp = now;

	/* Only switch when a different task was chosen; otherwise just drop the runqueue lock. */
	if (likely(prev != next)) {
    
    
		next->timestamp = now;
		rq->nr_switches++;
		rq->curr = next;

		prepare_arch_switch(rq, next);
		prev = context_switch(rq, prev, next);
		barrier();

		/* NOTE(review): rq->lock is not released here; presumably finish_task_switch() does it - confirm. */
		finish_task_switch(prev);
	} else
		spin_unlock_irq(&rq->lock);

	reacquire_kernel_lock(current);
	preempt_enable_no_resched();
	/* A reschedule was requested in the meantime: go round again. */
	if (test_thread_flag(TIF_NEED_RESCHED))
		goto need_resched;
}

三、主要数据结构

1）进程实例Struct task_struct

/*
 * Process descriptor.
 * The kernel keeps processes in a doubly linked list called the task list.
 * Every entry of that list is a struct task_struct, the process descriptor,
 * which holds all the information about one particular process.
 * The definition lives in include/linux/sched.h.
 */
struct task_struct {
    
    
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	/* The thread_info's task field makes it easy to get back to the actual task pointer. */
	struct thread_info *thread_info;
	atomic_t usage;
	unsigned long flags;	/* per process flags, defined below */
	unsigned long ptrace;

	int lock_depth;		/* Lock depth */
	/*
	 * prio is the dynamic priority, static_prio the static priority.
	 * NOTE(review): the original note gave the ranges as 0-99 for prio and
	 * 100-139 for static_prio; prio presumably spans the whole
	 * 0..MAX_PRIO-1 range - confirm against kernel/sched.c.
	 */
	int prio, static_prio;
	/* Links the task into a priority list; schedule() resolves list entries through this member. */
	struct list_head run_list;

	/* Priority array the task is queued on (schedule() dequeues/enqueues via next->array). */
	prio_array_t *array;

	/*
	 * Average sleep time.  schedule() subtracts the time the task has just
	 * run from it and clamps it at 0.  Per the original note, a larger
	 * sleep_avg yields a better dynamic priority - confirm against
	 * recalc_task_prio().
	 */
	unsigned long sleep_avg;
	/*
	 * Interactivity credit of the task.  schedule() decrements it when
	 * sleep_avg drops to 0 and the task is neither HIGH_CREDIT nor LOW_CREDIT.
	 */
	long interactive_credit;
	/* Compared against sched_clock() in schedule() and reset to 'now' when switching. */
	unsigned long long timestamp;
	/* While > 0, schedule() recalculates the task's priority before running it, then resets this to 0. */
	int activated;

	unsigned long policy;
	cpumask_t cpus_allowed;
	unsigned int time_slice, first_time_slice;

	struct list_head tasks;
	struct list_head ptrace_children;
	struct list_head ptrace_list;

	struct mm_struct *mm, *active_mm;

/* task state */
	struct linux_binfmt *binfmt;
	int exit_code, exit_signal;
	int pdeath_signal;  /*  The signal sent when the parent dies  */
	/* ??? */
	unsigned long personality;
	int did_exec:1;
	pid_t pid;
	pid_t __pgrp;		/* Accessed via process_group() */
	pid_t tty_old_pgrp;
	pid_t session;
	pid_t tgid;
	/* boolean value for session group leader */
	int leader;
	/* 
	 * pointers to (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with 
	 * p->parent->pid)
	 */
	struct task_struct *real_parent; /* real parent process (when being debugged) */
	struct task_struct *parent;	/* parent process */
	struct list_head children;	/* list of my children */
	struct list_head sibling;	/* linkage in my parent's children list */
	struct task_struct *group_leader;	/* threadgroup leader */

	/* PID/PID hash table linkage. */
	struct pid_link pids[PIDTYPE_MAX];

	wait_queue_head_t wait_chldexit;	/* for wait4() */
	struct completion *vfork_done;		/* for vfork() */
	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */

	unsigned long rt_priority;
	unsigned long it_real_value, it_prof_value, it_virt_value;
	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
	struct timer_list real_timer;
	struct list_head posix_timers; /* POSIX.1b Interval Timers */
	unsigned long utime, stime, cutime, cstime;
	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
	u64 start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
/* process credentials */
	uid_t uid,euid,suid,fsuid;
	gid_t gid,egid,sgid,fsgid;
	int ngroups;
	gid_t	groups[NGROUPS];
	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
	int keep_capabilities:1;
	struct user_struct *user;
/* limits */
	struct rlimit rlim[RLIM_NLIMITS];
	unsigned short used_math;
	char comm[16];
/* file system info */
	int link_count, total_link_count;
	struct tty_struct *tty; /* NULL if no tty */
/* ipc stuff */
	struct sysv_sem sysvsem;
/* CPU-specific state of this task */
	struct thread_struct thread;
/* filesystem information */
	struct fs_struct *fs;
/* open file information */
	struct files_struct *files;
/* namespace */
	struct namespace *namespace;
/* signal handlers */
	struct signal_struct *signal;
	struct sighand_struct *sighand;

	sigset_t blocked, real_blocked;
	struct sigpending pending;

	unsigned long sas_ss_sp;
	size_t sas_ss_size;
	int (*notifier)(void *priv);
	void *notifier_data;
	sigset_t *notifier_mask;
	
	void *security;

/* Thread group tracking */
   	u32 parent_exec_id;
   	u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty */
	spinlock_t alloc_lock;
/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
	spinlock_t proc_lock;
/* context-switch lock */
	spinlock_t switch_lock;

/* journalling filesystem info */
	void *journal_info;

/* VM state */
	struct reclaim_state *reclaim_state;

	struct dentry *proc_dentry;
	struct backing_dev_info *backing_dev_info;

	struct io_context *io_context;

	unsigned long ptrace_message;
	siginfo_t *last_siginfo; /* For ptrace use.  */
};

2）就绪队列Struct runqueue

/*
 * Run queue of ready tasks.
 *
 * The runqueue is the central data structure of the O(1) scheduler.  The
 * 2.6 kernel reworked it considerably: every CPU maintains its own run
 * queue, each queue is protected by its own spinlock, and it carries the
 * scheduling information of that processor.  Defined in kernel/sched.c.
 */
struct runqueue {
	/* Taken by schedule() with spin_lock_irq() before the queue is touched. */
	spinlock_t lock;

	/* schedule() falls back to the idle task when nr_running is 0. */
	unsigned long nr_running;
	/* Incremented by schedule() each time it switches to a different task. */
	unsigned long nr_switches;
	/* Reset to 0 by schedule() when it picks idle or swaps the arrays. */
	unsigned long expired_timestamp;
	unsigned long nr_uninterruptible;

	/* curr: task schedule() last switched to; idle: task run when nothing is runnable. */
	task_t *curr;
	task_t *idle;
	struct mm_struct *prev_mm;

	/*
	 * active points to the array of tasks still eligible to run, expired
	 * to the array of tasks that have used up their time slice.
	 * schedule() exchanges the two pointers once the active array is
	 * empty.  arrays[2] is the actual storage for the two priority arrays
	 * (presumably the targets of active and expired).
	 */
	prio_array_t *active;
	prio_array_t *expired;
	prio_array_t arrays[2];

	int prev_cpu_load[NR_CPUS];
#ifdef CONFIG_NUMA
	atomic_t *node_nr_running;
	int prev_node_load[MAX_NUMNODES];
#endif
	task_t *migration_thread;
	struct list_head migration_queue;
	atomic_t nr_iowait;
};

3）优先级数组

/*每个处理器的就绪队列都有两个优先级数组，它们是prio_array类型的linux2.6内核正是因为使用了优先级数组，才实现了O(1)调度算法，
该结构源码位于/kernel/sched.c当中*/
struct prio_array {
    
    
    /*相应runqueue中的进程数*/
	int nr_active;5
    /*索引位图，BITMAP_size默认值为5，5个long（32位）类型，可以代表160个优先级，但实际中只有140个。**这里为什么自己还要百度一下***/
	unsigned long bitmap[BITMAP_SIZE];
    /*每个优先级的进程队列，0-99对应位实时进程，100-140对应为普通的进程。数值越小优先级越高，bitmap的每一位斗鱼queue[i]相对应，当queue[i]的进程队列不为空时，bitmap相应位为1，否则为0*/
	struct list_head queue[MAX_PRIO];
};

四、总结篇

不想写了，只愿…阿林别点我。

Linux2.6内核调度算法浅析

一、调度策略概述

二、进程调度算法

调度函数Schedule源码解读如下：

三、主要数据结构

1）进程实例Struct task_struct

2）就绪队列Struct runqueue

3）优先级数组

四、总结篇

猜你喜欢