Linux Kernel Deep Dive: Interrupts, Exceptions, and System Calls — Interrupt Bottom Halves: Work Queues

Work queues

A work queue is a general mechanism for executing functions asynchronously in kernel threads.

Work queues are one of the bottom-half mechanisms for interrupt handlers: an interrupt handler can hand functions that take a long time or that may sleep off to a work queue.

Work queues are not exclusively a bottom half for interrupt handlers; other kernel subsystems can also hand functions to them for asynchronous execution.

1. Programming interface

The kernel uses a work item to hold a function that needs to be executed asynchronously. The data type of a work item is work_struct, defined as follows:

include/linux/workqueue.h
struct work_struct {
	atomic_long_t data;
	struct list_head entry;
	work_func_t func;		/* function to execute asynchronously */
};

typedef void (*work_func_t)(struct work_struct *work);		/* prototype of the asynchronously executed function */

One class of work item is the delayed work item, whose data type is delayed_work, defined as follows:

include/linux/workqueue.h
struct delayed_work {
	struct work_struct work;		/* work item */
	struct timer_list timer;        /* timer */

	/* target workqueue and CPU ->timer uses to queue ->work */
	struct workqueue_struct *wq;
	int cpu;
};

When a delayed work item is queued, the work item is only actually added to the work queue after the specified delay has elapsed.

There are two kinds of work queues: those defined by the kernel and those you create yourself. The work queues defined by the kernel are:

include/linux/workqueue.h
extern struct workqueue_struct *system_wq;		/* use this work queue if the work item's execution time is short */
extern struct workqueue_struct *system_highpri_wq;		/* high-priority work queue */
extern struct workqueue_struct *system_long_wq;		/* use this work queue if the work item's execution time is long */
extern struct workqueue_struct *system_unbound_wq;		/* the kernel threads serving this work queue are not bound to any specific processor */
extern struct workqueue_struct *system_freezable_wq;		/* this work queue can be frozen */
extern struct workqueue_struct *system_power_efficient_wq;		/* if the workqueue module parameter "wq_power_efficient" is enabled, this work queue favors power saving; otherwise it behaves the same as system_wq */
extern struct workqueue_struct *system_freezable_power_efficient_wq;		/* same as system_power_efficient_wq except that it can be frozen */

(1) Defining work items

Define a static work item:

DECLARE_WORK(n, f)        /* n is the variable name, f is the work item's handler function */

Define a static delayed work item:

DECLARE_DELAYED_WORK(n, f)        /* n is the variable name, f is the work item's handler function */

Define a static delayed work item that uses a deferrable timer:

DECLARE_DEFERRABLE_WORK(n, f)        /* n is the variable name, f is the work item's handler function */

Dynamically initialize a work item:

INIT_WORK(_work, _func)        /* _work is the address of the work item, _func is the function to execute asynchronously */

Dynamically initialize a work item that is a local variable on the stack:

INIT_WORK_ONSTACK(_work, _func)        /* _work is the address of the work item, _func is the function to execute asynchronously */

Dynamically initialize a delayed work item:

INIT_DELAYED_WORK(_work, _func)

Dynamically initialize a delayed work item that is a local variable on the stack:

INIT_DELAYED_WORK_ONSTACK(_work, _func)

Dynamically initialize a delayed work item that uses a deferrable timer:

INIT_DEFERRABLE_WORK(_work, _func)

Dynamically initialize a delayed work item that is a local variable on the stack and uses a deferrable timer:

INIT_DEFERRABLE_WORK_ONSTACK(_work, _func)
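
As a quick illustration (a minimal sketch that is not part of the original text; the names my_data, my_work and my_work_handler are hypothetical), a driver typically embeds a work_struct in its own structure, initializes it with INIT_WORK(), and recovers the containing structure inside the handler with container_of():

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/printk.h>

struct my_data {
	int value;
	struct work_struct my_work;	/* embedded work item */
};

/* the handler runs asynchronously in a worker thread and may sleep */
static void my_work_handler(struct work_struct *work)
{
	/* recover the containing structure from the embedded work_struct */
	struct my_data *data = container_of(work, struct my_data, my_work);

	pr_info("processing value %d\n", data->value);
}

/* a statically defined work item using the same handler */
static DECLARE_WORK(my_static_work, my_work_handler);

static void my_init_example(struct my_data *data)
{
	/* dynamic initialization of the embedded work item */
	INIT_WORK(&data->my_work, my_work_handler);
}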

(2) The global work queue

Add a work item to the global work queue:

bool schedule_work(struct work_struct *work);

Add a work item to the global work queue and specify the processor that executes it:

bool schedule_work_on(int cpu, struct work_struct *work);

Add a delayed work item to the global work queue:

bool schedule_delayed_work(struct delayed_work *dwork,
					 unsigned long delay)		/* delay is how long to wait before the work item is actually added to the work queue, in ticks (jiffies) */

Add a delayed work item to the global work queue and specify the processor that executes it:

bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
					    unsigned long delay)		/* delay is how long to wait before the work item is actually added to the work queue, in ticks (jiffies) */

Flush the global work queue, ensuring that all work items in it have finished executing:

void flush_scheduled_work(void);
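
For example (a hedged sketch; the interrupt handler, the work items and the 100 ms delay are illustrative assumptions), an interrupt handler can do only the urgent part itself and defer the slow, possibly sleeping part to the global work queue, optionally after a delay expressed in jiffies:

#include <linux/interrupt.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct work_struct rx_work;		/* assumed to be set up with INIT_WORK() elsewhere */
static struct delayed_work poll_work;		/* assumed to be set up with INIT_DELAYED_WORK() elsewhere */

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	/* top half: only the urgent, non-sleeping part runs here */

	/* defer the heavy work to the global work queue (system_wq) */
	schedule_work(&rx_work);

	/* queue a delayed work item to run roughly 100 ms later */
	schedule_delayed_work(&poll_work, msecs_to_jiffies(100));

	return IRQ_HANDLED;
}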

(3) Dedicated work queues

Allocate a work queue:

/* fmt is the format string for the work queue's name */
/* flags is a bit mask; it can be 0 or a combination of the flags below */
enum {
	WQ_UNBOUND		= 1 << 1, 		/* the kernel threads processing work items are not bound to any specific processor */
	WQ_FREEZABLE		= 1 << 2, 		/* frozen during system suspend */
	WQ_MEM_RECLAIM		= 1 << 3, 		/* this work queue may be used during memory reclaim */
	WQ_HIGHPRI		= 1 << 4, 		/* high priority */
	WQ_CPU_INTENSIVE	= 1 << 5, 		/* CPU intensive */
	WQ_SYSFS		= 1 << 6, 		/* visible in sysfs */
	WQ_POWER_EFFICIENT	= 1 << 7,		/* power efficient */
	__WQ_DRAINING		= 1 << 16, 		/* internal: the work queue is being drained */
	__WQ_ORDERED		= 1 << 17, 		/* internal: the work queue is ordered */
	__WQ_LEGACY		= 1 << 18, 		/* internal: created by create*_workqueue() */
	__WQ_ORDERED_EXPLICIT	= 1 << 19, 		/* internal: alloc_ordered_workqueue() */
	WQ_MAX_ACTIVE		= 512,	  
	WQ_MAX_UNBOUND_PER_CPU	= 4,	 
	WQ_DFL_ACTIVE		= WQ_MAX_ACTIVE / 2,
};
/* max_active is the maximum number of work items that can execute concurrently per processor; 0 means use the default */
/* args are the arguments consumed by fmt */
alloc_workqueue(fmt, flags, max_active, args...)

Allocate an ordered work queue. At any moment an ordered work queue executes at most one work item, in queueing order:

alloc_ordered_workqueue(fmt, flags, args...)

Add a work item to the specified work queue:

bool queue_work(struct workqueue_struct *wq, struct work_struct *work);

Add a work item to the specified work queue and specify the processor that executes it:

bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work);

Flush a work queue, ensuring that all work items in it have finished executing:

void flush_workqueue(struct workqueue_struct *wq);

Destroy a work queue:

void destroy_workqueue(struct workqueue_struct *wq);
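
A minimal sketch of a dedicated work queue's lifetime (the queue name "my_wq", the work item and the chosen flags are illustrative assumptions):

#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *my_wq;
static struct work_struct my_work;		/* assumed to be set up with INIT_WORK() elsewhere */

static int my_setup(void)
{
	/* unbound queue that may be needed on the memory-reclaim path; 0 = default max_active */
	my_wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!my_wq)
		return -ENOMEM;

	/* add a work item to the dedicated work queue */
	queue_work(my_wq, &my_work);
	return 0;
}

static void my_teardown(void)
{
	/* wait for all queued work items to finish, then destroy the queue */
	flush_workqueue(my_wq);
	destroy_workqueue(my_wq);
}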

(4) Other programming interfaces

Cancel a work item:

bool cancel_work(struct work_struct *work);

Cancel a work item and wait for the cancellation to complete:

bool cancel_work_sync(struct work_struct *work);

Cancel a delayed work item:

bool cancel_delayed_work(struct delayed_work *dwork);

Cancel a delayed work item and wait for the cancellation to complete:

bool cancel_delayed_work_sync(struct delayed_work *dwork);

Wait for a work item to finish executing:

bool flush_work(struct work_struct *work);

Wait for a delayed work item to finish executing:

bool flush_delayed_work(struct delayed_work *dwork);
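
These interfaces are typically used on a teardown path. A hedged sketch (the device structure and its fields are assumptions):

#include <linux/workqueue.h>

struct my_device {
	struct work_struct reset_work;
	struct delayed_work poll_work;
};

static void my_device_remove(struct my_device *dev)
{
	/*
	 * Cancel any pending work and wait for an instance that is already
	 * running to finish, so the handlers cannot touch freed memory.
	 */
	cancel_work_sync(&dev->reset_work);
	cancel_delayed_work_sync(&dev->poll_work);
}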

2. How it works

Work queue terminology:

  • work: a work item
  • work queue: a collection of work items; the relationship between a work queue and its work items is one-to-many
  • worker: each worker corresponds to one kernel thread, which we call a worker thread
  • worker_pool: a pool of workers; the relationship between a worker pool and its workers is one-to-many
  • pool_workqueue: an intermediary that links a work queue to a worker pool; a work queue maps to pool_workqueue instances one-to-many, and each pool_workqueue maps to exactly one worker pool

(1) Data structures

There are two kinds of work queues:

1) Processor-bound work queues: this is the default; each worker thread is bound to one processor.

2) Unbound work queues: created by passing the WQ_UNBOUND flag; worker threads are not bound to a particular processor and may migrate between processors.

A work item holds the function that needs to be executed asynchronously. Its data type is work_struct, defined as follows:

include/linux/workqueue.h
struct work_struct {
	atomic_long_t data;		/* low bits hold flags; the rest identifies the pool_workqueue instance or worker pool the work item was last associated with */
	struct list_head entry;		/* links the work item into a list */
	work_func_t func;		/* function to execute asynchronously */
#ifdef CONFIG_LOCKDEP
	struct lockdep_map lockdep_map;
#endif
};

A delayed work item is the combination of a work item and a timer. Its data type is delayed_work, defined as follows:

include/linux/workqueue.h
struct delayed_work {
	struct work_struct work;		/* work item */
	struct timer_list timer;		/* timer */

	/* when the timer expires and queues the work item, it needs to know the target work queue and processor */
	struct workqueue_struct *wq;
	int cpu;
};

(2) Adding a work item

The function queue_work() adds a work item to a work queue; it delegates the real work to queue_work_on().

include/linux/workqueue.h
static inline bool queue_work(struct workqueue_struct *wq,
			      struct work_struct *work)
{
        /* the first argument "int cpu" is set to WORK_CPU_UNBOUND, meaning the work is not bound to any processor and the current processor is preferred */
	return queue_work_on(WORK_CPU_UNBOUND, wq, work);		
}

The code of queue_work_on() is as follows:

kernel/workqueue.c
bool queue_work_on(int cpu, struct workqueue_struct *wq,
		   struct work_struct *work)
{
	bool ret = false;
	unsigned long flags;

	local_irq_save(flags);

	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {		/* if the work item has not been queued yet, set its */
		__queue_work(cpu, wq, work);		/* WORK_STRUCT_PENDING_BIT flag and delegate the real work to __queue_work() */
		ret = true;
	}

	local_irq_restore(flags);
	return ret;
}

If the work item has not been queued yet, the WORK_STRUCT_PENDING_BIT flag is set on it and the real work is delegated to __queue_work().

The code of __queue_work() is as follows:

static void __queue_work(int cpu, struct workqueue_struct *wq,
			 struct work_struct *work)
{
	struct pool_workqueue *pwq;
	struct worker_pool *last_pool;
	struct list_head *worklist;
	unsigned int work_flags;
	unsigned int req_cpu = cpu;

	/*
	 * While a work item is PENDING && off queue, a task trying to
	 * steal the PENDING will busy-loop waiting for it to either get
	 * queued or lose PENDING.  Grabbing PENDING and queueing should
	 * happen with IRQ disabled.
	 */
	lockdep_assert_irqs_disabled();

	debug_work_activate(work);

	/* if draining, only works from the same workqueue are allowed */
	if (unlikely(wq->flags & __WQ_DRAINING) &&
	    WARN_ON_ONCE(!is_chained_work(wq)))
		return;
retry:
	if (req_cpu == WORK_CPU_UNBOUND)
		cpu = wq_select_unbound_cpu(raw_smp_processor_id());

	/* pwq which will be used unless @work is executing elsewhere */
	if (!(wq->flags & WQ_UNBOUND))		/* pick a pool_workqueue instance from the work queue: for a processor-bound work queue, */
		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);		/* use the current processor's pool_workqueue instance; for an unbound work queue, use the */
	else								/* pool_workqueue instance of the memory node the current processor belongs to */
		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));

	/*
	 * If @work was previously on a different pool, it might still be
	 * running there, in which case the work needs to be queued on that
	 * pool to guarantee non-reentrancy.
	 */
	last_pool = get_work_pool(work);
	if (last_pool && last_pool != pwq->pool) {		/* if the work item is still being executed by a worker of a different pool, queue it */
		struct worker *worker;		/* on that pool's pool_workqueue instance instead */

		spin_lock(&last_pool->lock);

		worker = find_worker_executing_work(last_pool, work);

		if (worker && worker->current_pwq->wq == wq) {
			pwq = worker->current_pwq;
		} else {
			/* meh... not running there, queue here */
			spin_unlock(&last_pool->lock);
			spin_lock(&pwq->pool->lock);
		}
	} else {
		spin_lock(&pwq->pool->lock);
	}

	/*
	 * pwq is determined and locked.  For unbound pools, we could have
	 * raced with pwq release and it could already be dead.  If its
	 * refcnt is zero, repeat pwq selection.  Note that pwqs never die
	 * without another pwq replacing it in the numa_pwq_tbl or while
	 * work items are executing on it, so the retrying is guaranteed to
	 * make forward-progress.
	 */
	if (unlikely(!pwq->refcnt)) {
		if (wq->flags & WQ_UNBOUND) {
			spin_unlock(&pwq->pool->lock);
			cpu_relax();
			goto retry;
		}
		/* oops */
		WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
			  wq->name, cpu);
	}

	/* pwq determined, queue */
	trace_workqueue_queue_work(req_cpu, pwq, work);

	if (WARN_ON(!list_empty(&work->entry))) {
		spin_unlock(&pwq->pool->lock);
		return;
	}

	pwq->nr_in_flight[pwq->work_color]++;
	work_flags = work_color_to_flags(pwq->work_color);

	if (likely(pwq->nr_active < pwq->max_active)) {		/* if the number of active work items of this pool_workqueue instance is below the limit, */
		trace_workqueue_activate_work(work);		/* add the work item to the worklist of the pool_workqueue's worker pool; otherwise set the */
		pwq->nr_active++;		/* WORK_STRUCT_DELAYED flag on the work item and add it to the pool_workqueue */
		worklist = &pwq->pool->worklist;		/* instance's delayed_works list */
		if (list_empty(worklist))
			pwq->pool->watchdog_ts = jiffies;
	} else {
		work_flags |= WORK_STRUCT_DELAYED;
		worklist = &pwq->delayed_works;
	}

	insert_work(pwq, work, worklist, work_flags);		/* add the work item to the list chosen above */

	spin_unlock(&pwq->pool->lock);
}

(3) Workers processing work

Each worker corresponds to one kernel thread, and each worker pool has one or more workers. The workers take work items from the worker pool's list of pending work (worker_pool.worklist) and process them.

The worker thread's main function is worker_thread(), which calls process_one_work() to process a single work item.

The code of worker_thread() is as follows:

static int worker_thread(void *__worker)
{
	struct worker *worker = __worker;
	struct worker_pool *pool = worker->pool;

	/* tell the scheduler that this is a workqueue worker */
	set_pf_worker(true);
woke_up:
	spin_lock_irq(&pool->lock);

	/* am I supposed to die? */
	if (unlikely(worker->flags & WORKER_DIE)) {		/* if there are too many workers and the pool wants to shrink, the current worker thread exits */
		spin_unlock_irq(&pool->lock);
		WARN_ON_ONCE(!list_empty(&worker->entry));
		set_pf_worker(false);

		set_task_comm(worker->task, "kworker/dying");
		ida_simple_remove(&pool->worker_ida, worker->id);
		worker_detach_from_pool(worker);
		kfree(worker);
		return 0;
	}

	worker_leave_idle(worker);		/* the worker leaves the idle state */
recheck:
	/* no more worker necessary? */
	if (!need_more_worker(pool))		/* if this worker is not needed to process work, it goes idle */
		goto sleep;

	/* do we need to manage? */
	if (unlikely(!may_start_working(pool)) && manage_workers(worker))		/* if the worker pool has no idle workers, */
		goto recheck;														/* create some spare workers */

	/*
	 * ->scheduled list can only be filled while a worker is
	 * preparing to process a work or actually processing it.
	 * Make sure nobody diddled with it while I was sleeping.
	 */
	WARN_ON_ONCE(!list_empty(&worker->scheduled));

	/*
	 * Finish PREP stage.  We're guaranteed to have at least one idle
	 * worker or that someone else has already assumed the manager
	 * role.  This is where @worker starts participating in concurrency
	 * management if applicable and concurrency management is restored
	 * after being rebound.  See rebind_workers() for details.
	 */
	worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

	do {
		struct work_struct *work =
			list_first_entry(&pool->worklist,
					 struct work_struct, entry);		/* take one work item from the worker pool's worklist */

		pool->watchdog_ts = jiffies;

		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {		/* for a normal work item, call process_one_work() to execute it, */
			/* optimization path, not strictly necessary */		/* then execute the special work items on the worker's scheduled list */
			process_one_work(worker, work);
			if (unlikely(!list_empty(&worker->scheduled)))
				process_scheduled_works(worker);
		} else {		/* for a special work item, first move it to the tail of the worker's scheduled list, */
			move_linked_works(work, &worker->scheduled, NULL);		/* then execute the special work items on that list */
			process_scheduled_works(worker);
		}
	} while (keep_working(pool));		/* keep going as long as there is work to process and at most one worker in the pool is running */

	worker_set_flags(worker, WORKER_PREP);
sleep:
	/*
	 * pool->lock is held and there's no work to process and no need to
	 * manage, sleep.  Workers are woken up only while holding
	 * pool->lock or from local cpu, so setting the current state
	 * before releasing pool->lock is enough to prevent losing any
	 * event.
	 */
	worker_enter_idle(worker);		/* the worker enters the idle state and sleeps */
	__set_current_state(TASK_IDLE);
	spin_unlock_irq(&pool->lock);
	schedule();
	goto woke_up;
}

Here is an explanation of normal and special work items.

Adding a normal work item to a work queue means adding it directly to the worker pool's worklist.

Calling flush_work(t) to wait for work item t to finish is implemented by adding a special work item, a barrier work item: once the barrier work item has executed, work item t is guaranteed to have finished. If work item t is currently being executed by worker p, the barrier work item is added directly to worker p's scheduled list; if work item t has not started executing, the barrier work item is inserted after t on the worker pool's worklist and the WORK_STRUCT_LINKED flag is set on t.
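
At the usage level (a sketch with assumed names), a caller that must be sure a previously queued work item has finished, for example before freeing the structure that contains it, waits on it with flush_work():

#include <linux/workqueue.h>
#include <linux/slab.h>

struct my_ctx {
	struct work_struct work;
	/* ... other fields used by the handler ... */
};

static void my_ctx_release(struct my_ctx *ctx)
{
	/*
	 * If ctx->work is pending or running, flush_work() inserts a barrier
	 * work item behind it and sleeps until the barrier executes, i.e.
	 * until ctx->work has finished.
	 */
	flush_work(&ctx->work);
	kfree(ctx);
}

If the handler can requeue itself, cancel_work_sync() is usually a safer choice than flush_work() before freeing.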

The function process_one_work() processes one work item. Its code is as follows:

static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
	struct pool_workqueue *pwq = get_work_pwq(work);
	struct worker_pool *pool = worker->pool;
	bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
	int work_color;
	struct worker *collision;
#ifdef CONFIG_LOCKDEP
	/*
	 * It is permissible to free the struct work_struct from
	 * inside the function that is called from it, this we need to
	 * take into account for lockdep too.  To avoid bogus "held
	 * lock freed" warnings as well as problems when looking into
	 * work->lockdep_map, make a copy and use that here.
	 */
	struct lockdep_map lockdep_map;

	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
	/* ensure we're on the correct CPU */
	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
		     raw_smp_processor_id() != pool->cpu);

	/*
	 * A single work shouldn't be executed concurrently by
	 * multiple workers on a single cpu.  Check whether anyone is
	 * already processing the work.  If so, defer the work to the
	 * currently executing one.
	 */
	collision = find_worker_executing_work(pool, work);		/* a work item must not be executed concurrently by multiple workers; if it is already */
	if (unlikely(collision)) {		/* being executed by another worker of this pool, move it to that worker's scheduled list to run later */
		move_linked_works(work, &collision->scheduled, NULL);
		return;
	}

	/* claim and dequeue */
	debug_work_deactivate(work);
	hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);		/* add the worker to the worker pool's busy_hash hash table */
	worker->current_work = work;		/* the worker's current_work points to the current work item, */
	worker->current_func = work->func;		/* current_func points to the current work item's handler function, */
	worker->current_pwq = pwq;		/* and current_pwq points to the current pool_workqueue instance */
	work_color = get_work_color(work);

	/*
	 * Record wq name for cmdline and debug reporting, may get
	 * overridden through set_worker_desc().
	 */
	strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);

	list_del_init(&work->entry);

	/*
	 * CPU intensive works don't participate in concurrency management.
	 * They're the scheduler's responsibility.  This takes @worker out
	 * of concurrency management and the next code block will chain
	 * execution of the pending work items.
	 */
	if (unlikely(cpu_intensive))		/* if the work queue is CPU intensive, set the WORKER_CPU_INTENSIVE flag on the worker; */
		worker_set_flags(worker, WORKER_CPU_INTENSIVE);		/* the worker is then excluded from the worker pool's concurrency management */

	/*
	 * Wake up another worker if necessary.  The condition is always
	 * false for normal per-cpu workers since nr_running would always
	 * be >= 1 at this point.  This is used to chain execution of the
	 * pending work items for WORKER_NOT_RUNNING workers such as the
	 * UNBOUND and CPU_INTENSIVE ones.
	 */
	if (need_more_worker(pool))		/* for unbound or CPU-intensive work queues, wake up more idle workers to process work */
		wake_up_worker(pool);

	/*
	 * Record the last pool and clear PENDING which should be the last
	 * update to @work.  Also, do this inside @pool->lock so that
	 * PENDING and queued state changes happen together while IRQ is
	 * disabled.
	 */
	set_work_pool_and_clear_pending(work, pool->id);

	spin_unlock_irq(&pool->lock);

	lock_map_acquire(&pwq->wq->lockdep_map);
	lock_map_acquire(&lockdep_map);
	/*
	 * Strictly speaking we should mark the invariant state without holding
	 * any locks, that is, before these two lock_map_acquire()'s.
	 *
	 * However, that would result in:
	 *
	 *   A(W1)
	 *   WFC(C)
	 *		A(W1)
	 *		C(C)
	 *
	 * Which would create W1->C->W1 dependencies, even though there is no
	 * actual deadlock possible. There are two solutions, using a
	 * read-recursive acquire on the work(queue) 'locks', but this will then
	 * hit the lockdep limitation on recursive locks, or simply discard
	 * these locks.
	 *
	 * AFAICT there is no possible deadlock scenario between the
	 * flush_work() and complete() primitives (except for single-threaded
	 * workqueues), so hiding them isn't a problem.
	 */
	lockdep_invariant_state(true);
	trace_workqueue_execute_start(work);
	worker->current_func(work);		/* execute the work item's handler function */
	/*
	 * While we must be careful to not use "work" after this, the trace
	 * point will only record its address.
	 */
	trace_workqueue_execute_end(work);
	lock_map_release(&lockdep_map);
	lock_map_release(&pwq->wq->lockdep_map);

	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
		       "     last function: %pf\n",
		       current->comm, preempt_count(), task_pid_nr(current),
		       worker->current_func);
		debug_show_held_locks(current);
		dump_stack();
	}

	/*
	 * The following prevents a kworker from hogging CPU on !PREEMPT
	 * kernels, where a requeueing work item waiting for something to
	 * happen could deadlock with stop_machine as such work item could
	 * indefinitely requeue itself while all other CPUs are trapped in
	 * stop_machine. At the same time, report a quiescent RCU state so
	 * the same condition doesn't freeze RCU.
	 */
	cond_resched();

	spin_lock_irq(&pool->lock);

	/* clear cpu intensive status */
	if (unlikely(cpu_intensive))
		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

	/* we're done with it, release */
	hash_del(&worker->hentry);
	worker->current_work = NULL;
	worker->current_func = NULL;
	worker->current_pwq = NULL;
	pwq_dec_nr_in_flight(pwq, work_color);
}

(4) Dynamic worker management in the worker pool

A worker pool can add and remove workers dynamically. The algorithm is as follows:

1) A worker is in one of three states: idle, running, or suspended. Idle means it is not executing work, running means it is executing work, and suspended means it went to sleep in the middle of executing work.

2) If the worker pool has work to process, it keeps at least one worker in the running state to process it.

3) If a running worker enters the suspended state while executing work, an idle worker is woken up to process work so that other work items are not held up.

4) If there is work to execute and more than one worker is in the running state, the surplus workers are put into the idle state.

5) If there is no work to execute, all workers are put into the idle state.

6) If too many workers have been created, the worker pool removes workers that have been idle for more than 300 seconds (IDLE_WORKER_TIMEOUT).

To track when workers run and suspend and to adjust the number of workers dynamically, the work queue mechanism hooks into the process scheduler:

1) Tracking a worker going from suspended to running. When a worker thread is woken up, if it is in the middle of executing work (it is not marked WORKER_NOT_RUNNING), the count of running workers in the worker pool (nr_running) is incremented. The relevant code is:

try_to_wake_up()  ->  ttwu_queue()  ->  ttwu_do_activate()  ->  ttwu_activate()  ->  wq_worker_waking_up()

kernel/workqueue.c
void wq_worker_waking_up(struct task_struct *task, int cpu)
{
	struct worker *worker = kthread_data(task);

	if (!(worker->flags & WORKER_NOT_RUNNING)) {
		WARN_ON_ONCE(worker->pool->cpu != cpu);
		atomic_inc(&worker->pool->nr_running);
	}
}

2) Tracking a worker going from running to suspended. When a worker goes to sleep, if no worker in the pool is in the running state and the pool still has work to execute, an idle worker is woken up. The relevant code is:

__schedule()  ->  wq_worker_sleeping()

kernel/workqueue.c
struct task_struct *wq_worker_sleeping(struct task_struct *task)
{
	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
	struct worker_pool *pool;

	/*
	 * Rescuers, which may not have all the fields set up like normal
	 * workers, also reach here, let's not access anything before
	 * checking NOT_RUNNING.
	 */
	if (worker->flags & WORKER_NOT_RUNNING)
		return NULL;

	pool = worker->pool;

	/* this can only happen on the local cpu */
	if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
		return NULL;

	/*
	 * The counterpart of the following dec_and_test, implied mb,
	 * worklist not empty test sequence is in insert_work().
	 * Please read comment there.
	 *
	 * NOT_RUNNING is clear.  This means that we're bound to and
	 * running on the local cpu w/ rq lock held and preemption
	 * disabled, which in turn means that none else could be
	 * manipulating idle_list, so dereferencing idle_list without pool
	 * lock is safe.
	 */
	if (atomic_dec_and_test(&pool->nr_running) &&
	    !list_empty(&pool->worklist))
		to_wakeup = first_idle_worker(pool);
	return to_wakeup ? to_wakeup->task : NULL;
}

The worker pool's scheduling idea is: if there is work to process, keep one worker in the running state to process it, no more and no fewer.

This approach has a problem: if the work is CPU intensive, the worker never enters the suspended state, yet it occupies the processor for a long time, so subsequent work items are blocked for too long.

To solve this, set the WQ_CPU_INTENSIVE flag when creating the work queue to declare it CPU intensive. While a worker is executing such work, it is excluded from the worker pool's concurrency management, as if it had entered the suspended state, and the worker pool creates new workers to execute subsequent work.
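
For example (a sketch; the queue name is an assumption), a driver that queues long CPU-bound computations can create its work queue with this flag so that one such work item does not stall the per-processor worker pool:

#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *crunch_wq;

static int crunch_setup(void)
{
	/*
	 * Work items on this queue do not take part in concurrency management:
	 * while one of them runs, the pool can still start other workers.
	 */
	crunch_wq = alloc_workqueue("crunch_wq", WQ_CPU_INTENSIVE, 0);
	return crunch_wq ? 0 : -ENOMEM;
}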

The worker thread's special handling of CPU-intensive work is as follows:

worker_thread()  ->  process_one_work()

kernel/workqueue.c
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
    ...
	bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
	
    ...
	if (unlikely(cpu_intensive))		/* if the work queue is CPU intensive, set the WORKER_CPU_INTENSIVE flag on the worker; */
		worker_set_flags(worker, WORKER_CPU_INTENSIVE);		/* the worker is then excluded from the worker pool's concurrency management */

	if (need_more_worker(pool))		/* for unbound or CPU-intensive work queues, wake up more idle workers to process work */
		wake_up_worker(pool);

	...
	worker->current_func(work);		/* execute the work item's handler function */
    ...
	
	/* clear cpu intensive status */
	if (unlikely(cpu_intensive))
		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

	...
}

enum
{
    ...
    WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,
    ...
};

static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;

	WARN_ON_ONCE(worker->task != current);

	/* If transitioning into NOT_RUNNING, adjust nr_running. */
	if ((flags & WORKER_NOT_RUNNING) &&
	    !(worker->flags & WORKER_NOT_RUNNING)) {
		atomic_dec(&pool->nr_running);
	}

	worker->flags |= flags;
}

static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;
	unsigned int oflags = worker->flags;

	WARN_ON_ONCE(worker->task != current);

	worker->flags &= ~flags;

	/*
	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
	 * of multiple flags, not a single flag.
	 */
	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
		if (!(worker->flags & WORKER_NOT_RUNNING))
			atomic_inc(&pool->nr_running);
}

As you can see, when the WORKER_CPU_INTENSIVE flag is set on a worker, the worker pool's nr_running count is decremented, which is equivalent to the worker entering the suspended state.

Reposted from blog.csdn.net/linuxweiyh/article/details/106881658