Linux Workqueue: Queuing Work and How It Gets Processed

I. Preface
This article covers two topics:

1. How a work item gets queued onto a workqueue

2. How work items queued on a workqueue get processed

II. Queuing a work item onto a workqueue

Let's start with a few commonly used interfaces:

1. Queue a work item on a workqueue without binding it to a particular CPU

/**
 * queue_work - queue work on a workqueue
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Returns %false if @work was already on a queue, %true otherwise.
 *
 * We queue the work to the CPU on which it was submitted, but if the CPU dies
 * it can be processed by another CPU.
 */
static inline bool queue_work(struct workqueue_struct *wq,
                  struct work_struct *work)
{
    return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}
2. Queue a work item on a workqueue after a delay, without binding it to a particular CPU

/**
 * queue_delayed_work - queue work on a workqueue after delay
 * @wq: workqueue to use
 * @dwork: delayable work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
 */
static inline bool queue_delayed_work(struct workqueue_struct *wq,
                      struct delayed_work *dwork,
                      unsigned long delay)
{
    return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}
The delay here is implemented with a kernel timer: the timer's handler is what actually queues the work onto the workqueue, as analyzed in an earlier article.
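As a quick usage sketch (the mydrv_* names below are made up for illustration and are not from this article), a caller typically defines its work items once and then queues them with the interfaces above:

#include <linux/workqueue.h>

/* illustrative names only */
static void mydrv_work_fn(struct work_struct *work)
{
    pr_info("mydrv: work executed\n");
}

static DECLARE_WORK(mydrv_work, mydrv_work_fn);
static DECLARE_DELAYED_WORK(mydrv_dwork, mydrv_work_fn);

static void mydrv_kick(struct workqueue_struct *wq)
{
    /* runs as soon as a worker picks it up */
    queue_work(wq, &mydrv_work);
    /* queued by the timer handler roughly one second from now */
    queue_delayed_work(wq, &mydrv_dwork, msecs_to_jiffies(1000));
}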

Below we start with the queue_work_on function.

1. The queue_work_on function

A module that uses the workqueue mechanism calls queue_work_on (there are other variants of this interface, skipped here, but the idea is the same) to queue a defined work item onto a workqueue. The code is as follows:

/**
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @work: work to queue
 *
 * We queue the work to a specific CPU, the caller must ensure it
 * can't go away.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_on(int cpu, struct workqueue_struct *wq,
           struct work_struct *work)
{
    bool ret = false;
    unsigned long flags;
 
    local_irq_save(flags);    // queuing the work onto the workqueue runs with local interrupts disabled
 
    if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
        __queue_work(cpu, wq, work);        // add to the work list and notify the worker thread pool
        ret = true;
    }
 
    local_irq_restore(flags);    // re-enable local interrupts
    return ret;
}
The WORK_STRUCT_PENDING_BIT bit in the data member of work_struct marks whether the work is pending or already being processed; a work that is already pending is only queued once. Most of the logic lives in __queue_work, and the following subsections walk through that function.
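For reference, struct work_struct itself is tiny; the data member mentioned above packs the flag bits (including WORK_STRUCT_PENDING_BIT, which is bit 0) together with information identifying the pwq/pool the work was last queued on (roughly as defined in include/linux/workqueue.h):

struct work_struct {
    atomic_long_t data;             /* flags + last pwq/pool information */
    struct list_head entry;         /* node on worklist / delayed_works / scheduled */
    work_func_t func;               /* callback executed by a worker */
#ifdef CONFIG_LOCKDEP
    struct lockdep_map lockdep_map;
#endif
};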

 
static void __queue_work(int cpu, struct workqueue_struct *wq,
             struct work_struct *work)
{
    struct pool_workqueue *pwq;
    struct worker_pool *last_pool;
    struct list_head *worklist;
    unsigned int work_flags;
    unsigned int req_cpu = cpu;
 
    /*
     * While a work item is PENDING && off queue, a task trying to
     * steal the PENDING will busy-loop waiting for it to either get
     * queued or lose PENDING.  Grabbing PENDING and queueing should
     * happen with IRQ disabled.
     */
    lockdep_assert_irqs_disabled();
 
    debug_work_activate(work);
 
    /* if draining, only works from the same workqueue are allowed */
    if (unlikely(wq->flags & __WQ_DRAINING) &&
        WARN_ON_ONCE(!is_chained_work(wq)))
        return;
retry:
    if (req_cpu == WORK_CPU_UNBOUND)
        cpu = wq_select_unbound_cpu(raw_smp_processor_id());
 
    /* pwq which will be used unless @work is executing elsewhere */
    if (!(wq->flags & WQ_UNBOUND))
        pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
    else
        pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
 
    /*
     * If @work was previously on a different pool, it might still be
     * running there, in which case the work needs to be queued on that
     * pool to guarantee non-reentrancy.
     */
    last_pool = get_work_pool(work);
    if (last_pool && last_pool != pwq->pool) {
        struct worker *worker;
 
        spin_lock(&last_pool->lock);
 
        worker = find_worker_executing_work(last_pool, work);
 
        if (worker && worker->current_pwq->wq == wq) {
            pwq = worker->current_pwq;
        } else {
            /* meh... not running there, queue here */
            spin_unlock(&last_pool->lock);
            spin_lock(&pwq->pool->lock);
        }
    } else {
        spin_lock(&pwq->pool->lock);
    }
 
    /*
     * pwq is determined and locked.  For unbound pools, we could have
     * raced with pwq release and it could already be dead.  If its
     * refcnt is zero, repeat pwq selection.  Note that pwqs never die
     * without another pwq replacing it in the numa_pwq_tbl or while
     * work items are executing on it, so the retrying is guaranteed to
     * make forward-progress.
     */
    if (unlikely(!pwq->refcnt)) {
        if (wq->flags & WQ_UNBOUND) {
            spin_unlock(&pwq->pool->lock);
            cpu_relax();
            goto retry;
        }
        /* oops */
        WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
              wq->name, cpu);
    }
 
    /* pwq determined, queue */
    trace_workqueue_queue_work(req_cpu, pwq, work);
 
    if (WARN_ON(!list_empty(&work->entry))) {
        spin_unlock(&pwq->pool->lock);
        return;
    }
 
    pwq->nr_in_flight[pwq->work_color]++;
    work_flags = work_color_to_flags(pwq->work_color);
 
    if (likely(pwq->nr_active < pwq->max_active)) {
        trace_workqueue_activate_work(work);
        pwq->nr_active++;
        worklist = &pwq->pool->worklist;
        if (list_empty(worklist))
            pwq->pool->watchdog_ts = jiffies;
    } else {
        work_flags |= WORK_STRUCT_DELAYED;
        worklist = &pwq->delayed_works;
    }
 
    insert_work(pwq, work, worklist, work_flags);
 
    spin_unlock(&pwq->pool->lock);
}
2. The __WQ_DRAINING flag

At the very beginning, __queue_work checks the __WQ_DRAINING flag:

    /* if draining, only works from the same workqueue are allowed */
    if (unlikely(wq->flags & __WQ_DRAINING) &&
        WARN_ON_ONCE(!is_chained_work(wq)))
        return;
The __WQ_DRAINING flag means the workqueue is being drained, which usually happens when the workqueue is being destroyed: before it can go away, every work item already queued on it must be processed. So what if new work keeps being queued while we are trying to empty the workqueue? In general, new work is not allowed in, since the whole point is to drain the workqueue. There is one exception (detected by is_chained_work): a work item that is being drained (and belongs to this workqueue) queues another work item on the same workqueue (a so-called chained work); such work is still allowed in.
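is_chained_work() itself just checks whether the current task is a worker that is executing a work item of this very workqueue; a rough sketch of it (paraphrased from kernel/workqueue.c):

static bool is_chained_work(struct workqueue_struct *wq)
{
    struct worker *worker;

    worker = current_wq_worker();
    /*
     * Return %true iff the current task is a worker executing a work
     * item of @wq; queueing from such a context is the chained case.
     */
    return worker && worker->current_pwq->wq == wq;
}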

3. Selecting the pool_workqueue

retry:
    if (req_cpu == WORK_CPU_UNBOUND)        // did the caller ask for a specific cpu?
        cpu = wq_select_unbound_cpu(raw_smp_processor_id());
 
    /* pwq which will be used unless @work is executing elsewhere */
    if (!(wq->flags & WQ_UNBOUND))
        pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);    // for a bound workqueue, use the pool_workqueue of the chosen CPU directly
    else
        pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));   // for an unbound workqueue, use unbound_pwq_by_node() to find the unbound pool_workqueue of the local NUMA node
WORK_CPU_UNBOUND means no CPU was specified, in which case the CPU currently running this code is chosen. Once the CPU is determined, a non-unbound workqueue simply uses its per-CPU pool_workqueue. An unbound workqueue selects by NUMA node id instead; cpu_to_node maps a cpu id to a node id. Note that the pool_workqueue selected here is only a candidate: it may or may not end up being used and can still be replaced, as described in the next section.

4. Selecting the worker thread pool

Rather than saying a work item is queued onto a workqueue, it is more accurate to say it is queued onto a worker thread pool, since the thread pool is what actually processes the work. A pool_workqueue has an associated worker thread pool (the pool member of struct pool_workqueue), so choosing a pool_workqueue appears to choose the worker pool as well. However, the worker pool behind the pool_workqueue selected above is not necessarily the right one for this work item: the work may still be executing on some other worker thread, and in that case, to guarantee that its callback does not become reentrant, the work is better queued onto the pool where it is currently running. The code is as follows:

 
    /*
     * If @work was previously on a different pool, it might still be
     * running there, in which case the work needs to be queued on that
     * pool to guarantee non-reentrancy.
     */
    last_pool = get_work_pool(work);        // use the data member of work_struct to find which worker_pool this work last ran on
    if (last_pool && last_pool != pwq->pool) {       // the pool it last ran on differs from the one chosen above
        struct worker *worker;
 
        spin_lock(&last_pool->lock);
 
        worker = find_worker_executing_work(last_pool, work);    // is the work currently running on last_pool (i.e. not on the pool chosen above)? if so, return the worker executing it
 
        if (worker && worker->current_pwq->wq == wq) {
            pwq = worker->current_pwq;                // keep using the pool_workqueue the work is already executing on (guarantees non-reentrancy, also benefits from cache hotness)
        } else {
            /* meh... not running there, queue here */
            spin_unlock(&last_pool->lock);
            spin_lock(&pwq->pool->lock);
        }
    } else {
        spin_lock(&pwq->pool->lock);
    }
last_pool records which worker pool last processed this work. If last_pool is exactly the worker pool behind the chosen pool_workqueue, everything is fine; otherwise last_pool should be used instead. The last_pool case is a bit more involved because the matching pool_workqueue has to be derived from the last worker pool. find_worker_executing_work finds the worker thread that is currently executing this work; if none is found, the pool_workqueue selected in section 3 is kept, otherwise the pool_workqueue that worker is currently using is chosen (which in effect chooses that thread pool).
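find_worker_executing_work() looks the work up in the pool's busy hash, keyed by the work pointer, and also compares the callback so that a freed-and-reallocated work_struct at the same address is not mistaken for the running one; it looks roughly like this:

static struct worker *find_worker_executing_work(struct worker_pool *pool,
                         struct work_struct *work)
{
    struct worker *worker;

    hash_for_each_possible(pool->busy_hash, worker, hentry,
                   (unsigned long)work)
        if (worker->current_work == work &&
            worker->current_func == work->func)
            return worker;

    return NULL;
}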

5. Making sure the pwq is still valid

    /*
     * pwq is determined and locked.  For unbound pools, we could have
     * raced with pwq release and it could already be dead.  If its
     * refcnt is zero, repeat pwq selection.  Note that pwqs never die
     * without another pwq replacing it in the numa_pwq_tbl or while
     * work items are executing on it, so the retrying is guaranteed to
     * make forward-progress.
     */
    if (unlikely(!pwq->refcnt)) {
        if (wq->flags & WQ_UNBOUND) {
            spin_unlock(&pwq->pool->lock);
            cpu_relax();    // unbound pool_workqueues are released asynchronously; refcnt == 0 means this one has already been released, so jump back to retry and select a pool_workqueue again
            goto retry;
        }
        /* oops */
        /* for per-cpu workqueues the pwqs are never released (each cpu's two worker pools are static), so refcnt should never be 0; if it is, something has gone badly wrong */
        WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
              wq->name, cpu);
    }
 

/* initialize newly alloced @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
             struct worker_pool *pool)
{
    BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
 
    memset(pwq, 0, sizeof(*pwq));
 
    pwq->pool = pool;
    pwq->wq = wq;
    pwq->flush_color = -1;
    pwq->refcnt = 1;        // a freshly allocated pwq starts with a refcnt of 1
    INIT_LIST_HEAD(&pwq->delayed_works);
    INIT_LIST_HEAD(&pwq->pwqs_node);
    INIT_LIST_HEAD(&pwq->mayday_node);
    INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
}
 

When a work item is added to a pwq, the pwq's reference count is incremented:

 
/**
 * insert_work - insert a work into a pool
 * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
 * work_struct flags.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock).
 */
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
            struct list_head *head, unsigned int extra_flags)
{
    struct worker_pool *pool = pwq->pool;
 
    /* we own @work, set data and link */
    set_work_pwq(work, pwq, extra_flags);
    list_add_tail(&work->entry, head);
    get_pwq(pwq);
 
    /*
     * Ensure either wq_worker_sleeping() sees the above
     * list_add_tail() or we see zero nr_running to avoid workers lying
     * around lazily while there are works to be processed.
     */
    smp_mb();
 
    if (__need_more_worker(pool))
        wake_up_worker(pool);
}
 
/**
 * get_pwq - get an extra reference on the specified pool_workqueue
 * @pwq: pool_workqueue to get
 *
 * Obtain an extra reference on @pwq.  The caller should guarantee that
 * @pwq has positive refcnt and be holding the matching pool->lock.
 */
static void get_pwq(struct pool_workqueue *pwq)
{
    lockdep_assert_held(&pwq->pool->lock);
    WARN_ON_ONCE(pwq->refcnt <= 0);
    pwq->refcnt++;
}
 
 

After a work item has been processed and removed from its list, the reference count is dropped by one:

/**
 * put_pwq - put a pool_workqueue reference
 * @pwq: pool_workqueue to put
 *
 * Drop a reference of @pwq.  If its refcnt reaches zero, schedule its
 * destruction.  The caller should be holding the matching pool->lock.
 */
static void put_pwq(struct pool_workqueue *pwq)
{
    lockdep_assert_held(&pwq->pool->lock);
    if (likely(--pwq->refcnt))            // drop the reference
        return;
    if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
        return;
    /*
     * @pwq can't be released under pool->lock, bounce to
     * pwq_unbound_release_workfn().  This never recurses on the same
     * pool->lock as this path is taken only for unbound workqueues and
     * the release work item is scheduled on a per-cpu workqueue.  To
     * avoid lockdep warning, unbound pool->locks are given lockdep
     * subclass of 1 in get_unbound_pool().
     */
    schedule_work(&pwq->unbound_release_work);
}
 

6. Choosing which list the work goes on

There are two lists: the deferred list (pwq->delayed_works) and the list the thread pool actually processes (pwq->pool->worklist). Being put on the pool's worklist means the work becomes active and the thread pool starts processing it right away; being put on the deferred list means the work stays pending:

    /* pwq determined, queue */
    trace_workqueue_queue_work(req_cpu, pwq, work);
 
    if (WARN_ON(!list_empty(&work->entry))) {
        spin_unlock(&pwq->pool->lock);
        return;
    }
 
    pwq->nr_in_flight[pwq->work_color]++;
    work_flags = work_color_to_flags(pwq->work_color);
 
    if (likely(pwq->nr_active < pwq->max_active)) {        // check the number of active works on this pool_workqueue: if below the limit, add to worker_pool->worklist, otherwise to the delayed_works list
        trace_workqueue_activate_work(work);
        pwq->nr_active++;
        worklist = &pwq->pool->worklist;
        if (list_empty(worklist))
            pwq->pool->watchdog_ts = jiffies;
    } else {
        work_flags |= WORK_STRUCT_DELAYED;
        worklist = &pwq->delayed_works;
    }
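max_active is fixed when the workqueue is created (the third argument of alloc_workqueue; 0 selects the default). As a small illustration (the names here are made up, not from the article), a workqueue created with max_active = 1 lets only one work item be active at a time, so a second item queued while the first is still in flight lands on pwq->delayed_works:

/* illustrative only */
static void my_work_fn(struct work_struct *work) { /* ... */ }
static DECLARE_WORK(my_work_a, my_work_fn);
static DECLARE_WORK(my_work_b, my_work_fn);

static struct workqueue_struct *my_serial_wq;

static int my_setup(void)
{
    my_serial_wq = alloc_workqueue("my_serial_wq", 0, 1);   /* max_active = 1 */
    if (!my_serial_wq)
        return -ENOMEM;

    queue_work(my_serial_wq, &my_work_a);   /* nr_active 0 -> 1, goes on pool->worklist */
    queue_work(my_serial_wq, &my_work_b);   /* if a is still in flight, nr_active is already 1, so b is parked on delayed_works */
    return 0;
}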
 
 

7. Adding the work to the tail of the chosen list

 
    insert_work(pwq, work, worklist, work_flags);
 
    spin_unlock(&pwq->pool->lock);
 
/**
 * insert_work - insert a work into a pool
 * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
 * work_struct flags.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock).
 */
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
            struct list_head *head, unsigned int extra_flags)
{
    struct worker_pool *pool = pwq->pool;
 
    /* we own @work, set data and link   */
    set_work_pwq(work, pwq, extra_flags);     // store the pool_workqueue pointer and flags in the data member for later lookups
    list_add_tail(&work->entry, head);        // add to the tail of the chosen list
    get_pwq(pwq);                             // every insertion takes a reference on the pwq
 
    /*
     * Ensure either wq_worker_sleeping() sees the above
     * list_add_tail() or we see zero nr_running to avoid workers lying
     * around lazily while there are works to be processed.
     */
    // make sure the list_add_tail() above is visible by the time a worker is woken (and by the time __schedule()->wq_worker_sleeping() runs), and before __need_more_worker() below reads nr_running
    smp_mb();      
 
    if (__need_more_worker(pool))        // nr_running == 0 means no worker of this pool is currently running, so force a wake_up_worker()
        wake_up_worker(pool);
}
 
III. Initialization of the system workqueues


 

IV. workqueue
A workqueue is simply a collection of work items. Workqueues fall broadly into two categories: those created by the system and those created by users.

Whether a workqueue belongs to the system or to a user, it is bound to the normal (per-CPU) worker_pools by default unless WQ_UNBOUND is specified.

1. System workqueues

During initialization the system creates a batch of default workqueues: system_wq, system_highpri_wq, system_long_wq, system_unbound_wq, system_freezable_wq, system_power_efficient_wq, and system_freezable_power_efficient_wq.

 
/**
 * workqueue_init_early - early init for workqueue subsystem
 *
 * This is the first half of two-staged workqueue subsystem initialization
 * and invoked as soon as the bare basics - memory allocation, cpumasks and
 * idr are up.  It sets up all the data structures and system workqueues
 * and allows early boot code to create workqueues and queue/cancel work
 * items.  Actual work item execution starts only after kthreads can be
 * created and scheduled right before early initcalls.
 */
int __init workqueue_init_early(void)
{
    int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };    // two pools per cpu, normal and high priority; HIGHPRI_NICE_LEVEL is -20, i.e. prio 100, the highest priority of a normal task
    int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
    int i, cpu;
 
    WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
    BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
    cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
 
    pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
    /* initialize CPU pools: two worker pools per cpu */
    for_each_possible_cpu(cpu) {        
        struct worker_pool *pool;

        i = 0;
        for_each_cpu_worker_pool(pool, cpu) {    // two worker_pools per CPU, the per-cpu cpu_worker_pools[0] and cpu_worker_pools[1]
            BUG_ON(init_worker_pool(pool));        // initialize the worker_pool (thread pool)
            pool->cpu = cpu;                       // bind to this cpu
            cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
            pool->attrs->nice = std_nice[i++];     // set the nice value
            pool->node = cpu_to_node(cpu);        // per-cpu pools use the local (near) memory node
 
            /* alloc pool ID */
            mutex_lock(&wq_pool_mutex);
            BUG_ON(worker_pool_assign_id(pool));
            mutex_unlock(&wq_pool_mutex);
        }
    }
 
    /* create default unbound and ordered wq attrs */
    for (i = 0; i < NR_STD_WORKER_POOLS; i++) {    // the default unbound and ordered workqueues also get two attrs each (normal and high priority)
        struct workqueue_attrs *attrs;
 
        BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
        attrs->nice = std_nice[i];
        unbound_std_wq_attrs[i] = attrs;        // attributes for unbound workqueues
 
        /*
         * An ordered wq should have only one pwq as ordering is
         * guaranteed by max_active which is enforced by pwqs.
         * Turn off NUMA so that dfl_pwq is used for all nodes.
         */
        BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
        attrs->nice = std_nice[i];
        attrs->no_numa = true;
        ordered_wq_attrs[i] = attrs;    // attributes for ordered workqueues; an ordered workqueue runs only one work item at a time
    }
 
    system_wq = alloc_workqueue("events", 0, 0);            // normal-priority bound workqueue system_wq
    system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);    // high-priority bound workqueue system_highpri_wq
    system_long_wq = alloc_workqueue("events_long", 0, 0);            // for work that may take a relatively long time
    system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,    // normal-priority unbound workqueue
                        WQ_UNBOUND_MAX_ACTIVE);
    system_freezable_wq = alloc_workqueue("events_freezable",        // freezable workqueue system_freezable_wq (its workers can be frozen during system suspend)
                          WQ_FREEZABLE, 0);
    system_power_efficient_wq = alloc_workqueue("events_power_efficient",    // power-efficient workqueue
                          WQ_POWER_EFFICIENT, 0);
    system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",   // freezable and power-efficient workqueue
                          WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                          0);
    BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
           !system_unbound_wq || !system_freezable_wq ||
           !system_power_efficient_wq ||
           !system_freezable_power_efficient_wq);
 
    return 0;
}
 

Here we can see that the per-cpu worker pools are initialized in place. As mentioned in earlier articles, the per-cpu worker pools are statically defined, so they can be initialized directly without any dynamic allocation.

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
 

The following helpers all use the default system_wq:

 
/**
 * schedule_work_on - put work task on a specific cpu
 * @cpu: cpu to put the work task on
 * @work: job to be done
 *
 * This puts a job on a specific cpu
 */
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
    return queue_work_on(cpu, system_wq, work);
}
 
/**
 * schedule_work - put work task in global workqueue
 * @work: job to be done
 *
 * Returns %false if @work was already on the kernel-global workqueue and
 * %true otherwise.
 *
 * This puts a job in the kernel-global workqueue if it was not already
 * queued and leaves it in the same position on the kernel-global
 * workqueue otherwise.
 */
static inline bool schedule_work(struct work_struct *work)
{
    return queue_work(system_wq, work);
}
 
 
/**
 * schedule_delayed_work - put work task in global workqueue after delay
 * @dwork: job to be done
 * @delay: number of jiffies to wait or 0 for immediate execution
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue.
 */
static inline bool schedule_delayed_work(struct delayed_work *dwork,
                     unsigned long delay)
{
    return queue_delayed_work(system_wq, dwork, delay);
}
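As a small usage sketch of these wrappers (names made up for illustration), together with the companion calls normally used at teardown, flush_work() and cancel_delayed_work_sync():

/* illustrative only */
static void mydrv_event_fn(struct work_struct *work) { /* ... */ }
static DECLARE_WORK(mydrv_event, mydrv_event_fn);
static DECLARE_DELAYED_WORK(mydrv_poll, mydrv_event_fn);

static void mydrv_notify(void)
{
    schedule_work(&mydrv_event);                               /* queue_work() on system_wq */
    schedule_delayed_work(&mydrv_poll, msecs_to_jiffies(100)); /* on system_wq, after ~100 ms */
}

static void mydrv_teardown(void)
{
    flush_work(&mydrv_event);               /* wait for it to finish if queued or running */
    cancel_delayed_work_sync(&mydrv_poll);  /* cancel, waiting if it is currently executing */
}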
 

V. Worker pool initialization
 
/**
 * init_worker_pool - initialize a newly zalloc'd worker_pool
 * @pool: worker_pool to initialize
 *
 * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
 *
 * Return: 0 on success, -errno on failure.  Even on failure, all fields
 * inside @pool proper are initialized and put_unbound_pool() can be called
 * on @pool safely to release it.
 */
static int init_worker_pool(struct worker_pool *pool)
{
    spin_lock_init(&pool->lock);
    pool->id = -1;
    pool->cpu = -1;                    // -1 initially, meaning this worker_pool is unbound
    pool->node = NUMA_NO_NODE;            
    pool->flags |= POOL_DISASSOCIATED;
    pool->watchdog_ts = jiffies;
    INIT_LIST_HEAD(&pool->worklist);   // the worker_pool's work list: workqueues queue work here for the pool's workers to execute
    INIT_LIST_HEAD(&pool->idle_list);    // the worker_pool's idle worker list
    hash_init(pool->busy_hash);          // hash table of the worker_pool's busy workers

    // reaps surplus idle workers, firing every IDLE_WORKER_TIMEOUT (300 seconds)
    timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);

    // set up mayday_timer, whose period is MAYDAY_INTERVAL (100 ms)
    timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);

    INIT_LIST_HEAD(&pool->workers);        // the worker_pool's worker list
 
    ida_init(&pool->worker_ida);
    INIT_HLIST_NODE(&pool->hash_node);
    pool->refcnt = 1;                     // the pool's reference count starts at 1; get_unbound_pool() bumps it when an existing pool is reused (see section VI)
 
    /* shouldn't fail above this point */
    pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
    if (!pool->attrs)
        return -ENOMEM;
    return 0;
}
 

VI. How does a worker pool create worker threads?
1. When does a per-cpu worker pool create worker threads?

 
/**
 * workqueue_init - bring workqueue subsystem fully online
 *
 * This is the latter half of two-staged workqueue subsystem initialization
 * and invoked as soon as kthreads can be created and scheduled.
 * Workqueues have been created and work items queued on them, but there
 * are no kworkers executing the work items yet.  Populate the worker pools
 * with the initial workers and enable future kworker creations.
 */
int __init workqueue_init(void)
{
    struct workqueue_struct *wq;
    struct worker_pool *pool;
    int cpu, bkt;
 
    /*
     * It'd be simpler to initialize NUMA in workqueue_init_early() but
     * CPU to node mapping may not be available that early on some
     * archs such as power and arm64.  As per-cpu pools created
     * previously could be missing node hint and unbound pools NUMA
     * affinity, fix them up.
     *
     * Also, while iterating workqueues, create rescuers if requested.
     */
    wq_numa_init();
 
    mutex_lock(&wq_pool_mutex);
 
    for_each_possible_cpu(cpu) {        // fix up the NUMA node of each per-cpu worker pool
        for_each_cpu_worker_pool(pool, cpu) {
            pool->node = cpu_to_node(cpu);    
        }
    }
 
    // the global workqueues list links every workqueue in the system; fix them up here
    list_for_each_entry(wq, &workqueues, list) {
        wq_update_unbound_numa(wq, smp_processor_id(), true);
        WARN(init_rescuer(wq),    // create the rescuer thread if the workqueue requested one
             "workqueue: failed to create early rescuer for %s",
             wq->name);
    }
 
    mutex_unlock(&wq_pool_mutex);
 
    /* create the initial workers: one for each worker_pool of every online cpu */
    for_each_online_cpu(cpu) {        
        for_each_cpu_worker_pool(pool, cpu) {
            pool->flags &= ~POOL_DISASSOCIATED;
            BUG_ON(!create_worker(pool));    // create an initial worker for this pool
        }
    }
 
    // also create one initial worker for each unbound worker pool
    hash_for_each(unbound_pool_hash, bkt, pool, hash_node)    // walk the hash of unbound pools
        BUG_ON(!create_worker(pool));
 
    wq_online = true;
    wq_watchdog_init();        // initialize the workqueue watchdog, which detects workers that have gotten stuck
 
    return 0;
}
So, during system initialization, the worker pools shared by the per-cpu workqueues (2 x number of CPUs) and the worker pool of the unbound workqueues each get an initial worker created via create_worker.

Once the initial worker starts, it runs worker_thread to process work, and during processing it creates new worker threads whenever necessary.

2. When does an unbound worker pool create worker threads?

Let's look at how an unbound thread pool is set up. Unlike per-CPU pools, unbound thread pools are shared globally. Whenever an unbound workqueue with a different set of attributes is created, a pool_workqueue and its worker pool are needed; get_unbound_pool searches the existing pools in the system for a matching worker pool, and if there is none, a new thread pool is created. Right after a new pool is created, create_worker is called to create an initial worker. As with per-cpu worker pools, once the initial worker starts, the pool dynamically creates more workers as work keeps arriving and depending on how processing goes. And of course, one unbound thread pool is already created at the very beginning of system initialization.

 
/**
 * get_unbound_pool - get a worker_pool with the specified attributes
 * @attrs: the attributes of the worker_pool to get
 *
 * Obtain a worker_pool which has the same attributes as @attrs, bump the
 * reference count and return it.  If there already is a matching
 * worker_pool, it will be used; otherwise, this function attempts to
 * create a new one.
 *
 * Should be called with wq_pool_mutex held.
 *
 * Return: On success, a worker_pool with the same attributes as @attrs.
 * On failure, %NULL.
 */
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
    u32 hash = wqattrs_hash(attrs);
    struct worker_pool *pool;
    int node;
    int target_node = NUMA_NO_NODE;
 
    lockdep_assert_held(&wq_pool_mutex);
 
    /* do we already have a matching pool? i.e. an unbound worker pool with these attributes */
    hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
        if (wqattrs_equal(pool->attrs, attrs)) {        // compare attributes
            pool->refcnt++;
            return pool;        // found one, return it
        }
    }
 
    /* if cpumask is contained inside a NUMA node, we belong to that node */
    if (wq_numa_enabled) {
        for_each_node(node) {
            if (cpumask_subset(attrs->cpumask,
                       wq_numa_possible_cpumask[node])) {
                target_node = node;
                break;
            }
        }
    }
 
    /* nope, create a new one */
    pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
    if (!pool || init_worker_pool(pool) < 0)
        goto fail;
 
    lockdep_set_subclass(&pool->lock, 1);    /* see put_pwq() */
    copy_workqueue_attrs(pool->attrs, attrs);
    pool->node = target_node;
 
    /*
     * no_numa isn't a worker_pool attribute, always clear it.  See
     * 'struct workqueue_attrs' comments for detail.
     */
    pool->attrs->no_numa = false;
 
    if (worker_pool_assign_id(pool) < 0)
        goto fail;
 
    /* create and start the initial worker */
    if (wq_online && !create_worker(pool))
        goto fail;
 
    /* install */
    hash_add(unbound_pool_hash, &pool->hash_node, hash);        // add the new pool to the hash table
 
    return pool;
fail:
    if (pool)
        put_unbound_pool(pool);
    return NULL;
}
 

3. How is a worker created? The code is as follows:

 
/**
 * create_worker - create a new workqueue worker
 * @pool: pool the new worker will belong to
 *
 * Create and start a new worker which is attached to @pool.
 *
 * CONTEXT:
 * Might sleep.  Does GFP_KERNEL allocations.
 *
 * Return:
 * Pointer to the newly created worker.
 */
static struct worker *create_worker(struct worker_pool *pool)
{
    struct worker *worker = NULL;
    int id = -1;
    char id_buf[16];
 
    /* ID is needed to determine kthread name */
    id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);    // get a free id from this worker_pool's worker_ida
    if (id < 0)
        goto fail;
 
    worker = alloc_worker(pool->node);        // allocate a worker struct
    if (!worker)
        goto fail;
 
    worker->id = id;                          // the id just allocated
 
    // build the worker's name
    if (pool->cpu >= 0)
        snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
             pool->attrs->nice < 0  ? "H" : "");
    else
        snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
 
    // create the kernel thread; its thread function is worker_thread
    worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
                          "kworker/%s", id_buf);
    if (IS_ERR(worker->task))
        goto fail;
 
    set_user_nice(worker->task, pool->attrs->nice);        // set the kworker's nice value (priority)
    kthread_bind_mask(worker->task, pool->attrs->cpumask);    
 
    /* successful, attach the worker to the pool */
    worker_attach_to_pool(worker, pool);        // attach the worker to the pool
 
    /* start the newly created worker */
    spin_lock_irq(&pool->lock);
    worker->pool->nr_workers++;                // bump the pool's worker count
    worker_enter_idle(worker);                 // put the new worker into the idle state
    wake_up_process(worker->task);             // wake up the newly created worker thread
    spin_unlock_irq(&pool->lock);    
 
    return worker;
 
fail:
    if (id >= 0)
        ida_simple_remove(&pool->worker_ida, id);
    kfree(worker);
    return NULL;
}
The code is not complicated. From the CPU binding of the worker pool (the cpu member of struct worker_pool) we can tell whether the pool is per-CPU or unbound: for a per-CPU pool, pool->cpu is >= 0. A per-CPU pool's worker threads are named kworker/<cpu>:<worker id>, with a trailing H for the high-priority pool. An unbound pool's worker threads are named kworker/u<pool id>:<worker id>.

    if (pool->cpu >= 0)
        snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
             pool->attrs->nice < 0  ? "H" : "");
    else
        snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
As shown below, these are per-cpu workers:

root         4  0.0  0.0      0     0 ?        S     3月15   0:00 [kworker/0:0]
root         5  0.0  0.0      0     0 ?        S<    3月15   0:00 [kworker/0:0H]
root        25  0.0  0.0      0     0 ?        S<    3月15   0:00 [kworker/3:0H]
root        66  0.0  0.0      0     0 ?        S     3月15   0:07 [kworker/1:1]
root        67  0.0  0.0      0     0 ?        S     3月15   0:07 [kworker/0:1]
And these are unbound workers:

root        32  0.0  0.0      0     0 ?        S<    3月15   0:00 [kworker/u17:0]
root      4600  0.0  0.0      0     0 ?        S    13:38   0:00 [kworker/u16:2]
root      4605  0.0  0.0      0     0 ?        S    13:43   0:00 [kworker/u16:0]
root      4612  1.0  0.0      0     0 ?        S    14:09   0:00 [kworker/u16:1]
 

VII. Processing work
The function is shown first; the analysis follows.

 
/**
 * worker_thread - the worker thread function
 * @__worker: self
 *
 * The worker thread function.  All workers belong to a worker_pool -
 * either a per-cpu one or dynamic unbound one.  These workers process all
 * work items regardless of their specific target workqueue.  The only
 * exception is work items which belong to workqueues with a rescuer which
 * will be explained in rescuer_thread().
 *
 * Return: 0
 */
static int worker_thread(void *__worker)
{
    struct worker *worker = __worker;
    struct worker_pool *pool = worker->pool;
 
    /* tell the scheduler that this is a workqueue worker */
    set_pf_worker(true);    // analysis 1
woke_up:
    spin_lock_irq(&pool->lock);
 
    /* am I supposed to die?  analysis 2 */
    if (unlikely(worker->flags & WORKER_DIE)) {
        spin_unlock_irq(&pool->lock);
        WARN_ON_ONCE(!list_empty(&worker->entry));
        set_pf_worker(false);
 
        set_task_comm(worker->task, "kworker/dying");
        ida_simple_remove(&pool->worker_ida, worker->id);
        worker_detach_from_pool(worker);
        kfree(worker);
        return 0;
    }
 
    worker_leave_idle(worker);
recheck:   
    /* no more worker necessary? analysis 3 */
    if (!need_more_worker(pool))
        goto sleep;
 
    /* do we need to manage?  analysis 4 */
    if (unlikely(!may_start_working(pool)) && manage_workers(worker))
        goto recheck;
 
    /*
     * ->scheduled list can only be filled while a worker is
     * preparing to process a work or actually processing it.
     * Make sure nobody diddled with it while I was sleeping.
     */
    // work is only added to the scheduled list while a worker is preparing to process a work or actually processing one
    WARN_ON_ONCE(!list_empty(&worker->scheduled));
 
    /* analysis 5
     * Finish PREP stage.  We're guaranteed to have at least one idle
     * worker or that someone else has already assumed the manager
     * role.  This is where @worker starts participating in concurrency
     * management if applicable and concurrency management is restored
     * after being rebound.  See rebind_workers() for details.
     */
    worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
 
    do {
        struct work_struct *work =
            list_first_entry(&pool->worklist,
                     struct work_struct, entry);
 
        pool->watchdog_ts = jiffies;
 
        if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
            /* optimization path, not strictly necessary */
            process_one_work(worker, work);
            if (unlikely(!list_empty(&worker->scheduled)))
                process_scheduled_works(worker);
        } else {
            move_linked_works(work, &worker->scheduled, NULL);
            process_scheduled_works(worker);
        }
    } while (keep_working(pool));
 
    worker_set_flags(worker, WORKER_PREP);
sleep:
    /*
     * pool->lock is held and there's no work to process and no need to
     * manage, sleep.  Workers are woken up only while holding
     * pool->lock or from local cpu, so setting the current state
     * before releasing pool->lock is enough to prevent losing any
     * event.
     */
    worker_enter_idle(worker);
    __set_current_state(TASK_IDLE);
    spin_unlock_irq(&pool->lock);
    schedule();
    goto woke_up;
}
 

Analysis 1:

    /* tell the scheduler that this is a workqueue worker */
    set_pf_worker(true);
 

static void set_pf_worker(bool val)
{
    mutex_lock(&wq_pool_attach_mutex);
    if (val)
        current->flags |= PF_WQ_WORKER;
    else
        current->flags &= ~PF_WQ_WORKER;
    mutex_unlock(&wq_pool_attach_mutex);
}
Right at the start, the worker thread function marks itself with PF_WQ_WORKER.

With this flag, when the scheduler is about to put the current task to sleep it can check whether the task is a worker thread. If it is, the scheduler must not blindly switch to some other task; it also needs to find the worker's pool and wake an idle worker there. Through this interaction between the workqueue module and the scheduler, when work A blocks (the worker processing it goes to sleep), the scheduler wakes another worker to handle work B, work C, and so on.
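The scheduler-side hook behind this is wq_worker_sleeping(), invoked when a task with PF_WQ_WORKER is about to sleep. A rough sketch of its logic in kernels of this vintage (paraphrased; the exact code lives in kernel/workqueue.c):

struct task_struct *wq_worker_sleeping(struct task_struct *task)
{
    struct worker *worker = kthread_data(task), *to_wakeup = NULL;
    struct worker_pool *pool;

    /* rescuers and other NOT_RUNNING workers don't take part in concurrency management */
    if (worker->flags & WORKER_NOT_RUNNING)
        return NULL;

    pool = worker->pool;

    /* this can only happen on the worker's local cpu */
    if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
        return NULL;

    /*
     * If the last running worker of the pool is going to sleep and
     * work is still pending, hand the first idle worker back to the
     * scheduler so it can be woken up.
     */
    if (atomic_dec_and_test(&pool->nr_running) &&
        !list_empty(&pool->worklist))
        to_wakeup = first_idle_worker(pool);
    return to_wakeup ? to_wakeup->task : NULL;
}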

Analysis 2:

woke_up:
    spin_lock_irq(&pool->lock);
 
    /* am I supposed to die? */
    if (unlikely(worker->flags & WORKER_DIE)) {        // WORKER_DIE means this worker is about to be destroyed and must not be used anymore
        spin_unlock_irq(&pool->lock);
        WARN_ON_ONCE(!list_empty(&worker->entry));
        set_pf_worker(false);
 
        set_task_comm(worker->task, "kworker/dying");
        ida_simple_remove(&pool->worker_ida, worker->id);
        worker_detach_from_pool(worker);
        kfree(worker);
        return 0;
    }
 
    worker_leave_idle(worker);                        // clear the worker's idle state
 
 
    ......
 
 
    /*
     * pool->lock is held and there's no work to process and no need to
     * manage, sleep.  Workers are woken up only while holding
     * pool->lock or from local cpu, so setting the current state
     * before releasing pool->lock is enough to prevent losing any
     * event.
     */
    worker_enter_idle(worker);
    __set_current_state(TASK_IDLE);
    spin_unlock_irq(&pool->lock);
    schedule();          // nothing to do: give up the CPU; when woken, execution resumes right after this call
    goto woke_up;        // reached after being woken, either because the worker is to be destroyed or because new work has arrived
Worker destruction is asynchronous: a worker marked for destruction no longer gets new work attached, and once whatever was already on it has been processed it can be destroyed. So in the code above, if the DIE flag is set and the worker really has no remaining work, the worker destroys itself.

In the normal case, where the DIE flag is not set, the worker first clears its idle flag, indicating it is no longer parked.
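For reference, the path that actually sets WORKER_DIE is the idle reaper: when a worker has been idle for too long, idle_worker_timeout leads to destroy_worker(), which looks roughly like this (paraphrased):

static void destroy_worker(struct worker *worker)
{
    struct worker_pool *pool = worker->pool;

    lockdep_assert_held(&pool->lock);

    /* sanity checks: only an idle worker with no pending work may die */
    if (WARN_ON(worker->current_work) ||
        WARN_ON(!list_empty(&worker->scheduled)) ||
        WARN_ON(!(worker->flags & WORKER_IDLE)))
        return;

    pool->nr_workers--;
    pool->nr_idle--;

    list_del_init(&worker->entry);
    worker->flags |= WORKER_DIE;
    wake_up_process(worker->task);   /* the worker then frees itself in worker_thread() */
}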

Analysis 3:

recheck:
    /* no more worker necessary? */
    if (!need_more_worker(pool))
        goto sleep;
 
 
    ......
 
sleep:
    /*
     * pool->lock is held and there's no work to process and no need to
     * manage, sleep.  Workers are woken up only while holding
     * pool->lock or from local cpu, so setting the current state
     * before releasing pool->lock is enough to prevent losing any
     * event.
     */
    worker_enter_idle(worker);           // mark the worker idle
    __set_current_state(TASK_IDLE);      // set the task state for sleeping
    spin_unlock_irq(&pool->lock);
    schedule();                          // schedule away, giving up the CPU
    goto woke_up;
}
How is it decided whether more worker threads are needed? The rules are as follows:

(1) There is work to do: the work list of the worker pool must not be empty; if it is empty, the worker can simply sleep.

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
    return !list_empty(&pool->worklist) && __need_more_worker(pool);
}
 
 
(2) The pool is busy: the pool's nr_running member counts how many worker threads in the pool are currently running. nr_running == 0 means every worker has blocked while processing its work, and in that case another worker must be started to handle the active work items on the pool's worklist.

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */
 
static bool __need_more_worker(struct worker_pool *pool)
{
    return !atomic_read(&pool->nr_running);
}
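The same two conditions also drive keep_working(), which the do-while loop at the end of worker_thread uses to decide whether to grab the next work item: keep going as long as there is work and at most one worker (this one) is counted as running. Roughly:

static bool keep_working(struct worker_pool *pool)
{
    return !list_empty(&pool->worklist) &&
        atomic_read(&pool->nr_running) <= 1;
}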
 

Analysis 4

recheck:
    /* no more worker necessary? */
    if (!need_more_worker(pool))
        goto sleep;
 
    /* do we need to manage? */
    if (unlikely(!may_start_working(pool)) && manage_workers(worker))
        goto recheck;

 
may_start_working() checks whether the pool has any idle worker threads; if not, manage_workers() creates some.

/* Can I start working?  Called from busy but !running workers. */
static bool may_start_working(struct worker_pool *pool)
{
    return pool->nr_idle;
}
manage_workers() is the function that dynamically manages the creation of worker threads.

/**
 * manage_workers - manage worker pool
 * @worker: self
 *
 * Assume the manager role and manage the worker pool @worker belongs
 * to.  At any given time, there can be only zero or one manager per
 * pool.  The exclusion is handled automatically by this function.
 *
 * The caller can safely start processing works on false return.  On
 * true return, it's guaranteed that need_to_create_worker() is false
 * and may_start_working() is true.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.
 *
 * Return:
 * %false if the pool doesn't need management and the caller can safely
 * start processing works, %true if management function was performed and
 * the conditions that the caller verified before calling the function may
 * no longer be true.
 */
static bool manage_workers(struct worker *worker)
{
    struct worker_pool *pool = worker->pool;
 
    if (pool->flags & POOL_MANAGER_ACTIVE)
        return false;
 
    pool->flags |= POOL_MANAGER_ACTIVE;        // mark the pool as having an active manager (worker creation in progress)
    pool->manager = worker;            
 
    maybe_create_worker(pool);                 // create new workers if necessary
 
    pool->manager = NULL;
    pool->flags &= ~POOL_MANAGER_ACTIVE;
    wake_up(&wq_manager_wait);                 // wake up anyone waiting for the manager role
    return true;
}
Inside maybe_create_worker(), the while loop first calls create_worker() to create a new worker thread.

 
/**
 * maybe_create_worker - create a new worker if necessary
 * @pool: pool to create a new worker for
 *
 * Create a new worker for @pool if necessary.  @pool is guaranteed to
 * have at least one idle worker on return from this function.  If
 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
 * sent to all rescuers with works scheduled on @pool to resolve
 * possible allocation deadlock.
 *
 * On return, need_to_create_worker() is guaranteed to be %false and
 * may_start_working() %true.
 *
 * LOCKING:
 * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.  Called only from
 * manager.
 */
static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
restart:
    spin_unlock_irq(&pool->lock);
 
    /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
    // arm mayday_timer: if creating a worker takes too long, wake the rescuer workers to process the pending work
    mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
 
    while (true) {
        // leave the loop if create_worker() succeeds, or if need_to_create_worker() says no further workers are needed
        if (create_worker(pool) || !need_to_create_worker(pool))
            break;
 
        schedule_timeout_interruptible(CREATE_COOLDOWN);
 
        // check again whether another worker is still needed; if not, exit, otherwise keep creating
        if (!need_to_create_worker(pool))
            break;
    }
 
    del_timer_sync(&pool->mayday_timer);    // a worker was created in time, so remove the mayday timer armed above
    spin_lock_irq(&pool->lock);
    /*
     * This is necessary even after a new worker was just successfully
     * created as @pool->lock was dropped and the new worker might have
     * already become busy.
     * check once more whether another worker is needed
     */
    if (need_to_create_worker(pool))
        goto restart;
}
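need_to_create_worker(), used as the loop condition above, is simply the combination of the two policy helpers from analysis 3: more workers are needed and there is no idle worker to wake. Roughly:

static bool need_to_create_worker(struct worker_pool *pool)
{
    return need_more_worker(pool) && !may_start_working(pool);
}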
 

Analysis 5: the worker starts processing work

 
    /*
     * Finish PREP stage.  We're guaranteed to have at least one idle
     * worker or that someone else has already assumed the manager
     * role.  This is where @worker starts participating in concurrency
     * management if applicable and concurrency management is restored
     * after being rebound.  See rebind_workers() for details.
     */
    worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
 
    do {
        struct work_struct *work =                    // take the first work on the pool's worklist
            list_first_entry(&pool->worklist,
                     struct work_struct, entry);
 
        pool->watchdog_ts = jiffies;
 
        if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
            /* optimization path, not strictly necessary */
            process_one_work(worker, work);        // process a single work item
            if (unlikely(!list_empty(&worker->scheduled)))
                process_scheduled_works(worker);   // process the work_structs on the worker->scheduled list
        } else {
            // WORK_STRUCT_LINKED means more works are chained behind this one; move them all onto worker->scheduled and then process them together via process_one_work()
            move_linked_works(work, &worker->scheduled, NULL);
            process_scheduled_works(worker);
        }
    } while (keep_working(pool));
 
    worker_set_flags(worker, WORKER_PREP);
In principle, processing work should be simple: take a work item from the pool's worklist and call process_one_work on it. Reality is slightly more complicated because work items are not always independent: work A and work B may be linked works, and linked works should be handled by the same worker. WORK_STRUCT_LINKED marks a work as part of a linked series; in that case the worker does not process it directly but moves the whole series onto its scheduled list and calls process_scheduled_works. Naturally, process_scheduled_works also just calls process_one_work on each work on the scheduled list, one after another.

The scheduled work list is not only used for linked work. While processing work, one rule must hold: the same work must not be executed concurrently by multiple workers on the same CPU. So if a worker finds that the work it is about to process is already being handled by another worker thread, it does not process it itself; it simply moves the work onto the scheduled work list of the worker that is already executing it.

move_linked_works() migrates such works onto worker->scheduled, after which they are processed one by one by process_one_work():

 
/**
 * move_linked_works - move linked works to a list
 * @work: start of series of works to be scheduled
 * @head: target list to append @work to
 * @nextp: out parameter for nested worklist walking
 *
 * Schedule linked works starting from @work to @head.  Work series to
 * be scheduled starts at @work and includes any consecutive work with
 * WORK_STRUCT_LINKED set in its predecessor.
 *
 * If @nextp is not NULL, it's updated to point to the next work of
 * the last scheduled work.  This allows move_linked_works() to be
 * nested inside outer list_for_each_entry_safe().
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock).
 */
static void move_linked_works(struct work_struct *work, struct list_head *head,
                  struct work_struct **nextp)
{
    struct work_struct *n;
 
    /*
     * Linked worklist will always end before the end of the list,
     * use NULL for list head.
     */
    list_for_each_entry_safe_from(work, n, NULL, entry) {
        list_move_tail(&work->entry, head);
        if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
            break;
    }
 
    /*
     * If we're already inside safe list traversal and have moved
     * multiple works to the scheduled queue, the next position
     * needs to be updated.
     */
    if (nextp)
        *nextp = n;
}
 
 
 
 
/**
 * process_scheduled_works - process scheduled works
 * @worker: self
 *
 * Process all scheduled works.  Please note that the scheduled list
 * may change while processing a work, so this function repeatedly
 * fetches a work from the top and executes it.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.
 */
static void process_scheduled_works(struct worker *worker)
{
    while (!list_empty(&worker->scheduled)) {
        struct work_struct *work = list_first_entry(&worker->scheduled,
                        struct work_struct, entry);
        process_one_work(worker, work);
    }
}
 

Processing a single work item:

 
/**
 * process_one_work - process single work
 * @worker: self
 * @work: work to process
 *
 * Process @work.  This function contains all the logics necessary to
 * process a single work including synchronization against and
 * interaction with other workers on the same cpu, queueing and
 * flushing.  As long as context requirement is met, any worker can
 * call this function to process a work.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock) which is released and regrabbed.
 */
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
    struct pool_workqueue *pwq = get_work_pwq(work);
    struct worker_pool *pool = worker->pool;
    bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;    // is this a CPU_INTENSIVE workqueue? if so, its worker gets special treatment
    int work_color;
    struct worker *collision;
#ifdef CONFIG_LOCKDEP
    /*
     * It is permissible to free the struct work_struct from
     * inside the function that is called from it, this we need to
     * take into account for lockdep too.  To avoid bogus "held
     * lock freed" warnings as well as problems when looking into
     * work->lockdep_map, make a copy and use that here.
     */
    struct lockdep_map lockdep_map;
 
    lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
    /* ensure we're on the correct CPU */
    WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
             raw_smp_processor_id() != pool->cpu);
 
    /*
     * A single work shouldn't be executed concurrently by
     * multiple workers on a single cpu.  Check whether anyone is
     * already processing the work.  If so, defer the work to the
     * currently executing one.
     */
    // check whether this work is already running somewhere (via worker_pool->busy_hash); if so, move it onto the scheduled list of the worker executing it and return
    collision = find_worker_executing_work(pool, work);
    if (unlikely(collision)) {
        move_linked_works(work, &collision->scheduled, NULL);
        return;
    }
 
    /* claim and dequeue, */
    debug_work_deactivate(work);
    hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);   // add the worker to the busy hash pool->busy_hash
    worker->current_work = work;            // record this work as the worker's current work
    worker->current_func = work->func;
    worker->current_pwq = pwq;
    work_color = get_work_color(work);
 
    /*
     * Record wq name for cmdline and debug reporting, may get
     * overridden through set_worker_desc().
     */
    strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
 
    list_del_init(&work->entry);            // the work removes itself from whatever list it is on before it runs
 
    /*
     * CPU intensive works don't participate in concurrency management.
     * They're the scheduler's responsibility.  This takes @worker out
     * of concurrency management and the next code block will chain
     * execution of the pending work items.
     */
    if (unlikely(cpu_intensive))        // set the WORKER_CPU_INTENSIVE flag on the worker
        worker_set_flags(worker, WORKER_CPU_INTENSIVE);
 
    /*
     * Wake up another worker if necessary.  The condition is always
     * false for normal per-cpu workers since nr_running would always
     * be >= 1 at this point.  This is used to chain execution of the
     * pending work items for WORKER_NOT_RUNNING workers such as the
     * UNBOUND and CPU_INTENSIVE ones.
     */
        // decide whether more workers should be woken; wake_up_worker() wakes the first idle worker of the pool. For a bound worker_pool nr_running is usually >= 1 at this point, so the condition is false
    if (need_more_worker(pool))        
        wake_up_worker(pool);
 
    /*
     * Record the last pool and clear PENDING which should be the last
     * update to @work.  Also, do this inside @pool->lock so that
     * PENDING and queued state changes happen together while IRQ is
     * disabled.
     */
    // clear the PENDING bit in work->data; an smp_wmb inside guarantees that the writes preceding it complete before PENDING is cleared
    set_work_pool_and_clear_pending(work, pool->id);    
 
    spin_unlock_irq(&pool->lock);
 
    lock_map_acquire(&pwq->wq->lockdep_map);
    lock_map_acquire(&lockdep_map);
    /*
     * Strictly speaking we should mark the invariant state without holding
     * any locks, that is, before these two lock_map_acquire()'s.
     *
     * However, that would result in:
     *
     *   A(W1)
     *   WFC(C)
     *        A(W1)
     *        C(C)
     *
     * Which would create W1->C->W1 dependencies, even though there is no
     * actual deadlock possible. There are two solutions, using a
     * read-recursive acquire on the work(queue) 'locks', but this will then
     * hit the lockdep limitation on recursive locks, or simply discard
     * these locks.
     *
     * AFAICT there is no possible deadlock scenario between the
     * flush_work() and complete() primitives (except for single-threaded
     * workqueues), so hiding them isn't a problem.
     */
    lockdep_invariant_state(true);
    trace_workqueue_execute_start(work);
    worker->current_func(work);                // finally invoke the work's callback function
    /*
     * While we must be careful to not use "work" after this, the trace
     * point will only record its address.
     */
    trace_workqueue_execute_end(work);
    lock_map_release(&lockdep_map);
    lock_map_release(&pwq->wq->lockdep_map);
 
    if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
        pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
               "     last function: %pf\n",
               current->comm, preempt_count(), task_pid_nr(current),
               worker->current_func);
        debug_show_held_locks(current);
        dump_stack();
    }
 
    /*
     * The following prevents a kworker from hogging CPU on !PREEMPT
     * kernels, where a requeueing work item waiting for something to
     * happen could deadlock with stop_machine as such work item could
     * indefinitely requeue itself while all other CPUs are trapped in
     * stop_machine. At the same time, report a quiescent RCU state so
     * the same condition doesn't freeze RCU.
     */
    cond_resched();
 
    spin_lock_irq(&pool->lock);
 
    /* clear cpu intensive status */
    if (unlikely(cpu_intensive))
        worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
 
    /* we're done with it, release */
    hash_del(&worker->hentry);               // cleanup after the work callback has finished
    worker->current_work = NULL;
    worker->current_func = NULL;
    worker->current_pwq = NULL;
    pwq_dec_nr_in_flight(pwq, work_color);
}
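pwq_dec_nr_in_flight() at the end ties back to sections 5 and 6 of the queueing path: it drops the reference taken by insert_work() via put_pwq(), and if there is room again it activates the first work item parked on delayed_works. An abridged sketch (the flush-colour bookkeeping is omitted here):

static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
{
    /* uncolored work items don't participate in flushing or nr_active */
    if (color == WORK_NO_COLOR)
        goto out_put;

    pwq->nr_in_flight[color]--;

    pwq->nr_active--;
    if (!list_empty(&pwq->delayed_works)) {
        /* one down, submit a delayed one */
        if (pwq->nr_active < pwq->max_active)
            pwq_activate_first_delayed(pwq);
    }

    /* ... flush colour handling omitted ... */

out_put:
    put_pwq(pwq);
}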
 


--------------------- 
Author: to_run_away 
Source: CSDN 
Original: https://blog.csdn.net/qq_16777851/article/details/88560339 
Copyright notice: this is the author's original post; please include a link to it when reposting.
