qemu AIO线程池分析

学习qemu线程需要准备的知识
1 qemu2事件处理机制
2 qemu aio api
3 qemu2 时钟系统分析

在 qemu aio api 一文中我们看到了如何使用线程池api，这里我们来具体分析下代码。先从线程池的数据结构看起：AIO线程池必然要依附于aio，所以线程池保存在AioContext数据结构中

/*
 * Excerpt of struct AioContext: only the members relevant to the thread
 * pool are shown; "..." marks members elided by the article.
 */
struct AioContext {
    GSource source;
    ...
     /* Thread pool for performing work and receiving completion callbacks */
    struct ThreadPool *thread_pool;
    ...
}

thread_pool 正是描述线程池的数据结构

/*
 * State of one worker-thread pool attached to an AioContext.
 *
 * Requests are tracked on two lists: 'head' holds every submitted request
 * and is walked by the completion BH; 'request_list' holds the requests
 * that are still waiting to be picked up by a worker thread.
 */
struct ThreadPool {
    AioContext *ctx;          /* AioContext this pool belongs to */
    QEMUBH *completion_bh;    /* BH that delivers completion callbacks */
    QemuMutex lock;           /* protects the "protected by lock" fields below */
    QemuCond worker_stopped;  /* signalled by a worker thread when it exits */
    QemuSemaphore sem;        /* idle workers wait on it; posted once per submitted request */
    int max_threads;          /* upper bound on the number of worker threads */
    QEMUBH *new_thread_bh;    /* BH that creates new worker threads */

    /* The following variables are only accessed from one AioContext. */
    QLIST_HEAD(, ThreadPoolElement) head;  /* all submitted requests, scanned for completions */

    /* The following variables are protected by lock.  */
    QTAILQ_HEAD(, ThreadPoolElement) request_list;  /* requests not yet taken by a worker */
    int cur_threads;     /* current number of worker threads */
    int idle_threads;    /* workers currently waiting on sem */
    int new_threads;     /* backlog of threads we need to create */
    int pending_threads; /* threads created but not running yet */
    bool stopping;       /* set when the pool is being shut down */
};

/*
 * One request submitted to a ThreadPool: the work function to run in a
 * worker thread plus the bookkeeping needed to report its result.
 */
struct ThreadPoolElement {
    BlockAIOCB common;     /* holds the completion callback (cb) and its opaque argument */
    ThreadPool *pool;      /* pool this request was submitted to */
    ThreadPoolFunc *func;  /* work function executed by a worker thread */
    void *arg;             /* argument passed to func */

    /* Moving state out of THREAD_QUEUED is protected by lock.  After
     * that, only the worker thread can write to it.  Reads and writes
     * of state and ret are ordered with memory barriers.
     */
    enum ThreadState state;  /* request state: THREAD_QUEUED, THREAD_ACTIVE or THREAD_DONE */
    int ret;                 /* value returned by func */

    /* Access to this list is protected by lock.  */
    QTAILQ_ENTRY(ThreadPoolElement) reqs;  /* link in ThreadPool.request_list */

    /* Access to this list is protected by the global mutex.  */
    QLIST_ENTRY(ThreadPoolElement) all;    /* link in ThreadPool.head */
};

看到这个数据结构我要说明下aio线程池的设计。

aio thread实现的能力是在线程池中找到一个线程，去执行用户发起的任务，任务完成后在aio线程中执行任务完成的回调。这种场景很像Android开发中，在异步线程发起网络请求，请求成功后在主线程根据请求结果更新UI的过程。
另外线程池为了防止创建太多线程，并不是一开始就创建最大个数的线程，而是当idle线程不够用的时候才去请求创建。
上述两种功能都是在aio context线程中完成的，这样的好处是实现线程封闭，不用考虑数据安全的问题，这两个功能都是使用aio 的bh功能完成的。其中ThreadPool->new_thread_bh 负责创建新的线程,completion_bh则用于执行结果的回调。
另外ThreadPool->head和ThreadPool->request_list则都是用于描述请求队列，二者有什么区别呢，在于ThreadPool->head只在Aio context线程中访问，ThreadPool->request_list则在线程池线程和aio context中访问，ThreadPool->head主要用于处理结果回调，ThreadPool->request_list主要用于任务派发。

另外aio线程池使用信号量机制来派发任务，当添加任务后信号量+1就会唤醒线程池中的线程，拿到信号量的线程会把信号量值减1，直到信号量为0，其他线程继续阻塞。最后在线程池释放的时候也是通过信号量唤醒idle线程，idle线程检查ThreadPool->stopping状态，然后结束线程。
ThreadPool->worker_stopped的作用在于：要终止线程池的时候有些线程还处于运行状态，所以要结束线程池的线程先等待在这个条件变量上，工作线程退出时利用这个条件变量通知它，使其继续向下执行。

经过上述说明我们知道线程池需要通过锁保护的变量有
request_list
idle_threads
stopping
cur_threads
new_threads
pending_threads
最后我们来对照代码看一下aio 线程池的代码吧

/*
 * Return the thread pool attached to @ctx, creating it with
 * thread_pool_new() on first use and caching it in ctx->thread_pool.
 * No locking is done here.
 */
ThreadPool *aio_get_thread_pool(AioContext *ctx)
{
    ThreadPool *pool = ctx->thread_pool;

    if (pool == NULL) {
        /* First request for this context: build the pool and remember it. */
        pool = thread_pool_new(ctx);
        ctx->thread_pool = pool;
    }
    return pool;
}

/*
 * Allocate a ThreadPool with g_new() and initialise it for @ctx via
 * thread_pool_init_one().  Returns the new pool.
 */
ThreadPool *thread_pool_new(AioContext *ctx)
{
    ThreadPool *fresh = g_new(ThreadPool, 1);

    thread_pool_init_one(fresh, ctx);

    return fresh;
}

线程池通过thread_pool_new来创建，对每个AioContext而言是一个单例（首次调用aio_get_thread_pool时才创建）。注意这些函数都没有加锁，也就是说只能在aio context线程调用。
thread_pool_new　函数中使用g_new分配内存，之后调用thread_pool_init_one进行初始化

/*
 * Put @pool into its initial, empty state.
 *
 * @pool: storage to initialise; it is zeroed first.
 * @ctx:  owning AioContext; when NULL, qemu_get_aio_context() is used.
 *
 * Creates the two bottom halves (completion and thread creation), the
 * lock, condition variable and semaphore, sets the thread limit and
 * initialises both request lists.
 */
static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
{
    AioContext *owner = ctx ? ctx : qemu_get_aio_context();

    memset(pool, 0, sizeof(*pool));
    pool->ctx = owner;
    pool->completion_bh = aio_bh_new(owner, thread_pool_completion_bh, pool);

    /* Synchronisation primitives; the semaphore starts at zero. */
    qemu_sem_init(&pool->sem, 0);
    qemu_cond_init(&pool->worker_stopped);
    qemu_mutex_init(&pool->lock);

#if defined(_WIN32) && !defined(_WIN64)
    /* Limit the number of worker thread on 32-bit Windows: usually we only
     * have 2 GB of address space, and each thread costs us over 1 MB of it. */
    pool->max_threads = 8;
#else
    pool->max_threads = 64;
#endif
    pool->new_thread_bh = aio_bh_new(owner, spawn_thread_bh_fn, pool);

    /* Both request lists start out empty. */
    QTAILQ_INIT(&pool->request_list);
    QLIST_INIT(&pool->head);
}

初始化的过程就是初始化各种锁，条件变量，信号量,bh，链表. 这里如果不是windows 32系统默认最大线程数为64,反之为８．

向线程池添加任务的函数为thread_pool_submit_aio

/*
 * Queue @func(@arg) for execution by a worker thread of @pool.
 *
 * @pool:   thread pool receiving the request
 * @func:   work function run in a worker thread
 * @arg:    argument handed to @func
 * @cb:     completion callback stored in the request's BlockAIOCB
 * @opaque: argument for @cb
 *
 * The request is linked on pool->head without taking the lock, then,
 * under pool->lock, a new worker is requested if none is idle and the
 * thread limit has not been reached, and the request is appended to
 * pool->request_list.  Finally the semaphore is posted once.
 *
 * Returns the BlockAIOCB embedded in the new request.
 */
BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool,
        ThreadPoolFunc *func, void *arg,
        BlockCompletionFunc *cb, void *opaque)
{
    ThreadPoolElement *elem = qemu_aio_get(&thread_pool_aiocb_info, NULL,
                                           cb, opaque);

    elem->pool = pool;
    elem->func = func;
    elem->arg = arg;
    elem->state = THREAD_QUEUED;

    QLIST_INSERT_HEAD(&pool->head, elem, all);

    trace_thread_pool_submit(pool, elem, arg);

    qemu_mutex_lock(&pool->lock);
    /* Ask for another worker only when nobody is idle and there is room. */
    if (pool->cur_threads < pool->max_threads && pool->idle_threads == 0) {
        spawn_thread(pool);
    }
    QTAILQ_INSERT_TAIL(&pool->request_list, elem, reqs);
    qemu_mutex_unlock(&pool->lock);

    /* One post per request: lets one waiting worker proceed. */
    qemu_sem_post(&pool->sem);

    return &elem->common;
}

这里边有四个参数, func和arg表示在线程池线程调用的方法和参数
cb和opaque则表示完成异步调用后在aio线程回调的方法和参数
所以thread_pool_submit_aio函数首先用ThreadPoolElement把上述4个数据组织起来，在无锁的情况下添加到ThreadPool->head链表上。之后在加锁的情况下，先判断如果没有idle线程并且线程池还没有满，就调用spawn_thread创建新的线程，再把请求添加到ThreadPool->request_list链表。最后qemu_sem_post(&pool->sem)给信号量+1,通知有新的任务到来。
最后返回BlockAIOCB给调用者（这里偷偷告诉你，用于取消回调）

我们下来分析下如何创建新线程

/*
 * Account for one more worker thread and, unless a thread is already
 * pending, schedule new_thread_bh to create it.  The caller visible here
 * (thread_pool_submit_aio) invokes this with pool->lock held.
 */
static void spawn_thread(ThreadPool *pool)
{
    pool->new_threads += 1;
    pool->cur_threads += 1;

    /* If there are threads being created, they will spawn new workers, so
     * we don't spend time creating many threads in a loop holding a mutex or
     * starving the current vcpu.
     */
    if (pool->pending_threads != 0) {
        return;
    }

    /* If there are no idle threads, ask the main thread to create one, so we
     * inherit the correct affinity instead of the vcpu affinity.
     */
    qemu_bh_schedule(pool->new_thread_bh);
}

如果没有正在创建的线程，调用qemu_bh_schedule去调度ThreadPool->new_thread_bh，在线程池初始化创建new_thread_bh的时候传递的回调函数为spawn_thread_bh_fn。另外这里new_threads++之后，bh中就可以创建新线程了


/*
 * Bottom-half callback registered as new_thread_bh: takes pool->lock and
 * lets do_spawn_thread() create a worker if one is still owed.
 *
 * @opaque: the ThreadPool passed to aio_bh_new().
 */
static void spawn_thread_bh_fn(void *opaque)
{
    ThreadPool *tp = opaque;

    qemu_mutex_lock(&tp->lock);
    do_spawn_thread(tp);
    qemu_mutex_unlock(&tp->lock);
}
/*
 * Create one detached worker thread if the backlog (new_threads) is not
 * empty, moving one unit from new_threads to pending_threads.
 */
static void do_spawn_thread(ThreadPool *pool)
{
    QemuThread thread;

    /* Runs with lock taken.  */
    if (pool->new_threads != 0) {
        /* The thread is now "created but not running yet". */
        pool->new_threads--;
        pool->pending_threads++;

        qemu_thread_create(&thread, "worker", worker_thread, pool,
                           QEMU_THREAD_DETACHED);
    }
}

这里只是把new_thread的计数转移给pending_thread.然后创建线程。　注意线程里的回调函数为worker_thread。

/*
 * Entry point of every worker thread (started by do_spawn_thread()).
 *
 * @opaque: the ThreadPool this worker serves.
 *
 * The worker repeatedly waits on pool->sem, takes the first request off
 * pool->request_list, runs its work function without holding the lock,
 * records the result and schedules completion_bh.  It leaves the loop when
 * the wait fails with no queued request or when pool->stopping is set,
 * then decrements cur_threads and signals worker_stopped.
 * Always returns NULL.
 */
static void *worker_thread(void *opaque)
{
    ThreadPool *pool = opaque;

    qemu_mutex_lock(&pool->lock);
    /* This thread is running now, so it is no longer "pending". */
    pool->pending_threads--;
    /* Create the next worker if the backlog (new_threads) is non-zero. */
    do_spawn_thread(pool);

    while (!pool->stopping) {
        ThreadPoolElement *req;
        int ret;

        /* Wait for work.  The lock is dropped while waiting and the thread
         * counts as idle for that time.  The wait is retried when it
         * returns -1 while requests are still queued.
         * NOTE(review): 10000 is presumably a timeout in milliseconds and
         * -1 presumably means the wait timed out -- confirm against
         * qemu_sem_timedwait().
         */
        do {
            pool->idle_threads++;
            qemu_mutex_unlock(&pool->lock);
            ret = qemu_sem_timedwait(&pool->sem, 10000);
            qemu_mutex_lock(&pool->lock);
            pool->idle_threads--;
        } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list));
        /* Nothing arrived (wait returned -1) or the pool is stopping: exit. */
        if (ret == -1 || pool->stopping) {
            break;
        }

        /* Dequeue the oldest request and mark it active, still under lock. */
        req = QTAILQ_FIRST(&pool->request_list);
        QTAILQ_REMOVE(&pool->request_list, req, reqs);
        req->state = THREAD_ACTIVE;
        qemu_mutex_unlock(&pool->lock);

        /* Run the user's work function without holding the lock. */
        ret = req->func(req->arg);

        req->ret = ret;
        /* Write ret before state.  */
        smp_wmb();
        req->state = THREAD_DONE;

        /* Retake the lock for the next loop iteration (or the exit path). */
        qemu_mutex_lock(&pool->lock);

        /* Let the completion BH pick up the finished request. */
        qemu_bh_schedule(pool->completion_bh);
    }

    /* Exit path, lock held: drop out of the thread count and signal
     * worker_stopped.
     */
    pool->cur_threads--;
    qemu_cond_signal(&pool->worker_stopped);
    qemu_mutex_unlock(&pool->lock);
    return NULL;
}

worker_thread为线程池中线程真正的工作，在线程池里也会根据线程情况创建线程，调用do_spawn_thread函数，然后进入等待信号量，注意这里等待信号量的超时时间为10s，也就是一个线程如果idle 10s则会自动退出。

如果能成功获取信号量，并且也不是需要退出的信号量，则从任务队列中取出一个任务，进行调用，最后调用qemu_bh_schedule处理完成回调。　
另外如果线程因为线程池停止或者idle退出，则减少cur_thread, 通知可能等待线程退出的线程。

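所以这么看好像存在bug，在spawn_thread函数中并没有保护pool->cur_threads和 pool->new_threads　线程安全的写。不过需要注意，spawn_thread 自身虽然没有加锁，但它唯一的调用者 thread_pool_submit_aio 是在 qemu_mutex_lock(&pool->lock) 和 qemu_mutex_unlock(&pool->lock) 之间调用它的，因此这两个计数器的修改实际上是受锁保护的，并不存在竞争。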

我们再来看下线程池线程处理成功后的回调函数处理，这里使用的completion_bh进行回调。在初始化时候回调函数设置为thread_pool_completion_bh

/*
 * Bottom-half callback registered as completion_bh: walks pool->head and
 * finishes every request whose state is THREAD_DONE.
 *
 * @opaque: the ThreadPool passed to aio_bh_new().
 *
 * A finished request is unlinked from pool->head; if it has a completion
 * callback, the callback is invoked with the request's opaque pointer and
 * result, and the scan restarts from the beginning of the list.  The
 * request is released with qemu_aio_unref() in both cases.  pool->lock is
 * not taken here.
 */
static void thread_pool_completion_bh(void *opaque)
{
    ThreadPool *pool = opaque;
    ThreadPoolElement *elem, *next;

restart:
    QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
        /* Skip requests that are still queued or running. */
        if (elem->state != THREAD_DONE) {
            continue;
        }

        trace_thread_pool_complete(pool, elem, elem->common.opaque,
                                   elem->ret);
        QLIST_REMOVE(elem, all);

        if (elem->common.cb) {
            /* Read state before ret.  */
            smp_rmb();

            /* Schedule ourselves in case elem->common.cb() calls aio_poll() to
             * wait for another request that completed at the same time.
             */
            qemu_bh_schedule(pool->completion_bh);

            elem->common.cb(elem->common.opaque, elem->ret);
            qemu_aio_unref(elem);
            /* Rescan from the head of the list after running the callback. */
            goto restart;
        } else {
            /* No callback to run: just drop the reference. */
            qemu_aio_unref(elem);
        }
    }
}

把处理完的任务进行回调，注意这里也没有使用锁进行处理，只是使用内存屏障同步返回值。注意调用完成回调bh那段代码在锁中执行，锁也有内存屏障的作用。所以这里只需要保证状态和返回值不被乱序（有可能是前一个bh）写入，不被cache缓存就好了

TangGeeA

发布了113 篇原创文章 · 获赞 22 · 访问量 9万+

私信关注

qemu AIO线程池分析

猜你喜欢