This article is based on the kernel 5.5 source and analyzes how epoll is implemented in the kernel. The main code lives in kernel/fs/eventpoll.c.
1. Key structures
Two structures deserve attention up front; what their members are for should be clear enough from the code, so this is just a quick mention.
struct eventpoll and struct epitem are the two to keep an eye on. One eventpoll is created per epoll_create() call (typically one per process). It contains a ready list, rdllist, which collects epitems whose fds have pending events, while each epitem stores the information of one watched descriptor: its fd, file, events, and so on.
The eventpoll stores its epitems in a red-black tree, so lookup, insertion and deletion are O(log n) in the worst case.
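For orientation, here is a heavily trimmed sketch of the two structures, keeping only the fields this article actually touches (field names follow the 5.5 source; the many other members, locks and #ifdef'd fields are omitted):
struct eventpoll {
	struct mutex mtx;            /* serializes epoll_ctl against other ops */
	wait_queue_head_t wq;        /* processes sleeping in epoll_wait() */
	wait_queue_head_t poll_wait; /* used when an epoll fd is itself polled */
	struct list_head rdllist;    /* ready list: epitems with pending events */
	struct rb_root_cached rbr;   /* red-black tree of all watched fds */
	struct epitem *ovflist;      /* side list used while events are being transferred */
	struct user_struct *user;    /* owner, checked against max_user_watches */
	struct file *file;           /* the anonymous file backing this instance */
};
struct epitem {
	struct rb_node rbn;          /* node in eventpoll::rbr */
	struct list_head rdllink;    /* node in eventpoll::rdllist */
	struct epoll_filefd ffd;     /* the watched file and its fd */
	struct list_head pwqlist;    /* eppoll_entry hooks planted on the target's wait queues */
	struct eventpoll *ep;        /* back-pointer to the owning eventpoll */
	struct epoll_event event;    /* the events/data registered by the user */
};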
2. epoll initialization
Source: /fs/eventpoll.c
It all starts here:
fs_initcall(eventpoll_init);
static int __init eventpoll_init(void)
{
struct sysinfo si;
si_meminfo(&si); // get system memory info
/*
* Allows top 4% of lomem to be allocated for epoll watches (per user).
*/
// limit on the number of fds that can be watched (note: per user, not per process)
max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
EP_ITEM_COST;
// EP_ITEM_COST: each registered fd costs one struct epitem
// plus one struct eppoll_entry
BUG_ON(max_user_watches < 0);
/*
* Initialize the structure used to perform epoll file descriptor
* inclusion loops checks.
*/
ep_nested_calls_init(&poll_loop_ncalls); // initialize the nested-calls list used for the loop checks
/*
* We can have many thousands of epitems, so prevent this from
* using an extra cache line on 64-bit (and smaller) CPUs
*/
BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
// create the slab caches that will back struct epitem and struct eppoll_entry allocations
/* Allocates slab cache used to allocate "struct epitem" items */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
/* Allocates slab cache used to allocate "struct eppoll_entry" */
pwq_cache = kmem_cache_create("eventpoll_pwq",
sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
return 0;
}
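To get a feel for the max_user_watches formula, here is a back-of-the-envelope check with made-up numbers: 4 GiB of low memory with 4 KiB pages, and EP_ITEM_COST approximated as 200 bytes. Both are assumptions for illustration, not values from the source:
#include <stdio.h>

int main(void)
{
	long totalram_pages = 1048576; /* hypothetical: 4 GiB of low memory in 4 KiB pages */
	int page_shift = 12;           /* 4 KiB pages */
	long ep_item_cost = 200;       /* hypothetical: sizeof(struct epitem) + sizeof(struct eppoll_entry) */

	/* same shape as the kernel expression: 4% of lowmem divided by the per-watch cost */
	long max_user_watches = ((totalram_pages / 25) << page_shift) / ep_item_cost;
	printf("max_user_watches = %ld\n", max_user_watches); /* prints 858992 */
	return 0;
}
So under these assumptions a single user could register roughly 860k watches before hitting -ENOSPC.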
Initialization done; next up, epoll_create.
3. epoll_create
SYSCALL_DEFINE1(epoll_create, int, size)
{
if (size <= 0)
return -EINVAL;
return do_epoll_create(0);
}
SYSCALL_DEFINE1 defines the epoll_create system call entry point; it only checks size for backward compatibility (the value itself is otherwise ignored).
The real work happens in do_epoll_create:
/*
* Open an eventpoll file descriptor.
*/
static int do_epoll_create(int flags)
{
int error, fd;
struct eventpoll *ep = NULL;
struct file *file;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
if (flags & ~EPOLL_CLOEXEC)
return -EINVAL;
/*
* Create the internal data structure ("struct eventpoll").
*/
error = ep_alloc(&ep); // allocate an eventpoll for this call (usually one per process, since a process typically calls epoll_create only once)
if (error < 0)
return error;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
// grab an unused file descriptor
fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
if (fd < 0) {
error = fd;
goto out_free_ep;
}
// create an anonymous file instance, stashing ep in its private_data so the eventpoll can be recovered from the file later;
// also register the operations for this file, eventpoll_fops
file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_fd;
}
// store the file in ep too, so the file can also be reached from the eventpoll.
// The eventpoll and its file now reference each other; after epoll_create a process owns one eventpoll, one fd and one file
ep->file = file;
// hook the fd up to the file. I haven't studied the VFS side of this yet, but presumably it is what lets
// later operations on the fd reach the file: e.g. an llseek on this fd would end up in the registered eventpoll_fops::noop_llseek
fd_install(fd, file);
return fd; // return the fd to the user process
out_free_fd:
put_unused_fd(fd);
out_free_ep:
ep_free(ep);
return error;
}
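From user space, this whole path is triggered by a single call; a minimal sketch (note that the legacy size argument must be positive but is otherwise ignored, while epoll_create1() exposes the flags that do_epoll_create() actually takes):
#include <stdio.h>
#include <sys/epoll.h>

int main(void)
{
	int epfd = epoll_create(1);               /* size must be > 0 but is ignored; ends up in do_epoll_create(0) */
	int epfd2 = epoll_create1(EPOLL_CLOEXEC); /* the flags variant maps straight onto do_epoll_create(flags) */

	printf("epfd = %d, epfd2 = %d\n", epfd, epfd2);
	return 0;
}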
While we're at it, let's see what creating the eventpoll involves:
static int ep_alloc(struct eventpoll **pep)
{
int error;
struct user_struct *user;
struct eventpoll *ep;
user = get_current_user(); // get the current user's info
error = -ENOMEM;
ep = kzalloc(sizeof(*ep), GFP_KERNEL); // allocate the memory
if (unlikely(!ep))
goto free_uid;
// initialize the locks
mutex_init(&ep->mtx);
rwlock_init(&ep->lock);
// initialize the wait queues
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
// initialize the ready list
INIT_LIST_HEAD(&ep->rdllist);
ep->rbr = RB_ROOT_CACHED; // initialize the red-black tree that stores the watched fds' info
ep->ovflist = EP_UNACTIVE_PTR;
ep->user = user; // keep the current user's info in the eventpoll
*pep = ep;
return 0;
free_uid:
free_uid(user);
return error;
}
That's epoll_create covered. Next comes epoll_ctl; we'll follow the main path of adding a descriptor,
i.e. the behavior of the EPOLL_CTL_ADD option.
4. epoll_ctl
Likewise, epoll_ctl is defined as a system call with SYSCALL_DEFINE4:
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
struct eventpoll *tep = NULL;
error = -EFAULT;
// copy in the event from user space, i.e. which conditions to watch on the fd (read, write, ...)
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
error = -EBADF;
f = fdget(epfd); // resolve the fd created in epoll_create (returned to user space, now passed back down) to its file
if (!f.file)
goto error_return;
/* Get the "struct file *" for the target file */
tf = fdget(fd); // get the file of the target (e.g. a socket)
if (!tf.file)
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
// if the target file doesn't implement the poll interface, bail out (epoll relies on calling that poll later; we'll hit the call in the epoll_wait analysis)
if (!file_can_poll(tf.file))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(&epds);
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
// passed the epoll fd in as its own target? can't do that, buddy
error = -EINVAL;
if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;
//......
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = f.file->private_data; // take out the eventpoll created earlier in epoll_create,
// i.e.: user space passes epfd down, the fd yields the file, and the file yields the eventpoll stored into it at creation time
//......
/*
* Try to lookup the file inside our RB tree, Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
epi = ep_find(ep, tf.file, fd); // look up the epitem for this (file, fd) in the red-black tree; on a first EPOLL_CTL_ADD it's generally absent, so this returns NULL
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= EPOLLERR | EPOLLHUP;
// create an epitem recording the target's fd and file plus the eventpoll, then insert it into the red-black tree
error = ep_insert(ep, &epds, tf.file, fd, full_check);
} else
error = -EEXIST;
if (full_check)
clear_tfile_check_list();
break;
//......
}
//......
return error;
}
So when epoll_ctl adds a watched descriptor, its main job is to create an epitem that stores the target's fd and file, plus the process's eventpoll.
(About that eventpoll pointer: does every epitem really need its own copy? With many fds on a 64-bit platform that's an extra 8 bytes of kernel memory per fd — a million watched fds would spend about 8 MB on these back-pointers — even though a process generally creates only one eventpoll, and its pointer is already stored in epfd's file (put there by epoll_create). So I'd think it could simply be fetched when needed via epfd, then file->private_data.)
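For reference, the user-space side of the EPOLL_CTL_ADD path we are tracing looks like this; a minimal sketch with error handling trimmed (watch_fd is just an illustrative helper name, and sockfd stands for any pollable descriptor):
#include <sys/epoll.h>

/* register sockfd with the epoll instance; a sketch, error handling trimmed */
static int watch_fd(int epfd, int sockfd)
{
	struct epoll_event ev = {
		.events = EPOLLIN,   /* copied into epi->event by ep_insert() */
		.data.fd = sockfd,   /* returned verbatim by epoll_wait() later */
	};
	/* kernel side: copy_from_user() -> ep_find() -> ep_insert() */
	return epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev);
}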
Next, how the epitem is created and inserted into the red-black tree, and what happens along the way:
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
int error, pwake = 0;
__poll_t revents;
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
lockdep_assert_irqs_enabled(); // lockdep debug assertion: warns if IRQs are disabled here; compiles to nothing when lockdep is off
// number of fds this user is already watching
user_watches = atomic_long_read(&ep->user->epoll_watches);
// already over the watch limit
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
// allocate an epitem from the slab cache set up at init time
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;
// initialize the epitem
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep; // store the eventpoll
ep_set_ffd(&epi->ffd, tfile, fd); // store the target's fd and file
epi->event = *event; // store the watched event mask
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error)
goto error_create_wakeup_source;
} else {
RCU_INIT_POINTER(epi->ws, NULL);
}
/* Initialize the poll table using the queue callback */
epq.epi = epi;
// register the callback: the function is stored into epq.pt._qproc
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function. Note that after
* this operation completes, the poll callback can start hitting
* the new item.
*/
revents = ep_item_poll(epi, &epq.pt, 1); // calls the target's poll; we'll trace the socket side below (it ends up calling back this ep_ptable_queue_proc())
//......
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
// add epi->fllink to tfile->f_ep_links, i.e. to the tail of the target file's f_ep_links list
list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);
/*
* Add the current item to the RB tree. All RB tree operations are
* protected by "mtx", and ep_insert() is called with "mtx" held.
*/
// insert the freshly created epitem into the red-black tree created by epoll_create (one per eventpoll)
ep_rbtree_insert(ep, epi);
//......
/* If the file is already "ready" we drop it inside the ready list */
// if the target already has events pending, there is no need to sleep and wait:
// put it straight onto the eventpoll's ready list, then wake_up(&ep->wq) wakes any process already sleeping in epoll_wait
if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq); // wakes any epoll_wait sleeper
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irq(&ep->lock);
// bump the count of watched fds by one
atomic_long_inc(&ep->user->epoll_watches);
return error;
}
This function mainly creates an epitem, calls the target's poll (the socket side then calls back ep_ptable_queue_proc()),
fills in the eventpoll plus the target's fd and file, and inserts the epitem into the red-black tree.
If the watched fd already has events pending, its info goes straight onto the eventpoll's ready list and any epoll_wait sleeper is woken.
Next let's look at ep_item_poll() and ep_ptable_queue_proc(); ep_poll_callback() we'll save for when we analyze
how the socket side notifies eventpoll after data arrives.
First, ep_item_poll():
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
int depth)
{
struct eventpoll *ep;
bool locked;
pt->_key = epi->event.events; // the mask set from user space: read/write/etc.
if (!is_file_epoll(epi->ffd.file)) // for an ordinary target fd we take this branch
return vfs_poll(epi->ffd.file, pt) & epi->event.events; // into vfs_poll()
//...... (the branch for nested epoll fds is omitted)
}
Now vfs_poll(), in include/linux/poll.h:
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
if (unlikely(!file->f_op->poll))
return DEFAULT_POLLMASK;
return file->f_op->poll(file, pt); // dispatch to the file's own poll, here the socket layer's; we'll follow TCP, the others should be similar
}
That lands us in the socket layer's poll, in net/socket.c:
// remember: wait carries eventpoll's callback, ep_ptable_queue_proc()
static __poll_t sock_poll(struct file *file, poll_table *wait)
{
struct socket *sock = file->private_data;
//......
return sock->ops->poll(file, sock, wait) | flag;
}
This in turn calls the TCP side's poll, tcp_poll(); the source is in net/ipv4/tcp.c:
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
__poll_t mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
int state;
sock_poll_wait(file, sock, wait); // this is the interesting part, let's step in
//......
return mask;
}
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
poll_table *p)
{
if (!poll_does_not_wait(p)) {
// note sock->wq.wait: it comes up again later when events arrive
poll_wait(filp, &sock->wq.wait, p); // step in once more
//......
}
}
Source: /include/linux/poll.h
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && p->_qproc && wait_address)
p->_qproc(filp, wait_address, p); // this calls back into eventpoll's ep_ptable_queue_proc(),
// handing it the socket's file and the socket's wait queue
}
Good. Now back in eventpoll.c, here is ep_ptable_queue_proc():
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, // whead is passed over from the socket side
poll_table *pt)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
// again, allocate an eppoll_entry for this watched fd
if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
//register the wakeup callback: ep_poll_callback() will do the waking in place of default_wake_function()
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); // ep_poll_callback is hooked into eppoll_entry::wait here;
//when the socket has data it wakes the whole whead queue via __wake_up(),
//and for our pwq->wait that means calling ep_poll_callback, which then goes on to wake the user process blocked in epoll_wait
pwq->whead = whead; // remember the socket's wait queue head
pwq->base = epi;
if (epi->event.events & EPOLLEXCLUSIVE)
add_wait_queue_exclusive(whead, &pwq->wait);
else
add_wait_queue(whead, &pwq->wait); // add the eppoll_entry's wait node onto the socket's wait queue
list_add_tail(&pwq->llink, &epi->pwqlist); // also link the eppoll_entry into the epitem's pwqlist so it can be retrieved later
epi->nwait++;
} else {
/* We have to signal that an error occurred */
epi->nwait = -1;
}
}
The work here boils down to: create an eppoll_entry, arm it with ep_poll_callback(), and register it on the socket's
event wait queue; when the socket later has events it wakes every node on that queue.
The eppoll_entry is also linked into the epitem, which lets epoll find the entry again and unhook it from the socket's queue when the fd is unregistered (e.g. on EPOLL_CTL_DEL or when the epoll fd is closed).
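The poll_table indirection above is the crux of the registration, so here is a small user-space mock of the handshake (all names are illustrative stand-ins, not kernel APIs): the poller plants a _qproc callback in a poll_table, the pollee blindly invokes it from its poll_wait(), and the callback receives the pollee's wait queue to hook onto:
#include <stdio.h>

/* Mock of the kernel's poll_table handshake; illustrative, not kernel code. */
typedef struct poll_table_s poll_table;
typedef void (*qproc_fn)(void *file, void *whead, poll_table *pt);

struct poll_table_s {
	qproc_fn _qproc; /* set by the poller, called by the pollee */
};

/* eventpoll side: what init_poll_funcptr() does */
static void init_poll_funcptr(poll_table *pt, qproc_fn qproc)
{
	pt->_qproc = qproc;
}

/* file side: what poll_wait() does - hand our wait queue to the poller */
static void poll_wait(void *file, void *whead, poll_table *pt)
{
	if (pt && pt->_qproc)
		pt->_qproc(file, whead, pt);
}

/* eventpoll's callback, standing in for ep_ptable_queue_proc() */
static void ep_ptable_queue_proc(void *file, void *whead, poll_table *pt)
{
	printf("hooked onto wait queue %p of file %p\n", whead, file);
}

int main(void)
{
	poll_table pt;
	int fake_file, fake_whead;

	init_poll_funcptr(&pt, ep_ptable_queue_proc); /* done in ep_insert() */
	poll_wait(&fake_file, &fake_whead, &pt);      /* tcp_poll() ends up here */
	return 0;
}
The point of the design: the file (socket) never needs to know who is polling it; it just offers up its wait queue, and whoever planted the _qproc decides what to hang on it.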
And with that, epoll_ctl is done; next, epoll_wait.
5. epoll_wait
Again, epoll_wait is defined as a system call with SYSCALL_DEFINE4:
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
return do_epoll_wait(epfd, events, maxevents, timeout);
}
Next, do_epoll_wait():
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout)
{
int error;
struct fd f;
struct eventpoll *ep;
//......
/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd); // resolve epfd to the eventpoll's file object
if (!f.file)
return -EBADF;
//......
ep = f.file->private_data;
// pull the eventpoll back out of file->private_data; remember, it was stored there in epoll_create
/* Time to fish for events ... */
// step in and take a look
error = ep_poll(ep, events, maxevents, timeout);
//......
}
Now for ep_poll():
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
// if a timeout was given, convert it
if (timeout > 0) {
struct timespec64 end_time = ep_set_mstimeout(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires;
*to = timespec64_to_ktime(end_time);
} else if (timeout == 0) {
timed_out = 1;
write_lock_irq(&ep->lock);
eavail = ep_events_available(ep); // check whether events are already pending: a non-empty eventpoll::rdllist means yes
write_unlock_irq(&ep->lock);
goto send_events;
// with timeout == 0 we jump straight to send_events: anything already pending is returned to user space
// right away, and timed_out = 1 guarantees we return immediately instead of looping back to fetch_events
}
fetch_events:
//......
/*
* We don't have any available event to return to the caller. We need
* to sleep here, and we will be woken by ep_poll_callback() when events
* become available.
*/
// nothing ready: schedule ourselves out and sleep for a while; when events arrive the socket side calls back ep_poll_callback(), which wakes this process
if (!waiter) {
waiter = true;
init_waitqueue_entry(&wait, current); // wrap the current task in the wait node
spin_lock_irq(&ep->wq.lock);
__add_wait_queue_exclusive(&ep->wq, &wait); // add the wait node onto the wait queue wq
spin_unlock_irq(&ep->wq.lock);
}
for (;;) {
set_current_state(TASK_INTERRUPTIBLE); // mark the task TASK_INTERRUPTIBLE, i.e. wakeable
//.......
eavail = ep_events_available(ep);
if (eavail) // if rdllist is non-empty by now, no need to sleep: break out
break;
//.......
// schedule ourselves out here; after waking we come back to the top of the for loop and leave via the if (eavail) check
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
timed_out = 1; // on timeout, just break out
break;
}
}
__set_current_state(TASK_RUNNING); // awake again: set ourselves runnable
send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
if (!res && eavail &&
!(res = ep_send_events(ep, events, maxevents)) && !timed_out) // copy events out to user space; if nothing was delivered and we haven't timed out, go fetch again
goto fetch_events;
if (waiter) {
spin_lock_irq(&ep->wq.lock);
__remove_wait_queue(&ep->wq, &wait); // take wait back off ep->wq
spin_unlock_irq(&ep->wq.lock);
}
return res;
}
The gist here: first check whether any event is already pending (is rdllist non-empty?). If so, return the
events and fds straight to user space; if not, schedule out and sleep until the socket side has an event and
wakes us through ep_poll_callback().
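Seen from user space, the three timeout regimes of ep_poll() map directly onto the epoll_wait() timeout argument; a minimal sketch (epfd is assumed to be a valid epoll fd with descriptors registered):
#include <sys/epoll.h>

/* sketch: the three timeout regimes of ep_poll(), as seen from user space */
static void wait_examples(int epfd)
{
	struct epoll_event evs[64];

	/* timeout == 0: timed_out = 1, rdllist checked once, returns immediately */
	epoll_wait(epfd, evs, 64, 0);

	/* timeout > 0: may sleep; woken early by ep_poll_callback(), or by
	 * schedule_hrtimeout_range() expiring after ~100 ms */
	epoll_wait(epfd, evs, 64, 100);

	/* timeout < 0: sleep with no timer; only ep_poll_callback() (or a
	 * signal) brings us back */
	epoll_wait(epfd, evs, 64, -1);
}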
OK. Before we look at how ep_send_events() hands events to user space, let's first see what the socket side
does when an event occurs: how ep_poll_callback() gets invoked and what it does. Then we'll come back to
ep_send_events(). On the socket side we'll again pick TCP over IPv4.
For a TCP socket, when data arrives at the NIC an interrupt fires; after the link-layer and IP headers are processed
the packet reaches TCP, and once the TCP header is handled sock_def_readable() is called. Let's look at that function directly.
// Source: net/core/sock.c
static void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
// this wakes every node on the socket's wait queue, i.e. the processes watching this socket;
// for epoll these are the entries added back in epoll_ctl when the descriptor was registered
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
EPOLLRDNORM | EPOLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
wake_up_interruptible_sync_poll() is implemented by __wake_up_common() in /kernel/sched/wait.c.
For each node on the queue, if the node supplies its own wake function, that is called first and does the actual waking;
if not, the default default_wake_function() from kernel/sched/core.c is used.
eventpoll supplies its own: the ep_poll_callback() we've mentioned so many times.
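That dispatch logic is easy to mock in user space. In the sketch below (invented types and names, not kernel code), each queue entry carries its own func pointer; waking the queue just calls each entry's func, which is exactly how the socket ends up inside ep_poll_callback() without knowing anything about epoll:
#include <stdio.h>

/* User-space mock of a wait queue with per-entry wake functions; illustrative only. */
struct wait_entry {
	int (*func)(struct wait_entry *we); /* like wait_queue_entry_t::func */
	struct wait_entry *next;
	const char *name;
};

/* like default_wake_function(): just "wake the task" */
static int default_wake(struct wait_entry *we)
{
	printf("%s: default wake\n", we->name);
	return 1;
}

/* like ep_poll_callback(): do epoll's bookkeeping, then wake the waiter */
static int epoll_wake(struct wait_entry *we)
{
	printf("%s: add epitem to rdllist, then wake_up(&ep->wq)\n", we->name);
	return 1;
}

/* like __wake_up_common(): walk the queue and call each entry's func */
static void wake_up_all(struct wait_entry *head)
{
	for (struct wait_entry *we = head; we; we = we->next)
		we->func(we);
}

int main(void)
{
	struct wait_entry epoll_waiter = { epoll_wake, NULL, "epoll entry" };
	struct wait_entry plain_waiter = { default_wake, &epoll_waiter, "blocking read" };

	wake_up_all(&plain_waiter); /* what sock_def_readable() triggers */
	return 0;
}
Now, the real ep_poll_callback():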
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
struct epitem *epi = ep_item_from_wait(wait); // recover the epitem from the wait entry
struct eventpoll *ep = epi->ep; // and the owning eventpoll from the epitem
__poll_t pollflags = key_to_poll(key);
unsigned long flags;
int ewake = 0;
//.......
/* If this file is already in the ready list we exit soon */
// link epitem::rdllink onto the ready list rdllist
if (!ep_is_linked(epi) &&
list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
if (waitqueue_active(&ep->wq)) {
if ((epi->event.events & EPOLLEXCLUSIVE) &&
!(pollflags & POLLFREE)) {
switch (pollflags & EPOLLINOUT_BITS) {
case EPOLLIN:
if (epi->event.events & EPOLLIN)
ewake = 1;
break;
case EPOLLOUT:
if (epi->event.events & EPOLLOUT)
ewake = 1;
break;
case 0:
ewake = 1;
break;
}
}
wake_up(&ep->wq); // wake the user process
}
//......
return ewake;
}
This function does two main things: it adds the socket's epitem to the eventpoll's rdllist, and,
if the corresponding user process is still asleep, wakes it via wake_up().
At this point eventpoll::rdllist holds the socket's event info and the user process is awake (strictly speaking,
it is still running in kernel mode), so we can return to ep_poll(), which brings us to ep_send_events().
Read on:
static int ep_send_events(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
struct ep_send_events_data esed;
esed.maxevents = maxevents;
esed.events = events; // keep the address of the user-space epoll_event array
ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); // step in
return esed.res;
}
Next, ep_scan_ready_list():
/**
* ep_scan_ready_list - Scans the ready list in a way that makes possible for
* the scan code, to call f_op->poll(). Also allows for
* O(NumReady) performance.
*/
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
__poll_t (*sproc)(struct eventpoll *,
struct list_head *, void *),
void *priv, int depth, bool ep_locked)
// priv carries the events pointer passed in from user space
{
__poll_t res;
struct epitem *epi, *nepi;
LIST_HEAD(txlist);
//......
/*
* Steal the ready list, and re-init the original one to the
* empty list. Also, set ep->ovflist to NULL so that events
* happening while looping w/out locks, are not lost. We cannot
* have the poll callback to queue directly on ep->rdllist,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
// splice rdllist over to txlist, resetting rdllist to an empty list
write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, &txlist);
WRITE_ONCE(ep->ovflist, NULL);
write_unlock_irq(&ep->lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv); // the main event: this is ep_send_events_proc(ep, &txlist, priv)
//......
return res;
}
Now ep_send_events_proc():
static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
struct ep_send_events_data *esed = priv; // cast the pointer back to ep_send_events_data, which holds the user-space epoll_event address
__poll_t revents;
struct epitem *epi, *tmp;
struct epoll_event __user *uevent = esed->events; // the user-space epoll_event pointer; everything below revolves around it
struct wakeup_source *ws;
poll_table pt;
//.......
// walk the whole (detached) ready list
list_for_each_entry_safe(epi, tmp, head, rdllink) {
if (esed->res >= esed->maxevents)
break;
//.......
// store the events and the user data (typically the fd) into the user-space epoll_event
if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
ep_pm_stay_awake(epi);
if (!esed->res)
esed->res = -EFAULT;
return 0;
}
//......
return 0;
}
The work here is to scan the whole ready list (not the current eventpoll::rdllist, which is already empty: it was
spliced over to txlist in ep_scan_ready_list() and passed in), store each event and its user data into the user-space epoll_event array, and return.
And with that we return to user space (epoll_wait returns) and hand things over to the application.
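One practical upshot of the __put_user() pair above: epoll_wait() returns epi->event.data byte-for-byte as registered, which is why user space usually stores an fd (or a pointer) there. A closing sketch of the full round trip (event_loop is an illustrative helper; error handling trimmed):
#include <stdio.h>
#include <sys/epoll.h>

/* Sketch of the round trip: the data registered with epoll_ctl() comes back
 * untouched from epoll_wait(), copied out by ep_send_events_proc(). */
static void event_loop(int epfd, int sockfd)
{
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = sockfd };
	struct epoll_event out[64];

	epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev); /* ep_insert() stores ev */

	int n = epoll_wait(epfd, out, 64, -1);       /* ep_poll() + ep_send_events() */
	for (int i = 0; i < n; i++)
		printf("fd %d ready, events 0x%x\n",
		       out[i].data.fd, out[i].events);  /* epi->event.data, verbatim */
}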