epoll源码深度剖析

1.基本数据结构

epoll 的内核实现主要依赖三个基本数据结构，分别是 eventpoll、epitem 和 eppoll_entry。

1.1 eventpoll

我们先看一下 eventpoll 这个数据结构，这个数据结构是我们在调用 epoll_create 之后内核侧创建的一个句柄，表示了一个 epoll 实例。后续如果我们再调用 epoll_ctl 和 epoll_wait 等，都是对这个 eventpoll 数据进行操作，这部分数据会被保存在 epoll_create 创建的匿名文件 file 的 private_data 字段中。

 1 /*
 2  * This structure is stored inside the "private_data" member of the file
 3  * structure and represents the main data structure for the eventpoll
 4  * interface.
 5  */
 6 struct eventpoll {
 7     /* Protect the access to this structure */
 8     spinlock_t lock;
 9 
10     /*
11      * This mutex is used to ensure that files are not removed
12      * while epoll is using them. This is held during the event
13      * collection loop, the file cleanup path, the epoll file exit
14      * code and the ctl operations.
15      */
16     struct mutex mtx;
17 
18     /* Wait queue used by sys_epoll_wait() */
19     //这个队列里存放的是执行epoll_wait从而等待的进程队列
20     wait_queue_head_t wq;
21 
22     /* Wait queue used by file->poll() */
23     //这个队列里存放的是该eventpoll作为poll对象的一个实例，加入到等待的队列
24     //这是因为eventpoll本身也是一个file, 所以也会有poll操作
25     wait_queue_head_t poll_wait;
26 
27     /* List of ready file descriptors */
28     //这里存放的是事件就绪的fd列表，链表的每个元素是下面的epitem
29     struct list_head rdllist;
30 
31     /* RB tree root used to store monitored fd structs */
32     //这是用来快速查找fd的红黑树
33     struct rb_root_cached rbr;
34 
35     /*
36      * This is a single linked list that chains all the "struct epitem" that
37      * happened while transferring ready events to userspace w/out
38      * holding ->lock.
39      */
40     struct epitem *ovflist;
41 
42     /* wakeup_source used when ep_scan_ready_list is running */
43     struct wakeup_source *ws;
44 
45     /* The user that created the eventpoll descriptor */
46     struct user_struct *user;
47 
48     //这是eventpoll对应的匿名文件，充分体现了Linux下一切皆文件的思想
49     struct file *file;
50 
51     /* used to optimize loop detection check */
52     int visited;
53     struct list_head visited_list_link;
54 
55 #ifdef CONFIG_NET_RX_BUSY_POLL
56     /* used to track busy poll napi_id */
57     unsigned int napi_id;
58 #endif
59 };

这个 epitem 结构是干什么用的呢？

每当我们调用 epoll_ctl 增加一个 fd 时，内核就会为我们创建出一个 epitem 实例，并且把这个实例作为红黑树的一个子节点，增加到 eventpoll 结构体中的红黑树中，对应的字段是 rbr。这之后，查找每一个 fd 上是否有事件发生都是通过红黑树上的 epitem 来操作。

 1 /*
 2  * Each file descriptor added to the eventpoll interface will
 3  * have an entry of this type linked to the "rbr" RB tree.
 4  * Avoid increasing the size of this struct, there can be many thousands
 5  * of these on a server and we do not want this to take another cache line.
 6  */
 7 struct epitem {
 8     union {
 9         /* RB tree node links this structure to the eventpoll RB tree */
10         struct rb_node rbn;
11         /* Used to free the struct epitem */
12         struct rcu_head rcu;
13     };
14 
15     /* List header used to link this structure to the eventpoll ready list */
16     //将这个epitem连接到eventpoll 里面的rdllist的list指针
17     struct list_head rdllink;
18 
19     /*
20      * Works together "struct eventpoll"->ovflist in keeping the
21      * single linked chain of items.
22      */
23     struct epitem *next;
24 
25     /* The file descriptor information this item refers to */
26     //epoll监听的fd
27     struct epoll_filefd ffd;
28 
29     /* Number of active wait queue attached to poll operations */
30     //挂接到poll操作上的活跃等待队列个数（即pwqlist链表中eppoll_entry的个数）
31     int nwait;
32 
33     /* List containing poll wait queues */
34     struct list_head pwqlist;
35 
36     /* The "container" of this item */
37     //当前epitem所属的eventpoll
38     struct eventpoll *ep;
39 
40     /* List header used to link this item to the "struct file" items list */
41     struct list_head fllink;
42 
43     /* wakeup_source used when EPOLLWAKEUP is set */
44     struct wakeup_source __rcu *ws;
45 
46     /* The structure that describe the interested events and the source fd */
47     struct epoll_event event;
48 };

每次当一个 fd 关联到一个 epoll 实例，就会有一个 eppoll_entry 产生。eppoll_entry 的结构如下：

 1 /* Wait structure used by the poll hooks */
 2 struct eppoll_entry {
 3     /* List header used to link this structure to the "struct epitem" */
 4     struct list_head llink;
 5 
 6     /* The "base" pointer is set to the container "struct epitem" */
 7     struct epitem *base;
 8 
 9     /*
10      * Wait queue item that will be linked to the target file wait
11      * queue head.
12      */
13     wait_queue_entry_t wait;
14 
15     /* The wait queue head that linked the "wait" wait queue item */
16     wait_queue_head_t *whead;
17 };

epoll_create：

我们在使用 epoll 的时候，首先会调用 epoll_create 来创建一个 epoll 实例。这个函数是如何工作的呢?

首先，epoll_create 会对传入的 flags 参数做简单的验证。

1 /* Check the EPOLL_* constant for consistency.  */
2 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
3 
4 if (flags & ~EPOLL_CLOEXEC)
5     return -EINVAL;

接下来，内核申请分配 eventpoll 需要的内存空间

1 /* Create the internal data structure ("struct eventpoll").
2 */
3 error = ep_alloc(&ep);
4 if (error < 0)
5   return error;

再接下来，epoll_create 为 epoll 实例分配了匿名文件和文件描述字，其中 fd 是文件描述字，file 是一个匿名文件。这里充分体现了 UNIX 下一切都是文件的思想。

注意，eventpoll 的实例会保存一份匿名文件的引用，通过调用 fd_install 函数将匿名文件和文件描述字完成了绑定。

这里还有一个特别需要注意的地方，在调用 anon_inode_getfile 的时候，epoll_create 将 eventpoll 作为匿名文件 file 的 private_data 保存了起来，这样，在之后通过 epoll 实例的文件描述字来查找时，就可以快速地定位到 eventpoll 对象了。最后，这个文件描述字作为 epoll 的文件句柄，被返回给 epoll_create 的调用者。

 1 /*
 2  * Creates all the items needed to setup an eventpoll file. That is,
 3  * a file structure and a free file descriptor.
 4  */
 5 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
 6 if (fd < 0) {
 7     error = fd;
 8     goto out_free_ep;
 9 }
10 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
11              O_RDWR | (flags & O_CLOEXEC));
12 if (IS_ERR(file)) {
13     error = PTR_ERR(file);
14     goto out_free_fd;
15 }
16 ep->file = file;
17 fd_install(fd, file);
18 return fd;

epoll_ctl

  1 /*
  2  * @epfd: epoll_create创建的用于eventpoll的fd
  3  * @op: 控制的命令类型
  4  * @fd: 要操作的文件描述符
  5  * @event:与fd相关的对象.
  6  */
  7 SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
  8         struct epoll_event __user *, event)
  9 {
 10     int error;
 11     struct file *file, *tfile;
 12     struct eventpoll *ep;
 13     struct epitem *epi;
 14     struct epoll_event epds;
 15  
 16     error = -EFAULT;
 17     /*
 18      * 检查是否需要从用户空间拷贝event参数,如果需要拷贝,则调用
 19      * copy_from_user来拷贝.
 20      */
 21     if (ep_op_has_event(op) &&
 22      copy_from_user(&epds, event, sizeof(struct epoll_event)))
 23         goto error_return;
 24  
 25     /* Get the "struct file *" for the eventpoll file */
 26     error = -EBADF;
 27     /*
 28      * 获取epfd对应的file实例
 29      */
 30     file = fget(epfd);
 31     if (!file)
 32         goto error_return;
 33  
 34     /* Get the "struct file *" for the target file */
 35     /*
 36      * 获取要操作的文件描述符对应的file实例
 37      */
 38     tfile = fget(fd);
 39     if (!tfile)
 40         goto error_fput;
 41  
 42     /* The target file descriptor must support poll */
 43     /*
 44      * 检查fd对应的文件是否支持poll
 45      */
 46     error = -EPERM;
 47     if (!tfile->f_op || !tfile->f_op->poll)
 48         goto error_tgt_fput;
 49  
 50     /*
 51      * We have to check that the file structure underneath the file descriptor
 52      * the user passed to us _is_ an eventpoll file. And also we do not permit
 53      * adding an epoll file descriptor inside itself.
 54      */
 55     error = -EINVAL;
 56     /*
 57      * 检查epfd对应的文件是否是一个eventpoll文件，并且不允许把epoll文件添加到它自身
 58      */
 59     if (file == tfile || !is_file_epoll(file))
 60         goto error_tgt_fput;
 61  
 62     /*
 63      * At this point it is safe to assume that the "private_data" contains
 64      * our own data structure.
 65      */
 66     /*
 67      * 获取eventpoll文件中的私有数据，该数据是在epoll_create中创建的。
 68      */
 69     ep = file->private_data;
 70  
 71     mutex_lock(&ep->mtx);
 72  
 73     /*
 74      * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
 75      * above, we can be sure to be able to use the item looked up by
 76      * ep_find() till we release the mutex.
 77      */
 78      /*
 79       * 在eventpoll中存储文件描述符信息的红黑树中查找指定的fd对应的epitem实例
 80       */
 81     epi = ep_find(ep, tfile, fd);
 82  
 83     error = -EINVAL;
 84     switch (op) {
 85     case EPOLL_CTL_ADD:
 86         /*
 87          * 如果要添加的fd不存在,则调用ep_insert()插入到红黑树中,
 88          * 如果已存在,则返回EEXIST错误.
 89          */
 90         if (!epi) {
 91             epds.events |= POLLERR | POLLHUP;
 92             error = ep_insert(ep, &epds, tfile, fd);
 93         } else
 94             error = -EEXIST;
 95         break;
 96     case EPOLL_CTL_DEL:
 97         if (epi)
 98             error = ep_remove(ep, epi);
 99         else
100             error = -ENOENT;
101         break;
102     case EPOLL_CTL_MOD:
103         if (epi) {
104             epds.events |= POLLERR | POLLHUP;
105             error = ep_modify(ep, epi, &epds);
106         } else
107             error = -ENOENT;
108         break;
109     }
110     mutex_unlock(&ep->mtx);
111  
112 error_tgt_fput:
113     fput(tfile);
114 error_fput:
115     fput(file);
116 error_return:
117  
118     return error;
119 }

查找 epoll 实例

首先，epoll_ctl 函数通过 epoll 实例句柄来获得对应的匿名文件，这一点很好理解，UNIX 下一切都是文件，epoll 的实例也是一个匿名文件。

1 //获得epoll实例对应的匿名文件
2 f = fdget(epfd);
3 if (!f.file)
4     goto error_return;

接下来，获得添加的套接字对应的文件，这里 tf 表示的是 target file，即待处理的目标文件。

1 /* Get the "struct file *" for the target file */
2 //获得真正的文件，如监听套接字、读写套接字
3 tf = fdget(fd);
4 if (!tf.file)
5     goto error_fput;

再接下来，进行了一系列的数据验证，以保证用户传入的参数是合法的，比如 epfd 真的是一个 epoll 实例句柄，而不是一个普通文件描述符。

1 /* The target file descriptor must support poll */
2 //如果不支持poll，那么该文件描述字是无效的
3 error = -EPERM;
4 if (!tf.file->f_op->poll)
5     goto error_tgt_fput;
6 ...

如果获得了一个真正的 epoll 实例句柄，就可以通过 private_data 获取之前创建的 eventpoll 实例了。

1 /*
2  * At this point it is safe to assume that the "private_data" contains
3  * our own data structure.
4  */
5 ep = f.file->private_data;

红黑树查找

接下来 epoll_ctl 通过目标文件和对应描述字，在红黑树中查找是否存在该套接字，这也是 epoll 为什么高效的地方。红黑树（RB-tree）是一种常见的数据结构，这里 eventpoll 通过红黑树跟踪了当前监听的所有文件描述字，而这棵树的根就保存在 eventpoll 数据结构中。

1 /* RB tree root used to store monitored fd structs */
2 struct rb_root_cached rbr;

对于每个被监听的文件描述字，都有一个 epitem 与之对应，epitem 作为红黑树中的节点就保存在红黑树中。

1 /*
2  * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
3  * above, we can be sure to be able to use the item looked up by
4  * ep_find() till we release the mutex.
5  */
6 epi = ep_find(ep, tf.file, fd);

红黑树是一棵二叉树，作为二叉树上的节点，epitem 必须提供比较能力，以便可以按大小顺序构建出一棵有序的二叉树。其排序能力是依靠 epoll_filefd 结构体来完成的，epoll_filefd 可以简单理解为需要监听的文件描述字，它对应到二叉树上的节点。

可以看到这个还是比较好理解的：先按照文件（struct file）的地址大小排序；如果两者的文件地址相同，就按照文件描述字的大小来排序。

 1 struct epoll_filefd {
 2   struct file *file; // pointer to the target file struct corresponding to the fd
 3   int fd; // target file descriptor number
 4 } __packed;
 5 
 6 /* Compare RB tree keys */
 7 static inline int ep_cmp_ffd(struct epoll_filefd *p1,
 8                             struct epoll_filefd *p2)
 9 {
10   return (p1->file > p2->file ? +1:
11          (p1->file < p2->file ? -1 : p1->fd - p2->fd));
12 }

在进行完红黑树查找之后，如果发现是一个 ADD 操作，并且在树中没有找到对应的二叉树节点，就会调用 ep_insert 进行二叉树节点的增加。

1 case EPOLL_CTL_ADD:
2     if (!epi) {
3         epds.events |= POLLERR | POLLHUP;
4         error = ep_insert(ep, &epds, tf.file, fd, full_check);
5     } else
6         error = -EEXIST;
7     if (full_check)
8         clear_tfile_check_list();
9     break;

ep_insert:

  1 /*
  2  * Must be called with "mtx" held.
  3  */
  4 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
  5          struct file *tfile, int fd)
  6 {
  7     int error, revents, pwake = 0;
  8     unsigned long flags;
  9     struct epitem *epi;
 10     struct ep_pqueue epq;
 11  
 12     /*
 13      * 检查epoll监视的文件描述符的个数是否超过max_user_watches,
 14      * max_user_watches用来存储每个用户使用epoll可以监视的文件
 15      * 描述符个数
 16      */
 17     if (unlikely(atomic_read(&ep->user->epoll_watches) >=
 18          max_user_watches))
 19         return -ENOSPC;
 20     /*
 21      * 每个加入到epoll中的文件都会附加到一个epitem实例中，
 22      * 分配当前文件对应的epitem实例。
 23      */
 24     if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
 25         return -ENOMEM;
 26  
 27     /*
 28      * 初始化新分配的epitem实例
 29      */
 30     INIT_LIST_HEAD(&epi->rdllink);
 31     INIT_LIST_HEAD(&epi->fllink);
 32     INIT_LIST_HEAD(&epi->pwqlist);
 33     epi->ep = ep;
 34     ep_set_ffd(&epi->ffd, tfile, fd);
 35     epi->event = *event;
 36     epi->nwait = 0;
 37     epi->next = EP_UNACTIVE_PTR;
 38  
 39     /* Initialize the poll table using the queue callback */
 40     epq.epi = epi;
 41     init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
 42  
 43     /*
 44      * 如果fd是套接字，f_op为socket_file_ops，poll函数是
 45      * sock_poll()。如果是TCP套接字的话，进而会调用
 46      * 到tcp_poll()函数。此处调用poll函数查看当前
 47      * 文件描述符的状态，存储在revents中。
 48      * 在poll的处理函数(tcp_poll())中，会调用sock_poll_wait()，
 49      * 在sock_poll_wait()中会调用到epq.pt.qproc指向的函数，
 50      * 也就是ep_ptable_queue_proc()。
 51      */
 52     revents = tfile->f_op->poll(tfile, &epq.pt);
 53  
 54     /*
 55      * ep_ptable_queue_proc()中如果分配内存失败时，会
 56      * 将nwait置为-1。
 57      */
 58     error = -ENOMEM;
 59     if (epi->nwait < 0)
 60         goto error_unregister;
 61  
 62     /* Add the current item to the list of active epoll hook for this file */
 63     spin_lock(&tfile->f_lock);
 64     /*
 65      * 将当前的epitem加入tfile的f_ep_links链表中，
 66      * 在从epoll中移除文件时，用于清理文件对应的
 67      * epitem实例。
 68      */
 69     list_add_tail(&epi->fllink, &tfile->f_ep_links);
 70     spin_unlock(&tfile->f_lock);
 71  
 72     /*
 73      * 将当前的epitem加入到存储监视的所有文件的红黑树中.
 74      */
 75     ep_rbtree_insert(ep, epi);
 76  
 77     /* We have to drop the new item inside our item list to keep track of it */
 78     spin_lock_irqsave(&ep->lock, flags);
 79  
 80     /*
 81      * 如果要监视的文件状态已经就绪并且还没有加入到就绪队列中,则将当前的
 82      * epitem加入到就绪队列中.如果有进程正在等待该文件的状态就绪,则
 83      * 唤醒一个等待的进程.
 84      */
 85     if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
 86         list_add_tail(&epi->rdllink, &ep->rdllist);
 87  
 88         /* Notify waiting tasks that events are available */
 89         /*
 90          * 如果有进程正在等待文件的状态就绪，也就是
 91          * 调用epoll_wait睡眠的进程正在等待，则唤醒一个
 92          * 等待进程。
 93          */
 94         if (waitqueue_active(&ep->wq))
 95             wake_up_locked(&ep->wq);
 96         /*
 97          * 如果有进程等待eventpoll文件本身的事件就绪，
 98          * 则增加临时变量pwake的值，pwake的值不为0时，
 99          * 在释放lock后，会唤醒等待进程。
100          */
101         if (waitqueue_active(&ep->poll_wait))
102             pwake++;
103     }
104  
105     spin_unlock_irqrestore(&ep->lock, flags);
106  
107     /*
108      * 增加eventpoll监视的文件数量。
109      */
110     atomic_inc(&ep->user->epoll_watches);
111  
112     /* We have to call this outside the lock */
113     /*
114      * 唤醒等待eventpoll文件状态就绪的进程
115      */
116  
117     if (pwake)
118         ep_poll_safewake(&ep->poll_wait);
119  
120     return 0;
121  
122 error_unregister:
123     ep_unregister_pollwait(ep, epi);
124  
125     /*
126      * We need to do this because an event could have been arrived on some
127      * allocated wait queue. Note that we don't care about the ep->ovflist
128      * list, since that is used/cleaned only inside a section bound by "mtx".
129      * And ep_insert() is called with "mtx" held.
130      */
131     spin_lock_irqsave(&ep->lock, flags);
132     if (ep_is_linked(&epi->rdllink))
133         list_del_init(&epi->rdllink);
134     spin_unlock_irqrestore(&ep->lock, flags);
135  
136     kmem_cache_free(epi_cache, epi);
137  
138     return error;
139 }

ep_insert 首先判断当前监控的文件数是否超过了 /proc/sys/fs/epoll/max_user_watches 的预设最大值，如果超过了则直接返回错误。

1 user_watches = atomic_long_read(&ep->user->epoll_watches);
2 if (unlikely(user_watches >= max_user_watches))
3     return -ENOSPC;

接下来是分配资源和初始化动作。

 1 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
 2         return -ENOMEM;
 3 
 4     /* Item initialization follow here ... */
 5     INIT_LIST_HEAD(&epi->rdllink);
 6     INIT_LIST_HEAD(&epi->fllink);
 7     INIT_LIST_HEAD(&epi->pwqlist);
 8     epi->ep = ep;
 9     ep_set_ffd(&epi->ffd, tfile, fd);
10     epi->event = *event;
11     epi->nwait = 0;
12     epi->next = EP_UNACTIVE_PTR;

再接下来的事情非常重要，ep_insert 会为加入的每个文件描述字设置回调函数。这个回调函数是通过函数 ep_ptable_queue_proc 来进行设置的。这个回调函数是干什么的呢？其实，对应的文件描述字上如果有事件发生，就会调用这个函数，比如套接字缓冲区有数据了，就会回调这个函数。这个函数就是 ep_poll_callback。这里你会发现，原来内核设计也是充满了事件回调的原理。

 1 /*
 2  * This is the callback that is used to add our wait queue to the
 3  * target file wakeup lists.
 4  */
 5 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,poll_table *pt)
 6 {
 7     struct epitem *epi = ep_item_from_epqueue(pt);
 8     struct eppoll_entry *pwq;
 9 
10     if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
11         init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
12         pwq->whead = whead;
13         pwq->base = epi;
14         if (epi->event.events & EPOLLEXCLUSIVE)
15             add_wait_queue_exclusive(whead, &pwq->wait);
16         else
17             add_wait_queue(whead, &pwq->wait);
18         list_add_tail(&pwq->llink, &epi->pwqlist);
19         epi->nwait++;
20     } else {
21         /* We have to signal that an error occurred */
22         epi->nwait = -1;
23     }
24 }

ep_poll_callback

ep_poll_callback 函数的作用非常重要，它将内核事件真正地和 epoll 对象联系了起来。它又是怎么实现的呢？首先，通过这个文件的 wait_queue_entry_t 对象找到对应的 epitem 对象，因为 eppoll_entry 对象里保存了 wait_queue_entry_t，根据 wait_queue_entry_t 这个对象的地址就可以简单计算出 eppoll_entry 对象的地址，从而可以获得 epitem 对象的地址。这部分工作在 ep_item_from_wait 函数中完成。一旦获得 epitem 对象，就可以寻迹找到 eventpoll 实例。

  1 /*
  2   * 如果文件类型支持epoll并且有事件发生，发生的事件通过
  3   * 参数key来传送，参见tcp_prequeue()函数中对wake_up_interruptible_poll()
  4   * 的调用。
  5   * @wait: 调用ep_ptable_queue_proc()加入到文件中的唤醒队列时分配的
  6   * eppoll_entry实例的wait成员的地址
  7   * @mode:该参数在回调函数ep_poll_callback()中没有使用，其值为进程
  8   * 睡眠时的状态
  9   * @sync: 唤醒等待进程的标志
 10   */
 11 static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
 12 {
 13     int pwake = 0;
 14     unsigned long flags;
 15     struct epitem *epi = ep_item_from_wait(wait);
 16     struct eventpoll *ep = epi->ep;
 17  
 18     spin_lock_irqsave(&ep->lock, flags);
 19  
 20     /*
 21      * If the event mask does not contain any poll(2) event, we consider the
 22      * descriptor to be disabled. This condition is likely the effect of the
 23      * EPOLLONESHOT bit that disables the descriptor when an event is received,
 24      * until the next EPOLL_CTL_MOD will be issued.
 25      */
 26     /*
 27      * epi->event.events中存储的是用户空间关心的事件，如果该成员
 28      * 没有包含任何poll事件，则跳转到out_unlock处处理
 29      */
 30     if (!(epi->event.events & ~EP_PRIVATE_BITS))
 31         goto out_unlock;
 32  
 33     /*
 34      * Check the events coming with the callback. At this stage, not
 35      * every device reports the events in the "key" parameter of the
 36      * callback. We need to be able to handle both cases here, hence the
 37      * test for "key" != NULL before the event match test.
 38      */
 39     /*
 40      * 如果key不为NULL(即回调携带了事件掩码)，但其中不包含用户
 41      * 关心的事件，则跳转到out_unlock处处理。并非所有设备都会
 42      * 通过key上报事件，所以key为NULL时不做这项过滤
 43      */
 44     if (key && !((unsigned long) key & epi->event.events))
 45         goto out_unlock;
 46  
 47     /*
 48      * If we are trasfering events to userspace, we can hold no locks
 49      * (because we're accessing user memory, and because of linux f_op->poll()
 50      * semantics). All the events that happens during that period of time are
 51      * chained in ep->ovflist and requeued later on.
 52      */
 53     /* 
 54      * ep_scan_ready_list()是向用户空间传递事件的处理函数，
 55      * ep_scan_ready_list()函数执行时会将ovflist链表中的元素
 56      * 暂存到一个临时变量中，然后将ovflist成员置为NULL，
 57      * 而EP_UNACTIVE_PTR的定义如下:
 58      * #define EP_UNACTIVE_PTR ((void *) -1L)
 59      * 因此(ep->ovflist != EP_UNACTIVE_PTR)成立时，正在向用户空间
 60      * 传递事件。
 61      * 如果当前正在向用户空间传递事件，则将
 62      * 当前的事件对应的epitem实例加入到ovflist链表中。
 63      */
 64     if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
 65         /*
 66          * 如果epi->next不等于EP_UNACTIVE_PTR，则说明已经
 67          * 添加到ovflist链表中，就不用再添加了
 68          */
 69         if (epi->next == EP_UNACTIVE_PTR) {
 70             epi->next = ep->ovflist;
 71             ep->ovflist = epi;
 72         }
 73         goto out_unlock;
 74     }
 75  
 76     /* If this file is already in the ready list we exit soon */
 77     /*
 78      * 如果当前没有在向用户空间传递事件，用户
 79      * 关心的事件已经发生，并且还没有加入到就绪
 80      * 队列中，则将当前的epitem实例加入到就绪队列中。
 81      */
 82     if (!ep_is_linked(&epi->rdllink))
 83         list_add_tail(&epi->rdllink, &ep->rdllist);
 84  
 85     /*
 86      * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 87      * wait list.
 88      */
 89     /*
 90      * 唤醒调用epoll_wait()函数时睡眠的进程。
 91      */
 92     if (waitqueue_active(&ep->wq))
 93         wake_up_locked(&ep->wq);
 94     /*
 95      * 唤醒等待eventpoll文件状态就绪的进程
 96      */
 97     if (waitqueue_active(&ep->poll_wait))
 98         pwake++;
 99  
100 out_unlock:
101     spin_unlock_irqrestore(&ep->lock, flags);
102  
103     /* We have to call this outside the lock */
104     /*
105      * 唤醒等待eventpoll文件的状态就绪的进程
106      */
107     if (pwake)
108         ep_poll_safewake(&ep->poll_wait);
109  
110     return 1;
111 
112 }

epoll源码深度剖析

猜你喜欢