大家都知道epoll的效率比poll和select高很多,原因在于:内核在实现时为epoll设置了一个文件系统,并且使用红黑树来增删改查struct epitem;当被监控的文件描述符有数据时,通过回调将对应的struct epitem挂到struct eventpoll的就绪队列中。这样就不需要将整个fd集合拷贝到用户态、再轮询查找就绪的fd,因此效率高。
这个文件系统由eventpoll_init来初始化:先注册文件系统eventpoll_fs_type(名字为eventpollfs),再调用kern_mount将该文件系统安装上,返回值保存在全局变量eventpoll_mnt中。基本原理可以参考《linux文件系统-文件系统的安装与拆卸》。
epoll的系统调用:
epoll_create:创建struct eventpoll
epoll_ctl:增删改事件
epoll_wait:等待就绪事件
先看个图:
eventpoll_init:
/*
 * Initialization routine for eventpoll (marked __init): registers the
 * eventpoll_fs_type filesystem and mounts it with kern_mount(), keeping the
 * resulting mount in eventpoll_mnt. Either failure ends in panic().
 * NOTE: part of the body is elided ("......") in this excerpt.
 */
static int __init eventpoll_init(void)
{
......
/*
* Register the virtual file system that will be the source of inodes
* for the eventpoll files
*/
error = register_filesystem(&eventpoll_fs_type);// register the filesystem
if (error)
goto epanic;
/* Mount the above commented virtual file system */
eventpoll_mnt = kern_mount(&eventpoll_fs_type);// mount the filesystem
error = PTR_ERR(eventpoll_mnt);
if (IS_ERR(eventpoll_mnt))
goto epanic;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
current));
return 0;
epanic:
panic("eventpoll_init() failed\n");
}
sys_epoll_create:
/*
* It opens an eventpoll file descriptor by suggesting a storage of "size"
* file descriptors. The size parameter is just an hint about how to size
* data structures. It won't prevent the user to store more than "size"
* file descriptors inside the epoll interface. It is the kernel part of
* the userspace epoll_create(2).
*/
asmlinkage long sys_epoll_create(int size)
{
int error, fd;
struct inode *inode;
struct file *file;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
current, size));
/* Sanity check on the size parameter */
error = -EINVAL;
if (size <= 0)//传入参数检查,必须大于0
goto eexit_1;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure, and inode and a free file descriptor.
*/
error = ep_getfd(&fd, &inode, &file);//创建file、inode、dentry,并相互关联
if (error)
goto eexit_1;
/* Setup the file internal data structure ( "struct eventpoll" ) */
/*创建一个eventpoll对象,初始化等待队列、红黑树、就绪队列,并将其地址挂入file->private_data*/
error = ep_file_init(file);
if (error)
goto eexit_2;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
current, size, fd));
return fd;//返回对应的文件描述符,在调用epoll_wait、epoll_ctrl作为参数传入
eexit_2:
sys_close(fd);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
current, size, error));
return error;
}
sys_epoll_create=>ep_getfd:(这个函数大部分与文件系统有关,暂不列出)
继续往下看:
sys_epoll_create=>ep_file_init
static int ep_file_init(struct file *file)
{
struct eventpoll *ep;
if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
return -ENOMEM;
memset(ep, 0, sizeof(*ep));
rwlock_init(&ep->lock);
init_rwsem(&ep->sem);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
ep->rbr = RB_ROOT;
file->private_data = ep;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
current, ep));
return 0;
}
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
根据上面的函数原型来看sys_epoll_ctl的实现:第一个参数epfd为sys_epoll_create返回的fd,通过它可以找到对应的struct eventpoll;第二个参数op表示操作类型(增、删、改);第三个参数fd为要监听的目标文件描述符;第四个参数event描述感兴趣的事件(可读、可写之类):
sys_epoll_ctl:
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set. It represents
* the kernel part of the user space epoll_ctl(2).
*/
asmlinkage long
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
int error;
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
current, epfd, op, fd, event));
/*
 * Copy the user's event description into epds, but only for operations
 * for which EP_OP_HASH_EVENT(op) is true; a failed copy yields -EFAULT.
 */
error = -EFAULT;
if (EP_OP_HASH_EVENT(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto eexit_1;
/* Get the "struct file *" for the eventpoll file */
error = -EBADF;
file = fget(epfd);// file structure behind the epoll descriptor (it carries the struct eventpoll)
if (!file)
goto eexit_1;
/* Get the "struct file *" for the target file */
tfile = fget(fd);// file structure of the descriptor to be monitored
if (!tfile)
goto eexit_2;
/* The target file descriptor must support poll */
error = -EPERM;
if (!tfile->f_op || !tfile->f_op->poll)// the target file provides no poll operation: fail with -EPERM
goto eexit_3;
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
if (file == tfile || !IS_FILE_EPOLL(file))
goto eexit_3;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = file->private_data;// the struct eventpoll stored by ep_file_init()
down_write(&ep->sem);
/* Try to lookup the file inside our hash table */
epi = ep_find(ep, tfile, fd);// look up an existing struct epitem for (tfile, fd) first
/* An op value matching none of the cases below leaves error at -EINVAL */
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:// add: only valid when no item exists yet
if (!epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST;
break;
case EPOLL_CTL_DEL:// delete: the item must already exist
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
case EPOLL_CTL_MOD:// modify: the item must already exist
if (epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
} else
error = -ENOENT;
break;
}
/*
* The function ep_find() increments the usage count of the structure
* so, if this is not NULL, we need to release it.
*/
if (epi)
ep_release_epitem(epi);
up_write(&ep->sem);
/* Unwind in reverse order of acquisition: target file first, then the epoll file */
eexit_3:
fput(tfile);
eexit_2:
fput(file);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
current, epfd, op, fd, event, error));
return error;
}
我们以添加为例,并且是TCP连接:
sys_epoll_ctl=>ep_insert
/*
 * Create a new epitem for (tfile, fd) inside the eventpoll "ep".
 *
 * The item is initialized, hooked onto the target file's wait queue(s) by
 * calling the file's poll operation with a poll_table whose callback is
 * ep_ptable_queue_proc, linked into the file's f_ep_links list and inserted
 * into ep's rb-tree. If the file already reports one of the requested
 * events, the item is also queued on ep's ready list and waiters are woken.
 *
 * Returns 0 on success, or -ENOMEM when the item cannot be allocated or the
 * wait queue hook-up failed (epi->nwait < 0).
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
struct file *tfile, int fd)
{
int error, revents, pwake = 0;
unsigned long flags;
struct epitem *epi;
struct ep_pqueue epq;
error = -ENOMEM;
if (!(epi = EPI_MEM_ALLOC()))
goto eexit_1;
/* Item initialization follow here ... */
EP_RB_INITNODE(&epi->rbn);// initialize the rb-tree node
INIT_LIST_HEAD(&epi->rdllink);// initialize the ready-list link
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->txlink);
INIT_LIST_HEAD(&epi->pwqlist);// initialize the list of this item's poll wait entries
epi->ep = ep;// important: remember the owning struct eventpoll, needed later to reach its ready list
EP_SET_FFD(&epi->ffd, tfile, fd);// record (tfile, fd); presumably the rb-tree key - EP_SET_FFD is not shown here
epi->event = *event;// the events the caller is interested in
atomic_set(&epi->usecnt, 1);// usage count starts at 1
epi->nwait = 0;
/* Initialize the poll table using the queue callback */
epq.epi = epi;// important: ep_ptable_queue_proc() recovers this epi from the poll_table and stores it in eppoll_entry.base
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);// install ep_ptable_queue_proc as the poll_table's queueing callback
/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function.
*/
revents = tfile->f_op->poll(tfile, &epq.pt);// the file's poll runs the callback, which hooks a wait entry onto the file's wait queue (sk->sk_sleep for a TCP socket)
/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
if (epi->nwait < 0)
goto eexit_2;
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_ep_lock);
list_add_tail(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_ep_lock);
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irqsave(&ep->lock, flags);
/* Add the current item to the rb-tree */
ep_rbtree_insert(ep, epi);// insert into the rb-tree
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
current, ep, tfile, fd));
return 0;
/* Failure after the poll hook-up: undo the wait queue registration and free the item */
eexit_2:
ep_unregister_pollwait(ep, epi);
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue.
*/
write_lock_irqsave(&ep->lock, flags);
if (EP_IS_LINKED(&epi->rdllink))
EP_LIST_DEL(&epi->rdllink);
write_unlock_irqrestore(&ep->lock, flags);
EPI_MEM_FREE(epi);
eexit_1:
return error;
}
sys_epoll_ctl=>ep_insert=>init_poll_funcptr
/*
 * Install qproc as the queueing callback of the poll_table pt; poll_wait()
 * later invokes it through pt->qproc.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
pt->qproc = qproc;// for epoll, qproc is ep_ptable_queue_proc (passed in by ep_insert)
}
sys_epoll_ctl=>ep_insert=>sock_poll
/*
 * Poll entry point for socket files. No kernel lock is held here.
 *
 * Resolves the socket that sits behind the file's inode and hands the
 * request to that socket's own poll operation. Errors cannot be returned
 * through poll, so the answer is only ever "yes or no" in the form of an
 * event mask.
 */
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
	struct socket *sock = SOCKET_I(file->f_dentry->d_inode);

	return sock->ops->poll(file, sock, wait);
}
sys_epoll_ctl=>ep_insert=>sock_poll=>tcp_poll
/*
* Wait for a TCP event.
*
* Note that we don't need to lock the socket, as the upper poll layers
* take care of normal races (between the test and the event) and we don't
* go look at any of the socket buffers directly.
*/
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
unsigned int mask;
struct sock *sk = sock->sk;
struct tcp_sock *tp = tcp_sk(sk);
poll_wait(file, sk->sk_sleep, wait);// hand the socket's wait queue head (sk->sk_sleep) to the poll_table's callback
/* NOTE: the code that computes "mask" is elided in this excerpt */
......
return mask;
}
sys_epoll_ctl=>ep_insert=>sock_poll=>tcp_poll=>poll_wait
/*
 * Pass a wait queue head on to the queueing callback stored in the
 * poll_table. Nothing happens when either the poll_table or the wait
 * queue head is NULL.
 */
static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
	if (!p || !wait_address)
		return;

	/* On the epoll_ctl path shown here, qproc is ep_ptable_queue_proc */
	p->qproc(filp, wait_address, p);
}
sys_epoll_ctl=>ep_insert=>sock_poll=>tcp_poll=>poll_wait=>ep_ptable_queue_proc
/*
 * Queueing callback of the epoll poll_table: adds one of our wait queue
 * entries to the wakeup list ("whead") of the target file.
 *
 * On success the new entry is also linked into the item's pwqlist and
 * epi->nwait is incremented. If an earlier call already failed
 * (epi->nwait < 0) or the allocation fails, epi->nwait is set to -1 so the
 * caller can detect the error.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt);
	struct eppoll_entry *pwq = NULL;

	/* Only attempt the allocation while no failure has been recorded */
	if (epi->nwait >= 0)
		pwq = PWQ_MEM_ALLOC();

	if (!pwq) {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
		return;
	}

	/*
	 * Key point: the wait entry carries ep_poll_callback as its function,
	 * so a wakeup on whead (sk->sk_sleep for a socket) ends up calling
	 * ep_poll_callback for this item.
	 */
	init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
	pwq->whead = whead;
	/* Remember which epitem this wait entry belongs to */
	pwq->base = epi;
	/* Hook the entry onto the target's wait queue */
	add_wait_queue(whead, &pwq->wait);
	list_add_tail(&pwq->llink, &epi->pwqlist);
	epi->nwait++;
}
继续回到ep_insert函数中,里面调用ep_rbtree_insert将epi插入ep的红黑树中,一些其他处理完成后,返回。
接着看epoll_wait的实现:
int epoll_wait(int epfd, struct epoll_event *events,
int maxevents, int timeout);
对应内核的函数为:
/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 *
 * epfd      - descriptor of the eventpoll file
 * events    - user space buffer that receives the ready events
 * maxevents - capacity of "events", in entries; must be greater than zero
 *             and small enough that the buffer size in bytes fits in an int
 * timeout   - timeout in milliseconds, handed on to ep_poll()
 *
 * Returns whatever ep_poll() returns on success, or a negative error code:
 * -EINVAL for a bad maxevents or a non-eventpoll epfd, -EBADF for an invalid
 * epfd, or the verify_area() error for an unwritable buffer.
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
	int error;
	struct file *file;
	struct eventpoll *ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
		     current, epfd, events, maxevents, timeout));

	/*
	 * The maximum number of event must be greater than zero. It must
	 * also be bounded from above: otherwise the size computation
	 * "maxevents * sizeof(struct epoll_event)" below can wrap around
	 * where size_t is 32 bits wide, and verify_area() would then check
	 * a much smaller area than the one later written to.
	 * (~0U >> 1) is the largest positive int value.
	 */
	if (maxevents <= 0 ||
	    maxevents > (int)((~0U >> 1) / sizeof(struct epoll_event)))
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if ((error = verify_area(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))))
		goto eexit_1;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto eexit_1;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!IS_FILE_EPOLL(file))
		goto eexit_2;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/* Time to fish for events ... (ep_poll does the actual work) */
	error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
	fput(file);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
		     current, epfd, events, maxevents, timeout, error));

	return error;
}
sys_epoll_wait=>ep_poll
/*
 * Wait for events on the eventpoll "ep" and transfer them to user space.
 *
 * If the ready list is empty the caller sleeps on ep->wq until the ready
 * list becomes non-empty, the timeout runs out, or a signal is pending.
 * Afterwards, if items are available, ep_events_transfer() is called; when
 * it delivers 0 events and there is timeout left, the whole wait is retried.
 *
 * Returns the result of ep_events_transfer(), 0 when nothing was available,
 * or -EINTR when interrupted by a signal.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res, eavail;
unsigned long flags;
long jtimeout;
wait_queue_t wait;
/*
* Calculate the timeout by checking for the "infinite" value ( -1 )
* and the overflow condition. The passed timeout is in milliseconds,
* that why (t * HZ) / 1000.
*/
jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;// validate the timeout and convert milliseconds to jiffies (rounded up)
retry:
write_lock_irqsave(&ep->lock, flags);
res = 0;
if (list_empty(&ep->rdllist)) {// the ready list is empty
/*
* We don't have any available event to return to the caller.
* We need to sleep here, and we will be wake up by
* ep_poll_callback() when events will become available.
*/
init_waitqueue_entry(&wait, current);
add_wait_queue(&ep->wq, &wait);// queue ourselves on ep->wq
for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
* to TASK_INTERRUPTIBLE before doing the checks.
*/
set_current_state(TASK_INTERRUPTIBLE);// mark the task as sleeping interruptibly
if (!list_empty(&ep->rdllist) || !jtimeout)// stop waiting once the ready list has items or the timeout has run out
break;
if (signal_pending(current)) {// a signal is pending: give up with -EINTR
res = -EINTR;
break;
}
write_unlock_irqrestore(&ep->lock, flags);
jtimeout = schedule_timeout(jtimeout);// schedule away; the return value is the remaining time
write_lock_irqsave(&ep->lock, flags);
}
remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);// back to the running state
}
/* Is it worth to try to dig for events ? */
eavail = !list_empty(&ep->rdllist);
write_unlock_irqrestore(&ep->lock, flags);
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
if (!res && eavail &&
!(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)// the ready list is not empty: deliver the events to the user space caller
goto retry;
return res;
}
sys_epoll_wait=>ep_poll=>ep_events_transfer
/*
 * Move ready events to user space.
 *
 * Ready items are first collected from the eventpoll onto a private
 * transfer list, then sent to the user buffer, and finally handed to
 * ep_reinject_items(). The whole sequence runs with ep->sem held for
 * reading.
 *
 * Returns the value produced by ep_send_events(), or 0 when no item was
 * collected.
 */
static int ep_events_transfer(struct eventpoll *ep,
			      struct epoll_event __user *events, int maxevents)
{
	struct list_head txlist;
	int collected;
	int sent = 0;

	INIT_LIST_HEAD(&txlist);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
	 */
	down_read(&ep->sem);

	/* Collect/extract ready items: rdllist entries go onto txlist */
	collected = ep_collect_ready_items(ep, &txlist, maxevents);
	if (collected > 0) {
		/* Build result set in userspace from the transfer list */
		sent = ep_send_events(ep, &txlist, events);

		/* Reinject ready items into the ready list */
		ep_reinject_items(ep, &txlist);
	}

	up_read(&ep->sem);

	return sent;
}
sys_epoll_wait=>ep_poll=>ep_events_transfer=>ep_collect_ready_items
/*
 * Since the lock has to be released during the __copy_to_user() operation
 * and during the f_op->poll() call, collect as many ready items as possible
 * in one go to keep the irqlock/irqunlock switching rate low.
 *
 * Walks ep->rdllist under ep->lock and moves at most "maxevents" items onto
 * "txlist". Items already linked on a transfer list are skipped and stay on
 * the ready list.
 *
 * Returns the number of items moved.
 */
static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
{
	struct list_head *head = &ep->rdllist;
	struct list_head *pos;
	struct epitem *epi;
	unsigned long flags;
	int moved = 0;

	write_lock_irqsave(&ep->lock, flags);

	pos = head->next;
	while (pos != head && moved < maxevents) {
		epi = list_entry(pos, struct epitem, rdllink);

		/* Advance first: the current item may be unlinked below */
		pos = pos->next;

		/* If this file is already in the ready list we exit soon */
		if (EP_IS_LINKED(&epi->txlink))
			continue;

		/*
		 * This is initialized in this way so that the default
		 * behaviour of the reinjecting code will be to push back
		 * the item inside the ready list.
		 */
		epi->revents = epi->event.events;

		/* Link the ready item into the transfer list */
		list_add(&epi->txlink, txlist);
		moved++;

		/* Unlink the item from the ready list */
		EP_LIST_DEL(&epi->rdllink);
	}

	write_unlock_irqrestore(&ep->lock, flags);

	return moved;
}
sys_epoll_wait=>ep_poll=>ep_events_transfer=>ep_send_events
/*
 * Copy the events of the items on "txlist" into the user buffer "events".
 *
 * Called without holding "ep->lock" since the call to __copy_to_user()
 * might sleep, and also f_op->poll() might reenable the IRQ because of the
 * way poll() is traditionally implemented in Linux.
 *
 * Returns the number of events stored, or -EFAULT when writing to the user
 * buffer fails.
 */
static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
			  struct epoll_event __user *events)
{
	struct list_head *pos;
	struct epitem *epi;
	struct file *tfile;
	unsigned int revents;
	int nsent = 0;

	/*
	 * We can loop without lock because this is a task private list.
	 * The test done during the collection loop will guarantee us that
	 * another task will not try to collect this file. Also, items
	 * cannot vanish during the loop because we are holding "sem".
	 */
	list_for_each(pos, txlist) {
		epi = list_entry(pos, struct epitem, txlink);
		tfile = epi->ffd.file;

		/*
		 * Get the ready file event set. We can safely use the file
		 * because we are holding the "sem" in read and this will
		 * guarantee that both the file and the item will not vanish.
		 */
		revents = tfile->f_op->poll(tfile, NULL);

		/*
		 * Set the return event set for the current file descriptor.
		 * Only the task that managed to link the item to its
		 * "txlist" writes this field.
		 */
		epi->revents = revents & epi->event.events;

		/* Nothing the caller asked for is pending on this item */
		if (!epi->revents)
			continue;

		/* Store the event mask and the user data in the next slot */
		if (__put_user(epi->revents,
			       &events[nsent].events) ||
		    __put_user(epi->event.data,
			       &events[nsent].data))
			return -EFAULT;

		/* One-shot items keep only the private bits after delivery */
		if (epi->event.events & EPOLLONESHOT)
			epi->event.events &= EP_PRIVATE_BITS;

		nsent++;
	}

	return nsent;
}
到这里,整个过程基本走完了。还有一点没有讲到:socket有数据过来时,是怎么触发ep_poll_callback执行的。这个要从创建socket开始看:sys_socket经过一系列操作会执行到sock_init_data函数,其中有一行代码非常重要:
sk->sk_data_ready = sock_def_readable;
我们来看下sock_def_readable做了什么:
static void sock_def_readable(struct sock *sk, int len)
{
read_lock(&sk->sk_callback_lock);
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible(sk->sk_sleep);//这里会执行epoll的回调函数ep_poll_callback
sk_wake_async(sk,1,POLL_IN);
read_unlock(&sk->sk_callback_lock);
}
到这里,所有的就讲完了。