Linux epoll source code analysis

The source code in this article is from kernel version 5.5. It analyzes how epoll is implemented in the kernel; most of the relevant code is in fs/eventpoll.c.

Table of contents

1. Key structures

2. epoll initialization

3. epoll_create

4. epoll_ctl

5. epoll_wait


1. Key structures

There are two key structures to pay attention to: struct eventpoll and struct epitem. What their members are used for should become clear while reading the code, so they are only described briefly here.

A process gets one eventpoll when it calls epoll_create(). The eventpoll contains a ready list (rdllist) that holds epitems, and each epitem stores the information about one monitored descriptor: its fd, its struct file, the requested events, and so on.

eventpoll stores its epitems in a red-black tree, so the worst-case time complexity of lookup, insertion, and deletion is O(log n).
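For orientation, here is an abridged sketch of the two structures as they appear in fs/eventpoll.c around 5.5; several members (lock/RCU details, wakeup sources, busy-poll fields, etc.) are left out, so treat it as a reading aid rather than the full definition.

struct eventpoll {
	struct mutex mtx;            // serializes epoll_ctl() and the ready-list scan
	wait_queue_head_t wq;        // processes sleeping in epoll_wait()
	wait_queue_head_t poll_wait; // used when the epoll fd is itself being polled
	struct list_head rdllist;    // ready list: epitems that have pending events
	rwlock_t lock;               // protects rdllist and ovflist
	struct rb_root_cached rbr;   // red-black tree of all watched epitems
	struct epitem *ovflist;      // overflow list used while rdllist is being scanned
	struct user_struct *user;    // used for the per-user max_user_watches accounting
	struct file *file;           // the anonymous "[eventpoll]" file
	//......
};

struct epitem {
	struct rb_node rbn;          // node in eventpoll::rbr
	struct list_head rdllink;    // node in eventpoll::rdllist when the fd is ready
	struct epitem *next;         // used by the ovflist mechanism
	struct epoll_filefd ffd;     // the watched fd and its struct file
	int nwait;                   // number of wait queues attached via poll
	struct list_head pwqlist;    // list of eppoll_entry wait-queue hooks
	struct eventpoll *ep;        // back pointer to the owning eventpoll
	struct list_head fllink;     // node in the watched file's f_ep_links list
	struct epoll_event event;    // the events/data the user asked for
	//......
};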

 

2. epoll initialization

Source code location: fs/eventpoll.c

epoll is initialized at boot time from this initcall:
fs_initcall(eventpoll_init);


static int __init eventpoll_init(void)
{

	struct sysinfo si;

	si_meminfo(&si); // get system memory information
	/*
	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
	 */
	// set the maximum number of fds each user can watch
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;

// EP_ITEM_COST exists because every registered fd consumes one struct epitem
// and one struct eppoll_entry

	BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loops checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls); // initialize the nested-calls list

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	// create the slab caches used to allocate struct epitem and struct eppoll_entry
    
	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	return 0;
}
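For reference, EP_ITEM_COST is defined near the top of fs/eventpoll.c as the combined size of the two structures that every watch allocates:

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

So the formula above takes roughly 4% of low memory ((totalram - totalhigh) / 25, converted to bytes via PAGE_SHIFT) and divides it by the per-watch cost. The resulting limit is exposed, and can be adjusted, through /proc/sys/fs/epoll/max_user_watches.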

Okay, the initialization is complete, let’s look at epoll_create
 

3. epoll_create

SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	return do_epoll_create(0);
}

The system call interface epoll_create is defined with SYSCALL_DEFINE1. It only performs a backward-compatibility check on size (the value must be positive but is otherwise ignored) and delegates the real work to do_epoll_create().
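As an aside, the sibling syscall epoll_create1 is the one that actually passes flags (e.g. EPOLL_CLOEXEC) through; in fs/eventpoll.c it is essentially just the following (quoted from memory, so take the exact form as approximate):

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	return do_epoll_create(flags);
}

Now let's look at do_epoll_create().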


/*
 * Open an eventpoll file descriptor.
 */
static int do_epoll_create(int flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep); // create an eventpoll for this call (generally one per process, since a process usually calls epoll_create only once)
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	// get an unused file descriptor
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	// create an anonymous file instance and stash ep in its private data, so the corresponding
	// eventpoll can later be fetched back through the file; also register eventpoll_fops as its file operations
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	// store the file in ep as well, so the file can also be reached from the eventpoll;
	// at this point the eventpoll and its file refer to each other: after epoll_create a process owns one eventpoll, one fd, and one file
	ep->file = file;

	// hook the fd up with the file; I haven't studied the VFS side of the source yet, but roughly this means the file
	// can afterwards be operated on through this fd, e.g. calling llseek on the fd ends up in the eventpoll_fops::noop_llseek registered above
	fd_install(fd, file);
	return fd; // return the fd to the user process

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}
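Since the file is created with anon_inode_getfile("[eventpoll]", ...), the descriptor shows up as an anonymous inode named [eventpoll]. A small illustrative user-space program (not part of the original analysis) makes this visible:

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	char path[64], target[256];
	int epfd = epoll_create1(0);   /* ends up in do_epoll_create(0) */
	ssize_t n;

	if (epfd < 0) {
		perror("epoll_create1");
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/self/fd/%d", epfd);
	n = readlink(path, target, sizeof(target) - 1);
	if (n > 0) {
		target[n] = '\0';
		printf("%s -> %s\n", path, target); /* typically "anon_inode:[eventpoll]" */
	}
	close(epfd);
	return 0;
}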

Here is what ep_alloc() does when creating the eventpoll.
 

static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user(); // get the current user's info
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL); // allocate memory
	if (unlikely(!ep))
		goto free_uid;

	// initialize the locks
	mutex_init(&ep->mtx);
	rwlock_init(&ep->lock);
	// initialize the wait queues
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);

	// initialize the ready list
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT_CACHED; // initialize the red-black tree used to store info about the watched fds
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user; // store the current user's info in the eventpoll

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}


Okay, now that we have finished epoll_create, let's look at epoll_ctl. Here we follow the main path of adding a descriptor, i.e. the behavior of the EPOLL_CTL_ADD operation.

4. epoll_ctl

Similarly, epoll_ctl is defined as a system call with SYSCALL_DEFINE,

SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;
	struct eventpoll *tep = NULL;

	error = -EFAULT;
	// copy in the epoll_event passed from user space, i.e. which events to watch the fd for (read, write, etc.)
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	error = -EBADF;
	f = fdget(epfd); // use the fd created in epoll_create (returned to user space and now passed back down) to get its corresponding file
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tf = fdget(fd);  // get the file corresponding to the target socket
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	// if the target file does not implement the poll interface, return right away (epoll relies on calling that poll later, as we will see further on)
	if (!file_can_poll(tf.file))
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(&epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	// the epoll fd itself was passed in as the target; that is not allowed
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

    //......

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data; // take out the eventpoll created earlier by epoll_create,
   // i.e. user space passes epfd down, epfd gives us the file, and the file gives us the eventpoll stored in it at creation time

    //......

	/*
	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd); // look up the epitem for this fd/file in the red-black tree; on the first EPOLL_CTL_ADD it is normally not there, so this is NULL

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds.events |= EPOLLERR | EPOLLHUP;
		// create an epitem storing the socket's fd and file plus the eventpoll, then insert it into the red-black tree
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;

        //......
	
    }

    //......
	return error;
}

So when epoll_ctl adds a descriptor to watch, the main job is to create an epitem, which stores the socket's fd and file plus a pointer to the process's eventpoll. (I wonder whether each epitem really needs to keep its own eventpoll pointer: with many fds on a 64-bit platform, each one costs an extra 8 bytes of kernel memory, which is not nothing. After all, a process generally creates only one eventpoll, and its pointer is already installed in the file behind epfd (set up by epoll_create), so whenever it is needed it could be fetched through epfd and then file->private_data.)
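From user space, the path above is driven by a call like the following (an illustrative helper; listen_fd stands for some already-created socket):

#include <stdio.h>
#include <sys/epoll.h>

/* Register listen_fd for readability events on epfd. */
static int add_read_watch(int epfd, int listen_fd)
{
	struct epoll_event ev = { 0 };

	ev.events = EPOLLIN;     /* copied into epds by copy_from_user() above */
	ev.data.fd = listen_fd;  /* opaque to the kernel, handed back by epoll_wait */

	if (epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev) < 0) {
		perror("epoll_ctl(EPOLL_CTL_ADD)");
		return -1;
	}
	return 0;
}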


Next, let's look at how the epitem is created and inserted into the red-black tree, and what else is done along the way, in ep_insert().

static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, pwake = 0;
	__poll_t revents;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	lockdep_assert_irqs_enabled(); // debug-only assertion that interrupts are enabled; a no-op on most builds

	// get the number of fds this user is already watching
	user_watches = atomic_long_read(&ep->user->epoll_watches);
	// the maximum number of watchable fds has been exceeded
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
		// allocate an epitem, using memory from the slab cache set up at init time
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	// initialize the epitem
	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep; // store the eventpoll
	ep_set_ffd(&epi->ffd, tfile, fd); // store the socket's fd and file
	epi->event = *event; // store the requested event types
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	// register the callback here: ep_ptable_queue_proc is stored into epq.pt._qproc
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt, 1); // this calls the socket side's poll; we will look at the socket's behavior below (it ends up calling back ep_ptable_queue_proc())

    //......

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	// add epi->fllink to tfile->f_ep_links, i.e. to the tail of the f_ep_links list of the socket's file
	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	// insert the newly created epitem into the red-black tree created at epoll_create time (one per eventpoll)
	ep_rbtree_insert(ep, epi);

    //......

	/* If the file is already "ready" we drop it inside the ready list */
	// if the socket already has pending events, there is no need to sleep and wait:
	// put it straight onto the eventpoll's ready list, then wake up any waiters via wake_up(&ep->wq)
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */ // TODO: check later what exactly this does
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);//TODO
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irq(&ep->lock);

    // increment the count of watched fds by 1
	atomic_long_inc(&ep->user->epoll_watches);


	return 0;
}

This function mainly creates an epitem, stores the eventpoll pointer plus the socket's fd and file in it, and inserts it into the red-black tree. It also calls the socket's poll interface, and the socket side calls back into ep_ptable_queue_proc(). If the monitored fd already has a pending event, its information is put straight onto the eventpoll's ready list and any waiters are woken up.

Now let's look at ep_item_poll() and ep_ptable_queue_proc(). ep_poll_callback() will be analyzed later, when we look at how the socket reports newly arrived data and notifies the eventpoll.

Let’s look at ep_item_poll() first 

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct eventpoll *ep;
	bool locked;

	pt->_key = epi->event.events; // the event mask requested from user space (read/write/etc.)
	if (!is_file_epoll(epi->ffd.file)) // the watched file is not itself an epoll fd, so we take this branch
		return vfs_poll(epi->ffd.file, pt) & epi->event.events; // go into vfs_poll()

	//...... (the rest handles the case of an epoll fd nested inside another epoll)
}

Take a look at vfs_poll(), in include/linux/poll.h
 

static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
        if (unlikely(!file->f_op->poll))
                return DEFAULT_POLLMASK;
        return file->f_op->poll(file, pt); // call the poll of the underlying file, here the socket's; next we look at the socket's poll, taking TCP as the example (others are similar)
}

Next comes the poll side of the socket, which is in net/socket.c

// Remember that wait contains the eventpoll callback function ep_ptable_queue_proc()

static __poll_t sock_poll(struct file *file, poll_table *wait)
{
	struct socket *sock = file->private_data;

    //......

	return sock->ops->poll(file, sock, wait) | flag;
}

This in turn calls the poll on the TCP side, i.e. tcp_poll(). Let's continue reading. The source code is in net/ipv4/tcp.c.
 

__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	sock_poll_wait(file, sock, wait); // this is the important part here; let's follow it in

    //......

	return mask;
}

static inline void sock_poll_wait(struct file *filp, struct socket *sock,
				  poll_table *p)
{
	if (!poll_does_not_wait(p)) {
        // note sock->wq.wait: this wait queue is what will be used later when an event arrives
		poll_wait(filp, &sock->wq.wait, p); // follow this further in
        //......
	}
}

Source code location: include/linux/poll.h
 

static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (p && p->_qproc && wait_address)
		p->_qproc(filp, wait_address, p); // this calls back into eventpoll's ep_ptable_queue_proc(),
        // passing the socket's file and the socket's wait queue to ep_ptable_queue_proc()
}

Okay, let’s go back to eventpoll.c and look at ep_ptable_queue_proc()
 

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, // whead is passed in from the socket side
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	// an eppoll_entry is also created here for this socket fd
	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		// register the wake-up callback here: ep_poll_callback will do the waking
		// instead of the default default_wake_function()
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); // mount ep_poll_callback into eppoll_entry::wait;
		// when the socket has data, it wakes the whole whead queue via __wake_up(),
		// and for our pwq->wait entry that means ep_poll_callback is called, which in turn wakes the user process blocked in epoll_wait
		pwq->whead = whead; // remember the socket's wait queue,
		pwq->base = epi;
		if (epi->event.events & EPOLLEXCLUSIVE)
			add_wait_queue_exclusive(whead, &pwq->wait);
		else
			add_wait_queue(whead, &pwq->wait); // add the eppoll_entry's wait queue node onto the socket's wait queue
		list_add_tail(&pwq->llink, &epi->pwqlist); // add the eppoll_entry to the epitem's pwqlist, so it can be found again later through this list
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}

 

The main work here is to create an eppoll_entry, install the ep_poll_callback() function in it, and register it on the socket's event wait queue; when an event later occurs on the socket, every node on that queue is woken up. The eppoll_entry is also linked into the epitem's pwqlist; this appears to be so that the wait-queue hooks can be found and unregistered again later (for example when the fd is removed from the epoll set or the epitem is freed).
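To recap the registration path we just walked as a call chain (TCP case):

/*
 * epoll_ctl(EPOLL_CTL_ADD)
 *   -> ep_insert()
 *        -> ep_item_poll()
 *             -> vfs_poll()              i.e. file->f_op->poll
 *                  -> sock_poll()
 *                       -> tcp_poll()
 *                            -> sock_poll_wait()
 *                                 -> poll_wait()
 *                                      -> ep_ptable_queue_proc()  hangs ep_poll_callback
 *                                                                 on the socket's wait queue
 */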

Okay, now, the analysis of epoll_ctl is finished, let’s look at epoll_wait next
 

5. epoll_wait

Similarly, epoll_wait is defined as a system call using SYSCALL_DEFINE.
 

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	return do_epoll_wait(epfd, events, maxevents, timeout);
}

//next, look at do_epoll_wait()
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, int timeout)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	//......

	/* Get the "struct file *" for the eventpoll file */
	f = fdget(epfd); // use epfd to get the eventpoll's file object
	if (!f.file)
		return -EBADF;

	//......
	
	ep = f.file->private_data;
	// take the eventpoll back out of file->private_data; remember, it was stored there in epoll_create


	/* Time to fish for events ... */
	// follow this in
	error = ep_poll(ep, events, maxevents, timeout);
	//......
}

Okay, let’s look at ep_poll() next.
 

static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	u64 slack = 0;
	bool waiter = false;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;


	// if a timeout was given, convert it
	if (timeout > 0) {
		struct timespec64 end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec64_to_ktime(end_time);
	} else if (timeout == 0) {

		timed_out = 1;

		write_lock_irq(&ep->lock);
		eavail = ep_events_available(ep); // check whether events are already ready: if eventpoll::rdllist is not empty, there are events
		write_unlock_irq(&ep->lock);

		goto send_events;
		// jump straight to send_events to check whether events are already ready; if so, they are returned to user space right away,
		// and only when there are no events (and no timeout has occurred) do we come back to fetch_events and schedule ourselves out
	}

fetch_events:

	//......

	/*
	 * We don't have any available event to return to the caller.  We need
	 * to sleep here, and we will be woken by ep_poll_callback() when events
	 * become available.
	 */
	// no events: we need to schedule ourselves out and sleep for a while; when an event arrives, the socket side calls back ep_poll_callback(), which wakes this process up
	if (!waiter) {
		waiter = true;
		init_waitqueue_entry(&wait, current); // put the current process into the wait entry

		spin_lock_irq(&ep->wq.lock);
		__add_wait_queue_exclusive(&ep->wq, &wait); // add the wait entry onto the wait queue ep->wq
		spin_unlock_irq(&ep->wq.lock);
	}

	for (;;) {

		set_current_state(TASK_INTERRUPTIBLE); // set the process to TASK_INTERRUPTIBLE so it can be woken up

		//.......

		eavail = ep_events_available(ep);
		if (eavail) // if rdllist is not empty by now, there is no need to sleep; break out
			break;
		//.......

		// schedule ourselves out here; after waking up we come back to the top of the for loop and break out at the if (eavail) check above
		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
			timed_out = 1; // if the timeout expired, break out directly
			break;
		}
	}

	__set_current_state(TASK_RUNNING); // after waking up, set ourselves back to the running state

send_events:
	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out) // here the events and fds are handed to user space and we return
		goto fetch_events;

	if (waiter) {
		spin_lock_irq(&ep->wq.lock);
		__remove_wait_queue(&ep->wq, &wait); // take wait back out of ep->wq
		spin_unlock_irq(&ep->wq.lock);
	}

	return res;
}

 

The main thing done here is to first check whether any events are ready (i.e. whether rdllist is non-empty). If there are, the events and fds are returned to user space directly; if not, the process schedules itself out and sleeps until the socket side wakes it up through the callback ep_poll_callback(). (With a negative timeout, to stays NULL and the process simply sleeps until it is woken.)

Before we look at how ep_send_events() hands events to user space, let's first see what happens on the socket side when an event occurs: how ep_poll_callback() gets called and what it does. After that we will come back to ep_send_events(). For the socket we pick TCP over IPv4.

For a TCP socket, when data arrives from the network card an interrupt is triggered; after the link-layer and IP-layer headers have been processed the packet reaches TCP, and once the TCP header has been handled, sock_def_readable() is called. Let's look at that function directly.
// source code location: net/core/sock.c

static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	
	// here, every node on this socket's wait queue is woken up, i.e. the processes watching this socket;
	// for epoll these are the entries added earlier when epoll_ctl registered the descriptor to watch
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

wake_up_interruptible_sync_poll() is ultimately implemented by __wake_up_common() in kernel/sched/wait.c. If a node on the queue has installed its own wake-up function, that function is called first and does the actual waking; otherwise the default default_wake_function() from kernel/sched/core.c is used.
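The core loop of __wake_up_common() looks roughly like this (abridged from kernel/sched/wait.c and quoted from memory; bookmark handling and other details are elided):

static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
			int nr_exclusive, int wake_flags, void *key,
			wait_queue_entry_t *bookmark)
{
	wait_queue_entry_t *curr, *next;

	//......

	list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
		unsigned flags = curr->flags;
		int ret;

		//......

		ret = curr->func(curr, mode, wake_flags, key); // for our entry this is ep_poll_callback()
		if (ret < 0)
			break;
		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;

		//......
	}

	return nr_exclusive;
}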

For eventpoll, the installed wake function is ep_poll_callback(), which we have mentioned many times already. Let's take a look at its implementation.

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	struct epitem *epi = ep_item_from_wait(wait); // get the epitem corresponding to this wait entry
	struct eventpoll *ep = epi->ep; // get the eventpoll back out of the epitem
	__poll_t pollflags = key_to_poll(key);
	unsigned long flags;
	int ewake = 0;

	 //.......

	/* If this file is already in the ready list we exit soon */
	// add epitem::rdllink onto the rdllist here
	if (!ep_is_linked(epi) &&
	    list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
		ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */

	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
					!(pollflags & POLLFREE)) {
			switch (pollflags & EPOLLINOUT_BITS) {
			case EPOLLIN:
				if (epi->event.events & EPOLLIN)
					ewake = 1;
				break;
			case EPOLLOUT:
				if (epi->event.events & EPOLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
		wake_up(&ep->wq); // wake up the user process here
	}

	//......
	return ewake;
}

 

This function mainly does two things: it adds the epitem corresponding to the socket to the eventpoll's rdllist, and if the corresponding user process is still sleeping at this point, it wakes it up through wake_up(). So now eventpoll::rdllist holds the socket's event information, and the user process (strictly speaking, still in kernel mode at this point) has been woken up.
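The event-delivery path, as a call chain:

/*
 * data arrives -> TCP processing -> sock_def_readable()
 *   -> wake_up_interruptible_sync_poll()
 *        -> __wake_up_common()
 *             -> ep_poll_callback()      puts the epitem on ep->rdllist
 *                  -> wake_up(&ep->wq)   wakes the process sleeping in ep_poll()
 *                       -> ep_poll() -> ep_send_events() copies events to user space
 */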

Now we can go back to ep_poll() and continue with ep_send_events().

static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct ep_send_events_data esed;

	esed.maxevents = maxevents;
	esed.events = events; // save the address of the epoll_event array passed in from user space

	ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); // follow this in
	return esed.res;
}

Next look at ep_scan_ready_list()
 

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
 *                      the scan code, to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
				  // priv holds the events pointer passed in by the user
{
	__poll_t res;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	//......

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	// splice rdllist onto txlist and re-initialize rdllist as an empty list
	write_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, &txlist);
	WRITE_ONCE(ep->ovflist, NULL);
	write_unlock_irq(&ep->lock);

	/*
	 * Now call the callback function.
	 */
	res = (*sproc)(ep, &txlist, priv); // okay, this is the main part next, i.e. ep_send_events_proc(ep, &txlist, priv)

	//......

	return res;
}

 

Take a look at ep_send_events_proc()
 

static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv; // cast the pointer back to ep_send_events_data, which holds the user-space epoll_event address
	__poll_t revents;
	struct epitem *epi, *tmp;
	struct epoll_event __user *uevent = esed->events; // the user-space epoll_event address; the work below revolves around this
	struct wakeup_source *ws;
	poll_table pt;

	//.......

	// scan the whole ready list passed in as head
	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (esed->res >= esed->maxevents)
			break;

	    //.......

		// copy the events and the user's data into the user-space epoll_event
		if (__put_user(revents, &uevent->events) ||
		    __put_user(epi->event.data, &uevent->data)) {
			list_add(&epi->rdllink, head);
			ep_pm_stay_awake(epi);
			if (!esed->res)
				esed->res = -EFAULT;
			return 0;
		}
		//......
	}

	return 0;
}

The main thing here is to scan the ready list (not eventpoll::rdllist at this point, since that list has already been emptied; ep_scan_ready_list() spliced it onto txlist and passed that in), copy the events and user data into the user-space epoll_event array, and return.

Okay, at this point we return to user space (epoll_wait returns) and let the application layer handle the events.
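To close the loop, here is a minimal user-space sketch of the whole path we just walked through (illustrative only: it watches stdin for readability, with error handling kept short):

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	struct epoll_event ev, events[8];
	int epfd, n, i;

	epfd = epoll_create1(0);                   /* do_epoll_create(): eventpoll + anon file + fd */
	if (epfd < 0) {
		perror("epoll_create1");
		return 1;
	}

	ev.events = EPOLLIN;
	ev.data.fd = STDIN_FILENO;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) < 0) { /* ep_insert(): epitem + wait-queue hook */
		perror("epoll_ctl");
		return 1;
	}

	for (;;) {
		n = epoll_wait(epfd, events, 8, 5000); /* ep_poll(): sleep until ep_poll_callback() wakes us */
		if (n < 0) {
			perror("epoll_wait");
			break;
		}
		if (n == 0) {
			printf("timeout, no events\n");
			continue;
		}
		for (i = 0; i < n; i++)                /* entries filled in by ep_send_events_proc() */
			printf("fd %d is ready (events=0x%x)\n",
			       events[i].data.fd, events[i].events);
	}

	close(epfd);
	return 0;
}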

Origin blog.csdn.net/goodnight1994/article/details/105536300