Linux epoll source code analysis

This article is based on the kernel 5.5 source and analyzes how epoll is implemented in the kernel; the main source file is fs/eventpoll.c.

Contents

1. Key data structures

2. epoll initialization

3. epoll_create

4. epoll_ctl

5. epoll_wait


1. Key data structures

There are two key structures worth paying attention to. What their members are for should be clear enough from the code, so here is only a brief overview.

struct eventpoll and struct epitem are the two structures to keep in mind. An eventpoll is created per epoll_create() call (normally one per process); it contains an rdllist that holds the epitems which currently have events ready, and each epitem stores the information about one watched descriptor: its fd, file, events and so on.

eventpoll keeps all of its epitems in a red-black tree, so lookup, insertion and deletion have a worst-case time complexity of O(log n).
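For reference, here is a trimmed-down sketch of the two structures as they appear in kernel 5.5 fs/eventpoll.c (most fields and the original comments are omitted; see the source for the full definitions):

struct eventpoll {
	struct mutex mtx;            /* serializes modifications to the tree and items */
	wait_queue_head_t wq;        /* processes sleeping in epoll_wait() */
	wait_queue_head_t poll_wait; /* used when the epoll fd itself is polled */
	struct list_head rdllist;    /* ready list: epitems that have pending events */
	rwlock_t lock;               /* protects rdllist and ovflist */
	struct rb_root_cached rbr;   /* red-black tree of all watched epitems */
	struct epitem *ovflist;      /* overflow list used while rdllist is being scanned */
	struct user_struct *user;    /* used for the per-user max_user_watches limit */
	struct file *file;           /* the anonymous file backing the epoll fd */
};

struct epitem {
	struct rb_node rbn;          /* node in eventpoll.rbr */
	struct list_head rdllink;    /* node in eventpoll.rdllist when the item is ready */
	struct epoll_filefd ffd;     /* the watched (file, fd) pair; the red-black tree key */
	struct list_head pwqlist;    /* eppoll_entry hooks registered on the target's wait queues */
	struct eventpoll *ep;        /* the containing eventpoll */
	struct epoll_event event;    /* events requested by user space plus the user data */
};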

2. epoll initialization

Source location: fs/eventpoll.c

It all starts from here:
fs_initcall(eventpoll_init);


static int __init eventpoll_init(void)
{

	struct sysinfo si;

	si_meminfo(&si); // get system memory info
	/*
	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
	 */
	// cap on the number of fds that can be watched (per user, not per process)
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;

// EP_ITEM_COST is used because each registered fd consumes one struct epitem
// and one struct eppoll_entry

	BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loops checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls); // initialize the nested-calls list

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	// create the slab caches used to allocate struct epitem and struct eppoll_entry
    
	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	return 0;
}
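As an aside, EP_ITEM_COST is defined near the top of fs/eventpoll.c; to the best of my reading of the 5.5 source it is simply the memory cost of one watch:

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))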

That completes the initialization. Next, epoll_create.
 

3. epoll_create

SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	return do_epoll_create(0);
}

SYSCALL_DEFINE1 defines the epoll_create system call; it only does a compatibility check on size,
and the real work happens in do_epoll_create. Let's keep going.


/*
 * Open an eventpoll file descriptor.
 */
static int do_epoll_create(int flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep); // create an eventpoll for this call (typically one per process, since a process usually calls epoll_create only once)
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	// get an unused file descriptor
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	// create an anonymous file instance and stash ep in its private_data, so the eventpoll can be
	// retrieved from this file later; also register the file operations eventpoll_fops for it
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	// store the file in ep so the file can also be reached from the eventpoll;
	// at this point the eventpoll and its file reference each other: after epoll_create a process has one eventpoll, one fd and one file
	ep->file = file;

	// hook the fd up with the file in the process's fd table (I haven't studied the VFS side in detail, but roughly it means
	// the fd can then be used to operate on the file; e.g. an llseek on this fd ends up in the registered eventpoll_fops::noop_llseek)
	fd_install(fd, file);
	return fd; // return the fd to the user process

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}

While we are at it, let's see what creating the eventpoll involves:
 

static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user(); // get the current user
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL); // allocate memory
	if (unlikely(!ep))
		goto free_uid;

	// initialize the locks
	mutex_init(&ep->mtx);
	rwlock_init(&ep->lock);
	// initialize the wait queues
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);

	// initialize the ready list
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT_CACHED; // initialize the red-black tree that stores the watched fds
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user; // store the current user in the eventpoll

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}


That's epoll_create done. Next comes epoll_ctl; we will follow the main path of adding a descriptor,
i.e. the behavior of the EPOLL_CTL_ADD option.

4. epoll_ctl

Likewise, epoll_ctl is defined as a system call with SYSCALL_DEFINE:

SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;
	struct eventpoll *tep = NULL;

	error = -EFAULT;
	// copy in the event from user space, i.e. what to watch on the fd (read, write, etc.)
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	error = -EBADF;
	f = fdget(epfd); // use the fd created in epoll_create (returned to user space and now passed back down) to get its file
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tf = fdget(fd);  // get the file of the target fd (the socket)
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	// if the target file does not implement poll, return an error (epoll needs to call that poll later, as we will see below and in the epoll_wait analysis)
	if (!file_can_poll(tf.file))
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(&epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	// the epoll fd itself was passed in as the target; that is not allowed
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

    //......

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data; // take out the eventpoll created earlier in epoll_create,
   // i.e. user space passes epfd down, the fd gives us the file, and the file gives us the eventpoll stored in it at creation time

    //......

	/*
	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd); // look up the epitem for (file, fd) in this eventpoll's red-black tree; on the first EPOLL_CTL_ADD it is normally not there, so NULL

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds.events |= EPOLLERR | EPOLLHUP;
			// create an epitem holding the target's fd, file and the eventpoll, then insert it into the red-black tree
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;

        //......
	
    }

    //......
	return error;
}

So when epoll_ctl adds a watched descriptor, its main job is to create an epitem that stores the target's fd and file plus the process's eventpoll.
(As for the eventpoll pointer, I wonder whether every epitem really needs to keep one: with many fds on a 64-bit platform that is an extra 8 bytes of kernel
memory per fd, and a process generally creates only one eventpoll anyway; the eventpoll pointer is already stored in the file behind the epfd (put there
by epoll_create), so when it is needed it could simply be fetched via the epfd's file and its file->private_data.)


Next, let's see how the epitem is created and inserted into the red-black tree, and what is done along the way.

static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, pwake = 0;
	__poll_t revents;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	lockdep_assert_irqs_enabled(); // lockdep assertion that IRQs are enabled (compiles to nothing unless lockdep is configured)

	// get the number of fds this user is already watching
	user_watches = atomic_long_read(&ep->user->epoll_watches);
	// already at the maximum number of watches
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	// allocate an epitem from the slab cache created at init time
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	// initialize the epitem
	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep; // store the eventpoll
	ep_set_ffd(&epi->ffd, tfile, fd); // store the target's fd and file
	epi->event = *event; // store the requested events
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	// register the callback: store ep_ptable_queue_proc in epq.pt._qproc
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt, 1); // call the target's poll; we look at the socket side below (it ends up calling back into ep_ptable_queue_proc())

    //......

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	// add epi->fllink to the tail of tfile->f_ep_links, the target file's list of watching epitems
	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	// insert the newly created epitem into the red-black tree created by epoll_create (one per eventpoll)
	ep_rbtree_insert(ep, epi);

    //......

	/* If the file is already "ready" we drop it inside the ready list */
	// if the target already has events pending, there is no need to sleep and wait:
	// put it straight onto the eventpoll's ready list and wake up any epoll_wait waiters via wake_up(&ep->wq)
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */ // TODO: what exactly this does is covered later
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);//TODO
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irq(&ep->lock);

    // increment the number of fds this user is watching
	atomic_long_inc(&ep->user->epoll_watches);


	return error;
}

This function mainly creates an epitem, calls the socket's poll interface (which calls back into ep_ptable_queue_proc()),
fills in the eventpoll and the target's fd and file, and inserts the epitem into the red-black tree.
If the watched fd already has events pending, its epitem is put straight onto the eventpoll's ready list and any waiters on ep->wq are woken up.

Next we look at ep_item_poll() and ep_ptable_queue_proc(); ep_poll_callback() is left for later, when we analyze how the
socket side notifies eventpoll once data arrives.
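To make the callback plumbing easier to follow, here is a sketch of the small helper types involved (abridged from kernel 5.5 include/linux/poll.h and fs/eventpoll.c):

/* include/linux/poll.h (abridged) */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

typedef struct poll_table_struct {
	poll_queue_proc _qproc; /* ep_insert() points this at ep_ptable_queue_proc */
	__poll_t _key;          /* the events the caller is interested in */
} poll_table;

/* fs/eventpoll.c (abridged): wrapper used while queueing the poll hooks */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;     /* lets ep_ptable_queue_proc recover the epitem from the poll_table */
};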

First, ep_item_poll():

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct eventpoll *ep;
	bool locked;

	pt->_key = epi->event.events; // events requested by user space (read, write, etc.)
	if (!is_file_epoll(epi->ffd.file)) // this is the branch taken here (the target is not an epoll file)
		return vfs_poll(epi->ffd.file, pt) & epi->event.events; // go into vfs_poll()

	//......
}

Now look at vfs_poll(), in include/linux/poll.h:
 

static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
        if (unlikely(!file->f_op->poll))
                return DEFAULT_POLLMASK;
        return file->f_op->poll(file, pt); // call the file's own poll, here the socket's; next we follow the socket side, taking TCP as the example (others are similar)
}

This lands in the socket layer's poll, in net/socket.c:

// remember that wait carries eventpoll's callback ep_ptable_queue_proc()

static __poll_t sock_poll(struct file *file, poll_table *wait)
{
	struct socket *sock = file->private_data;

    //......

	return sock->ops->poll(file, sock, wait) | flag;
}

This in turn calls the TCP-side poll, tcp_poll(), so let's follow that; the source is in net/ipv4/tcp.c:
 

__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	sock_poll_wait(file, sock, wait); // the interesting part is this call; let's follow it

    //......

	return mask;
}

static inline void sock_poll_wait(struct file *filp, struct socket *sock,
				  poll_table *p)
{
	if (!poll_does_not_wait(p)) {
        // note sock->wq.wait: it will matter later when events arrive
		poll_wait(filp, &sock->wq.wait, p); // follow this further
        //......
	}
}

Source location: include/linux/poll.h
 

static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (p && p->_qproc && wait_address)
		p->_qproc(filp, wait_address, p); // this calls back into eventpoll's ep_ptable_queue_proc(),
        // passing in the socket's file and the socket's wait queue head
}

Now back in eventpoll.c, let's look at ep_ptable_queue_proc():
 

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, // whead comes from the socket side
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	// create an eppoll_entry for this watched fd as well
	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		// register the wakeup callback: ep_poll_callback will do the waking instead of default_wake_function()
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); // hook ep_poll_callback into eppoll_entry::wait
		// when the socket has data it wakes whead via __wake_up();
		// for our pwq->wait entry that means ep_poll_callback is called, which in turn wakes the user process blocked in epoll_wait
		pwq->whead = whead; // remember the socket's wait queue head
		pwq->base = epi;
		if (epi->event.events & EPOLLEXCLUSIVE)
			add_wait_queue_exclusive(whead, &pwq->wait);
		else
			add_wait_queue(whead, &pwq->wait); // add the eppoll_entry's wait queue entry onto the socket's wait queue
		list_add_tail(&pwq->llink, &epi->pwqlist); // add the eppoll_entry to the epitem's pwqlist so it can be found again later
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}

The main work here is to create an eppoll_entry, install ep_poll_callback() in it, and register it on the socket's event wait queue;
when the socket later has events it wakes up every entry on that queue.
The eppoll_entry is also added to the epitem; see the sketch below (TODO: exactly why it is kept there isn't entirely clear to me yet, to be revisited).
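For reference, this is roughly what that glue structure looks like in kernel 5.5 (abridged; the comments are my own reading of it):

struct eppoll_entry {
	struct list_head llink;   /* links this entry into epitem->pwqlist, so it can be found
	                           * and unhooked again when the fd is removed from epoll */
	struct epitem *base;      /* back-pointer to the owning epitem (used by ep_poll_callback) */
	wait_queue_entry_t wait;  /* the entry added to the target's wait queue; its wake function is ep_poll_callback */
	wait_queue_head_t *whead; /* the wait queue head (the socket's) that wait was added to */
};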

That completes epoll_ctl. Next, epoll_wait.
 

5. epoll_wait

Again, epoll_wait is defined as a system call with SYSCALL_DEFINE:
 

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	return do_epoll_wait(epfd, events, maxevents, timeout);
}

// next, follow do_epoll_wait()
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, int timeout)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	//......

	/* Get the "struct file *" for the eventpoll file */
	f = fdget(epfd); // get the eventpoll's file object from epfd
	if (!f.file)
		return -EBADF;

	//......
	
	ep = f.file->private_data;
	// take the eventpoll out of file->private_data; remember, it was stored there back in epoll_create


	/* Time to fish for events ... */
	// follow this
	error = ep_poll(ep, events, maxevents, timeout);
	//......
}

Now let's look at ep_poll():
 

static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	u64 slack = 0;
	bool waiter = false;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;


	// if a timeout was given, convert it
	if (timeout > 0) {
		struct timespec64 end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec64_to_ktime(end_time);
	} else if (timeout == 0) {

		timed_out = 1;

		write_lock_irq(&ep->lock);
		eavail = ep_events_available(ep); // check whether events are already pending: rdllist being non-empty means there are events
		write_unlock_irq(&ep->lock);

		goto send_events;
		// with timeout == 0 we jump straight to send_events and return whatever is already ready to user space;
		// only when no events are ready and timeout remains does the code go (back) to fetch_events and schedule itself out
	}

fetch_events:

	//......

	/*
	 * We don't have any available event to return to the caller.  We need
	 * to sleep here, and we will be woken by ep_poll_callback() when events
	 * become available.
	 */
	// no events available: schedule ourselves out and sleep for a while; when events arrive the socket side calls back ep_poll_callback(), which wakes this process
	if (!waiter) {
		waiter = true;
		init_waitqueue_entry(&wait, current); // put the current task into the wait entry

		spin_lock_irq(&ep->wq.lock);
		__add_wait_queue_exclusive(&ep->wq, &wait); // add the wait entry to the eventpoll's wait queue ep->wq
		spin_unlock_irq(&ep->wq.lock);
	}

	for (;;) {

		set_current_state(TASK_INTERRUPTIBLE); // mark the task TASK_INTERRUPTIBLE so it can be woken

		//.......

		eavail = ep_events_available(ep);
		if (eavail) // if rdllist is non-empty now there is no need to sleep; break out
			break;
		//.......

		// schedule ourselves out here; after waking up we come back to the top of the loop and break out at the if (eavail) check above
		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
			timed_out = 1; // on timeout, break out directly
			break;
		}
	}

	__set_current_state(TASK_RUNNING); // after waking up, put ourselves back in the running state

send_events:
	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out) // this hands the events and fds to user space
		goto fetch_events;

	if (waiter) {
		spin_lock_irq(&ep->wq.lock);
		__remove_wait_queue(&ep->wq, &wait); // remove wait from ep->wq
		spin_unlock_irq(&ep->wq.lock);
	}

	return res;
}

What this does, in short: first check whether any events are already ready (i.e. whether rdllist is non-empty); if so, return the
events and fds to user space directly; if not, schedule ourselves out and wait to be woken by the socket side calling back
ep_poll_callback() when events arrive.

Before we look at how ep_send_events() hands events to user space, let's first see what the socket side does when events occur:
how it calls back into ep_poll_callback(), and what ep_poll_callback() does; then we will come back to ep_send_events().
On the socket side we will take TCP over IPv4 as the example.

For a TCP socket, when data arrives at the NIC an interrupt fires; after link-layer and IP header processing the packet reaches TCP,
and once the TCP header has been processed sock_def_readable() is called. Let's look at that function directly.
// source location: net/core/sock.c

static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	
	// here it wakes up every entry on this socket's wait queue, i.e. whoever is watching the socket;
	// for epoll these are the entries added earlier in epoll_ctl when the descriptor was registered
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

wake_up_interruptible_sync_poll() is ultimately implemented by __wake_up_common() in kernel/sched/wait.c.
If an entry on the queue has its own wake function, that function is called first and does the actual waking;
if not, the default default_wake_function() from kernel/sched/core.c is used.
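Roughly, the core loop of __wake_up_common() looks like this (abridged and simplified; bookmark and error handling omitted):

	/* walk the wait queue and invoke each entry's wake function */
	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
		unsigned flags = curr->flags;
		/* ep_poll_callback for epoll entries, default_wake_function otherwise */
		int ret = curr->func(curr, mode, wake_flags, key);
		if (ret < 0)
			break;
		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}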

eventpoll installs its own wake function, the ep_poll_callback() we have mentioned so many times. Let's now look at its implementation.

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	struct epitem *epi = ep_item_from_wait(wait); // get the epitem back from the wait entry
	struct eventpoll *ep = epi->ep; // get the eventpoll from the epitem
	__poll_t pollflags = key_to_poll(key);
	unsigned long flags;
	int ewake = 0;

	 //.......

	/* If this file is already in the ready list we exit soon */
	// add epitem::rdllink to the ready list rdllist
	if (!ep_is_linked(epi) &&
	    list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
		ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */

	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
					!(pollflags & POLLFREE)) {
			switch (pollflags & EPOLLINOUT_BITS) {
			case EPOLLIN:
				if (epi->event.events & EPOLLIN)
					ewake = 1;
				break;
			case EPOLLOUT:
				if (epi->event.events & EPOLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
		wake_up(&ep->wq); // wake up the user process sleeping in epoll_wait
	}

	//......
	return ewake;
}

This function does two main things: it adds the socket's epitem to the eventpoll's rdllist, and if the corresponding
user process is still sleeping it wakes it via wake_up().
At this point eventpoll::rdllist holds the socket's event information and the user process (strictly speaking, still in kernel mode) has been woken up.

We can now return to ep_poll(), whose next step is ep_send_events().
Let's continue:

static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct ep_send_events_data esed;

	esed.maxevents = maxevents;
	esed.events = events; // save the address of the user-space epoll_event array

	ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); // follow this
	return esed.res;
}
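The small carrier structure it fills in looks roughly like this (from fs/eventpoll.c in 5.5):

struct ep_send_events_data {
	int maxevents;                     /* capacity of the user-space buffer */
	struct epoll_event __user *events; /* user-space array to fill */
	int res;                           /* number of events delivered, or a negative error */
};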

Next, ep_scan_ready_list():
 

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
 *                      the scan code, to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
				  // priv carries the user-supplied events pointer
{
	__poll_t res;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	//......

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	// splice rdllist onto txlist and re-initialize rdllist as an empty list
	write_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, &txlist);
	WRITE_ONCE(ep->ovflist, NULL);
	write_unlock_irq(&ep->lock);

	/*
	 * Now call the callback function.
	 */
	res = (*sproc)(ep, &txlist, priv); // the key call: here this is ep_send_events_proc(ep, &txlist, priv)

	//......

	return res;
}

Now ep_send_events_proc():
 

static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv; // cast priv back to the ep_send_events_data, which carries the user-space epoll_event address
	__poll_t revents;
	struct epitem *epi, *tmp;
	struct epoll_event __user *uevent = esed->events; // get the user-space epoll_event pointer; the rest of the function revolves around this
	struct wakeup_source *ws;
	poll_table pt;

	//.......

	// walk the whole ready list
	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (esed->res >= esed->maxevents)
			break;

	    //.......

		// copy the events and user data into the user-space epoll_event
		if (__put_user(revents, &uevent->events) ||
		    __put_user(epi->event.data, &uevent->data)) {
			list_add(&epi->rdllink, head);
			ep_pm_stay_awake(epi);
			if (!esed->res)
				esed->res = -EFAULT;
			return 0;
		}
		//......
	}

	return 0;
}

The main job here is to walk the ready list that was passed in (not the current eventpoll::rdllist, which is already empty; it was
spliced into txlist inside ep_scan_ready_list() and handed down), copy the events and fds into the user-space epoll_event array, and return.

At this point control goes back to user space (epoll_wait returns) and the application layer takes over.
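To tie the whole path together, here is a minimal user-space sketch (my own example, not from the kernel source) showing which kernel path each call exercises; it watches stdin purely for illustration:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	/* do_epoll_create(): allocates the eventpoll, the anonymous file and the epfd */
	int epfd = epoll_create1(0);
	if (epfd < 0) {
		perror("epoll_create1");
		return 1;
	}

	struct epoll_event ev;
	memset(&ev, 0, sizeof(ev));
	ev.events = EPOLLIN;
	ev.data.fd = STDIN_FILENO;

	/* EPOLL_CTL_ADD -> ep_insert(): creates the epitem, hooks ep_poll_callback
	 * onto the target's wait queue and inserts the epitem into the red-black tree */
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) < 0) {
		perror("epoll_ctl");
		return 1;
	}

	struct epoll_event ready[8];
	/* ep_poll(): sleeps on ep->wq until ep_poll_callback wakes us up,
	 * then ep_send_events() copies the ready events back out to us */
	int n = epoll_wait(epfd, ready, 8, 5000 /* ms */);
	for (int i = 0; i < n; i++)
		printf("fd %d is readable\n", ready[i].data.fd);

	close(epfd);
	return 0;
}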


Reposted from blog.csdn.net/goodnight1994/article/details/105536300