深入epoll技术原理分析

阅读之前请先了解Linux内核的wakeup&callback机制以及前文的select与poll技术分析

epoll技术

为了解决select&poll技术存在的两个性能问题,对于大内存数据拷贝问题,epoll通过epoll_create函数创建epoll空间(相当于一个容器管理),在内核中只存储一份数据来维护N个socket事件的变化,通过epoll_ctl函数来实现对socket事件的增删改操作,并且在内核底层使用虚拟内存的管理方式保证用户空间与内核空间对该内存是具备可见性,直接通过指针引用的方式进行操作,避免了大内存数据的拷贝导致的空间切换性能问题,对于轮询等待事件通过epoll_wait的方式来实现对socket事件的监听,将不断轮询等待高频事件wait与低频socket注册事件两个操作分离开,同时会对监听就绪的socket事件添加到就绪队列中,也就保证唤醒轮询的事件都是具备可读的,现对epoll技术分析如下:

epoll技术定义

// Create an epoll instance (the kernel-side container that holds the registered
// file descriptors) and return a file descriptor referring to it.
// Registered items are kept in a red-black tree (see ep_find / ep_rbtree_insert).
int epoll_create(int size);    // legacy form: size is only a hint, ignored since Linux 2.6.8, but must be > 0
int epoll_create1(int flags);  // current form: flags is 0 or EPOLL_CLOEXEC

// Add, modify or remove a file descriptor in an epoll instance
long epoll_ctl(int epfd,                        // fd that refers to the epoll instance created above
               int op,                         // operation selector: EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
               int fd,                          // the fd being registered / modified / removed
               struct epoll_event *event);      // the events to monitor for that fd
// Describes what to monitor for one fd, and what is reported back for it.
struct epoll_event {
	// Bit mask of EPOLL* flags (e.g. EPOLLWAKEUP, EPOLLEXCLUSIVE, EPOLLET, EPOLLONESHOT are tested against it below)
	__poll_t events;
	// Caller-supplied value; copied back to user space unchanged together with the ready events
	__u64 data;
} EPOLL_PACKED;

// Wait for events on an epoll instance; the blocking model matches select/poll.
// Returns the number of ready events written to `events`, 0 on timeout,
// or -1 on error.
int epoll_wait(int epfd,                        // fd that refers to the epoll instance
               struct epoll_event *events,      // output buffer that receives the ready events
               int maxevents,                   // capacity of the events buffer (must be > 0)
               int timeout);                    // timeout in milliseconds; -1 blocks indefinitely

epoll技术实现细节

epoll_ctl函数处理socket描述符fd注册问题,关注epoll_ctl的ADD方法

// Abridged excerpt: only the core of do_epoll_ctl is shown
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
		 bool nonblock)
{
	// ... (argument validation, file lookup and locking elided)

	// Look up the epitem for this (file, fd) pair in the red-black tree;
	// an epitem becomes a tree node when it is added
	epi = ep_find(ep, tf.file, fd);
	
	// Dispatch on the requested operation.
	// NOTE(review): the original article states that the ADD path runs under the
	// mtx lock; the locking itself is not part of this excerpt.
	switch (op) {
	case EPOLL_CTL_ADD:
	    // Register the fd
		// epds->events |= EPOLLERR | EPOLLHUP;
		// error = ep_insert(ep, epds, tf.file, fd, full_check);
		break;
	case EPOLL_CTL_DEL:
	    // Delete: remove the epitem from the container that stores it
		break;
	// Modify an already-registered fd; per the article this is refused for EPOLLEXCLUSIVE items
	case EPOLL_CTL_MOD:
	    // Modify: update the events monitored for the existing epitem
            //error = ep_modify(ep, epi, epds);			
		break;
	}
	
	// Resource-release logic (elided)
}

EPOLL_CTL_ADD核心代码逻辑

// EPOLL_CTL_ADD path (abridged): build an epitem for the new fd, hook it onto the
// target file's wait queue, insert it into the red-black tree, and wake waiters
// if the fd is already ready.
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
    // ... (allocation of epi and other setup elided)
	struct epitem *epi;
	struct ep_pqueue epq;
	
	// Fill in the epitem: back-pointer to the owning eventpoll, the (file, fd)
	// pair, a copy of the requested events, and an empty wait count
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	
	// Only when the caller asked for EPOLLWAKEUP is a wakeup source created;
	// otherwise the epitem's ws pointer is initialised to NULL
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	// Bind this epitem to an on-stack ep_pqueue and install ep_ptable_queue_proc
	// as the poll table's queue callback.
	// Each added fd gets its own epitem and ep_pqueue, and each ep_pqueue is tied to that one callback.
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

       // Poll the item once with the poll table prepared above. The queue callback
       // is invoked from inside this call (via poll_wait), and the return value is
       // the subset of requested events that are ready right now.
	revents = ep_item_poll(epi, &epq.pt, 1);

       // Insert the epitem into the red-black tree
	ep_rbtree_insert(ep, epi);

	// If events are already pending and the item is not yet on a ready list,
	// append it to ep->rdllist and wake up whoever is waiting on the eventpoll
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

    // .... (elided)

	// A wakeup of ep->poll_wait was deferred above (pwake != 0): perform it now
	if (pwake)
	    ep_poll_safewake(&ep->poll_wait);

	return 0;

// goto statement code ... (error-unwinding labels elided)
}

上述的代码中存在两个核心逻辑(注册&唤醒逻辑)

/*
 * Queue-registration callback, installed through init_poll_funcptr().
 *
 * Allocates an eppoll_entry, points it at the owning epitem and at the wait
 * queue head it is being hooked onto, registers ep_poll_callback as its wakeup
 * function, and links it into the epitem's pwqlist. EPOLLEXCLUSIVE items are
 * queued with add_wait_queue_exclusive(), all others with add_wait_queue().
 *
 * On allocation failure, or if an earlier registration already failed
 * (nwait < 0), the epitem's nwait is set to -1 to flag the error.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *item = ep_item_from_epqueue(pt);
	struct eppoll_entry *entry = NULL;

	/* Do not even attempt the allocation once an error has been recorded. */
	if (item->nwait >= 0)
		entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);

	if (!entry) {
		/* We have to signal that an error occurred */
		item->nwait = -1;
		return;
	}

	init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
	entry->whead = whead;
	entry->base = item;

	if (item->event.events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(whead, &entry->wait);
	else
		add_wait_queue(whead, &entry->wait);

	list_add_tail(&entry->llink, &item->pwqlist);
	item->nwait++;
}

唤醒逻辑

/*
 * Poll a single epitem and return the ready events, restricted to the events
 * the item was registered for.
 *
 * For an ordinary file this is just vfs_poll(). When the monitored file is
 * itself an epoll file, the nested eventpoll's ready list is scanned instead
 * via ep_scan_ready_list() with ep_read_events_proc as the callback; `depth`
 * is forwarded both as the callback's private data and as the nesting depth.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct file *target = epi->ffd.file;
	struct eventpoll *nested;
	__poll_t ready;
	bool ep_locked;

	pt->_key = epi->event.events;

	if (is_file_epoll(target)) {
		nested = target->private_data;
		poll_wait(target, &nested->poll_wait, pt);
		/* True when pt carries epoll's own queue callback. */
		ep_locked = pt && (pt->_qproc == ep_ptable_queue_proc);
		ready = ep_scan_ready_list(nested, ep_read_events_proc,
					   &depth, depth, ep_locked);
	} else {
		ready = vfs_poll(target, pt);
	}

	return ready & epi->event.events;
}

// Fragment from inside poll_wait():
// it invokes the queue callback stored in the poll table, which for epoll is ep_ptable_queue_proc
p->_qproc(filp, wait_address, p);



/*
 * ep_read_events_proc: scan callback that reports whether an eventpoll is
 * readable. `priv` points to the current nesting depth (int). Returns
 * EPOLLIN | EPOLLRDNORM as soon as one item on `head` still has events
 * pending; items that turn out not to be ready are dropped from the list.
 */
static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct epitem *cur, *next;
	poll_table pt;
	int depth = *(int *)priv + 1;

	init_poll_funcptr(&pt, NULL);

	list_for_each_entry_safe(cur, next, head, rdllink) {
		if (ep_item_poll(cur, &pt, depth))
			return EPOLLIN | EPOLLRDNORM;

		/*
		 * Item has been dropped into the ready list by the poll
		 * callback, but it's not actually ready, as far as
		 * caller requested events goes. We can remove it here.
		 */
		__pm_relax(ep_wakeup_source(cur));
		list_del_init(&cur->rdllink);
	}

	return 0;
}

/*
 * Ready-list scan helper.
 *
 * Detaches ep->rdllist into a private list, runs `sproc` on that private list,
 * then merges back both the untouched leftovers and any items that were queued
 * on ep->ovflist while the callback was running.
 *
 * @ep:        the eventpoll whose ready list is scanned
 * @sproc:     callback invoked as sproc(ep, &txlist, priv)
 * @priv:      opaque pointer handed through to sproc
 * @depth:     lock nesting level passed to mutex_lock_nested()
 * @ep_locked: when true, ep->mtx is neither taken nor released here
 *
 * Returns whatever sproc returned.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
{
	__poll_t res;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	lockdep_assert_irqs_enabled();

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */

	if (!ep_locked)
		mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	write_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, &txlist);
	WRITE_ONCE(ep->ovflist, NULL);
	write_unlock_irq(&ep->lock);

	/*
	 * Now call the callback function.
	 */
	res = (*sproc)(ep, &txlist, priv);

	write_lock_irq(&ep->lock);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(epi)) {
			/*
			 * ->ovflist is LIFO, so we have to reverse it in order
			 * to keep in FIFO.
			 */
			list_add(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);
	__pm_relax(ep->ws);
	write_unlock_irq(&ep->lock);

	if (!ep_locked)
		mutex_unlock(&ep->mtx);

	return res;
}

在上述的epoll_ctl技术代码实现的细节中存在着两个逻辑,即socket描述符注册与唤醒逻辑,主要体现在两个核心方法上,即ep_ptable_queue_proc & ep_item_poll对此分析如下：

注册逻辑：
- 在epoll空间中创建一个epitem的中间层,初始化一系列epitem的属性，同时将新增加的socket描述符包装到epitem下的epoll_filefd中，同时添加唤醒任务wakeup，同时将epitem的内部ep容器指向epoll空间
- 其次在进行item事件的轮询中，通过队列回调的方式将epitem绑定到队列节点entry上,并在entry节点上绑定epoll的回调函数来唤醒业务处理
- 最后是将epitem插入以epoll空间为根节点的红黑树中，后续内核可以通过fd查找到对应的epitem，通过epitem也就可以找到epoll空间引用
唤醒逻辑：
- 在item事件轮询中，通过轮询检测epoll空间中的等待队列是否有对应的节点entry可读，如果有退出循环，并且从当前注册的epitem开始轮询遍历查询就绪的entry节点并将就绪entry节点的socket描述符添加到ready_list上
- 其次在上述注册的逻辑之后，会检查当前的epitem的ready list节点,如果存在ready_list,会将epoll空间的等待队列唤醒,让执行处理的read_process添加到就绪队列中，让cpu能够进行调度
epoll_wait等待逻辑

// Call chain: epoll_wait -> do_epoll_wait -> ep_poll. Only the core of ep_poll is shown (abridged).
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	// ... (timeout setup elided)
	fetch_events: // wait until the eventpoll has ready events
		// ...
		for (;;) {
		// ...
		// Check whether this eventpoll currently has any ready events
		eavail = ep_events_available(ep);
		if (eavail)
			// Yes: leave the wait loop
			break;
		// A pending signal interrupts the wait with -EINTR
		if (signal_pending(current)) {
			res = -EINTR;
			break;
		}
		// Sleep; a zero return from schedule_hrtimeout_range() is treated as timer expiry
		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
			timed_out = 1;
			break;
		}
	}
	// ...
	send_events: // events are ready: deliver them to the user-space buffer
		//...
		ep_send_events(ep, events, maxevents);
	
}

从上述可以看出等待处理逻辑主要有fetch_events以及send_events,现分析如下：
- 循环检查当前epoll空间是否有就绪事件，如果有将跳出循环，如果没有将执行schedule的方法进入休眠等待再次轮询，原理与select/poll一致
- 其次当有就绪事件的时候,循环遍历将监听变化的事件拷贝到用户空间中,并且会将就绪事件socket添加到epitem的就绪队列ready_list上

最后基于上述的分析做一个分析小结

解决大内存且频繁copy问题
- 首先,epoll通过epoll_create创建epoll空间,并在创建空间的同时将epoll空间拷贝到内存中,此后epoll对socket描述符的注册监听通过epoll空间来进行操作,仅一次拷贝
- 其次,epoll注册将拆分为ADD/MOD/DEL三个操作,分别只对相应的操作进行处理,大大降低频繁调用的次数,相比select/poll机制，由原先高频率的注册等待转换为高频等待，低频注册的处理逻辑
- 接着,还有一点就是每次注册都通过建立一个epitem结构体对socket相关的fd以及file进行封装，并且epitem的ep容器通过指针引用指向epoll空间,即每次新增加一个socket描述符的时候不是操作整个fdset,而是通过单个epitem进行操作，相比fdset较为轻量级
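- 最后,epoll在内核中通过虚拟内存方式将内核空间与用户空间的一块地址同时映射到相同的物理内存地址中，这块内存对用户空间以及内核空间均为可见，因此可以减少用户空间与内核空间之间的数据交换(注:这一说法有待核实——从下文"水平触发与边缘触发代码"中的__put_user调用可以看到,就绪事件实际上是由内核逐个拷贝到用户空间缓冲区的,epoll减少拷贝的关键在于只拷贝就绪事件,而不是每次传递全部fd集合)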
解决只对就绪队列进行唤醒循环遍历
- 首先，我们可以看到在注册的过程中，epoll通过epitem将socket描述符存储到epoll_file中，同时将唤醒逻辑read_process也绑定到epitem，这样当处于唤醒状态就会被触发执行，然后再以当前epitem存储到队列entry节点上，并为entry节点绑定回调函数，最后将entry节点添加到ep的等待队列上
- 其次，在进行wait等待过程中，内核在执行file.poll()后会将等待队列上的节点添加到轮询等待中poll wait，处于半唤醒状态，也就是当前是就绪状态但还没唤醒，同时会将唤醒的socket描述符添加到epoll空间的ready list中
- 接着，每当有一个item被唤醒的时候就会退出上述的轮询遍历并持续设置当前的item处于唤醒状态，然后epoll空间开始遍历item（单链表存储）并执行回调函数通知，如果item为就绪状态，就将epoll空间的readylist拷贝到当前唤醒节点的epitem的ready list中
- 最后，会更新监听变化的事件状态，返回到用户进程，用户进程这个时候获取到ready list中的描述符均为可就绪状态
epoll其他技术
- epoll支持并发执行，上述的休眠与唤醒逻辑都有加锁操作
- 其次对于就绪状态的ready_list的操作属于无锁操作，因此为了保证并发执行的安全性，epoll使用的加锁方式是全局锁

边缘触发与水平触发

边缘触发与水平触发定义

水平触发
- socket接收数据的缓冲区不为空的时候，则一直触发读事件，相当于"不断地询问数据是否可读"
- socket发送数据的缓冲区不全满的时候，则一直触发写事件，相当于"不断地询问是否有区域可以让数据写入"
  本质上就是一个不断进行交流的过程，如下图所示：
边缘触发
- socket接收数据的缓冲区发生变化，则触发读取事件，也就是当空的接收数据的socket缓冲区这个时候有数据传送过来的时候触发
- socket发送数据的缓冲区发生变化，则触发写入事件，也就是当满的发送数据的socket缓冲区这个时候刚刷新数据初期的时候触发
  本质上就是socket缓冲区变化而触发，如下图所示：
上述的触发事件会调用epoll_wait方法，也就是
- 水平触发会多次调用epoll_wait
- 边缘触发在socket缓冲区中不发生改变那么就不会调用epoll_wait的方式

水平触发与边缘触发代码

// Delivery loop over the ready list (fragment).
// Level-triggered is the default behaviour (EPOLLET not set); EPOLLET selects
// edge-triggered. EPOLLONESHOT is a separate flag: after one report the item's
// event mask is reduced to EP_PRIVATE_BITS.
list_for_each_entry_safe(epi, tmp, head, rdllink) {
		// Stop once the caller's buffer (maxevents entries) is full
		if (esed->res >= esed->maxevents)
			break;
        
        // Wakeup-source bookkeeping: if the item's source is active, keep the
        // eventpoll's own source awake, then relax the item's source
		ws = ep_wakeup_source(epi);
		if (ws) {
			if (ws->active)
				__pm_stay_awake(ep->ws);
			__pm_relax(ws);
		}

        // Unlink the epitem from the ready list
		list_del_init(&epi->rdllink);

        
        // Re-poll the item to collect the events that are ready right now;
        // skip it if nothing is pending any more
		revents = ep_item_poll(epi, &pt, 1);
		if (!revents)
			continue;

		// Copy the ready events and the user data into the caller's buffer.
		// On a fault the item is put back on the list and -EFAULT is recorded
		// only if nothing has been delivered yet.
		if (__put_user(revents, &uevent->events) ||
		    __put_user(epi->event.data, &uevent->data)) {
			list_add(&epi->rdllink, head);
			ep_pm_stay_awake(epi);
			if (!esed->res)
				esed->res = -EFAULT;
			return 0;
		}
		esed->res++;
		uevent++;
	    
	 
		if (epi->event.events & EPOLLONESHOT)
			epi->event.events &= EP_PRIVATE_BITS;
		else if (!(epi->event.events & EPOLLET)) {
			 // Level-triggered mode: put the item back on the ready list so that
			 // the next epoll_wait call checks it for pending events again
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}

水平触发：遍历epoll下的等待队列的每个entry，唤醒entry节点之后从ready_list移除当前socket事件，然后再轮询当前item收集可用的事件，最后添加到ready_list以便于调用epoll_wait的时候能够检查到socket事件可用
边缘触发：遍历epoll下的等待队列的每个entry，唤醒entry节点之后从ready_list移除当前socket事件，再轮询当前item收集可用的事件然后唤醒执行的业务处理read_process

疾风先生

发布了81 篇原创文章 · 获赞 27 · 访问量 3万+

私信关注

深入epoll技术原理分析

epoll技术

边缘触发与水平触发

猜你喜欢