linux-2.6.11.12内核中的epoll数据结构和机制简析

大家都知道epoll的效率比poll和select高很多,原因在于:内核为epoll实现了一个专用的文件系统,并使用红黑树来增删改查struct epitem;当被监控的文件描述符上有事件到来时,通过回调将对应的struct epitem挂入struct eventpoll的就绪队列中。这样epoll_wait只需把就绪的事件拷贝到用户态,既不需要像select/poll那样每次调用都传递整个fd集合,也不需要轮询整个集合来查找就绪的fd,所以效率高。

这个文件系统由eventpoll_init初始化:先注册文件系统eventpoll_fs_type(名字为eventpollfs),再调用kern_mount将其安装上,返回值保存在全局变量eventpoll_mnt中。基本原理可以参考《linux文件系统-文件系统的安装与拆卸》。

epoll的系统调用:

epoll_create:创建struct eventpoll 
epoll_ctl:增删改事件
epoll_wait:等待就绪事件

先看个图:

eventpoll_init:



/*
 * Initialisation steps shown in this excerpt: register the eventpoll
 * virtual filesystem, then mount it internally and keep the mount in
 * eventpoll_mnt. Either failure ends in panic(). The "......" line marks
 * code left out by the article (presumably including the declaration of
 * "error").
 */
static int __init eventpoll_init(void)
{
......

	/*
	 * Register the virtual file system that will be the source of inodes
	 * for the eventpoll files
	 */
	error = register_filesystem(&eventpoll_fs_type);// register the filesystem
	if (error)
		goto epanic;

	/* Mount the above commented virtual file system */
	eventpoll_mnt = kern_mount(&eventpoll_fs_type);// mount the filesystem
	/* PTR_ERR() is read first; it is only meaningful if IS_ERR() holds. */
	error = PTR_ERR(eventpoll_mnt);
	if (IS_ERR(eventpoll_mnt))
		goto epanic;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
			current));
	return 0;

epanic:
	panic("eventpoll_init() failed\n");
}

sys_epoll_create:



/*
 * It opens an eventpoll file descriptor by suggesting a storage of "size"
 * file descriptors. The size parameter is just an hint about how to size
 * data structures. It won't prevent the user to store more than "size"
 * file descriptors inside the epoll interface. It is the kernel part of
 * the userspace epoll_create(2).
 */
asmlinkage long sys_epoll_create(int size)
{
	int error, fd;
	struct inode *inode;
	struct file *file;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
		     current, size));

	/* Sanity check on the size parameter */
	error = -EINVAL;
	if (size <= 0)//传入参数检查,必须大于0
		goto eexit_1;

	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure, and inode and a free file descriptor.
	 */
	error = ep_getfd(&fd, &inode, &file);//创建file、inode、dentry,并相互关联
	if (error)
		goto eexit_1;

	/* Setup the file internal data structure ( "struct eventpoll" ) */
   /*创建一个eventpoll对象,初始化等待队列、红黑树、就绪队列,并将其地址挂入file->private_data*/
	error = ep_file_init(file);
	if (error)
		goto eexit_2;


	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
		     current, size, fd));

	return fd;//返回对应的文件描述符,在调用epoll_wait、epoll_ctrl作为参数传入

eexit_2:
	sys_close(fd);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
		     current, size, error));
	return error;
}

 sys_epoll_create=>ep_getfd:(这个函数大部分与文件系统有关,暂不列出)

继续往下看:

sys_epoll_create=>ep_file_init



/*
 * Allocate a zeroed "struct eventpoll", initialise its lock, semaphore,
 * two wait queues, ready list and rb-tree root, and store it in
 * file->private_data.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int ep_file_init(struct file *file)
{
	struct eventpoll *ep = kmalloc(sizeof(*ep), GFP_KERNEL);

	if (!ep)
		return -ENOMEM;
	memset(ep, 0, sizeof(struct eventpoll));

	/* Synchronisation primitives. */
	rwlock_init(&ep->lock);
	init_rwsem(&ep->sem);

	/* Wait queues: one used by ep_poll() sleepers, one for poll_wait. */
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);

	/* Containers: ready list and the rb-tree of items. */
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT;

	file->private_data = ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
		     current, ep));
	return 0;
}

int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

对照上面的函数原型来看sys_epoll_ctl的实现:第一个参数epfd是sys_epoll_create返回的fd,通过它可以找到对应的struct eventpoll;第二个参数op表示增、删、改三种操作之一;第三个参数fd是要监听的目标文件描述符;第四个参数event描述感兴趣的事件(可读、可写等):

sys_epoll_ctl:



/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.  It represents
 * the kernel part of the user space epoll_ctl(2).
 *
 * epfd:  descriptor of the eventpoll file
 * op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD
 * fd:    descriptor of the target file
 * event: user-space event description, copied in only when
 *        EP_OP_HASH_EVENT(op) is true
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify() for the
 * matching operation, or -EFAULT, -EBADF, -EPERM, -EINVAL, -EEXIST or
 * -ENOENT as assigned below. An op matching none of the three cases
 * leaves the result at -EINVAL.
 */
asmlinkage long
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
	int error;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
		     current, epfd, op, fd, event));

	/* Copy the event description from user space when the op needs one */
	error = -EFAULT;
	if (EP_OP_HASH_EVENT(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto eexit_1;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);// file structure behind the eventpoll descriptor
	if (!file)
		goto eexit_1;

	/* Get the "struct file *" for the target file */
	tfile = fget(fd);// file structure of the descriptor to be monitored
	if (!tfile)
		goto eexit_2;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)// target file has no poll operation: fail
		goto eexit_3;

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (file == tfile || !IS_FILE_EPOLL(file))
		goto eexit_3;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;// the struct eventpoll set up by ep_file_init()

	down_write(&ep->sem);

	/* Try to lookup the file inside our hash table */
	epi = ep_find(ep, tfile, fd);// look for an existing struct epitem first

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:// add a new item
		if (!epi) {
			/* POLLERR and POLLHUP are always added to the requested mask */
			epds.events |= POLLERR | POLLHUP;

			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:// remove an existing item
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:// modify an existing item
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}

	/*
	 * The function ep_find() increments the usage count of the structure
	 * so, if this is not NULL, we need to release it.
	 */
	if (epi)
		ep_release_epitem(epi);

	up_write(&ep->sem);

	/* Unwind in reverse order of acquisition: target file, then epoll file */
eexit_3:
	fput(tfile);
eexit_2:
	fput(file);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
		     current, epfd, op, fd, event, error));

	return error;
}

我们以添加为例,并且是TCP连接:

sys_epoll_ctl=>ep_insert

/*
 * Create a new item for (tfile, fd) in the eventpoll "ep": allocate and
 * initialise the epitem, hook it onto the target file through its poll
 * method, link it into the file's f_ep_links list and into the rb-tree,
 * and queue it on the ready list right away if the requested events are
 * already pending.
 *
 * Returns 0 on success. Returns -ENOMEM if the item cannot be allocated
 * or if the wait-queue hookup signalled failure (epi->nwait < 0).
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	struct epitem *epi;
	struct ep_pqueue epq;

	/* "error" stays -ENOMEM for both failure exits below */
	error = -ENOMEM;
	if (!(epi = EPI_MEM_ALLOC()))
		goto eexit_1;

	/* Item initialization follow here ... */
	EP_RB_INITNODE(&epi->rbn);// initialise the red-black tree node
	INIT_LIST_HEAD(&epi->rdllink);// initialise the ready-list link
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->txlink);
	INIT_LIST_HEAD(&epi->pwqlist);// initialise this item's list of poll wait entries
	epi->ep = ep;// key point: remember the owning eventpoll (presumably how the wakeup callback later reaches the ready list; callback not shown here)
	EP_SET_FFD(&epi->ffd, tfile, fd);// record the (file, fd) pair; per the article this is the rb-tree key
	epi->event = *event;// events the caller is interested in
	atomic_set(&epi->usecnt, 1);// usage count starts at 1
	epi->nwait = 0;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;// key point: ep_ptable_queue_proc() recovers this item and stores it in eppoll_entry->base
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);// install ep_ptable_queue_proc as the poll-table queue callback
	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);// for a TCP socket this ends up hooking the item onto sk->sk_sleep

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	if (epi->nwait < 0)
		goto eexit_2;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_ep_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_ep_lock);

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irqsave(&ep->lock, flags);

	/* Add the current item to the rb-tree */
	ep_rbtree_insert(ep, epi);// insert into the red-black tree

	/* If the file is already "ready" we drop it inside the ready list */
	if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
		     current, ep, tfile, fd));

	return 0;

eexit_2:
	/* Undo whatever wait-queue registrations did succeed */
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue.
	 */
	write_lock_irqsave(&ep->lock, flags);
	if (EP_IS_LINKED(&epi->rdllink))
		EP_LIST_DEL(&epi->rdllink);
	write_unlock_irqrestore(&ep->lock, flags);

	EPI_MEM_FREE(epi);
eexit_1:
	return error;
}

sys_epoll_ctl=>ep_insert=>init_poll_funcptr

/*
 * Store the queueing callback "qproc" in the poll table "pt"; poll_wait()
 * later invokes it through pt->qproc.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->qproc = qproc;// in the epoll path shown here, qproc is ep_ptable_queue_proc
}

sys_epoll_ctl=>ep_insert=>sock_poll

/*
 * File-level poll entry for sockets; runs with no kernel lock held.
 * Looks up the socket behind the file's inode and forwards the request
 * to the protocol's poll operation, returning whatever that returns.
 */
static unsigned int sock_poll(struct file *file, poll_table * wait)
{
	/*
	 * Errors cannot be reported through poll, so the answer is only
	 * ever "yes" or "no".
	 */
	struct socket *sock = SOCKET_I(file->f_dentry->d_inode);

	return sock->ops->poll(file, sock, wait);
}

sys_epoll_ctl=>ep_insert=>sock_poll=>tcp_poll


/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 *
 *	This is an excerpt: the "......" line stands for code the article
 *	left out (presumably where "mask" is computed).
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	struct tcp_sock *tp = tcp_sk(sk);

	poll_wait(file, sk->sk_sleep, wait);// pass the poll table to poll_wait() together with the socket's sk_sleep wait queue
......
	return mask;
}

sys_epoll_ctl=>ep_insert=>sock_poll=>tcp_poll=>poll_wait

/*
 * Invoke the poll table's queueing callback for "wait_address".
 * Does nothing when either the poll table or the wait queue head is NULL.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (!p || !wait_address)
		return;

	/* In the epoll path shown here the callback is ep_ptable_queue_proc. */
	p->qproc(filp, wait_address, p);
}

sys_epoll_ctl=>ep_insert=>sock_poll=>tcp_poll=>poll_wait=>ep_ptable_queue_proc

/*
 * Poll-table callback: adds one wait queue entry of ours to the wakeup
 * list "whead" of the target file.
 *
 * On success the new entry is also linked into the item's pwqlist and
 * epi->nwait is incremented. If an earlier call already failed
 * (nwait < 0) or the allocation fails, nwait is set to -1 so the caller
 * can detect the error.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct eppoll_entry *entry;
	struct epitem *item = EP_ITEM_FROM_EPQUEUE(pt);

	if (item->nwait < 0 || !(entry = PWQ_MEM_ALLOC())) {
		/* We have to signal that an error occurred */
		item->nwait = -1;
		return;
	}

	/*
	 * Key point: the entry's wakeup function is ep_poll_callback, so a
	 * wakeup on "whead" runs it. Per the article it moves the item onto
	 * the eventpoll ready list and wakes the waiters (callback body not
	 * shown here).
	 */
	init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
	entry->whead = whead;
	entry->base = item;	/* lets the entry lead back to its epitem */

	/* Publish only after the entry is fully set up. */
	add_wait_queue(whead, &entry->wait);
	list_add_tail(&entry->llink, &item->pwqlist);
	item->nwait++;
}

继续回到ep_insert函数中,里面调用ep_rbtree_insert将epi插入ep的红黑树中,一些其他处理完成后,返回。

接着看epoll_wait的实现:

int epoll_wait(int epfd, struct epoll_event *events,
                      int maxevents, int timeout);

对应内核的函数为:

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 *
 * epfd:      descriptor of the eventpoll file
 * events:    user-space array that receives the ready events
 * maxevents: capacity of "events"; must be positive and small enough that
 *            the array's byte size cannot overflow
 * timeout:   passed through to ep_poll()
 *
 * Returns the result of ep_poll(), or -EINVAL for a bad maxevents or a
 * descriptor that is not an eventpoll file, -EBADF if epfd is not open,
 * or the error from verify_area().
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
	int error;
	struct file *file;
	struct eventpoll *ep;
	/*
	 * Largest event count whose array size in bytes still fits in an int.
	 * (int)(~0U >> 1) is the value the kernel uses for INT_MAX.
	 */
	const int max_events = (int)((~0U >> 1) / sizeof(struct epoll_event));

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
		     current, epfd, events, maxevents, timeout));

	/*
	 * The maximum number of events must be greater than zero. It must
	 * also be bounded: otherwise the multiplication below can wrap to a
	 * small size, verify_area() would approve only that small area, and
	 * the later copy-out could write beyond it.
	 */
	if (maxevents <= 0 || maxevents > max_events)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if ((error = verify_area(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))))
		goto eexit_1;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);	/* file structure behind the eventpoll descriptor */
	if (!file)
		goto eexit_1;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!IS_FILE_EPOLL(file))
		goto eexit_2;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;	/* the struct eventpoll */

	/* Time to fish for events ... this is where the real work happens */
	error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
	fput(file);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
		     current, epfd, events, maxevents, timeout, error));

	return error;
}

sys_epoll_wait=>ep_poll



/*
 * Wait until the eventpoll "ep" has ready items, the timeout expires or a
 * signal is pending, then try to transfer ready events to user space.
 *
 * timeout is in milliseconds; -1 means wait without limit.
 *
 * Returns the value of ep_events_transfer() when events were available,
 * -EINTR if interrupted by a signal, or 0 if nothing became ready.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Calculate the timeout by checking for the "infinite" value ( -1 )
	 * and the overflow condition. The passed timeout is in milliseconds,
	 * that why (t * HZ) / 1000.
	 */
	jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
		MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;// range-check the timeout and convert milliseconds to jiffies (rounding up)

retry:
	write_lock_irqsave(&ep->lock, flags);

	res = 0;
	if (list_empty(&ep->rdllist)) {// the ready list is empty
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 */
		init_waitqueue_entry(&wait, current);
		add_wait_queue(&ep->wq, &wait);// queue the current task on the eventpoll wait queue

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);// mark the task as sleeping interruptibly
			if (!list_empty(&ep->rdllist) || !jtimeout)// leave the loop once the ready list is non-empty or the timeout has run out
				break;
			if (signal_pending(current)) {// a signal is waiting to be handled
				res = -EINTR;
				break;
			}

			/* The lock is dropped while sleeping and retaken afterwards */
			write_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);// sleep; the return value is the time still remaining
			write_lock_irqsave(&ep->lock, flags);
		}
		remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);// back to the running state
	}

	/* Is it worth to try to dig for events ? */
	eavail = !list_empty(&ep->rdllist);

	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)// ready list is non-empty: hand the events to the user-space caller
		goto retry;

	return res;
}

sys_epoll_wait=>ep_poll=>ep_events_transfer

/*
 * Transfer ready events to user space.
 *
 * Collects up to "maxevents" ready items into a private list, sends them
 * to the "events" array and then reinjects the items. Returns the value
 * of ep_send_events(), or 0 when nothing was collected.
 */
static int ep_events_transfer(struct eventpoll *ep,
			      struct epoll_event __user *events, int maxevents)
{
	struct list_head pending;
	int sent = 0;
	int collected;

	INIT_LIST_HEAD(&pending);

	/*
	 * Hold the semaphore for reading: eventpoll_release_file() and
	 * epoll_ctl(EPOLL_CTL_DEL) could otherwise hit us.
	 */
	down_read(&ep->sem);

	/* Move ready items from the ready list onto our private list. */
	collected = ep_collect_ready_items(ep, &pending, maxevents);
	if (collected > 0) {
		/* Build the result set in the user-space buffer. */
		sent = ep_send_events(ep, &pending, events);

		/* Put the collected items back where they belong. */
		ep_reinject_items(ep, &pending);
	}

	up_read(&ep->sem);

	return sent;
}

sys_epoll_wait=>ep_poll=>ep_events_transfer=>ep_collect_ready_items

/*
 * Since we have to release the lock during the __copy_to_user() operation and
 * during the f_op->poll() call, we try to collect the maximum number of items
 * by reducing the irqlock/irqunlock switching rate.
 *
 * Moves up to "maxevents" items from ep->rdllist onto "txlist" while
 * holding ep->lock, and returns how many items were moved.
 */
static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
{
	int nepi;
	unsigned long flags;
	struct list_head *lsthead = &ep->rdllist, *lnk;
	struct epitem *epi;

	write_lock_irqsave(&ep->lock, flags);
    // walk the ready list, unlinking items and linking them into txlist, at most maxevents of them
	for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
		epi = list_entry(lnk, struct epitem, rdllink);

		/* Advance first: the current node may be unlinked below */
		lnk = lnk->next;

		/* Skip an item whose txlink is already linked into a transfer list */
		if (!EP_IS_LINKED(&epi->txlink)) {
			/*
			 * This is initialized in this way so that the default
			 * behaviour of the reinjecting code will be to push back
			 * the item inside the ready list.
			 */
			epi->revents = epi->event.events;

			/* Link the ready item into the transfer list */
			list_add(&epi->txlink, txlist);// link into txlist
			nepi++;

			/*
			 * Unlink the item from the ready list.
			 */
			EP_LIST_DEL(&epi->rdllink);// unlink from the ready list
		}
	}

	write_unlock_irqrestore(&ep->lock, flags);

	return nepi;
}

sys_epoll_wait=>ep_poll=>ep_events_transfer=>ep_send_events

/*
 * This function is called without holding the "ep->lock" since the call to
 * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
 * because of the way poll() is traditionally implemented in Linux.
 *
 * For every item on "txlist", re-polls the file and, if any requested
 * event is pending, stores the event mask and user data in the next slot
 * of "events".
 *
 * Returns the number of events stored, or -EFAULT as soon as a store to
 * user space fails (the count of events stored so far is not reported
 * in that case).
 */
static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
			  struct epoll_event __user *events)
{
	int eventcnt = 0;
	unsigned int revents;
	struct list_head *lnk;
	struct epitem *epi;

	/*
	 * We can loop without lock because this is a task private list.
	 * The test done during the collection loop will guarantee us that
	 * another task will not try to collect this file. Also, items
	 * cannot vanish during the loop because we are holding "sem".
	 */
	list_for_each(lnk, txlist) {
		epi = list_entry(lnk, struct epitem, txlink);

		/*
		 * Get the ready file event set. We can safely use the file
		 * because we are holding the "sem" in read and this will
		 * guarantee that both the file and the item will not vanish.
		 */
		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);// query the file's current events; the poll table is NULL, so poll_wait() queues nothing

		/*
		 * Set the return event set for the current file descriptor.
		 * Note that only the task that was successfully able to link
		 * the item to its "txlist" will write this field.
		 */
		epi->revents = revents & epi->event.events;

		if (epi->revents) {// something of interest is pending: copy it to the user-space buffer
			if (__put_user(epi->revents,
				       &events[eventcnt].events) ||
			    __put_user(epi->event.data,
				       &events[eventcnt].data))
				return -EFAULT;
			/* One-shot items keep only the bits in EP_PRIVATE_BITS after being reported */
			if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			eventcnt++;
		}
	}
	return eventcnt;
}

到这里,整个过程基本走完了。还有一点没讲到:socket有数据到来时,是怎样触发执行ep_poll_callback的。这要从创建socket说起:sys_socket经过一系列操作后会调用sock_init_data函数,其中有一行代码非常重要:

	sk->sk_data_ready	=	sock_def_readable;

我们来看下sock_def_readable做了什么:

/*
 * Data-ready handler installed as sk->sk_data_ready.
 *
 * Under the socket's callback read lock, wakes the interruptible sleepers
 * on sk->sk_sleep (when that queue exists and has waiters) and then sends
 * the async POLL_IN notification. "len" is not used.
 */
static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);

	if (sk->sk_sleep) {
		/*
		 * Waking the queue runs each entry's wakeup function; for an
		 * entry added by epoll that is ep_poll_callback.
		 */
		if (waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);
	}
	sk_wake_async(sk,1,POLL_IN);

	read_unlock(&sk->sk_callback_lock);
}

到这里,所有的就讲完了。

おすすめ

転載: blog.csdn.net/guoguangwu/article/details/121322115
おすすめ