网络编程---IO复用之epoll模型

前文已经介绍了IO复用的select模型，并列举了在并发数量比较大的时候select的缺点。本文将介绍另外一种IO复用模型——epoll模型。该模型能够高效支撑数以百万计的并发连接，因此在大规模的网络编程中应用很广：在大规模网络编程中，Linux下通常选择epoll，Windows下通常选择IOCP。

epoll的相关系统调用只有三个，使用上比较简单，但是要想用好epoll，需要了解epoll的各种工作模式，比如水平触发（LT）和边缘触发（ET）。

1、int epoll_create(int size)

自Linux 2.6.8起参数size被忽略（但仍必须大于0）。调用该接口将创建一个epoll句柄，使用完该句柄后必须调用close()将其关闭。

2、int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)

epoll的事件注册接口，调用该函数将向epoll注册一个事件，事件的类型由event结构体指定。

参数epfd是epoll_create的返回值，第三个参数fd是要操作的文件描述符（例如socket），第二个参数op是操作类型，取值为下列三个宏之一：

EPOLL_CTL_ADD //向epoll注册一个事件

EPOLL_CTL_DEL //删除一个事件

EPOLL_CTL_MOD //修改已经注册的事件。

第四个参数是一个epoll_event结构体类型指针，定义如下：

扫描二维码关注公众号，回复： 905225 查看本文章

/* User data carried inside struct epoll_event.  Because this is a
   union, all four members share the same storage, so only the member
   that was written last holds a meaningful value.  */
typedef union epoll_data
{
  void *ptr;
  int fd;
  uint32_t u32;
  uint64_t u64;
} epoll_data_t;

/* One epoll event record: a 32-bit event mask together with the user
   data associated with it.  This is the type that epoll_ctl() takes
   by pointer and that epoll_wait() receives as an array.  */
struct epoll_event
{
  uint32_t events;	/* Epoll events */
  epoll_data_t data;	/* User data variable */
} __EPOLL_PACKED;

3 、int epoll_wait (int __epfd, struct epoll_event *__events, int __maxevents, int __timeout);

等待已注册的事件发生。参数__events是调用者已经分配好的epoll_event数组，有事件发生时内核会将相应的event拷贝到该数组中；__maxevents是一次最多返回的event数量（必须大于0）；__timeout是epoll_wait函数的阻塞时间，单位为毫秒：若timeout == -1则函数一直阻塞直到有注册的事件发生；若timeout == 0则epoll_wait检查完就绪事件后立即返回；若是其他正整数，则epoll_wait最多等待该指定的时间，期间没有事件发生则函数超时返回0。

epoll相关的源码可参考系统的<sys/epoll.h>文件

/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef	_SYS_EPOLL_H
#define	_SYS_EPOLL_H	1

#include <stdint.h>
#include <sys/types.h>

/* Get __sigset_t.  */
#include <bits/sigset.h>

#ifndef __sigset_t_defined
# define __sigset_t_defined
typedef __sigset_t sigset_t;
#endif

/* Get the platform-dependent flags.  */
#include <bits/epoll.h>

#ifndef __EPOLL_PACKED
# define __EPOLL_PACKED
#endif


/* Event bits.  They are combined with bitwise OR and stored in the
   `events' member of struct epoll_event.  Every enumerator is followed
   by a same-named #define, which makes the name visible to the
   preprocessor as well (for example to an #ifdef test).  */
enum EPOLL_EVENTS
  {
    EPOLLIN = 0x001,
#define EPOLLIN EPOLLIN
    EPOLLPRI = 0x002,
#define EPOLLPRI EPOLLPRI
    EPOLLOUT = 0x004,
#define EPOLLOUT EPOLLOUT
    EPOLLRDNORM = 0x040,
#define EPOLLRDNORM EPOLLRDNORM
    EPOLLRDBAND = 0x080,
#define EPOLLRDBAND EPOLLRDBAND
    EPOLLWRNORM = 0x100,
#define EPOLLWRNORM EPOLLWRNORM
    EPOLLWRBAND = 0x200,
#define EPOLLWRBAND EPOLLWRBAND
    EPOLLMSG = 0x400,
#define EPOLLMSG EPOLLMSG
    EPOLLERR = 0x008,
#define EPOLLERR EPOLLERR
    EPOLLHUP = 0x010,
#define EPOLLHUP EPOLLHUP
    EPOLLRDHUP = 0x2000,
#define EPOLLRDHUP EPOLLRDHUP
    /* The remaining flags are written as shifts and occupy bits 29-31
       of the mask.  */
    EPOLLWAKEUP = 1u << 29,
#define EPOLLWAKEUP EPOLLWAKEUP
    EPOLLONESHOT = 1u << 30,
#define EPOLLONESHOT EPOLLONESHOT
    EPOLLET = 1u << 31
#define EPOLLET EPOLLET
  };


/* Valid opcodes ( "op" parameter ) to issue to epoll_ctl().  */
#define EPOLL_CTL_ADD 1	/* Add a file descriptor to the interface.  */
#define EPOLL_CTL_DEL 2	/* Remove a file descriptor from the interface.  */
#define EPOLL_CTL_MOD 3	/* Change file descriptor epoll_event structure.  */


/* User data carried inside struct epoll_event.  Because this is a
   union, all four members share the same storage, so only the member
   that was written last holds a meaningful value.  */
typedef union epoll_data
{
  void *ptr;
  int fd;
  uint32_t u32;
  uint64_t u64;
} epoll_data_t;

/* One epoll event record: a 32-bit event mask together with the user
   data associated with it.  epoll_ctl() takes a pointer to one of
   these and epoll_wait() fills an array of them.  __EPOLL_PACKED
   expands to nothing unless <bits/epoll.h> defined it before the
   fallback definition above.  */
struct epoll_event
{
  uint32_t events;	/* Epoll events */
  epoll_data_t data;	/* User data variable */
} __EPOLL_PACKED;


__BEGIN_DECLS

/* Creates an epoll instance.  Returns an fd for the new instance.
   The "size" parameter is a hint specifying the number of file
   descriptors to be associated with the new instance.  The fd
   returned by epoll_create() should be closed with close().  */
extern int epoll_create (int __size) __THROW;

/* Same as epoll_create but with an FLAGS parameter.  The unused SIZE
   parameter has been dropped.  */
extern int epoll_create1 (int __flags) __THROW;


/* Manipulate an epoll instance "epfd". Returns 0 in case of success,
   -1 in case of error ( the "errno" variable will contain the
   specific error code ) The "op" parameter is one of the EPOLL_CTL_*
   constants defined above. The "fd" parameter is the target of the
   operation. The "event" parameter describes which events the caller
   is interested in and any associated user data.  */
extern int epoll_ctl (int __epfd, int __op, int __fd,
		      struct epoll_event *__event) __THROW;


/* Wait for events on an epoll instance "epfd". Returns the number of
   triggered events returned in "events" buffer. Or -1 in case of
   error with the "errno" variable set to the specific error code. The
   "events" parameter is a buffer that will contain triggered
   events. The "maxevents" is the maximum number of events to be
   returned ( usually size of "events" ). The "timeout" parameter
   specifies the maximum wait time in milliseconds (-1 == infinite).

   This function is a cancellation point and therefore not marked with
   __THROW.  */
extern int epoll_wait (int __epfd, struct epoll_event *__events,
		       int __maxevents, int __timeout);


/* Same as epoll_wait, but the thread's signal mask is temporarily
   and atomically replaced with the one provided as parameter.

   This function is a cancellation point and therefore not marked with
   __THROW.  */
extern int epoll_pwait (int __epfd, struct epoll_event *__events,
			int __maxevents, int __timeout,
			const __sigset_t *__ss);

__END_DECLS

#endif /* sys/epoll.h */

Epoll的工作原理

与select类似，epoll也是监视用户注册的事件并等待事件的发生。epoll_wait函数的返回值是就绪文件描述符的数量，就绪的事件被放在events数组的前若干项中，用户只需遍历这些项即可知道哪个文件描述符发生了什么事件；而select返回后，用户需要把自己关心的描述符全部扫描一遍才能找出就绪的描述符。此外，文件描述符只需通过epoll_ctl向内核注册一次，不必像select那样每次调用都把整个描述符集合在用户空间与内核之间来回拷贝（网上常见“epoll使用了内存映射技术（mmap）”的说法，但Linux内核的epoll实现并没有使用mmap，epoll_wait只是把就绪事件拷贝到用户空间）。更主要的是epoll采用了一种类似Reactor的模式：用户通过epoll_ctl向内核注册事件，一旦事件发生，内核通过回调把对应的描述符加入就绪列表，再由epoll_wait返回给用户，而select每次都需要主动扫描全部描述符。

关于epoll的ET和LT

ET（Edge trigger，边缘触发）：当epoll处于该种事件通知模式下，只有在状态发生变化（例如有新数据到达）时内核才通知一次，此后即使缓冲区中仍有数据也不再通知。若用户在收到通知后没有把数据一次性读完，剩余的数据并不会被丢弃，但要等到下一次有新数据到达时才会再次得到通知，因此ET模式下应使用非阻塞描述符，并循环读取直到返回EAGAIN。

LT（Level trigger，水平触发）：与ET相对，当epoll工作在该种模式下，只要缓冲区中还有未读的数据，内核就会一直通知，直到用户读取完缓冲区的数据。

epoll默认工作在LT模式，用户可以在通过epoll_ctl注册事件时加上EPOLLET标志，将该事件设置为ET模式：

	/* Register listenfd with the epoll instance efd for edge-triggered
	   (EPOLLET) read (EPOLLIN) notifications. */
	event.data.fd = listenfd;
	event.events = EPOLLIN  | EPOLLET;
	epoll_ctl(efd, EPOLL_CTL_ADD, listenfd, &event);

epoll实现的ECHO服务器实例：

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>

#define MAX_EVENT_NUM 1024
/*
 * Switch descriptor `fd` to non-blocking mode by adding O_NONBLOCK to
 * its file status flags.
 *
 * Returns 0 on success.  Returns -1, after reporting the failure with
 * perror(), when the flags cannot be read or cannot be written back.
 */
int make_socket_nonblock(int fd)
{
	int current = fcntl(fd, F_GETFL, 0);

	if (current < 0)
	{
		perror("get flags failed");
		return -1;
	}

	/* Keep every flag that is already set and add O_NONBLOCK. */
	current |= O_NONBLOCK;
	if (fcntl(fd, F_SETFL, current) < 0)
	{
		perror("set flags failed");
		return -1;
	}

	return 0;
}


/*
 * Turn on the SO_REUSEADDR option for socket `fd`.
 *
 * Returns 0 on success and -1 when setsockopt() fails.
 */
int make_socket_reuseable(int fd)
{
	int enabled = 1;
	int rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &enabled, sizeof(enabled));

	return (rc < 0) ? -1 : 0;
}

/*
 * Accept one pending connection on the listening socket `fd` and print
 * the peer's IPv4 address and port to stdout.
 *
 * Returns the connected descriptor, or -1 when accept() fails.
 */
int doAccept(int fd)
{
	struct sockaddr_in peer;
	socklen_t peerLen = sizeof(peer);
	char peerIp[32] = {0};
	int client;

	memset(&peer, 0, sizeof(peer));
	client = accept(fd, (struct sockaddr*)&peer, &peerLen);
	if (client < 0)
		return -1;

	inet_ntop(AF_INET, &peer.sin_addr, peerIp, sizeof(peerIp));
	printf("accept client connect %s:%d.\n", peerIp, ntohs(peer.sin_port));
	return client;
}


/*
 * Echo everything that is currently readable on the connected socket
 * `fd` back to the peer.
 *
 * The function keeps calling recv() until it stops returning data, so
 * `fd` is expected to be non-blocking (otherwise the last recv() would
 * block).  For every chunk received it:
 *   - prints the chunk to stdout,
 *   - sends exactly the received bytes back to the peer,
 *   - terminates the whole process with exit(0) when the chunk, cut at
 *     its first '\r' or '\n', equals "quit".
 *
 * When recv() returns 0 (the peer closed the connection) the descriptor
 * is closed here.  When recv() returns a negative value the function
 * just returns; the descriptor is left open for the caller to deal with.
 */
void doRead(int fd)
{
	char recvBuf[1024];
	ssize_t got;

	/* Read at most sizeof - 1 bytes so the chunk can always be
	   NUL-terminated before it is treated as a string. */
	while ((got = recv(fd, recvBuf, sizeof(recvBuf) - 1, 0)) > 0)
	{
		ssize_t sent = 0;

		recvBuf[got] = '\0';
		printf("msg recv is %s\n", recvBuf);

		/* Echo only the bytes just received, resuming after short
		   writes.  MSG_NOSIGNAL keeps a vanished peer from raising
		   SIGPIPE; on any send failure the rest of this chunk is
		   dropped. */
		while (sent < got)
		{
			ssize_t w = send(fd, recvBuf + sent, (size_t)(got - sent), MSG_NOSIGNAL);
			if (w <= 0)
				break;
			sent += w;
		}

		/* Cut the chunk at the first line terminator (or leave it
		   unchanged if there is none) before comparing. */
		recvBuf[strcspn(recvBuf, "\r\n")] = '\0';
		if (strcmp(recvBuf, "quit") == 0)
			exit(0);
	}

	/* Orderly shutdown by the peer: release the descriptor. */
	if (got == 0)
		close(fd);
}

/*
 * Entry point of the epoll echo server.
 *
 * Creates a non-blocking, address-reusable TCP socket bound to port
 * 12345 on all interfaces, puts it into the listening state, registers
 * it with a new epoll instance for edge-triggered read events and then
 * loops forever dispatching events:
 *   - listening socket ready: accept connections via doAccept() until it
 *     fails, make each one non-blocking and register it for
 *     edge-triggered read events;
 *   - EPOLLERR / EPOLLHUP on a client: close that client;
 *   - EPOLLIN on a client: hand the client descriptor to doRead().
 *
 * The loop has no normal exit.  Returns -1 when setup fails or
 * epoll_wait() fails with anything other than EINTR; a failure while
 * setting up a single client only drops that client.
 */
int main(void)
{
	int efd = -1;
	int listenfd = -1;
	int connfd;
	struct epoll_event event;
	struct epoll_event * events = NULL;
	struct sockaddr_in SerAddr;

	memset(&SerAddr, 0, sizeof(SerAddr));
	SerAddr.sin_family = AF_INET;
	SerAddr.sin_addr.s_addr = htonl(INADDR_ANY);
	SerAddr.sin_port = htons(12345);

	listenfd = socket(AF_INET, SOCK_STREAM, 0);
	if(listenfd < 0)
	{
		perror("listenfd < 0");
		goto fail;
	}
	if(make_socket_nonblock(listenfd) < 0)
		goto fail;
	if(make_socket_reuseable(listenfd) < 0)
		goto fail;
	if(bind(listenfd, (struct sockaddr*)&SerAddr, sizeof(SerAddr)) < 0)
	{
		perror("bind error");
		goto fail;
	}
	/* Without listen() the socket never accepts connections. */
	if(listen(listenfd, SOMAXCONN) < 0)
	{
		perror("listen error");
		goto fail;
	}

	efd = epoll_create(32000);
	if(efd < 0)
	{
		perror("epoll create");
		goto fail;
	}

	memset(&event, 0, sizeof(event));
	event.data.fd = listenfd;
	event.events = EPOLLIN | EPOLLET;
	if(epoll_ctl(efd, EPOLL_CTL_ADD, listenfd, &event) < 0)
	{
		perror("epoll-cntl");
		goto fail;
	}

	/* The cast with an explicit `struct` keeps this line valid in both
	   C and C++. */
	events = (struct epoll_event *)calloc(MAX_EVENT_NUM, sizeof(*events));
	if(events == NULL)
	{
		perror("calloc");
		goto fail;
	}

	while(1)
	{
		int n = epoll_wait(efd, events, MAX_EVENT_NUM, -1);
		if(n < 0)
		{
			/* A signal interrupted the wait: just wait again. */
			if(errno == EINTR)
				continue;
			perror("epoll wait");
			goto fail;
		}

		for(int i = 0; i < n; i++)
		{
			int fd = events[i].data.fd;
			uint32_t ready = events[i].events;

			if(fd == listenfd)
			{
				int acceptErr;

				/* The listening socket is edge-triggered, so one
				   wakeup may stand for several pending connections:
				   accept until the queue is empty. */
				while((connfd = doAccept(listenfd)) >= 0)
				{
					/* make_socket_nonblock reports its own failure. */
					if(make_socket_nonblock(connfd) < 0)
					{
						close(connfd);
						continue;
					}

					event.data.fd = connfd;
					event.events = EPOLLIN | EPOLLET;
					if(epoll_ctl(efd, EPOLL_CTL_ADD, connfd, &event) < 0)
					{
						perror("epoll add");
						close(connfd);
					}
				}

				/* An empty queue is the normal way out of the loop;
				   report anything else. */
				acceptErr = errno;
				if(acceptErr != EAGAIN && acceptErr != EWOULDBLOCK)
				{
					errno = acceptErr;
					perror("accept failed");
				}
			}
			else if(ready & (EPOLLERR | EPOLLHUP))
			{
				/* Checked before EPOLLIN so a failed connection is
				   released instead of being read from. */
				fprintf(stderr, "epoll error on fd %d\n", fd);
				close(fd);
			}
			else if(ready & EPOLLIN)
			{
				doRead(fd);
			}
		}
	}

fail:
	free(events);
	if(efd >= 0)
		close(efd);
	if(listenfd >= 0)
		close(listenfd);
	return -1;
}

epoll的优点:

epoll能够支持的描述符数目只受进程可打开的最大文件描述符数限制，因此能够支持数目较大的并发量，与select、多线程/多进程相比，epoll的这个优势无疑是明显的。可以通过ulimit -n查看当前进程可打开的最大文件描述符数量，通过cat /proc/sys/fs/file-max查看整个系统支持的最大文件描述符数量，后者通常1G内存是10W左右，这个数字和内存相关。

IO效率不随文件描述符数量的增长而变差。select的一个缺点是当有一个很大的描述符集时，不管当前有多少文件描述符处于活动状态，select都会全部扫描，导致IO效率下降。而epoll不存在这个问题：epoll只关注活动的描述符，当某一个描述符就绪时，内核通过类似回调的方式把它加入就绪列表；对于那些没有处于活动状态的描述符，epoll不需要处理，因此它们不影响IO效率。

减少内核与用户空间之间的数据传递开销。epoll不像select那样每次调用都要把所有描述符在用户空间与内核空间之间来回拷贝：描述符只需通过epoll_ctl注册一次，epoll_wait只把就绪的事件拷贝到用户空间（常见的“epoll使用内存映射(mmap)技术”的说法并不准确，Linux内核的epoll实现并未使用mmap）。描述符数量越大，这种效率的提升就越明显。

以上epoll的特点就是epoll能够支持数百万计并发的理由，因此当前Linux服务器的实现大多都采用epoll的机制。

网络编程---IO复用之epoll模型

猜你喜欢