从零开始之驱动开发、linux驱动（十六、poll机制）

上一节分析了系统调用函数的封装，这一节我们学习一下poll系统调用函数。就从最开始的sys_poll开始。

在开始之前我们先了解一下poll函数的作用。

使用下面命令查看poll的使用方法和作用可知

man 2  poll

官方的解释是:poll函数是用来，等待一组文件描述符中的一个准备好执行 I / O

int poll(struct pollfd *fds, nfds_t nfds, int timeout);

函数原型如上所示

其中第一个参数是指向struct pollfd结构体数组的指针，数组中的每个元素描述一个要等待的文件描述符及其关心的事件；第二个参数代表数组中元素（文件描述符）的数量（上面说了是一组文件描述符）；第三个参数代表等待超时自动唤醒的时间，单位是毫秒（ms），为负数时表示一直等待、不超时。

官网的解释说的太拗口，这用白话解释一下，poll函数可以等待多个文件描述符（该进程open打开的多个文件），如果要继续执行某个操作，必须要先得到IO，否则就睡眠，但这里的睡眠可以设置睡眠时间限制，即如果在设定时间到来之前还没有发生该事件，则时间到后会自动唤醒。其中因为可以设置多个文件描述符的等待事件，这里只要等到一个文件描述符的事件，则就会立刻返回。

在这里我先列出代码，后面分析poll机制的实现。

#include <linux/fs.h>       /* 包含file_operation结构体 */
#include <linux/init.h>     /* 包含module_init module_exit */
#include <linux/module.h>   /* 包含LICENSE的宏 */
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/gfp.h>
#include <linux/interrupt.h>
#include <linux/device.h>
#include <linux/err.h>      /* IS_ERR / PTR_ERR */
#include <linux/gpio.h>
#include <linux/kernel.h>
#include <linux/highmem.h> /* For wait_event_interruptible */
#include <linux/poll.h>
#include <asm/gpio.h>
#include <asm/uaccess.h>



static unsigned int major;
static struct class *button_class;
static struct device *button_dev;

static unsigned char key_val;
/* 定义一个等待队列的头 */
static DECLARE_WAIT_QUEUE_HEAD(button_wait_q);
/* 定义一个事件标志 */
static volatile int ev_press = 0;

/* Describes one button: the GPIO it is wired to and the bit it contributes to key_val. */
struct pin_desc {
    unsigned int    pin;        /* GPIO number, read with gpio_get_value() in the IRQ handler */
    unsigned int    key_val;    /* bit mask set in the global key_val while this button is pressed */
};


/*
 * Button table, one entry per interrupt line.
 * While pressed the buttons report 0x01 and 0x02 respectively;
 * once released the corresponding bit reads back as 0x00.
 */
static struct pin_desc pins_desc[] = {
    { .pin = S5PV210_GPH0(2), .key_val = 0x01 },
    { .pin = S5PV210_GPH0(3), .key_val = 0x02 },
};



/*
 * Shared interrupt handler for both buttons.
 *
 * dev_id is the struct pin_desc that was passed to request_irq(), so it
 * identifies which button fired. A low level on the pin means "pressed"
 * (the button's bit is set in key_val); a high level means "released"
 * (the bit is cleared). Afterwards the event flag is raised and every
 * sleeper on button_wait_q is woken.
 */
static irqreturn_t irq_handler(int irq, void *dev_id)
{
    struct pin_desc *desc = dev_id;

    if (gpio_get_value(desc->pin) == 0)
        key_val |= desc->key_val;       /* pressed */
    else
        key_val &= ~desc->key_val;      /* released */

    /* Publish the event before waking the waiters. */
    ev_press = 1;
    wake_up_interruptible(&button_wait_q);

    return IRQ_HANDLED;
}


/*
 * open(): claim the two button interrupts.
 *
 * The IRQs are requested here rather than at module load so that, while the
 * driver is installed but no application has the device open, the interrupt
 * lines remain available to other users.
 *
 * Returns 0 on success or the negative errno reported by request_irq().
 * If the second request fails, the first IRQ is released again so nothing
 * is left half-acquired.
 */
static int button_drv_open(struct inode *inode, struct file *file)
{
    int ret;

    ret = request_irq(IRQ_EINT(2), irq_handler, IRQF_TRIGGER_RISING|IRQF_TRIGGER_FALLING, "irq-eint2", &pins_desc[0]);
    if(ret)
    {
        printk(KERN_ERR"request_irq IRQ_EINT(2) fail\n");
        return ret;
    }

    ret = request_irq(IRQ_EINT(3), irq_handler, IRQF_TRIGGER_RISING|IRQF_TRIGGER_FALLING, "irq-eint3", &pins_desc[1]);
    if(ret)
    {
        printk(KERN_ERR"request_irq IRQ_EINT(3) fail\n");
        /* Undo the first request so a failed open leaves no IRQ held. */
        free_irq(IRQ_EINT(2), &pins_desc[0]);
        return ret;
    }

    return 0;
}


/*
 * read(): hand the current key value (one byte) to user space.
 *
 * This read does not sleep: the application is expected to call poll()
 * first and only read once the device is reported readable. (The previous
 * article's version blocked in wait_event_interruptible(button_wait_q,
 * ev_press) here instead.)
 *
 * Returns 1 on success, -EINVAL if the caller's buffer is smaller than one
 * byte, or -EFAULT if the user buffer cannot be written.
 */
static ssize_t button_drv_read(struct file *file, char __user *array, size_t size, loff_t *ppos)
{
    if(size < 1)
    {
        return -EINVAL;
    }

    /* copy_to_user() returns the number of bytes it could NOT copy. */
    if(copy_to_user(array, &key_val, 1))
    {
        /* Leave ev_press set so the event is not lost on a failed read. */
        return -EFAULT;
    }

    /* The event has been consumed; poll() reports "not readable" until the next IRQ. */
    ev_press = 0;

    return 1;
}


/*
 * poll(): register the caller on the button wait queue and report readiness.
 *
 * poll_wait() only adds the caller to button_wait_q; it does not sleep here.
 * Returns POLLIN | POLLRDNORM when a key event is pending, otherwise 0.
 */
static unsigned int button_drv_poll(struct file *file, struct poll_table_struct * wait)
{
    poll_wait(file, &button_wait_q, wait);

    /* A pending key event makes the device readable. */
    return ev_press ? (POLLIN | POLLRDNORM) : 0;
}



/*
 * release(): give back both button interrupts requested in button_drv_open().
 * The dev_id arguments must match the ones passed to request_irq().
 */
static int button_drv_close(struct inode *inode, struct file *file)
{
    free_irq(IRQ_EINT(2), &pins_desc[0]);
    free_irq(IRQ_EINT(3), &pins_desc[1]);

    return 0;
}

/* File operations exported by the button character device. */
static const struct file_operations button_drv_file_operation = {
    .owner      = THIS_MODULE,
    .open       = button_drv_open,      /* requests the button IRQs */
    .read       = button_drv_read,      /* copies one byte of key state to user space */
    .poll       = button_drv_poll,      /* reached from the poll() system call via f_op->poll */
    .release    = button_drv_close,     /* frees the button IRQs */
};


/*
 * Module init: register the character device with a dynamically assigned
 * major number, then create the class and the device node "button".
 *
 * Returns 0 on success or a negative errno. Every step that already
 * succeeded is undone in reverse order on failure.
 */
static int __init button_drv_init(void)
{
    int ret;

    /*
     * Ask for a dynamically allocated major number. The result is kept in a
     * signed local first: the global "major" is unsigned, so a negative
     * error code stored there could never be detected with "< 0".
     */
    ret = register_chrdev(0, "button_drv", &button_drv_file_operation);
    if(ret < 0)
    {
        printk(KERN_ERR"register_chrdev button_drv fail \n");
        return ret;
    }
    major = ret;

    /* Create the class; class_create() reports failure as ERR_PTR(), not NULL. */
    button_class = class_create(THIS_MODULE, "button_class");
    if(IS_ERR(button_class))
    {
        printk(KERN_ERR"class_create button_class fail\n");
        ret = PTR_ERR(button_class);
        goto err_class_create;
    }

    /* Create the device belonging to that class; failure is also an ERR_PTR(). */
    button_dev = device_create(button_class, NULL, MKDEV(major, 0), NULL, "button");
    if(IS_ERR(button_dev))
    {
        printk(KERN_ERR"device_create button_dev fail \n");
        ret = PTR_ERR(button_dev);
        goto err_device_create;
    }

    return 0;

/* Unwind in the reverse order of acquisition. */
err_device_create:
    class_destroy(button_class);
err_class_create:
    unregister_chrdev(major, "button_drv");

    return ret;
}


/*
 * Module exit: tear down what button_drv_init() created, in reverse order
 * (device, then class, then the character-device registration).
 */
static void __exit button_drv_exit(void)
{
    /* Remove the device that belongs to the class. */
    device_unregister(button_dev);
    /* Destroy the class. */
    class_destroy(button_class);
    /* Unregister the character device. */
    unregister_chrdev(major,"button_drv");
}

module_init(button_drv_init);
module_exit(button_drv_exit);
MODULE_LICENSE("GPL");

可以发现，poll驱动程序和前面小节的等待队列中的很相似，唯一的区别就是增加了一个poll函数，而且就连poll_wait函数的第二个参数用的都是等待队列的队列头。

接下来看一下应用程序。

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>


/*
 * Test program: wait for key events on /dev/button with poll().
 *
 * Each poll() waits at most 5000 ms. A return of 0 means the timeout
 * expired, a negative value means poll() itself failed, and a positive
 * value is the number of descriptors with events (details in revents).
 */
int main(int argc,char *argv[])
{
    char buf[2];
    int ret;
    struct pollfd fdsa[1];

    int fd = open("/dev/button", O_RDWR);
    if(fd < 0)
    {
        /* The path is fixed; argv[1] may be NULL and must not be printed. */
        printf("open /dev/button fail\n");
        return -1;
    }

    fdsa[0].fd      = fd;
    fdsa[0].events  = POLLIN;    /* wait for input to become available */

    while(1)
    {
        ret = poll(&fdsa[0], 1, 5000);
        if(ret < 0)
        {
            /* poll() failed (e.g. interrupted by a signal): stop instead of reading. */
            perror("poll");
            break;
        }

        if(ret == 0)
        {
            /* Nothing happened within 5000 ms. */
            printf("time out\n");
            continue;
        }

        if(fdsa[0].revents & POLLIN)
        {
            if(read(fd, buf, 1) == 1)
            {
                printf("buf = %d\n", buf[0]);
            }
            else
            {
                perror("read");
            }
        }
        else
        {
            /* POLLERR/POLLHUP/POLLNVAL: not readable, so do not spin on it. */
            printf("unexpected revents 0x%x\n", fdsa[0].revents);
            break;
        }
    }

    close(fd);
    return 0;
}

这里先说明一下程序的执行流程。

应用程序先使用poll来查询是否有等待的事件发生，如果有发生，则poll函数立刻返回正数（出错时，返回-1），返回值为0时表示等待超时。

接下来我们参照代码分析一下poll函数的调用流程和相关参数。

/* One entry of the array passed to poll(): a descriptor plus requested/returned events. */
struct pollfd {
	int fd;                /* file descriptor opened by the calling process */
	short events;          /* events the caller asks for */
	short revents;        /* events reported back by the kernel */
};


/* Time value split into whole seconds and a nanosecond remainder. */
struct timespec {
	__kernel_time_t	tv_sec;			/* seconds */
	long		tv_nsec;		/* nanoseconds */
};


/* 
 * structures and helpers for f_op->poll implementations
 *
 * poll_queue_proc is the callback type stored in poll_table::_qproc and
 * invoked by poll_wait(); for the poll() system call it is __pollwait.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);




/*
 * Do not touch the structure directly, use the access functions
 * poll_does_not_wait() and poll_requested_events() instead.
 */
typedef struct poll_table_struct {
	poll_queue_proc _qproc;	/* queueing callback; NULL means "do not register a waiter" */
	unsigned long _key;	/* event mask of interest, set in do_pollfd() before calling f_op->poll */
} poll_table;


/*
 * Header of one chunk of pollfd records copied in from user space.
 * Chunks are chained through ->next; the first chunk lives in the on-stack
 * buffer of do_sys_poll(), later ones are kmalloc()ed.
 */
struct poll_list {
	struct poll_list *next;
	int len;                     /* number of pollfd records in this chunk */
	struct pollfd entries[0];    /* zero-length trailing array: the records follow the header in the same buffer */
};




/*
 * Structures and helpers for select/poll syscall
 */
struct poll_wqueues {
	poll_table pt;				/* handed to f_op->poll; __pollwait recovers this struct via container_of */
	struct poll_table_page *table;        /* chain of extra pages of wait entries, NULL until the inline ones run out */
	struct task_struct *polling_task;	/* task that called poll(); woken by __pollwake */
	int triggered;				/* set to 1 by __pollwake, cleared in poll_schedule_timeout */
	int error;				/* -ENOMEM if poll_get_entry could not allocate a page */
	int inline_index;			/* number of inline_entries already handed out */
	struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];	/* wait entries embedded in this struct, used first */
};

其中请求事件和返回的事件类似可以是下面的几种

POLLIN	有数据可读
POLLRDNORM	普通数据可读
POLLRDBAND	优先级带数据可读
POLLPRI	高优先级数据可读
POLLOUT	普通数据可写
POLLWRNORM	普通数据可写
POLLWRBAND	优先级带数据可写
POLLERR	发生错误
POLLHUP	发生挂起
POLLNVAL	描述字不是一个打开的文件

注意：后三个只能作为描述字的返回结果存放在revents中，而不能作为测试条件用于events中。

sys_poll函数

/*
 * Definition of the sys_poll system call (the SYSCALL_DEFINE wrapper was
 * analysed in the previous article).
 *
 * ufds/nfds describe the user-space pollfd array; timeout_msecs is the
 * timeout in milliseconds, negative meaning "no timeout".
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec end_time, *to = NULL;
	int ret;

	if (timeout_msecs >= 0) {        /* split the millisecond timeout into seconds + nanoseconds for the timespec */
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

    /* Do the real work; "to" stays NULL when no timeout was requested. */
	ret = do_sys_poll(ufds, nfds, to);

    /*
     * do_sys_poll() returns -EINTR when a signal arrived during the wait.
     * Record the arguments in the restart block and convert the result to
     * -ERESTART_RESTARTBLOCK so the call is restarted via do_restart_poll.
     */
	if (ret == -EINTR) {
		struct restart_block *restart_block;

		restart_block = &current_thread_info()->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

do_sys_poll

/* Bytes of kernel stack used for the first chunk of select/poll arguments. */
#define FRONTEND_STACK_ALLOC	256
#define SELECT_STACK_ALLOC	FRONTEND_STACK_ALLOC
#define POLL_STACK_ALLOC	FRONTEND_STACK_ALLOC

# define RLIMIT_NOFILE		7	/* max number of open files */


/*
 * Number of struct pollfd records that fit in the on-stack buffer after the
 * struct poll_list header. With an 8-byte pollfd this is (256 - 16) / 8 = 30
 * when the header is 16 bytes (presumably 64-bit builds; a 32-bit build with
 * an 8-byte header would give 31 -- confirm for the target).
 */
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
			sizeof(struct pollfd))

/* Number of struct pollfd records that fit in one page after the poll_list header. */
#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))




/*
 * Core of the poll() system call.
 *
 * Copies the user's pollfd array into a chain of struct poll_list chunks
 * (the first chunk on the stack, further ones kmalloc()ed), runs do_poll()
 * over the chain, then writes every revents field back to user space.
 *
 * Returns the value of do_poll() (number of ready descriptors, 0 on
 * timeout, or a negative errno), -EINVAL if nfds exceeds RLIMIT_NOFILE,
 * -EFAULT if user memory cannot be accessed, or -ENOMEM if a chunk cannot
 * be allocated.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec *end_time)
{
	struct poll_wqueues table;        /* wait-queue bookkeeping, allocated on the stack */
 	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	/* On-stack storage for the first struct poll_list chunk. */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];        /* POLL_STACK_ALLOC (256) bytes holding the first pollfd records from user space */
	struct poll_list *const head = (struct poll_list *)stack_pps;
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;        /* pollfd records still to be copied in */

	if (nfds > rlimit(RLIMIT_NOFILE))    /* more descriptors than the process's open-file limit allows */
		return -EINVAL;

	/* The first chunk holds at most N_STACK_PPS records (whatever fits in stack_pps). */
	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;    /* number of records stored in this chunk */
		if (!len)
			break;

		/* Copy the next walk->len records from the user array into this chunk. */
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;        /* records that did not fit are handled by further chunks below */
		if (!todo)
			break;    /* everything copied */

		/*
		 * More records remain: allocate another chunk sized for the
		 * smaller of "what is left" and "what fits in one page"
		 * (POLLFD_PER_PAGE). If one page is still not enough the loop
		 * runs again and appends yet another chunk. The result is a
		 * singly linked list headed by the on-stack chunk, linked
		 * through ->next, that together holds every pollfd passed in
		 * by the caller.
		 */
		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		/* Append the new chunk to the tail of the list. */
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	/* Initialise the wait-queue bookkeeping (binds __pollwait). */
	poll_initwait(&table);
	/* Call each file's poll method and sleep until an event, signal or timeout. */
	fdcount = do_poll(nfds, head, &table, end_time);
	/* Remove all wait entries again and free any pages allocated for them. */
	poll_freewait(&table);

	/* Write each descriptor's revents back into the user-space array. */
	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
  	}

	err = fdcount;
out_fds:    /* free every kmalloc()ed chunk; the head chunk is on the stack and is skipped */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;
}

poll_initwait

/*
 * Initialise a poll_wqueues: bind __pollwait as the queueing callback,
 * remember the calling task and reset all bookkeeping fields.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);         /* set up the embedded poll_table */
	pwq->polling_task = current;
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}


/* Store the queueing callback in the poll_table and enable all events in its key. */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->_qproc = qproc;        /* for poll() this is __pollwait */
	pt->_key   = ~0UL; /* all events enabled */
}

/* Release every wait entry handed out for this poll_wqueues and free the pages allocated for them. */
void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);        /* inline entries live inside *pwq, so only the entry itself is released */
	while (p) {        /* walk the chain of dynamically allocated pages */
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		entry = p->entry;	/* one past the last entry used in this page */
		do {
			entry--;
			free_poll_entry(entry);    /* release each used entry, last to first */
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);    /* give the page back */
	}
}

do_poll


/*
 * Poll every descriptor in the list until at least one reports an event,
 * a signal is pending, an error was recorded, or the timeout expires.
 *
 * Returns the number of descriptors with events, 0 on timeout, wait->error
 * if one was recorded, or -EINTR when a signal is pending.
 */
static int do_poll(unsigned int nfds,  struct poll_list *list,
		   struct poll_wqueues *wait, struct timespec *end_time)
{
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	unsigned long slack = 0;
	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_end = 0;

	/* Optimise the no-wait case (the caller passed a timeout of 0) */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt->_qproc = NULL;
		timed_out = 1;            /* treat as already timed out: scan once, never sleep */
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	for (;;) {
		struct poll_list *walk;
		bool can_busy_loop = false;


		/* Visit every chunk of the list and every pollfd inside it. */
		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;          /* a chunk may hold several struct pollfd records */
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {       /* all records of the current chunk */
				/*
				 * Fish for events. If we found one, record it
				 * and kill poll_table->_qproc, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				/* A non-zero result means this descriptor has an event (possibly an error event). */
				if (do_pollfd(pfd, pt, &can_busy_loop,
					      busy_flag)) {
					count++;            /* at least one event found */
					pt->_qproc = NULL;
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
				}
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table->_qproc to them on the next loop iteration.
		 */
		pt->_qproc = NULL;
		if (!count) {
			count = wait->error;
			if (signal_pending(current))    /* a signal is waiting to be handled */
				count = -EINTR;            /* leave the loop and report the interruption */
		}
		if (count || timed_out)            /* event, error, signal or timeout: stop */
			break;

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {    /* busy-poll instead of sleeping while allowed */
			if (!busy_end) {
				busy_end = busy_loop_end_time();
				continue;
			}
			if (!busy_loop_timeout(busy_end))    /* busy-loop window not over yet: scan again */
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);        /* timeout as ktime_t */
			to = &expire;
		}

		/* Sleep interruptibly until woken or until the timeout; 0 means the timeout expired. */
		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}




/*
 * Put the current task to sleep in the given state until it is woken or
 * *expires is reached, unless a wake-up has already been recorded in
 * pwq->triggered. Returns the result of schedule_hrtimeout_range(), or
 * -EINTR when no sleep was needed; do_poll() treats 0 as "timed out".
 */
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);        /* do_poll() passes TASK_INTERRUPTIBLE, so signals can end the sleep */
	if (!pwq->triggered)
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);    /* sleep until the absolute expiry time */
	__set_current_state(TASK_RUNNING);    /* execution resumes here after the sleep: mark the task running again */

	/*
	 * Prepare for the next iteration.
	 *
	 * The following set_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
	set_mb(pwq->triggered, 0);

	return rc;
}


/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.    busy_flag requests busy polling (spin
 * instead of sleeping).
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
				     bool *can_busy_poll,
				     unsigned int busy_flag)
{
	unsigned int mask;
	int fd;

	mask = 0;
	fd = pollfd->fd;        /* descriptor to examine; negative ones are skipped */
	if (fd >= 0) {
		struct fd f = fdget(fd);        /* look up the open file behind the descriptor */
		mask = POLLNVAL;        /* assume an invalid descriptor until the file is found */
		if (f.file) {            /* the descriptor refers to an open file */
			mask = DEFAULT_POLLMASK;
			if (f.file->f_op->poll) {        /* the driver provides a poll method */
				pwait->_key = pollfd->events|POLLERR|POLLHUP;    /* error and hang-up are always of interest */
				pwait->_key |= busy_flag;
				mask = f.file->f_op->poll(f.file, pwait);    /* call the driver's poll method */
				if (mask & busy_flag)
					*can_busy_poll = true;
			}
			/* Mask out unneeded events. */
			mask &= pollfd->events | POLLERR | POLLHUP;
			fdput(f);
		}
	}
	pollfd->revents = mask;        /* report the resulting events back */

	return mask;
}

我们看一下我们的驱动程序是做了什么

/*
 * poll(): register the caller on the button wait queue and report readiness.
 *
 * poll_wait() only adds the caller to button_wait_q; it does not sleep here.
 * Returns POLLIN | POLLRDNORM when a key event is pending, otherwise 0.
 */
static unsigned int button_drv_poll(struct file *file, struct poll_table_struct * wait)
{
    poll_wait(file, &button_wait_q, wait);

    /* A pending key event makes the device readable. */
    return ev_press ? (POLLIN | POLLRDNORM) : 0;
}

我们这里很简单，即调用poll_wait函数，如果有可读按键则返回可读标记，否则返回0.

看一下poll_wait函数

/*
 * Called from a driver's poll method. Invokes the queueing callback stored
 * in the poll_table, but only when a table, a callback and a wait-queue
 * head are all present; otherwise it does nothing.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (p && p->_qproc && wait_address)        /* table, callback and wait-queue head must all be non-NULL */
		p->_qproc(filp, wait_address, p);      /* for poll() this calls __pollwait */
}

其中p->_qproc是我们在初始化table的时候绑定的一个函数指针

这里我们再回顾前面的初始化结果

poll_initwait(&table);

其中poll_table中的_key是在do_pollfd（）函数中，根据应用程序请求的事件（pollfd->events，再加上POLLERR和POLLHUP）设置的。

current是当前运行任务的任务结构体（是全局变量）

这里我们主要看一下poll_wait做了什么（对比我上面的excel）

/*
 * Add a new entry: hook the polling task onto the driver's wait queue.
 * This only registers a waiter; it does not put the task to sleep.
 *
 * Each call obtains a fresh poll_table_entry from poll_get_entry(), fills
 * it in, installs pollwake as its wake-up function and adds it to the wait
 * queue supplied by the driver. When the driver later wakes that queue,
 * pollwake runs and wakes the task recorded in pwq->polling_task.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);    /* recover the poll_wqueues that embeds this poll_table */
	struct poll_table_entry *entry = poll_get_entry(pwq);
	if (!entry)
		return;
	entry->filp = get_file(filp);
	entry->wait_address = wait_address;    /* the wait-queue head owned by the driver */
	entry->key = p->_key;                  /* events this waiter is interested in */
	init_waitqueue_func_entry(&entry->wait, pollwake);    /* pollwake is called on wake-up */
	entry->wait.private = pwq;            /* lets __pollwake find the poll_wqueues again */
	add_wait_queue(wait_address, &entry->wait);    /* enqueue the waiter on the driver's queue */
}


/*
 * Hand out one unused poll_table_entry.
 *
 * The N_INLINE_POLL_ENTRIES entries embedded in struct poll_wqueues are
 * used first; each allocation bumps inline_index. Once they are exhausted
 * entries come from dynamically allocated pages. A new page is obtained
 * when
 *   1. no page has been allocated yet (table == NULL), or
 *   2. the current page is full (POLL_TABLE_FULL(table)).
 * Otherwise the next free slot of the current page is returned.
 *
 * Returns NULL and records -ENOMEM in p->error if no page can be allocated.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
	struct poll_table_page *table = p->table;

	if (p->inline_index < N_INLINE_POLL_ENTRIES)          /* inline entries still available */
		return p->inline_entries + p->inline_index++;    /* return the next one and count it as used */

	if (!table || POLL_TABLE_FULL(table)) {    /* no page yet, or the current page is full */
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);    /* one free page */
		if (!new_table) {
			p->error = -ENOMEM;
			return NULL;
		}

		/* Link the new page in front of the existing chain and make it current. */
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

	return table->entry++;
}

其中唤醒函数和我们等待队列方式的唤醒函数，里面调用的其实是相同的函数。（主要是poll的原理就是在等待队列的基础上加上了自动唤醒时间而已，它们的等待、唤醒都是利用的等待队列）

/*
 * Wake-up callback installed by __pollwait. If the waker supplied a key
 * that shares no event bits with the entry's key, the wake-up is ignored
 * (returns 0); otherwise it is forwarded to __pollwake.
 */
static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry;

	entry = container_of(wait, struct poll_table_entry, wait);
	if (key && !((unsigned long)key & entry->key))
		return 0;
	return __pollwake(wait, mode, sync, key);
}

/*
 * Mark the poll_wqueues as triggered and wake the task recorded in
 * pwq->polling_task through default_wake_function().
 */
static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;	/* set by __pollwait */
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with set_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);    /* the actual wake-up is done by the default wake function */
}

下图是对上面初始化的一个总结

我们先对poll对数据结构的使用先做一个总结

1.poll中对用户空间传过来的struct pollfd，先使用局部变量在栈上保留的256字节存放，如果还不够用，再动态申请，并用链表方式连接。因为动态申请效率很差，而栈又不能使用太大（会溢出），所以使用了上面这种折中的办法：对要查询的文件比较少的情况，用局部变量在栈中分配；对要查询的文件很多的情况，采用动态申请的方式。（这里每次分配的都会链到链表的最后节点）

申请

释放

2.对等待队列项的分配，也是采用类似的方式。定义的时候首先只是一次定义固定数量的等待项（即struct poll_wqueues中内嵌的inline_entries数组，共N_INLINE_POLL_ENTRIES个，随poll_wqueues一起放在栈上，并不是一页）。每次驱动程序调用poll_wait函数，会对使用数量进行统计，如果小于固定数量，则继续在固定数量的等待项中分配使用。如果不够再采用动态方式，分配一页使用（还不够则再继续动态分配）。（这里和前面唯一的区别是，这次动态申请的页是采用链表的前插方式链入到链表头的，因为下一次分配使用链表第一项效率更高）

申请

poll_initwait(&table);
    init_poll_funcptr(&pwq->pt, __pollwait);    /* 这里只是绑定 */


真正的调用在驱动的poll函数中调用poll_wait中
/*
 * Called from a driver's poll method. Invokes the queueing callback stored
 * in the poll_table (bound to __pollwait by poll_initwait), but only when
 * a table, a callback and a wait-queue head are all present.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (p && p->_qproc && wait_address)
		p->_qproc(filp, wait_address, p);
}

释放

总结系统调用poll的流程

用户空间:
    poll();
内核空间:
    sys_poll();
        poll_select_set_timeout();    /* 超时时间换算 */
            do_sys_poll(ufds, nfds, to);
                copy_from_user();    /* 把用户空间的多struct pollfd拷贝到内核空间(静态内存不够,要动态分配) */
                poll_initwait(&table);     /* 初始化等待队列 */ 
                    init_poll_funcptr(&pwq->pt, __pollwait);    /* 绑定__pollwait函数 */
                do_poll(nfds, head, &table, end_time);
                    for(;;)
                    {
                        for (walk = list; walk != NULL; walk = walk->next) {    /*256字节存储不下,其它动态的都是采用链表形式在内核空间保存的 */
                            for (; pfd != pfd_end; pfd++) {    /* 执行每个fd,在驱动中的poll函数 */
                                do_pollfd(pfd, pt, &can_busy_loop, busy_flag);
   /* 执行驱动程序的poll函数 */    
                                    struct fd f = fdget(fd);    /* 通过文件描述符，得到文件信息 */
                                    f.file->f_op->poll(f.file, pwait);    /* 通过文件信息,找到驱动文件操作接口,进而执行驱动注册的poll函数 */
                            }
                        }
                        
                        /* 退出条件:有事件发生(或出错)、有信号到来、超时 */
                        if (count || timed_out)
			                break;

                        if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			                timed_out = 1;
                    }
	            poll_freewait(&table);    /* 从等待队列删除等待项(如果有动态申请页,也一并释放掉) */
                __put_user();        /* 把内核空间对多个文件执行完的返回值,拷贝到用户空间数组struct pollfd(如果动态分配了内存,在这里释放) */
        do_restart_poll();            /* 如果是被信号中断,则重新开始poll */






poll_wait
    __pollwait
        struct poll_table_entry *entry = poll_get_entry(pwq);    /* 得到一个新的entry */
        初始化entry
        init_waitqueue_func_entry(&entry->wait, pollwake);        /* 添加唤醒函数 */
        add_wait_queue(wait_address, &entry->wait);                /* 加入到内核进程的等待队列 */

上面的参考韦东山老师的驱动代码和应用程序的整体执行流程是应用程序循环方式

我这里只执行一次，看一下


/*
 * Single-shot variant of the test program: call poll() once with a
 * 5000 ms timeout, print its return value and, if the device is readable,
 * read and print one key value.
 */
int main(int argc,char *argv[])
{
    char buf[2];
    int ret;
    struct pollfd fdsa[1];

    int fd = open("/dev/button", O_RDWR);
    if(fd < 0)
    {
        /* The path is fixed; argv[1] may be NULL and must not be printed. */
        printf("open /dev/button fail\n");
        return -1;
    }

    fdsa[0].fd      = fd;
    fdsa[0].events  = POLLIN;

    ret = poll(&fdsa[0], 1, 5000);
    printf("ret = %d\n", ret);
    if(ret < 0)
    {
        /* poll() itself failed; there is nothing to read. */
        perror("poll");
    }
    else if(ret == 0)
    {
        printf("time out\n");
    }
    else if(fdsa[0].revents & POLLIN)
    {
        if(read(fd, buf, 1) == 1)
        {
            printf("buf = %d\n", buf[0]);
        }
        else
        {
            perror("read");
        }
    }

    close(fd);
    return 0;
}

先分析一下流程

应用程序

poll -> 调用到内核的 poll函数 -> 内核中先调用了 poll_wait 申请一个等待队列项并加入了等待队列 -> 判断是否有按键事件发生 ==》

有按键事件发生

ev_press == 1 ，则返回非零值 POLLIN，在sys_poll的do_poll中判断返回值非0，则count++，之后因为if (count || timed_out) break;中的count != 0 ，则退出循环，并从等待队列中删除该项，返回count到应用程序

没按键事件发生

ev_press == 0 ，返回 0，sys_poll中进行睡眠，在睡眠期间可能出现两种情况，

1.直到超时也没按键事件发生

这种情况比较简单，就是在5000ms超时后，定时器到期唤醒该进程，之后把它置为运行状态，poll_schedule_timeout返回0，于是timed_out = 1；之后再循环一次，仍然没有事件，运行到if (count || timed_out) break;退出，给应用层的返回值count还是0（返回前也同样从等待队列中删除该项）。

2.睡眠期间，有按键事件发生（信号之类也算）

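在睡眠期间，发生按键，中断函数先标记有按键事件发生ev_press = 1，因为我们需要立刻返回，所以在中断函数中增加了唤醒函数，进程会立刻被唤醒。这次的唤醒点和超时的唤醒点一样，都是在do_poll中的poll_schedule_timeout里面唤醒（也是在它里面睡眠的），但因为不是超时时间到而唤醒的，poll_schedule_timeout返回非0，timed_out不会被置1；do_poll再循环一次调用驱动的poll函数，此时ev_press为1，返回POLLIN，count变为1，于是poll向应用层返回1，返回前从等待队列中删除该等待项。之后通过read读，就可以读到按键了。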

细心的读者应该发现了，poll其实和可被打断的等待事件睡眠超时限制这个宏的效果是一样的（他们的唤醒函数都很相似）。

我在上一节的中断和等待队列分析的代码上做了验证，效果一致。

poll要实现

应用层函数   时间是ms
int poll(struct pollfd *fds, nfds_t nfds, int timeout);

驱动层实现
unsigned int poll(struct file *, struct poll_table_struct *);

poll对应用层更灵活，时间可变。

当poll的文件太多，应用层和内核层，要使用很多动态分配内存等，效率不高。

/* 时间是调度jiffies */
wait_event_interruptible_timeout(wq, condition, timeout);

等待事件睡眠超时限制可以添加在驱动中合适的地方。

上面可能没说清楚等待队列和进程的关系，这里统一说一下。

等待队列头是在一个设备的驱动程序中定义的，具体来说是属于某个设备驱动的。该设备可被多个应用进程打开、使用。

而我们的驱动如果是阻塞访问的，则必须要某个I/O事件到达，应用进程才能正确访问。而在I/O事件没到来之前，如果应用进程访问该设备。则驱动程序中检测到该I/O事件确实还没准备好，则该设备驱动会先把该进程加入到设备所在的等待队列中，之后主动让该进程睡眠。在I/O准备好后，该I/O事件主动唤醒该设备驱动等待队列上的进程。

第一次对n个文件进行poll（）的时候，若任何一个文件满足要求， poll（）就直接返回；第2次再进行poll（）的时候，若没有文件满足读写要求， poll（）的进程阻塞且睡眠。由于调用poll（）的时候，每个驱动的poll（）接口都会被调用到，实际上执行poll（）的进程被挂到了每个驱动的等待队列上，可以被任何一个驱动唤醒。（我们上面只打开了一个设备文件，所以只能被这一个驱动唤醒）

从零开始之驱动开发、linux驱动（十六、poll机制）

猜你喜欢