socket源码分析之accept()

基于 kernel 3.10

之前有分析过TCP accept()的实现，但是太过于沉浸于代码本身，没有结合应用去分析accept()函数。

我们要解决如下几个问题：
1：accept()函数的实现，包括从全队列中取出sock。
2：accept()函数如何被唤醒
3：accept()函数如何解决惊群
4：多个进程accept()，优先唤醒哪个进程

accept()函数的实现

accept()函数实现逻辑相对比较简单

如果全队列（全连接队列，即accept队列）中没有已完成三次握手的TCP连接：阻塞模式下，进程睡眠等待；非阻塞模式下，直接返回-EAGAIN。

所以总结来说需要考虑这么几种情况：
1、当前全队列中有socket，则accept()直接返回对应的fd。
2、如果当前全队列中没有socket，则如果当前socket是阻塞的，直接睡眠。
3、如果当前全队列中没有socket，如果非阻塞，就直接返回-EAGAIN。
4、如果是阻塞的listenfd，需要将当前进程挂在listenfd对应socket的等待队列里面，当前进程让出cpu，并且等待唤醒。

accept实现的调用链

sys_accept->sys_accept4->inet_accept->inet_csk_accept
其中 inet_csk_accept是核心处理逻辑，其处理了上述1、3两种情况。


/*
 * This will accept the next outstanding connection.
 *
 * @sk:    listening socket; must be in TCP_LISTEN state.
 * @flags: file flags; only O_NONBLOCK is examined here.
 * @err:   written with a negative errno on failure; not touched on success.
 *
 * Returns the sock taken from the head of the accept queue, or NULL on
 * failure with *err set to:
 *   -EINVAL  @sk is not in TCP_LISTEN state,
 *   -EAGAIN  the accept queue is empty and the receive timeout is zero,
 *   or the error returned by inet_csk_wait_for_connect().
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct sock *newsk;
	struct request_sock *req;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */

	/* Only a socket in TCP_LISTEN state may be accept()ed on. */
	error = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;

	/* Find already established connection */

	/* If the accept queue already holds a connection, this branch is
	 * skipped and the entry is dequeued right below.
	 */
	if (reqsk_queue_empty(queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non blocking socket don't sleep */
		/* A zero timeout means "do not wait": fail with -EAGAIN. */
		error = -EAGAIN;
		if (!timeo)
			goto out_err;

		/* Blocking case: sleep in inet_csk_wait_for_connect(). */
		error = inet_csk_wait_for_connect(sk, timeo);

		if (error)
			goto out_err;
	}

	/* Reaching here means the accept queue is non-empty: dequeue. */
	req = reqsk_queue_remove(queue);
	newsk = req->sk;

	sk_acceptq_removed(sk);
	if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
		spin_lock_bh(&queue->fastopenq->lock);
		if (tcp_rsk(req)->listener) {
			/* We are still waiting for the final ACK from 3WHS
			 * so can't free req now. Instead, we set req->sk to
			 * NULL to signify that the child socket is taken
			 * so reqsk_fastopen_remove() will free the req
			 * when 3WHS finishes (or is aborted).
			 */
			req->sk = NULL;
			req = NULL;
		}
		spin_unlock_bh(&queue->fastopenq->lock);
	}
out:
	/* Common exit: req is NULL on the error paths and in the fastopen
	 * case above, so it is only freed when this function still owns it.
	 */
	release_sock(sk);
	if (req)
		__reqsk_free(req);
	return newsk;
out_err:
	/* Error exit: report the errno through *err and return NULL. */
	newsk = NULL;
	req = NULL;
	*err = error;
	goto out;
}

inet_csk_wait_for_connect函数处理了2、4两种情况。


/*
 * Wait for an incoming connection, avoid race conditions. This must be called
 * with the socket locked.
 *
 * @sk:    listening socket whose accept queue is waited on.
 * @timeo: maximum time to sleep, as passed to schedule_timeout().
 *
 * The socket lock is released around the sleep and re-taken before the
 * queue is re-checked, so it is held again when this function returns.
 *
 * Returns 0 once the accept queue is non-empty, -EINVAL if @sk is no longer
 * in TCP_LISTEN state, sock_intr_errno(@timeo) if a signal is pending, or
 * -EAGAIN once the timeout has run out.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;

	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	 */
	for (;;) {
		/* Key step: queue "wait" as an exclusive waiter on this
		 * socket's wait queue before the queue is tested.
		 */
		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
					  TASK_INTERRUPTIBLE);
		release_sock(sk);
		/* icsk_accept_queue is the accept queue. */
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo); /* sleep; timeo is replaced by the value returned */
		lock_sock(sk);
		err = 0;

		/* Re-check under the lock: a non-empty queue means there is
		 * a connection to accept, so leave the loop with err == 0.
		 */
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
			break; /* the successful exit path */
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)
			break;
		err = sock_intr_errno(timeo);

		/* Leave the loop on a pending signal or once the timeout is
		 * used up; otherwise go round again and sleep.
		 */
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	finish_wait(sk_sleep(sk), &wait);
	return err;
}

首先，为什么循环？这是历史原因，考虑有这么一种情况，就是睡眠时间没有睡满，那么 schedule_timeout返回的值大于0，那么什么情况下，睡眠没有睡满呢？一种情况就是进程收到信号，另一种就是listenfd对应的socket的全队列有数据了，不考虑信号的情况，假设全队列有数据了，历史上，Linux的accept是惊群的，全队列有值后，所有进程都唤醒，那么必然存在某些进程读取到了全队列socket，而某些没有读取到，这些没有读取到的进程，肯定是睡眠没睡满，所以需要接着睡。
但是本文分析的Linux内核版本是3.10，全队列有数据时，只会唤醒一个进程，故而，此for循环在通常情况下只会跑一次。

prepare_to_wait_exclusive函数很重要，把当前上下文加到listenfd对应的socket等待队列里面，如果是多进程，那么listenfd对应的socket等待队列里面会有多个进程的上下文。

多进程 accept 如何处理惊群

多进程accept，不考虑reuseport，那么多进程accept只会出现在父子进程同时accept的情况，那么上文也说过，prepare_to_wait_exclusive函数会把当前进程上下文加入到listenfd等待队列里面，所以父子进程的上下文都会加入到socket的等待队列里面。核心问题就是怎么唤醒，我们可以想到，所谓的惊群，就是把等待队列里面的所有进程都唤醒。
我们此时来看看如何唤醒。

/*
 * Excerpt of tcp_v4_do_rcv(): only the TCP_LISTEN branch is shown. The
 * "......" lines mark code left out of this listing, including the
 * "discard" and "reset" labels that the gotos below jump to.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

    ......
	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		/* No socket returned for this segment: drop it. */
		if (!nsk)
			goto discard;

		/* A socket other than the listener came back -- presumably
		 * the child created for this connection; TODO confirm in
		 * tcp_v4_hnd_req().
		 */
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			/* When the client's final ACK of the three-way
			 * handshake arrives, processing goes through
			 * tcp_child_process() here.
			 */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	}
    ......
}

tcp_child_process:

/*
 * Feed one incoming segment to a child socket of a listener.
 *
 * @parent: the listening socket.
 * @child:  the child socket the segment belongs to.
 * @skb:    the incoming segment.
 *
 * If the child is currently owned by a user context the segment is put on
 * the child's backlog; otherwise it is run through tcp_rcv_state_process()
 * and, when that moves the child out of TCP_SYN_RECV, the parent's
 * sk_data_ready callback is invoked to wake a process sleeping in accept().
 *
 * Before returning, the child is unlocked with bh_unlock_sock() and one
 * reference is dropped with sock_put().
 *
 * Returns the result of tcp_rcv_state_process(), or 0 when the segment was
 * only backlogged.
 */
int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	int old_state = child->sk_state;
	int rc = 0;

	if (sock_owned_by_user(child)) {
		/* Alas, it is possible again, because we do lookup
		 * in main socket hash table and lock on listening
		 * socket does not protect us more.
		 */
		__sk_add_backlog(child, skb);
	} else {
		rc = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
					   skb->len);

		/* Wakeup parent, send SIGIO */
		if (old_state == TCP_SYN_RECV && child->sk_state != old_state) {
			/* Wakes the process blocked in accept(); per the
			 * article this callback is sock_def_readable().
			 */
			parent->sk_data_ready(parent, 0);
		}
	}

	bh_unlock_sock(child);
	sock_put(child);
	return rc;
}

parent->sk_data_ready:

/*
 * Data-ready callback; per the article this is what parent->sk_data_ready
 * points to for the listening socket.
 *
 * @sk:  socket whose waiters are to be notified.
 * @len: not used in this function.
 *
 * Wakes sleepers on the socket's wait queue with a POLLIN-style key and then
 * calls sk_wake_async(); the whole body runs inside an RCU read-side
 * critical section because sk->sk_wq is read with rcu_dereference().
 */
static void sock_def_readable(struct sock *sk, int len)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	/* accept() queued itself with prepare_to_wait_exclusive(), so the
	 * sleeper is woken here via wake_up_interruptible_sync_poll().
	 * NOTE(review): assumes sk_sleep(sk) resolves to &wq->wait -- confirm.
	 */
	if (wq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

/*
 * Wake waiters on wait queue head @wq_head via __wake_up_sync_key(), with
 * mode TASK_INTERRUPTIBLE, nr_exclusive fixed at 1, and the poll mask
 * @poll_mask passed through as the wake-up key.
 */
#define wake_up_interruptible_sync_poll(wq_head, poll_mask)		\
	__wake_up_sync_key((wq_head), TASK_INTERRUPTIBLE, 1,		\
			   (void *) (poll_mask))

注意，__wake_up_sync_key的第三个参数是1

/*
 * Wake up waiters on a wait queue, normally with the WF_SYNC wake flag.
 *
 * @q:            wait queue head; a NULL pointer makes this a no-op.
 * @mode:         task-state mask handed on to __wake_up_common().
 * @nr_exclusive: number of exclusive waiters to wake; when it is 0 the
 *                WF_SYNC flag is not used.
 * @key:          opaque value handed on to each waiter's wake function.
 *
 * The queue is walked with q->lock held and interrupts saved/disabled.
 */
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
			int nr_exclusive, void *key)
{
	unsigned long irq_state;
	int wake_flags;

	if (unlikely(!q))
		return;

	wake_flags = unlikely(!nr_exclusive) ? 0 : WF_SYNC;

	spin_lock_irqsave(&q->lock, irq_state);
	/* When reached through wake_up_interruptible_sync_poll(), mode is
	 * TASK_INTERRUPTIBLE and nr_exclusive is 1.
	 */
	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
	spin_unlock_irqrestore(&q->lock, irq_state);
}

__wake_up_common:

/*
 * Core wake-up loop: walk the wait queue in list order and call each
 * entry's wake function.
 *
 * @q:            wait queue head whose task_list is walked.
 * @mode:         passed to every wake function.
 * @nr_exclusive: how many successfully woken exclusive waiters end the walk.
 * @wake_flags:   passed to every wake function.
 * @key:          passed to every wake function.
 *
 * The walk stops after the nr_exclusive-th entry that both reports a
 * successful wake-up (non-zero return from its wake function) and carries
 * WQ_FLAG_EXCLUSIVE. Entries that are not exclusive, or whose wake function
 * returns 0, never end the walk.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
			int nr_exclusive, int wake_flags, void *key)
{
	wait_queue_t *waiter, *tmp;

	list_for_each_entry_safe(waiter, tmp, &q->task_list, task_list) {
		/* Sample the flags before the wake function is called. */
		unsigned flags = waiter->flags;

		if (!waiter->func(waiter, mode, wake_flags, key))
			continue;
		if (!(flags & WQ_FLAG_EXCLUSIVE))
			continue;

		/* accept() waiters are queued with prepare_to_wait_exclusive()
		 * and nr_exclusive is 1 on that path, so the walk ends at the
		 * first such waiter that is actually woken.
		 */
		if (--nr_exclusive == 0)
			break;
	}
}

所以多个进程accept的时候，内核只会唤醒1个等待的进程，且唤醒的逻辑是FIFO。

Mrpre 博客专家

发布了122 篇原创文章 · 获赞 117 · 访问量 50万+

他的留言板关注