Linux kernel protocol stack TCP server receives SYN request segment Ⅰ

Table of Contents

1 Overview of tcp server receiving SYN

2 tcp server receives the SYN process

2.1 The data packet input interface tcp_v4_rcv of the tcp layer

2.2 tcp_v4_do_rcv()

2.3 tcp_rcv_state_process()

2.4 tcp processing SYN request interface tcp_v4_conn_request() (core)

2.4.1 SYN request queue (semi-connection) is full judgment inet_csk_reqsk_queue_is_full

2.4.2 Accept connection queue (full connection) is full judgment sk_acceptq_is_full

2.5 Allocation and initialization of the connection request block 

2.5.1 Connection request block allocation reqsk_alloc / inet_reqsk_alloc

2.5.2 Initialization of the connection request block

2.6 Add the connection request block to the SYN request queue inet_csk_reqsk_queue_hash_add


1 Overview of tcp server receiving SYN

  1. Find the local socket from tcp_hashinfo according to the 5-tuple information
  2. Check whether the listening socket's request queues are full, i.e. both the semi-connection (SYN) queue and the full-connection (accept) queue
  3. Send SYN+ACK message to the client
  4. Add the newly created request socket to the SYN queue and start the SYN+ACK timeout retransmission timer (initial value is 3s)

Note: After receiving the SYN packet, the Linux kernel protocol stack does not move the listening socket to the SYN_RECV state. Instead, when the ACK message from the client arrives, the kernel creates a new sock, sets its state to TCP_SYN_RECV, and then calls tcp_rcv_state_process to migrate the state to TCP_ESTABLISHED.

2 tcp server receives the SYN process

tcp_v4_rcv
--tcp_v4_do_rcv
    --tcp_rcv_state_process
	    --tcp_v4_conn_request
		    --inet_csk_reqsk_queue_is_full
		    --sk_acceptq_is_full
		    --inet_reqsk_alloc
		    --tcp_v4_send_synack
		    --inet_csk_reqsk_queue_hash_add

2.1 The data packet input interface tcp_v4_rcv of the tcp layer

  1. Verify the legitimacy of tcp packets
  2. Find the local socket from tcp_hashinfo based on the 5-tuple information
  3. Call tcp_v4_do_rcv() to process the packet
/*
 * tcp_v4_rcv - TCP-layer input entry point for a received segment
 * (abridged excerpt).
 *
 * skb: the received segment
 *
 * Looks up the socket that should handle skb, then either processes the
 * segment via tcp_v4_do_rcv(), leaves it on the prequeue, or appends it to
 * the socket backlog when the socket is owned by a user context.
 *
 * NOTE(review): this listing is abridged. The declaration of iph, the
 * no_tcp_socket label and the lock that pairs with bh_unlock_sock() are not
 * shown, and ret is only assigned on the tcp_v4_do_rcv() path.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	// Get the TCP header pointer
	th = tcp_hdr(skb);
        // Get the IP header pointer
        iph = ip_hdr(skb);
	// Look up in the TCP hash table which socket should handle this segment
	// (based on the TCP/IP header fields of the incoming segment).
	// When handling a SYN request segment, the socket found here is the listening socket.
	sk = __inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	// Original author's note: three receive queues are used here for performance
	// reasons; they are not covered in this note, go straight to tcp_v4_do_rcv().
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			// Call tcp_v4_do_rcv() to process the packet
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	
	bh_unlock_sock(sk);
	sock_put(sk);
	return ret;
}

2.2 tcp_v4_do_rcv()

  1. Call tcp_v4_hnd_req to look for a socket matching an existing connection request; if none is found, it returns sk (the listening socket) itself
  2. Call tcp_rcv_state_process to process SYN request message
/*
 * tcp_v4_do_rcv - process a segment for socket sk (abridged excerpt).
 *
 * For a listening socket, tcp_v4_hnd_req() decides whether the segment
 * belongs to sk itself or to a different socket; in the latter case the
 * segment is handed to tcp_child_process(). Otherwise the segment goes
 * through tcp_rcv_state_process().
 *
 * Returns 0 on the paths shown here.
 *
 * NOTE(review): the reset and discard labels are not shown in this excerpt,
 * so rsk is set but its use is not visible.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_LISTEN) {
		// Returns NULL: error
		// nsk == sk: no new TCB was found, so this is the SYN of the first handshake step (the case covered by this note)
		// nsk != sk: a new TCB was found, so this is the ACK of the third handshake step
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;
		// The ACK packet is handled by tcp_child_process
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	}

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		// A non-zero return means an unexpected packet was received; a RST is then sent to the peer
		rsk = sk;
		goto reset;
	}
	return 0;
}

2.3 tcp_rcv_state_process()

  1. Call tcp_v4_conn_request to process SYN connection request
/*
 * tcp_rcv_state_process - state-dependent processing of a received segment
 * (abridged excerpt; only the TCP_LISTEN case is shown).
 *
 * sk:  the TCP socket receiving this segment
 * skb: the input packet
 * th:  pointer to the TCP header of this segment
 * len: length of the packet
 *
 * In the TCP_LISTEN case shown here: returns 1 if ACK is set or the
 * conn_request handler returns a negative value, returns 0 after a SYN has
 * been handed to the conn_request handler, and jumps to discard otherwise.
 *
 * NOTE(review): the discard label and the remaining states are not shown,
 * and tp and queued are unused in this excerpt.
 */
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
			  struct tcphdr *th, unsigned len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int queued = 0;

	switch (sk->sk_state) {
	case TCP_LISTEN:
		// In this state only SYN segments are handled. If ACK is set the segment
		// is unexpected; returning 1 causes a RST to be sent back to the peer.
		if (th->ack)
			return 1;
		// A RST segment is simply ignored
		if (th->rst)
			goto discard;
		
		if (th->syn) {
			// A SYN was received: hand it to TCP's tcp_v4_conn_request().
			// This pointer is set up when the transmission control block is
			// initialized, see tcp_v4_init_sock.
			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
				return 1;

			/* Now we have several options: In theory there is
			 * nothing else in the frame. KA9Q has an option to
			 * send data with the syn, BSD accepts data with the
			 * syn up to the [to be] advertised window and
			 * Solaris 2.1 gives you a protocol error. For now
			 * we just ignore it, that fits the spec precisely
			 * and avoids incompatibilities. It would be nice in
			 * future to drop through and process the data.
			 *
			 * Now that TTCP is starting to be used we ought to
			 * queue this data.
			 * But, this leaves one open to an easy denial of
			 * service attack, and SYN cookies can't defend
			 * against this problem. So, we drop the data
			 * in the interest of security over speed unless
			 * it's still in use.
			 */
			// The comment above discusses whether the first SYN may carry data;
			// in this version any such data is not processed and the skb is freed.
			kfree_skb(skb);
			return 0;
		}
		goto discard;
	}
}

2.4 tcp processing SYN request interface tcp_v4_conn_request() (core)

The main thing this function does is to create a connection request socket object, namely struct tcp_request_sock, and then add it to the SYN request queue (semi-connection queue listen_sock.syn_table) of the listening socket. Summarize the core operation of this function:

  1. Check whether the SYN request queue and the accept connection queue can still take a new SYN request. If not, the SYN request segment is discarded (syn_cookie is not considered here) without returning a RST to the client, so if the client retries later and the server-side queues have room by then, the request can still be processed;
  2. Allocate connection request block struct tcp_request_sock object;
  3. Analyze and process the TCP options in the SYN request segment (not analyzed yet);
  4. Initialize the newly allocated connection request block according to the received option;
  5. Generate the seq to be carried in the SYN+ACK message, that is, the initial sequence number of the server;
  6. Send a SYN+ACK message to the client (see "Sending SYN+ACK Message on TCP Server");
  7. Add the connection request block to the SYN request queue of the listening socket and start the SYN+ACK timeout timer.
/*
 * tcp_v4_conn_request - handle a SYN received on a listening socket.
 *
 * sk:  the listening socket
 * skb: the received SYN segment
 *
 * Checks the SYN request queue and the accept queue, allocates and
 * initializes a connection request block from the SYN, chooses the initial
 * sequence number, sends the SYN+ACK and, unless SYN cookies are in use,
 * adds the request block to the SYN request queue.
 *
 * Every path shown here returns 0, including the drop paths.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	// Record the source and destination addresses from the SYN request segment
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
    
	// SYN cookie related; not covered in this note
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif
	// Drop SYN segments sent to broadcast or multicast addresses; TCP does not
	// support broadcast (the original author presumes this is for reliability).
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	// If the SYN request queue is full, drop the request (SYN cookies aside);
	// in that case the client will retransmit the SYN.
	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	// isn was read from TCP_SKB_CB(skb)->when above. Per the comments in this
	// function, a non-zero isn means the SYN hit a live TIME-WAIT bucket, and
	// such requests are exempt from the queue-full limit.
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	// If the accept queue is full and the SYN request queue holds more than one
	// "young" request, drop the new SYN. Per the original author's note, a young
	// request is one whose SYN+ACK has not been retransmitted yet.
	// Original author's reading of the rationale: such young requests are likely
	// to complete the three-way handshake soon and will then need a slot in the
	// already-full accept queue, so accepting yet another SYN would probably
	// cause completed connections to fail for lack of room.
	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	// Allocate a struct tcp_request_sock object and set its rsk_ops to
	// tcp_request_sock_ops; the functions named in that structure are called
	// later while the connection is being established.
	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	// Parse the TCP options carried by the SYN; option details are not covered here
	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, 0);

	// SYN cookie related; not covered in this note
	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	// Timestamp option handling
	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web server, which
		 * contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is easy case.
		 * We simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	// Initialize the connection request block from the fields and options of the SYN segment
	tcp_openreq_init(req, &tmp_opt, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	// Record the addresses of this request. saddr and daddr are the source and
	// destination IP fields of the skb, so they are assigned the other way
	// round: local = daddr, remote = saddr.
	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	// Save the IP options of the SYN segment in the connection request block
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	// Generate the server-side initial send sequence number, depending on the case
	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
				       "request from %u.%u.%u.%u/%u\n",
				       NIPQUAD(saddr),
				       ntohs(tcp_hdr(skb)->source));
			dst_release(dst);
			goto drop_and_free;
		}
		isn = tcp_v4_init_sequence(skb);
	}
	// Record the chosen initial sequence number in the connection request block
	tcp_rsk(req)->snt_isn = isn;
	
	// Send the SYN+ACK segment
	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		reqsk_free(req);
	} else {
		// Add the connection request block to the SYN request queue and start the
		// SYN+ACK retransmission timer with TCP_TIMEOUT_INIT (3s per the original
		// author's note).
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}

2.4 Connection request queue status judgment

This section looks at how the kernel decides whether the accept connection queue and the SYN request queue are full.

2.4.1 SYN request queue (semi-connection) is full judgment inet_csk_reqsk_queue_is_full

/*
 * Report whether the SYN request queue of sk is full.
 * Delegates to reqsk_queue_is_full() on the socket's icsk_accept_queue.
 */
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
	const struct request_sock_queue *accept_queue =
		&inet_csk(sk)->icsk_accept_queue;

	return reqsk_queue_is_full(accept_queue);
}

static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
	//如果当前已经收到SYN请求的套接字数目(qlen)大于nr_tables_entries,
	//则认为SYN请求队列已满,这里巧妙的运用了移位运算而不是比较运算
	return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}

2.4.2 Accept connection queue (full connection) is full judgment sk_acceptq_is_full

/*
 * Report whether the accept queue of sk is full.
 *
 * Returns 1 when sk_ack_backlog exceeds sk_max_ack_backlog, otherwise 0.
 * Per the original author's note, sk_ack_backlog counts sockets that have
 * completed the three-way handshake and sk_max_ack_backlog is set from the
 * backlog argument of listen().
 */
static inline int sk_acceptq_is_full(struct sock *sk)
{
	if (sk->sk_ack_backlog > sk->sk_max_ack_backlog)
		return 1;

	return 0;
}

2.5 Allocation and initialization of the connection request block 

2.5.1 Connection request block allocation reqsk_alloc / inet_reqsk_alloc

static inline struct request_sock *(const struct request_sock_ops *ops)
{
	//分配一个连接请求块,这里实际上是分配的struct tcp_request_sock结构
	struct request_sock *req = kmem_cache_alloc(ops->slab, GFP_ATOMIC);
	//将操作函数赋值给连接请求块的ops成员
	if (req != NULL)
		req->rsk_ops = ops;

	return req;
}

The ops passed in when calling reqsk_alloc() is tcp_request_sock_ops, which is defined as follows:

/* Request-sock operations passed to reqsk_alloc() for IPv4 TCP. */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	// Objects created with these ops are sized as struct tcp_request_sock
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

Here ops->slab is created when the AF_INET protocol family is initialized, the code is as follows:

/*
 * Excerpt of tcp_prot: only the member that links the protocol to
 * tcp_request_sock_ops is shown; the other members are elided ("...").
 */
struct proto tcp_prot = {
	...
	.rsk_prot		= &tcp_request_sock_ops,
    ...
};

/*
 * Excerpt of inet_init(): only the registration of tcp_prot is shown; the
 * rest of the function, including the out label, is elided ("...").
 */
static int __init inet_init(void)
{
	...
	/* Register tcp_prot with alloc_slab = 1. */
	rc = proto_register(&tcp_prot, 1);
	if (rc)
		goto out;
    ...
}

/*
 * Excerpt of proto_register(): only the creation of the request-sock slab
 * cache is shown; the rest of the function is elided ("...").
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	...
    /* Create the cache later used by reqsk_alloc() via ops->slab; objects
     * are sized by rsk_prot->obj_size.
     */
    prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
                         prot->rsk_prot->obj_size, 0,
                         SLAB_HWCACHE_ALIGN, NULL);
    ...
}

2.5.2 Initialization of the connection request block

The initialization of the connection request block depends on the TCP option in the SYN request segment, so it is executed after the TCP option analysis is completed. The code is as follows:

/*
 * tcp_openreq_init - fill in a new connection request block from a SYN.
 *
 * req:    the request block to initialize
 * rx_opt: the TCP options already parsed from the SYN
 * skb:    the SYN segment itself
 *
 * Copies the peer's initial sequence number, MSS clamp, timestamp and the
 * negotiated option flags into the request block, and records the peer's
 * source port. acked and ecn_ok start out as 0.
 */
static inline void tcp_openreq_init(struct request_sock *req,
				    struct tcp_options_received *rx_opt,
				    struct sk_buff *skb)
{
	struct inet_request_sock *inet_req = inet_rsk(req);

	/* Generic request-sock fields. */
	req->rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
	req->mss = rx_opt->mss_clamp;
	if (rx_opt->saw_tstamp)
		req->ts_recent = rx_opt->rcv_tsval;
	else
		req->ts_recent = 0;

	/* TCP-specific field: the sequence number carried by the SYN. */
	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;

	/* Option flags and peer port kept in the inet request block. */
	inet_req->tstamp_ok = rx_opt->tstamp_ok;
	inet_req->sack_ok = rx_opt->sack_ok;
	inet_req->snd_wscale = rx_opt->snd_wscale;
	inet_req->wscale_ok = rx_opt->wscale_ok;
	inet_req->acked = 0;
	inet_req->ecn_ok = 0;
	inet_req->rmt_port = tcp_hdr(skb)->source;
}

2.6 Add the connection request block to the SYN request queue inet_csk_reqsk_queue_hash_add

void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
				   unsigned long timeout)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	//获取SYN请求队列
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	//根据连接请求块的对端IP地址、端口号、初始哈希值计算一个哈希值
	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
				     lopt->hash_rnd, lopt->nr_table_entries);
	//将连接请求块插入SYN请求队列中,并且将超时时间设置到该连接请求块中
	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
	//更新SYN请求队列中的计数信息:qlen、qlen_yong,并启动SYN+ACK重传定时器
	inet_csk_reqsk_queue_added(sk, timeout);
}

/*
 * reqsk_queue_hash_req - insert a request block at the head of a SYN table bucket.
 *
 * queue:   the request queue owning the SYN table
 * hash:    index of the bucket in syn_table
 * req:     the request block to insert
 * timeout: offset in jiffies from now at which the request expires
 *
 * The request is fully initialized first; only the store that makes it the
 * new bucket head is done under syn_wait_lock.
 */
static inline void reqsk_queue_hash_req(struct request_sock_queue *queue,
					u32 hash, struct request_sock *req,
					unsigned long timeout)
{
	struct request_sock **bucket_head = &queue->listen_opt->syn_table[hash];

	/* Expiry time, with the SYN+ACK retransmit count starting at zero. */
	req->expires = jiffies + timeout;
	req->retrans = 0;
	req->sk = NULL;
	/* Chain the current bucket head behind the new request. */
	req->dl_next = *bucket_head;

	write_lock(&queue->syn_wait_lock);
	*bucket_head = req;
	write_unlock(&queue->syn_wait_lock);
}

/*
 * inet_csk_reqsk_queue_added - account for a request just added to the SYN queue.
 *
 * Updates the queue counters via reqsk_queue_added(). If that returns 0,
 * i.e. the queue was empty before this request, the timer is (re)armed with
 * timeout through inet_csk_reset_keepalive_timer(). Per the original
 * author's note, this timer drives SYN+ACK retransmission.
 */
static inline void inet_csk_reqsk_queue_added(struct sock *sk,
					      const unsigned long timeout)
{
	struct request_sock_queue *accept_queue = &inet_csk(sk)->icsk_accept_queue;

	/* A non-zero previous length means nothing more to do here. */
	if (reqsk_queue_added(accept_queue))
		return;

	inet_csk_reset_keepalive_timer(sk, timeout);
}

/*
 * reqsk_queue_added - bump the SYN queue counters for a newly added request.
 *
 * Increments both qlen and qlen_young.
 *
 * Returns the queue length as it was before this request was counted.
 */
static inline int reqsk_queue_added(struct request_sock_queue *queue)
{
	struct listen_sock *syn_queue = queue->listen_opt;
	/* Capture the old length while counting the new request. */
	const int len_before = syn_queue->qlen++;

	syn_queue->qlen_young++;

	return len_before;
}

Guess you like

Origin blog.csdn.net/wangquan1992/article/details/108903813