linux内核协议栈 TCP服务器端接收SYN请求段 Ⅱ 之发送SYN+ACK报文

Table of Contents

1 SYN+ACK报文发送场景概述

2 SYN+ACK报文发送 tcp_v4_send_synack()

2.1 SYN+ACK报文构造 tcp_make_synack()

2.1.1 SYN+ACK报文内存分配 sock_wmalloc

3 SYN+ACK报文超时处理 tcp_synack_timer

3.1 inet_csk_reqsk_queue_prune()


1 SYN+ACK报文发送场景概述

当 tcp 服务器端收到SYN包后,将会调用 tcp_v4_send_synack() 向客户端发送SYN+ACK报文,同时启动 SYN+ACK 超时重传机制最终也会调用 tcp_v4_send_synack() 接口推送 SYN+ACK 报文,堆栈信息如下:

tcp_v4_conn_request //收到SYN立刻回复SYN+ACK
	--tcp_v4_send_synack
	
tcp_synack_timer //超时重传
--inet_csk_reqsk_queue_prune	
	--tcp_v4_rtx_synack //req->rsk_ops->rtx_syn_ack(parent, req, NULL)
		--tcp_v4_send_synack

2 SYN+ACK报文发送 tcp_v4_send_synack()

  1. 获取发送路由信息 //首次收到SYN 已经在 tcp_v4_conn_request 获取过了
  2. 构造 SYN+ACK 报文
  3. 计算 checksum 校验和
  4. 通过 ip 层将 tcp 报文发送出去
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	/* Build and transmit a SYN+ACK for connection request @req on
	 * listening socket @sk.  @dst may carry a pre-resolved route
	 * (NULL means resolve it here).  Returns 0 on success, -1 on
	 * route lookup or allocation failure. */
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* Resolve the route unless the caller already did (the first
	 * SYN+ACK is sent from tcp_v4_conn_request, which passes a
	 * route; retransmissions pass NULL). */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto out;

	/* Build the SYN+ACK skb from the listener, route and request block. */
	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = tcp_hdr(skb);

		/* Compute the TCP checksum over the entire segment. */
		th->check = tcp_v4_check(skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));
		/* Hand the segment straight to IP, which builds the IP
		 * header and transmits; the skb is never queued on a TCP
		 * send queue. */
		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

out:
	dst_release(dst);
	return err;
}

从上面的代码可以看出,TCP构造出SYN+ACK报文后,会直接发送给IP层,并且不会将该数据包加入TCP的发送队列。

2.1 SYN+ACK报文构造 tcp_make_synack()

struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				struct request_sock *req)
{
	/* Build (but do not transmit) a SYN+ACK segment for connection
	 * request @req on listener @sk, using route @dst for metrics
	 * (advertised MSS, window clamp).  Returns the skb with the TCP
	 * header fully filled in, or NULL if allocation fails. */
	struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *md5;
	__u8 *md5_hash_location;
#endif

	/* Allocate the skb; the third argument (1) forces the allocation
	 * even if it would push the socket past its send-buffer limit. */
	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve headroom for all protocol headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	skb->dst = dst_clone(dst);
	/* Compute the actual TCP header length from the options to be
	 * included; the MSS option is always present. */
	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));

#ifdef CONFIG_TCP_MD5SIG
	/* Are we doing MD5 on this segment? If so - make room for it */
	md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
	if (md5)
		tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
#endif
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	/* Fill in the fixed part of the TCP header. */
	th = tcp_hdr(skb);
	memset(th, 0, sizeof(struct tcphdr));
	/* Set the SYN and ACK flag bits. */
	th->syn = 1;
	th->ack = 1;
	TCP_ECN_make_synack(req, th);
	/* Source and destination ports. */
	th->source = inet_sk(sk)->sport;
	th->dest = ireq->rmt_port;
	/* Setting of flags are superfluous here for callers (and ECE is
	 * not even correctly set)
	 */
	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
			     TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
	/* Server-side initial sequence number; snt_isn was chosen when
	 * the SYN was processed. */
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	/* Acknowledge the client's initial sequence number + 1. */
	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
	/* Choose the server's receive window (depends on the receive
	 * buffer size and the window-scale option) on the first
	 * transmission only; retransmitted SYN+ACKs reuse it. */
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
			dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
			&req->rcv_wnd,
			&req->window_clamp,
			ireq->wscale_ok,
			&rcv_wscale);
		ireq->rcv_wscale = rcv_wscale;
	}
	/* Advertise the receive window in the header. */
	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(min(req->rcv_wnd, 65535U));

	/* Record the transmit timestamp in the skb control block. */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	/* Write the TCP option block right after the fixed header. */
	tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
			      ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
			      TCP_SKB_CB(skb)->when,
			      req->ts_recent,
			      (
#ifdef CONFIG_TCP_MD5SIG
			       md5 ? &md5_hash_location :
#endif
			       NULL)
			      );

	th->doff = (tcp_header_size >> 2);
	TCP_INC_STATS(TCP_MIB_OUTSEGS);

#ifdef CONFIG_TCP_MD5SIG
	/* Okay, we have all we need - do the md5 hash if needed */
	if (md5) {
		tp->af_specific->calc_md5_hash(md5_hash_location,
					       md5,
					       NULL, dst, req,
					       tcp_hdr(skb), sk->sk_protocol,
					       skb->len);
	}
#endif

	return skb;
}

2.1.1 SYN+ACK报文内存分配 sock_wmalloc

/*
 * Allocate an skb charged against the socket's send buffer.
 * @force non-zero bypasses the sk_sndbuf limit check.
 * Returns NULL when over the limit (and not forced) or on
 * allocation failure.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* Refuse the allocation when the write-buffer budget is already
	 * used up, unless the caller forces it. */
	if (!force && atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf)
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	/* Make @sk the owner so the memory is accounted to its
	 * write allocation counter. */
	skb_set_owner_w(skb, sk);
	return skb;
}

3 SYN+ACK报文超时处理 tcp_synack_timer

SYN+ACK报文的超时处理函数为tcp_synack_timer(),下面看其实现:

#define TCP_SYNQ_INTERVAL	(HZ/5)	/* Period of SYNACK timer */
#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ))	/* RFC 1122 initial RTO value	*/
#define TCP_RTO_MAX	((unsigned)(120*HZ))

/* SYN+ACK retransmission timer callback: walk the listener's SYN queue
 * every TCP_SYNQ_INTERVAL (HZ/5, i.e. 200ms), retransmitting pending
 * SYN+ACKs with an initial 3s timeout capped at 120s. */
static void tcp_synack_timer(struct sock *sk)
{
	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, TCP_TIMEOUT_INIT,
				   TCP_RTO_MAX);
}

3.1 inet_csk_reqsk_queue_prune()

  1. 获取半连接队列
  2. 根据最大重传次数,判断超时,依次传输半连接请求
  3. 重传次数完毕删除半连接请求
  4. 重设定时器,+200ms之后继续调用
void inet_csk_reqsk_queue_prune(struct sock *parent,
				const unsigned long interval,
				const unsigned long timeout,
				const unsigned long max_rto)
{
	/* Periodically walk part of the listener's SYN (half-open) queue:
	 * retransmit SYN+ACK for requests that timed out but still have
	 * retries left, drop requests that exhausted their retries, and
	 * re-arm the timer if the queue is still non-empty.
	 * @interval: timer period, @timeout: initial RTO,
	 * @max_rto: upper bound on the backoff. */
	struct inet_connection_sock *icsk = inet_csk(parent);
	/* queue is the accept queue of the listening socket */
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	/* lopt holds the SYN request hash table */
	struct listen_sock *lopt = queue->listen_opt;
	/* Configured maximum SYN+ACK retransmission count
	 * (/proc/sys/net/ipv4/tcp_synack_retries) */
	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
	/* Effective retry threshold; adjusted below under queue pressure */
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct request_sock **reqp, *req;
	int i, budget;

	/* Nothing to do if the SYN queue is empty. */
	if (lopt == NULL || lopt->qlen == 0)
		return;

	/* To keep the listener responsive, requests whose SYN+ACK has
	 * already been retransmitted should be reclaimed sooner when the
	 * SYN queue fills up, since a retransmitted entry is the least
	 * likely to ever complete (lost SYN+ACK, lost ACK, or an RTT too
	 * large).  So the retry threshold thresh is not fixed; it is
	 * lowered dynamically based on the current queue state. */

	/* Lower thresh while the queue is more than half full and old
	 * ("non-young") entries dominate; the exact tuning is heuristic. */
	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
		int young = (lopt->qlen_young<<1);

		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}
	/* TCP_DEFER_ACCEPT overrides the retry budget for ACKed requests. */
	if (queue->rskq_defer_accept)
		max_retries = queue->rskq_defer_accept;

	/* budget is how many hash buckets to scan this tick; sized so the
	 * whole table is covered roughly twice per initial timeout rather
	 * than walking everything in one pass (bounds time spent per tick). */
	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
	/* clock_hand is the bucket index where the previous pass stopped. */
	i = lopt->clock_hand;

	do {
		/* Head of the collision chain for bucket i. */
		reqp=&lopt->syn_table[i];
		/* Walk the collision chain. */
		while ((req = *reqp) != NULL) {
			/* Has this request's timer expired? */
			if (time_after_eq(now, req->expires)) {
				/* Retransmit the SYN+ACK when either:
				 * cond1: retransmission count is still below thresh; or
				 * cond2: the request was already ACKed but creating the
				 *        child socket failed (e.g. resource limits) -- such
				 *        a connection is likely to succeed soon, so it gets
				 *        the full configured max_retries (acked is set as
				 *        described in the ACK-processing path).
				 * If either holds, invoke the rtx_syn_ack() callback. */
				if ((req->retrans < thresh ||
				     (inet_rsk(req)->acked && req->retrans < max_retries))
				    && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
					unsigned long timeo;

					/* First retransmission: the request is no longer
					 * "young", so decrement qlen_young. */
					if (req->retrans++ == 0)
						lopt->qlen_young--;
					/* Exponential backoff: timeout << retrans, i.e.
					 * 3s, 6s, 12s, ... capped at max_rto (120s). */
					timeo = min((timeout << req->retrans), max_rto);
					req->expires = now + timeo;
					reqp = &req->dl_next;
					continue;
				}

				/* Out of retries (or retransmit failed): unlink the
				 * request from the SYN queue and free it. */
				inet_csk_reqsk_queue_unlink(parent, req, reqp);
				reqsk_queue_removed(queue, req);
				reqsk_free(req);
				continue;
			}
			reqp = &req->dl_next;
		}
		/* Advance to the next bucket; the mask wraps around the
		 * power-of-two table size. */
		i = (i + 1) & (lopt->nr_table_entries - 1);

	} while (--budget > 0);

	/* Remember where this pass stopped for the next tick. */
	lopt->clock_hand = i;

	/* Re-arm the timer (fires again after @interval, 200ms) while the
	 * SYN queue is non-empty. */
	if (lopt->qlen)
		inet_csk_reset_keepalive_timer(parent, interval);
}

由《linux内核 TCP服务器端接收SYN请求段Ⅰ》可以知道,回调函数rtx_syn_ack()就是tcp_v4_send_synack(),即SYN+ACK的重传和初传使用的是同一个函数。

猜你喜欢

转载自blog.csdn.net/wangquan1992/article/details/108908194