The new data sent in the TCP layer of the linux kernel protocol stack

Table of Contents

1 tcp_push()

2 __tcp_push_pending_frames()

2.1 The sk_write_queue queue has not sent data to push tcp_write_xmit()

2.1.1 Congestion window detection tcp_cwnd_test()

2.1.2 Send window test tcp_snd_wnd_test()

2.1.3 Send data window limit tcp_mss_split_point()

2.1.4 skb fragment processing tso_fragment()

2.1.5 sk_buff constructs tcp header push tcp_transmit_skb()

2.1.6 Update statistics tcp_event_new_data_sent()

3 Send a packet of data tcp_push_one()

4 ACK message reply tcp_data_snd_check()


tcp_sendmsg() is the core processing function behind the TCP send-related system calls. Its central job is to organize the data to be sent into skbs and append those skbs, in order, to the send queue sk_write_queue. The function also tries to call tcp_push() (or one of the other two interfaces) to transmit the new data.

In addition, after receiving an acknowledgment, TCP calls tcp_data_snd_check() to check whether data can be sent, which provides another opportunity to transmit new data.

Note: The interfaces tcp_push, tcp_push_one and __tcp_push_pending_frames discussed here all refer to the transmission of new data (new data, as opposed to retransmitted data). That is, transmission starts from the sk_send_head pointer of the sk_write_queue queue. The call stack for tcp_push is as follows:

tcp_push
	--__tcp_push_pending_frames
		--tcp_write_xmit //推送sk_write_queue队列中尚未发送的所有数据
			--tcp_cwnd_test //拥塞窗口检测
			--tcp_snd_wnd_test //发送窗口检测
			--tcp_mss_split_point //发送数据窗口限制
			--tso_fragment //skb超出门限进行分段
			--tcp_transmit_skb //skb推送,构造tcp报文调用ip层接口推送
			--tcp_event_new_data_sent //更新统计

1 tcp_push()

As can be seen from the implementation below, tcp_push() will call __tcp_push_pending_frames() after judging whether it needs to set the PUSH flag.

/* Mark PUSH/URG as needed on the queue tail, then try to push pending frames. */
static inline void tcp_push(struct sock *sk, int flags, int mss_now,
			    int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_send_head(sk)) {
		//Decide whether the PUSH flag needs to be set
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, skb);
		//MSG_OOB related; ignored here
		tcp_mark_urg(tp, flags, skb);
		//Call __tcp_push_pending_frames() to attempt the actual send
		__tcp_push_pending_frames(sk, mss_now,
					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
	}
}

2 __tcp_push_pending_frames()

This function calls tcp_write_xmit() to finish sending.

/* Push out any pending frames which were held back due to
 * TCP_CORK or attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	struct sk_buff *skb = tcp_send_head(sk);
	//If there is new data available, call tcp_write_xmit() to send it
	if (skb) {
		if (tcp_write_xmit(sk, cur_mss, nonagle))
			//Arm the probe timer when nothing could be sent.
			//NOTE(review): the original note said "PMTU related", but this
			//looks like the zero-window probe timer — verify
			tcp_check_probe_timer(sk);
	}
}

2.1 The sk_write_queue queue has not sent data to push tcp_write_xmit()

This function is the core function of TCP sending new data, and core operations such as sending window judgment and congestion control judgment are all completed in this function.

/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;

	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and all
	 * will be happy.
	 */
	//Check the state of the TCB
	if (unlikely(sk->sk_state == TCP_CLOSE))
		return 0;

	//sent_pkts records the number of segments sent during this call
	sent_pkts = 0;

	//MTU probing: if a probe segment was sent, count it in sent_pkts
	if ((result = tcp_mtu_probe(sk)) == 0) {
		return 0;
	} else if (result > 0) {
		sent_pkts = 1;
	}

	//Loop sending packets that have not yet been sent
	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;
		//Set up the GSO segmentation info in the skb. The return value
		//tso_segs is the number of segments this skb's data must be split into
		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
		BUG_ON(!tso_segs);
		//Number of segments the congestion window still allows. Zero means
		//the congestion window forbids sending, so end this send round
		cwnd_quota = tcp_cwnd_test(tp, skb);
		if (!cwnd_quota)
			break;
		//Check that the send window allows at least one segment of this skb.
		//If not, end this send round
		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
			break;

		if (tso_segs == 1) {
			//tso_segs == 1 means the skb holds a single segment, possibly
			//shorter than the MSS (a small packet), so check whether the
			//Nagle algorithm allows sending it
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			//tso_segs > 1: TSO segmentation is needed; decide whether to
			//defer sending, mainly to improve GSO performance
			if (tcp_tso_should_defer(sk, skb))
				break;
		}

		//Having passed the congestion-window and send-window checks above,
		//we know at least one TCP segment can be sent now. Possibly more,
		//so adjust limit below according to the conditions

		//If the skb spans several segments, work out how much data may be sent
		limit = mss_now;
		if (tso_segs > 1)
			//tcp_mss_split_point() returns the maximum number of bytes
			//the send and congestion windows allow, which may exceed the
			//amount of data in the skb itself; see below
			limit = tcp_mss_split_point(sk, skb, mss_now, cwnd_quota);

		//The skb holds more data than the limit, so it must be split. This
		//can only happen in the TSO case, because in the non-TSO case an skb
		//never exceeds the MSS. The split happens purely because congestion
		//control and flow control capped the packet size; it has nothing to
		//do with TSO itself
		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
			break;
		//Update the packet's transmit timestamp
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		//Send the data. A non-zero return means this transmission failed
		//(e.g. the qdisc queue is full), so end this send round.
		//The third argument is 1, telling tcp_transmit_skb() to clone the
		//skb header before sending
		if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
			break;
		//New data was sent: advance the send queue and update statistics
		tcp_event_new_data_sent(sk, skb);
		//Nagle-related: if the data just sent is shorter than the MSS it
		//counts as a small packet, so update snd_sml
		tcp_minshall_update(tp, mss_now, skb);
		//Bump the count of packets sent
		sent_pkts++;
	}//end of while((skb = tcp_send_head(sk)))

	//If any packets were sent, adjust the congestion-control state
	if (likely(sent_pkts)) {
		tcp_cwnd_validate(sk);
		return 0;
	}
	//Either of these two special cases also counts as success (0 = success):
	//1. there are already unacknowledged packets in flight;
	//2. sk->sk_send_head is NULL, i.e. there is no new data left to send
	return !tp->packets_out && tcp_send_head(sk);
}

2.1.1 Congestion window detection tcp_cwnd_test()

This function detects whether the congestion window allows data segments to be sent, and if allowed, returns the number of segments available for sending within the congestion window limit (note: not the number of bytes).

/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules?  If so, return how many segments are allowed.
 */
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
{
	u32 in_flight, cwnd;

	/* Don't be strict about the congestion window for the final FIN.  */
	//A FIN segment consisting of a single segment (a FIN may also carry data)
	//may always be sent; it is never limited by the congestion window
	if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && tcp_skb_pcount(skb) == 1)
		return 1;
	//Estimate the number of TCP segments still in flight in the network
	in_flight = tcp_packets_in_flight(tp);
	//snd_cwnd is the current congestion window size, in units of TCP segments
	cwnd = tp->snd_cwnd;
	//Compare the congestion window with the in-flight count; the remainder is
	//the number of segments congestion control still allows us to send
	if (in_flight < cwnd)
		return (cwnd - in_flight);
	//The congestion window is exhausted; return 0 to forbid sending
	return 0;
}

//This function estimates the number of segments that were sent (first
//transmission + retransmission) and have already left the network: mainly
//segments acknowledged via SACK plus segments judged to be lost
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{
	//sacked_out: with SACK enabled, the number of segments acknowledged by
	//            SACK options; without SACK, the count of duplicate ACKs
	//            received, since a duplicate ACK is only sent when the peer
	//            actually received a packet;
	//lost_out: the number of segments deemed lost in transit after being sent.
	//          TCP has no mechanism to know for certain whether a sent segment
	//          was really lost, so this is only an algorithmic estimate
	//Either way, both kinds of segments were sent but can be considered no
	//longer present in the network
	return tp->sacked_out + tp->lost_out;
}

/* This determines how many packets are "in the network" to the best
 * of our knowledge.  In many cases it is conservative, but where
 * detailed information is available from the receiver (via SACK
 * blocks etc.) we can make more aggressive calculations.
 *
 * Use this for decisions involving congestion control, use just
 * tp->packets_out to determine if the send queue is empty or not.
 *
 * Read this equation as:
 *
 *	"Packets sent once on transmission queue" MINUS
 *	"Packets left network, but not honestly ACKed yet" PLUS
 *	"Packets fast retransmitted"
 */
static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
{
	//packets_out: segments sent from the send queue but not yet acknowledged
	//             (retransmissions not included)
	//retrans_out: segments sent only as retransmissions, not yet acknowledged
	//tcp_left_out(): segments that were sent but have already left the network
	return tp->packets_out - tcp_left_out(tp) + tp->retrans_out;
}

It can be seen that the detection of the congestion window is actually very simple, that is, to see whether the number of packets (that is, flying packets) still being transmitted in the current network exceeds the limit of the congestion window. The core of congestion control lies in how to set the value of congestion window tp->snd_cwnd reasonably in various situations.

2.1.2 Send window test tcp_snd_wnd_test()

This function judges whether the current sending window allows at least one segment to be sent, if it is allowed, it returns 1, otherwise it returns 0. If the size of skb exceeds one MSS, as long as one MSS is allowed to be sent, 1 is returned; if the size of skb is less than one MSS, then 1 is returned as long as the amount of data required to be sent is allowed.

/* Does at least the first segment of SKB fit into the send window? */
static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
{
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
	//If the skb holds more than one segment's worth of data, trim end_seq to
	//the sequence number one segment ahead
	if (skb->len > cur_mss)
		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
	//Check whether the end of that one segment lies beyond the right edge of
	//the send window
	return !after(end_seq, tcp_wnd_end(tp));
}

//Return the right edge of the send window
static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
{
	//snd_una: the smallest sequence number sent but not yet acknowledged
	//snd_wnd: the current send window size, i.e. the receiver's remaining
	//         receive buffer
	return tp->snd_una + tp->snd_wnd;
}

2.1.3 Send data window limit tcp_mss_split_point()

This function integrates the data length in the skb, the amount of data allowed to be sent in the sending window, and the amount of data allowed to be sent in the congestion window, and calculates the amount of data allowed to be sent by the current skb this time, in bytes.

/* Returns the portion of skb which can be sent right away without
 * introducing MSS oddities to segment boundaries. In rare cases where
 * mss_now != mss_cache, we will request caller to create a small skb
 * per input skb which could be mostly avoided here (if desired).
 *
 * We explicitly want to create a request for splitting write queue tail
 * to a small skb for Nagle purposes while avoiding unnecessary modulos,
 * thus all the complexity (cwnd_len is always MSS multiple which we
 * return whenever allowed by the other factors). Basically we need the
 * modulo only when the receiver window alone is the limiting factor or
 * when we would be allowed to send the split-due-to-Nagle skb fully.
 */
/* @skb:     the skb under consideration
 * @mss_now: the current MSS
 * @cwnd:    the number of segments the congestion window allows;
 *           cwnd * mss_now is the byte allowance of the congestion window
 */
static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
					unsigned int mss_now, unsigned int cwnd)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 needed, window, cwnd_len;
	//window is the maximum number of bytes the send window allows this skb to
	//send (it may exceed skb->len)
	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
	//cwnd_len is the number of bytes the congestion window allows
	cwnd_len = mss_now * cwnd;

	//See the notes following this function for the effect this logic achieves
	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
		return cwnd_len;
	//needed is the actual amount of data to send after capping by the send window
	needed = min(skb->len, window);

	if (skb == tcp_write_queue_tail(sk) && cwnd_len <= needed)
		return cwnd_len;
	//The final return value is an integer multiple of the MSS, still in bytes
	return needed - needed % mss_now;
}

The implementation above is not easy to follow at first glance. The actual logic of the function is as follows:

  • The last skb, congestion window is limited-----returns the amount of data allowed to be sent in the congestion window;
  • The last skb, the congestion window is not limited ----- return min (the amount of data allowed in the sending window, the actual amount of data to be sent skb->len);
  • Not the last skb, the congestion window is limited-----return the amount of data allowed to be sent in the congestion window. In this case, the allowed value returned may be greater than the amount of data to be sent in the skb. Because it may be such a relationship skb->len <cwnd_len <= window.
  • It is not the last skb, and the congestion window is not limited ----- return min (the amount of data allowed in the sending window, the actual amount of data to be sent skb->len).

2.1.4 skb fragment processing tso_fragment()

In tcp_write_xmit(), if the amount of data in the skb is too large — exceeding the limits of the send window and congestion window so that only part of the skb may be sent — the skb must be split into two parts. The first part, of length len, can be sent now; the remainder is stored in a newly allocated skb, which is inserted right after the first part in the send queue sk_write_queue, so the transmission order of the data is preserved.

Note: Since this split only adjusts the pointer relationships in frags[] of struct skb_shared_info and involves no memory copy, it is very fast.

/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
 * which is put after SKB on the list.  It is very much like
 * tcp_fragment() except that it may make several kinds of assumptions
 * in order to speed up the splitting operation.  In particular, we
 * know that all the data is in scatter-gather pages, and that the
 * packet has never been sent out before (and thus is not cloned).
 */
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
			unsigned int mss_now)
{
	struct sk_buff *buff;
	int nlen = skb->len - len;	/* bytes that move to the new skb */
	u16 flags;

	/* All of a TSO frame must be composed of paged data.  */
	if (skb->len != skb->data_len)
		return tcp_fragment(sk, skb, len, mss_now);

	buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
	if (unlikely(buff == NULL))
		return -ENOMEM;

	/* Account the new skb against the socket's write memory */
	sk->sk_wmem_queued += buff->truesize;
	sk_mem_charge(sk, buff->truesize);
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;

	/* This packet was never sent out yet, so no SACK bits. */
	TCP_SKB_CB(buff)->sacked = 0;

	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
	/* Move the page-frag pointers beyond len into buff (no data copy) */
	skb_split(skb, buff, len);

	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(sk, skb, mss_now);
	tcp_set_skb_tso_segs(sk, buff, mss_now);

	/* Link BUFF into the send queue. */
	skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk);

	return 0;
}

/**
 * skb_split - Split fragmented skb to two parts at length len.
 * @skb: the buffer to split
 * @skb1: the buffer to receive the second part
 * @len: new length for skb
 */
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
	int pos = skb_headlen(skb);	/* bytes in the linear (header) area */

	if (len < pos)	/* Split line is inside header. */
		skb_split_inside_header(skb, skb1, len, pos);
	else		/* Second chunk has no header, nothing to copy. */
		skb_split_no_header(skb, skb1, len, pos);
}

static inline void skb_split_no_header(struct sk_buff *skb,
				       struct sk_buff* skb1,
				       const u32 len, int pos)
{
	int i, k = 0;
	const int nfrags = skb_shinfo(skb)->nr_frags;

	/* Recount skb's frags from scratch; k counts frags handed to skb1 */
	skb_shinfo(skb)->nr_frags = 0;
	skb1->len		  = skb1->data_len = skb->len - len;
	skb->len		  = len;
	skb->data_len		  = len - pos;

	for (i = 0; i < nfrags; i++) {
		int size = skb_shinfo(skb)->frags[i].size;

		if (pos + size > len) {
			/* This frag belongs, at least in part, to skb1 */
			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

			if (pos < len) {
				/* Split frag.
				 * We have two variants in this case:
				 * 1. Move all the frag to the second
				 *    part, if it is possible. F.e.
				 *    this approach is mandatory for TUX,
				 *    where splitting is expensive.
				 * 2. Split is accurately. We make this.
				 */
				/* Both skbs now reference the page; take a ref */
				get_page(skb_shinfo(skb)->frags[i].page);
				skb_shinfo(skb1)->frags[0].page_offset += len - pos;
				skb_shinfo(skb1)->frags[0].size -= len - pos;
				skb_shinfo(skb)->frags[i].size	= len - pos;
				skb_shinfo(skb)->nr_frags++;
			}
			k++;
		} else
			skb_shinfo(skb)->nr_frags++;
		pos += size;
	}
	skb_shinfo(skb1)->nr_frags = k;
}

2.1.5 sk_buff constructs tcp header push tcp_transmit_skb()

This function constructs the TCP header for the incoming skb, and then calls the output interface of the IP layer to complete the data transmission.

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	int tcp_header_size;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *md5;
	__u8 *md5_hash_location;
#endif
	struct tcphdr *th;
	int sysctl_flags;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));

	/* If congestion control is doing timestamping, we must
	 * take such a timestamp before we potentially clone/copy.
	 */
	//Congestion-control algorithm related
	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
		__net_timestamp(skb);
	//If the caller asked for the skb to be cloned before sending, clone it here
	if (likely(clone_it)) {
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}

	inet = inet_sk(sk);
	tp = tcp_sk(sk);
	tcb = TCP_SKB_CB(skb);
	tcp_header_size = tp->tcp_header_len;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

	//Determine the TCP header length according to the segment type; some
	//options can only be carried in SYN segments, so the calculation must
	//distinguish the cases
	sysctl_flags = 0;
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
		if (sysctl_tcp_timestamps) {
			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
		}
		if (sysctl_tcp_window_scaling) {
			tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_WSCALE;
		}
		if (sysctl_tcp_sack) {
			sysctl_flags |= SYSCTL_FLAG_SACK;
			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
		}
	} else if (unlikely(tp->rx_opt.eff_sacks)) {
		/* A SACK is 2 pad bytes, a 2 byte header, plus
		 * 2 32-bit sequence numbers for each SACK block.
		 */
		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
				    (tp->rx_opt.eff_sacks *
				     TCPOLEN_SACK_PERBLOCK));
	}

	//Congestion-control related. If there were no packets in flight before,
	//this is a fresh start of transmission: notify the congestion-control
	//algorithm with the CA_EVENT_TX_START event
	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * Are we doing MD5 on this segment? If so - make
	 * room for it.
	 */
	md5 = tp->af_specific->md5_lookup(sk, sk);
	if (md5)
		tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
#endif
	//Fill in the fields of the TCP header
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);
	skb_set_owner_w(skb, sk);

	/* Build TCP header and checksum it. */
	th = tcp_hdr(skb);
	th->source		= inet->sport;
	th->dest		= inet->dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(tp->rcv_nxt);
	//Write data offset (header length in 32-bit words) and the flag bits in a
	//single 16-bit store
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->flags);

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	} else {
		th->window	= htons(tcp_select_window(sk));
	}
	th->check		= 0;
	th->urg_ptr		= 0;

	if (unlikely(tp->urg_mode &&
		     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
		th->urg_ptr		= htons(tp->snd_up - tcb->seq);
		th->urg			= 1;
	}
	//Build the options portion of the TCP header
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_syn_build_options((__be32 *)(th + 1),
				      tcp_advertise_mss(sk),
				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
				      (sysctl_flags & SYSCTL_FLAG_SACK),
				      (sysctl_flags & SYSCTL_FLAG_WSCALE),
				      tp->rx_opt.rcv_wscale,
				      tcb->when,
				      tp->rx_opt.ts_recent,

#ifdef CONFIG_TCP_MD5SIG
				      md5 ? &md5_hash_location :
#endif
				      NULL);
	} else {
		tcp_build_and_update_options((__be32 *)(th + 1),
					     tp, tcb->when,
#ifdef CONFIG_TCP_MD5SIG
					     md5 ? &md5_hash_location :
#endif
					     NULL);
		TCP_ECN_send(sk, skb, tcp_header_size);
	}

#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		tp->af_specific->calc_md5_hash(md5_hash_location,
					       md5,
					       sk, NULL, NULL,
					       tcp_hdr(skb),
					       sk->sk_protocol,
					       skb->len);
	}
#endif
	//Checksum handling; for TCPv4 this is tcp_v4_send_check()
	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
	//The segment carries an ACK, so the delayed-ACK machinery needs updating
	if (likely(tcb->flags & TCPCB_FLAG_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
	//If the segment carries payload data, some congestion-control
	//bookkeeping is needed
	if (skb->len != tcp_header_size)
		tcp_event_data_sent(tp, skb, sk);

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_INC_STATS(TCP_MIB_OUTSEGS);
	//Hand the packet to the queue_xmit interface, which enters the IP layer;
	//an error code is returned on failure. For TCP this is ip_queue_xmit()
	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
	if (likely(err <= 0))
		return err;
	//Congestion handling on local send failure: enter the CWR state
	tcp_enter_cwr(sk, 1);
	//Map the error code to the send result
	return net_xmit_eval(err);

#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

Note: Some of the content in tcp_transmit_skb() above involves other mechanisms of TCP, which will be analyzed later in time.

2.1.6 Update statistics tcp_event_new_data_sent()

When new data is sent out in the sending queue, call this function to update the statistics of the data segment.

static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;
	//Advance the send-queue pointer sk_send_head
	tcp_advance_send_head(sk, skb);
	//Update the sequence number of the next segment to be sent
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;

	/* Don't override Nagle indefinately with F-RTO */
	//F-RTO algorithm
	if (tp->frto_counter == 2)
		tp->frto_counter = 3;
	//Add to the count of TCP segments sent but not yet acknowledged
	tp->packets_out += tcp_skb_pcount(skb);
	//If no data had been sent before, start the retransmission timer
	if (!prior_packets)
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}

3 Send a packet of data tcp_push_one()

Having read through tcp_write_xmit() above, let's look at the implementation of tcp_push_one(). You will find the two are essentially the same. The difference is just what the function name suggests: this function only tries to send a single packet, while tcp_push() tries to walk the entire send queue until no more data can be sent.

/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to setup probe timer etc.
 */
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = tcp_send_head(sk);
	unsigned int tso_segs, cwnd_quota;

	BUG_ON(!skb || skb->len < mss_now);

	//Set up GSO segmentation info for the skb, as in tcp_write_xmit()
	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
	//NOTE(review): tcp_snd_test() is not shown here; presumably it combines
	//the congestion-window, send-window and Nagle checks (TCP_NAGLE_PUSH
	//forces the Nagle test to pass) — verify against the kernel source
	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);

	if (likely(cwnd_quota)) {
		unsigned int limit;

		BUG_ON(!tso_segs);

		//Cap the amount of data to send, just as in tcp_write_xmit()
		limit = mss_now;
		if (tso_segs > 1 && !tcp_urg_mode(tp))
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    cwnd_quota);

		//Split the skb if it holds more data than the limit allows
		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
			return;

		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;

		if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
			//On success: advance the send queue, update statistics and
			//validate the congestion window
			tcp_event_new_data_sent(sk, skb);
			tcp_cwnd_validate(sk);
			return;
		}
	}
}

4 ACK message reply tcp_data_snd_check()

In the receiving process, after receiving the ACK, after updating the sending window and congestion window, tcp_data_snd_check() will also be called to check whether new data can be sent.

static inline void tcp_data_snd_check(struct sock *sk)
{
	//Wrapper around __tcp_push_pending_frames() above
	tcp_push_pending_frames(sk);
	//Memory-management related
	tcp_check_space(sk);
}

static inline void tcp_push_pending_frames(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	//Compute the current MSS and delegate to __tcp_push_pending_frames()
	__tcp_push_pending_frames(sk, tcp_current_mss(sk, 1), tp->nonagle);
}

Guess you like

Origin blog.csdn.net/wangquan1992/article/details/109018154