Table of Contents

2.1 Data memory allocation size select_size()

2.2 Data push interface tcp_push / tcp_push_one /__tcp_push_pending_frames

There are many system call interfaces related to TCP protocol sending, such as send(), sendto(), sendmsg() and sendmmsg(). But when it comes to the TCP protocol layer, they are all handled by the kernel's tcp_sendmsg().

1 Send queue

Before looking at the tcp_sendmsg() code, it is necessary to look at the organization and usage of the send queue.
Insert picture description here
Note: sk_send_head points to the first skb that has not yet been sent; it does not cover data that is waiting to be retransmitted.

2 tcp_sendmsg()

The work to be done by this function is to organize the data to be sent by the application into skb, and then send it out as much as possible. The core operations are as follows:

Determine the current MSS (because of path MTU discovery, this value can change dynamically) and size_goal, the maximum amount of data (in bytes) that TCP may place in a single skb. The two values are equal unless TSO is supported, in which case size_goal is an integer multiple of the MSS;
The data copy is organized as two nested loops. The outer loop walks the msg_iov array (the data supplied by the application need not be in one contiguous buffer, e.g. with writev()); the inner loop copies the data of a single array element;
Next, we need to find an skb to copy into. There are two cases: 1) if the last skb in the send queue still has room, fill that skb first; 2) if no existing skb is usable, allocate a new one. Whether an skb can still take data is decided by checking whether the data it already holds has reached size_goal;
After finding the skb, the next step is to decide which area of the skb to copy the data to. The linear buffer is preferred. If the linear buffer has no space left, the data is placed in a page fragment referenced from the skb's frags[] array (provided that the device supports scatter-gather I/O; if it does not, the only option is to allocate another skb and copy the data into its linear buffer);
During the copy process, if necessary, different interfaces will be called to send data.

@msg: the data to be sent;
@size: the amount of data to be sent by this call
/*
 * tcp_sendmsg - copy application data into the socket's send queue and
 * push out as much of it as possible.
 *
 * @iocb: not referenced in this function body.
 * @sock: socket to send on; the struct sock is taken from sock->sk.
 * @msg:  message descriptor; msg_flags, msg_iov and msg_iovlen are read.
 * @size: amount of data the caller wants to send (not referenced in this
 *        function body; the iovec lengths drive the copy loops).
 *
 * Returns the number of bytes copied into the send queue if any byte was
 * copied, otherwise the value produced by sk_stream_error() for the
 * failure. The socket is locked with lock_sock() for the whole call and
 * released on every exit path.
 */
int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t size)
{
	struct sock *sk = sock->sk;
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags;
	int mss_now, size_goal;
	int err, copied;
	long timeo;

	lock_sock(sk);
	TCP_CHECK_TIMER(sk);

	// Obtain the send timeout. MSG_DONTWAIT is passed as the non-blocking
	// indicator; sock_sndtimeo() is expected to return 0 in that case.
	flags = msg->msg_flags;
	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	// Data may only be sent in the ESTABLISHED and CLOSE_WAIT states; in any
	// other state we first wait for the connection to complete.
	// CLOSE_WAIT means the peer's FIN has been received but the local FIN has
	// not been sent yet, so sending is still allowed.
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	// The MSS is re-evaluated on every send call and kept in mss_now.
	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
	// Amount of data one skb is allowed to hold. NOTE(review): presumably
	// equal to the MSS without TSO and a multiple of the MSS with TSO - confirm
	// where xmit_size_goal is computed.
	size_goal = tp->xmit_size_goal;

	// The application data is described by msg as an array: msg_iovlen is the
	// number of elements and msg_iov points to the first element.
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	// copied counts the bytes queued by this call; on success it is the
	// value returned to the caller. Starts at 0.
	copied = 0;

	// Refuse to send if the socket has a pending error or the send direction
	// has been shut down; err is preset to -EPIPE for that case.
	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	// Outer loop: walk the msg_iov array.
	while (--iovlen >= 0) {
		// Each array element may carry a different amount of data; the amount
		// is recorded in the element's own iov_len field.
		int seglen = iov->iov_len;
		// from points to the start of the user data to copy.
		unsigned char __user *from = iov->iov_base;

		// Advance iov to the next array element.
		iov++;
		// Inner loop: copy the data of one array element.
		while (seglen > 0) {
			// copy is the number of bytes to copy in this iteration; it is
			// computed and clamped below depending on the situation.
			int copy;
			// Take the last skb of the send queue: it may hold less than
			// size_goal bytes, in which case it can be filled further.
			skb = tcp_write_queue_tail(sk);

			// cond1: tcp_send_head() returns NULL, i.e. there is no unsent
			//   data queued (there may still be data awaiting acknowledgment).
			// cond2: copy <= 0, i.e. the last skb already holds size_goal
			//   bytes or more and cannot take more data. A negative value is
			//   possible when size_goal shrank between two sends.
			
			// In either case a new skb has to be allocated.
			if (!tcp_send_head(sk) ||
			    (copy = size_goal - skb->len) <= 0) {
new_segment:
				/* Allocate new segment. If the interface is SG,
				 * allocate skb fitting to single page.
				 */
				// Before allocating, check whether the send buffer limit allows
				// it; if not, wait until send buffer space becomes available.
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;
				// Allocate the skb; the return value of select_size() is passed
				// as the size argument (size of the linear area).
				skb = sk_stream_alloc_skb(sk, select_size(sk), sk->sk_allocation);
				// Allocation failed: wait for memory before continuing.
				if (!skb)
					goto wait_for_memory;

				/*
				 * Check whether we can use HW checksum.
				 */
				// If the route advertises any checksum offload capability, mark
				// the skb CHECKSUM_PARTIAL.
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;

				// Append the new skb to the socket's send queue (skb_entail()
				// presumably also updates the memory accounting - confirm).
				skb_entail(sk, skb);
				// A fresh skb may take up to size_goal bytes; whether that much
				// is actually copied depends on how much data remains, see the
				// clamping below.
				copy = size_goal;
			}
			// Never copy more than what is left in the current array element.
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			if (skb_tailroom(skb) > 0) {
				// The linear area of the skb still has room: fill it first.

				// Clamp copy to the room left in the linear area.
				/* We have some space in skb head. Superb! */
				if (copy > skb_tailroom(skb))
					copy = skb_tailroom(skb);
				// Copy the user data; on failure abort via do_fault.
				if ((err = skb_add_data(skb, from, copy)) != 0)
					goto do_fault;
			} else {
				// merge tells whether the new data can be appended to the skb's
				// last page fragment instead of occupying a new one.
				int merge = 0;
				// i is the number of page fragments already in the skb.
				int i = skb_shinfo(skb)->nr_frags;
				// page is the socket's cached send page (may be NULL) and off
				// is the current offset within it.
				struct page *page = TCP_PAGE(sk);
				int off = TCP_OFF(sk);
				// skb_can_coalesce() presumably checks that the skb's last
				// fragment ends exactly at (page, off). If so, and the page is
				// not full, the new data extends that fragment, which saves a
				// frags[] slot.
				if (skb_can_coalesce(skb, i, page, off) && off != PAGE_SIZE) {
					/* We can extend the last page fragment. */
					merge = 1;
				} else if (i == MAX_SKB_FRAGS || (!i && !(sk->sk_route_caps & NETIF_F_SG))) {
					// Either all fragment slots are used, or the skb has no
					// fragments and the route lacks scatter-gather support:
					// no fragment can be added. Mark the skb for push and jump
					// to new_segment to allocate another skb for the data.
					/* Need to add new fragment and cannot
					 * do this because interface is non-SG,
					 * or because all the page slots are
					 * busy. */
					tcp_mark_push(tp, skb);
					goto new_segment;
				} else if (page) {
					// The cached page is completely used: drop the socket's
					// reference and clear the cached page pointer.
					if (off == PAGE_SIZE) {
						put_page(page);
						TCP_PAGE(sk) = page = NULL;
						off = 0;
					}
				} else
					off = 0;
				// Clamp copy to the space left in the current page.
				if (copy > PAGE_SIZE - off)
					copy = PAGE_SIZE - off;
				// Check that charging copy more bytes to the socket's send
				// memory is allowed; otherwise wait for memory.
				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;
				// No usable page: allocate a new one, waiting for memory if
				// the allocation fails.
				if (!page) {
					/* Allocate new cache page. */
					if (!(page = sk_stream_alloc_page(sk)))
						goto wait_for_memory;
				}
				// Copy copy bytes of user data into the page at offset off.
				err = skb_copy_to_page(sk, from, skb, page, off, copy);
				// Copy failure handling.
				if (err) {
					// If the page was freshly allocated, hand it to the socket
					// as its cached page (offset 0) so it is not leaked and can
					// be used by the next send.
					if (!TCP_PAGE(sk)) {
						TCP_PAGE(sk) = page;
						TCP_OFF(sk) = 0;
					}
					goto do_error;
				}

				// Update the skb's fragment bookkeeping.
				if (merge) {
					// Data was appended to the last fragment: only its size
					// needs to grow.
					skb_shinfo(skb)->frags[i - 1].size += copy;
				} else {
					// Occupy a new frags[] slot.
					skb_fill_page_desc(skb, i, page, off, copy);
					if (TCP_PAGE(sk)) {
						// The page is the socket's cached page: the new fragment
						// takes its own reference on it (each fragment holds a
						// reference to its page).
						get_page(page);
					} else if (off + copy < PAGE_SIZE) {
						// The page is newly allocated and not used up by this
						// copy: take a reference and cache it in TCP_PAGE(sk)
						// so the next copy can continue in it.
						get_page(page);
						TCP_PAGE(sk) = page;
					}
				}
				// Record the new offset within the cached page.
				TCP_OFF(sk) = off + copy;
			}//end of 'else'

			// On the first copy of this call, clear the PSH flag on the skb.
			if (!copied)
				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
			// Advance write_seq by the number of bytes just queued.
			tp->write_seq += copy;
			// Advance the skb's end sequence number accordingly.
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			// Advance the user-space source pointer.
			from += copy;
			// Accumulate the number of bytes copied so far.
			copied += copy;
			// All data of the last array element has been copied: finish.
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;
			// The skb is not full yet (or this is an MSG_OOB send): keep
			// copying without pushing.
			if (skb->len < size_goal || (flags & MSG_OOB))
				continue;
			// If forced_push() says a push is due, mark the skb with PSH and
			// transmit pending data.
			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				// Send as much of the queue as possible with TCP_NAGLE_PUSH
				// (presumably bypassing Nagle - confirm).
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				// skb is the send head, i.e. the first not-yet-sent skb:
				// try to send just this one.
				tcp_push_one(sk, mss_now);

			// Continue copying data.
			continue;

wait_for_sndbuf:
			// Flag on the socket that send buffer space ran out.
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			// If some data has already been queued, push it out now; this may
			// help free send buffer space sooner.
			if (copied)
				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
			// Wait for memory to become available; timeo is passed by pointer
			// (presumably this sleeps when timeo is non-zero - confirm).
			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;
			// The MSS may have changed while waiting, so re-read both values.
			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
			size_goal = tp->xmit_size_goal;
		}//end of 'while (seglen > 0)' (inner loop)
	}//end of 'while (--iovlen >= 0)' (outer loop)

out:
	// If any data was queued, attempt a transmission.
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	// Return the number of bytes queued by this call.
	return copied;

do_fault:
	// The copy into the linear area failed; if the skb holds no data at all,
	// unlink it from the send queue and free it.
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	// A partial write is reported as success: return the byte count via out.
	if (copied)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return err;
}

2.1 Data memory allocation size select_size()

The return value of this function determines the size of the linear region of the allocated skb. The discussion below focuses on the TSO scenario: since GSO can be implemented in software, this is the branch that is taken in practice.

/* 128 bytes on top of MAX_HEADER (defined elsewhere; presumably the lower-layer header allowance - confirm). */
#define MAX_TCP_HEADER	(128 + MAX_HEADER)

/*
 * select_size - choose the linear-area size for a newly allocated send skb.
 *
 * Returns:
 *   - the cached MSS when the route has no scatter-gather support;
 *   - 0 when scatter-gather is available and sk_can_gso() is true;
 *   - otherwise SKB_MAX_HEAD(MAX_TCP_HEADER) if the MSS lies between that
 *     value and that value plus (MAX_SKB_FRAGS - 1) pages, else the MSS.
 */
static inline int select_size(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Default answer: the cached MSS. */
	int mss = tp->mss_cache;
	int head_max;

	/* Without scatter-gather the data goes into the linear area. */
	if (!(sk->sk_route_caps & NETIF_F_SG))
		return mss;

	/* GSO-capable socket: request no linear data area at all. */
	if (sk_can_gso(sk))
		return 0;

	/*
	 * NOTE(review): head_max is presumably the largest linear area that
	 * still fits next to a MAX_TCP_HEADER-sized header - confirm against
	 * the definition of SKB_MAX_HEAD(). When the MSS is at least that big
	 * but the remainder still fits in the other MAX_SKB_FRAGS - 1 page
	 * fragments, cap the linear area at head_max.
	 */
	head_max = SKB_MAX_HEAD(MAX_TCP_HEADER);
	if (mss < head_max)
		return mss;
	if (mss > head_max + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
		return mss;

	return head_max;
}

2.2 Data push interface tcp_push / tcp_push_one /__tcp_push_pending_frames

"Linux Kernel Protocol Stack TCP Layer Data Transmission New Data"

linux kernel protocol stack TCP layer data sending system call tcp_sendmsg

1 Send queue

2 tcp_sendmsg()

2.1 Data memory allocation size select_size()

2.2 Data push interface tcp_push / tcp_push_one /__tcp_push_pending_frames

Guess you like