Pila de protocolos del kernel de Linux: recepción de datos TCP por la ruta lenta (slow path)

Tabla de contenido

1 Procesamiento de la ruta lenta en tcp_rcv_established: slow_path

1.1 Verificación de paquetes tcp_validate_incoming

1.2 Encolado de los datos del segmento: tcp_data_queue()


1 Procesamiento de la ruta lenta en tcp_rcv_established: slow_path

/*
 * tcp_rcv_established - receive processing for a segment on a socket;
 * only the slow-path tail of the function is shown in this excerpt.
 *
 * @sk:  socket the segment was delivered to
 * @skb: the received segment
 * @th:  TCP header of the segment
 * @len: segment length; compared against the header length (th->doff << 2)
 *
 * Slow-path return values visible here:
 *   0  - the segment was processed, or it was dropped and freed (checksum
 *        error, or tcp_validate_incoming() returned 0 after freeing it);
 *   1  - tcp_validate_incoming() returned -1 (a SYN inside the window reset
 *        the connection); the skb is NOT freed on this path, so the caller
 *        is presumably expected to dispose of it - TODO confirm.
 */
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
			struct tcphdr *th, unsigned len)
{
	/* ... earlier part of the function omitted in this excerpt ... */
slow_path:
	/* Length and checksum validation: the segment must be at least as
	 * long as its own TCP header and must pass the checksum.
	 */
	if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
		goto csum_error;

	/*
	 *	Standard slow path.
	 */
	/* Validate the segment (PAWS, sequence number, RST, SYN).
	 * res is declared in the omitted part of the function.
	 */
	res = tcp_validate_incoming(sk, skb, th, 1);
	if (res <= 0)
		return -res;

step5:
	/* Process the ACK if present (segments normally carry one, since
	 * piggy-backing an ACK costs nothing).
	 */
	if (th->ack)
		tcp_ack(sk, skb, FLAG_SLOWPATH);

	/* Take an RTT sample from the timestamp option. */
	tcp_rcv_rtt_measure_ts(sk, skb);

	/* Process urgent data. */
	tcp_urg(sk, skb, th);

	/* step 7: process the segment text */
	/* Queue the payload: in-order, duplicate and out-of-order segments
	 * are all sorted out inside tcp_data_queue().
	 */
	tcp_data_queue(sk, skb);

	/* Try to send pending data. */
	tcp_data_snd_check(sk);

	/* Try to send an ACK. */
	tcp_ack_snd_check(sk);
	return 0;

csum_error:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	/* A checksum error is counted and then falls through to discard. */

discard:
	__kfree_skb(skb);
	return 0;
}

1.1 Verificación de paquetes tcp_validate_incoming

/*
 * tcp_validate_incoming - PAWS and sequence-number validation of an
 * incoming segment; the RST and SYN flags steer the outcome.
 *
 * @sk:        socket the segment belongs to
 * @skb:       the received segment
 * @th:        TCP header of the segment
 * @syn_inerr: when non-zero, a SYN inside the window is also counted in
 *             TCP_MIB_INERRS
 *
 * Returns:
 *    1  segment is acceptable, the caller continues processing it;
 *    0  segment was rejected and the skb has been freed here;
 *   -1  a SYN arrived inside the window and tcp_reset() was called; the
 *       skb is NOT freed on this path.
 */
static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
			      struct tcphdr *th, int syn_inerr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* RFC1323: H1. Apply PAWS check first.  A segment that fails PAWS is
	 * answered with a duplicate ACK and dropped - unless it is a RST,
	 * which is accepted even if it did not pass PAWS and therefore simply
	 * continues with the checks below.
	 */
	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
	    tcp_paws_discard(sk, skb) && !th->rst) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
		tcp_send_dupack(sk, skb);
		goto drop_segment;
	}

	/* Step 1: check sequence number.
	 *
	 * RFC793, page 37: "In all states except SYN-SENT, all reset
	 * (RST) segments are validated by checking their SEQ-fields."
	 * And page 69: "If an incoming segment is not acceptable,
	 * an acknowledgment should be sent in reply (unless the RST
	 * bit is set, if so drop the segment and return)".
	 */
	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
		if (!th->rst)
			tcp_send_dupack(sk, skb);
		goto drop_segment;
	}

	/* Step 2: check RST bit.  An acceptable RST resets the connection
	 * and the segment itself is then dropped.
	 */
	if (th->rst) {
		tcp_reset(sk);
		goto drop_segment;
	}

	/* ts_recent update must be made after we are sure that the packet
	 * is in window.
	 */
	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);

	/* step 3: check security and precedence [ignored] */

	/* step 4: Check for a SYN in window.  Anything that is not a SYN at
	 * or beyond rcv_nxt is accepted.
	 */
	if (!th->syn || before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
		return 1;

	/* A SYN on an established connection: account for it and reset. */
	if (syn_inerr)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
	tcp_reset(sk);
	return -1;

drop_segment:
	__kfree_skb(skb);
	return 0;
}

1.2 Encolado de los datos del segmento: tcp_data_queue()

  1. Si el segmento no contiene datos, se libera el skb y se retorna;
  2. Si es el segmento que se esperaba recibir (seq == rcv_nxt): a. se comprueba si la ventana de recepción es cero; b. si hay un proceso de usuario esperando, se copian los datos directamente al espacio de usuario; c. si b no se cumple, o si b no copia por completo los datos de este skb, el skb se añade a la cola de recepción; d. se actualiza el siguiente número de secuencia esperado (rcv_nxt); e. si el segmento lleva el indicador FIN, se procesa el FIN; f. si la cola de segmentos fuera de orden no está vacía, se procesa dicha cola; g. se comprueba y se restablece la ruta rápida; h. se despierta al proceso de usuario para que lea los datos;
  3. Un segmento retransmitido activa el modo de ACK rápido y el skb se libera;
  4. Un segmento situado fuera de la ventana activa el modo de ACK rápido y el skb se libera;
  5. Si el segmento se solapa parcialmente con datos ya recibidos, tras comprobar la ventana cero se ejecuta la parte de (2) que añade el skb a la cola de recepción y los pasos d en adelante;
  6. Los segmentos fuera de orden se insertan en la cola de segmentos fuera de orden (el texto original remite a tcp_data_queue_ofo; en el código mostrado este procesamiento está en línea al final de tcp_data_queue);
/*
 * tcp_data_queue - deliver or queue the payload of an incoming segment.
 *
 * @sk:  socket the segment belongs to
 * @skb: the received segment; this function either queues it (receive
 *       queue or out-of-order queue) or frees it
 *
 * Cases handled, in order:
 *   - no payload (seq == end_seq): the skb is freed;
 *   - in-order segment (seq == rcv_nxt): optionally copied straight to the
 *     waiting reader, otherwise appended to sk_receive_queue; rcv_nxt is
 *     advanced, FIN and the out-of-order queue are processed;
 *   - segment entirely at or below rcv_nxt (retransmission): D-SACK is
 *     recorded, quick-ack mode is entered and the skb is freed;
 *   - segment starting at or beyond the right edge of the receive window:
 *     quick-ack mode is entered and the skb is freed;
 *   - segment overlapping rcv_nxt (seq < rcv_nxt < end_seq): D-SACK for the
 *     head, then handled through the in-order queueing code;
 *   - anything else is an out-of-order segment and is inserted, sorted by
 *     sequence number, into tp->out_of_order_queue.
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct tcp_sock *tp = tcp_sk(sk);
	int eaten = -1;
 
	// This function only deals with segments that carry data, so a segment
	// with an empty sequence range is dropped right away.
	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
		goto drop;
	// Advance the data pointer past the TCP header to the payload.
	__skb_pull(skb, th->doff * 4);
	// ECN-related handling.
	TCP_ECN_accept_cwr(tp, skb);
 
	// Clear a pending D-SACK indication and set the effective SACK count
	// back to the number of regular SACK blocks.
	if (tp->rx_opt.dsack) {
		tp->rx_opt.dsack = 0;
		tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
	}

	/*  Queue data for delivery to the user.
	 *  Packets in sequence go to the receive queue.
	 *  Out of sequence packets to the out_of_order_queue.
	 */
	// Even on the slow path the segment may be exactly the expected one, in
	// which case its data is put on the receive queue or copied directly to
	// user space; this closely mirrors what the fast path does.
	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
		// A zero receive window means there is no room for the data:
		// enter quick-ack mode, schedule an ACK and drop the segment.
		if (tcp_receive_window(tp) == 0)
			goto out_of_window;
 
		// If the current task is the reader waiting on this socket and
		// this segment is exactly the next data to be read, copy it
		// directly into the user's iovec.
		if (tp->ucopy.task == current &&
		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
		    sock_owned_by_user(sk) && !tp->urg_data) {
			int chunk = min_t(unsigned int, skb->len,
					  tp->ucopy.len);

			__set_current_state(TASK_RUNNING);

			local_bh_enable();
			// Perform the copy; on success eaten becomes 1 only if
			// the whole skb was copied and it carries no FIN,
			// otherwise 0.  On failure eaten stays -1.
			if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
				tp->ucopy.len -= chunk;
				tp->copied_seq += chunk;
				eaten = (chunk == skb->len && !th->fin);
				tcp_rcv_space_adjust(sk);
			}
			local_bh_disable();
		}
		// The skb was not fully consumed by a direct copy (no copy was
		// attempted, the copy failed, it was partial, or FIN is set).
		if (eaten <= 0) {
queue_and_out:
			// Also entered by goto from the partial-overlap case
			// below (with eaten still -1).  When nothing was copied,
			// drop the segment if tcp_try_rmem_schedule() reports
			// failure (presumably no receive memory available).
			if (eaten < 0 &&
			    tcp_try_rmem_schedule(sk, skb->truesize))
				goto drop;
			// Append the segment to the socket receive queue.
			skb_set_owner_r(skb, sk);
			__skb_queue_tail(&sk->sk_receive_queue, skb);
		}
		// Advance rcv_nxt past this segment.
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		// New data arrived: run the data-received event handling.
		if (skb->len)
			tcp_event_data_recv(sk, skb);
		// The segment carries FIN: handle connection shutdown.
		if (th->fin)
			tcp_fin(skb, sk, th);
		// If the out-of-order queue is not empty, the new data may have
		// made some of its segments contiguous, so they need to be
		// moved to the receive queue.
		if (!skb_queue_empty(&tp->out_of_order_queue)) {
			// Process the out-of-order queue.
			tcp_ofo_queue(sk);

			/* RFC2581. 4.2. SHOULD send immediate ACK, when
			 * gap in queue is filled.
			 */
			if (skb_queue_empty(&tp->out_of_order_queue))
				inet_csk(sk)->icsk_ack.pingpong = 0;
		}
		// SACK-related handling.
		if (tp->rx_opt.num_sacks)
			tcp_sack_remove(tp);
		// Re-evaluate the header-prediction (fast path) flags.
		tcp_fast_path_check(sk);
		// If the data was fully copied to user space the skb can be
		// freed; otherwise notify the socket that data is readable
		// (unless the socket is already dead).
		if (eaten > 0)
			__kfree_skb(skb);
		else if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_data_ready(sk, 0);
		return;
	}
 
	// Everything below handles segments that are not the expected one.
 
	// end_seq is not after rcv_nxt, so all of this data was already
	// received: the segment is a duplicate.
	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
		/* A retransmit, 2nd most common case.  Force an immediate ack. */
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

		// The two labels below are shared exit paths that are jumped to
		// from elsewhere in this function.
out_of_window:
		tcp_enter_quickack_mode(sk);
		inet_csk_schedule_ack(sk);
drop:
		__kfree_skb(skb);
		return;
	}
 
	// The segment starts at or beyond the right edge of the receive window.
	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
		goto out_of_window;
 
	// From here on the segment overlaps the receive window but does not
	// start exactly at rcv_nxt.
 
	// Enter quick-ack mode.
	tcp_enter_quickack_mode(sk);
 
	// seq is before rcv_nxt (and end_seq is after it, see above): the head
	// of the segment is old data and only its tail is new.
	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
		/* Partial packet, seq < rcv_next < end_seq */
		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq);

		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);

		/* If window is closed, drop tail of packet. But after
		 * remembering D-SACK for its head made in previous line.
		 */
		if (!tcp_receive_window(tp))
			goto out_of_window;
		goto queue_and_out;
	}
	// ECN-related handling.
	TCP_ECN_check_ce(tp, skb);
	// Receive-memory check; drop the segment if it fails.
	if (tcp_try_rmem_schedule(sk, skb->truesize))
		goto drop;
 
	// An out-of-order segment arrived, so header prediction is disabled.
	tp->pred_flags = 0;
	inet_csk_schedule_ack(sk);

	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

	skb_set_owner_r(skb, sk);
	// The out-of-order queue is currently empty.
	if (!skb_peek(&tp->out_of_order_queue)) {
		// SACK-related handling.
		/* Initial out of order segment, build 1 SACK. */
		if (tcp_is_sack(tp)) {
			tp->rx_opt.num_sacks = 1;
			tp->rx_opt.dsack     = 0;
			tp->rx_opt.eff_sacks = 1;
			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
			tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
		}
		// Put the skb on the out-of-order queue.
		__skb_queue_head(&tp->out_of_order_queue, skb);
	} else {
		// The code below inserts the skb into the out-of-order queue.
		// Although the queue holds out-of-order data, it is kept sorted
		// by sequence number, which simplifies moving segments to the
		// receive queue later.  SACK bookkeeping and performance
		// shortcuts are interleaved, which makes it look complicated.
		struct sk_buff *skb1 = tp->out_of_order_queue.prev;
		u32 seq = TCP_SKB_CB(skb)->seq;
		u32 end_seq = TCP_SKB_CB(skb)->end_seq;

		// Shortcut: the new segment directly follows the last queued one.
		if (seq == TCP_SKB_CB(skb1)->end_seq) {
			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);

			if (!tp->rx_opt.num_sacks ||
			    tp->selective_acks[0].end_seq != seq)
				goto add_sack;

			/* Common case: data arrive in order after hole. */
			tp->selective_acks[0].end_seq = end_seq;
			return;
		}

		/* Find place to insert this segment. */
		// Walk backwards from the tail until a segment whose seq is not
		// after the new seq is found, or the list head is reached.
		do {
			if (!after(TCP_SKB_CB(skb1)->seq, seq))
				break;
		} while ((skb1 = skb1->prev) !=
			 (struct sk_buff *)&tp->out_of_order_queue);

		/* Do skb overlap to previous one? */
		if (skb1 != (struct sk_buff *)&tp->out_of_order_queue &&
		    before(seq, TCP_SKB_CB(skb1)->end_seq)) {
			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				/* All the bits are present. Drop. */
				// Only the saved seq/end_seq values are used
				// after the skb has been freed.
				__kfree_skb(skb);
				tcp_dsack_set(sk, seq, end_seq);
				goto add_sack;
			}
			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
				/* Partial overlap. */
				tcp_dsack_set(sk, seq,
					      TCP_SKB_CB(skb1)->end_seq);
			} else {
				// Same starting seq as skb1 but extending past
				// it: insert before skb1 so the loop below can
				// remove the segments the new one covers.
				skb1 = skb1->prev;
			}
		}
		__skb_queue_after(&tp->out_of_order_queue, skb1, skb);

		/* And clean segments covered by new one as whole. */
		while ((skb1 = skb->next) !=
		       (struct sk_buff *)&tp->out_of_order_queue &&
		       after(end_seq, TCP_SKB_CB(skb1)->seq)) {
			// The next segment is only partially covered: record
			// the overlap as D-SACK and stop.
			if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
						 end_seq);
				break;
			}
			// The next segment is fully covered: unlink and free it.
			__skb_unlink(skb1, &tp->out_of_order_queue);
			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
					 TCP_SKB_CB(skb1)->end_seq);
			__kfree_skb(skb1);
		}

add_sack:
		// Record the new out-of-order range in the SACK blocks.
		if (tcp_is_sack(tp))
			tcp_sack_new_ofo_skb(sk, seq, end_seq);
	}
}

Supongo que te gusta

Origin blog.csdn.net/wangquan1992/article/details/109061754
Recomendado
Clasificación