TCP协议实现(主动连接的三次握手)

第一次握手(发送SYN包)

tcp_v4_connect() -> tcp_connect() -> tcp_transmit_skb() -> ip_queue_xmit()

tcp_v4_connect()

state从TCP_CLOSE变成TCP_SYN_SENT

// Active open, step 1 of the three-way handshake: validate the destination
// address, resolve a route, bind source address/port, move the socket from
// TCP_CLOSE to TCP_SYN_SENT and hand off to tcp_connect() to emit the SYN.
// Returns 0 on success or a negative errno.
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
   struct inet_sock *inet = inet_sk(sk);
   struct tcp_sock *tp = tcp_sk(sk);
   struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
   struct rtable *rt;
   __be32 daddr, nexthop;
   int tmp;
   int err;

   if (addr_len < sizeof(struct sockaddr_in)) // address must be large enough for a sockaddr_in
      return -EINVAL;

   if (usin->sin_family != AF_INET) // IPv4 only
      return -EAFNOSUPPORT;

   nexthop = daddr = usin->sin_addr.s_addr; // destination supplied by the caller
   if (inet->opt && inet->opt->srr) { // IP source-route option in use
      if (!daddr)
         return -EINVAL;
      nexthop = inet->opt->faddr; // first hop is dictated by the source route
   }

   // Resolve a route to the next hop (consults the routing cache).
   tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                IPPROTO_TCP,
                inet->sport, usin->sin_port, sk, 1);
   if (tmp < 0) {
      if (tmp == -ENETUNREACH)
         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
      return tmp;
   }

   if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { // TCP cannot connect to multicast/broadcast destinations
      ip_rt_put(rt);
      return -ENETUNREACH;
   }

   if (!inet->opt || !inet->opt->srr) // no source routing:
      daddr = rt->rt_dst; // use the destination chosen by the routing lookup

   if (!inet->saddr)
      inet->saddr = rt->rt_src; // source address not bound yet: take it from the route
   inet->rcv_saddr = inet->saddr; // address we will accept packets on

   if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
      /* Destination changed: reset timestamp state inherited from a
       * previous connection so PAWS does not misfire.
       */
      tp->rx_opt.ts_recent      = 0;
      tp->rx_opt.ts_recent_stamp = 0;
      tp->write_seq        = 0;
   }

   if (tcp_death_row.sysctl_tw_recycle &&
       !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
      struct inet_peer *peer = rt_get_peer(rt);
      /*
       * VJ's idea. We save the last timestamp seen from
       * the destination in the peer table, when entering state
       * TIME-WAIT, and initialize rx_opt.ts_recent from it,
       * when trying a new connection.
       */
      if (peer != NULL &&
          peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
         tp->rx_opt.ts_recent = peer->tcp_ts;
      }
   }

   inet->dport = usin->sin_port; // destination port
   inet->daddr = daddr; // destination address

   inet_csk(sk)->icsk_ext_hdr_len = 0;
   if (inet->opt)
      inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; // account for IP options in the header length

   tp->rx_opt.mss_clamp = 536; // conservative default MSS until the peer advertises one

   /* Socket identity is still unknown (sport may be zero).
    * However we set state to SYN-SENT and not releasing socket
    * lock select source port, enter ourselves into the hash tables and
    * complete initialization after this.
    */
   tcp_set_state(sk, TCP_SYN_SENT); // TCP_CLOSE -> TCP_SYN_SENT
   // inet_hash_connect() -> __inet_hash_connect() -> __inet_check_established() + __inet_hash_nolisten()
   err = inet_hash_connect(&tcp_death_row, sk);
   if (err)
      goto failure;

   // Re-derive the route now that the (possibly auto-selected) source
   // port is known.
   err = ip_route_newports(&rt, IPPROTO_TCP,
            inet->sport, inet->dport, sk);
   if (err)
      goto failure;

   /* OK, now commit destination to socket.  */
   sk->sk_gso_type = SKB_GSO_TCPV4;
   sk_setup_caps(sk, &rt->u.dst);

   if (!tp->write_seq)
      // Choose the initial send sequence number (ISN) from the 4-tuple.
      tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                        inet->daddr,
                        inet->sport,
                        usin->sin_port);

   inet->id = tp->write_seq ^ jiffies;

   err = tcp_connect(sk); // build and transmit the SYN segment
   rt = NULL;
   if (err)
      goto failure;

   return 0;

failure:
   /*
    * This unhashes the socket and releases the local port,
    * if necessary.
    */
   tcp_set_state(sk, TCP_CLOSE);
   ip_rt_put(rt);
   sk->sk_route_caps = 0;
   inet->dport = 0;
   return err;
}

tcp_connect()

// Build and send the initial SYN for an active open, queue the skb for
// retransmission, and arm the retransmit timer.  Called from
// tcp_v4_connect() after the socket has entered SYN-SENT.
// Returns 0 on success, -ENOBUFS if the skb cannot be allocated.
int tcp_connect(struct sock *sk)
{
   struct tcp_sock *tp = tcp_sk(sk);
   struct sk_buff *buff;

   tcp_connect_init(sk);

   // fclone allocation: this skb stays on the write queue and will be
   // cloned by tcp_transmit_skb() for each (re)transmission.
   buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
   if (unlikely(buff == NULL))
      return -ENOBUFS;

   /* Reserve space for headers. */
   skb_reserve(buff, MAX_TCP_HEADER);

   tp->snd_nxt = tp->write_seq;
   tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); // SYN consumes one sequence number
   TCP_ECN_send_syn(sk, buff);

   /* Send it off. */
   TCP_SKB_CB(buff)->when = tcp_time_stamp;
   tp->retrans_stamp = TCP_SKB_CB(buff)->when;
   skb_header_release(buff);
   __tcp_add_write_queue_tail(sk, buff); // keep the SYN queued until it is ACKed
   sk->sk_wmem_queued += buff->truesize;
   sk_mem_charge(sk, buff->truesize);
   tp->packets_out += tcp_skb_pcount(buff);
   tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); // clone_it=1: transmit a clone via tcp_transmit_skb()

   /* We change tp->snd_nxt after the tcp_transmit_skb() call
    * in order to make this packet get counted in tcpOutSegs.
    */
   tp->snd_nxt = tp->write_seq;
   tp->pushed_seq = tp->write_seq;
   TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

   /* Timer for repeating the SYN until an answer. */
   inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
              inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
   return 0;
}

tcp_transmit_skb()

// Core TCP transmit path: optionally clone the queued skb, compute the
// option set, build the TCP header, checksum, and push the segment to
// the network layer via icsk_af_ops->queue_xmit() (ip_queue_xmit() for
// IPv4).  Returns <= 0 on success/soft error, or the value mapped by
// net_xmit_eval() after a local congestion notification.
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
             gfp_t gfp_mask)
{
   const struct inet_connection_sock *icsk = inet_csk(sk);
   struct inet_sock *inet;
   struct tcp_sock *tp;
   struct tcp_skb_cb *tcb;
   struct tcp_out_options opts;
   unsigned tcp_options_size, tcp_header_size;
   struct tcp_md5sig_key *md5;
   __u8 *md5_hash_location;
   struct tcphdr *th;
   int err;

   BUG_ON(!skb || !tcp_skb_pcount(skb));

   /* If congestion control is doing timestamping, we must
    * take such a timestamp before we potentially clone/copy.
    */
   if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
      __net_timestamp(skb);

   if (likely(clone_it)) {
      // Keep the original on the write queue for retransmission;
      // transmit a clone (or a copy if it is already cloned).
      if (unlikely(skb_cloned(skb)))
         skb = pskb_copy(skb, gfp_mask);
      else
         skb = skb_clone(skb, gfp_mask);
      if (unlikely(!skb))
         return -ENOBUFS;
   }

   inet = inet_sk(sk);
   tp = tcp_sk(sk);
   tcb = TCP_SKB_CB(skb);
   memset(&opts, 0, sizeof(opts));

   // SYN segments negotiate options; established-state segments only
   // carry the already-agreed ones.
   if (unlikely(tcb->flags & TCPCB_FLAG_SYN))
      tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
   else
      tcp_options_size = tcp_established_options(sk, skb, &opts,
                        &md5);
   tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

   if (tcp_packets_in_flight(tp) == 0)
      tcp_ca_event(sk, CA_EVENT_TX_START);

   skb_push(skb, tcp_header_size);
   skb_reset_transport_header(skb);
   skb_set_owner_w(skb, sk);

   /* Build TCP header and checksum it. */
   th = tcp_hdr(skb);
   th->source    = inet->sport;
   th->dest      = inet->dport;
   th->seq          = htonl(tcb->seq);
   th->ack_seq       = htonl(tp->rcv_nxt);
   // 16-bit word at byte offset 12 of the TCP header: data offset
   // (header length in 32-bit words) in the top 4 bits, flag bits low.
   *(((__be16 *)th) + 6)  = htons(((tcp_header_size >> 2) << 12) |
               tcb->flags);

   if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
      /* RFC1323: The window in SYN & SYN/ACK segments
       * is never scaled.
       */
      th->window = htons(min(tp->rcv_wnd, 65535U));
   } else {
      th->window = htons(tcp_select_window(sk));
   }
   th->check     = 0;
   th->urg_ptr       = 0;

   /* The urg_mode check is necessary during a below snd_una win probe */
   if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
      if (before(tp->snd_up, tcb->seq + 0x10000)) {
         th->urg_ptr = htons(tp->snd_up - tcb->seq);
         th->urg = 1;
      } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
         th->urg_ptr = 0xFFFF;
         th->urg = 1;
      }
   }

   tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
   if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
      TCP_ECN_send(sk, skb, tcp_header_size);

#ifdef CONFIG_TCP_MD5SIG
   /* Calculate the MD5 hash, as we have all we need now */
   if (md5) {
      sk->sk_route_caps &= ~NETIF_F_GSO_MASK; // MD5 is incompatible with GSO
      tp->af_specific->calc_md5_hash(md5_hash_location,
                      md5, sk, NULL, skb);
   }
#endif

   icsk->icsk_af_ops->send_check(sk, skb->len, skb); // compute/prepare the TCP checksum

   if (likely(tcb->flags & TCPCB_FLAG_ACK))
      tcp_event_ack_sent(sk, tcp_skb_pcount(skb));

   if (skb->len != tcp_header_size)
      tcp_event_data_sent(tp, skb, sk); // segment carries payload

   if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
      TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);

   err = icsk->icsk_af_ops->queue_xmit(skb, 0); // hand to the IP layer: ip_queue_xmit()
   if (likely(err <= 0))
      return err;

   tcp_enter_cwr(sk, 1); // locally dropped (e.g. by the qdisc): enter CWR

   return net_xmit_eval(err);
}

第二次握手(收到SYN+ACK包)

tcp_v4_rcv() -> tcp_v4_do_rcv() -> tcp_rcv_state_process() -> tcp_rcv_synsent_state_process() -> tcp_send_ack()

tcp_rcv_synsent_state_process()

state从TCP_SYN_SENT变成TCP_ESTABLISHED

// Process an incoming segment while in SYN-SENT (step 2 of the active
// handshake).  On a valid SYN+ACK the socket moves to ESTABLISHED and
// the final ACK is sent (or delayed); on a crossed SYN (simultaneous
// open) it moves to SYN-RECV and a SYN+ACK is sent.  Return values:
// 0 = segment consumed, -1 = caller continues processing, 1 = caller
// should send a reset.
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                struct tcphdr *th, unsigned len)
{
   struct tcp_sock *tp = tcp_sk(sk);
   struct inet_connection_sock *icsk = inet_csk(sk);
   int saved_clamp = tp->rx_opt.mss_clamp; // restored on the undo paths below

   tcp_parse_options(skb, &tp->rx_opt, 0);

   if (th->ack) { // ACK bit set
      /* rfc793:
       * "If the state is SYN-SENT then
       *    first check the ACK bit
       *      If the ACK bit is set
       *   If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
       *        a reset (unless the RST bit is set, if so drop
       *        the segment and return)"
       *
       *  We do not send data with SYN, so that RFC-correct
       *  test reduces to:
       */
      if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
         goto reset_and_undo;

      // PAWS-style sanity check: an echoed timestamp must fall between
      // the time the SYN was sent and now.
      if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
          !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
              tcp_time_stamp)) {
         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
         goto reset_and_undo;
      }

      /* Now ACK is acceptable.
       *
       * "If the RST bit is set
       *    If the ACK was acceptable then signal the user "error:
       *    connection reset", drop the segment, enter CLOSED state,
       *    delete TCB, and return."
       */

      if (th->rst) { // RST bit set: connection refused/reset
         tcp_reset(sk);
         goto discard;
      }

      /* rfc793:
       *   "fifth, if neither of the SYN or RST bits is set then
       *    drop the segment and return."
       *
       *    See note below!
       *                                        --ANK(990513)
       */
      if (!th->syn) // ACK without SYN while in SYN-SENT: drop
         goto discard_and_undo;

      /* rfc793:
       *   "If the SYN bit is on ...
       *    are acceptable then ...
       *    (our SYN has been ACKed), change the connection
       *    state to ESTABLISHED..."
       */

      TCP_ECN_rcv_synack(tp, th);

      tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
      tcp_ack(sk, skb, FLAG_SLOWPATH);

      /* Ok.. it's good. Set up sequence numbers and
       * move to established.
       */
      tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; // the peer's SYN consumed one sequence number
      tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

      /* RFC1323: The window in SYN & SYN/ACK segments is
       * never scaled.
       */
      tp->snd_wnd = ntohs(th->window);
      tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

      if (!tp->rx_opt.wscale_ok) {
         // Window scaling not negotiated: no scaling either way.
         tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
         tp->window_clamp = min(tp->window_clamp, 65535U);
      }

      if (tp->rx_opt.saw_tstamp) {
         // Timestamps negotiated: every header carries the option.
         tp->rx_opt.tstamp_ok      = 1;
         tp->tcp_header_len =
            sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
         tp->advmss     -= TCPOLEN_TSTAMP_ALIGNED;
         tcp_store_ts_recent(tp);
      } else {
         tp->tcp_header_len = sizeof(struct tcphdr);
      }

      if (tcp_is_sack(tp) && sysctl_tcp_fack)
         tcp_enable_fack(tp);

      tcp_mtup_init(sk);
      tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
      tcp_initialize_rcv_mss(sk);

      /* Remember, tcp_poll() does not lock socket!
       * Change state from SYN-SENT only after copied_seq
       * is initialized. */
      tp->copied_seq = tp->rcv_nxt;
      smp_mb();
      tcp_set_state(sk, TCP_ESTABLISHED); // TCP_SYN_SENT -> TCP_ESTABLISHED

      security_inet_conn_established(sk, skb);

      /* Make sure socket is routed, for correct metrics.  */
      icsk->icsk_af_ops->rebuild_header(sk);

      tcp_init_metrics(sk);

      tcp_init_congestion_control(sk);

      /* Prevent spurious tcp_cwnd_restart() on first data
       * packet.
       */
      tp->lsndtime = tcp_time_stamp;

      tcp_init_buffer_space(sk);

      if (sock_flag(sk, SOCK_KEEPOPEN))
         inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));

      if (!tp->rx_opt.snd_wscale)
         __tcp_fast_path_on(tp, tp->snd_wnd); // enable the header-prediction fast path
      else
         tp->pred_flags = 0;

      if (!sock_flag(sk, SOCK_DEAD)) {
         // Wake up the process blocked in connect()/poll().
         sk->sk_state_change(sk);
         sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
      }

      if (sk->sk_write_pending ||
          icsk->icsk_accept_queue.rskq_defer_accept ||
          icsk->icsk_ack.pingpong) {
         /* Save one ACK. Data will be ready after
          * several ticks, if write_pending is set.
          *
          * It may be deleted, but with this feature tcpdumps
          * look so _wonderfully_ clever, that I was not able
          * to stand against the temptation 8)     --ANK
          */
         inet_csk_schedule_ack(sk);
         icsk->icsk_ack.lrcvtime = tcp_time_stamp;
         icsk->icsk_ack.ato  = TCP_ATO_MIN;
         tcp_incr_quickack(sk);
         tcp_enter_quickack_mode(sk);
         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                    TCP_DELACK_MAX, TCP_RTO_MAX);

// Shared exit: free the segment and report it consumed.  Jumped to from
// several places in this function.
discard:
         __kfree_skb(skb);
         return 0;
      } else {
         tcp_send_ack(sk); // third handshake step: send the final ACK
      }
      return -1;
   }

   /* No ACK in the segment */

   if (th->rst) { // RST without ACK
      /* rfc793:
       * "If the RST bit is set
       *
       *      Otherwise (no ACK) drop the segment and return."
       */

      goto discard_and_undo;
   }

   /* PAWS check. */
   if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
       tcp_paws_reject(&tp->rx_opt, 0))
      goto discard_and_undo;

   if (th->syn) { // SYN without ACK: simultaneous open
      /* We see SYN without ACK. It is attempt of
       * simultaneous connect with crossed SYNs.
       * Particularly, it can be connect to self.
       */
      tcp_set_state(sk, TCP_SYN_RECV); // TCP_SYN_SENT -> TCP_SYN_RECV

      if (tp->rx_opt.saw_tstamp) {
         tp->rx_opt.tstamp_ok = 1;
         tcp_store_ts_recent(tp);
         tp->tcp_header_len =
            sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
      } else {
         tp->tcp_header_len = sizeof(struct tcphdr);
      }

      tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
      tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

      /* RFC1323: The window in SYN & SYN/ACK segments is
       * never scaled.
       */
      tp->snd_wnd    = ntohs(th->window);
      tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
      tp->max_window = tp->snd_wnd;

      TCP_ECN_rcv_syn(tp, th);

      tcp_mtup_init(sk);
      tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
      tcp_initialize_rcv_mss(sk);

      tcp_send_synack(sk); // answer the crossed SYN with SYN+ACK
#if 0
      /* Note, we could accept data and URG from this segment.
       * There are no obstacles to make this.
       *
       * However, if we ignore data in ACKless segments sometimes,
       * we have no reasons to accept it sometimes.
       * Also, seems the code doing it in step6 of tcp_rcv_state_process
       * is not flawless. So, discard packet for sanity.
       * Uncomment this return to process the data.
       */
      return -1;
#else
      goto discard;
#endif
   }
   /* "fifth, if neither of the SYN or RST bits is set then
    * drop the segment and return."
    */

// Roll back any options picked up from this bogus segment, then drop it.
discard_and_undo:
   tcp_clear_options(&tp->rx_opt);
   tp->rx_opt.mss_clamp = saved_clamp;
   goto discard;

// Roll back options and tell the caller to send a reset.
reset_and_undo:
   tcp_clear_options(&tp->rx_opt);
   tp->rx_opt.mss_clamp = saved_clamp;
   return 1;
}

第三次握手(发送ACK包)

tcp_send_ack() -> tcp_transmit_skb() -> ip_queue_xmit()

tcp_send_ack()

// Send a bare ACK segment (this is the third step of the handshake when
// called from tcp_rcv_synsent_state_process(), and also serves ordinary
// ACKs).  If no memory is available, fall back to scheduling a delayed
// ACK so the peer is eventually acknowledged.
void tcp_send_ack(struct sock *sk)
{
   struct sk_buff *buff;

   /* If we have been reset, we may not send again. */
   if (sk->sk_state == TCP_CLOSE)
      return;

   /* We are not putting this on the write queue, so
    * tcp_transmit_skb() will set the ownership to this
    * sock.
    */
   buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
   if (buff == NULL) {
      // Out of memory: arm the delayed-ACK timer and retry later.
      inet_csk_schedule_ack(sk);
      inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
      inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                 TCP_DELACK_MAX, TCP_RTO_MAX);
      return;
   }

   /* Reserve space for headers and prepare control bits. */
   skb_reserve(buff, MAX_TCP_HEADER);
   tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK);

   /* Send it off, this clears delayed acks for us. */
   TCP_SKB_CB(buff)->when = tcp_time_stamp;
   tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); // clone_it=0: nothing to retransmit, send directly
}

猜你喜欢

转载自blog.csdn.net/hz5034/article/details/80645052