TCP Protocol Implementation (Three-Way Handshake, Passive Open)

The protocol stack's state transitions differ slightly from the standard TCP state diagram:

On the third handshake, the state is set to TCP_SYN_RECV in tcp_v4_syn_recv_sock() -> tcp_create_openreq_child() -> inet_csk_clone(), and then to TCP_ESTABLISHED in tcp_child_process() -> tcp_rcv_state_process(). In the standard diagram the server enters SYN-RECEIVED as soon as the SYN arrives; in Linux the listening socket stays in TCP_LISTEN and only a lightweight request_sock is queued, so the child sock is born in TCP_SYN_RECV only when the final ACK is processed.
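
For orientation, these kernel-side states map onto the ordinary passive open in user space. A minimal sketch (error handling omitted; the port number is an arbitrary example):

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int passive_open(void)
{
   struct sockaddr_in addr;
   int lfd, cfd;

   lfd = socket(AF_INET, SOCK_STREAM, 0); /* sk_state = TCP_CLOSE */

   memset(&addr, 0, sizeof(addr));
   addr.sin_family = AF_INET;
   addr.sin_addr.s_addr = htonl(INADDR_ANY);
   addr.sin_port = htons(8080); /* arbitrary example port */
   bind(lfd, (struct sockaddr *)&addr, sizeof(addr));

   listen(lfd, 128); /* sk_state = TCP_LISTEN */

   /* The handshake below runs entirely in the kernel: SYN -> a
    * request_sock is queued (the listener stays in TCP_LISTEN),
    * SYN+ACK is sent, and the final ACK creates the child sock in
    * TCP_SYN_RECV, which is immediately moved to TCP_ESTABLISHED.
    * accept() merely dequeues the already-established child. */
   cfd = accept(lfd, NULL, NULL);
   close(cfd);
   return close(lfd);
}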

First handshake (receiving the SYN)

tcp_v4_rcv() -> tcp_v4_do_rcv() -> tcp_rcv_state_process() -> tcp_v4_conn_request()

tcp_rcv_state_process()

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
           struct tcphdr *th, unsigned len)
{
   struct tcp_sock *tp = tcp_sk(sk);
   struct inet_connection_sock *icsk = inet_csk(sk);
   int queued = 0;
   int res;

   tp->rx_opt.saw_tstamp = 0;

   switch (sk->sk_state) {
   case TCP_CLOSE:
      goto discard;

   case TCP_LISTEN:
      if (th->ack) // ACK set: return 1, so the caller replies with tcp_v4_send_reset() and drops the packet
         return 1;

      if (th->rst) // RST set: drop the packet
         goto discard;

      if (th->syn) { // SYN set
         if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) // tcp_v4_conn_request()
            return 1;

         /* Now we have several options: In theory there is
          * nothing else in the frame. KA9Q has an option to
          * send data with the syn, BSD accepts data with the
          * syn up to the [to be] advertised window and
          * Solaris 2.1 gives you a protocol error. For now
          * we just ignore it, that fits the spec precisely
          * and avoids incompatibilities. It would be nice in
          * future to drop through and process the data.
          *
          * Now that TTCP is starting to be used we ought to
          * queue this data.
          * But, this leaves one open to an easy denial of
          * service attack, and SYN cookies can't defend
          * against this problem. So, we drop the data
          * in the interest of security over speed unless
          * it's still in use.
          */
         kfree_skb(skb);
         return 0;
      }
      goto discard;

   case TCP_SYN_SENT:
      queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
      if (queued >= 0)
         return queued;

      /* Do step6 onward by hand. */
      tcp_urg(sk, skb, th);
      __kfree_skb(skb);
      tcp_data_snd_check(sk);
      return 0;
   }

   res = tcp_validate_incoming(sk, skb, th, 0);
   if (res <= 0)
      return -res;

   /* step 5: check the ACK field */
   if (th->ack) {
      int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;

      switch (sk->sk_state) {
      case TCP_SYN_RECV:
         if (acceptable) {
            tp->copied_seq = tp->rcv_nxt;
            smp_mb();
            tcp_set_state(sk, TCP_ESTABLISHED); // move to TCP_ESTABLISHED
            sk->sk_state_change(sk);

            /* Note, that this wakeup is only for marginal
             * crossed SYN case. Passively open sockets
             * are not waked up, because sk->sk_sleep ==
             * NULL and sk->sk_socket == NULL.
             */
            if (sk->sk_socket)
               sk_wake_async(sk,
                        SOCK_WAKE_IO, POLL_OUT);

            tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
            tp->snd_wnd = ntohs(th->window) <<
                     tp->rx_opt.snd_wscale;
            tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

            /* tcp_ack considers this ACK as duplicate
             * and does not calculate rtt.
             * Force it here.
             */
            tcp_ack_update_rtt(sk, 0, 0);

            if (tp->rx_opt.tstamp_ok)
               tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

            /* Make sure socket is routed, for
             * correct metrics.
             */
            icsk->icsk_af_ops->rebuild_header(sk);

            tcp_init_metrics(sk);

            tcp_init_congestion_control(sk);

            /* Prevent spurious tcp_cwnd_restart() on
             * first data packet.
             */
            tp->lsndtime = tcp_time_stamp;

            tcp_mtup_init(sk);
            tcp_initialize_rcv_mss(sk);
            tcp_init_buffer_space(sk);
            tcp_fast_path_on(tp);
         } else {
            return 1;
         }
         break;

      case TCP_FIN_WAIT1:
         if (tp->snd_una == tp->write_seq) {
            tcp_set_state(sk, TCP_FIN_WAIT2);
            sk->sk_shutdown |= SEND_SHUTDOWN;
            dst_confirm(sk->sk_dst_cache);

            if (!sock_flag(sk, SOCK_DEAD))
               /* Wake up lingering close() */
               sk->sk_state_change(sk);
            else {
               int tmo;

               if (tp->linger2 < 0 ||
                   (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
                    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
                  tcp_done(sk);
                  NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                  return 1;
               }

               tmo = tcp_fin_time(sk);
               if (tmo > TCP_TIMEWAIT_LEN) {
                  inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
               } else if (th->fin || sock_owned_by_user(sk)) {
                  /* Bad case. We could lose such FIN otherwise.
                   * It is not a big problem, but it looks confusing
                   * and not so rare event. We still can lose it now,
                   * if it spins in bh_lock_sock(), but it is really
                   * marginal case.
                   */
                  inet_csk_reset_keepalive_timer(sk, tmo);
               } else {
                  tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                  goto discard;
               }
            }
         }
         break;

      case TCP_CLOSING:
         if (tp->snd_una == tp->write_seq) {
            tcp_time_wait(sk, TCP_TIME_WAIT, 0);
            goto discard;
         }
         break;

      case TCP_LAST_ACK:
         if (tp->snd_una == tp->write_seq) {
            tcp_update_metrics(sk);
            tcp_done(sk);
            goto discard;
         }
         break;
      }
   } else
      goto discard;

   /* step 6: check the URG bit */
   tcp_urg(sk, skb, th);

   /* step 7: process the segment text */
   switch (sk->sk_state) {
   case TCP_CLOSE_WAIT:
   case TCP_CLOSING:
   case TCP_LAST_ACK:
      if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
         break;
   case TCP_FIN_WAIT1:
   case TCP_FIN_WAIT2:
      /* RFC 793 says to queue data in these states,
       * RFC 1122 says we MUST send a reset.
       * BSD 4.4 also does reset.
       */
      if (sk->sk_shutdown & RCV_SHUTDOWN) {
         if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
             after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
            NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
            tcp_reset(sk);
            return 1;
         }
      }
      /* Fall through */
   case TCP_ESTABLISHED:
      tcp_data_queue(sk, skb);
      queued = 1;
      break;
   }

   /* tcp_data could move socket to TIME-WAIT */
   if (sk->sk_state != TCP_CLOSE) {
      tcp_data_snd_check(sk);
      tcp_ack_snd_check(sk);
   }

   if (!queued) {
discard:
      __kfree_skb(skb);
   }
   return 0;
}

tcp_v4_conn_request()

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
   struct inet_request_sock *ireq;
   struct tcp_options_received tmp_opt;
   struct request_sock *req;
   __be32 saddr = ip_hdr(skb)->saddr;
   __be32 daddr = ip_hdr(skb)->daddr;
   __u32 isn = TCP_SKB_CB(skb)->when;
   struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
   int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

   /* Never answer to SYNs sent to broadcast or multicast */
   if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
      goto drop;

   /* TW buckets are converted to open requests without
    * limitations, they conserve resources and peer is
    * evidently real one.
    */
   // the SYN (half-open) queue is full
   if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
      if (sysctl_tcp_syncookies) {
         want_cookie = 1;
      } else
#endif
      goto drop;
   }

   /* Accept backlog is full. If we have already queued enough
    * of warm entries in syn queue, drop request. It is better than
    * clogging syn queue with openreqs with exponentially increasing
    * timeout.
    */
   // the accept queue is full and more than one queued request is still "young" (its SYN+ACK has not been retransmitted)
   if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
      goto drop;

   req = inet_reqsk_alloc(&tcp_request_sock_ops); // allocate a request_sock
   if (!req)
      goto drop;

#ifdef CONFIG_TCP_MD5SIG
   tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

   tcp_clear_options(&tmp_opt);
   tmp_opt.mss_clamp = 536;
   tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

   tcp_parse_options(skb, &tmp_opt, 0);

   if (want_cookie && !tmp_opt.saw_tstamp)
      tcp_clear_options(&tmp_opt);

   tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

   // initializes ireq->rmt_port and ireq->loc_port
   tcp_openreq_init(req, &tmp_opt, skb);

   ireq = inet_rsk(req);
   // initialize ireq->loc_addr and ireq->rmt_addr
   ireq->loc_addr = daddr;
   ireq->rmt_addr = saddr;
   ireq->no_srccheck = inet_sk(sk)->transparent;
   ireq->opt = tcp_v4_save_options(sk, skb);

   if (security_inet_conn_request(sk, skb, req))
      goto drop_and_free;

   if (!want_cookie)
      TCP_ECN_create_request(req, tcp_hdr(skb));

   if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
      syn_flood_warning(skb);
      req->cookie_ts = tmp_opt.tstamp_ok;
#endif
      isn = cookie_v4_init_sequence(sk, skb, &req->mss); // compute the SYN cookie and use it as the ISN
   } else if (!isn) { // both want_cookie and isn are 0
      struct inet_peer *peer = NULL;

      /* VJ's idea. We save last timestamp seen
       * from the destination in peer table, when entering
       * state TIME-WAIT, and check against it before
       * accepting new connection request.
       *
       * If "isn" is not zero, this request hit alive
       * timewait bucket, so that all the necessary checks
       * are made in the function processing timewait state.
       */
      if (tmp_opt.saw_tstamp &&
          tcp_death_row.sysctl_tw_recycle &&
          (dst = inet_csk_route_req(sk, req)) != NULL &&
          (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
          peer->v4daddr == saddr) {
         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
             (s32)(peer->tcp_ts - req->ts_recent) >
                     TCP_PAWS_WINDOW) {
            NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
            goto drop_and_release;
         }
      }
      /* Kill the following clause, if you dislike this way. */
      else if (!sysctl_tcp_syncookies &&
          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
           (sysctl_max_syn_backlog >> 2)) &&
          (!peer || !peer->tcp_ts_stamp) &&
          (!dst || !dst_metric(dst, RTAX_RTT))) {
         /* Without syncookies last quarter of
          * backlog is filled with destinations,
          * proven to be alive.
          * It means that we continue to communicate
          * to destinations, already remembered
          * to the moment of synflood.
          */
         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
                   &saddr, ntohs(tcp_hdr(skb)->source));
         goto drop_and_release;
      }

      isn = tcp_v4_init_sequence(skb); // compute the ISN from the 4-tuple
   }
   tcp_rsk(req)->snt_isn = isn;

   if (__tcp_v4_send_synack(sk, req, dst) || want_cookie) // send the SYN+ACK
      goto drop_and_free;

   // add req to the SYN (half-open) queue
   inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
   return 0;

drop_and_release:
   dst_release(dst);
drop_and_free:
   reqsk_free(req);
drop:
   return 0;
}
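
tcp_v4_init_sequence() boils down to secure_tcp_sequence_number(): a keyed hash of the 4-tuple plus a clock component, in the spirit of RFC 1948, so ISNs advance over time yet stay unpredictable across connections. Below is a simplified user-space sketch of that idea only; mix() is a toy stand-in for the kernel's cryptographic hash, and secret/usec_clock are assumed to be supplied by the caller:

#include <stdint.h>

/* Toy mixing step standing in for the kernel's cryptographic hash. */
static uint32_t mix(uint32_t h, uint32_t v)
{
   h ^= v;
   h *= 0x9e3779b1; /* arbitrary odd constant */
   return (h << 13) | (h >> 19);
}

/* RFC 1948-style ISN: hash(4-tuple, secret) + clock, so connections
 * with different tuples get unrelated sequence spaces, while one
 * tuple's ISN still advances monotonically with time. `secret` would
 * be drawn once at boot; `usec_clock` is a scaled microsecond counter. */
uint32_t isn_sketch(uint32_t saddr, uint32_t daddr,
          uint16_t sport, uint16_t dport,
          uint32_t secret, uint32_t usec_clock)
{
   uint32_t h = secret;

   h = mix(h, saddr);
   h = mix(h, daddr);
   h = mix(h, ((uint32_t)sport << 16) | dport);
   return h + usec_clock;
}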

Second handshake (sending the SYN+ACK)

__tcp_v4_send_synack() -> tcp_make_synack() + ip_build_and_send_pkt()

__tcp_v4_send_synack()

static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
            struct dst_entry *dst)
{
   const struct inet_request_sock *ireq = inet_rsk(req);
   int err = -1;
   struct sk_buff * skb;

   /* First, grab a route. */
   if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
      return -1;

   skb = tcp_make_synack(sk, dst, req); // build the SYN+ACK segment

   if (skb) {
      struct tcphdr *th = tcp_hdr(skb);

      th->check = tcp_v4_check(skb->len,
                ireq->loc_addr,
                ireq->rmt_addr,
                csum_partial(th, skb->len,
                        skb->csum));

      err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                   ireq->rmt_addr,
                   ireq->opt); // build the IP header and send the packet
      err = net_xmit_eval(err);
   }

   dst_release(dst);
   return err;
}
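
tcp_v4_check() above folds the partial one's-complement sum from csum_partial() into the IPv4 pseudo-header. A self-contained sketch of the same RFC 793 computation (this is not the kernel code, whose checksum helpers are architecture-optimized):

#include <netinet/in.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t sum16(const void *data, size_t len, uint32_t sum)
{
   const uint8_t *p = data;

   while (len > 1) {
      sum += (uint32_t)p[0] << 8 | p[1]; /* big-endian 16-bit word */
      p += 2;
      len -= 2;
   }
   if (len) /* odd trailing byte, padded with zero */
      sum += (uint32_t)p[0] << 8;
   return sum;
}

/* Checksum over the IPv4 pseudo-header plus the TCP segment (header
 * and payload). saddr/daddr are in network byte order and th->check
 * must be zero inside `seg`. The result is in host order; store it
 * into the header with htons(). */
uint16_t tcp_v4_checksum(uint32_t saddr, uint32_t daddr,
           const void *seg, size_t len)
{
   uint8_t pseudo[12];
   uint32_t sum;

   memcpy(pseudo, &saddr, 4); /* already network order */
   memcpy(pseudo + 4, &daddr, 4);
   pseudo[8] = 0;
   pseudo[9] = IPPROTO_TCP;
   pseudo[10] = (uint8_t)(len >> 8); /* TCP length, big-endian */
   pseudo[11] = (uint8_t)len;

   sum = sum16(pseudo, sizeof(pseudo), 0);
   sum = sum16(seg, len, sum);
   while (sum >> 16) /* fold carries back in */
      sum = (sum & 0xffff) + (sum >> 16);
   return (uint16_t)~sum;
}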

tcp_make_synack()

struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
            struct request_sock *req)
{
   struct inet_request_sock *ireq = inet_rsk(req);
   struct tcp_sock *tp = tcp_sk(sk);
   struct tcphdr *th;
   int tcp_header_size;
   struct tcp_out_options opts;
   struct sk_buff *skb;
   struct tcp_md5sig_key *md5;
   __u8 *md5_hash_location;
   int mss;

   /* #define MAX_TCP_HEADER (128 + MAX_HEADER)
       #define MAX_HEADER LL_MAX_HEADER
       #define LL_MAX_HEADER 32 */
   skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
   if (skb == NULL)
      return NULL;

   /* Reserve space for headers. */
   skb_reserve(skb, MAX_TCP_HEADER);

   skb_dst_set(skb, dst_clone(dst));

   mss = dst_metric(dst, RTAX_ADVMSS);
   if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
      mss = tp->rx_opt.user_mss;

   if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
      __u8 rcv_wscale;
      /* Set this up on the first call only */
      req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
      /* tcp_full_space because it is guaranteed to be the first packet */
      tcp_select_initial_window(tcp_full_space(sk),
         mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
         &req->rcv_wnd,
         &req->window_clamp,
         ireq->wscale_ok,
         &rcv_wscale);
      ireq->rcv_wscale = rcv_wscale;
   }

   memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
   if (unlikely(req->cookie_ts))
      TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
   else
#endif
   TCP_SKB_CB(skb)->when = tcp_time_stamp;
   tcp_header_size = tcp_synack_options(sk, req, mss,
                    skb, &opts, &md5) +
           sizeof(struct tcphdr); // compute the TCP header length

   skb_push(skb, tcp_header_size);
   skb_reset_transport_header(skb);

   th = tcp_hdr(skb);
   memset(th, 0, sizeof(struct tcphdr)); // zero the TCP header
   th->syn = 1; // SYN
   th->ack = 1; // ACK
   TCP_ECN_make_synack(req, th); // ECE, if ECN was requested
   th->source = ireq->loc_port; // source port
   th->dest = ireq->rmt_port; // destination port
   /* Setting of flags are superfluous here for callers (and ECE is
    * not even correctly set)
    */
   tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
              TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); // record seq and the SYN|ACK flags in the skb control block
   th->seq = htonl(TCP_SKB_CB(skb)->seq); // sequence number: our ISN
   th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); // acknowledgment number: the client's ISN + 1

   /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
   th->window = htons(min(req->rcv_wnd, 65535U)); // window size
   tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); // write the TCP options
   th->doff = (tcp_header_size >> 2); // header length in 32-bit words
   TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);

#ifdef CONFIG_TCP_MD5SIG
   /* Okay, we have all we need - do the md5 hash if needed */
   if (md5) {
      tcp_rsk(req)->af_specific->calc_md5_hash(md5_hash_location,
                      md5, NULL, req, skb);
   }
#endif

   return skb;
}
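
Note the RFC 1323 rule above: the window field of a SYN or SYN+ACK goes out unscaled, and only later segments are interpreted through the negotiated shift, exactly as tcp_rcv_state_process() did earlier with tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale. A toy illustration of the arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint16_t wire_window = 65535; /* the 16-bit header field */
   uint8_t snd_wscale = 7;       /* shift negotiated via the wscale option */

   /* In SYN/SYN+ACK segments the field is taken literally... */
   uint32_t syn_window = wire_window;
   /* ...afterwards it is shifted by the peer's announced scale. */
   uint32_t data_window = (uint32_t)wire_window << snd_wscale;

   printf("SYN window %u, scaled window %u\n", syn_window, data_window);
   return 0; /* prints 65535 and 8388480 */
}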

ip_build_and_send_pkt()

int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
           __be32 saddr, __be32 daddr, struct ip_options *opt)
{
   struct inet_sock *inet = inet_sk(sk);
   struct rtable *rt = skb_rtable(skb);
   struct iphdr *iph;

   /* Build the IP header. */
   skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
   skb_reset_network_header(skb);
   iph = ip_hdr(skb);
   iph->version  = 4; // version
   iph->ihl      = 5; // header length in 32-bit words (no options yet)
   iph->tos      = inet->tos; // type of service
   if (ip_dont_fragment(sk, &rt->u.dst))
      iph->frag_off = htons(IP_DF);
   else
      iph->frag_off = 0; // 3-bit flags + 13-bit fragment offset
   iph->ttl      = ip_select_ttl(inet, &rt->u.dst); // time to live
   iph->daddr    = rt->rt_dst; // destination IP
   iph->saddr    = rt->rt_src; // source IP
   iph->protocol = sk->sk_protocol; // protocol (IPPROTO_TCP here)
   ip_select_ident(iph, &rt->u.dst, sk); // set iph->id

   if (opt && opt->optlen) {
      iph->ihl += opt->optlen>>2; // account for the IP options in the header length
      ip_options_build(skb, opt, daddr, rt, 0);
   }

   skb->priority = sk->sk_priority;
   skb->mark = sk->sk_mark;

   /* Send it out. */
   /* ip_local_out() -> __ip_local_out() -> ip_send_check():
      __ip_local_out() fills in iph->tot_len, ip_send_check() fills in iph->check */
   return ip_local_out(skb);
}
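
ip_send_check(), named in the closing comment, fills iph->check with the RFC 791 header checksum, computed over the IP header alone. A minimal standalone equivalent (the kernel's ip_fast_csum() is hand-tuned assembly on most architectures; this sketch only shows the math):

#include <stdint.h>

/* One's-complement sum of the header's 16-bit words, complemented.
 * `iph` points at the IP header, `ihl` is the header length in
 * 32-bit words (iph->ihl), and the checksum field must be zeroed
 * first. Because the internet checksum is byte-order independent,
 * the result can be stored back into the header as-is. */
uint16_t ip_checksum(const void *iph, unsigned int ihl)
{
   const uint16_t *p = iph;
   uint32_t sum = 0;
   unsigned int i;

   for (i = 0; i < ihl * 2; i++) /* ihl*2 16-bit words */
      sum += p[i];
   while (sum >> 16) /* fold carries back in */
      sum = (sum & 0xffff) + (sum >> 16);
   return (uint16_t)~sum;
}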

Third handshake (receiving the ACK)

tcp_v4_rcv() -> tcp_v4_do_rcv() -> tcp_v4_hnd_req() + tcp_child_process()
tcp_v4_hnd_req() -> tcp_check_req() -> tcp_v4_syn_recv_sock()
tcp_child_process() -> tcp_rcv_state_process()

tcp_v4_hnd_req()

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
   struct tcphdr *th = tcp_hdr(skb);
   const struct iphdr *iph = ip_hdr(skb);
   struct sock *nsk;
   struct request_sock **prev;
   /* Find possible connection requests. */
   // first search the listening socket's syn_table (the half-open queue)
   struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                         iph->saddr, iph->daddr);
   if (req) // found a matching tcp_request_sock
      // tcp_check_req() handles the third-handshake ACK (via tcp_v4_syn_recv_sock()) and returns the connected sock
      return tcp_check_req(sk, skb, req, prev);

   // then search ehash, in case another process has already moved the req there
   nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
         th->source, iph->daddr, th->dest, inet_iif(skb));

   if (nsk) { // found an established tcp_sock
      if (nsk->sk_state != TCP_TIME_WAIT) {
         bh_lock_sock(nsk);
         return nsk; // return the connected sock
      }
      inet_twsk_put(inet_twsk(nsk));
      return NULL;
   }

#ifdef CONFIG_SYN_COOKIES
   if (!th->rst && !th->syn && th->ack)
      sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
   // if neither is found, return the listening sock
   return sk;
}
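
On the SYN-cookie path, cookie_v4_check() reconstructs the connection from the ACK number alone: the cookie packs a coarse time counter and an index into a small table of common MSS values under secret-keyed hashes. The sketch below is a simplified model of that layout, not the kernel's actual cookie format; H() is a toy hash standing in for the kernel's secret-keyed cryptographic hashes, and t is a coarse 8-bit time counter:

#include <stdint.h>

#define COOKIE_BITS 24
#define COOKIE_MASK ((1u << COOKIE_BITS) - 1)

/* Toy keyed hash of the 4-tuple (the kernel keys real hashes with
 * boot-time random secrets). */
static uint32_t H(uint32_t saddr, uint32_t daddr, uint32_t ports, uint32_t c)
{
   uint32_t h = 0x811c9dc5 ^ c;

   h = (h ^ saddr) * 0x01000193;
   h = (h ^ daddr) * 0x01000193;
   h = (h ^ ports) * 0x01000193;
   return h;
}

/* Encode: the top 8 bits carry the time counter, the low 24 bits the
 * MSS index, both blinded by hashes so clients cannot forge cookies. */
uint32_t cookie_encode(uint32_t saddr, uint32_t daddr, uint32_t ports,
             uint32_t client_isn, uint8_t t, uint32_t mssind)
{
   return H(saddr, daddr, ports, 0) + client_isn +
          ((uint32_t)t << COOKIE_BITS) +
          ((H(saddr, daddr, ports, t) + mssind) & COOKIE_MASK);
}

/* Decode on the final ACK: strip the outer hash, reject stale
 * cookies, then recover the MSS index (-1 means invalid). */
int cookie_decode(uint32_t cookie, uint32_t saddr, uint32_t daddr,
        uint32_t ports, uint32_t client_isn, uint8_t now)
{
   uint8_t t;

   cookie -= H(saddr, daddr, ports, 0) + client_isn;
   t = (uint8_t)(cookie >> COOKIE_BITS);
   if ((uint8_t)(now - t) > 1) /* accept current and previous tick only */
      return -1;
   return (int)((cookie - H(saddr, daddr, ports, t)) & COOKIE_MASK);
}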

tcp_check_req()

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
            struct request_sock *req,
            struct request_sock **prev)
{
   const struct tcphdr *th = tcp_hdr(skb);
   __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); // keep only the RST, SYN and ACK bits
   int paws_reject = 0;
   struct tcp_options_received tmp_opt;
   struct sock *child;

   tmp_opt.saw_tstamp = 0;
   if (th->doff > (sizeof(struct tcphdr)>>2)) {
      tcp_parse_options(skb, &tmp_opt, 0);

      if (tmp_opt.saw_tstamp) {
         tmp_opt.ts_recent = req->ts_recent;
         /* We do not store true stamp, but it is not required,
          * it can be estimated (approximately)
          * from another data.
          */
         tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
         paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
      }
   }

   /* Check for pure retransmitted SYN. */
   if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
       flg == TCP_FLAG_SYN &&
       !paws_reject) { // a pure retransmitted SYN
      /*
       * RFC793 draws (Incorrectly! It was fixed in RFC1122)
       * this case on figure 6 and figure 8, but formal
       * protocol description says NOTHING.
       * To be more exact, it says that we should send ACK,
       * because this segment (at least, if it has no data)
       * is out of window.
       *
       *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
       *  describe SYN-RECV state. All the description
       *  is wrong, we cannot believe to it and should
       *  rely only on common sense and implementation
       *  experience.
       *
       * Enforce "SYN-ACK" according to figure 8, figure 6
       * of RFC793, fixed by RFC1122.
       */
      req->rsk_ops->rtx_syn_ack(sk, req); // tcp_v4_send_synack()
      return NULL;
   }

   /* Further reproduces section "SEGMENT ARRIVES"
      for state SYN-RECEIVED of RFC793.
      It is broken, however, it does not work only
      when SYNs are crossed.

      You would think that SYN crossing is impossible here, since
      we should have a SYN_SENT socket (from connect()) on our end,
      but this is not true if the crossed SYNs were sent to both
      ends by a malicious third party.  We must defend against this,
      and to do that we first verify the ACK (as per RFC793, page
      36) and reset if it is invalid.  Is this a true full defense?
      To convince ourselves, let us consider a way in which the ACK
      test can still pass in this 'malicious crossed SYNs' case.
      Malicious sender sends identical SYNs (and thus identical sequence
      numbers) to both A and B:

      A: gets SYN, seq=7
      B: gets SYN, seq=7

      By our good fortune, both A and B select the same initial
      send sequence number of seven :-)

      A: sends SYN|ACK, seq=7, ack_seq=8
      B: sends SYN|ACK, seq=7, ack_seq=8

      So we are now A eating this SYN|ACK, ACK test passes.  So
      does sequence test, SYN is truncated, and thus we consider
      it a bare ACK.

      If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
      bare ACK.  Otherwise, we create an established connection.  Both
      ends (listening sockets) accept the new incoming connection and try
      to talk to each other. 8-)

      Note: This case is both harmless, and rare.  Possibility is about the
      same as us discovering intelligent life on another planet tomorrow.

      But generally, we should (RFC lies!) to accept ACK
      from SYNACK both here and in tcp_rcv_state_process().
      tcp_rcv_state_process() does not, hence, we do not too.

      Note that the case is absolutely generic:
      we cannot optimize anything here without
      violating protocol. All the checks must be made
      before attempt to create socket.
    */

   /* RFC793 page 36: "If the connection is in any non-synchronized state ...
    *                  and the incoming segment acknowledges something not yet
    *                  sent (the segment carries an unacceptable ACK) ...
    *                  a reset is sent."
    *
    * Invalid ACK: reset will be sent by listening socket
    */
   if ((flg & TCP_FLAG_ACK) &&
       (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) // ACK set but ack_seq is wrong: return the listener, whose TCP_LISTEN path sends the RST
      return sk;

   /* Also, it would be not so bad idea to check rcv_tsecr, which
    * is essentially ACK extension and too early or too late values
    * should cause reset in unsynchronized states.
    */

   /* RFC793: "first check sequence number". */

   if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
      /* Out of window: send ACK and drop. */
      if (!(flg & TCP_FLAG_RST)) // RST not set
         req->rsk_ops->send_ack(sk, skb, req); // tcp_v4_reqsk_send_ack()
      if (paws_reject)
         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
      return NULL;
   }

   /* In sequence, PAWS is OK. */

   // TCP_SKB_CB(skb)->seq <= tcp_rsk(req)->rcv_isn + 1
   if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
      req->ts_recent = tmp_opt.rcv_tsval;

   if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
      /* Truncate SYN, it is out of window starting
         at tcp_rsk(req)->rcv_isn + 1. */
      flg &= ~TCP_FLAG_SYN;
   }

   /* RFC793: "second check the RST bit" and
    *    "fourth, check the SYN bit"
    */
   if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { // RST or SYN set
      TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
      goto embryonic_reset;
   }

   /* ACK sequence verified above, just make sure ACK is
    * set.  If ACK not set, just silently drop the packet.
    */
   if (!(flg & TCP_FLAG_ACK)) // ACK not set
      return NULL;

   /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
   if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
       TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
      inet_rsk(req)->acked = 1;
      return NULL;
   }

   /* OK, ACK is valid, create big socket and
    * feed this segment to it. It will repeat all
    * the tests. THIS SEGMENT MUST MOVE SOCKET TO
    * ESTABLISHED STATE. If it will be dropped after
    * socket is created, wait for troubles.
    */
   /* tcp_v4_syn_recv_sock() -> tcp_create_openreq_child() -> inet_csk_clone()
      creates the child transport control block; inet_csk_clone() sets its state to TCP_SYN_RECV */
   child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
   if (child == NULL)
      goto listen_overflow;

   inet_csk_reqsk_queue_unlink(sk, req, prev); // remove the tcp_request_sock from the half-open queue
   inet_csk_reqsk_queue_removed(sk, req);

   inet_csk_reqsk_queue_add(sk, req, child); // move the tcp_request_sock (with its child sock) onto the accept queue
   return child;

listen_overflow:
   if (!sysctl_tcp_abort_on_overflow) {
      inet_rsk(req)->acked = 1;
      return NULL;
   }

embryonic_reset:
   NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
   if (!(flg & TCP_FLAG_RST))
      req->rsk_ops->send_reset(sk, skb);

   inet_csk_reqsk_queue_drop(sk, req, prev);
   return NULL;
}
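
The rskq_defer_accept test above is the kernel side of the TCP_DEFER_ACCEPT socket option: while it is in effect, a bare third-handshake ACK only marks the request as acked and is dropped, so the child socket is not created until the client actually sends data. From user space it is enabled on the listening socket; a brief sketch:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Defer completing accept() until data arrives, for up to `seconds`
 * (the kernel converts this into a number of SYN+ACK retransmission
 * rounds, hence the req->retrans comparison above). */
int enable_defer_accept(int listen_fd, int seconds)
{
   return setsockopt(listen_fd, IPPROTO_TCP, TCP_DEFER_ACCEPT,
           &seconds, sizeof(seconds));
}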

tcp_v4_syn_recv_sock()

struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
              struct request_sock *req,
              struct dst_entry *dst)
{
   struct inet_request_sock *ireq;
   struct inet_sock *newinet;
   struct tcp_sock *newtp;
   struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
   struct tcp_md5sig_key *key;
#endif

   if (sk_acceptq_is_full(sk)) // the accept queue is full
      goto exit_overflow;

   if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) // look up the route
      goto exit;

   newsk = tcp_create_openreq_child(sk, req, skb); // create the child transport control block
   if (!newsk)
      goto exit;

   newsk->sk_gso_type = SKB_GSO_TCPV4;
   sk_setup_caps(newsk, dst);

   newtp           = tcp_sk(newsk);
   newinet             = inet_sk(newsk);
   ireq            = inet_rsk(req);
   newinet->daddr       = ireq->rmt_addr; // remote (peer) IP
   newinet->rcv_saddr    = ireq->loc_addr; // local IP the SYN was received on
   newinet->saddr       = ireq->loc_addr; // local (source) IP
   newinet->opt         = ireq->opt;
   ireq->opt        = NULL;
   newinet->mc_index     = inet_iif(skb);
   newinet->mc_ttl          = ip_hdr(skb)->ttl;
   inet_csk(newsk)->icsk_ext_hdr_len = 0;
   if (newinet->opt)
      inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
   newinet->id = newtp->write_seq ^ jiffies;

   tcp_mtup_init(newsk);
   tcp_sync_mss(newsk, dst_mtu(dst));
   newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
   if (tcp_sk(sk)->rx_opt.user_mss &&
       tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
      newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

   tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
   /* Copy over the MD5 key from the original socket */
   if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
      /*
       * We're using one, so create a matching key
       * on the newsk structure. If we fail to get
       * memory, then we end up not copying the key
       * across. Shucks.
       */
      char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
      if (newkey != NULL)
         tcp_v4_md5_do_add(newsk, newinet->daddr,
                 newkey, key->keylen);
      newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
   }
#endif

   __inet_hash_nolisten(newsk); // insert newsk into ehash
   __inet_inherit_port(sk, newsk);

   return newsk;

exit_overflow:
   NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit:
   NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
   dst_release(dst);
   return NULL;
}

tcp_child_process()

int tcp_child_process(struct sock *parent, struct sock *child,
            struct sk_buff *skb)
{
   int ret = 0;
   int state = child->sk_state;

   if (!sock_owned_by_user(child)) {
      ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
                   skb->len);
      /* Wakeup parent, send SIGIO */
      if (state == TCP_SYN_RECV && child->sk_state != state)
         parent->sk_data_ready(parent, 0);
   } else {
      /* Alas, it is possible again, because we do lookup
       * in main socket hash table and lock on listening
       * socket does not protect us more.
       */
      sk_add_backlog(child, skb);
   }

   bh_unlock_sock(child);
   sock_put(child);
   return ret;
}
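
The parent->sk_data_ready(parent, 0) call is what wakes a blocked accept() or marks the listening descriptor readable for poll()/select() once the handshake completes. A small user-space counterpart:

#include <poll.h>
#include <stddef.h>
#include <sys/socket.h>

/* Block until the accept queue is non-empty, then dequeue one
 * connection. POLLIN on a listening socket corresponds to the
 * sk_data_ready() wakeup issued by tcp_child_process(). */
int accept_when_ready(int listen_fd)
{
   struct pollfd pfd = { .fd = listen_fd, .events = POLLIN };

   if (poll(&pfd, 1, -1) < 0)
      return -1;
   return accept(listen_fd, NULL, NULL);
}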

Reposted from blog.csdn.net/hz5034/article/details/80514129