TCP protocol implementation: tcp_v4_rcv()

TCP states

enum {
   TCP_ESTABLISHED = 1,
   TCP_SYN_SENT,
   TCP_SYN_RECV,
   TCP_FIN_WAIT1,
   TCP_FIN_WAIT2,
   TCP_TIME_WAIT,
   TCP_CLOSE,
   TCP_CLOSE_WAIT,
   TCP_LAST_ACK,
   TCP_LISTEN,
   TCP_CLOSING,   /* Now a valid state */

   TCP_MAX_STATES /* Leave at the end! */
};
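
Besides the numeric states above, the kernel also defines matching TCPF_* bit flags (each equal to 1 << TCP_*) in include/net/tcp_states.h, so a set of states can be tested with a single bitmask. A minimal sketch of that pattern (the helper name here is made up for illustration):

static inline int sk_in_data_states(const struct sock *sk)
{
   /* TCPF_ESTABLISHED == (1 << TCP_ESTABLISHED), and so on */
   return (1 << sk->sk_state) &
          (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2);
}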

inet_hashinfo

struct inet_hashinfo tcp_hashinfo;

struct inet_hashinfo {
   /* This is for sockets with full identity only.  Sockets here will
    * always be without wildcards and will have the following invariant:
    *
    *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
    *
    * TIME_WAIT sockets use a separate chain (twchain).
    */
   // TCP_ESTABLISHED, TCP_SYN_SENT, TCP_SYN_RECV, TCP_FIN_WAIT1, TCP_FIN_WAIT2, TCP_TIME_WAIT (the last on twchain)
   struct inet_ehash_bucket   *ehash;
   spinlock_t       *ehash_locks;
   unsigned int         ehash_size;
   unsigned int         ehash_locks_mask;

   /* Ok, let's try this, I give up, we do need a local binding
    * TCP hash as well as the others for fast bind/connect.
    */
   struct inet_bind_hashbucket    *bhash;

   unsigned int         bhash_size;
   /* 4 bytes hole on 64 bit */

   struct kmem_cache     *bind_bucket_cachep;

   /* All the above members are written once at bootup and
    * never written again _or_ are predominantly read-access.
    *
    * Now align to a new cache line as all the following members
    * might be often dirty.
    */
   /* All sockets in TCP_LISTEN state will be in here.  This is the only
    * table where wildcard'd TCP sockets can exist.  Hash function here
    * is just local port number.
    */
   struct inet_listen_hashbucket  listening_hash[INET_LHTABLE_SIZE]
               ____cacheline_aligned_in_smp;

   atomic_t         bsockets;
};
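
For context, each ehash slot and each listening_hash slot is a small bucket; in kernels of this generation the bucket types look roughly as follows, providing the chain/twchain and head lists that the lookup functions below walk:

struct inet_ehash_bucket {
   struct hlist_nulls_head chain;   /* TCP_ESTABLISHED ... TCP_FIN_WAIT2 sockets */
   struct hlist_nulls_head twchain; /* TCP_TIME_WAIT sockets of the same slot */
};

struct inet_listen_hashbucket {
   spinlock_t              lock;
   struct hlist_nulls_head head;    /* TCP_LISTEN sockets, hashed by local port */
};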


tcp_v4_rcv()

tcp_v4_rcv() is the entry point from the IP layer into the TCP layer.
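
For reference, the IP layer dispatches TCP segments to tcp_v4_rcv() through the inet protocol table; the registration in net/ipv4/af_inet.c looks roughly like this (GSO/GRO fields trimmed):

static struct net_protocol tcp_protocol = {
   .handler     = tcp_v4_rcv,   /* called from ip_local_deliver_finish() */
   .err_handler = tcp_v4_err,   /* ICMP errors related to TCP */
   .no_policy   = 1,
   .netns_ok    = 1,
};

/* in inet_init(): */
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
   printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");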

int tcp_v4_rcv(struct sk_buff *skb)
{
   const struct iphdr *iph;
   struct tcphdr *th;
   struct sock *sk;
   int ret;
   struct net *net = dev_net(skb->dev);

   // If the packet was not addressed to this host (destination MAC is not ours), drop it
   if (skb->pkt_type != PACKET_HOST)
      goto discard_it;

   /* Count it even if it's bad */
   TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

   // Make sure at least sizeof(struct tcphdr) bytes are present in the linear area (data..tail)
   if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
      goto discard_it;

   th = tcp_hdr(skb); // tcp_hdr() -> skb_transport_header() -> skb->transport_header

   if (th->doff < sizeof(struct tcphdr) / 4)
      goto bad_packet;
   if (!pskb_may_pull(skb, th->doff * 4))
      goto discard_it;

   /* An explanation is required here, I think.
    * Packet length and doff are validated by header prediction,
    * provided case of th->doff==0 is eliminated.
    * So, we defer the checks. */
   if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
      goto bad_packet;

   th = tcp_hdr(skb);
   iph = ip_hdr(skb); // ip_hdr() -> skb_network_header() -> skb->network_header
   TCP_SKB_CB(skb)->seq = ntohl(th->seq);
   TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                skb->len - th->doff * 4);
   TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
   TCP_SKB_CB(skb)->when   = 0;
   TCP_SKB_CB(skb)->flags  = iph->tos;
   TCP_SKB_CB(skb)->sacked     = 0;

   // Look up the socket (transmission control block): first in ehash, then in listening_hash
   sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
   if (!sk)
      goto no_tcp_socket;

process:
   if (sk->sk_state == TCP_TIME_WAIT)
      goto do_time_wait;

   if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
      goto discard_and_relse;
   nf_reset(skb);

   if (sk_filter(sk, skb))
      goto discard_and_relse;

   skb->dev = NULL;

   bh_lock_sock_nested(sk); // take the socket lock
   ret = 0;
   if (!sock_owned_by_user(sk)) { // sk->sk_lock.owned: no user process currently owns the socket
#ifdef CONFIG_NET_DMA
      struct tcp_sock *tp = tcp_sk(sk);
      if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
      if (tp->ucopy.dma_chan)
         ret = tcp_v4_do_rcv(sk, skb);
      else
#endif
      {
         // If tcp_low_latency is off and a user task is waiting in recvmsg for data,
         // queue the skb on the prequeue and return 1
         // (a simplified sketch of tcp_prequeue() follows this function)
         if (!tcp_prequeue(sk, skb))
            ret = tcp_v4_do_rcv(sk, skb); // otherwise process the TCP segment now
      }
   } else
      sk_add_backlog(sk, skb);
   bh_unlock_sock(sk); // release the socket lock

   sock_put(sk);

   return ret;

no_tcp_socket: // no matching socket was found
   if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
      goto discard_it;

   if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
      TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
   } else {
      tcp_v4_send_reset(NULL, skb); // send an RST
   }

discard_it:
   /* Discard frame. */
   kfree_skb(skb);
   return 0;

discard_and_relse:
   sock_put(sk);
   goto discard_it;

do_time_wait:
   if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
      inet_twsk_put(inet_twsk(sk));
      goto discard_it;
   }

   if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
      TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
      inet_twsk_put(inet_twsk(sk));
      goto discard_it;
   }
   switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
   case TCP_TW_SYN: {
      struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                     &tcp_hashinfo,
                     iph->daddr, th->dest,
                     inet_iif(skb));
      if (sk2) {
         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
         inet_twsk_put(inet_twsk(sk));
         sk = sk2;
         goto process;
      }
      /* Fall through to ACK */
   }
   case TCP_TW_ACK:
      tcp_v4_timewait_ack(sk, skb);
      break;
   case TCP_TW_RST:
      goto no_tcp_socket;
   case TCP_TW_SUCCESS:;
   }
   goto discard_it;
} 
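
The prequeue decision mentioned in the comments above is made by tcp_prequeue() in include/net/tcp.h. A simplified sketch, with the reader wake-up and delayed-ACK timer details omitted:

static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
   struct tcp_sock *tp = tcp_sk(sk);

   /* Only prequeue when tcp_low_latency is off and a task is blocked
    * in tcp_recvmsg() waiting for data. */
   if (!sysctl_tcp_low_latency && tp->ucopy.task) {
      __skb_queue_tail(&tp->ucopy.prequeue, skb);
      tp->ucopy.memory += skb->truesize;
      if (tp->ucopy.memory > sk->sk_rcvbuf) {
         /* prequeue overflow: process everything right now */
         struct sk_buff *skb1;
         while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
            sk_backlog_rcv(sk, skb1);   /* -> tcp_v4_do_rcv() */
         tp->ucopy.memory = 0;
      }
      /* (the real function also wakes the waiting task and may arm the
       *  delayed-ACK timer when the first skb is queued) */
      return 1;   /* queued: caller skips tcp_v4_do_rcv() */
   }
   return 0;      /* not queued: caller processes the skb itself */
}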

__inet_lookup_skb()

sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);

static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
                    struct sk_buff *skb,
                    const __be16 sport,
                    const __be16 dport)
{
   struct sock *sk;
   const struct iphdr *iph = ip_hdr(skb);

   if (unlikely(sk = skb_steal_sock(skb)))
      return sk;
   else
      return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
                 iph->saddr, sport,
                 iph->daddr, dport, inet_iif(skb));
}

static inline struct sock *__inet_lookup(struct net *net,
                struct inet_hashinfo *hashinfo,
                const __be32 saddr, const __be16 sport,
                const __be32 daddr, const __be16 dport,
                const int dif)
{
   u16 hnum = ntohs(dport); // dport in host byte order
   struct sock *sk = __inet_lookup_established(net, hashinfo,
            saddr, sport, daddr, hnum, dif); // look in ehash first

   return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif); // then in listening_hash
}

struct sock * __inet_lookup_established(struct net *net,
              struct inet_hashinfo *hashinfo,
              const __be32 saddr, const __be16 sport,
              const __be32 daddr, const u16 hnum,
              const int dif)
{
   // acookie = daddr << 32 | saddr
   INET_ADDR_COOKIE(acookie, saddr, daddr)
   // ports = hnum << 16 | sport
   const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
   struct sock *sk;
   const struct hlist_nulls_node *node;
   /* Optimize here for direct hit, only listening connections can
    * have wildcards anyways.
    */
   unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
   unsigned int slot = hash & (hashinfo->ehash_size - 1);
   struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

   rcu_read_lock();
begin:
   sk_nulls_for_each_rcu(sk, node, &head->chain) {
      if (INET_MATCH(sk, net, hash, acookie,
               saddr, daddr, ports, dif)) {
         if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
            goto begintw;
         if (unlikely(!INET_MATCH(sk, net, hash, acookie,
            saddr, daddr, ports, dif))) {
            sock_put(sk);
            goto begin;
         }
         goto out;
      }
   }
   /*
    * if the nulls value we got at the end of this lookup is
    * not the expected one, we must restart lookup.
    * We probably met an item that was moved to another chain.
    */
   if (get_nulls_value(node) != slot)
      goto begin;

begintw:
   /* Must check for a TIME_WAIT'er before going to listener hash. */
   sk_nulls_for_each_rcu(sk, node, &head->twchain) {
      if (INET_TW_MATCH(sk, net, hash, acookie,
               saddr, daddr, ports, dif)) {
         if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
            sk = NULL;
            goto out;
         }
         if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
             saddr, daddr, ports, dif))) {
            sock_put(sk);
            goto begintw;
         }
         goto out;
      }
   }
   /*
    * if the nulls value we got at the end of this lookup is
    * not the expected one, we must restart lookup.
    * We probably met an item that was moved to another chain.
    */
   if (get_nulls_value(node) != slot)
      goto begintw;
   sk = NULL;
out:
   rcu_read_unlock();
   return sk;
}
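
The slot index used above comes from inet_ehashfn(); in kernels of this era it is roughly a jhash over the 4-tuple, mixed with a boot-time secret and the network namespace:

static inline unsigned int inet_ehashfn(struct net *net,
               const __be32 laddr, const __u16 lport,
               const __be32 faddr, const __be16 fport)
{
   return jhash_3words((__force __u32) laddr,
             (__force __u32) faddr,
             ((__u32) lport) << 16 | (__force __u32)fport,
             inet_ehash_secret + net_hash_mix(net));
}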

#define INET_ADDR_COOKIE(__name, __saddr, __daddr) const __addrpair __name = (__force __addrpair) ( \
   (((__force __u64)(__be32)(__daddr)) << 32 ) | ((__force __u64)(__be32)(__saddr)) \
);

#define INET_COMBINED_PORTS(__sport, __dport) ( \
   (__force __portpair) ( \
        ((__u32)(__dport) << 16 ) | (__force __u32)(__be16)(__sport) \
   ) \
)

#define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif) ( \
   ((__sk)->sk_hash == (__hash)) && \
   net_eq(sock_net(__sk), (__net)) && \
   ((*((__addrpair *)&(inet_sk(__sk)->daddr))) == (__cookie)) && /* rcv_saddr follows daddr in struct inet_sock (see excerpt below) */ \
   ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && /* num follows dport in struct inet_sock */ \
   (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))) \
)
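
The two combined compares in INET_MATCH rely on the field layout of struct inet_sock: rcv_saddr sits directly after daddr, and num directly after dport, so each pair can be read as a single __addrpair / __portpair. Excerpt (same era, surrounding fields omitted):

struct inet_sock {
   struct sock sk;
   /* ... */
   __be32      daddr;      /* peer address */
   __be32      rcv_saddr;  /* bound local address, directly after daddr */
   __be16      dport;      /* peer port */
   __u16       num;        /* local port in host byte order, directly after dport */
   /* ... */
};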

struct sock *__inet_lookup_listener(struct net *net,
                struct inet_hashinfo *hashinfo,
                const __be32 daddr, const unsigned short hnum,
                const int dif)
{
   struct sock *sk, *result;
   struct hlist_nulls_node *node;
   unsigned int hash = inet_lhashfn(net, hnum);
   struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
   int score, hiscore;

   rcu_read_lock();
begin:
   result = NULL;
   hiscore = -1;
   sk_nulls_for_each_rcu(sk, node, &ilb->head) {
      score = compute_score(sk, net, hnum, daddr, dif);
      if (score > hiscore) { // hiscore tracks the best score so far
         result = sk;
         hiscore = score;
      }
   }
   /*
    * if the nulls value we got at the end of this lookup is
    * not the expected one, we must restart lookup.
    * We probably met an item that was moved to another chain.
    */
   if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
      goto begin;
   if (result) {
      if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
         result = NULL;
      else if (unlikely(compute_score(result, net, hnum, daddr,
              dif) < hiscore)) {
         sock_put(result);
         goto begin;
      }
   }
   rcu_read_unlock();
   return result;
}
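
The listening table is hashed only by the local port; inet_lhashfn() is roughly:

static inline unsigned int inet_lhashfn(struct net *net, const unsigned short num)
{
   return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1);
}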

/* compute_score() return values:
   different net, local port (hnum) differs, or the socket is IPv6-only: return -1
   same net, same local port, and the socket accepts IPv4:
      sk is not AF_INET (dual-stack IPv6 listener):
         rcv_saddr unset, no bound device: return 0
         rcv_saddr unset, bound device set:
            device differs from dif: return -1
            device matches: return 2
         rcv_saddr set, no bound device:
            rcv_saddr != daddr: return -1
            rcv_saddr == daddr: return 2
         rcv_saddr set, bound device set:
            address and device both match: return 4
            address matches, device differs: return -1
            address differs: return -1
      sk is AF_INET:
         rcv_saddr unset, no bound device (the most common case): return 1
         rcv_saddr unset, bound device set:
            device differs from dif: return -1
            device matches: return 3
         rcv_saddr set, no bound device:
            rcv_saddr != daddr: return -1
            rcv_saddr == daddr: return 3
         rcv_saddr set, bound device set:
            address and device both match: return 5
            address matches, device differs: return -1
            address differs: return -1 */
static inline int compute_score(struct sock *sk, struct net *net,
            const unsigned short hnum, const __be32 daddr,
            const int dif)
{
   int score = -1;
   struct inet_sock *inet = inet_sk(sk);

   if (net_eq(sock_net(sk), net) && inet->num == hnum &&
         !ipv6_only_sock(sk)) {
      __be32 rcv_saddr = inet->rcv_saddr;
      score = sk->sk_family == PF_INET ? 1 : 0;
      if (rcv_saddr) {
         if (rcv_saddr != daddr)
            return -1;
         score += 2;
      }
      if (sk->sk_bound_dev_if) {
         if (sk->sk_bound_dev_if != dif)
            return -1;
         score += 2;
      }
   }
   return score;
}
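
A hypothetical example of the scoring, with made-up addresses, for a SYN to 192.168.1.10:80:

/* Listening sockets on port 80 and their scores for daddr = 192.168.1.10:
 *
 *   sk1: AF_INET,  bound to 0.0.0.0,       no bound device -> score 1
 *   sk2: AF_INET,  bound to 192.168.1.10,  no bound device -> score 1 + 2 = 3
 *   sk3: AF_INET6 (dual stack), [::],      no bound device -> score 0
 *   sk4: AF_INET,  bound to 10.0.0.1,      no bound device -> score -1 (address mismatch)
 *
 * __inet_lookup_listener() keeps the highest score, so sk2 is chosen.
 */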

tcp_v4_do_rcv()

int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
   struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
   /*
    * We really want to reject the packet as early as possible
    * if:
    *  o We're expecting an MD5'd packet and this is no MD5 tcp option
    *  o There is an MD5 option and we're not expecting one
    */
   if (tcp_v4_inbound_md5_hash(sk, skb))
      goto discard;
#endif

   // Established socket: fast path
   if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
      TCP_CHECK_TIMER(sk);
      if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
         rsk = sk;
         goto reset;
      }
      TCP_CHECK_TIMER(sk);
      return 0;
   }

   if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
      goto csum_err;

   // Listening socket
   if (sk->sk_state == TCP_LISTEN) {
      // Look up the listener's syn_table (half-open request queue) first, then ehash
      // (a simplified sketch of tcp_v4_hnd_req() follows this function)
      struct sock *nsk = tcp_v4_hnd_req(sk, skb);
      if (!nsk)
         goto discard;

      // A matching tcp_request_sock or tcp_sock was found (nsk != sk)
      if (nsk != sk) {
         if (tcp_child_process(sk, nsk, skb)) {
            rsk = nsk;
            goto reset;
         }
         return 0;
      }
      // Neither was found (nsk == sk): fall through to the state machine below
   }

   TCP_CHECK_TIMER(sk);
   if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
      rsk = sk;
      goto reset;
   }
   TCP_CHECK_TIMER(sk);
   return 0;

reset:
   tcp_v4_send_reset(rsk, skb);
discard:
   kfree_skb(skb);
   /* Be careful here. If this function gets more complicated and
    * gcc suffers from register pressure on the x86, sk (in %ebx)
    * might be destroyed here. This current version compiles correctly,
    * but you have been warned.
    */
   return 0;

csum_err:
   TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
   goto discard;
}
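
tcp_v4_hnd_req(), called above for listening sockets, performs the two lookups described in the comment. A simplified sketch (SYN-cookie handling omitted):

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
   struct tcphdr *th = tcp_hdr(skb);
   const struct iphdr *iph = ip_hdr(skb);
   struct request_sock **prev;
   struct sock *nsk;
   /* 1. Look for a pending connection request in the listener's
    *    syn_table (the half-open queue). */
   struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                        iph->saddr, iph->daddr);
   if (req)
      return tcp_check_req(sk, skb, req, prev);

   /* 2. Otherwise look for an already established socket in ehash. */
   nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
                  th->source, iph->daddr, th->dest, inet_iif(skb));
   if (nsk) {
      if (nsk->sk_state != TCP_TIME_WAIT) {
         bh_lock_sock(nsk);
         return nsk;
      }
      inet_twsk_put(inet_twsk(nsk));
      return NULL;
   }

   /* (the real function may also validate a SYN cookie here) */
   return sk;   /* nothing found: the caller runs the state machine on sk */
}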

Reposted from blog.csdn.net/hz5034/article/details/80488868