Protocol Stack Entry and Exit Functions

dev_queue_xmit()

The stack's exit (TX) function is dev_queue_xmit(); the protocol stack hands an skb down to the NIC driver by calling dev_queue_xmit().

ip_finish_output2() -> neighbour subsystem -> dev_queue_xmit() -> traffic control (qdisc) subsystem -> dev_hard_start_xmit() -> ops->ndo_start_xmit(), i.e. ixgbe_xmit_frame() -> ixgbe_tx_map() + ixgbe_tx_queue()
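
For ixgbe, the ndo_start_xmit hook in the chain above is wired up through the driver's net_device_ops table. A rough sketch of the relevant fields (same kernel era, trimmed to the callbacks discussed in this section):

static const struct net_device_ops ixgbe_netdev_ops = {
    .ndo_open         = ixgbe_open,
    .ndo_stop         = ixgbe_close,
    .ndo_start_xmit   = ixgbe_xmit_frame,   /* invoked via dev_hard_start_xmit() */
    .ndo_select_queue = ixgbe_select_queue, /* invoked via dev_pick_tx(), see below */
    /* ... remaining callbacks omitted ... */
};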

int dev_queue_xmit(struct sk_buff *skb)
{
   struct net_device *dev = skb->dev;
   struct netdev_queue *txq;
   struct Qdisc *q;
   int rc = -ENOMEM;

   /* GSO will handle the following emulations directly. */
   // GSO skb that the device cannot segment in hardware: skip the fixups
   // below and jump ahead; the software GSO path applies them per segment
   if (netif_needs_gso(dev, skb))
      goto gso;

   if (skb_has_frags(skb) &&
       !(dev->features & NETIF_F_FRAGLIST) &&
       __skb_linearize(skb))
      goto out_kfree_skb;

   /* Fragmented skb is linearized if device does not support SG,
    * or if at least one of fragments is in highmem and device
    * does not support DMA from it.
    */
   if (skb_shinfo(skb)->nr_frags &&
       (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
       __skb_linearize(skb))
      goto out_kfree_skb;

   /* If packet is not checksummed and device does not support
    * checksumming for this protocol, complete checksumming here.
    */
   if (skb->ip_summed == CHECKSUM_PARTIAL) {
      skb_set_transport_header(skb, skb->csum_start -
                     skb_headroom(skb));
      if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
         goto out_kfree_skb;
   }

gso:
   /* Disable soft irqs for various locks below. Also
    * stops preemption for RCU.
    */
   rcu_read_lock_bh();

   txq = dev_pick_tx(dev, skb); // select a TX queue
   q = rcu_dereference(txq->qdisc); // get its queueing discipline

#ifdef CONFIG_NET_CLS_ACT
   skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
   if (q->enqueue) { // the qdisc has an enqueue method
      rc = __dev_xmit_skb(skb, q, dev, txq); // enqueue, then dequeue and transmit
      goto out;
   }

   /* The device has no queue. Common case for software devices:
      loopback, all the sorts of tunnels...

      Really, it is unlikely that netif_tx_lock protection is necessary
      here.  (f.e. loopback and IP tunnels are clean ignoring statistics
      counters.)
      However, it is possible, that they rely on protection
      made by us here.

      Check this and shot the lock. It is not prone from deadlocks.
      Either shot noqueue qdisc, it is even simpler 8)
    */
   if (dev->flags & IFF_UP) { // the device is up
      int cpu = smp_processor_id(); /* ok because BHs are off */

      if (txq->xmit_lock_owner != cpu) { // this CPU does not already hold the TX lock

         HARD_TX_LOCK(dev, txq, cpu); // take the TX lock

         if (!netif_tx_queue_stopped(txq)) {
            rc = NET_XMIT_SUCCESS;
            if (!dev_hard_start_xmit(skb, dev, txq)) { // transmit directly
               HARD_TX_UNLOCK(dev, txq); // release the TX lock
               goto out;
            }
         }
         HARD_TX_UNLOCK(dev, txq);
         if (net_ratelimit())
            printk(KERN_CRIT "Virtual device %s asks to "
                   "queue packet!\n", dev->name);
      } else { // this CPU already holds the TX lock
         /* Recursion is detected! It is possible,
          * unfortunately */
         if (net_ratelimit())
            printk(KERN_CRIT "Dead loop on virtual device "
                   "%s, fix it urgently!\n", dev->name);
      }
   }

   rc = -ENETDOWN;
   rcu_read_unlock_bh();

out_kfree_skb:
   kfree_skb(skb);
   return rc;
out:
   rcu_read_unlock_bh();
   return rc;
}
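
The xmit_lock_owner test above is what makes the recursion detection possible: HARD_TX_LOCK records the owning CPU when it takes the per-queue TX lock, and devices that do their own locking (NETIF_F_LLTX) skip it entirely. Roughly, from net/core/dev.c of the same era:

#define HARD_TX_LOCK(dev, txq, cpu) {            \
    if ((dev->features & NETIF_F_LLTX) == 0) {   \
        __netif_tx_lock(txq, cpu); /* spin_lock + txq->xmit_lock_owner = cpu */ \
    }                                            \
}

#define HARD_TX_UNLOCK(dev, txq) {               \
    if ((dev->features & NETIF_F_LLTX) == 0) {   \
        __netif_tx_unlock(txq);                  \
    }                                            \
}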

dev_pick_tx()

static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                    struct sk_buff *skb)
{
    const struct net_device_ops *ops = dev->netdev_ops;
    u16 queue_index = 0;

    if (ops->ndo_select_queue) // for ixgbe this is ixgbe_select_queue()
        queue_index = ops->ndo_select_queue(dev, skb);
    else if (dev->real_num_tx_queues > 1)
        queue_index = skb_tx_hash(dev, skb);

    skb_set_queue_mapping(skb, queue_index);
    return netdev_get_tx_queue(dev, queue_index);
}

static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb)
{
    struct ixgbe_adapter *adapter = netdev_priv(dev);
    int txq = smp_processor_id(); // current CPU id

    if (adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE) { // Flow Director hashing enabled
        while (unlikely(txq >= dev->real_num_tx_queues))
            txq -= dev->real_num_tx_queues;
        return txq; // CPU id folded into the valid queue range
    }

    if (adapter->flags & IXGBE_FLAG_DCB_ENABLED)
        return (skb->vlan_tci & IXGBE_TX_FLAGS_VLAN_PRIO_MASK) >> 13;

    return skb_tx_hash(dev, skb);
}

static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
{
    skb->queue_mapping = queue_mapping; // record the chosen queue in the skb
}

static inline
struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
                     unsigned int index)
{
    return &dev->_tx[index]; // the TX queue at this index
}
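
When the driver provides no ndo_select_queue hook, skb_tx_hash() spreads flows across the TX queues. Roughly, in this kernel generation, it reuses the recorded RX queue when there is one, otherwise hashes the socket hash (or the protocol) and scales the 32-bit result into the queue range; a sketch, details vary by version:

u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
    u32 hash;

    if (skb_rx_queue_recorded(skb)) { // keep TX on the queue the flow arrived on
        hash = skb_get_rx_queue(skb);
        while (unlikely(hash >= dev->real_num_tx_queues))
            hash -= dev->real_num_tx_queues;
        return hash;
    }

    if (skb->sk && skb->sk->sk_hash)
        hash = skb->sk->sk_hash; // per-flow hash recorded on the socket
    else
        hash = skb->protocol;

    hash = jhash_1word(hash, skb_tx_hashrnd);

    // scale the 32-bit hash into [0, real_num_tx_queues) without a modulo
    return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}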

__dev_xmit_skb()

static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                 struct net_device *dev,
                 struct netdev_queue *txq)
{
    spinlock_t *root_lock = qdisc_lock(q);
    int rc;

    spin_lock(root_lock);
    // the qdisc has been deactivated (__QDISC_STATE_DEACTIVATED set)
    if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
        kfree_skb(skb);
        rc = NET_XMIT_DROP;
    // bypass allowed (TCQ_F_CAN_BYPASS), queue empty, and qdisc not already running
    } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
           !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
        /*
         * This is a work-conserving queue; there are no old skbs
         * waiting to be sent out; and the qdisc is not running -
         * xmit the skb directly.
         */
        __qdisc_update_bstats(q, skb->len);
        if (sch_direct_xmit(skb, q, dev, txq, root_lock)) // transmit directly; nonzero means more to send
            __qdisc_run(q); // keep dequeuing and transmitting
        else // queue drained or device stopped
            clear_bit(__QDISC_STATE_RUNNING, &q->state); // mark the qdisc as no longer running

        rc = NET_XMIT_SUCCESS;
    } else {
        rc = qdisc_enqueue_root(skb, q); // enqueue
        qdisc_run(q); // dequeue and transmit
    }
    spin_unlock(root_lock);

    return rc;
}
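
The bypass test relies on qdisc_qlen() to check that nothing is already queued; it is only the qlen accessor (shown for completeness):

static inline int qdisc_qlen(struct Qdisc *q)
{
    return q->q.qlen; // number of skbs currently held by this qdisc
}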

sch_direct_xmit()

int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
            struct net_device *dev, struct netdev_queue *txq,
            spinlock_t *root_lock)
{
    int ret = NETDEV_TX_BUSY;

    /* And release qdisc */
    spin_unlock(root_lock);

    HARD_TX_LOCK(dev, txq, smp_processor_id()); // take the TX lock
    // the TX queue is neither stopped (__QUEUE_STATE_XOFF) nor frozen (__QUEUE_STATE_FROZEN)
    if (!netif_tx_queue_stopped(txq) &&
        !netif_tx_queue_frozen(txq))
        ret = dev_hard_start_xmit(skb, dev, txq); // transmit directly
    HARD_TX_UNLOCK(dev, txq); // release the TX lock

    spin_lock(root_lock);

    switch (ret) {
    case NETDEV_TX_OK:
        /* Driver sent out skb successfully */
        ret = qdisc_qlen(q);
        break;

    case NETDEV_TX_LOCKED:
        /* Driver try lock failed */
        ret = handle_dev_cpu_collision(skb, txq, q);
        break;

    default:
        /* Driver returned NETDEV_TX_BUSY - requeue skb */
        if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
            printk(KERN_WARNING "BUG %s code %d qlen %d\n",
                   dev->name, ret, q->q.qlen);

        ret = dev_requeue_skb(skb, q); // requeue the skb (see below)
        break;
    }

    if (ret && (netif_tx_queue_stopped(txq) ||
            netif_tx_queue_frozen(txq)))
        ret = 0;

    return ret;
}
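
When the driver reports NETDEV_TX_BUSY, dev_requeue_skb() parks the skb on the qdisc and schedules a retry from the TX softirq. Roughly, from sch_generic.c of the same era (the parked skb sits in q->gso_skb and is retried first by dequeue_skb(), shown further below):

static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
    q->gso_skb = skb; // park the skb; dequeue_skb() will retry it first
    q->qstats.requeues++;
    q->q.qlen++; /* it's still part of the queue */
    __netif_schedule(q); // retry later via NET_TX_SOFTIRQ

    return 0;
}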

qdisc_enqueue_root()

static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
{
    qdisc_skb_cb(skb)->pkt_len = skb->len;
    return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
}

static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
#ifdef CONFIG_NET_SCHED
    if (sch->stab)
        qdisc_calculate_pkt_len(skb, sch->stab);
#endif
    return sch->enqueue(skb, sch); // for pfifo_fast_ops this is pfifo_fast_enqueue()
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
    /* if the TX queue is not full;
       ether_setup() sets dev->tx_queue_len to 1000 */
    if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
        int band = prio2band[skb->priority & TC_PRIO_MAX]; // map priority to band via the prio2band table
        struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
        struct sk_buff_head *list = band2list(priv, band); // the sk_buff_head for this band

        priv->bitmap |= (1 << band);
        qdisc->q.qlen++;
        return __qdisc_enqueue_tail(skb, qdisc, list); // enqueue at the tail
    }

    return qdisc_drop(skb, qdisc); // queue full: drop the packet
}
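
For reference, the helpers pfifo_fast_enqueue() leans on look roughly like this in sch_generic.c of this era (a sketch; the exact layout differs across versions): three bands, where band 0 is drained before band 1, and band 1 before band 2, with TOS-derived priorities mapped onto them.

static const int prio2band[TC_PRIO_MAX + 1] = {
    1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
}; // e.g. TC_PRIO_INTERACTIVE (6) maps to band 0, TC_PRIO_BULK (2) to band 2

struct pfifo_fast_priv {
    u32 bitmap; // bit n set <=> band n is non-empty, lets dequeue skip empty bands
    struct sk_buff_head q[PFIFO_FAST_BANDS]; // PFIFO_FAST_BANDS == 3
};

static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
                                             int band)
{
    return priv->q + band;
}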

qdisc_run()

static inline void qdisc_run(struct Qdisc *q)
{
    if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
        __qdisc_run(q);
}

void __qdisc_run(struct Qdisc *q)
{
    unsigned long start_time = jiffies;

    // dequeue and transmit in a loop until the queue is empty or we must yield
    while (qdisc_restart(q)) {
        /*
         * Postpone processing if
         * 1. another process needs the CPU;
         * 2. we've been doing it for too long.
         */
        if (need_resched() || jiffies != start_time) { // yield: defer further transmission
            // __netif_schedule() -> __netif_reschedule() -> raise_softirq_irqoff(NET_TX_SOFTIRQ)
            __netif_schedule(q);
            break;
        }
    }

    clear_bit(__QDISC_STATE_RUNNING, &q->state); // mark the qdisc as no longer running
}

static inline int qdisc_restart(struct Qdisc *q)
{
    struct netdev_queue *txq;
    struct net_device *dev;
    spinlock_t *root_lock;
    struct sk_buff *skb;

    /* Dequeue packet */
    skb = dequeue_skb(q); // dequeue an skb (see below)
    if (unlikely(!skb))
        return 0;

    root_lock = qdisc_lock(q);
    dev = qdisc_dev(q); // the owning device
    txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); // the TX queue chosen by dev_pick_tx()

    return sch_direct_xmit(skb, q, dev, txq, root_lock); // transmit directly
}
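
dequeue_skb() is where a previously requeued skb (see dev_requeue_skb() above) gets retried before anything new is pulled from the qdisc. Roughly:

static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
    struct sk_buff *skb = q->gso_skb;

    if (unlikely(skb)) { // a requeued skb is pending: retry it first
        struct netdev_queue *txq =
            netdev_get_tx_queue(qdisc_dev(q), skb_get_queue_mapping(skb));

        if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq)) {
            q->gso_skb = NULL;
            q->q.qlen--;
        } else // TX queue still stopped/frozen: keep it parked
            skb = NULL;
    } else
        skb = q->dequeue(q); // e.g. pfifo_fast_dequeue()

    return skb;
}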

netif_receive_skb()

The stack's entry (RX) function is netif_receive_skb(); the NIC driver hands an skb up to the stack by calling netif_receive_skb().

ixgbe_receive_skb() -> napi_gro_receive() -> napi_skb_finish() -> netif_receive_skb() -> ip_rcv()
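
To see where this chain starts, here is a hypothetical, stripped-down NAPI poll routine (my_poll, nic_fetch_rx_skb and netdev are invented names for illustration, not ixgbe code) showing how a driver typically feeds received skbs into napi_gro_receive(), which ends up in netif_receive_skb():

static int my_poll(struct napi_struct *napi, int budget)
{
    int done = 0;
    struct sk_buff *skb;

    while (done < budget && (skb = nic_fetch_rx_skb())) { // hypothetical helper
        skb->protocol = eth_type_trans(skb, netdev); // classify the L3 protocol, see below
        napi_gro_receive(napi, skb); // GRO, then netif_receive_skb()
        done++;
    }

    if (done < budget)
        napi_complete(napi); // RX drained: re-enable the interrupt

    return done;
}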

int netif_receive_skb(struct sk_buff *skb)
{
   struct packet_type *ptype, *pt_prev;
   struct net_device *orig_dev;
   struct net_device *null_or_orig;
   int ret = NET_RX_DROP;
   __be16 type;

   if (!skb->tstamp.tv64)
      net_timestamp(skb);

   if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
      return NET_RX_SUCCESS;

   /* if we've gotten here through NAPI, check netpoll */
   if (netpoll_receive_skb(skb))
      return NET_RX_DROP;

   if (!skb->iif)
      skb->iif = skb->dev->ifindex;

   null_or_orig = NULL;
   orig_dev = skb->dev;
   if (orig_dev->master) {
      if (skb_bond_should_drop(skb))
         null_or_orig = orig_dev; /* deliver only exact match */
      else
         skb->dev = orig_dev->master;
   }

   __get_cpu_var(netdev_rx_stat).total++;

   skb_reset_network_header(skb); // skb->network_header = skb->data
   skb_reset_transport_header(skb); // skb->transport_header = skb->data
   skb->mac_len = skb->network_header - skb->mac_header;

   pt_prev = NULL;

   rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
   if (skb->tc_verd & TC_NCLS) {
      skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
      goto ncls;
   }
#endif

   // iterate over the ptype_all list (taps such as AF_PACKET sockets)
   list_for_each_entry_rcu(ptype, &ptype_all, list) {
      if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
          ptype->dev == orig_dev) {
         if (pt_prev)
            ret = deliver_skb(skb, pt_prev, orig_dev);
         pt_prev = ptype;
      }
   }

#ifdef CONFIG_NET_CLS_ACT
   skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
   if (!skb)
      goto out;
ncls:
#endif

   skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
   if (!skb)
      goto out;
   skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
   if (!skb)
      goto out;

   type = skb->protocol;
   // iterate over the ptype_base hash bucket that matches this protocol type
   list_for_each_entry_rcu(ptype,
         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
      if (ptype->type == type &&
          (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
           ptype->dev == orig_dev)) {
         if (pt_prev)
            ret = deliver_skb(skb, pt_prev, orig_dev);
         pt_prev = ptype;
      }
   }

   if (pt_prev) {
      ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); // invoke the handler, e.g. ip_rcv() for ETH_P_IP
   } else {
      kfree_skb(skb);
      /* Jamal, now you will not able to escape explaining
       * me how you were going to use this. :-)
       */
      ret = NET_RX_DROP;
   }

out:
   rcu_read_unlock();
   return ret;
}
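
Note the pt_prev pattern in both loops above: the call to each matching handler is deferred by one iteration, so the final handler can consume the skb without taking an extra reference. deliver_skb() itself bumps the refcount before calling the handler; roughly:

static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
    atomic_inc(&skb->users); // the handler gets its own reference
    return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}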

ip_packet_type

static struct packet_type ip_packet_type __read_mostly = {
   .type = cpu_to_be16(ETH_P_IP),
   .func = ip_rcv, // the handler netif_receive_skb() invokes for ETH_P_IP
   .gso_send_check = inet_gso_send_check,
   .gso_segment = inet_gso_segment,
   .gro_receive = inet_gro_receive,
   .gro_complete = inet_gro_complete,
};
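
ip_packet_type ends up in the ptype_base hash that netif_receive_skb() walks because IPv4 initialization registers it with dev_add_pack(). Roughly, in net/ipv4/af_inet.c:

static int __init inet_init(void)
{
    /* ... protocol, socket and route setup elided ... */
    dev_add_pack(&ip_packet_type); // hash ip_packet_type into ptype_base by ETH_P_IP
    /* ... */
    return 0; /* error handling elided */
}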

eth_type_trans()

__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
   struct ethhdr *eth;
   unsigned char *rawp;

   skb->dev = dev;
   skb_reset_mac_header(skb); // skb->mac_header = skb->data
   skb_pull(skb, ETH_HLEN);
   eth = eth_hdr(skb);

   if (unlikely(is_multicast_ether_addr(eth->h_dest))) { // LSB of the first MAC octet: 1 = multicast/broadcast, 0 = unicast
      if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast))
         skb->pkt_type = PACKET_BROADCAST;
      else
         skb->pkt_type = PACKET_MULTICAST;
   }

   /*
    *      This ALLMULTI check should be redundant by 1.4
    *      so don't forget to remove it.
    *
    *      Seems, you forgot to remove it. All silly devices
    *      seems to set IFF_PROMISC.
    */

   else if (1 /*dev->flags&IFF_PROMISC */ ) {
      if (unlikely(compare_ether_addr_64bits(eth->h_dest, dev->dev_addr))) // destination MAC differs from this NIC's MAC
         skb->pkt_type = PACKET_OTHERHOST; // default is PACKET_HOST
   }

   /*
    * Some variants of DSA tagging don't have an ethertype field
    * at all, so we check here whether one of those tagging
    * variants has been configured on the receiving interface,
    * and if so, set skb->protocol without looking at the packet.
    */
   if (netdev_uses_dsa_tags(dev))
      return htons(ETH_P_DSA);
   if (netdev_uses_trailer_tags(dev))
      return htons(ETH_P_TRAILER);

   if (ntohs(eth->h_proto) >= 1536) // values >= 0x0600 are an EtherType, smaller values are an 802.3 length
      return eth->h_proto; // return the L3 protocol

   rawp = skb->data;

   /*
    *      This is a magic hack to spot IPX packets. Older Novell breaks
    *      the protocol design and runs IPX over 802.3 without an 802.2 LLC
    *      layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
    *      won't work for fault tolerant netware but does for the rest.
    */
   if (*(unsigned short *)rawp == 0xFFFF)
      return htons(ETH_P_802_3);

   /*
    *      Real 802.2 LLC
    */
   return htons(ETH_P_802_2);
}
EXPORT_SYMBOL(eth_type_trans);
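
The multicast test at the top of eth_type_trans() keys off the I/G bit mentioned in the inline comment; for reference, is_multicast_ether_addr() from etherdevice.h is just:

static inline int is_multicast_ether_addr(const u8 *addr)
{
    return 0x01 & addr[0]; // I/G bit: 1 => group (multicast/broadcast) address
}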

Reposted from blog.csdn.net/hz5034/article/details/79952195