msn: [email protected]
来源:http://yfydz.cublog.cn
1. 前言 本文简要介绍数据包在进入桥网卡后在Linux网络协议栈的处理流程,并描述netfilter的hook点的挂接处理情况,具体各部分的详细处理待后续文章中说明。 以下内核代码版本为2.6.19.2. 2. 函数处理流程 bridge入口点handle_bridge() /* net/core/dev.c */ int netif_receive_skb(struct sk_buff *skb) { ...... if (handle_bridge(&skb, &pt_prev, &ret, orig_dev)) goto out; ...... } bridge基本挂接点处理函数:br_handle_frame_hook() static __inline__ int handle_bridge(struct sk_buff **pskb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { struct net_bridge_port *port; if ((*pskb)->pkt_type == PACKET_LOOPBACK || (port = rcu_dereference((*pskb)->dev->br_port)) == NULL) return 0; if (*pt_prev) { *ret = deliver_skb(*pskb, *pt_prev, orig_dev); *pt_prev = NULL; } return br_handle_frame_hook(port, pskb); } br_handle_frame_hook()的实际实现: /* net/bridge/br.c */ static int __init br_init(void) { ...... br_handle_frame_hook = br_handle_frame; ...... } br_handle_frame: PF_BRIDGE的prerouting点 /* net/bridge/br_input.c */ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) { struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) goto err; if (unlikely(is_link_local(dest))) { // 自身包进入PF_BRIDGE的INPUT点, 一般处理的包数不多 skb->pkt_type = PACKET_HOST; // 正常是返回1的, 然后就返回1, 表示桥模块全权处理该包了 return NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev, NULL, br_handle_local_finish) != 0; } if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) { // br_should_route_hook函数一般没定义 if (br_should_route_hook) { if (br_should_route_hook(pskb)) return 0; skb = *pskb; dest = eth_hdr(skb)->h_dest; } if (!compare_ether_addr(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; // PF_BRIDGE的prerouting处理结束后进入br_handle_frame_finish NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, br_handle_frame_finish); // 处理后始终返回1, 表示不再进行其他协议族处理,该数据包已经完全由bridge处理完毕 return 1; } err: kfree_skb(skb); // 处理后始终返回1, 表示不再进行其他协议族处理,该数据包已经完全由bridge处理完毕 return 1; } 通过br_handle_frame_finish进入bridge的转发: /* note: already 
called with rcu_read_lock (preempt_disabled) */ int br_handle_frame_finish(struct sk_buff *skb) { const unsigned char *dest = eth_hdr(skb)->h_dest; struct net_bridge_port *p = rcu_dereference(skb->dev->br_port); struct net_bridge *br; struct net_bridge_fdb_entry *dst; int passedup = 0; if (!p || p->state == BR_STATE_DISABLED) goto drop; /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; br_fdb_update(br, p, eth_hdr(skb)->h_source); if (p->state == BR_STATE_LEARNING) goto drop; if (br->dev->flags & IFF_PROMISC) { struct sk_buff *skb2; skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 != NULL) { passedup = 1; br_pass_frame_up(br, skb2); } } if (is_multicast_ether_addr(dest)) { // 多播转发,也是调用广播处理 br->statistics.multicast++; br_flood_forward(br, skb, !passedup); if (!passedup) br_pass_frame_up(br, skb); goto out; } // 根据目的MAC找目的出口 dst = __br_fdb_get(br, dest); if (dst != NULL && dst->is_local) { if (!passedup) br_pass_frame_up(br, skb); else kfree_skb(skb); goto out; } if (dst != NULL) { // 单播转发 br_forward(dst->dst, skb); goto out; } // 广播转发 br_flood_forward(br, skb, 0); out: return 0; drop: kfree_skb(skb); goto out; } 广播/多播转发: br_flood_forward/br_flood /* called under bridge lock */ void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, int clone) { br_flood(br, skb, clone, __br_forward); } /* called under bridge lock */ static void br_flood(struct net_bridge *br, struct sk_buff *skb, int clone, void (*__packet_hook)(const struct net_bridge_port *p, struct sk_buff *skb)) { struct net_bridge_port *p; struct net_bridge_port *prev; if (clone) { struct sk_buff *skb2; if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { br->statistics.tx_dropped++; return; } skb = skb2; } prev = NULL; list_for_each_entry_rcu(p, &br->port_list, list) { if (should_deliver(p, skb)) { if (prev != NULL) { struct sk_buff *skb2; if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { br->statistics.tx_dropped++; kfree_skb(skb); return; } // 这里实际是__br_forward 
__packet_hook(prev, skb2); } prev = p; } } if (prev != NULL) { // 这里实际是__br_forward __packet_hook(prev, skb); return; } kfree_skb(skb); } 单播转发: br_forward /* net/bridge/br_forward.c */ /* called with rcu_read_lock */ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb) { if (should_deliver(to, skb)) { // 也是调用__br_forward __br_forward(to, skb); return; } kfree_skb(skb); } FORWARD点: static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) { struct net_device *indev; indev = skb->dev; skb->dev = to->dev; skb->ip_summed = CHECKSUM_NONE; // 进入PF_BRIDGE的forward hook, 结束后进入br_forward_finish() NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev, br_forward_finish); } POSTROUTING点: // 从FORWARD点处理后直接进入POSTROUTING点处理 int br_forward_finish(struct sk_buff *skb) { // 进入PF_BRIDGE的postrouting hook, 结束后进入br_dev_queue_push_xmit() return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, br_dev_queue_push_xmit); } 数据包发出: int br_dev_queue_push_xmit(struct sk_buff *skb) { /* drop mtu oversized packets except gso */ if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) kfree_skb(skb); else { /* ip_refrag calls ip_fragment, doesn't copy the MAC header. */ if (nf_bridge_maybe_copy_header(skb)) kfree_skb(skb); else { skb_push(skb, ETH_HLEN); // 此处调用dev设备的hard_start_xmit()函数 dev_queue_xmit(skb); } } return 0; } 桥网卡设备的hard_start_xmit()函数定义为: /* net/bridge/br_device.c */ void br_dev_setup(struct net_device *dev) { ...... dev->hard_start_xmit = br_dev_xmit; ...... 
} /* net device transmit always called with no BH (preempt_disabled) */ int br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); const unsigned char *dest = skb->data; struct net_bridge_fdb_entry *dst; br->statistics.tx_packets++; br->statistics.tx_bytes += skb->len; skb->mac.raw = skb->data; skb_pull(skb, ETH_HLEN); if (dest[0] & 1) // 多播发送 br_flood_deliver(br, skb, 0); else if ((dst = __br_fdb_get(br, dest)) != NULL) // 单播发送 br_deliver(dst->dst, skb); else // 广播发送 br_flood_deliver(br, skb, 0); // 这些发送函数最终都会调用__br_deliver()函数 return 0; } /* net/bridge/br_forward.c */ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { skb->dev = to->dev; // 此处是PF_BRIDGE的OUTPUT点 NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, br_forward_finish); } 总结: PF_BRIDGE中的各个hook点和PF_INET不同, 可用下面的图表示: PREROUTING --+--FORWARD-----POSTROUTING------+----OUTPUT | | | | INPUT 3. PF_BRIDGE的hook点 在net/bridge/br_netfilter.c中定义了以下hook点,注意这些hook点主要是PF_BRIDGE协议族的。 /* net/bridge/br_netfilter.c */ /* For br_nf_local_out we need (prio = NF_BR_PRI_FIRST), to insure that innocent * PF_BRIDGE/NF_BR_LOCAL_OUT functions don't get bridged traffic as input. * For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because * ip_refrag() can return NF_STOLEN. 
*/ static struct nf_hook_ops br_nf_ops[] = { // PF_BRIDGE的挂接点 // PREROUTING点 { .hook = br_nf_pre_routing, .owner = THIS_MODULE, .pf = PF_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_BRNF, }, // INPUT点 { .hook = br_nf_local_in, .owner = THIS_MODULE, .pf = PF_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_BRNF, }, // FORWARD点 { .hook = br_nf_forward_ip, .owner = THIS_MODULE, .pf = PF_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_BRNF - 1, }, // FORWARD点 { .hook = br_nf_forward_arp, .owner = THIS_MODULE, .pf = PF_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_BRNF, }, // OUTPUT点 { .hook = br_nf_local_out, .owner = THIS_MODULE, .pf = PF_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_FIRST, }, // POSTROUTING点 { .hook = br_nf_post_routing, .owner = THIS_MODULE, .pf = PF_BRIDGE, .hooknum = NF_BR_POST_ROUTING, .priority = NF_BR_PRI_LAST, }, // 后面是PF_INET/PF_INET6的挂接点, 其实也没进行什么数据包操作, // 就是自身的输入输出包不通过桥处理,要短路掉 { .hook = ip_sabotage_in, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_IP_PRE_ROUTING, .priority = NF_IP_PRI_FIRST, }, { .hook = ip_sabotage_in, .owner = THIS_MODULE, .pf = PF_INET6, .hooknum = NF_IP6_PRE_ROUTING, .priority = NF_IP6_PRI_FIRST, }, { .hook = ip_sabotage_out, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_IP_FORWARD, .priority = NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD, }, { .hook = ip_sabotage_out, .owner = THIS_MODULE, .pf = PF_INET6, .hooknum = NF_IP6_FORWARD, .priority = NF_IP6_PRI_BRIDGE_SABOTAGE_FORWARD, }, { .hook = ip_sabotage_out, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_IP_LOCAL_OUT, .priority = NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT, }, { .hook = ip_sabotage_out, .owner = THIS_MODULE, .pf = PF_INET6, .hooknum = NF_IP6_LOCAL_OUT, .priority = NF_IP6_PRI_BRIDGE_SABOTAGE_LOCAL_OUT, }, { .hook = ip_sabotage_out, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_IP_POST_ROUTING, .priority = NF_IP_PRI_FIRST, }, { .hook = ip_sabotage_out, .owner = THIS_MODULE, .pf = PF_INET6, .hooknum = 
NF_IP6_POST_ROUTING, .priority = NF_IP6_PRI_FIRST, }, }; // PF_BRIDGE的PREROUTING点处理函数 static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { ...... // 此处继续调用PF_INET族的PREROUTING点的hook处理 NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL, br_nf_pre_routing_finish); return NF_STOLEN; inhdr_error: // IP_INC_STATS_BH(IpInHdrErrors); out: return NF_DROP; } // PF_BRIDGE的FORWARD点处理 static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { ...... // 此处继续调用PF_INET/PF_INET6族的FORWARD点的hook处理 NF_HOOK(pf, NF_IP_FORWARD, skb, bridge_parent(in), parent, br_nf_forward_finish); return NF_STOLEN; } // PF_BRIDGE的OUTPUT点处理 static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { ...... /* IP forwarded traffic has a physindev, locally * generated traffic hasn't. */ if (realindev != NULL) { if (!(nf_bridge->mask & BRNF_DONT_TAKE_PARENT)) { struct net_device *parent = bridge_parent(realindev); if (parent) realindev = parent; } // 此处继续调用PF_INET/PF_INET6族的FORWARD点的hook处理, 不过优先权值要在 // NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1以上 NF_HOOK_THRESH(pf, NF_IP_FORWARD, skb, realindev, realoutdev, br_nf_local_out_finish, NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1); } else { // 此处继续调用PF_INET/PF_INET6族的OUTPUT(LOCAL_OUT)点的hook处理, 不过优先权值要在 // NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1以上 NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev, realoutdev, br_nf_local_out_finish, NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1); } out: return NF_STOLEN; } // PF_BRIDGE的POSTROUTING点 static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { ...... 
// 此处继续调用PF_INET/PF_INET6族的POSTROUTING点的hook处理 NF_HOOK(pf, NF_IP_POST_ROUTING, skb, NULL, realoutdev, br_nf_dev_queue_xmit); return NF_STOLEN; #ifdef CONFIG_NETFILTER_DEBUG print_error: if (skb->dev != NULL) { printk("[%s]", skb->dev->name); if (realoutdev) printk("[%s]", realoutdev->name); } printk(" head:%p, raw:%p, data:%p\n", skb->head, skb->mac.raw, skb->data); dump_stack(); return NF_ACCEPT; #endif } 由此可见, PF_INET的各个hook点也被PF_BRIDGE的各个hook点调用,因此可以在桥网卡中进行过滤或NAT等操作。 4. 结论 BRIDGE的数据处理流程是一个独立的处理过程, 如果处理正常的话就不再返回到其他协议处理。 在桥的处理层次也和IP协议一样,可以挂接多个PF_BRIDGE的挂接点,这些挂接点中又调用了PF_INET族的挂接点,从而实现了桥下的过滤、NAT等功能。