输入接口定义
/*
* IP protocol layer initialiser
*/
/*
 * Packet-type descriptor for IPv4 (ETH_P_IP).
 *
 * inet_init() hands this structure to dev_add_pack(); .func names the
 * receive routine (ip_rcv) that is invoked for frames of .type, and the
 * two gso_* members name the IPv4 GSO callbacks.
 */
static struct packet_type ip_packet_type = {
	/* Receive routine for packets of this type. */
	.func           = ip_rcv,
	/* EtherType this descriptor matches (network byte order constant). */
	.type           = __constant_htons(ETH_P_IP),
	/* GSO callbacks for IPv4. */
	.gso_segment    = inet_gso_segment,
	.gso_send_check = inet_gso_send_check,
};
该结构在inet_init()初始化时被注册给设备接口层:
// Excerpt of inet_init(): only the step that matters for the receive path is
// shown; the "..." lines stand for code omitted by the article.
static int __init inet_init(void)
{
...
// Register the IPv4 packet-type descriptor (ip_packet_type) via dev_add_pack().
dev_add_pack(&ip_packet_type);
...
}
输入数据包的IP层入口
/*
* Main IP Receive routine.
*/
@skb: 数据包
@dev:数据包的当前输入网络设备(层二可能会使用一些聚合技术)
@pt:数据包的类型
@orig_dev: 接收数据包的原始网络设备
/*
 * ip_rcv - IP-layer entry point for received packets.
 *
 * @skb:      the received packet
 * @dev:      device the packet is currently being received on
 * @pt:       packet type (not referenced in this function body)
 * @orig_dev: original receiving device (not referenced in this function body)
 *
 * Validates the IPv4 header (minimum length, version, checksum, total
 * length), trims the buffer to the length announced in the header, clears
 * the IP control block and then passes the packet through the
 * NF_INET_PRE_ROUTING hook with ip_rcv_finish() as the continuation.
 *
 * Returns the NF_HOOK() result on the accept path and NET_RX_DROP on every
 * drop path.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct iphdr *iph;
u32 len;
// Only packets from devices whose nd_net is init_net are processed.
if (dev->nd_net != &init_net)
goto drop;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
// In promiscuous mode, packets addressed to other hosts can reach this point; ignore them.
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
// The skb descriptor may be modified below. Per the original author's note, if the
// descriptor is shared (users != 1) skb_share_check() returns a clone that is used
// from here on. On NULL there is nothing left to free, hence "goto out" (which skips
// kfree_skb) instead of "goto drop".
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
goto out;
}
// Make sure at least sizeof(struct iphdr) bytes are available in the linear data area.
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
// iph points at the IP header.
iph = ip_hdr(skb);
/*
* RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
// Checks 1 & 2: header length (in 32-bit words) and IP version.
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
// Pull again: the header may carry options, so the real header length is iph->ihl*4 bytes.
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
// The skb's internal pointers may have changed, so reload iph.
iph = ip_hdr(skb);
// Check 3: verify the IP header checksum.
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto inhdr_error;
// Check 4: validate the total length field against the buffer and the header length.
len = ntohs(iph->tot_len);
if (skb->len < len) {
// The buffer holds fewer bytes than the header announces: truncated packet.
IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
// As the comment above says, layer two may have padded the IP packet. Now that the
// total length is known, trim the skb to it (per the original author's note,
// pskb_trim_rcsum() also adjusts the checksum state).
if (pskb_trim_rcsum(skb, len)) {
IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
goto drop;
}
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
// Enter the PREROUTING hook; ip_rcv_finish() is passed as the continuation
// for packets that pass the hook.
return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
// Error exits fall through: count header error -> free the skb -> return NET_RX_DROP.
inhdr_error:
IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NET_RX_DROP;
}
PREROUTING后的处理ip_rcv_finish()
/*
 * ip_rcv_finish - continuation of ip_rcv() after the PRE_ROUTING hook.
 *
 * @skb: the packet
 *
 * Ensures the packet has a route (skb->dst), handles IP options when the
 * header is longer than 5 words, updates multicast/broadcast counters and
 * hands the packet to dst_input(). Returns the dst_input() result, or
 * NET_RX_DROP after freeing the skb on failure.
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
// If the packet has no destination route yet, look one up with ip_route_input().
if (skb->dst == NULL) {
int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
skb->dev);
if (unlikely(err)) {
// Map the two recognised lookup errors to their MIB counters; any error drops the packet.
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
goto drop;
}
}
#ifdef CONFIG_NET_CLS_ROUTE
// Route-class accounting: when the route carries a tclassid, bump the current CPU's
// counters. The low byte of tclassid indexes the o_* counters and bits 16-23 index
// the i_* counters.
if (unlikely(skb->dst->tclassid)) {
struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
u32 idx = skb->dst->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes+=skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes+=skb->len;
}
#endif
// If the header carries IP options (ihl > 5), process them; a non-zero result drops the packet.
if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;
// Update the multicast / broadcast input counters according to the route type.
rt = (struct rtable*)skb->dst;
if (rt->rt_type == RTN_MULTICAST)
IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
else if (rt->rt_type == RTN_BROADCAST)
IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
// Continue via the route's input handler (local delivery or forwarding, per the article).
return dst_input(skb);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
/* Input packet from network to transport. */
/*
 * dst_input - run the input handler attached to the packet's route.
 *
 * @skb: the packet; skb->dst must already be set (the article notes it is
 *       filled in by the earlier ip_route_input() lookup).
 *
 * Calls skb->dst->input(skb) and returns its result. The call is repeated
 * only while the handler returns NET_XMIT_BYPASS; 0 or any other value ends
 * the loop and is returned to the caller. skb->dst is re-read on every
 * iteration.
 */
static inline int dst_input(struct sk_buff *skb)
{
	int rc;

	do {
		rc = skb->dst->input(skb);
		/* Common case: handled, nothing more to do. */
		if (likely(rc == 0))
			break;
		/* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
	} while (likely(rc == NET_XMIT_BYPASS));

	return rc;
}
这里,数据包被分为两种情况,一种是输入本机的,另外一种是转发的,它们对应的input函数实际上分别为ip_local_deliver()和ip_forward(),具体使用哪一个是由前面的输入路由查询决定的。
数据包输入到本地ip_local_deliver()
/*
* Deliver IP Packets to the higher protocol layers.
*/
/*
 * ip_local_deliver - hand a locally destined packet to the upper layers.
 *
 * @skb: the packet
 *
 * If the header marks the packet as a fragment (MF flag set or a non-zero
 * fragment offset), ip_defrag() is called first; a non-zero result from it
 * ends processing here with a return value of 0 (per the article, the
 * fragment is then held until the remaining fragments arrive). Otherwise
 * the packet goes through the NF_INET_LOCAL_IN hook with
 * ip_local_deliver_finish() as the continuation, and the hook's result is
 * returned.
 */
int ip_local_deliver(struct sk_buff *skb)
{
	/* Reassemble IP fragments; stop here when reassembly is not complete. */
	if ((ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) &&
	    ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
		return 0;

	/* Run the LOCAL_IN hook, then ip_local_deliver_finish(). */
	return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
这里我们假设所有数据包都可以通过LOCAL_IN,继续看后面的处理。
/*
 * ip_local_deliver_finish - continuation of ip_local_deliver() after the
 * LOCAL_IN hook.
 *
 * @skb: the packet
 *
 * Strips the IP header, then looks up the handler registered in
 * inet_protos[] for the header's protocol number and calls it. When no
 * handler is registered the packet is freed here (after an ICMP protocol
 * unreachable if no raw socket took it and the policy check passes).
 * Always returns 0. The lookup and the handler call run inside an RCU
 * read-side section.
 */
static int ip_local_deliver_finish(struct sk_buff *skb)
{
// Remove the IP header from the packet data.
__skb_pull(skb, ip_hdrlen(skb));
/* Point into the IP datagram, just past the header. */
// Make the transport header point at the current start of the skb data.
skb_reset_transport_header(skb);
rcu_read_lock();
{
// Read the protocol field of the IP header; it selects the upper-layer protocol.
int protocol = ip_hdr(skb)->protocol;
int hash, raw;
struct net_protocol *ipprot;
resubmit:
// Raw-socket delivery (not covered by the article). The result is only used below
// when no protocol handler is registered.
raw = raw_local_deliver(skb, protocol);
// Compute the index into inet_protos[].
hash = protocol & (MAX_INET_PROTOS - 1);
// Look up the receive callback of the upper-layer protocol in inet_protos[]; per the
// article, every upper-layer protocol registers its handler there when the protocol
// family is initialised.
if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
int ret;
// IPsec policy check (not covered by the article); on failure the packet is freed.
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
// Call the transport-layer handler; for TCP this is tcp_v4_rcv().
ret = ipprot->handler(skb);
// A negative return value is a resubmit request: its negation is used as the protocol
// number and the lookup above is run again for the same skb.
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
} else {
// No handler registered for this protocol.
// NOTE(review): raw is presumably non-zero when a raw socket took the packet — confirm
// against raw_local_deliver().
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
} else
IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
// Without a matching upper-layer protocol the packet is discarded here.
kfree_skb(skb);
}
}
out:
rcu_read_unlock();
return 0;
}
数据包的转发ip_forward()
/*
 * ip_forward - forward a packet that is not destined for this host.
 *
 * @skb: the packet; skb->dst is expected to hold the route chosen earlier.
 *
 * Performs the pre-forwarding checks (policy, router alert, packet type,
 * TTL, strict source route, MTU/DF), makes the header writable, decrements
 * the TTL and passes the packet through the NF_INET_FORWARD hook with
 * ip_forward_finish() as the continuation.
 *
 * Returns the NF_HOOK() result on the forward path, NET_RX_SUCCESS when the
 * router-alert chain consumed the packet, and NET_RX_DROP (after freeing
 * the skb) on every drop path.
 */
int ip_forward(struct sk_buff *skb)
{
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options * opt = &(IPCB(skb)->opt);
// IPsec forward-policy check (not covered by the article).
if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
goto drop;
// Router alert option: when ip_call_ra_chain() reports the packet as handled,
// return without forwarding it.
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
// Only forward packets that were addressed to this host at the link layer.
if (skb->pkt_type != PACKET_HOST)
goto drop;
// Forwarding modifies IP header fields; per the original author's note this resets
// the checksum state to CHECKSUM_NONE.
skb_forward_csum(skb);
/*
* According to the RFC, we must first decrease the TTL field. If
* that reaches zero, we must reply an ICMP control message telling
* that the packet's lifetime expired.
*/
// If the TTL is already 1 or less, reply to the sender with an ICMP time-exceeded message.
if (ip_hdr(skb)->ttl <= 1)
goto too_many_hops;
// IPsec related (not covered by the article).
if (!xfrm4_route_forward(skb))
goto drop;
// Strict source route check: the route's destination must equal its gateway.
rt = (struct rtable*)skb->dst;
if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto sr_failed;
// Fragmentation check: the packet exceeds the route MTU, is not GSO, has DF set and
// local_df is not set -> count the failure, send ICMP "fragmentation needed" carrying
// the MTU, and drop.
if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) &&
(ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(dst_mtu(&rt->u.dst)));
goto drop;
}
/* We are about to mangle packet. Copy it! */
if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
goto drop;
// Fetch the header pointer only after skb_cow(), which precedes every header write.
iph = ip_hdr(skb);
/* Decrease ttl after skb cow done */
// Decrement the TTL.
ip_decrease_ttl(iph);
/*
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
*/
// Send a redirect when the route asks for it, no source-route option is present
// and skb->sp is not set.
if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp)
ip_rt_send_redirect(skb);
// Derive the skb priority from the TOS field.
skb->priority = rt_tos2priority(iph->tos);
// Enter the FORWARD hook; ip_forward_finish() is the continuation that completes forwarding.
return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev,
ip_forward_finish);
sr_failed:
/*
* Strict routing permits no gatewaying
*/
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
goto drop;
too_many_hops:
/* Tell the sender its packet died... */
IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
// Falls through to drop.
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
ip_forward_finish()
/*
 * ip_forward_finish - continuation of ip_forward() after the FORWARD hook.
 *
 * @skb: the packet being forwarded
 *
 * Counts the forwarded datagram, lets ip_forward_options() process the IP
 * options when the control block records a non-zero option length, and
 * hands the packet to dst_output(). Returns the dst_output() result.
 */
static int ip_forward_finish(struct sk_buff *skb)
{
	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

	/* Options are rare; only take the slow path when some are present. */
	if (unlikely(IPCB(skb)->opt.optlen))
		ip_forward_options(skb);

	/*
	 * Continue with the route's output routine; the article expects this
	 * to be ip_output() or ip_mc_output().
	 */
	return dst_output(skb);
}