IPv4输入数据包的处理过程

输入接口定义

/*
 *	IP protocol layer initialiser
 *
 *	Packet-type descriptor for IPv4: it ties the ETH_P_IP type to the
 *	IP receive routine ip_rcv() and to the inet GSO callbacks.
 */
static struct packet_type ip_packet_type = {
	/* Packet type handled by the IP layer is ETH_P_IP; when the device
	 * interface layer receives a packet of this type it hands it to the
	 * IP layer. */
	.type = __constant_htons(ETH_P_IP),
	/* The device interface layer passes the packet to the IP layer by
	 * calling ip_rcv(). */
	.func = ip_rcv,
	/* GSO callbacks for this packet type. */
	.gso_send_check = inet_gso_send_check,
	.gso_segment = inet_gso_segment,
};

该结构在inet_init()初始化时被注册给设备接口层:

/*
 * Excerpt of inet_init(): only the call relevant to packet input is shown;
 * the "..." lines stand for code left out of this listing.
 */
static int __init inet_init(void)
{
	...
	/* Register the IPv4 packet type with the device interface layer. */
	dev_add_pack(&ip_packet_type);
	...
}

输入数据包的IP层入口

/*
 * 	Main IP Receive routine.
 */
@skb: 数据包
@dev:数据包的当前输入网络设备(层二可能会使用一些聚合技术)
@pt:数据包的类型
@orig_dev: 接收数据包的原始网络设备
/*
 * ip_rcv - IP-layer entry point for received packets.
 *
 * @skb:      the received packet
 * @dev:      current input network device of the packet
 * @pt:       packet type (not referenced in the body)
 * @orig_dev: original device the packet was received on (not referenced
 *            in the body)
 *
 * Validates the IPv4 header (minimum length, version, header checksum,
 * total length), trims the buffer to the length announced in the header,
 * clears the IP control block and hands the packet to the
 * NF_INET_PRE_ROUTING hook with ip_rcv_finish() as the continuation.
 *
 * Returns the NF_HOOK() result on the normal path, NET_RX_DROP otherwise.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct iphdr *iph;
	u32 len;

	/* Packets from a device outside init_net are dropped. */
	if (dev->nd_net != &init_net)
		goto drop;

	/* When the interface is in promisc. mode, drop all the crap
	 * that it receives, do not try to analyse it.
	 */
	/* In promiscuous mode, packets addressed to other hosts may reach
	 * this point; ignore them. */
	if (skb->pkt_type == PACKET_OTHERHOST)
		goto drop;

	IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
	/* The SKB descriptor may be modified below, so if it is shared
	 * (users != 1) a clone is obtained and processing continues on the
	 * clone. On failure skb is NULL, hence the jump to "out", which
	 * skips kfree_skb(). */
	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
		goto out;
	}
	/* Make sure the linear area holds at least a basic IP header. */
	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;
	/* iph points at the IP header. */
	iph = ip_hdr(skb);

	/*
	 *	RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
	 *
	 *	Is the datagram acceptable?
	 *
	 *	1.	Length at least the size of an ip header
	 *	2.	Version of 4
	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 *	4.	Doesn't have a bogus length
	 */
	/* Checks 1 & 2: header length field and IP version. */
	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;
	/* Pull again: the header may carry options, and iph->ihl*4 is the
	 * real header length in bytes. */
	if (!pskb_may_pull(skb, iph->ihl*4))
		goto inhdr_error;
	/* The pull may have changed the SKB's internal pointers, so reload
	 * the header pointer. */
	iph = ip_hdr(skb);
	/* Check 3: a non-zero ip_fast_csum() result is treated as a header
	 * error. */
	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto inhdr_error;
	
	/* Check 4: the total length must fit in the buffer (otherwise the
	 * packet is counted as truncated) and must not be smaller than the
	 * header length. */
	len = ntohs(iph->tot_len);
	if (skb->len < len) {
		IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))
		goto inhdr_error;

	/* Our transport medium may have padded the buffer out. Now we know it
	 * is IP we can trim to the true length of the frame.
	 * Note this now means skb->len holds ntohs(iph->tot_len).
	 */
	/* As noted above, layer 2 may have padded the IP packet; now that the
	 * total length is known, trim the SKB to it. NOTE(review): the
	 * "_rcsum" variant presumably also fixes up the receive checksum
	 * state - confirm against the helper. */
	if (pskb_trim_rcsum(skb, len)) {
		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Remove any debris in the socket control block */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
	/* Enter the PREROUTING hook; if the packet passes, it is handed to
	 * ip_rcv_finish() for further processing. */
	return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
		       ip_rcv_finish);

	/* Error exits fall through: header errors are counted, then the
	 * packet is freed, then NET_RX_DROP is returned. */
inhdr_error:
	IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
	kfree_skb(skb);
out:
	return NET_RX_DROP;
}

PREROUTING后的处理ip_rcv_finish()

/*
 * ip_rcv_finish - continuation of ip_rcv() after the PREROUTING hook.
 *
 * Resolves the input route if the packet has none, processes IP options
 * when the header is longer than 20 bytes, updates multicast/broadcast
 * counters and dispatches the packet through dst_input().
 *
 * Returns the dst_input() result, or NET_RX_DROP after freeing the packet
 * when routing or option processing fails.
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct rtable *rt;

	/*
	 *	Initialise the virtual path cache for the packet. It describes
	 *	how the packet travels inside Linux networking.
	 */
	/* If the packet has no destination route yet, look it up in the
	 * routing table. skb->dst is dereferenced below, so a successful
	 * lookup is expected to have set it. */
	if (skb->dst == NULL) {
		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
					 skb->dev);
		if (unlikely(err)) {
			/* Only these two errors are counted; any lookup
			 * failure drops the packet. */
			if (err == -EHOSTUNREACH)
				IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
			else if (err == -ENETUNREACH)
				IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
			goto drop;
		}
	}

#ifdef CONFIG_NET_CLS_ROUTE
	/* Route accounting on this CPU's table: the low byte of tclassid
	 * indexes the output counters, bits 16-23 index the input counters. */
	if (unlikely(skb->dst->tclassid)) {
		struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
		u32 idx = skb->dst->tclassid;
		st[idx&0xFF].o_packets++;
		st[idx&0xFF].o_bytes+=skb->len;
		st[(idx>>16)&0xFF].i_packets++;
		st[(idx>>16)&0xFF].i_bytes+=skb->len;
	}
#endif

	/* If the packet carries IP options (header longer than 5 words),
	 * parse them; a non-zero result drops the packet. */
	if (iph->ihl > 5 && ip_rcv_options(skb))
		goto drop;

	/* Update multicast / broadcast statistics according to the route
	 * type of the destination. */
	rt = (struct rtable*)skb->dst;
	if (rt->rt_type == RTN_MULTICAST)
		IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
	else if (rt->rt_type == RTN_BROADCAST)
		IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);

	/* Dispatch according to the destination route: deliver upwards or
	 * forward. */
	return dst_input(skb);

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}
/*
 * Input packet from network to transport.
 *
 * Invokes the input() handler of the destination entry attached to the
 * packet and returns its result. The handler is invoked again only while
 * it keeps returning NET_XMIT_BYPASS; zero or any other value ends the
 * loop and is returned to the caller.
 */
static inline int dst_input(struct sk_buff *skb)
{
	int rc;

	do {
		/* skb->dst is the destination entry attached to the packet
		 * before this point (see the routing step in
		 * ip_rcv_finish()). */
		rc = skb->dst->input(skb);
	} while (unlikely(rc != 0) && rc == NET_XMIT_BYPASS);

	return rc;
}

这里,数据包被分为两种情况,一种是输入本机的,另外一种是转发的,它们对应的input函数实际上分别为ip_local_deliver()和ip_forward(),具体使用哪一个是由前面的输入路由查询决定的。

数据包输入到本地ip_local_deliver()

/*
 * 	Deliver IP Packets to the higher protocol layers.
 *
 *	A packet whose header has the MF flag or a non-zero fragment offset
 *	is first handed to ip_defrag(); when that returns non-zero this
 *	function returns 0 without going further. Otherwise the packet
 *	enters the NF_INET_LOCAL_IN hook with ip_local_deliver_finish() as
 *	the continuation, and the hook result is returned.
 */
int ip_local_deliver(struct sk_buff *skb)
{
	/* Fragment: try reassembly; a non-zero result ends processing here. */
	if ((ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) &&
	    ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
		return 0;

	/* Whole datagram: run the LOCAL_IN hook, then finish delivery. */
	return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}

这里我们假设所有数据包都可以通过LOCAL_IN,继续看后面的处理。

/*
 * ip_local_deliver_finish - continuation of ip_local_deliver() after the
 * LOCAL_IN hook.
 *
 * Strips the IP header, looks up the upper-layer protocol registered in
 * inet_protos[] for the header's protocol field and calls its handler.
 * When no handler is registered the packet is freed here, after sending
 * an ICMP protocol-unreachable unless raw_local_deliver() reported a
 * non-zero result. Always returns 0.
 */
static int ip_local_deliver_finish(struct sk_buff *skb)
{
	/* Remove the IP header from the packet data. */
	__skb_pull(skb, ip_hdrlen(skb));

	/* Point into the IP datagram, just past the header. */
	/* Make skb->transport_header point at the current start of data. */
	skb_reset_transport_header(skb);

	rcu_read_lock();
	{
		/* Protocol field of the IP header; it selects the
		 * upper-layer protocol. */
		int protocol = ip_hdr(skb)->protocol;
		int hash, raw;
		struct net_protocol *ipprot;

	resubmit:
		/* RAW socket delivery (not covered here); the result is only
		 * consulted when no protocol handler is found. */
		raw = raw_local_deliver(skb, protocol);
		/* Compute the index into inet_protos[]. */
		hash = protocol & (MAX_INET_PROTOS - 1);
		/* Look up the receive callbacks of the upper-layer protocol;
		 * upper-layer protocols register them in this array when the
		 * protocol family is initialised. */
		if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
			int ret;
			/* IPsec policy check (not covered here): on failure
			 * the packet is freed and processing stops. */
			if (!ipprot->no_policy) {
				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					kfree_skb(skb);
					goto out;
				}
				nf_reset(skb);
			}
			/* Call the transport-layer handler; for TCP this is
			 * tcp_v4_rcv(). */
			ret = ipprot->handler(skb);
			/* A negative return requests another pass: -ret
			 * becomes the protocol number and delivery restarts
			 * at "resubmit". */
			if (ret < 0) {
				protocol = -ret;
				goto resubmit;
			}
			IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
		} else {
			/* No handler registered. If raw delivery reported
			 * nothing either and the policy check passes, count
			 * an unknown protocol and send ICMP protocol
			 * unreachable; if raw is non-zero, count the packet
			 * as delivered. */
			if (!raw) {
				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
					icmp_send(skb, ICMP_DEST_UNREACH,
						  ICMP_PROT_UNREACH, 0);
				}
			} else
				IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
			/* Without an upper-layer handler the packet is
			 * discarded here. */
			kfree_skb(skb);
		}
	}
 out:
	rcu_read_unlock();

	return 0;
}

数据包的转发ip_forward()

/*
 * ip_forward - forwarding path for a packet whose route is not local.
 *
 * Performs the pre-forwarding checks (IPsec policy, router alert, packet
 * type, TTL, strict source route, MTU/DF), makes the header writable,
 * decrements the TTL, optionally emits an ICMP redirect, and passes the
 * packet to the NF_INET_FORWARD hook with ip_forward_finish() as the
 * continuation.
 *
 * Returns the NF_HOOK() result on the normal path, NET_RX_SUCCESS when
 * the router-alert chain took the packet, and NET_RX_DROP after freeing
 * the packet on any failure.
 */
int ip_forward(struct sk_buff *skb)
{
	struct iphdr *iph;	/* Our header */
	struct rtable *rt;	/* Route we use */
	struct ip_options * opt	= &(IPCB(skb)->opt);
	/* IPsec policy check (not covered here). */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
		goto drop;
	/* If the router-alert option is present and ip_call_ra_chain()
	 * reports the packet as handled, return without forwarding it. */
	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
		return NET_RX_SUCCESS;

	/* Make sure the packet really was sent to us for forwarding. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;
	/* Forwarding modifies IP header fields. NOTE(review): the original
	 * notes say this sets the checksum state to CHECKSUM_NONE - confirm
	 * against skb_forward_csum(). */
	skb_forward_csum(skb);

	/*
	 *	According to the RFC, we must first decrease the TTL field. If
	 *	that reaches zero, we must reply an ICMP control message telling
	 *	that the packet's lifetime expired.
	 */
	/* If the TTL is already 1 or less, reply to the sender with an ICMP
	 * time-exceeded message and drop the packet. */
	if (ip_hdr(skb)->ttl <= 1)
		goto too_many_hops;
	/* IPsec related (not covered here). */
	if (!xfrm4_route_forward(skb))
		goto drop;

	/* Strict source route check: the route destination must equal the
	 * route gateway. */
	rt = (struct rtable*)skb->dst;
	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto sr_failed;
	/* Fragmentation check: a non-GSO packet larger than the route MTU
	 * with DF set (and local_df clear) cannot be forwarded; report
	 * "fragmentation needed" with the MTU and drop it. */
	if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) &&
		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(dst_mtu(&rt->u.dst)));
		goto drop;
	}

	/* We are about to mangle packet. Copy it! */
	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
		goto drop;
	/* Reload the header pointer after skb_cow(). */
	iph = ip_hdr(skb);

	/* Decrease ttl after skb cow done */
	/* Decrement the TTL. */
	ip_decrease_ttl(iph);

	/*
	 *	We now generate an ICMP HOST REDIRECT giving the route
	 *	we calculated.
	 */
	/* Redirect handling: only when the route is flagged
	 * RTCF_DOREDIRECT, no source-route option (srr) is present and
	 * skb->sp is not set. */
	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp)
		ip_rt_send_redirect(skb);

	/* Derive the packet priority from the TOS field. */
	skb->priority = rt_tos2priority(iph->tos);

	/* Enter the FORWARD hook; if the packet passes, ip_forward_finish()
	 * completes the forwarding. */
	return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev,
		       ip_forward_finish);

sr_failed:
	/*
	 *	Strict routing permits no gatewaying
	 */
	 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
	 goto drop;

too_many_hops:
	/* Tell the sender its packet died... */
	IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
	icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
	/* falls through to drop */
drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}

ip_forward_finish()

/*
 * ip_forward_finish - last step of forwarding, run after the FORWARD hook.
 *
 * Counts the datagram as forwarded, lets ip_forward_options() process the
 * IP options when the control block records a non-zero option length, and
 * hands the packet to the route's output path. Returns the dst_output()
 * result.
 */
static int ip_forward_finish(struct sk_buff *skb)
{
	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

	/* Options are uncommon; only touch them when some are recorded. */
	if (unlikely(IPCB(skb)->opt.optlen))
		ip_forward_options(skb);

	/* Route output; per the original notes this should resolve to
	 * ip_output() or ip_mc_output(). */
	return dst_output(skb);
}

猜你喜欢

转载自blog.csdn.net/fanxiaoyu321/article/details/83155629