linux inet_init

1. 前言


2. 流程图

2.1 Tcp/Ip 协议层


2.2 代码内部分层结构


3. inet_init源码分析

static int __init inet_init(void)
{
	struct inet_protosw *q;
	struct list_head *r;
	int rc = -EINVAL;

	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));

	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
	if (!sysctl_local_reserved_ports)
		goto out;

	//tcp协议注册(传输层)
	rc = proto_register(&tcp_prot, 1); //第二个参数为1,表示在高速缓存内部分配空间
	if (rc)
		goto out_free_reserved_ports;

	//udp协议注册(传输层)
	rc = proto_register(&udp_prot, 1);
	if (rc)
		goto out_unregister_tcp_proto;

	//raw原始协议注册(传输层)
	rc = proto_register(&raw_prot, 1);
	if (rc)
		goto out_unregister_udp_proto;

	//icmp协议注册(传输层)
	rc = proto_register(&ping_prot, 1);
	if (rc)
		goto out_unregister_raw_proto;

	/*
	 *	Tell SOCKET that we are alive...
	 */

	(void)sock_register(&inet_family_ops);

#ifdef CONFIG_SYSCTL
	ip_static_sysctl_init();
#endif

	tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;

	/*
	 *	Add all the base protocols.
	 */

	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		pr_crit("%s: Cannot add UDP protocol\n", __func__);
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
		pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif

	/* Register the socket-side information for inet_create. */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

	/*
	 *	Set the ARP module up
	 */

	arp_init();

	/*
	 *	Set the IP module up
	 */

	ip_init();

	tcp_v4_init();

	/* Setup TCP slab cache for open requests. */
	tcp_init();

	/* Setup UDP memory threshold */
	udp_init();

	/* Add UDP-Lite (RFC 3828) */
	udplite4_register();

	ping_init();

	/*
	 *	Set the ICMP layer up
	 */

	if (icmp_init() < 0)
		panic("Failed to create the ICMP control socket.\n");

	/*
	 *	Initialise the multicast router
	 */
#if defined(CONFIG_IP_MROUTE)
	if (ip_mr_init())
		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
	/*
	 *	Initialise per-cpu ipv4 mibs
	 */

	if (init_ipv4_mibs())
		pr_crit("%s: Cannot init ipv4 mibs\n", __func__);

	ipv4_proc_init();

	ipfrag_init();

	dev_add_pack(&ip_packet_type);

	rc = 0;
out:
	return rc;
out_unregister_raw_proto:
	proto_unregister(&raw_prot);
out_unregister_udp_proto:
	proto_unregister(&udp_prot);
out_unregister_tcp_proto:
	proto_unregister(&tcp_prot);
out_free_reserved_ports:
	kfree(sysctl_local_reserved_ports);
	goto out;
}

fs_initcall(inet_init);
先贴出源码,下面逐个分析,为了了解架构,先分析它所涉及到的结构体!

3.  结构体

3.1 struct proto(传输层到网络层协议)

该结构体的操作集是“传输层”到“网络层”的接口

struct proto {
	void			(*close)(struct sock *sk,
					long timeout);
	int			(*connect)(struct sock *sk,
					struct sockaddr *uaddr,
					int addr_len);
	int			(*disconnect)(struct sock *sk, int flags);

	struct sock *		(*accept)(struct sock *sk, int flags, int *err);

	int			(*ioctl)(struct sock *sk, int cmd,
					 unsigned long arg);
	int			(*init)(struct sock *sk);
	void			(*destroy)(struct sock *sk);
	void			(*shutdown)(struct sock *sk, int how);
	int			(*setsockopt)(struct sock *sk, int level,
					int optname, char __user *optval,
					unsigned int optlen);
	int			(*getsockopt)(struct sock *sk, int level,
					int optname, char __user *optval,
					int __user *option);
#ifdef CONFIG_COMPAT
	int			(*compat_setsockopt)(struct sock *sk,
					int level,
					int optname, char __user *optval,
					unsigned int optlen);
	int			(*compat_getsockopt)(struct sock *sk,
					int level,
					int optname, char __user *optval,
					int __user *option);
	int			(*compat_ioctl)(struct sock *sk,
					unsigned int cmd, unsigned long arg);
#endif
	int			(*sendmsg)(struct kiocb *iocb, struct sock *sk,
					   struct msghdr *msg, size_t len);
	int			(*recvmsg)(struct kiocb *iocb, struct sock *sk,
					   struct msghdr *msg,
					   size_t len, int noblock, int flags,
					   int *addr_len);
	int			(*sendpage)(struct sock *sk, struct page *page,
					int offset, size_t size, int flags);
	int			(*bind)(struct sock *sk,
					struct sockaddr *uaddr, int addr_len);

	int			(*backlog_rcv) (struct sock *sk,
						struct sk_buff *skb);

	void		(*release_cb)(struct sock *sk);
	void		(*mtu_reduced)(struct sock *sk);

	/* Keeping track of sk's, looking them up, and port selection methods. */
	void			(*hash)(struct sock *sk);
	void			(*unhash)(struct sock *sk);
	void			(*rehash)(struct sock *sk);
	int			(*get_port)(struct sock *sk, unsigned short snum);
	void			(*clear_sk)(struct sock *sk, int size);

	/* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
	unsigned int		inuse_idx; //表示在proto_inuse_idx数组中占用的一个bit位索引
#endif

	/* Memory pressure */
	void			(*enter_memory_pressure)(struct sock *sk);
	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
	/*
	 * Pressure flag: try to collapse.
	 * Technical note: it is used by multiple contexts non atomically.
	 * All the __sk_mem_schedule() is of this nature: accounting
	 * is strict, actions are advisory and have some latency.
	 */
	int			*memory_pressure;
	long			*sysctl_mem;
	int			*sysctl_wmem;
	int			*sysctl_rmem;
	int			max_header;
	bool			no_autobind;

	struct kmem_cache	*slab;
	unsigned int		obj_size;
	int			slab_flags;

	struct percpu_counter	*orphan_count;

	struct request_sock_ops	*rsk_prot;
	struct timewait_sock_ops *twsk_prot;

	union {
		struct inet_hashinfo	*hashinfo;
		struct udp_table	*udp_table;
		struct raw_hashinfo	*raw_hash;
	} h;

	struct module		*owner;

	char			name[32];

	struct list_head	node;
#ifdef SOCK_REFCNT_DEBUG
	atomic_t		socks;
#endif
#ifdef CONFIG_MEMCG_KMEM
	/*
	 * cgroup specific init/deinit functions. Called once for all
	 * protocols that implement it, from cgroups populate function.
	 * This function has to setup any files the protocol want to
	 * appear in the kmem cgroup filesystem.
	 */
	int			(*init_cgroup)(struct mem_cgroup *memcg,
					       struct cgroup_subsys *ss);
	void			(*destroy_cgroup)(struct mem_cgroup *memcg);
	struct cg_proto		*(*proto_cgroup)(struct mem_cgroup *memcg);
#endif
};

对应的源码:

这里以tcp_prot为例:

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.mtu_reduced		= tcp_v4_mtu_reduced,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
	//tcp协议注册(传输层)
	rc = proto_register(&tcp_prot, 1); //第二个参数为1,表示在高速缓存内部分配空间
	if (rc)
		goto out_free_reserved_ports;

	//udp协议注册(传输层)
	rc = proto_register(&udp_prot, 1);
	if (rc)
		goto out_unregister_tcp_proto;

	//raw原始协议注册(传输层)
	rc = proto_register(&raw_prot, 1);
	if (rc)
		goto out_unregister_udp_proto;

	//icmp协议注册(传输层)
	rc = proto_register(&ping_prot, 1);
	if (rc)
		goto out_unregister_raw_proto;

上面接口分别注册了“传输层”-->“网络层”的不同协议(tcp_prot, udp_prot, raw_prot, ping_prot)的接口操作集,它们都是通过以下接口进行协议注册:

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		//请求sock操作集
		if (prot->rsk_prot != NULL) {
			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				pr_crit("%s: Can't create request sock SLAB cache!\n",
					prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		//等待sock操作集
		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list); //添加到全局链表proto_list中
	assign_proto_idx(prot); //分配一个协议idx
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	if (prot->rsk_prot)
		kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);

最终这些“传输层”-->“网络层”协议(tcp_prot, udp_prot, raw_prot, ping_prot)都注册到了全局链表“proto_list”中

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list); //添加到全局链表proto_list中
	assign_proto_idx(prot); //分配一个协议idx
	mutex_unlock(&proto_list_mutex);

3.2 struct net_proto_family (BSD表示层)

该结构体表示协议族AF_INET, AF_UNIX

struct net_proto_family {
	int		family; //协议族 AF_INET

	//应用层通过socket函数创建socket套接字时调用的接口
	int		(*create)(struct net *net, struct socket *sock, 
				  int protocol, int kern);
	struct module	*owner;
};
static const struct net_proto_family inet_family_ops = {
	.family = PF_INET, //协议族
	.create = inet_create, //应用层创建socket套接字时调用的接口,该函数见下面分析
	.owner	= THIS_MODULE,
};
	(void)sock_register(&inet_family_ops); //sock套接字协议族注册
int sock_register(const struct net_proto_family *ops)
{
	int err;

	if (ops->family >= NPROTO) {
		printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
		       NPROTO);
		return -ENOBUFS;
	}

	spin_lock(&net_family_lock);

	//net_families[]是一个数组,数组成员字段对应不同的协议族
	if (rcu_dereference_protected(net_families[ops->family], //
				      lockdep_is_held(&net_family_lock)))
		err = -EEXIST;
	else {
		rcu_assign_pointer(net_families[ops->family], ops);
		err = 0;
	}
	spin_unlock(&net_family_lock);

	printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
	return err;
}
EXPORT_SYMBOL(sock_register);

这个协议最终会将协议族添加到全局数组net_families[]中。

最终输出如下信息:

printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);

3.3 struct net_protocol(inet层和网络层之间的连接协议)

struct net_protocol {
	void			(*early_demux)(struct sk_buff *skb);
	int			(*handler)(struct sk_buff *skb);
	void			(*err_handler)(struct sk_buff *skb, u32 info);
	unsigned int		no_policy:1,
				netns_ok:1;
};

这个结构体比较简单,但他是inet层和网络层(IP层)之间的连接,所以也很重要。

1. 第一个和第二个参数是查找基于包多路径选路,对于需要打了转包的设备,还是需要关闭这个功能,可以查看这里了解它
2. handler表示对应协议包的收包处理函数,当收到一个包的时候,在IP层将会判断这个包的协议,然后根据协议类型调用该结构中的收包函数,进而将包传给传输层处理

3. netns_ok表示是否支持虚拟网络? namespace网络命名空间?

下面列以下该对象的一些实例:

static const struct net_protocol icmp_protocol = {
	.handler =	icmp_rcv,
	.err_handler =	icmp_err,
	.no_policy =	1,
	.netns_ok =	1,
};
static const struct net_protocol udp_protocol = {
	.handler =	udp_rcv,
	.err_handler =	udp_err,
	.no_policy =	1,
	.netns_ok =	1,
};
static const struct net_protocol tcp_protocol = {
	.early_demux	=	tcp_v4_early_demux,
	.handler	=	tcp_v4_rcv,
	.err_handler	=	tcp_v4_err,
	.no_policy	=	1,
	.netns_ok	=	1,
};
static const struct net_protocol igmp_protocol = {
	.handler =	igmp_rcv,
	.netns_ok =	1,
};
	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		pr_crit("%s: Cannot add UDP protocol\n", __func__);
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
		pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	if (!prot->netns_ok) {
		pr_err("Protocol %u is not namespace aware, cannot register.\n",
			protocol);
		return -EINVAL;
	}

	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
			NULL, prot) ? 0 : -1;
}
EXPORT_SYMBOL(inet_add_protocol);

这些实例在inet_init函数中,通过以下代码将不同的协议接收函数添加到inet_protos[protocol] 全局链表中,从而完成IP层和传输层的链接。

3.4 struct inet_protosw

/* This is used to register socket interfaces for IP protocols.  */
struct inet_protosw {
	struct list_head list;

        /* These two fields form the lookup key.  */
	/*type和protocol两个域组成了socket的查找key,他们分别对应着应用
	层socket函数的第二和第三个参数,通过这两个参数来确定使用的是哪种
	socket类型。查找的时候主要是查找type,protocol只是用来防止初始化
	好的协议被再次初始化,比如开机的时候已经初始化好了TCP协议,如果
	应用层又调用了该协议的初始化函数,将直接退出。*/
	unsigned short	 type;	   /* This is the 2nd argument to socket(2). */
	unsigned short	 protocol; /* This is the L4 protocol number.  */

	/*表示的是该协议相关的处理函数,比如tcp_v4_connect和tcp_v4_init_sock等
	相关的协议具体处理函数.
	对应传输层到网络层协议*/
	struct proto	 *prot;

	//不同套接字类型的套接字管理函数集,也就是socket函数的第二个参数指
	//定的type,不同的套接字类型有不同的管理函数集
	const struct proto_ops *ops;
  
	char             no_check;   /* checksum on rcv/xmit/none? */
	unsigned char	 flags;      /* See INET_PROTOSW_* below.  */
};

       前面一直疑惑inet_protosw中的sw表示什么意思,后来才知道,原来是switch的缩写,表示inet层的协议切换,inetsw串联着PF_INET地址族所支持的协议类型,比如tcp, udp等。
这个函数起着承上启下的作用,它上层对应的是BSD层,下层对应的是inet层,它在网络协议栈的最上两层扮演着重要的角色
这个结构体每个注释已经比较清楚了,但是这里还是再讲一下:
1. type和protocol两个域组成了socket的查找key,他们分别对应着应用层socket函数的第二和第三个参数,通过这两个参数来确定使用的是哪种socket类型。查找的时候主要是查找type,protocol只是用来防止初始化好的协议被再次初始化,比如开机的时候已经初始化好了TCP协议,如果应用层又调用了该协议的初始化函数,将直接退出。
2. prot表示的是该协议相关的处理函数,比如tcp_v4_connect和tcp_v4_init_sock等相关的协议具体处理函数
3. ops表示的是该socket类型的套接字的管理函数,比如处理inet_bind和inet_accept等对inet层的套接字调用
prot和ops都是操作函数的集合,非常重要,我们会在后面进一步分析

static struct inet_protosw inetsw_array[] =
{
	{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.no_check =   0,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},

	{
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.no_check =   UDP_CSUM_DEFAULT,
		.flags =      INET_PROTOSW_PERMANENT,
       },

       {
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_ICMP,
		.prot =       &ping_prot,
		.ops =        &inet_dgram_ops,
		.no_check =   UDP_CSUM_DEFAULT,
		.flags =      INET_PROTOSW_REUSE,
       },

       {
	       .type =       SOCK_RAW,
	       .protocol =   IPPROTO_IP,	/* wild card */
	       .prot =       &raw_prot,
	       .ops =        &inet_sockraw_ops,
	       .no_check =   UDP_CSUM_DEFAULT,
	       .flags =      INET_PROTOSW_REUSE,
       }
};

#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
	/* Register the socket-side information for inet_create. */
	//初始化inet switch 链表
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	//遍历inet switch数组
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);
void inet_register_protosw(struct inet_protosw *p)
{
	struct list_head *lh;
	struct inet_protosw *answer;
	int protocol = p->protocol;
	struct list_head *last_perm;

	spin_lock_bh(&inetsw_lock);

	if (p->type >= SOCK_MAX)
		goto out_illegal;

	/* If we are trying to override a permanent protocol, bail. */
	answer = NULL;
	last_perm = &inetsw[p->type];
	list_for_each(lh, &inetsw[p->type]) {
		answer = list_entry(lh, struct inet_protosw, list);

		/* Check only the non-wild match. */
		if (INET_PROTOSW_PERMANENT & answer->flags) {
			if (protocol == answer->protocol)
				break;
			last_perm = lh;
		}

		answer = NULL;
	}
	if (answer)
		goto out_permanent;

	/* Add the new entry after the last permanent entry if any, so that
	 * the new entry does not override a permanent entry when matched with
	 * a wild-card protocol. But it is allowed to override any existing
	 * non-permanent entry.  This means that when we remove this entry, the
	 * system automatically returns to the old behavior.
	 */
	list_add_rcu(&p->list, last_perm);
out:
	spin_unlock_bh(&inetsw_lock);

	return;

out_permanent:
	pr_err("Attempt to override permanent protocol %d\n", protocol);
	goto out;

out_illegal:
	pr_err("Ignoring attempt to register invalid socket type %d\n",
	       p->type);
	goto out;
}
EXPORT_SYMBOL(inet_register_protosw);
最红会将该协议注册到 inetsw[p->type]全局链表中。








猜你喜欢

转载自blog.csdn.net/chenliang0224/article/details/80330756
今日推荐