1. 前言
2. 流程图
2.1 Tcp/Ip 协议层
2.2 代码内部分层结构
3. inet_init源码分析
static int __init inet_init(void) { struct inet_protosw *q; struct list_head *r; int rc = -EINVAL; BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); if (!sysctl_local_reserved_ports) goto out; //tcp协议注册(传输层) rc = proto_register(&tcp_prot, 1); //第二个参数为1,表示在高速缓存内部分配空间 if (rc) goto out_free_reserved_ports; //udp协议注册(传输层) rc = proto_register(&udp_prot, 1); if (rc) goto out_unregister_tcp_proto; //raw原始协议注册(传输层) rc = proto_register(&raw_prot, 1); if (rc) goto out_unregister_udp_proto; //icmp协议注册(传输层) rc = proto_register(&ping_prot, 1); if (rc) goto out_unregister_raw_proto; /* * Tell SOCKET that we are alive... */ (void)sock_register(&inet_family_ops); #ifdef CONFIG_SYSCTL ip_static_sysctl_init(); #endif tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem; /* * Add all the base protocols. */ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) pr_crit("%s: Cannot add ICMP protocol\n", __func__); if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) pr_crit("%s: Cannot add UDP protocol\n", __func__); if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) pr_crit("%s: Cannot add TCP protocol\n", __func__); #ifdef CONFIG_IP_MULTICAST if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) pr_crit("%s: Cannot add IGMP protocol\n", __func__); #endif /* Register the socket-side information for inet_create. */ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q); /* * Set the ARP module up */ arp_init(); /* * Set the IP module up */ ip_init(); tcp_v4_init(); /* Setup TCP slab cache for open requests. */ tcp_init(); /* Setup UDP memory threshold */ udp_init(); /* Add UDP-Lite (RFC 3828) */ udplite4_register(); ping_init(); /* * Set the ICMP layer up */ if (icmp_init() < 0) panic("Failed to create the ICMP control socket.\n"); /* * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif /* * Initialise per-cpu ipv4 mibs */ if (init_ipv4_mibs()) pr_crit("%s: Cannot init ipv4 mibs\n", __func__); ipv4_proc_init(); ipfrag_init(); dev_add_pack(&ip_packet_type); rc = 0; out: return rc; out_unregister_raw_proto: proto_unregister(&raw_prot); out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); out_free_reserved_ports: kfree(sysctl_local_reserved_ports); goto out; } fs_initcall(inet_init);先贴出源码,下面逐个分析,为了了解架构,先分析它所涉及到的结构体!
3. 结构体
3.1 struct proto(传输层到网络层协议)
该结构体的操作集是“传输层”到“网络层”的接口
struct proto { void (*close)(struct sock *sk, long timeout); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); struct sock * (*accept)(struct sock *sk, int flags, int *err); int (*ioctl)(struct sock *sk, int cmd, unsigned long arg); int (*init)(struct sock *sk); void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); #ifdef CONFIG_COMPAT int (*compat_setsockopt)(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); int (*compat_getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); int (*compat_ioctl)(struct sock *sk, unsigned int cmd, unsigned long arg); #endif int (*sendmsg)(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); void (*release_cb)(struct sock *sk); void (*mtu_reduced)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ void (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); void (*clear_sk)(struct sock *sk, int size); /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS unsigned int inuse_idx; //表示在proto_inuse_idx数组中占用的一个bit位索引 #endif /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ int *memory_pressure; long *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; int max_header; bool no_autobind; struct kmem_cache *slab; unsigned int obj_size; int slab_flags; struct percpu_counter *orphan_count; struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; union { struct inet_hashinfo *hashinfo; struct udp_table *udp_table; struct raw_hashinfo *raw_hash; } h; struct module *owner; char name[32]; struct list_head node; #ifdef SOCK_REFCNT_DEBUG atomic_t socks; #endif #ifdef CONFIG_MEMCG_KMEM /* * cgroup specific init/deinit functions. Called once for all * protocols that implement it, from cgroups populate function. * This function has to setup any files the protocol want to * appear in the kmem cgroup filesystem. */ int (*init_cgroup)(struct mem_cgroup *memcg, struct cgroup_subsys *ss); void (*destroy_cgroup)(struct mem_cgroup *memcg); struct cg_proto *(*proto_cgroup)(struct mem_cgroup *memcg); #endif };
对应的源码:
这里以tcp_prot为例:
struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, .mtu_reduced = tcp_v4_mtu_reduced, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_DESTROY_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif #ifdef CONFIG_MEMCG_KMEM .init_cgroup = tcp_init_cgroup, .destroy_cgroup = tcp_destroy_cgroup, .proto_cgroup = tcp_proto_cgroup, #endif }; EXPORT_SYMBOL(tcp_prot);
//tcp协议注册(传输层) rc = proto_register(&tcp_prot, 1); //第二个参数为1,表示在高速缓存内部分配空间 if (rc) goto out_free_reserved_ports; //udp协议注册(传输层) rc = proto_register(&udp_prot, 1); if (rc) goto out_unregister_tcp_proto; //raw原始协议注册(传输层) rc = proto_register(&raw_prot, 1); if (rc) goto out_unregister_udp_proto; //icmp协议注册(传输层) rc = proto_register(&ping_prot, 1); if (rc) goto out_unregister_raw_proto;
上面接口分别注册了“传输层”-->“网络层”的不同协议(tcp_prot, udp_prot, raw_prot, ping_prot)的接口操作集,它们都是通过以下接口进行协议注册:
int proto_register(struct proto *prot, int alloc_slab) { if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN | prot->slab_flags, NULL); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); goto out; } //请求sock操作集 if (prot->rsk_prot != NULL) { prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); if (prot->rsk_prot->slab_name == NULL) goto out_free_sock_slab; prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, prot->rsk_prot->obj_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (prot->rsk_prot->slab == NULL) { pr_crit("%s: Can't create request sock SLAB cache!\n", prot->name); goto out_free_request_sock_slab_name; } } //等待sock操作集 if (prot->twsk_prot != NULL) { prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); if (prot->twsk_prot->twsk_slab_name == NULL) goto out_free_request_sock_slab; prot->twsk_prot->twsk_slab = kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, 0, SLAB_HWCACHE_ALIGN | prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; } } mutex_lock(&proto_list_mutex); list_add(&prot->node, &proto_list); //添加到全局链表proto_list中 assign_proto_idx(prot); //分配一个协议idx mutex_unlock(&proto_list_mutex); return 0; out_free_timewait_sock_slab_name: kfree(prot->twsk_prot->twsk_slab_name); out_free_request_sock_slab: if (prot->rsk_prot && prot->rsk_prot->slab) { kmem_cache_destroy(prot->rsk_prot->slab); prot->rsk_prot->slab = NULL; } out_free_request_sock_slab_name: if (prot->rsk_prot) kfree(prot->rsk_prot->slab_name); out_free_sock_slab: kmem_cache_destroy(prot->slab); prot->slab = NULL; out: return -ENOBUFS; } EXPORT_SYMBOL(proto_register);
最终这些“传输层”-->“网络层”协议(tcp_prot, udp_prot, raw_prot, ping_prot)都注册到了全局链表“proto_list”中
mutex_lock(&proto_list_mutex); list_add(&prot->node, &proto_list); //添加到全局链表proto_list中 assign_proto_idx(prot); //分配一个协议idx mutex_unlock(&proto_list_mutex);
3.2 struct net_proto_family (BSD表示层)
该结构体表示协议族AF_INET, AF_UNIX
struct net_proto_family { int family; //协议族 AF_INET //应用层通过socket函数创建socket套接字时调用的接口 int (*create)(struct net *net, struct socket *sock, int protocol, int kern); struct module *owner; };
static const struct net_proto_family inet_family_ops = { .family = PF_INET, //协议族 .create = inet_create, //应用层创建socket套接字时调用的接口,该函数见下面分析 .owner = THIS_MODULE, };
(void)sock_register(&inet_family_ops); //sock套接字协议族注册
int sock_register(const struct net_proto_family *ops) { int err; if (ops->family >= NPROTO) { printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); return -ENOBUFS; } spin_lock(&net_family_lock); //net_families[]是一个数组,数组成员字段对应不同的协议族 if (rcu_dereference_protected(net_families[ops->family], // lockdep_is_held(&net_family_lock))) err = -EEXIST; else { rcu_assign_pointer(net_families[ops->family], ops); err = 0; } spin_unlock(&net_family_lock); printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family); return err; } EXPORT_SYMBOL(sock_register);
这个协议最终会将协议族添加到全局数组net_families[]中。
最终输出如下信息:
printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
3.3 struct net_protocol(inet层和网络层之间的连接协议)
struct net_protocol { void (*early_demux)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); unsigned int no_policy:1, netns_ok:1; };
这个结构体比较简单,但他是inet层和网络层(IP层)之间的连接,所以也很重要。
1. 第一个和第二个参数是查找基于包多路径选路,对于需要打了转包的设备,还是需要关闭这个功能,可以查看这里了解它2. handler表示对应协议包的收包处理函数,当收到一个包的时候,在IP层将会判断这个包的协议,然后根据协议类型调用该结构中的收包函数,进而将包传给传输层处理
3. netns_ok表示是否支持虚拟网络? namespace网络命名空间?
下面列以下该对象的一些实例:
static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, .no_policy = 1, .netns_ok = 1, };
static const struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, .netns_ok = 1, };
static const struct net_protocol tcp_protocol = { .early_demux = tcp_v4_early_demux, .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, .netns_ok = 1, };
static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, .netns_ok = 1, };
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) pr_crit("%s: Cannot add ICMP protocol\n", __func__); if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) pr_crit("%s: Cannot add UDP protocol\n", __func__); if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) pr_crit("%s: Cannot add TCP protocol\n", __func__); #ifdef CONFIG_IP_MULTICAST if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) pr_crit("%s: Cannot add IGMP protocol\n", __func__); #endif
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { if (!prot->netns_ok) { pr_err("Protocol %u is not namespace aware, cannot register.\n", protocol); return -EINVAL; } return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet_add_protocol);
这些实例在inet_init函数中,通过以下代码将不同的协议接收函数添加到inet_protos[protocol] 全局链表中,从而完成IP层和传输层的链接。
3.4 struct inet_protosw
/* This is used to register socket interfaces for IP protocols. */ struct inet_protosw { struct list_head list; /* These two fields form the lookup key. */ /*type和protocol两个域组成了socket的查找key,他们分别对应着应用 层socket函数的第二和第三个参数,通过这两个参数来确定使用的是哪种 socket类型。查找的时候主要是查找type,protocol只是用来防止初始化 好的协议被再次初始化,比如开机的时候已经初始化好了TCP协议,如果 应用层又调用了该协议的初始化函数,将直接退出。*/ unsigned short type; /* This is the 2nd argument to socket(2). */ unsigned short protocol; /* This is the L4 protocol number. */ /*表示的是该协议相关的处理函数,比如tcp_v4_connect和tcp_v4_init_sock等 相关的协议具体处理函数. 对应传输层到网络层协议*/ struct proto *prot; //不同套接字类型的套接字管理函数集,也就是socket函数的第二个参数指 //定的type,不同的套接字类型有不同的管理函数集 const struct proto_ops *ops; char no_check; /* checksum on rcv/xmit/none? */ unsigned char flags; /* See INET_PROTOSW_* below. */ };
前面一直疑惑inet_protosw中的sw表示什么意思,后来才知道,原来是switch的缩写,表示inet层的协议切换,inetsw串联着PF_INET地址族所支持的协议类型,比如tcp, udp等。
这个函数起着承上启下的作用,它上层对应的是BSD层,下层对应的是inet层,它在网络协议栈的最上两层扮演着重要的角色。
这个结构体每个注释已经比较清楚了,但是这里还是再讲一下:
1. type和protocol两个域组成了socket的查找key,他们分别对应着应用层socket函数的第二和第三个参数,通过这两个参数来确定使用的是哪种socket类型。查找的时候主要是查找type,protocol只是用来防止初始化好的协议被再次初始化,比如开机的时候已经初始化好了TCP协议,如果应用层又调用了该协议的初始化函数,将直接退出。
2. prot表示的是该协议相关的处理函数,比如tcp_v4_connect和tcp_v4_init_sock等相关的协议具体处理函数
3. ops表示的是该socket类型的套接字的管理函数,比如处理inet_bind和inet_accept等对inet层的套接字调用
prot和ops都是操作函数的集合,非常重要,我们会在后面进一步分析
static struct inet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_dgram_ops, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }, { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, } }; #define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
/* Register the socket-side information for inet_create. */ //初始化inet switch 链表 for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); //遍历inet switch数组 for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q);
void inet_register_protosw(struct inet_protosw *p) { struct list_head *lh; struct inet_protosw *answer; int protocol = p->protocol; struct list_head *last_perm; spin_lock_bh(&inetsw_lock); if (p->type >= SOCK_MAX) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ answer = NULL; last_perm = &inetsw[p->type]; list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); /* Check only the non-wild match. */ if (INET_PROTOSW_PERMANENT & answer->flags) { if (protocol == answer->protocol) break; last_perm = lh; } answer = NULL; } if (answer) goto out_permanent; /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with * a wild-card protocol. But it is allowed to override any existing * non-permanent entry. This means that when we remove this entry, the * system automatically returns to the old behavior. */ list_add_rcu(&p->list, last_perm); out: spin_unlock_bh(&inetsw_lock); return; out_permanent: pr_err("Attempt to override permanent protocol %d\n", protocol); goto out; out_illegal: pr_err("Ignoring attempt to register invalid socket type %d\n", p->type); goto out; } EXPORT_SYMBOL(inet_register_protosw);最红会将该协议注册到 inetsw[p->type]全局链表中。