Linux Kernel 2.6.9源码分析 -- socket

Linux Kernel 2.6.9源码分析 – socket

首先来看socket API的原型:int socket(int protofamily, int type, int protocol)
参数说明:
int protofamily：即协议域，又称为协议族（family）。常用的协议族有，AF_INET(IPV4)、AF_INET6(IPV6)、AF_LOCAL（或称AF_UNIX，Unix域socket）
int type：指定socket类型。常用的socket类型有，SOCK_STREAM、SOCK_DGRAM、SOCK_RAW、SOCK_PACKET、SOCK_SEQPACKET等等
int protocol：顾名思义，就是指定协议。常用的协议有，IPPROTO_TCP、IPPROTO_UDP、IPPROTO_SCTP、IPPROTO_TIPC等，它们分别对应TCP传输协议、UDP传输协议、SCTP传输协议、TIPC传输协议。当protocol为0时，会自动选择type类型对应的默认协议。
返回值：文件描述符

sys_socket

socket API对应的系统调用是long sys_socket(int family, int type, int protocol),从代码来看主要有两个核心函数：sock_create和sock_map_fd，下面来重点介绍这两个函数的实现.

/*
 * socket(2) system call entry point.
 *
 * Creates a struct socket for the requested family/type/protocol with
 * sock_create() and then attaches it to a file descriptor with
 * sock_map_fd().  If no descriptor can be obtained the freshly created
 * socket is released again.
 *
 * Returns the new file descriptor, or a negative error code.
 */
asmlinkage long sys_socket(int family, int type, int protocol)
{
	struct socket *sock;
	int err;
	int fd;

	err = sock_create(family, type, protocol, &sock);
	if (err < 0)
		return err;

	fd = sock_map_fd(sock);
	if (fd < 0) {
		/* Mapping failed: drop the socket we just created. */
		sock_release(sock);
		return fd;
	}

	/* It may be already another descriptor 8) Not kernel problem. */
	return fd;
}

sock_create

sock_create函数直接调用的是int __sock_create(int family, int type, int protocol, struct socket **res, int kern)，下面来分析下这个函数的源码:

/*
 * Excerpt of __sock_create(); the "...." lines mark code omitted by the
 * article.  The visible part does the following:
 *   1. with CONFIG_KMOD, requests module "net-pf-<family>" when the family
 *      slot in net_families[] is empty;
 *   2. fails with -EAFNOSUPPORT if the family is still not registered;
 *   3. allocates a struct socket via sock_alloc() and records its type;
 *   4. pins the family's owning module and calls the family's ->create().
 * The error labels (out, out_release, out_module_put) are in the omitted
 * tail of the function.
 */
static int __sock_create(int family, int type, int protocol, struct socket **res, int kern)
{
    ............	
#if defined(CONFIG_KMOD)
	/* Family not registered yet: ask for its module to be loaded. */
	if (net_families[family]==NULL) {
		request_module("net-pf-%d",family);
	}
#endif
	net_family_read_lock();
	/* Re-check after the (optional) module request. */
	if (net_families[family] == NULL) {
		i = -EAFNOSUPPORT;
		goto out;
	}
/*
 *	Allocate the socket and allow the family to set things up. if
 *	the protocol is 0, the family is instructed to select an appropriate
 *	default.
 */

	if (!(sock = sock_alloc())) 
	{
		printk(KERN_WARNING "socket: no more sockets\n");
		i = -ENFILE;		/* Not exactly a match, but its the
					   closest posix thing */
		goto out;
	}
	sock->type  = type;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	i = -EAFNOSUPPORT;
	if (!try_module_get(net_families[family]->owner))
		goto out_release;

	/* Family-specific setup, e.g. unix_create() or inet_create(). */
	if ((i = net_families[family]->create(sock, protocol)) < 0)
		goto out_module_put;
	..................
}

从上面的代码看，主要是两个步骤：
1.调用sock_alloc获取一个struct socket结构
2.调用具体协议族的create函数：net_families[family]->create(sock, protocol)
下面分别看下两个函数:

sock_alloc

sock_alloc主要是从sock_mnt文件系统中分配一个inode，在此可能有一个疑问，socket和inode是如何转换的？即sock = SOCKET_I(inode)的实现是个什么原理？很多博客对此避而不谈，下面首先看inode是如何得来的，再来深入分析这个问题.

/*
 * Allocate a new socket.
 *
 * Obtains an inode from the superblock of the sock_mnt mount and returns
 * the struct socket associated with it through SOCKET_I().
 * Returns NULL when no inode could be allocated.
 */
struct socket *sock_alloc(void) {
	struct inode * inode;
	struct socket * sock;
	inode = new_inode(sock_mnt->mnt_sb);
	if (!inode)
		return NULL;
	/* Translate the inode pointer into its companion struct socket. */
	sock = SOCKET_I(inode);
	/* Mark the inode as a socket, accessible (rwx) to user/group/other. */
	inode->i_mode = S_IFSOCK|S_IRWXUGO;
	inode->i_sock = 1;
	/* Owner is taken from the calling task's fsuid/fsgid. */
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;
	/* Account for one more socket in the sockets_in_use per-CPU variable. */
	get_cpu_var(sockets_in_use)++;
	put_cpu_var(sockets_in_use);
	return sock;
}

上面函数中的new_inode函数最终会调用到fs/inode.c – > struct inode *alloc_inode(struct super_block *sb)

/*
 * Excerpt of alloc_inode() from fs/inode.c; the "...." lines mark code
 * omitted by the article (the lone closing brace below belongs to an
 * omitted block).
 *
 * If the superblock supplies its own ->alloc_inode() hook it is used,
 * otherwise a plain struct inode comes from inode_cachep.  The new inode
 * then receives default (empty) operation tables.
 * Returns the inode, or NULL if allocation failed.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
	static struct address_space_operations empty_aops;
	static struct inode_operations empty_iops;
	static struct file_operations empty_fops;
	struct inode *inode;

	/* Filesystem-specific allocator takes precedence over the generic cache. */
	if (sb->s_op->alloc_inode)
		inode = sb->s_op->alloc_inode(sb);
	else
		inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);

	if (inode) {
		struct address_space * const mapping = &inode->i_data;
          ............
		inode->i_sb = sb;
		inode->i_op = &empty_iops;
		inode->i_fop = &empty_fops;
		inode->i_nlink = 1;
	       ...............
		}
		memset(&inode->u, 0, sizeof(inode->u));
		/* By default the inode's mapping is its own embedded i_data. */
		inode->i_mapping = mapping;
	}
	return inode;
}

对于socket文件系统sb->s_op->alloc_inode 不为空，所以会执行inode = sb->s_op->alloc_inode(sb);那么socket文件系统的alloc_inode在哪里呢？
在Linux系统启动的时候，会调用net/socket.c --> void __init sock_init(void),会向VFS注册和mount sock_mnt文件系统.

/*
 * Excerpt of sock_init() from net/socket.c; the "...." line marks code
 * omitted by the article.
 *
 * Clears the net_families[] table, initialises the sock (and optionally
 * skbuff) caches, then sets up the socket inode cache, registers the
 * sockfs filesystem type and mounts it, storing the mount in sock_mnt.
 */
void __init sock_init(void)
{
	int i;
	for (i = 0; i < NPROTO; i++) 
		net_families[i] = NULL; //Initialize all address (protocol) families. 
	sk_init(); //Initialize sock SLAB cache.
#ifdef SLAB_SKB
	skb_init(); //Initialize skbuff SLAB cache 
#endif
	/*
	 *	Initialize the protocols module. 
	 */
	init_inodecache();
	/* Register sockfs and mount it; sock_alloc() later allocates inodes from sock_mnt. */
	register_filesystem(&sock_fs_type);
	sock_mnt = kern_mount(&sock_fs_type);
   .............
}

sock_fs_type结构里面的sockfs_get_sb 函数会注册sockfs_ops，其中sock_alloc_inode函数就是上文提到的 sb->s_op->alloc_inode(sb)

/*
 * Filesystem type descriptor for "sockfs".  Its superblock is produced by
 * sockfs_get_sb() and torn down with kill_anon_super().
 */
static struct file_system_type sock_fs_type = {
	.name =		"sockfs",
	.get_sb =	sockfs_get_sb,
	.kill_sb =	kill_anon_super,
};
/*
 * ->get_sb hook of sockfs.
 *
 * Builds the superblock through get_sb_pseudo(), passing the "socket:"
 * name, the sockfs_ops super operations and SOCKFS_MAGIC.  The flags,
 * dev_name and data arguments are not used.
 */
static struct super_block *sockfs_get_sb(struct file_system_type *fs_type,
					 int flags, const char *dev_name,
					 void *data)
{
	struct super_block *sb;

	sb = get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
	return sb;
}

net/socket.c --> sock_alloc_inode,从代码中可以看出在分配inode空间的时候，struct socket结构体是和struct inode结构体同时分配的，只不过返回的是inode结构的地址.

/*
 * A struct socket and its VFS inode laid out in one object, so that a
 * single allocation provides both and either one can be reached from the
 * other (see SOCKET_I()).
 */
struct socket_alloc {
	struct socket socket;		/* socket half of the pair */
	struct inode vfs_inode;		/* inode half, the part handed to the VFS */
};
static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;
	ei = (struct socket_alloc *)kmem_cache_alloc(sock_inode_cachep, SLAB_KERNEL);
	if (!ei)
		return NULL;
	init_waitqueue_head(&ei->socket.wait);
	ei->socket.fasync_list = NULL;
	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;
	ei->socket.passcred = 0;
	return &ei->vfs_inode;
}

再来看下sock = SOCKET_I(inode)的实现:
通过inode的地址获取struct socket_alloc的地址，从而获得struct socket_alloc中的struct socket的地址

引用于该博客:https://blog.csdn.net/npy_lp/article/details/7010752
container_of，其实它的语法很简单，只是一些指针的灵活应用，它分两步：
第一步，首先定义一个临时的数据类型（通过typeof( ((type *)0)->member )获得）与ptr相同的指针变量__mptr，然后用它来保存ptr的值。
第二步，用(char *)__mptr减去member在结构体中的偏移量，得到的值就是整个结构体变量的首地址（整个宏的返回值就是这个首地址）。其中的语法难点就是如何得出成员相对结构体的偏移量？
/* offsetof: byte offset of MEMBER inside TYPE, i.e. the address MEMBER would have if a TYPE object sat at address 0. */
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
其中代码难以理解的地方就是它灵活地运用了0地址。如果觉得&((struct socket_alloc *)0)->vfs_inode这样的代码不好理解，那么我们可以假设在0地址分配了一个结构体变量struct socket_alloc a，然后定义结构体指针变量p并指向a（struct socket_alloc *p = &a），如此我们就可以通过&p->vfs_inode获得成员vfs_inode的地址。由于a的首地址为0x0，所以成员vfs_inode的地址就是它在socket_alloc内部的偏移量。

/*
 * container_of - get from a pointer to a member to the enclosing structure.
 * @ptr:    pointer to the member
 * @type:   type of the enclosing structure
 * @member: name of the member within @type
 *
 * __mptr is a copy of @ptr typed as a pointer to the member (the typeof
 * makes the assignment type-checked); the result is __mptr moved back by
 * offsetof(type, member) bytes and cast to (type *).
 */
#define container_of(ptr, type, member) ({			\
        const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
        (type *)( (char *)__mptr - offsetof(type,member) );})
        
static inline struct socket *SOCKET_I(struct inode *inode)
{
	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

好了，至此，sock_alloc重点工作就是调用socket文件系统的alloc_inode函数分配一个struct socket和一个struct inode结构并做初始化.然后再返回struct socket结构的指针.

net_families[family]->create(sock, protocol)

不同的协议族，会有不同的create函数，AF_UNIX协议族对应函数为unix_create，
net/unix/af_unix.c

/*
 * Protocol-family descriptor for PF_UNIX: sockets of this family are set
 * up by unix_create().
 */
static struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};

而AF_INET协议族对应的函数为:inet_create

/*
 * Protocol-family descriptor for PF_INET: sockets of this family are set
 * up by inet_create().
 */
static struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner	= THIS_MODULE,
};

先来看下简单的unix_create函数，再来分析复杂的inet_create.
net/unix/af_unix.c

static int unix_create(struct socket *sock, int protocol)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;
	sock->state = SS_UNCONNECTED;
	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type=SOCK_DGRAM;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}
	return unix_create1(sock) ? 0 : -ENOMEM;
}

首先根据struct socket的type来赋值sock->ops,如果是SOCK_STREAM（流式，类似TCP）就是unix_stream_ops，SOCK_DGRAM（数据报，类似UDP）就是unix_dgram_ops,这个sock->ops会非常重要，socket编程后续API真正调用的接口都在这个结构中,后面再来详说这些操作.unix_create进一步调用unix_create1对sock对象做一些操作.继续看代码.

/*
 * Operation table installed in sock->ops for PF_UNIX SOCK_STREAM sockets
 * (see unix_create()).  setsockopt, getsockopt, mmap and sendpage are wired
 * to the generic sock_no_* handlers (presumably "not supported" stubs --
 * their definitions are not shown here).
 */
static struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

最终unix_create1分配一个struct sock，并返回该对象的地址.此时应该比较疑惑为什么已经分配了一个struct socket，在此还要再分配一个struct sock,本文末尾会再详细解释.

/*
 * Excerpt of unix_create1(); the "...." lines mark code omitted by the
 * article.
 *
 * Allocates a struct sock sized for struct unix_sock from unix_sk_cachep,
 * initialises it together with the given socket and installs the
 * PF_UNIX-specific callbacks.  Returns the new sock; unix_create() treats
 * a NULL result as -ENOMEM.
 */
static struct sock * unix_create1(struct socket *sock)
{
    ............
	sk = sk_alloc(PF_UNIX, GFP_KERNEL, sizeof(struct unix_sock),unix_sk_cachep);
    .................
	/* Generic initialisation; presumably this is where sock and sk get linked -- confirm in sock_init_data(). */
	sock_init_data(sock,sk);
	sk_set_owner(sk, THIS_MODULE);
    .................
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= sysctl_unix_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	.................
	return sk;
}

总结下unix_create步骤:1.初始化struct socket 的成员ops,2.分配struct sock并初始化，最后将struct sock挂到struct socket上，即sock->sk = sk。

再来看下inet_create干了些啥？
1.根据socket类型在inetsw链表数组中找到对应sock->ops。其中TCP-> inet_stream_ops,UDP：inet_dgram_ops
2.同样是分配struct sock，但根据不同的sock类型分配的具体结构会有所不同.
3.将sock和socket挂上:sock->sk = sk;

/*
 * Excerpt of inet_create(); the "...." lines mark code omitted by the
 * article (including the matching logic inside the loop and the "out"
 * label).
 *
 * Looks up an inet_protosw entry in inetsw[sock->type], takes the socket
 * ops and protocol parameters from it, then allocates a struct sock from
 * the protocol's slab and applies the protocol defaults.
 */
static int inet_create(struct socket *sock, int protocol)
{
    ..............
	/* Look for the requested type/protocol pair. */
	answer = NULL;
	rcu_read_lock();
	list_for_each_rcu(p, &inetsw[sock->type]) {
		answer = list_entry(p, struct inet_protosw, list);
        ................
	}
    .................................
	/* Copy what is needed out of the matched entry. */
	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
    ..................
	/* Object size and slab cache come from the selected protocol. */
	sk = sk_alloc(PF_INET, GFP_KERNEL,
		      answer_prot->slab_obj_size,
		      answer_prot->slab);
	if (sk == NULL)
		goto out;
	err = 0;
	sk->sk_prot = answer_prot;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1;
	inet = inet_sk(sk);
    ........初始化inet options.......

}

至此sock_create已经分析完成，临时总结下:
1.调用sock_alloc:分配struct inode和struct socket，并做初始化
2.net_families[family]->create(sock, protocol)：赋值struct socket.ops，再来分配struct sock并初始化，最终将其赋值给struct socket.sk
3.返回struct socket指针.
接下来看下拿到struct socket对象之后，sys_socket调用sock_map_fd函数干了点啥 ?

sock_map_fd

/*
 * Excerpt of sock_map_fd(); the "...." lines mark code omitted by the
 * article.
 *
 * Binds a socket to a new file descriptor of the current process: reserves
 * an unused fd, obtains an empty struct file, allocates a dentry under the
 * root of the sock_mnt superblock and attaches the socket's inode to it,
 * fills in the file fields and finally installs the file under the fd.
 *
 * Returns the descriptor on success or a negative error code; if
 * get_unused_fd() fails its negative result is returned unchanged.
 */
int sock_map_fd(struct socket *sock)
{
    ....................
	fd = get_unused_fd();
	if (fd >= 0) {
		struct file *file = get_empty_filp();
        ..................
		file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
		if (!file->f_dentry) {
			/* Undo both reservations made so far. */
			put_filp(file);
			put_unused_fd(fd);
			fd = -ENOMEM;
			goto out;
		}
		file->f_dentry->d_op = &sockfs_dentry_operations;
		/* Tie the dentry to the inode that belongs to this socket. */
		d_add(file->f_dentry, SOCK_INODE(sock));
		file->f_vfsmnt = mntget(sock_mnt);
		file->f_mapping = file->f_dentry->d_inode->i_mapping;

		sock->file = file;
		/* File operations on this fd are served by socket_file_ops. */
		file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops;
		file->f_mode = FMODE_READ | FMODE_WRITE;
		file->f_flags = O_RDWR;
		file->f_pos = 0;
		fd_install(fd, file);
	}

out:
	return fd;
}

上述代码的主要步骤如下:
1.在当前进程描述符struct task_struct找到一个未使用的文件描述符.
2.从slab中分配一个struct file结构.
3.从sock_mnt文件系统中分配一个目录项 struct dentry，并将其赋值给struct file.f_dentry.
4.从socket对象地址找到inode结构地址，并将inode赋值给dentry->d_inode，即dentry->d_inode = inode
5.file->f_op = &socket_file_ops,sock->file = file;
6.将struct file插入到当前进程描述符struct task_struct.fd[fd] = file
7.返回文件描述符fd给到用户层.
此时应该明白了为什么说socket是一个文件了

还剩下最后一个疑问:为何有struct socket 和struct sock两个结构同时存在？
每个套接字都有一个struct socket和一个struct sock,后者是对前者的一种补充，而且两者相互挂接起来.二者是同一个东西的不同侧面.struct socket是面向用户，而struct sock是面向内核.

年轻态程序猿

发布了20 篇原创文章 · 获赞 0 · 访问量 566

私信关注