Linux Kernel 2.6.9源码分析 – socket
首先来看socket API的原型:int socket(int protofamily, int type, int protocol)
参数说明:
int protofamily:即协议域,又称为协议族(family)。常用的协议族有,AF_INET(IPV4)、AF_INET6(IPV6)、AF_LOCAL(或称AF_UNIX,Unix域socket)
int type:指定socket类型。常用的socket类型有,SOCK_STREAM、SOCK_DGRAM、SOCK_RAW、SOCK_PACKET、SOCK_SEQPACKET等等
int protocol:顾名思义,就是指定协议。常用的协议有,IPPROTO_TCP、IPPROTO_UDP、IPPROTO_SCTP、IPPROTO_TIPC等,它们分别对应TCP传输协议、UDP传输协议、SCTP传输协议、TIPC传输协议。当protocol为0时,会自动选择type类型对应的默认协议。
返回值:文件描述符
sys_socket
socket API对应的系统调用是long sys_socket(int family, int type, int protocol),从代码来看主要有两个核心函数:sock_create和sock_map_fd,下面来重点介绍这两个函数的实现.
/*
 * sys_socket - system-call entry point behind the socket() API.
 *
 * Builds the socket object with sock_create() and then maps it to a file
 * descriptor with sock_map_fd(). Returns that descriptor on success, or the
 * negative value produced by whichever step failed.
 */
asmlinkage long sys_socket(int family, int type, int protocol)
{
int retval;
struct socket *sock;
/* Step 1: create the struct socket for this family/type/protocol. */
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
goto out;
/* Step 2: attach the socket to a descriptor; on success retval is the fd. */
retval = sock_map_fd(sock);
if (retval < 0)
goto out_release;
out:
/* It may be already another descriptor 8) Not kernel problem. */
return retval;
out_release:
/* Mapping failed: release the socket that step 1 created. */
sock_release(sock);
return retval;
}
sock_create
sock_create函数直接调用的是int __sock_create(int family, int type, int protocol, struct socket **res, int kern),下面来分析下这个函数的源码:
/*
 * __sock_create - abridged excerpt; the dotted lines mark code omitted by
 * the article (including the declarations and the out/out_release/
 * out_module_put labels).
 *
 * Visible flow: make sure the protocol family is registered, allocate a
 * struct socket with sock_alloc(), then let the family's ->create hook
 * finish the setup. The error code is accumulated in i.
 */
static int __sock_create(int family, int type, int protocol, struct socket **res, int kern)
{
............
#if defined(CONFIG_KMOD)
/* Family not registered yet: ask for the module named "net-pf-<family>". */
if (net_families[family]==NULL) {
request_module("net-pf-%d",family);
}
#endif
net_family_read_lock();
/* Still no handler for this family: report -EAFNOSUPPORT. */
if (net_families[family] == NULL) {
i = -EAFNOSUPPORT;
goto out;
}
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
if (!(sock = sock_alloc()))
{
printk(KERN_WARNING "socket: no more sockets\n");
i = -ENFILE; /* Not exactly a match, but its the
closest posix thing */
goto out;
}
sock->type = type;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
/* Preset the error returned if the module reference cannot be taken. */
i = -EAFNOSUPPORT;
if (!try_module_get(net_families[family]->owner))
goto out_release;
/* Family-specific setup, e.g. unix_create or inet_create. */
if ((i = net_families[family]->create(sock, protocol)) < 0)
goto out_module_put;
..................
}
从上面的代码看,主要是两个步骤:
1.调用sock_alloc获取一个struct socket结构
2.调用具体协议族的create函数:net_families[family]->create(sock, protocol)
下面分别看下两个函数:
sock_alloc
sock_alloc主要是从sock_mnt文件系统中分配一个inode,在此可能有一个疑问,socket和inode是如何转换的?即sock = SOCKET_I(inode)的实现是个什么原理?很多博客对此避而不谈,下面首先看inode是如何得来的,再来深入分析这个问题.
/*
 * sock_alloc - obtain a new struct socket backed by an inode of the
 * sock_mnt filesystem.
 *
 * Returns the struct socket derived from the new inode via SOCKET_I(), or
 * NULL when new_inode() fails.
 */
struct socket *sock_alloc(void) {
struct inode * inode;
struct socket * sock;
/* Allocate the inode on the socket filesystem's superblock. */
inode = new_inode(sock_mnt->mnt_sb);
if (!inode)
return NULL;
/* Convert the inode pointer into the struct socket that accompanies it. */
sock = SOCKET_I(inode);
/* Mark the inode as a socket with rwx permission bits for user/group/other. */
inode->i_mode = S_IFSOCK|S_IRWXUGO;
inode->i_sock = 1;
/* Owner and group come from the current task's fsuid/fsgid. */
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
/* Bump the sockets_in_use counter through the get/put_cpu_var pair. */
get_cpu_var(sockets_in_use)++;
put_cpu_var(sockets_in_use);
return sock;
}
上面函数中的new_inode函数最终会调用到fs/inode.c – > struct inode *alloc_inode(struct super_block *sb)
/*
 * alloc_inode - abridged excerpt from fs/inode.c; dotted lines mark code
 * omitted by the article.
 *
 * Allocates an inode for superblock sb, preferring the filesystem's own
 * alloc_inode hook, and fills in generic defaults.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
static struct address_space_operations empty_aops;
static struct inode_operations empty_iops;
static struct file_operations empty_fops;
struct inode *inode;
/* Use the filesystem-specific allocator when the superblock provides one... */
if (sb->s_op->alloc_inode)
inode = sb->s_op->alloc_inode(sb);
else
/* ...otherwise take a plain inode from inode_cachep. */
inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
if (inode) {
struct address_space * const mapping = &inode->i_data;
............
/* Generic defaults: owning superblock, empty operation tables, one link. */
inode->i_sb = sb;
inode->i_op = &empty_iops;
inode->i_fop = &empty_fops;
inode->i_nlink = 1;
...............
/* NOTE(review): this brace presumably closes a block hidden in the elided
 * lines above, not the "if (inode)" block - confirm against fs/inode.c. */
}
memset(&inode->u, 0, sizeof(inode->u));
inode->i_mapping = mapping;
}
return inode;
}
对于socket文件系统,sb->s_op->alloc_inode不为空,所以会执行inode = sb->s_op->alloc_inode(sb);那么socket文件系统的alloc_inode在哪里呢?
在Linux系统启动的时候,会调用net/socket.c --> void __init sock_init(void),会向VFS注册和mount sock_mnt文件系统.
/*
 * sock_init - abridged excerpt; the dotted line marks code omitted by the
 * article.
 *
 * Clears the protocol-family table, initializes the caches named below,
 * then registers the socket filesystem type and mounts it, keeping the
 * mount in sock_mnt.
 */
void __init sock_init(void)
{
int i;
for (i = 0; i < NPROTO; i++)
net_families[i] = NULL; //Initialize all address (protocol) families.
sk_init(); //Initialize sock SLAB cache.
#ifdef SLAB_SKB
skb_init(); //Initialize skbuff SLAB cache
#endif
/*
* Initialize the protocols module.
*/
init_inodecache();
/* Register the sockfs type and mount it; sock_alloc() and sock_map_fd() use sock_mnt. */
register_filesystem(&sock_fs_type);
sock_mnt = kern_mount(&sock_fs_type);
.............
}
sock_fs_type结构里面的sockfs_get_sb函数会注册sockfs_ops,而sockfs_ops的alloc_inode成员所指向的sock_alloc_inode函数,就是上文提到的sb->s_op->alloc_inode(sb)
/*
 * Filesystem type descriptor for the socket pseudo filesystem "sockfs".
 * sock_init() passes it to register_filesystem() and kern_mount().
 */
static struct file_system_type sock_fs_type = {
.name = "sockfs",
/* Superblock constructor; installs sockfs_ops via get_sb_pseudo(). */
.get_sb = sockfs_get_sb,
.kill_sb = kill_anon_super,
};
/*
 * sockfs_get_sb - get_sb hook of sock_fs_type.
 *
 * Delegates to get_sb_pseudo(), passing the "socket:" name, the sockfs_ops
 * superblock operations and SOCKFS_MAGIC. The flags, dev_name and data
 * arguments are not used here.
 */
static struct super_block *sockfs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
}
net/socket.c --> sock_alloc_inode,从代码中可以看出在分配inode空间的时候,struct socket结构体是和struct inode结构体同时分配的,只不过返回的是inode结构的地址.
/*
 * Container that places a struct socket and its struct inode in a single
 * allocation. Given a pointer to vfs_inode, SOCKET_I() recovers the socket
 * member through container_of().
 */
struct socket_alloc {
struct socket socket;
struct inode vfs_inode;
};
/*
 * sock_alloc_inode - allocate one struct socket_alloc from
 * sock_inode_cachep and reset its socket member.
 *
 * Returns the address of the embedded vfs_inode (not of the container), or
 * NULL when the allocation fails. The sb argument is not used here.
 */
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
ei = (struct socket_alloc *)kmem_cache_alloc(sock_inode_cachep, SLAB_KERNEL);
if (!ei)
return NULL;
/* Put the socket half into its initial, unconnected state. */
init_waitqueue_head(&ei->socket.wait);
ei->socket.fasync_list = NULL;
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
ei->socket.ops = NULL;
ei->socket.sk = NULL;
ei->socket.file = NULL;
ei->socket.passcred = 0;
/* Hand back only the inode half; the socket is recovered later via SOCKET_I(). */
return &ei->vfs_inode;
}
再来看下sock = SOCKET_I(inode)的实现:
通过inode的地址获取struct socket_alloc的地址,从而获得struct socket_alloc中的struct socket的地址
引用于该博客:https://blog.csdn.net/npy_lp/article/details/7010752
container_of,其实它的语法很简单,只是一些指针的灵活应用,它分两步:
第一步,首先定义一个临时指针变量__mptr,它指向的类型就是成员member的类型(通过typeof( ((type *)0)->member )获得),也就是与ptr类型相同,然后用它来保存ptr的值。
第二步,用(char *)__mptr减去member在结构体中的偏移量,得到的值就是整个结构体变量的首地址(整个宏的返回值就是这个首地址)。其中的语法难点就是如何得出成员相对结构体的偏移量?
/*
 * offsetof - byte offset of MEMBER from the start of TYPE.
 *
 * Address 0 is cast to a pointer to TYPE and the address of MEMBER is
 * taken; nothing is dereferenced, so the resulting address equals the
 * member's offset inside the structure. The cast must be to a pointer
 * type, (TYPE *)0, for the -> operator to be valid.
 */
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
其中代码难以理解的地方就是它灵活地运用了0地址。如果觉得&((struct socket_alloc *)0)->vfs_inode这样的代码不好理解,那么我们可以假设在0地址处分配了一个结构体变量struct socket_alloc a,然后定义结构体指针变量p并指向a(struct socket_alloc *p = &a),如此我们就可以通过&p->vfs_inode获得成员vfs_inode的地址。由于a的首地址为0x0,所以成员vfs_inode的地址在数值上就等于它在struct socket_alloc内部的偏移量。
/*
 * container_of - map a pointer to a member back to the enclosing structure.
 * @ptr:    pointer to the member
 * @type:   type of the enclosing structure
 * @member: name of the member inside @type
 *
 * __mptr holds ptr with the member's own type, which type-checks the
 * argument; subtracting the member's offset from it as a char pointer
 * gives the address of the enclosing structure, which is the value of the
 * macro.
 */
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
/*
 * SOCKET_I - return the struct socket that shares a struct socket_alloc
 * with the given inode.
 *
 * The inode must be the vfs_inode member of a struct socket_alloc;
 * container_of() recovers the container and the address of its socket
 * member is returned.
 */
static inline struct socket *SOCKET_I(struct inode *inode)
{
return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
好了,至此,sock_alloc重点工作就是调用socket文件系统的alloc_inode函数分配一个struct socket和一个struct inode结构并做初始化.然后再返回struct socket结构的指针.
net_families[family]->create(sock, protocol)
不同的协议族,会有不同的create函数,AF_UNIX协议族对应函数为unix_create,
net/unix/af_unix.c
/*
 * Protocol-family descriptor for PF_UNIX. Its create hook, unix_create, is
 * what __sock_create() reaches through net_families[family]->create.
 */
static struct net_proto_family unix_family_ops = {
.family = PF_UNIX,
.create = unix_create,
.owner = THIS_MODULE,
};
而AF_INET协议族对应的函数为:inet_create
/*
 * Protocol-family descriptor for PF_INET. Its create hook, inet_create, is
 * what __sock_create() reaches through net_families[family]->create.
 */
static struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
先来看下简单的unix_create函数,再来分析复杂的inet_create.
net/unix/af_unix.c
/*
 * unix_create - create hook of the PF_UNIX family.
 *
 * Chooses the proto_ops table that matches sock->type and then calls
 * unix_create1() for the struct sock side.
 *
 * Returns 0 on success, -EPROTONOSUPPORT for a protocol other than 0 or
 * PF_UNIX, -ESOCKTNOSUPPORT for an unhandled socket type, and -ENOMEM when
 * unix_create1() returns NULL.
 */
static int unix_create(struct socket *sock, int protocol)
{
/* Only protocol 0 or PF_UNIX is accepted. */
if (protocol && protocol != PF_UNIX)
return -EPROTONOSUPPORT;
sock->state = SS_UNCONNECTED;
/* Pick the operations table for the requested socket type. */
switch (sock->type) {
case SOCK_STREAM:
sock->ops = &unix_stream_ops;
break;
/*
* Believe it or not BSD has AF_UNIX, SOCK_RAW though
* nothing uses it.
*/
case SOCK_RAW:
sock->type=SOCK_DGRAM;
/* fall through: SOCK_RAW is handled exactly like SOCK_DGRAM */
case SOCK_DGRAM:
sock->ops = &unix_dgram_ops;
break;
case SOCK_SEQPACKET:
sock->ops = &unix_seqpacket_ops;
break;
default:
return -ESOCKTNOSUPPORT;
}
/* A NULL result from unix_create1() is reported as -ENOMEM. */
return unix_create1(sock) ? 0 : -ENOMEM;
}
首先根据struct socket的type来赋值sock->ops:如果是SOCK_STREAM(面向连接的流式套接字,语义上类似TCP)就是unix_stream_ops,如果是SOCK_DGRAM(数据报套接字,语义上类似UDP)就是unix_dgram_ops。这个sock->ops非常重要,socket编程后续API真正调用的接口都在这个结构中,后面再来详说这些操作.unix_create进一步调用unix_create1对sock对象做一些操作.继续看代码.
/*
 * Operations table that unix_create() installs in sock->ops for
 * SOCK_STREAM sockets of the PF_UNIX family. The entries are named after
 * the socket API operations; those set to sock_no_* use the generic
 * sock_no_* handlers rather than a unix_* implementation.
 */
static struct proto_ops unix_stream_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
.bind = unix_bind,
.connect = unix_stream_connect,
.socketpair = unix_socketpair,
.accept = unix_accept,
.getname = unix_getname,
.poll = unix_poll,
.ioctl = unix_ioctl,
.listen = unix_listen,
.shutdown = unix_shutdown,
.setsockopt = sock_no_setsockopt,
.getsockopt = sock_no_getsockopt,
.sendmsg = unix_stream_sendmsg,
.recvmsg = unix_stream_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
};
最终unix_create会通过unix_create1分配一个struct sock,unix_create1返回该对象的地址。此时应该比较疑惑:为什么已经分配了一个struct socket,在此还要再分配一个struct sock?本文末尾会再详细解释.
/*
 * unix_create1 - abridged excerpt; dotted lines mark code omitted by the
 * article (including the declaration of sk and any error handling).
 *
 * Allocates the struct sock for a PF_UNIX socket, ties it to the given
 * struct socket and installs the AF_UNIX callbacks. Returns the new sock.
 */
static struct sock * unix_create1(struct socket *sock)
{
............
/* The object is sized for struct unix_sock and comes from unix_sk_cachep. */
sk = sk_alloc(PF_UNIX, GFP_KERNEL, sizeof(struct unix_sock),unix_sk_cachep);
.................
/* Initialize sk and associate it with the struct socket. */
sock_init_data(sock,sk);
sk_set_owner(sk, THIS_MODULE);
.................
/* AF_UNIX-specific callbacks and backlog limit. */
sk->sk_write_space = unix_write_space;
sk->sk_max_ack_backlog = sysctl_unix_max_dgram_qlen;
sk->sk_destruct = unix_sock_destructor;
.................
return sk;
}
总结下unix_create步骤:1.初始化struct socket的成员ops;2.分配struct sock并初始化,最后将struct sock挂到struct socket上,即sock->sk = sk。
再来看下inet_create干了些啥 ?
1.根据socket类型在inetsw链表数组中找到对应的sock->ops。其中TCP对应inet_stream_ops,UDP对应inet_dgram_ops
2.同样是分配struct sock,但根据不同的sock类型分配的具体结构会有所不同.
3.将sock和socket挂上:sock->sk = sk;
/*
 * inet_create - abridged excerpt; dotted lines mark code omitted by the
 * article (declarations, the matching logic inside the loop, the out label
 * and the inet option initialization at the end).
 *
 * Visible flow: look up the inet_protosw entry for sock->type, copy its
 * ops/prot/no_check/flags, allocate a struct sock sized for that protocol
 * and start initializing it.
 */
static int inet_create(struct socket *sock, int protocol)
{
..............
/* Look for the requested type/protocol pair. */
answer = NULL;
rcu_read_lock();
/* Walk the list of protocol switch entries registered for this socket type. */
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
................
}
.................................
/* Take the operations table and protocol parameters from the selected entry. */
sock->ops = answer->ops;
answer_prot = answer->prot;
answer_no_check = answer->no_check;
answer_flags = answer->flags;
..................
/* Size and slab cache come from the selected protocol. */
sk = sk_alloc(PF_INET, GFP_KERNEL,
answer_prot->slab_obj_size,
answer_prot->slab);
if (sk == NULL)
goto out;
err = 0;
sk->sk_prot = answer_prot;
sk->sk_no_check = answer_no_check;
/* Address reuse is enabled when the entry carries INET_PROTOSW_REUSE. */
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = 1;
inet = inet_sk(sk);
........初始化inet options.......
}
至此sock_create已经分析完成,临时总结下:
1.调用sock_alloc:分配struct inode和struct socket,并做初始化
2.net_families[family]->create(sock, protocol):赋值struct socket.ops,再分配struct sock并初始化,最终将其赋值给struct socket.sk
3.返回struct socket指针.
接下来看下拿到struct socket对象之后,sys_socket调用sock_map_fd函数干了点啥 ?
sock_map_fd
/*
 * sock_map_fd - abridged excerpt; dotted lines mark code omitted by the
 * article (declarations, the setup of "this", the out label).
 *
 * Binds a struct socket to a new file descriptor: reserves an fd, gets a
 * struct file, creates a dentry on the socket filesystem that points at
 * the socket's inode, wires file and socket together and installs the
 * file under the fd. Returns the fd, or a negative value on failure.
 */
int sock_map_fd(struct socket *sock)
{
....................
/* Reserve a free descriptor number. */
fd = get_unused_fd();
if (fd >= 0) {
struct file *file = get_empty_filp();
..................
/* Create a dentry under the root of the socket filesystem. */
file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
if (!file->f_dentry) {
/* No dentry: give back the file and the reserved fd, report -ENOMEM. */
put_filp(file);
put_unused_fd(fd);
fd = -ENOMEM;
goto out;
}
file->f_dentry->d_op = &sockfs_dentry_operations;
/* Attach the socket's inode to the new dentry. */
d_add(file->f_dentry, SOCK_INODE(sock));
file->f_vfsmnt = mntget(sock_mnt);
file->f_mapping = file->f_dentry->d_inode->i_mapping;
/* Link socket -> file, and route file operations to socket_file_ops. */
sock->file = file;
file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops;
file->f_mode = FMODE_READ | FMODE_WRITE;
file->f_flags = O_RDWR;
file->f_pos = 0;
/* Install the file under the reserved descriptor. */
fd_install(fd, file);
}
return fd;
}
上述代码的主要步骤如下:
1.在当前进程描述符struct task_struct找到一个未使用的文件描述符.
2.从slab中分配一个struct file结构.
3.从sock_mnt文件系统中分配一个目录项 struct dentry,并将其赋值给struct file.f_dentry.
4.从socket对象地址找到inode结构地址(SOCK_INODE(sock)),并通过d_add将其挂到目录项上,即dentry->d_inode = inode
5.file->f_op = &socket_file_ops,sock->file = file;
6.将struct file插入到当前进程描述符struct task_struct.fd[fd] = file
7.返回文件描述符fd给到用户层.
此时应该明白了为什么说socket是一个文件了
还剩下最后一个疑问:为何有struct socket 和struct sock两个结构同时存在 ?
每个套接字都有一个struct socket和一个struct sock,后者是对前者的一种补充,而且两者相互挂接起来.二者是同一个东西的不同侧面.struct socket是面向用户,而struct sock是面向内核.