文件系统的分类:
- 磁盘文件系统
- 内存虚拟文件系统
- 网络文件系统
通用的模型VFS对上(用户)解决了各种不同文件系统的统一接口问题,同时对下(具体的文件系统)兼容各种新的文件系统,让Linux变得更加强大灵活
VFS的架构图
实现思路:
- 定义一个最小的通用模型
- 定义一个强大的模型,各文件系统的实现者可以选择性地实现其中的接口(ext2->VFS)
下面是通过inode查找一个文件的过程(实际的实现过程中会有缓存来加速文件查找)
链接
- 软链接:有一个目录项,指向独立INODE,INODE不存文件数据,存指向真实文件的路径,可以在不同分区
- 硬链接:有一个目录项,没有独立INODE,指向原始INODE,并且原始INODE的硬链接计数(i_nlink)加1,不能跨分区
编程接口
- 文件描述符(系统提供)
- 流操作(C库)
万物皆文件
- 字符和块设备文件
- 管道文件
- 用于所有网络协议的套接字(网络设备例外)
- 终端
VFS的主要数据结构
INODE操作:用来操作文件元数据和文件管理,创建文件,删除文件,重命名文件,创建链接
文件操作:用来操作数据的,如读写文件,操作文件位置,创建内存映射
/*
 * In-memory description of a file's metadata: mode, ownership, size,
 * timestamps and link count, plus pointers to its operation tables, its
 * superblock and its address-space mapping.
 */
struct inode {
umode_t i_mode; // access permissions
unsigned short i_opflags;
kuid_t i_uid; // UID of the file's owner (original note: the creating user)
kgid_t i_gid; // GID of the file's owner (original note: the creating user's group)
unsigned int i_flags;
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
// Inode operation table (a set of function pointers, comparable to C++ object methods).
// NOTE(review): the original note called these "file content" operations, but
// struct inode_operations holds create/link/unlink/mkdir-style callbacks, i.e.
// metadata and file management.
const struct inode_operations *i_op;
struct super_block *i_sb; // superblock of the partition (file system) holding this file
struct address_space *i_mapping; // file memory mapping
#ifdef CONFIG_SECURITY
void *i_security;
#endif
/* Stat data, not accessed from path walking */
unsigned long i_ino; // inode number
/*
* Filesystems may only read i_nlink directly. They shall use the
* following functions for modification:
*
* (set|clear|inc|drop)_nlink
* inode_(inc|dec)_link_count
*/
union {
const unsigned int i_nlink; // number of hard links to the file
unsigned int __i_nlink;
};
// Device identifier stored as a dev_t (original note: "the real storage device").
// NOTE(review): presumably only meaningful for device special files - confirm.
dev_t i_rdev;
loff_t i_size; // file size in bytes
struct timespec i_atime; // access time
struct timespec i_mtime; // modification time
// NOTE(review): the original note says "creation time"; POSIX defines ctime as the
// time of last status change - confirm.
struct timespec i_ctime;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
unsigned int i_blkbits;
blkcnt_t i_blocks; // file size in blocks (per the original note, bytes per block are fixed when the file system is formatted)
#ifdef __NEED_I_SIZE_ORDERED
seqcount_t i_size_seqcount;
#endif
/* Misc */
unsigned long i_state;
struct mutex i_mutex;
unsigned long dirtied_when; /* jiffies of first dirtying */
unsigned long dirtied_time_when;
struct hlist_node i_hash; // link element in the global inode_hashtable
struct list_head i_wb_list; /* backing dev IO list */
struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
union {
// List head of dentries attached to this inode.
// NOTE(review): the original note says "directory entries contained in this inode
// (if it is a directory)", but dentry.d_alias is commented "inode alias list",
// which suggests this lists the dentries naming this inode - confirm.
struct hlist_head i_dentry;
struct rcu_head i_rcu;
};
u64 i_version;
atomic_t i_count; // reference count (original note: number of processes referencing it concurrently)
atomic_t i_dio_count;
atomic_t i_writecount;
#ifdef CONFIG_IMA
atomic_t i_readcount; /* struct files open RO */
#endif
// File operation table (function pointers, comparable to C++ object methods).
// NOTE(review): the original note called these "file management" operations, but
// struct file_operations holds read/write/llseek/mmap-style callbacks, i.e.
// operations on file data.
const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct file_lock_context *i_flctx;
struct address_space i_data;
struct list_head i_devices; // used when one device corresponds to several inodes at once (e.g. with chroot)
union {
struct pipe_inode_info *i_pipe; // pipe-specific data structure
struct block_device *i_bdev; // block-device-specific data structure
struct cdev *i_cdev; // character-device-specific data structure
};
__u32 i_generation;
#ifdef CONFIG_FSNOTIFY
__u32 i_fsnotify_mask; /* all events this inode cares about */
struct hlist_head i_fsnotify_marks;
#endif
void *i_private; /* fs or device private pointer */ // points to a custom data structure (used very often)
};
/*
 * Directory entry: associates a name (d_name / d_iname) with an inode
 * (d_inode) and links the entry to its parent, siblings and children.
 */
struct dentry {
/* RCU lookup touched fields */
unsigned int d_flags; /* protected by d_lock */ // flags, e.g. whether the entry is a directory or a file
seqcount_t d_seq; /* per dentry seqlock */
struct hlist_bl_node d_hash; /* lookup hash list */ // chains colliding entries in the global dentry_hashtable
struct dentry *d_parent; /* parent directory */
struct qstr d_name; // file name (original note: the long name)
struct inode *d_inode; /* Where the name belongs to (the file's inode) - NULL is
* negative */
unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ // inline storage for short file names
/* Ref lookup also touches following */
struct lockref d_lockref; /* per-dentry lock and refcount */
const struct dentry_operations *d_op; // dentry operation table
struct super_block *d_sb; /* The root of the dentry tree */ // superblock instance of the owning file system
unsigned long d_time; /* used by d_revalidate */
void *d_fsdata; /* fs-specific data */
struct list_head d_lru; /* LRU list */ // least-recently-used list
struct list_head d_child; /* child of parent list */ // links this entry among its siblings
struct list_head d_subdirs; /* our children */ // child entries
/*
* d_alias and d_rcu can share memory
*/
union {
struct hlist_node d_alias; /* inode alias list */
struct rcu_head d_rcu;
} d_u;
};
/*
 * Table of callbacks a file system can supply to customise dentry handling
 * (validation, hashing, name comparison, release).
 */
struct dentry_operations {
int (*d_revalidate)(struct dentry *, unsigned int); // very useful for network file systems: checks whether the remote dentry is still valid
int (*d_weak_revalidate)(struct dentry *, unsigned int);
int (*d_hash)(const struct dentry *, struct qstr *); // compute the hash
int (*d_compare)(const struct dentry *, const struct dentry *, // compare file names
unsigned int, const char *, const struct qstr *);
int (*d_delete)(const struct dentry *); // called when the reference count reaches 0
// NOTE(review): the original note says "decrements the reference count"; the name
// suggests it runs when the dentry is being released - confirm.
void (*d_release)(struct dentry *);
void (*d_prune)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(struct dentry *, bool);
} ____cacheline_aligned;
/*
 * Per-mounted-file-system state: backing device, block size, limits,
 * operation tables, root dentry and the lists of objects it manages.
 */
struct super_block {
// NOTE(review): the original note says this lists all open files (struct file);
// the field name and "Keep this first" hint suggest it instead links superblocks
// into a list - confirm.
struct list_head s_list; /* Keep this first */
dev_t s_dev; /* search index; _not_ kdev_t */ // the corresponding real device
unsigned char s_blocksize_bits; // block size as a base-2 logarithm (original note: "maximum block size, log")
unsigned long s_blocksize; // block size (original note: "maximum block size")
loff_t s_maxbytes; /* Max file size */
struct file_system_type *s_type; // file system type object
const struct super_operations *s_op; // operation table for the super_block object
const struct dquot_operations *dq_op;
const struct quotactl_ops *s_qcop;
const struct export_operations *s_export_op;
unsigned long s_flags;
unsigned long s_magic; // file system magic number
struct dentry *s_root; // root directory (per the original note: if NULL, this is a pseudo file system visible only in memory)
struct rw_semaphore s_umount;
int s_count;
atomic_t s_active;
#ifdef CONFIG_SECURITY
void *s_security;
#endif
const struct xattr_handler **s_xattr;
struct list_head s_inodes; /* all inodes */ // every inode managed by this superblock
struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
struct block_device *s_bdev; // block device structure
struct backing_dev_info *s_bdi;
struct mtd_info *s_mtd;
struct hlist_node s_instances; // link element in the list headed by file_system_type.fs_supers, chaining all instances of the same file system
unsigned int s_quota_types; /* Bitmask of supported quota types */
struct quota_info s_dquot; /* Diskquota specific options */
struct sb_writers s_writers;
char s_id[32]; /* Informational name */
u8 s_uuid[16]; /* UUID */
void *s_fs_info; /* Filesystem private info */
unsigned int s_max_links;
fmode_t s_mode;
/* Granularity of c/m/atime in ns.
Cannot be worse than a second */
u32 s_time_gran;
/*
* The next field is for VFS *only*. No filesystems have any business
* even looking at it. You had been warned.
*/
struct mutex s_vfs_rename_mutex; /* Kludge */
/*
* Filesystem subtype. If non-empty the filesystem type field
* in /proc/mounts will be "type.subtype"
*/
char *s_subtype;
/*
* Saved mount options for lazy filesystems using
* generic_show_options()
*/
char __rcu *s_options;
const struct dentry_operations *s_d_op; /* default d_op for dentries */
/*
* Saved pool identifier for cleancache (-1 means none)
*/
int cleancache_poolid;
struct shrinker s_shrink; /* per-sb shrinker handle */
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
/* Being remounted read-only */
int s_readonly_remount;
/* AIO completions deferred from interrupt context */
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;
/*
* Keep the lru lists last in the structure so they always sit on their
* own individual cachelines.
*/
struct list_lru s_dentry_lru ____cacheline_aligned_in_smp;
struct list_lru s_inode_lru ____cacheline_aligned_in_smp;
struct rcu_head rcu;
/*
* Indicates how deep in a filesystem stack this SB is
*/
int s_stack_depth;
};
/* Pairs a mount with a dentry to identify a location. */
struct path {
struct vfsmount *mnt; // mount point
struct dentry *dentry; // association between the file name and its inode
};
/*
 * An open file: its path, cached inode, operation table, open flags,
 * access mode and current position.
 */
struct file {
union {
// NOTE(review): the original note says this links the file into super_block.s_list,
// which holds all open files; the member is an llist_node, so verify which list
// it actually joins.
struct llist_node fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path; // mount point plus the name-to-inode association
struct inode *f_inode; /* cached value */
const struct file_operations *f_op; // file operation table
/*
* Protects f_ep_links, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
atomic_long_t f_count;
unsigned int f_flags; // flags passed to the open system call
fmode_t f_mode; // file access mode
struct mutex f_pos_lock;
loff_t f_pos; // current position for file operations
struct fown_struct f_owner; // used to notify the process operating on the file with SIGIO; supports asynchronous I/O
const struct cred *f_cred;
struct file_ra_state f_ra;
u64 f_version;
#ifdef CONFIG_SECURITY
void *f_security;
#endif
/* needed for tty driver, and maybe others */
void *private_data;
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct list_head f_ep_links;
struct list_head f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping; // address-space mapping of the inode instance
} __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
对如下的操作方法的理解需要很熟悉文件相关的系统调用
/*
 * Table of callbacks that operate on an open file's data: reading, writing,
 * seeking, polling, ioctl, memory mapping, syncing and locking. Most entries
 * back the file-related system call of the same name.
 */
struct file_operations {
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); // read file contents
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); // write file contents
ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); // asynchronous read
ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); // asynchronous write
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); // vectored read (scatter-gather over multiple buffers)
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
unsigned int (*poll) (struct file *, struct poll_table_struct *); // supports reads with a timeout (backs the poll and select system calls)
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long); // ioctl operation, needed only for device files
int (*mmap) (struct file *, struct vm_area_struct *); // file memory mapping (backs the mmap system call)
int (*mremap)(struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *); // open a file, i.e. associate the file with its inode
int (*flush) (struct file *, fl_owner_t id); // called when the file is closed (per the original note: reference count decremented by 1)
int (*release) (struct inode *, struct file *); // free resources
int (*fsync) (struct file *, loff_t, loff_t, int datasync); // backs the fsync and fdatasync system calls: sync data to the device
int (*aio_fsync) (struct kiocb *, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *); // file locking support
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); // supports the splice call (file <=> pipe)
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
};
/*
 * Table of callbacks that operate on inodes rather than file data: name
 * lookup, creating and removing files, directories, links and device nodes,
 * renaming, and reading/writing attributes, extended attributes and ACLs.
 */
struct inode_operations {
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
void * (*follow_link) (struct dentry *, struct nameidata *);
int (*permission) (struct inode *, int);
struct posix_acl * (*get_acl)(struct inode *, int);
int (*readlink) (struct dentry *, char __user *,int);
void (*put_link) (struct dentry *, struct nameidata *, void *);
int (*create) (struct inode *,struct dentry *, umode_t, bool);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
int (*mkdir) (struct inode *,struct dentry *,umode_t);
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
/* Same as rename, with an additional unsigned int argument. */
int (*rename2) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*setattr) (struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*removexattr) (struct dentry *, const char *);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
u64 len);
int (*update_time)(struct inode *, struct timespec *, int);
int (*atomic_open)(struct inode *, struct dentry *,
struct file *, unsigned open_flag,
umode_t create_mode, int *opened);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
int (*set_acl)(struct inode *, struct posix_acl *, int);
/* WARNING: probably going away soon, do not use! */
int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
} ____cacheline_aligned;
// File system information kept in task_struct (per process).
struct fs_struct {
int users;
spinlock_t lock;
seqcount_t seq;
int umask; // default permission mask for files newly created by the process
int in_exec;
struct path root, pwd; // root and current working directory, each stored with its mount point
};
/* A mount namespace: the set of mounts visible to the processes using it. */
struct mnt_namespace {
atomic_t count; // number of processes using this namespace
struct ns_common ns;
struct mount * root; // mount of the root directory
struct list_head list; // all mounts in the same namespace are chained together through this link element
struct user_namespace *user_ns;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
u64 event;
};
/*
 * Describes one kind of file system: its name, flags, mount and kill_sb
 * callbacks, and the list of superblock instances of this type.
 */
struct file_system_type {
const char *name; // name of the file system
int fs_flags;
#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int, // mount operation
const char *, void *);
void (*kill_sb) (struct super_block *);
struct module *owner; // owning kernel module
struct file_system_type * next; // link used to build the system-wide list of file systems
struct hlist_head fs_supers; // chains together the super_block instances of this file system
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
struct lock_class_key s_vfs_rename_key;
struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key;
};
注册一个新文件系统
/* Registers a new file system, described by the given file_system_type; returns an int status. */
int register_filesystem(struct file_system_type *);
/* One mounted tree: its root dentry, its superblock and its mount flags. */
struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */ // root dentry of this mount
struct super_block *mnt_sb; /* pointer to superblock */
int mnt_flags;
};
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb); //分配新的inode
void (*destroy_inode)(struct inode *);
void (*dirty_inode) (struct inode *, int flags); //标记为脏节点
int (*write_inode) (struct inode *, struct writeback_control *wbc); //
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait); 将文件系统数据与设备同步
int (*freeze_super) (struct super_block *);
int (*freeze_fs) (struct super_block *);
int (*thaw_super) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *); //支持statfs系统调用
int (*remount_fs) (struct super_block *, int *, char *); //重新装载文件系统
void (*umount_begin) (struct super_block *); //用于网络文件系统
int (*show_options)(struct seq_file *, struct dentry *); //用于proc文件系统
int (*show_devname)(struct seq_file *, struct dentry *);
int (*show_path)(struct seq_file *, struct dentry *);
int (*show_stats)(struct seq_file *, struct dentry *); //用于proc文件系统
#ifdef CONFIG_QUOTA
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
struct dquot **(*get_dquots)(struct inode *);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
long (*nr_cached_objects)(struct super_block *, struct shrink_control *);
long (*free_cached_objects)(struct super_block *, struct shrink_control *);
};
/fs/namespace.c
/* Signature of do_mount (located in fs/namespace.c per the surrounding notes), which implements the file system mount operation. */
long do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page)
实现了文件系统mount操作
伪文件系统,是不可以被装载的,专门用于内核来管理一些特殊文件的inode