Linux mount 流程详解

本文代码基于Linux 5.10。

Linux mount 主要通过mount 命令或者mount api来实现，本文主要介绍mount 调用在内核中的实现。

数据结构

fs_context

fs_context 是mount 流程中的重要数据结构，其定义如下

include/linux/fs_context.h
/*
 * Filesystem context: carries the parameters used while creating or
 * reconfiguring a superblock.  Superblock creation fills in ->root;
 * reconfiguration starts with ->root already set.
 */
struct fs_context {
    const struct fs_context_operations *ops;    /* Per-filesystem callbacks (get_tree, parse_param, ...) */
    struct mutex        uapi_mutex; /* Userspace access mutex */
    struct file_system_type *fs_type;   /* The filesystem type this context belongs to */
    void            *fs_private;    /* The filesystem's context */
    void            *sget_key;  /* Lookup key for sget_fc(), e.g. the block device in get_tree_bdev() */
    struct dentry       *root;      /* The root and superblock */
    struct user_namespace   *user_ns;   /* The user namespace for this mount */
    struct net      *net_ns;    /* The network namespace for this mount */
    const struct cred   *cred;      /* The mounter's credentials */
    struct p_log        log;        /* Logging buffer */
    const char      *source;    /* The source name (eg. dev path) */
    void            *security;  /* Linux S&M options (presumably the LSM's per-mount data) */
    void            *s_fs_info; /* Proposed s_fs_info */
    unsigned int        sb_flags;   /* Proposed superblock flags (SB_*) */
    unsigned int        sb_flags_mask;  /* Superblock flags that were changed */
    unsigned int        s_iflags;   /* OR'd with sb->s_iflags */
    unsigned int        lsm_flags;  /* Information flags from the fs to the LSM */
    enum fs_context_purpose purpose:8;  /* What the context is being used for (enum values not shown here) */
    enum fs_context_phase   phase:8;    /* The phase the context is in */
    bool            need_free:1;    /* Need to call ops->free() */
    bool            global:1;   /* Goes into &init_user_ns */
    bool            oldapi:1;   /* Coming from mount(2) */
};

Linux 对于这个结构体的注释是：

/*
* Filesystem context for holding the parameters used in the creation or
* reconfiguration of a superblock.
*
* Superblock creation fills in ->root whereas reconfiguration begins with this
* already set.
*
* See Documentation/filesystems/mount_api.rst
*/

我的理解这个结构是从 file_system_type 到 super_block 之间的桥梁，控制了mount 流程。

fs_type：指向该上下文对应的 file_system_type 结构体（即要挂载的文件系统类型）

ops：这个比较重要，指向了fs_context_operations, 一般会在文件系统的init_fs_context回调中对其进行赋值

/*
 * Callbacks a filesystem attaches to an fs_context through fc->ops.
 * All of them receive the context they operate on as the first argument.
 */
struct fs_context_operations {
    /* Release filesystem-owned context state; see fs_context.need_free. */
    void (*free)(struct fs_context *fc);
    /* Presumably copies filesystem-private state from src_fc into fc - confirm. */
    int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
    /* Handle a single mount parameter described by param. */
    int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
    /* Handle the mount options passed as one opaque data blob. */
    int (*parse_monolithic)(struct fs_context *fc, void *data);
    /* Obtain the superblock and root for the mount; invoked from vfs_get_tree(). */
    int (*get_tree)(struct fs_context *fc);
    /* Presumably applies changed parameters to an existing superblock - confirm. */
    int (*reconfigure)(struct fs_context *fc);
};

处理流程

mount 的整体调用栈如下, 下面我们一个一个分析：

#0  exfat_fill_super (sb=0xffff888004865000, fc=0xffff888003053d80) at fs/exfat/super.c:599
#1  0xffffffff8120a2e9 in get_tree_bdev (fc=0xffff888003053d80, fill_super=0xffffffff813232e5 <exfat_fill_super>) at fs/super.c:1344
#2  0xffffffff813236eb in exfat_get_tree (fc=0xffff888003053d80) at fs/exfat/super.c:696
#3  0xffffffff8120915c in vfs_get_tree (fc=fc@entry=0xffff888003053d80) at fs/super.c:1549
#4  0xffffffff8122a997 in do_new_mount (data=0x0 <fixed_percpu_data>, name=0xffff8880032794a0 "/dev/loop0", mnt_flags=32, sb_flags=<optimized out>, fstype=0x20 <fixed_percpu_data+32> <error: Cannot access memory at address 0x20>, path=0xffffc90000183ec8) at fs/namespace.c:2875
#5  path_mount (dev_name=dev_name@entry=0xffff8880032794a0 "/dev/loop0", path=path@entry=0xffffc90000183ec8, type_page=type_page@entry=0xffff8880032d1c78 "exfat", flags=<optimized out>, flags@entry=32768, data_page=data_page@entry=0x0 <fixed_percpu_data>) at fs/namespace.c:3205
#6  0xffffffff8122ae10 in do_mount (dev_name=dev_name@entry=0xffff8880032794a0 "/dev/loop0", dir_name=dir_name@entry=0x7ffd4ff80f31 "/mnt", type_page=type_page@entry=0xffff8880032d1c78 "exfat", flags=flags@entry=32768, data_page=data_page@entry=0x0 <fixed_percpu_data>) at fs/namespace.c:3218
#7  0xffffffff8122b246 in __do_sys_mount (data=<optimized out>, flags=32768, type=<optimized out>, dir_name=0x7ffd4ff80f31 "/mnt", dev_name=<optimized out>) at fs/namespace.c:3426
#8  __se_sys_mount (data=<optimized out>, flags=32768, type=<optimized out>, dir_name=140725945110321, dev_name=<optimized out>) at fs/namespace.c:3403
#9  __x64_sys_mount (regs=<optimized out>) at fs/namespace.c:3403
#10 0xffffffff819bf903 in do_syscall_64 (nr=<optimized out>, regs=0xffffc90000183f58) at arch/x86/entry/common.c:46
#11 0xffffffff81a0007c in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:120
#12 0x0000000000000000 in ?? ()

入口函数

下面是 Linux mount 系统调用的定义。用户态的 mount(2) 调用都会进入这里：它先把文件系统类型、设备名和挂载选项从用户态拷贝到内核，然后调用 do_mount 完成后续的工作。

fs/namespace.c
/*
 * mount(2) system call entry point.
 *
 * Copies the filesystem type, the device name and the option data out of
 * user space, hands them to do_mount(), and frees the kernel copies again.
 * dir_name is passed to do_mount() as the original user-space pointer.
 *
 * Returns the result of do_mount(), or the error encoded in the pointer
 * returned by whichever copy step failed.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
        char __user *, type, unsigned long, flags, void __user *, data)
{
    int ret;
    char *kernel_type;
    char *kernel_dev;
    void *options;

    /*
     * Each copy step pre-loads ret with PTR_ERR() of the result; the value
     * is only meaningful (and only returned) when IS_ERR() is true.
     */
    kernel_type = copy_mount_string(type);
    ret = PTR_ERR(kernel_type);
    if (IS_ERR(kernel_type))
        goto out_type;

    kernel_dev = copy_mount_string(dev_name);
    ret = PTR_ERR(kernel_dev);
    if (IS_ERR(kernel_dev))
        goto out_dev;

    options = copy_mount_options(data);
    ret = PTR_ERR(options);
    if (IS_ERR(options))
        goto out_data;

    ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);

    /* Unwind in reverse order of acquisition; each label frees what was
     * successfully copied before the failing step. */
    kfree(options);
out_data:
    kfree(kernel_dev);
out_dev:
    kfree(kernel_type);
out_type:
    return ret;
}

do_mount 主要调用了path_mount , path_mount 中主要设置了sb_flags和mnt_flags，然后调用了do_new_mount

fs/namespace.c
/*
 * path_mount - validate a mount request and dispatch it to the right handler
 * @dev_name:  source name (e.g. a device path); forwarded to the handlers
 * @path:      the mount point, already resolved to a struct path
 * @type_page: filesystem type name
 * @flags:     MS_* flags from the caller
 * @data_page: option data, or NULL
 *
 * Splits @flags into per-mountpoint flags (mnt_flags, MNT_*) and superblock
 * flags (sb_flags, SB_*), then calls the handler for remount, bind, propagation
 * change, move or - when none of those is requested - a brand new mount.
 *
 * Returns 0 or a negative errno from the checks below or from the handler.
 */
int path_mount(const char *dev_name, struct path *path,
        const char *type_page, unsigned long flags, void *data_page)
{
    unsigned int mnt_flags = 0, sb_flags;
    int ret;

    /* Discard magic: if the MS_MGC_MSK bits hold MS_MGC_VAL, clear them */
    if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
        flags &= ~MS_MGC_MSK;

    /* Basic sanity checks */
    /* Force a NUL in the last byte; assumes data_page is a PAGE_SIZE buffer - confirm against the caller */
    if (data_page)
        ((char *)data_page)[PAGE_SIZE - 1] = 0;

    if (flags & MS_NOUSER)
        return -EINVAL;

    /* The security hook runs before the permission checks */
    ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
    if (ret)
        return ret;
    if (!may_mount())
        return -EPERM;
    /* NOTE(review): an SB_* constant is tested against the MS_* flags word; presumably the two values coincide - confirm */
    if ((flags & SB_MANDLOCK) && !may_mandlock())
        return -EPERM;

    /* Default to relatime unless overridden */
    if (!(flags & MS_NOATIME))
        mnt_flags |= MNT_RELATIME;

    /* Separate the per-mountpoint flags */
    if (flags & MS_NOSUID)
        mnt_flags |= MNT_NOSUID;
    if (flags & MS_NODEV)
        mnt_flags |= MNT_NODEV;
    if (flags & MS_NOEXEC)
        mnt_flags |= MNT_NOEXEC;
    if (flags & MS_NOATIME)
        mnt_flags |= MNT_NOATIME;
    if (flags & MS_NODIRATIME)
        mnt_flags |= MNT_NODIRATIME;
    /* Order matters: this clears RELATIME/NOATIME bits that may have been set above */
    if (flags & MS_STRICTATIME)
        mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
    if (flags & MS_RDONLY)
        mnt_flags |= MNT_READONLY;
    if (flags & MS_NOSYMFOLLOW)
        mnt_flags |= MNT_NOSYMFOLLOW;

    /* The default atime for remount is preservation */
    /* i.e. with no atime flag given, copy the atime bits of the existing mount */
    if ((flags & MS_REMOUNT) &&
        ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
               MS_STRICTATIME)) == 0)) {
        mnt_flags &= ~MNT_ATIME_MASK;
        mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
    }

    /* Keep only the superblock-level bits; masks SB_* directly out of the MS_* word */
    sb_flags = flags & (SB_RDONLY |
                SB_SYNCHRONOUS |
                SB_MANDLOCK |
                SB_DIRSYNC |
                SB_SILENT |
                SB_POSIXACL |
                SB_LAZYTIME |
                SB_I_VERSION);

    /* Dispatch: the first matching request wins; a new mount is the default */
    if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
        return do_reconfigure_mnt(path, mnt_flags);
    if (flags & MS_REMOUNT)
        return do_remount(path, flags, sb_flags, mnt_flags, data_page);
    if (flags & MS_BIND)
        return do_loopback(path, dev_name, flags & MS_REC);
    if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
        return do_change_type(path, flags);
    if (flags & MS_MOVE)
        return do_move_mount_old(path, dev_name);

    return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
                data_page);
}

分配fs_context

do_new_mount 是比较重要的函数，这里面分配了fs_context结构体。

fs/namespace.c
/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 *
 * @path:      mount point
 * @fstype:    filesystem type name; may carry a ".subtype" suffix
 * @sb_flags:  superblock flags (SB_*) for the new context
 * @mnt_flags: per-mountpoint flags (MNT_*) handed to do_new_mount_fc()
 * @name:      source name (e.g. device path), or NULL
 * @data:      option data handed to parse_monolithic_mount_data()
 *
 * Returns 0 on success or a negative errno. Once the fs_context exists, the
 * steps are chained on err: the first failure skips all later steps and the
 * context is always released at the end.
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
            int mnt_flags, const char *name, void *data)
{
    struct file_system_type *type;
    struct fs_context *fc;
    const char *subtype = NULL;
    int err = 0;

    if (!fstype)
        return -EINVAL;

    type = get_fs_type(fstype);                             /* 1: look up the file_system_type by name */
    if (!type)
        return -ENODEV;

    /* For types that allow it, split "type.subtype"; an empty subtype is rejected */
    if (type->fs_flags & FS_HAS_SUBTYPE) {
        subtype = strchr(fstype, '.');
        if (subtype) {
            subtype++;
            if (!*subtype) {
                put_filesystem(type);
                return -EINVAL;
            }
        }
    }

    fc = fs_context_for_mount(type, sb_flags);                  /* 2: allocate and initialise the fs_context */
    /* Drop the reference taken by get_fs_type(); presumably fc holds its own - confirm */
    put_filesystem(type);
    if (IS_ERR(fc))
        return PTR_ERR(fc);

    /* Feed subtype, source and the option blob into the context as parameters */
    if (subtype)
        err = vfs_parse_fs_string(fc, "subtype",
                      subtype, strlen(subtype));
    if (!err && name)
        err = vfs_parse_fs_string(fc, "source", name, strlen(name));
    if (!err)
        err = parse_monolithic_mount_data(fc, data);
    if (!err && !mount_capable(fc))
        err = -EPERM;
    if (!err)
        err = vfs_get_tree(fc);                            /* 3: obtain the superblock and root via fc->ops->get_tree */
    if (!err)
        err = do_new_mount_fc(fc, path, mnt_flags);

    put_fs_context(fc);
    return err;
}

(1) 根据fstype 找到对应的 file_system_type 结构体。
(2) 初始化fc 结构体。主要调用了alloc_fs_context 这个函数，其中会调用文件系统自定义的init_fs_context回调；如果文件系统没有定义fc->fs_type->init_fs_context，则会调用legacy_init_fs_context进行初始化，此时fc->ops = &legacy_fs_context_ops，其中的legacy_get_tree会调用fc->fs_type->mount。
(3) 调用vfs_get_tree，这里会调用fc->ops->get_tree。exfat 的实现是exfat_get_tree，它主要调用get_tree_bdev，并传入exfat_fill_super作为回调来填充super_block。

申请super_block

get_tree_bdev 函数中会申请super_block结构体，主要流程如下：

/**
 * get_tree_bdev - Get a superblock based on a single block device
 * @fc: The filesystem context holding the parameters
 * @fill_super: Helper to initialise a new superblock
 *
 * Opens the block device named by fc->source, then finds or allocates the
 * superblock keyed on that device.  A freshly allocated superblock is
 * initialised through @fill_super; an already mounted one is reused as long
 * as the read-only state matches.  On success fc->root holds a reference to
 * the superblock's root dentry.
 *
 * Return: 0 on success, otherwise a negative errno (-EBUSY for a frozen
 * device or a read-only/read-write mismatch, or the error produced while
 * opening the device, getting the superblock or running @fill_super).
 */
int get_tree_bdev(struct fs_context *fc,
        int (*fill_super)(struct super_block *,
                  struct fs_context *))
{
    struct block_device *bdev;
    struct super_block *s;
    fmode_t mode = FMODE_READ | FMODE_EXCL;
    int error = 0;

    if (!(fc->sb_flags & SB_RDONLY))
        mode |= FMODE_WRITE;

    if (!fc->source)
        return invalf(fc, "No source specified");

    /* Open the device first: everything below operates on bdev. */
    bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type);
    if (IS_ERR(bdev)) {
        errorf(fc, "%s: Can't open blockdev", fc->source);
        return PTR_ERR(bdev);
    }

    /* Once the superblock is inserted into the list by sget_fc(), s_umount
     * will protect the lockfs code from trying to start a snapshot while
     * we are mounting
     */
    mutex_lock(&bdev->bd_fsfreeze_mutex);
    if (bdev->bd_fsfreeze_count > 0) {
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
        blkdev_put(bdev, mode);
        return -EBUSY;
    }

    fc->sb_flags |= SB_NOSEC;
    fc->sget_key = bdev;
    s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);            /* 1: find or allocate the super_block */
    mutex_unlock(&bdev->bd_fsfreeze_mutex);
    if (IS_ERR(s)) {
        blkdev_put(bdev, mode);
        return PTR_ERR(s);
    }

    if (s->s_root) {
        /* An existing, already set up superblock was found for this device. */
        /* Don't summarily change the RO/RW state. */
        if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
            warnf(fc, "%pg: Can't mount, would change RO state", bdev);
            deactivate_locked_super(s);
            blkdev_put(bdev, mode);
            return -EBUSY;
        }

        /*
         * s_umount nests inside bd_mutex during
         * __invalidate_device().  blkdev_put() acquires
         * bd_mutex and can't be called under s_umount.  Drop
         * s_umount temporarily.  This is safe as we're
         * holding an active reference.
         */
        up_write(&s->s_umount);
        blkdev_put(bdev, mode);
        down_write(&s->s_umount);
    } else {
        /* New superblock: record the open mode and device name, then let
         * the filesystem populate it. */
        s->s_mode = mode;
        snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
        sb_set_blocksize(s, block_size(bdev));
        error = fill_super(s, fc);                                 /* 2: filesystem-specific initialisation */
        if (error) {
            deactivate_locked_super(s);
            return error;
        }

        s->s_flags |= SB_ACTIVE;
        bdev->bd_super = s;
    }

    /* Hand the root dentry to the context; it must not have one yet. */
    BUG_ON(fc->root);
    fc->root = dget(s->s_root);
    return 0;
}

(1) 调用sget_fc 查找或分配 super_block 结构体：如果该块设备已有对应的super_block（s->s_root 非空）则直接复用，否则新分配一个

(2) 调用传入的fill_super函数，执行文件系统自定义的操作。这里一般做的是去解析文件系统的元数据，并填充到文件系统的私有结构体中。

调用回调函数填充super_block

exfat fill_super函数的实现如下：

fs/exfat/super.c
/*
 * exfat_fill_super - fill_super callback exfat hands to get_tree_bdev()
 * @sb: the superblock to initialise; sb->s_fs_info already points at the
 *      exfat_sb_info holding the mount options
 * @fc: the filesystem context of this mount (not referenced in the body)
 *
 * Sets the generic superblock fields, parses the volume through
 * __exfat_fill_super(), selects the dentry operations, and builds the root
 * inode and root dentry.
 *
 * Returns 0 on success.  On failure returns a negative errno after releasing
 * everything set up so far, including sbi itself (sb->s_fs_info is cleared).
 */
static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
{
    struct exfat_sb_info *sbi = sb->s_fs_info;
    struct exfat_mount_options *opts = &sbi->options;
    struct inode *root_inode;
    int err;

    /* (unsigned short)-1 is presumably the "not set" sentinel; derive a default from fs_dmask */
    if (opts->allow_utime == (unsigned short)-1)
        opts->allow_utime = ~opts->fs_dmask & 0022;

    /* Turn the discard option off again when the device queue does not support it */
    if (opts->discard) {
        struct request_queue *q = bdev_get_queue(sb->s_bdev);

        if (!blk_queue_discard(q)) {
            exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard");
            opts->discard = 0;
        }
    }

    sb->s_flags |= SB_NODIRATIME;
    sb->s_magic = EXFAT_SUPER_MAGIC;
    sb->s_op = &exfat_sops;

    /* Timestamp granularity of 10 ms and the representable timestamp range */
    sb->s_time_gran = 10 * NSEC_PER_MSEC;
    sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS;
    sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS;

    err = __exfat_fill_super(sb);                                 /* 1: parse the on-disk exfat metadata (helper not shown here) */
    if (err) {
        exfat_err(sb, "failed to recognize exfat type");
        goto check_nls_io;
    }

    /* set up enough so that it can read an inode */
    exfat_hash_init(sb);

    /* "utf8" only sets a flag; any other iocharset needs its NLS table loaded */
    if (!strcmp(sbi->options.iocharset, "utf8"))
        opts->utf8 = 1;
    else {
        sbi->nls_io = load_nls(sbi->options.iocharset);
        if (!sbi->nls_io) {
            exfat_err(sb, "IO charset %s not found",
                  sbi->options.iocharset);
            err = -EINVAL;
            goto free_table;
        }
    }

    if (sbi->options.utf8)
        sb->s_d_op = &exfat_utf8_dentry_ops;
    else
        sb->s_d_op = &exfat_dentry_ops;

    root_inode = new_inode(sb);
    if (!root_inode) {
        exfat_err(sb, "failed to allocate root inode");
        err = -ENOMEM;
        goto free_table;
    }

    root_inode->i_ino = EXFAT_ROOT_INO;
    inode_set_iversion(root_inode, 1);
    err = exfat_read_root(root_inode);
    if (err) {
        exfat_err(sb, "failed to initialize root inode");
        goto put_inode;
    }

    exfat_hash_inode(root_inode, EXFAT_I(root_inode)->i_pos);
    insert_inode_hash(root_inode);

    sb->s_root = d_make_root(root_inode);
    if (!sb->s_root) {
        exfat_err(sb, "failed to get the root dentry");
        err = -ENOMEM;
        goto put_inode;
    }

    return 0;

    /*
     * Error unwinding: the labels fall through from the latest failure point
     * to the earliest, so each one releases what was set up before it.
     */
put_inode:
    iput(root_inode);
    sb->s_root = NULL;

free_table:
    exfat_free_upcase_table(sbi);
    exfat_free_bitmap(sbi);
    brelse(sbi->boot_bh);

check_nls_io:
    unload_nls(sbi->nls_io);
    exfat_free_iocharset(sbi);
    sb->s_fs_info = NULL;
    kfree(sbi);
    return err;
}

主要分为两部分：

（1）读取exfat 的文件系统信息，解析后保存在 exfat_sb_info这个结构体中

（2）设置super_block 的一些重要field, 例如s_op， s_root， s_d_op

装载到全局文件系统树

vfs_get_tree 成功返回（super_block 和根dentry 已经准备好）之后，do_new_mount 会调用do_new_mount_fc（见下面代码中的 /* 4 */）将新的挂载实例添加到系统中。

fs/namespace.c
/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 *
 * @path:      mount point
 * @fstype:    filesystem type name; may carry a ".subtype" suffix
 * @sb_flags:  superblock flags (SB_*) for the new context
 * @mnt_flags: per-mountpoint flags (MNT_*) handed to do_new_mount_fc()
 * @name:      source name (e.g. device path), or NULL
 * @data:      option data handed to parse_monolithic_mount_data()
 *
 * Returns 0 on success or a negative errno. Once the fs_context exists, the
 * steps are chained on err: the first failure skips all later steps and the
 * context is always released at the end.
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
            int mnt_flags, const char *name, void *data)
{
    struct file_system_type *type;
    struct fs_context *fc;
    const char *subtype = NULL;
    int err = 0;

    if (!fstype)
        return -EINVAL;

    type = get_fs_type(fstype);                             /* 1: look up the file_system_type by name */
    if (!type)
        return -ENODEV;

    /* For types that allow it, split "type.subtype"; an empty subtype is rejected */
    if (type->fs_flags & FS_HAS_SUBTYPE) {
        subtype = strchr(fstype, '.');
        if (subtype) {
            subtype++;
            if (!*subtype) {
                put_filesystem(type);
                return -EINVAL;
            }
        }
    }

    fc = fs_context_for_mount(type, sb_flags);                  /* 2: allocate and initialise the fs_context */
    /* Drop the reference taken by get_fs_type(); presumably fc holds its own - confirm */
    put_filesystem(type);
    if (IS_ERR(fc))
        return PTR_ERR(fc);

    /* Feed subtype, source and the option blob into the context as parameters */
    if (subtype)
        err = vfs_parse_fs_string(fc, "subtype",
                      subtype, strlen(subtype));
    if (!err && name)
        err = vfs_parse_fs_string(fc, "source", name, strlen(name));
    if (!err)
        err = parse_monolithic_mount_data(fc, data);
    if (!err && !mount_capable(fc))
        err = -EPERM;
    if (!err)
        err = vfs_get_tree(fc);                            /* 3: obtain the superblock and root via fc->ops->get_tree */
    if (!err)
        err = do_new_mount_fc(fc, path, mnt_flags);     /* 4: attach the new mount at path; runs only if every earlier step succeeded */

    put_fs_context(fc);
    return err;
}

这里主要调用了 do_new_mount_fc ，创建新的挂载实例关联到系统中，这里面数据结构涉及很多，且很混乱，其中的关系暂时没有梳理清楚。