Ramdisk: configuration, decompression, rootfs creation, and startup (a brief analysis)

1. How to turn on the ramdisk function?

If you want to use the ramdisk function, you need to do two steps: one is to modify the bootargs of the Kernel and add the rdinit option; the other is to embed rootfs.cpio when compiling uImage.

The following are two configurations of using ramdisk to boot and using eMMC as the boot medium. The ramdisk needs to specify the rdinit option, and the root device becomes /dev/ram0.

bootargs = "console=ttyS0,115200 rdinit=/sbin/init root=/dev/ram0 quiet";
bootargs = "console=ttyS0,115200 root=/dev/mmcblk1p2 rw rootfstype=ext4  rootflags=data=journal,barrier=1  rootwait";

rootfs.cpio also needs to be embedded into the kernel image, which can be configured through buildroot:

 

config BR2_TARGET_ROOTFS_INITRAMFS
    bool "initial RAM filesystem linked into linux kernel"
    depends on BR2_LINUX_KERNEL
    select BR2_TARGET_ROOTFS_CPIO
    help
      Integrate the root filesystem generated by Buildroot as an
      initramfs inside the kernel image. This integration will
      take place automatically.

      A rootfs.cpio file will be generated in the images/ directory.
      This is the archive that will be included in the kernel image.
      The default rootfs compression set in the kernel configuration
      is used, regardless of how buildroot's cpio archive is configured.

      Note that enabling initramfs together with another filesystem
      formats doesn't make sense: you would end up having two
      identical root filesystems, one embedded inside the kernel
      image, and one separately.

 

You can also use the following compilation options when compiling the kernel:

make uImage -j16 CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="${BR_BINARIES_DIR}/rootfs.cpio" KCPPFLAGS=-DCONFIG_BLK_DEV_INITRD

Take a look at how rdinit and root are processed in the kernel. If rdinit and root are set in bootargs, the kernel parses them during the startup phase and stores their values in ramdisk_execute_command and saved_root_name respectively.

These two important parameters will be used in the later analysis of the kernel startup process.

 

/*
 * Handler for the "rdinit=" boot argument: records the program that is to
 * be executed as init from the ramdisk.
 *
 * @str: the text that follows "rdinit=" on the kernel command line.
 *
 * Returns 1.
 */
static int __init rdinit_setup(char *str)
{
    unsigned int i;

    /* In this example ramdisk_execute_command becomes /sbin/init. */
    ramdisk_execute_command = str;
    /* See "auto" comment in init_setup */
    for (i = 1; i < MAX_INIT_ARGS; i++)
        argv_init[i] = NULL;
    return 1;
}
__setup("rdinit=", rdinit_setup);

/*
 * Handler for the "root=" boot argument: copies the requested root device
 * name into saved_root_name (truncated to the size of that buffer).
 *
 * @line: the text that follows "root=" on the kernel command line.
 *
 * Returns 1.
 */
static int __init root_dev_setup(char *line)
{
    /* In this example saved_root_name becomes /dev/ram0. */
    strlcpy(saved_root_name, line, sizeof(saved_root_name));
    return 1;
}

__setup("root=", root_dev_setup);

 

 

2. Where is the ramdisk stored?

From the vmlinux.lds.h file, we can see that ramfs is used according to CONFIG_BLK_DEV_INITRD definition.

INIT_RAM_FS stores the ramfs-related content and consists of two sections: .init.ramfs and .init.ramfs.info.

 

SECTIONS
{
    . = PAGE_OFFSET + PHYS_OFFSET_OFFSET;

    _stext = .;
    __init_begin = .;
...
    INIT_DATA_SECTION(PAGE_SIZE)
...
    . = ALIGN(PAGE_SIZE);
    /* The space from __init_begin to __init_end is released in free_initmem(). */
    __init_end = .;

    .text : AT(ADDR(.text) - LOAD_OFFSET) {
...
    } = 0
    _etext = .;
...
}

/*
 * Linker-script helper that emits the .init.data output section.
 * INIT_RAM_FS is expanded inside this section, so the embedded ramfs image
 * becomes part of the init data.
 */
#define INIT_DATA_SECTION(initsetup_align)                \
    .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {        \
...
        INIT_RAM_FS                        \
    }

/*
 * INIT_RAM_FS places the ramfs image in the link.
 *
 * With CONFIG_BLK_DEV_INITRD it defines __initramfs_start at a 4-byte
 * aligned address, keeps the .init.ramfs input sections there, then aligns
 * to 8 bytes and keeps the .init.ramfs.info input sections.
 * Without CONFIG_BLK_DEV_INITRD it expands to nothing.
 */
#ifdef CONFIG_BLK_DEV_INITRD
#define INIT_RAM_FS                            \
    . = ALIGN(4);                            \
    VMLINUX_SYMBOL(__initramfs_start) = .;                \
    KEEP(*(.init.ramfs))                        \
    . = ALIGN(8);                            \
    KEEP(*(.init.ramfs.info))
#else
#define INIT_RAM_FS
#endif

 

The two sections .init.ramfs and .init.ramfs.info are defined in initramfs_data.S.

 

.section .init.ramfs,"a"
__irf_start:
/* Include the binary file named by INITRAMFS_IMAGE verbatim into this object. */
.incbin __stringify(INITRAMFS_IMAGE)
__irf_end:
.section .init.ramfs.info,"a"
.globl VMLINUX_SYMBOL(__initramfs_size)
VMLINUX_SYMBOL(__initramfs_size):
/* __initramfs_size stores the byte length of the image: __irf_end - __irf_start. */
#ifdef CONFIG_64BIT
    .quad __irf_end - __irf_start
#else
    .long __irf_end - __irf_start
#endif

 

Where does INITRAMFS_IMAGE come from? Need to check the Makefile in the /usr/ directory.

It can be seen from the Makefile that the rootfs.cpio file corresponding to CONFIG_INITRAMFS_SOURCE is used as input, and gen_init_cpio and gen_initramfs_list.sh are called to generate the initramfs_data.cpio.gz file.

Then INITRAMFS_IMAGE corresponds to the /usr/initramfs_data.cpio$(suffix_y) file.

Finally, INITRAMFS_IMAGE is compiled into the initramfs_data.o file through .incbin, which corresponds to the .init.ramfs section.

 

800308cc T __security_initcall_start
800308d0 T __initramfs_start
800308d0 t __irf_start---------------------------Start address of the ramfs region.
800308d0 T __security_initcall_end
814ed9c0 T __initramfs_size----------------------Size of the ramfs image.
814ed9c0 t __irf_end-----------------------------End address of the ramfs region.
814ee000 T __init_end

 

 

3. How to start ramdisk?

As part of the init data, the ramfs image is located near the end of the region between __init_begin and __init_end, and is released in free_initmem().

The ramfs is stored between __initramfs_start and __initramfs_size in the form of a compressed package, and is decompressed by calling unpack_to_rootfs() in kernel_init()-->kernel_init_freeable()-->do_basic_setup()-->populate_rootfs().

 

kernel_init()
  -->kernel_init_freeable()-------------------------------Runs do_basic_setup() (i.e. all the initcalls), then checks ramdisk_execute_command.
    -->do_basic_setup()
      -->populate_rootfs()--------------------------------Unpacks the ramdisk located at __initramfs_start into rootfs.
  -->free_initmem()---------------------------------------Frees the memory between __init_begin and __init_end.
  -->run_init_process(ramdisk_execute_command)------------Executes ramdisk_execute_command, replacing the current process.

 

 

3.1 Analysis of initrd_start and initrd_end

Before start_kernel(), initrd and root related parameters are parsed from dts.

Call chain: early_init_dt_scan()-->early_init_dt_scan_nodes()-->early_init_dt_scan_chosen()-->early_init_dt_check_for_initrd():

 

/*
 * Scans the flattened device tree; the part shown here runs
 * early_init_dt_scan_chosen() over it, passing boot_command_line as the
 * callback data. The rest of the function is elided in this excerpt.
 */
void __init early_init_dt_scan_nodes(void)
{
    /* Retrieve various information from the /chosen node */
    of_scan_flat_dt(early_init_dt_scan_chosen, boot_command_line);
...
}

/*
 * Callback handed to of_scan_flat_dt(). The part shown here looks for
 * initrd properties on the given node via early_init_dt_check_for_initrd();
 * the rest of the function is elided in this excerpt.
 */
int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
                     int depth, void *data)
{
...
    early_init_dt_check_for_initrd(node);
...
}

/*
 * Reads the "linux,initrd-start" and "linux,initrd-end" properties of
 * @node and, when both are present, declares the initrd range through
 * __early_init_dt_declare_initrd().
 *
 * If either property is missing the function returns without declaring
 * anything.
 */
static void __init early_init_dt_check_for_initrd(unsigned long node)
{
    u64 start, end;
    int len;
    const __be32 *prop;

    pr_debug("Looking for initrd properties... ");

    prop = of_get_flat_dt_prop(node, "linux,initrd-start", &len);
    if (!prop)
        return;
    /* len/4: presumably the property length in bytes converted to 32-bit cells */
    start = of_read_number(prop, len/4);

    prop = of_get_flat_dt_prop(node, "linux,initrd-end", &len);
    if (!prop)
        return;
    end = of_read_number(prop, len/4);

    __early_init_dt_declare_initrd(start, end);

    pr_debug("initrd_start=0x%llx  initrd_end=0x%llx\n",
         (unsigned long long)start, (unsigned long long)end);
}

 

Regarding initrd_start and initrd_end: early_init_dt_check_for_initrd() shows that if "linux,initrd-start" and "linux,initrd-end" are not set in the dts, initrd_start and initrd_end keep their initial value of 0.

 

/*
 * Default implementation, used when no other definition of
 * __early_init_dt_declare_initrd exists: records the initrd range in
 * initrd_start/initrd_end after passing both addresses through __va(),
 * and sets initrd_below_start_ok.
 *
 * The caller shown in this document passes u64 values; they are narrowed
 * to unsigned long by this prototype.
 *
 * NOTE(review): the #endif that closes CONFIG_BLK_DEV_INITRD is not part
 * of this excerpt.
 */
#ifdef CONFIG_BLK_DEV_INITRD
#ifndef __early_init_dt_declare_initrd
static void __early_init_dt_declare_initrd(unsigned long start,
                       unsigned long end)
{
    initrd_start = (unsigned long)__va(start);
    initrd_end = (unsigned long)__va(end);
    initrd_below_start_ok = 1;
}
#endif

 

 

3.2 rootfs and ramfs file system

Rootfs is not a file system type of its own; depending on the configuration it is backed by either ramfs or tmpfs.

Here is an analysis of how rootfs corresponds to ramfs, and a brief introduction to ramfs.

3.2.1 rootfs file system

In start_kernel()-->vfs_caches_init()-->mnt_init(), register the rootfs type file system.

 

/*
 * Mount-layer initialisation (beginning elided in this excerpt). The part
 * shown creates the "fs" kobject, warning if that fails, then registers
 * rootfs via init_rootfs() and mounts it via init_mount_tree().
 */
void __init mnt_init(void)
{
...
    fs_kobj = kobject_create_and_add("fs", NULL);
    if (!fs_kobj)
        printk(KERN_WARNING "%s: kobj create error\n", __func__);
    init_rootfs();
    init_mount_tree();
}

/*
 * Registers the rootfs file system type and initialises the file system
 * that will back it (tmpfs or ramfs).
 *
 * Returns the register_filesystem() error if registration fails; the
 * remainder of the function is elided in this excerpt.
 */
int __init init_rootfs(void)
{
    int err = register_filesystem(&rootfs_fs_type);

    if (err)
        return err;

    /*
     * tmpfs is chosen only when CONFIG_TMPFS is enabled, no root= was given
     * (saved_root_name is empty) and root_fs_names is either unset or
     * mentions "tmpfs".
     */
    if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] &&
        (!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
        /* Initialise the tmpfs file system. */
        err = shmem_init();
        /* rootfs_mount() later uses this flag to pick tmpfs or ramfs. */
        is_tmpfs = true;
    } else {
        /* Initialise the ramfs file system. */
        err = init_ramfs_fs();
    }
...
}

/*
 * Mounts rootfs, creates the initial mount namespace from that mount and
 * makes it both the working directory and the root of the current task.
 * Panics if any step fails.
 */
static void __init init_mount_tree(void)
{
    struct vfsmount *mnt;
    struct mnt_namespace *ns;
    struct path root;
    struct file_system_type *type;

    /*
     * Look up the file_system_type registered for "rootfs"; in this
     * example it ends up using the ramfs operations.
     */
    type = get_fs_type("rootfs");
    if (!type)
        panic("Can't find rootfs type");
    /*
     * This calls mount_fs(), which in turn calls rootfs_fs_type->mount(),
     * i.e. rootfs_mount().
     */
    mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
    put_filesystem(type);
    if (IS_ERR(mnt))
        panic("Can't create rootfs");

    ns = create_mnt_ns(mnt);
    if (IS_ERR(ns))
        panic("Can't allocate initial namespace");

    init_task.nsproxy->mnt_ns = ns;
    get_mnt_ns(ns);

    root.mnt = mnt;
    root.dentry = mnt->mnt_root;
    mnt->mnt_flags |= MNT_LOCKED;

    set_fs_pwd(current->fs, &root);
    set_fs_root(current->fs, &root);
}

 

Let's take a look at how the rootfs file system is mounted. Rootfs does not have a fixed type of its own; it uses either ramfs or tmpfs.

 

/* Set by init_rootfs() when tmpfs rather than ramfs backs rootfs. */
static bool is_tmpfs;

/*
 * ->mount() callback of rootfs_fs_type.
 *
 * Only the first call succeeds; every later call returns
 * ERR_PTR(-ENODEV). The superblock is filled by ramfs_fill_super() unless
 * CONFIG_TMPFS is enabled and is_tmpfs is set, in which case
 * shmem_fill_super() is used.
 */
static struct dentry *rootfs_mount(struct file_system_type *fs_type,
    int flags, const char *dev_name, void *data)
{
    static unsigned long once;
    void *fill = ramfs_fill_super;

    if (test_and_set_bit(0, &once))
        return ERR_PTR(-ENODEV);

    if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs)
        fill = shmem_fill_super;

    /* Whether fill is the ramfs or the tmpfs one was decided in init_rootfs(). */
    return mount_nodev(fs_type, flags, data, fill);
}

/*
 * File system type registered under the name "rootfs"; mounting goes
 * through rootfs_mount() and superblock teardown through
 * kill_litter_super().
 */
static struct file_system_type rootfs_fs_type = {
    .name        = "rootfs",
    .mount        = rootfs_mount,
    .kill_sb    = kill_litter_super,
};

struct dentry *mount_nodev(struct file_system_type *fs_type,
    int flags, void *data,
    int (*fill_super)(struct super_block *, void *, int))
{
    int error;
    struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);

if (IS_ERR(s))
        return ERR_CAST(s);

    error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);------调用ramfs_fill_super()或者shmem_fill_super()。
    if (error) {
        deactivate_locked_super(s);
        return ERR_PTR(error);
    }
    s->s_flags |= MS_ACTIVE;
    return dget(s->s_root);
}

/*
 * Fills in a ramfs superblock: allocates the per-superblock
 * ramfs_fs_info, parses the mount options, sets the superblock
 * parameters and creates the root directory.
 *
 * Returns 0 on success, -ENOMEM when an allocation fails, or the error
 * reported by ramfs_parse_options().
 */
int ramfs_fill_super(struct super_block *sb, void *data, int silent)
{
    struct ramfs_fs_info *fsi;
    struct inode *inode;
    int err;

    save_mount_options(sb, data);

    fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
    sb->s_fs_info = fsi;
    if (!fsi)
        return -ENOMEM;

    err = ramfs_parse_options(data, &fsi->mount_opts);
    if (err)
        return err;

    sb->s_maxbytes        = MAX_LFS_FILESIZE;
    sb->s_blocksize        = PAGE_SIZE;
    sb->s_blocksize_bits    = PAGE_SHIFT;
    sb->s_magic        = RAMFS_MAGIC;
    /*
     * rootfs ends up with the ramfs superblock operations here; when tmpfs
     * is used instead, shmem_ops takes this role.
     */
    sb->s_op        = &ramfs_ops;
    sb->s_time_gran        = 1;

    inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
    /* Create the root node "/". */
    sb->s_root = d_make_root(inode);
    if (!sb->s_root)
        return -ENOMEM;

    return 0;
}

/*
 * Allocates the root dentry for @root_inode.
 *
 * Returns the new dentry, or NULL when @root_inode is NULL or the dentry
 * allocation fails; on allocation failure the inode reference is dropped
 * with iput().
 */
struct dentry *d_make_root(struct inode *root_inode)
{
    struct dentry *res = NULL;

    if (root_inode) {
        /* A NULL name argument means the root node "/" is being created. */
        res = __d_alloc(root_inode->i_sb, NULL);
        if (res)
            d_instantiate(res, root_inode);
        else
            iput(root_inode);
    }
    return res;
}

 

In summary, when the kernel is started, init_rootfs() first determines whether to use tmpfs or ramfs according to the parameters, and then mounts in init_mount_tree().

3.2.2 ramfs file system

ramfs selects the appropriate inode or file operation type according to the requested mode type.

 

/*
 * Allocates a new ramfs inode on @sb and wires up the inode and file
 * operations that match the file type encoded in @mode.
 *
 * @sb:   superblock the inode belongs to.
 * @dir:  parent directory inode handed to inode_init_owner(); may be NULL
 *        (ramfs_fill_super() passes NULL for the root).
 * @mode: file type and permission bits.
 * @dev:  device number, used only for special files.
 *
 * Returns the new inode, or NULL when new_inode() fails.
 */
struct inode *ramfs_get_inode(struct super_block *sb,
                const struct inode *dir, umode_t mode, dev_t dev)
{
    struct inode * inode = new_inode(sb);

    if (inode) {
        inode->i_ino = get_next_ino();
        inode_init_owner(inode, dir, mode);
        inode->i_mapping->a_ops = &ramfs_aops;
        mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
        mapping_set_unevictable(inode->i_mapping);
        inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
        switch (mode & S_IFMT) {
        default:
            /* Character, block, pipe/FIFO and similar special files. */
            init_special_inode(inode, mode, dev);
            break;
        case S_IFREG:
            /* Regular files. */
            inode->i_op = &ramfs_file_inode_operations;
            inode->i_fop = &ramfs_file_operations;
            break;
        case S_IFDIR:
            /* Directories. */
            inode->i_op = &ramfs_dir_inode_operations;
            inode->i_fop = &simple_dir_operations;

            /* directory inodes start off with i_nlink == 2 (for "." entry) */
            inc_nlink(inode);
            break;
        case S_IFLNK:
            /* Symbolic links. */
            inode->i_op = &page_symlink_inode_operations;
            inode_nohighmem(inode);
            break;
        }
    }
    return inode;
}

/*
 * Sets up @inode as a special file according to @mode.
 *
 * Character and block devices get their default file operations and have
 * @rdev stored in i_rdev; FIFOs get pipefifo_fops; sockets are left
 * untouched. Any other mode is reported with a debug message.
 */
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
    inode->i_mode = mode;
    if (S_ISCHR(mode)) {
        inode->i_fop = &def_chr_fops;
        inode->i_rdev = rdev;
    } else if (S_ISBLK(mode)) {
        inode->i_fop = &def_blk_fops;
        inode->i_rdev = rdev;
    } else if (S_ISFIFO(mode))
        inode->i_fop = &pipefifo_fops;
    else if (S_ISSOCK(mode))
        ;    /* leave it no_open_fops */
    else
        printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
                  " inode %s:%lu\n", mode, inode->i_sb->s_id,
                  inode->i_ino);
}

/*
 * File operations that ramfs_get_inode() installs on regular files; most
 * entries are generic VFS helpers, and fsync is a no-op.
 */
const struct file_operations ramfs_file_operations = {
    .read_iter    = generic_file_read_iter,
    .write_iter    = generic_file_write_iter,
    .mmap        = generic_file_mmap,
    .fsync        = noop_fsync,
    .splice_read    = generic_file_splice_read,
    .splice_write    = iter_file_splice_write,
    .llseek        = generic_file_llseek,
    .get_unmapped_area    = ramfs_mmu_get_unmapped_area,
};

/* Inode operations that ramfs_get_inode() installs on regular files. */
const struct inode_operations ramfs_file_inode_operations = {
    .setattr    = simple_setattr,
    .getattr    = simple_getattr,
};

/*
 * Inode operations that ramfs_get_inode() installs on directories: a mix
 * of ramfs-specific creators (create, symlink, mkdir, mknod) and simple_*
 * helpers.
 */
static const struct inode_operations ramfs_dir_inode_operations = {
    .create        = ramfs_create,
    .lookup        = simple_lookup,
    .link        = simple_link,
    .unlink        = simple_unlink,
    .symlink    = ramfs_symlink,
    .mkdir        = ramfs_mkdir,
    .rmdir        = simple_rmdir,
    .mknod        = ramfs_mknod,
    .rename        = simple_rename,
};

/* Inode operations that ramfs_get_inode() installs on symbolic links. */
const struct inode_operations page_symlink_inode_operations = {
    .readlink    = generic_readlink,
    .get_link    = page_get_link,
};

 

According to different types of inode->i_mode, different inode->i_fop and inode->i_op are adopted.

3.3 The calling sequence of rootfs_initcall() in the kernel

All initcalls are invoked in turn from start_kernel()-->rest_init()-->kernel_init()-->kernel_init_freeable()-->do_basic_setup().

Among them, rootfs_initcall() is after fs_initcall() and before device_initcall().

/*
 * Initcall registration macros in the order they are listed here: the
 * rootfs level sits between the fs levels (5, 5s) and the device levels
 * (6, 6s).
 */
#define fs_initcall(fn)            __define_initcall(fn, 5)
#define fs_initcall_sync(fn)        __define_initcall(fn, 5s)
#define rootfs_initcall(fn)        __define_initcall(fn, rootfs)
#define device_initcall(fn)        __define_initcall(fn, 6)
#define device_initcall_sync(fn)    __define_initcall(fn, 6s)

 

3.4 Decompression of ramfs

When CONFIG_BLK_DEV_INITRD is not defined, rootfs_initcall() registers default_rootfs().

default_rootfs() mainly generates two directories /dev and /root, and a device file /dev/console.

 

/*
 * Builds a minimal root file system: the directory /dev (mode 0755), the
 * character device /dev/console (major 5, minor 1, owner read/write) and
 * the directory /root (mode 0700).
 *
 * Returns 0 on success; on the first failing step it logs a warning and
 * returns that step's negative error code.
 */
static int __init default_rootfs(void)
{
    int err;

    err = sys_mkdir((const char __user __force *) "/dev", 0755);
    if (err < 0)
        goto out;

    err = sys_mknod((const char __user __force *) "/dev/console",
            S_IFCHR | S_IRUSR | S_IWUSR,
            new_encode_dev(MKDEV(5, 1)));
    if (err < 0)
        goto out;

    err = sys_mkdir((const char __user __force *) "/root", 0700);
    if (err < 0)
        goto out;

    return 0;

out:
    printk(KERN_WARNING "Failed to create a rootfs\n");
    return err;
}

 

When CONFIG_BLK_DEV_INITRD is defined, populate_rootfs() is called to decompress the ramdisk into RAM.

unpack_to_rootfs() obtains the type of decompress from the header according to the parameters __initramfs_start and __initramfs_size; then calls decompress_fn to decompress.

 

/*
 * Unpacks the initramfs built into the kernel image and then, if an
 * external initrd was handed over (initrd_start != 0), processes that as
 * well.
 *
 * With CONFIG_BLK_DEV_RAM the external image is first tried as an
 * initramfs archive; if that fails, rootfs is cleaned, the built-in
 * initramfs is unpacked again and the raw image is saved to
 * /initrd.image. Without CONFIG_BLK_DEV_RAM the image is only unpacked as
 * an initramfs.
 *
 * Panics if the built-in initramfs cannot be unpacked; otherwise
 * returns 0.
 */
static int __init populate_rootfs(void)
{
    char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);

    if (err)
        panic("%s", err); /* Failed to decompress INTERNAL initramfs */
    /* Was an initrd explicitly supplied? If so, handle it separately. */
    if (initrd_start) {
#ifdef CONFIG_BLK_DEV_RAM
        int fd;
        printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n");
        /* Check whether the loaded image is an initramfs cpio archive. */
        err = unpack_to_rootfs((char *)initrd_start,
            initrd_end - initrd_start);
        if (!err) {
            /* Unpacked successfully: release the memory held by the initrd. */
            free_initrd();
            goto done;
        } else {
            /* It may be an initrd image instead: restore the built-in rootfs. */
            clean_rootfs();
            unpack_to_rootfs(__initramfs_start, __initramfs_size);
        }
        printk(KERN_INFO "rootfs image is not initramfs (%s)"
                "; looks like an initrd\n", err);
        /* Create the file /initrd.image. */
        fd = sys_open("/initrd.image",
                  O_WRONLY|O_CREAT, 0700);
        if (fd >= 0) {
            /* Save the bytes from initrd_start to initrd_end into /initrd.image. */
            ssize_t written = xwrite(fd, (char *)initrd_start,
                        initrd_end - initrd_start);

            if (written != initrd_end - initrd_start)
                pr_err("/initrd.image: incomplete write (%zd != %ld)\n",
                       written, initrd_end - initrd_start);

            /* Close the file and release the memory held by the initrd. */
            sys_close(fd);
            free_initrd();
        }
    done:
#else
        printk(KERN_INFO "Unpacking initramfs...\n");
        err = unpack_to_rootfs((char *)initrd_start,
            initrd_end - initrd_start);
        if (err)
            printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
        free_initrd();
#endif
        load_default_modules();
    }
    return 0;
}

/*
 * Unpacks the archive found in @buf (@len bytes) into rootfs.
 *
 * Working buffers for the cpio header, symlink target and file name are
 * allocated up front (panicking if any allocation fails) and freed before
 * returning. Each pass of the loop detects the compression format,
 * decompresses with flush_buffer() as the output sink and then advances
 * @buf/@len by the number of bytes consumed. Parts of the loop are elided
 * in this excerpt.
 *
 * Returns NULL on success, or the error message recorded in 'message'.
 */
static char * __init unpack_to_rootfs(char *buf, unsigned long len)
{
    long written;
    decompress_fn decompress;
    const char *compress_name;
    static __initdata char msg_buf[64];

    header_buf = kmalloc(110, GFP_KERNEL);
    symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL);
    name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL);

    if (!header_buf || !symlink_buf || !name_buf)
        panic("can't allocate buffers");

    state = Start;
    this_header = 0;
    message = NULL;
    while (!message && len) {
...
        /*
         * The magic in the first two bytes of buf selects the
         * decompressor. In this example the data is gzip, so
         * decompress is gunzip().
         */
        decompress = decompress_method(buf, len, &compress_name);
        pr_debug("Detected %s compressed data\n", compress_name);
        if (decompress) {
            int res = decompress(buf, len, NULL, flush_buffer, NULL,
                   &my_inptr, error);
            if (res)
                error("decompressor failed");
        } else if (compress_name) {
...
        } else
            error("junk in compressed archive");
        if (state != Reset)
            error("junk in compressed archive");
        this_header = saved_offset + my_inptr;
        buf += my_inptr;
        len -= my_inptr;
    }
    dir_utime();
    kfree(name_buf);
    kfree(symlink_buf);
    kfree(header_buf);
    return message;
}

 

 

3.4.1 decompressor

The decompressor supported in the kernel is represented by struct compress_format, and the core is the decompress_fn() function.

 

/*
 * One supported compression format: the two magic bytes that identify it,
 * its human-readable name and the function that decompresses it.
 */
struct compress_format {
    unsigned char magic[2];
    const char *name;
    decompress_fn decompressor;
};


/* Common signature shared by all decompressor entry points. */
typedef int (*decompress_fn) (unsigned char *inbuf, long len,
                  long (*fill)(void*, unsigned long),
                  long (*flush)(void*, unsigned long),
                  unsigned char *outbuf,
                  long *posp,
                  void(*error)(char *x));

/* inbuf   - input buffer
 * len     - len of pre-read data in inbuf
 * fill    - function to fill inbuf when empty
 * flush   - function to write out outbuf
 * outbuf  - output buffer
 * posp    - if non-null, input position (number of bytes read) will be
 *           returned here
 * error   - function called with a message string to report an error
 */

 

decompress_method() determines which decompressor to use from the first two bytes of the incoming inbuf.

compressed_formats[] lists the decompressor types supported by the system.

 

/*
 * Table of known compression formats, matched by their first two bytes.
 * The all-zero/NULL entry terminates the table; decompress_method() stops
 * on the NULL name.
 */
static const struct compress_format compressed_formats[] __initconst = {
    { {0x1f, 0x8b}, "gzip", gunzip },
    { {0x1f, 0x9e}, "gzip", gunzip },
    { {0x42, 0x5a}, "bzip2", bunzip2 },
    { {0x5d, 0x00}, "lzma", unlzma },
    { {0xfd, 0x37}, "xz", unxz },
    { {0x89, 0x4c}, "lzo", unlzo },
    { {0x02, 0x21}, "lz4", unlz4 },
    { {0, 0}, NULL, NULL }
};

/*
 * Selects a decompressor by comparing the first two bytes of @inbuf with
 * the magic of every entry in compressed_formats[].
 *
 * @inbuf: start of the (possibly compressed) data.
 * @len:   number of bytes available in @inbuf.
 * @name:  if non-NULL, receives the name of the matched format, or NULL
 *         when nothing matched.
 *
 * Returns the matching decompressor, or NULL when no format matched (the
 * table terminator carries a NULL decompressor). The start of the
 * function is elided in this excerpt.
 */
decompress_fn __init decompress_method(const unsigned char *inbuf, long len,
                const char **name)
{
...
    pr_debug("Compressed data magic: %#.2x %#.2x\n", inbuf[0], inbuf[1]);

    /*
     * Walk compressed_formats[] until an entry with a matching magic is
     * found; that entry is used to decompress the ramfs image.
     */
    for (cf = compressed_formats; cf->name; cf++) {
        if (!memcmp(inbuf, cf->magic, 2))
            break;
    }
    if (name)
        *name = cf->name;
    return cf->decompressor;
}

 

 The decompress_fn() corresponding to the gzip type is gunzip(). It is not studied in depth here, but its flush() input parameter is closely related to ramfs.

 

/*
 * decompress_fn-compatible entry point for gzip data: forwards every
 * argument to __gunzip() and passes 0 as the output buffer length.
 */
STATIC int INIT gunzip(unsigned char *buf, long len,
               long (*fill)(void*, unsigned long),
               long (*flush)(void*, unsigned long),
               unsigned char *out_buf,
               long *pos,
               void (*error)(char *x))
{
    return __gunzip(buf, len, fill, flush, out_buf, 0, pos, error);
}

/*
 * Inflates gzip data with zlib_inflate().
 *
 * When @flush is given, a 32 KiB output buffer is allocated here and
 * every chunk of inflated data is handed to @flush; otherwise the caller's
 * @out_buf is used, and an @out_len of 0 means "no limit".
 *
 * @pos, if non-NULL, receives the number of input bytes consumed plus 8
 * for the trailer.
 *
 * Returns 0 on success and -1 on failure. Setup code and parts of the
 * loop are elided in this excerpt.
 */
STATIC int INIT __gunzip(unsigned char *buf, long len,
               long (*fill)(void*, unsigned long),
               long (*flush)(void*, unsigned long),
               unsigned char *out_buf, long out_len,
               long *pos,
               void(*error)(char *x)) {
    u8 *zbuf;
    struct z_stream_s *strm;
    int rc;

    rc = -1;
    if (flush) {
        /* Output is processed in units of 32 KiB. */
        out_len = 0x8000; /* 32 K */
        out_buf = malloc(out_len);
    } else {
        if (!out_len)
            out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
    }
...
    while (rc == Z_OK) {
...
        rc = zlib_inflate(strm, 0);

        /* Write any data generated */
        if (flush && strm->next_out > out_buf) {
            long l = strm->next_out - out_buf;
            /*
             * Push the inflated data out; for the initramfs this
             * calls flush_buffer().
             */
            if (l != flush(out_buf, l)) {
                rc = -1;
                error("write error");
                break;
            }
            strm->next_out = out_buf;
            strm->avail_out = out_len;
        }

        /* after Z_FINISH, only Z_STREAM_END is "we unpacked it all" */
        if (rc == Z_STREAM_END) {
            rc = 0;
            break;
        } else if (rc != Z_OK) {
            error("uncompression error");
            rc = -1;
        }
    }

    zlib_inflateEnd(strm);
    if (pos)
        /* add + 8 to skip over trailer */
        *pos = strm->next_in - zbuf+8;

gunzip_5:
    free(strm->workspace);
gunzip_nomem4:
    free(strm);
gunzip_nomem3:
    if (!buf)
        free(zbuf);
gunzip_nomem2:
    if (flush)
        free(out_buf);
gunzip_nomem1:
    return rc; /* returns Z_OK (0) if successful */
}

 

 

3.4.2 flush_buffer

From the above analysis, we can see that rootfs uses the ramfs file system type.

The ramfs part is decompressed through gzip, and then the decompressed content is flushed out through flush_buffer.

Let's take a look at how flush_buffer() flushes the memory of the size of __initramfs_size from __initramfs_start into the rootfs file system.

flush_buffer() calls write_buffer() to do the work; the core idea is a state machine in which the current state selects which actions[state] handler is called.

 

/*
 * Feeds @len bytes at @buf to the cpio state machine: publishes the
 * buffer through the byte_count/victim globals, then keeps calling the
 * handler of the current state until one returns non-zero.
 *
 * Returns len - byte_count (presumably the number of bytes the handlers
 * consumed; byte_count is updated outside this function).
 */
static long __init write_buffer(char *buf, unsigned long len)
{
    byte_count = len;
    victim = buf;

    while (!actions[state]())
        ;
    return len - byte_count;
}

/*
 * Flush callback handed to the decompressor: pushes the decompressed
 * bytes through write_buffer(), looping while write_buffer() reports
 * fewer than @len bytes and no error message has been recorded. The loop
 * body is elided in this excerpt.
 *
 * Returns -1 if an error message was already set on entry, otherwise the
 * original @len.
 */
static long __init flush_buffer(void *bufv, unsigned long len)
{
    char *buf = (char *) bufv;
    long written;
    long origLen = len;
    if (message)
        return -1;
    while ((written = write_buffer(buf, len)) < len && !message) {
...
    }
    return origLen;
}

 

Actions[] can be said to be the core of transforming the decompressed data and generating rootfs.

actions[] calls the corresponding system call, and generates the entire file system step by step according to the decompressed data.

 

/* Handler for each state of the cpio state machine, indexed by state. */
static __initdata int (*actions[])(void) = {
    [Start]        = do_start,
    [Collect]    = do_collect,
    [GotHeader]    = do_header,
    [SkipIt]    = do_skip,
    [GotName]    = do_name,
    [CopyFile]    = do_copy,
    [GotSymlink]    = do_symlink,
    [Reset]        = do_reset,
};

/*
 * Start state: requests the cpio header and continues in GotHeader once
 * it has been read. Always returns 0.
 */
static int __init do_start(void)
{
    /* Read the first 110 bytes, which are parsed as the cpio header. */
    read_into(header_buf, 110, GotHeader);
    return 0;
}

/*
 * Collect state: copies as much of the still-missing data as the current
 * input provides into the collect buffer.
 *
 * Returns 1 while more data is still needed, or 0 after switching to
 * next_state once everything has been gathered.
 */
static int __init do_collect(void)
{
    unsigned long chunk = remains;

    /* Never take more than the input currently holds. */
    if (chunk > byte_count)
        chunk = byte_count;

    memcpy(collect, victim, chunk);
    eat(chunk);
    collect += chunk;
    remains -= chunk;

    if (remains != 0)
        return 1;

    state = next_state;
    return 0;
}

/*
 * GotHeader state: validates the cpio magic, parses the header and
 * decides how the entry that follows is processed.
 *
 * Returns 1 after reporting an error for an unsupported or missing
 * magic, otherwise 0 with 'state' set to the next step (SkipIt by
 * default, Collect for symlinks, or whatever read_into() selects on the
 * way to GotName).
 */
static int __init do_header(void)
{
    /*
     * The first 6 bytes hold the cpio magic: "070707" or "070701". Only
     * "070701" is accepted.
     */
    if (memcmp(collected, "070707", 6)==0) {
        error("incorrect cpio method used: use -H newc option");
        return 1;
    }
    if (memcmp(collected, "070701", 6)) {
        error("no cpio magic");
        return 1;
    }
    parse_header(collected);
    next_header = this_header + N_ALIGN(name_len) + body_len;
    /* Round up to the next multiple of 4. */
    next_header = (next_header + 3) & ~3;
    state = SkipIt;
    if (name_len <= 0 || name_len > PATH_MAX)
        return 0;
    if (S_ISLNK(mode)) {
        if (body_len > PATH_MAX)
            return 0;
        collect = collected = symlink_buf;
        remains = N_ALIGN(name_len) + body_len;
        next_state = GotSymlink;
        state = Collect;
        return 0;
    }
    if (S_ISREG(mode) || !body_len)
        read_into(name_buf, N_ALIGN(name_len), GotName);
    return 0;
}

/*
 * SkipIt state: discards input up to next_header.
 *
 * Returns 0 after switching to next_state when next_header lies within
 * the current input; otherwise consumes all of the input and returns 1.
 */
static int __init do_skip(void)
{
    if (this_header + byte_count >= next_header) {
        /* The next header is within reach: skip to it exactly. */
        eat(next_header - this_header);
        state = next_state;
        return 0;
    }

    eat(byte_count);
    return 1;
}

/*
 * Reset state: consumes NUL bytes from the input, then reports
 * "broken padding" if input remains while this_header is not a multiple
 * of 4. Always returns 1.
 */
static int __init do_reset(void)
{
    while (byte_count != 0 && *victim == '\0')
        eat(1);

    if ((this_header & 3) && byte_count)
        error("broken padding");

    return 1;
}

/*
 * For an entry with a link count of at least 2, looks up an earlier entry
 * via find_link() and, if one is found, hard-links the current name to it.
 *
 * Returns 0 when no link was attempted, 1 when sys_link() succeeded and
 * -1 when it failed.
 */
static int __init maybe_link(void)
{
    char *old;

    if (nlink < 2)
        return 0;

    old = find_link(major, minor, ino, mode, collected);
    if (!old)
        return 0;

    if (sys_link(old, collected) < 0)
        return -1;
    return 1;
}

/*
 * Removes whatever already exists at @path if its file type differs from
 * the type encoded in @fmode. Nothing happens when the lstat fails or the
 * types match.
 */
static void __init clean_path(char *path, umode_t fmode)
{
    struct stat st;

    if (!sys_newlstat(path, &st) && (st.st_mode ^ fmode) & S_IFMT) {
        /*
         * A real directory is removed with sys_rmdir(); anything else
         * (e.g. a link) only needs sys_unlink().
         */
        if (S_ISDIR(st.st_mode))
            sys_rmdir(path);
        else
            sys_unlink(path);
    }
}

/* Descriptor of the regular file currently being written by do_copy(). */
static __initdata int wfd;

/*
 * GotName state: creates the file system object named by the entry that
 * was just read.
 *
 * The "TRAILER!!!" entry only frees the link hash. Regular files are
 * opened for writing and handed over to the CopyFile state; directories
 * and special files are created right here. Always returns 0; the default
 * follow-up is SkipIt and then Reset.
 */
static int __init do_name(void)
{
    state = SkipIt;
    next_state = Reset;
    if (strcmp(collected, "TRAILER!!!") == 0) {
        free_hash();
        return 0;
    }
    clean_path(collected, mode);
    /*
     * Regular file: create it with sys_open() and adjust its attributes
     * with sys_fchown(), sys_fchmod() and friends.
     */
    if (S_ISREG(mode)) {
        int ml = maybe_link();
        if (ml >= 0) {
            int openflags = O_WRONLY|O_CREAT;
            if (ml != 1)
                openflags |= O_TRUNC;
            wfd = sys_open(collected, openflags, mode);

            if (wfd >= 0) {
                sys_fchown(wfd, uid, gid);
                sys_fchmod(wfd, mode);
                if (body_len)
                    sys_ftruncate(wfd, body_len);
                vcollected = kstrdup(collected, GFP_KERNEL);
                /* Next, do_copy() writes the decompressed data into wfd. */
                state = CopyFile;
            }
        }
    } else if (S_ISDIR(mode)) {
        /* Directory: create it with sys_mkdir(). */
        sys_mkdir(collected, mode);
        sys_chown(collected, uid, gid);
        sys_chmod(collected, mode);
        dir_add(collected, mtime);
    } else if (S_ISBLK(mode) || S_ISCHR(mode) ||
           S_ISFIFO(mode) || S_ISSOCK(mode)) {
        if (maybe_link() == 0) {
            sys_mknod(collected, mode, rdev);
            sys_chown(collected, uid, gid);
            sys_chmod(collected, mode);
            do_utime(collected, mtime);
        }
    }
    return 0;
}

/*
 * CopyFile state: writes file contents from the current input to wfd.
 *
 * Returns 0 once the remaining body has been written in full (the file
 * is closed, its timestamp is set and the state becomes SkipIt), or 1
 * after writing all of the current input when more body data is still to
 * come.
 */
static int __init do_copy(void)
{
    /*
     * Write the data to wfd; once the body is complete, close the file
     * and update its time with do_utime().
     */
    if (byte_count >= body_len) {
        if (xwrite(wfd, victim, body_len) != body_len)
            error("write error");
        sys_close(wfd);
        do_utime(vcollected, mtime);
        kfree(vcollected);
        eat(body_len);
        state = SkipIt;
        return 0;
    } else {
        if (xwrite(wfd, victim, byte_count) != byte_count)
            error("write error");
        body_len -= byte_count;
        eat(byte_count);
        return 1;
    }
}

/*
 * GotSymlink state: 'collected' holds the link name followed, at offset
 * N_ALIGN(name_len), by the link target. Terminates the target string,
 * removes any conflicting path and creates the symlink. Always returns 0;
 * the follow-up is SkipIt and then Reset.
 */
static int __init do_symlink(void)
{
    collected[N_ALIGN(name_len) + body_len] = '\0';
    clean_path(collected, 0);
    /* Symbolic links are created with sys_symlink(). */
    sys_symlink(collected + N_ALIGN(name_len), collected);
    sys_lchown(collected, uid, gid);
    do_utime(collected, mtime);
    state = SkipIt;
    next_state = Reset;
    return 0;
}

From the above series of actions[] functions, we can see that the gzip-decompressed data is run through a fairly complex state machine that jumps between different functions to process the buffer.

In the end, a complete rootfs is created by calling the in-kernel counterparts of open()/write()/close()/mkdir() and similar system calls.

3.5 release init memory

After all initcalls are executed, free_initmem() is called to release the memory.

 

void free_initmem(void)
{
    unsigned long addr;

    addr = (unsigned long) &__init_begin;
    while (addr < (unsigned long) &__init_end) {
            ClearPageReserved(virt_to_page(addr));
            init_page_count(virt_to_page(addr));
            free_page(addr);---------------------每次释放一个页面。
            totalram_pages++;--------------------totalram_pages递增。
            addr += PAGE_SIZE;-------------------addr后移一个页面。
    }
    pr_info("Freeing unused kernel memory: %dk freed\n",
            ((unsigned int)&__init_end - (unsigned int)&__init_begin) >> 10);
}

 

Since the segment .init.ramfs that stores the ramdisk is between __init_begin and __init_end, all will be released together.

3.6 ramdisk execution

kernel_init() is the kernel thread that becomes the first process in user space. With respect to the ramdisk it covers three steps: preparing the ramfs file system type, decompressing the ramdisk, and executing ramdisk_execute_command to replace the current process.

 

/*
 * Excerpt of kernel_init(): after the freeable init work is done, try to
 * execute ramdisk_execute_command and panic if no init could be started.
 *
 * Returns 0 when run_init_process() succeeds.
 */
static int __ref kernel_init(void *unused)
{
    int ret;

    /*
     * Runs the various initcalls, including registering ramfs and
     * populate_rootfs() unpacking the ramdisk; it also checks whether
     * ramdisk_execute_command exists and otherwise calls
     * prepare_namespace().
     */
    kernel_init_freeable();
    /* ... (code elided) ... */
    if (ramdisk_execute_command) {
        ret = run_init_process(ramdisk_execute_command);
        if (!ret)
            return 0;
        pr_err("Failed to execute %s (error %d)\n",
               ramdisk_execute_command, ret);
    }
    /* ... (code elided) ... */
    panic("No working init found.  Try passing init= option to kernel. "
          "See Linux Documentation/init.txt for guidance.");
}

 

The ramfs file system type is registered from kernel_init_freeable(), and __initramfs_size bytes starting at __initramfs_start in vmlinux are extracted into rootfs.

Then sys_access() checks whether ramdisk_execute_command exists in rootfs; if it does not, prepare_namespace() is called to prepare the root file system.

 

/*
 * Excerpt (parts elided with "..."): settle which program kernel_init()
 * will try to execute from the initramfs.
 */
static noinline void __init kernel_init_freeable(void)
{
...
    /* No command was set (e.g. no rdinit= in bootargs): default to "/init". */
    if (!ramdisk_execute_command)
        ramdisk_execute_command = "/init";

    /*
     * If the command cannot be accessed, clear it and let
     * prepare_namespace() set things up instead.
     */
    if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
        ramdisk_execute_command = NULL;
        prepare_namespace();
    }
...
}

 

run_init_process() executes the program named by init_filename from rootfs, replacing the current process so that it becomes the first process in user space.

 

static int run_init_process(const char *init_filename)
{
    argv_init[0] = init_filename;
    return do_execve(getname_kernel(init_filename),--------------------------init_filename对应/sbin/init。
        (const char __user *const __user *)argv_init,------------------------argv_init[0]对应/sbin/init,其他为空。
        (const char __user *const __user *)envp_init);-----------------------envp_init[0]对应"HOME=/",envp_init[1]对应"TERM=linux"。
}

/*
 * Wrap the native argv/envp pointer arrays in user_arg_ptr descriptors and
 * hand them to do_execveat_common() with AT_FDCWD and no flags.
 *
 * Returns whatever do_execveat_common() returns.
 */
int do_execve(struct filename *filename,
    const char __user *const __user *__argv,
    const char __user *const __user *__envp)
{
    return do_execveat_common(AT_FDCWD, filename,
                  (struct user_arg_ptr){ .ptr.native = __argv },
                  (struct user_arg_ptr){ .ptr.native = __envp },
                  0);
}

/*
 * Common exec implementation used by do_execve().
 *
 * Opens the executable named by filename (resolved relative to fd), fills
 * in a linux_binprm for it, copies the file name, environment strings and
 * argument strings into it, and passes it to exec_binprm().
 *
 * Returns the result of exec_binprm() on success, or a negative error code.
 * Except for the early IS_ERR(filename) return, filename is released with
 * putname() on every path.
 */
static int do_execveat_common(int fd, struct filename *filename,
                  struct user_arg_ptr argv,
                  struct user_arg_ptr envp,
                  int flags)
{
    char *pathbuf = NULL;
    struct linux_binprm *bprm;
    struct file *file;
    struct files_struct *displaced;
    int retval;

    /* filename may be an encoded error pointer; pass that error back. */
    if (IS_ERR(filename))
        return PTR_ERR(filename);

    /*
     * Fail with -EAGAIN if the task is flagged PF_NPROC_EXCEEDED and the
     * user's process count is still above RLIMIT_NPROC.
     */
    if ((current->flags & PF_NPROC_EXCEEDED) &&
        atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
        retval = -EAGAIN;
        goto out_ret;
    }

    /* The limit check passed (or did not apply): clear the flag. */
    current->flags &= ~PF_NPROC_EXCEEDED;

    /*
     * displaced is filled in here; it is dropped with put_files_struct()
     * on success and handed to reset_files_struct() on failure.
     */
    retval = unshare_files(&displaced);
    if (retval)
        goto out_ret;

    /* Preset the error code for the allocation below. */
    retval = -ENOMEM;
    bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
    if (!bprm)
        goto out_files;

    retval = prepare_bprm_creds(bprm);
    if (retval)
        goto out_free;

    check_unsafe_exec(bprm);
    /* Mark the exec as in progress; cleared again on both exit paths. */
    current->in_execve = 1;

    /* Open the executable; an error pointer is converted to retval. */
    file = do_open_execat(fd, filename, flags);
    retval = PTR_ERR(file);
    if (IS_ERR(file))
        goto out_unmark;

    sched_exec();

    bprm->file = file;
    /*
     * Pick the name recorded in bprm: the caller's name when fd is
     * AT_FDCWD or the path is absolute, otherwise a synthesized
     * "/dev/fd/<fd>" or "/dev/fd/<fd>/<name>" path held in pathbuf.
     */
    if (fd == AT_FDCWD || filename->name[0] == '/') {
        bprm->filename = filename->name;
    } else {
        if (filename->name[0] == '\0')
            pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
        else
            pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
                        fd, filename->name);
        if (!pathbuf) {
            retval = -ENOMEM;
            goto out_unmark;
        }

        /* A close-on-exec fd is flagged as making the path inaccessible. */
        if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
            bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
        bprm->filename = pathbuf;
    }
    bprm->interp = bprm->filename;

    retval = bprm_mm_init(bprm);
    if (retval)
        goto out_unmark;

    /* Count the argument and environment strings; a negative count is an error. */
    bprm->argc = count(argv, MAX_ARG_STRINGS);
    if ((retval = bprm->argc) < 0)
        goto out;

    bprm->envc = count(envp, MAX_ARG_STRINGS);
    if ((retval = bprm->envc) < 0)
        goto out;

    retval = prepare_binprm(bprm);
    if (retval < 0)
        goto out;

    /* Copy the file name first, then the environment, then the arguments. */
    retval = copy_strings_kernel(1, &bprm->filename, bprm);
    if (retval < 0)
        goto out;

    /* Remember bprm->p as it stands right after the file name was copied. */
    bprm->exec = bprm->p;
    retval = copy_strings(bprm->envc, envp, bprm);
    if (retval < 0)
        goto out;

    retval = copy_strings(bprm->argc, argv, bprm);
    if (retval < 0)
        goto out;

    would_dump(bprm, bprm->file);

    retval = exec_binprm(bprm);
    if (retval < 0)
        goto out;

    /* execve succeeded */
    current->fs->in_exec = 0;
    current->in_execve = 0;
    acct_update_integrals(current);
    task_numa_free(current);
    free_bprm(bprm);
    kfree(pathbuf);
    putname(filename);
    if (displaced)
        put_files_struct(displaced);
    return retval;

    /*
     * Error unwinding: each label undoes one stage and falls through to
     * the labels for the earlier stages.
     */
out:
    /* Drop the mm attached to bprm, if one was set up. */
    if (bprm->mm) {
        acct_arg_size(bprm, 0);
        mmput(bprm->mm);
    }

out_unmark:
    current->fs->in_exec = 0;
    current->in_execve = 0;

out_free:
    free_bprm(bprm);
    kfree(pathbuf);

out_files:
    if (displaced)
        reset_files_struct(displaced);
out_ret:
    putname(filename);
    return retval;
}

 

 

4. Summary

In summary, after the ramdisk function is turned on during buildroot or kernel compilation, the ramdisk will be embedded in vmlinux.

During the Linux boot phase, populate_rootfs() reads the ramdisk out of the kernel image. The gzip decompressor is then called to decompress it into RAM, and the decompressed data is parsed by the actions[] handlers and turned into the rootfs file system.

After initialization is complete, the memory holding the embedded ramdisk is released along with the rest of the init memory and is added back to totalram_pages.

In the final stage of kernel_init(), the init process in the ramdisk is executed through run_init_process() as the first process in the user space.

Guess you like

Origin blog.csdn.net/daocaokafei/article/details/114845111