【linux内核】start_kernel和rest_init

以前看linux内核觉得一直没有真正看懂，用GPT试着分析两个函数。

linux3.16内核：

/*
 * start_kernel() as quoted from Linux 3.16.
 *
 * Runs the boot-time initialization calls below in a fixed order.  Local
 * interrupts are disabled near the top (local_irq_disable()) and stay off
 * until local_irq_enable() after call_function_init(); the second half runs
 * with interrupts enabled.  The last statement hands off to rest_init().
 *
 * The relative order of the calls is significant (several comments below
 * state explicit ordering constraints), so statements must not be reordered.
 */
asmlinkage __visible void __init start_kernel(void)
{
    char * command_line, *after_dashes;
    /* Bounds of the kernel parameter table; passed to parse_args() below. */
    extern const struct kernel_param __start___param[], __stop___param[];
    /*
     * Need to run as early as possible, to initialize the
     * lockdep hash:
     */
    lockdep_init();
    smp_setup_processor_id();
    debug_objects_early_init();
    /*
     * Set up the initial canary ASAP:
     */
    boot_init_stack_canary();
    cgroup_init_early();
    local_irq_disable();
    early_boot_irqs_disabled = true;
    /*
     * Interrupts are still disabled. Do necessary setups, then
     * enable them
     */
    boot_cpu_init();
    page_address_init();
    pr_notice("%s", linux_banner);
    /* setup_arch() fills in command_line, which is consumed just below. */
    setup_arch(&command_line);
    mm_init_cpumask(&init_mm);
    setup_command_line(command_line);
    setup_nr_cpu_ids();
    setup_per_cpu_areas();
    smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
    build_all_zonelists(NULL, NULL);
    page_alloc_init();
    pr_notice("Kernel command line: %s\n", boot_command_line);
    parse_early_param();
    /*
     * Parse static_command_line against the built-in parameter table;
     * options that match no parameter go to unknown_bootoption().
     */
    after_dashes = parse_args("Booting kernel",
                  static_command_line, __start___param,
                  __stop___param - __start___param,
                  -1, -1, &unknown_bootoption);
    /*
     * If parse_args() returned a valid, non-NULL pointer, parse the text it
     * points at with set_init_arg().  NOTE(review): by its name this is the
     * part of the command line after "--" -- confirm against parse_args().
     */
    if (!IS_ERR_OR_NULL(after_dashes))
        parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
               set_init_arg);
    jump_label_init();
    /*
     * These use large bootmem allocations and must precede
     * kmem_cache_init()
     */
    setup_log_buf(0);
    pidhash_init();
    vfs_caches_init_early();
    sort_main_extable();
    trap_init();
    mm_init();
    /*
     * Set up the scheduler prior starting any interrupts (such as the
     * timer interrupt). Full topology setup happens at smp_init()
     * time - but meanwhile we still have a functioning scheduler.
     */
    sched_init();
    /*
     * Disable preemption - early bootup scheduling is extremely
     * fragile until we cpu_idle() for the first time.
     */
    preempt_disable();
    /* Interrupts must still be off here; warn and turn them off again if not. */
    if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n"))
        local_irq_disable();
    idr_init_cache();
    rcu_init();
    tick_nohz_init();
    context_tracking_init();
    radix_tree_init();
    /* init some links before init_ISA_irqs() */
    early_irq_init();
    init_IRQ();
    tick_init();
    init_timers();
    hrtimers_init();
    softirq_init();
    timekeeping_init();
    time_init();
    sched_clock_postinit();
    perf_event_init();
    profile_init();
    call_function_init();
    /* End of the interrupts-off phase: only warn this time, then enable IRQs. */
    WARN(!irqs_disabled(), "Interrupts were enabled early\n");
    early_boot_irqs_disabled = false;
    local_irq_enable();
    kmem_cache_init_late();
    /*
     * HACK ALERT! This is early. We're enabling the console before
     * we've done PCI setups etc, and console_init() must be aware of
     * this. But we do want output early, in case something goes wrong.
     */
    console_init();
    /*
     * A pending panic_later is acted on only after console_init().
     * NOTE(review): presumably deferred so the message is visible -- confirm.
     */
    if (panic_later)
        panic("Too many boot %s vars at `%s'", panic_later,
              panic_param);
    lockdep_info();
    /*
     * Need to run this when irqs are enabled, because it wants
     * to self-test [hard/soft]-irqs on/off lock inversion bugs
     * too:
     */
    locking_selftest();
#ifdef CONFIG_BLK_DEV_INITRD
    /*
     * Disable the initrd (initrd_start = 0) when its first page frame lies
     * below min_low_pfn, unless initrd_below_start_ok is set.
     */
    if (initrd_start && !initrd_below_start_ok &&
        page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
        pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
            page_to_pfn(virt_to_page((void *)initrd_start)),
            min_low_pfn);
        initrd_start = 0;
    }
#endif
    page_cgroup_init();
    debug_objects_mem_init();
    kmemleak_init();
    setup_per_cpu_pageset();
    numa_policy_init();
    /* late_time_init is an optional hook; call it only when it has been set. */
    if (late_time_init)
        late_time_init();
    sched_clock_init();
    calibrate_delay();
    pidmap_init();
    anon_vma_init();
    acpi_early_init();
#ifdef CONFIG_X86
    if (efi_enabled(EFI_RUNTIME_SERVICES))
        efi_enter_virtual_mode();
#endif
#ifdef CONFIG_X86_ESPFIX64
    /* Should be run before the first non-init thread is created */
    init_espfix_bsp();
#endif
    thread_info_cache_init();
    cred_init();
    fork_init(totalram_pages);
    proc_caches_init();
    buffer_init();
    key_init();
    security_init();
    dbg_late_init();
    vfs_caches_init(totalram_pages);
    signals_init();
    /* rootfs populating might need page-writeback */
    page_writeback_init();
    proc_root_init();
    cgroup_init();
    cpuset_init();
    taskstats_init_early();
    delayacct_init();
    check_bugs();
    acpi_subsystem_init();
    sfi_init_late();
    if (efi_enabled(EFI_RUNTIME_SERVICES)) {
        efi_late_init();
        efi_free_boot_services();
    }
    ftrace_init();
    /* Do the rest non-__init'ed, we're now alive */
    rest_init();
}

这个函数是Linux内核启动的关键入口之一,进行了大量的内核初始化工作,主要功能如下:
1. 初始化锁依赖(lockdep)和处理器ID
2. 设置内核初始栈保护(boot_init_stack_canary)
3. 标记引导CPU(boot_cpu_init),初始化页地址映射表(page_address_init,用于高端内存),打印内核版本信息(linux_banner)
4. 初始化内存管理子系统(mm_init)
5. 设置命令行参数,解析内核启动参数
6. 初始化跳转标签(jump label)
7. 初始化日志缓冲区、PID哈希表、VFS缓存等
8. 初始化中断向量表、软中断、调度器
9. 初始化RCU,时钟,定时器,高精度定时器等
10. 开中断后完成slab分配器的后期初始化(kmem_cache_init_late)
11. 初始化控制台,自检锁等
12. 如果有initrd,检查其地址范围的正确性
13. 初始化调度时钟,延迟校准等
14. 初始化进程凭证(cred_init)、fork相关资源(fork_init),以及各种缓存与表
15. 执行最终初始化(rest_init)
通过这些关键初始化步骤,Linux内核从机器启动至可以开始执行进程,完成了从硬件到操作系统内核的启动工作。

这段代码是Linux内核启动过程中的一部分，函数名为start_kernel。这个函数的主要任务是在系统启动时初始化硬件设备、内存管理、进程管理等关键组件。

以下是该函数的主要步骤：

初始化锁依赖性和处理器ID。
保存并解析内核命令行参数。
初始化并启用各种中断和调度器。
进行必要的系统设置，如日志缓冲区、页表缓存、进程控制等。
初始化并启用各种硬件设备，如时钟、中断控制器、虚拟内存管理等。
进行少量自检与校准，例如锁自检（locking_selftest）、CPU缺陷检查（check_bugs）和延迟校准（calibrate_delay）；这里并没有对硬件、软件和性能做全面测试。
初始化ACPI子系统和EFI运行时服务（如果启用了EFI）。
最后，调用rest_init()函数完成剩余的初始化工作。
注意，这是一个非常复杂的过程，涉及到许多底层的硬件和系统操作。在大多数情况下，用户不需要关心这个过程的具体细节，只需要知道当系统启动完成后，所有的硬件设备和软件环境都应该被正确地初始化和配置。

linux5.16.2内核：

/*
 * start_kernel() as quoted from Linux 5.16.2.
 *
 * Runs the boot-time initialization calls below in a fixed order.  Local
 * interrupts are disabled near the top (local_irq_disable()) and stay off
 * until local_irq_enable() after call_function_init(); the second half runs
 * with interrupts enabled.  The function ends by calling
 * arch_call_rest_init() followed by prevent_tail_call_optimization().
 *
 * The relative order of the calls is significant (several comments below
 * state explicit ordering constraints), so statements must not be reordered.
 */
asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{
    char *command_line;
    char *after_dashes;
    set_task_stack_end_magic(&init_task);
    smp_setup_processor_id();
    debug_objects_early_init();
    init_vmlinux_build_id();
    cgroup_init_early();
    local_irq_disable();
    early_boot_irqs_disabled = true;
    /*
     * Interrupts are still disabled. Do necessary setups, then
     * enable them.
     */
    boot_cpu_init();
    page_address_init();
    pr_notice("%s", linux_banner);
    early_security_init();
    /* setup_arch() fills in command_line, which is consumed further below. */
    setup_arch(&command_line);
    setup_boot_config();
    setup_command_line(command_line);
    setup_nr_cpu_ids();
    setup_per_cpu_areas();
    smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
    boot_cpu_hotplug_init();
    build_all_zonelists(NULL);
    page_alloc_init();
    pr_notice("Kernel command line: %s\n", saved_command_line);
    /* parameters may set static keys */
    jump_label_init();
    parse_early_param();
    /*
     * Parse static_command_line against the built-in parameter table
     * (__start___param .. __stop___param); options that match no parameter
     * go to unknown_bootoption().
     */
    after_dashes = parse_args("Booting kernel",
                  static_command_line, __start___param,
                  __stop___param - __start___param,
                  -1, -1, NULL, &unknown_bootoption);
    print_unknown_bootoptions();
    /*
     * If parse_args() returned a valid, non-NULL pointer, parse the text it
     * points at with set_init_arg().  NOTE(review): by its name this is the
     * part of the command line after "--" -- confirm against parse_args().
     */
    if (!IS_ERR_OR_NULL(after_dashes))
        parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
               NULL, set_init_arg);
    /* extra_init_args, when set, is parsed with the same set_init_arg() handler. */
    if (extra_init_args)
        parse_args("Setting extra init args", extra_init_args,
               NULL, 0, -1, -1, NULL, set_init_arg);
    /*
     * These use large bootmem allocations and must precede
     * kmem_cache_init()
     */
    setup_log_buf(0);
    vfs_caches_init_early();
    sort_main_extable();
    trap_init();
    mm_init();
    ftrace_init();
    /* trace_printk can be enabled here */
    early_trace_init();
    /*
     * Set up the scheduler prior starting any interrupts (such as the
     * timer interrupt). Full topology setup happens at smp_init()
     * time - but meanwhile we still have a functioning scheduler.
     */
    sched_init();
    /* Interrupts must still be off here; warn and turn them off again if not. */
    if (WARN(!irqs_disabled(),
         "Interrupts were enabled *very* early, fixing it\n"))
        local_irq_disable();
    radix_tree_init();
    /*
     * Set up housekeeping before setting up workqueues to allow the unbound
     * workqueue to take non-housekeeping into account.
     */
    housekeeping_init();
    /*
     * Allow workqueue creation and work item queueing/cancelling
     * early.  Work item execution depends on kthreads and starts after
     * workqueue_init().
     */
    workqueue_init_early();
    rcu_init();
    /* Trace events are available after this */
    trace_init();
    if (initcall_debug)
        initcall_debug_enable();
    context_tracking_init();
    /* init some links before init_ISA_irqs() */
    early_irq_init();
    init_IRQ();
    tick_init();
    rcu_init_nohz();
    init_timers();
    srcu_init();
    hrtimers_init();
    softirq_init();
    timekeeping_init();
    kfence_init();
    /*
     * For best initial stack canary entropy, prepare it after:
     * - setup_arch() for any UEFI RNG entropy and boot cmdline access
     * - timekeeping_init() for ktime entropy used in rand_initialize()
     * - rand_initialize() to get any arch-specific entropy like RDRAND
     * - add_latent_entropy() to get any latent entropy
     * - adding command line entropy
     */
    rand_initialize();
    add_latent_entropy();
    add_device_randomness(command_line, strlen(command_line));
    boot_init_stack_canary();
    time_init();
    perf_event_init();
    profile_init();
    call_function_init();
    /* End of the interrupts-off phase: only warn this time, then enable IRQs. */
    WARN(!irqs_disabled(), "Interrupts were enabled early\n");
    early_boot_irqs_disabled = false;
    local_irq_enable();
    kmem_cache_init_late();
    /*
     * HACK ALERT! This is early. We're enabling the console before
     * we've done PCI setups etc, and console_init() must be aware of
     * this. But we do want output early, in case something goes wrong.
     */
    console_init();
    /*
     * A pending panic_later is acted on only after console_init().
     * NOTE(review): presumably deferred so the message is visible -- confirm.
     */
    if (panic_later)
        panic("Too many boot %s vars at `%s'", panic_later,
              panic_param);
    lockdep_init();
    /*
     * Need to run this when irqs are enabled, because it wants
     * to self-test [hard/soft]-irqs on/off lock inversion bugs
     * too:
     */
    locking_selftest();
    /*
     * This needs to be called before any devices perform DMA
     * operations that might use the SWIOTLB bounce buffers. It will
     * mark the bounce buffers as decrypted so that their usage will
     * not cause "plain-text" data to be decrypted when accessed.
     */
    mem_encrypt_init();
#ifdef CONFIG_BLK_DEV_INITRD
    /*
     * Disable the initrd (initrd_start = 0) when its first page frame lies
     * below min_low_pfn, unless initrd_below_start_ok is set.
     */
    if (initrd_start && !initrd_below_start_ok &&
        page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
        pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
            page_to_pfn(virt_to_page((void *)initrd_start)),
            min_low_pfn);
        initrd_start = 0;
    }
#endif
    setup_per_cpu_pageset();
    numa_policy_init();
    acpi_early_init();
    /* late_time_init is an optional hook; call it only when it has been set. */
    if (late_time_init)
        late_time_init();
    sched_clock_init();
    calibrate_delay();
    pid_idr_init();
    anon_vma_init();
#ifdef CONFIG_X86
    if (efi_enabled(EFI_RUNTIME_SERVICES))
        efi_enter_virtual_mode();
#endif
    thread_stack_cache_init();
    cred_init();
    fork_init();
    proc_caches_init();
    uts_ns_init();
    key_init();
    security_init();
    dbg_late_init();
    vfs_caches_init();
    pagecache_init();
    signals_init();
    seq_file_init();
    proc_root_init();
    nsfs_init();
    cpuset_init();
    cgroup_init();
    taskstats_init_early();
    delayacct_init();
    poking_init();
    check_bugs();
    acpi_subsystem_init();
    arch_post_acpi_subsys_init();
    kcsan_init();
    /* Do the rest non-__init'ed, we're now alive */
    arch_call_rest_init();
    /*
     * NOTE(review): by its name this keeps the call above from being
     * emitted as a tail call -- confirm the reason at the helper's definition.
     */
    prevent_tail_call_optimization();
}

这个函数是Linux内核启动过程中的关键函数之一,它进行了大量的内核初始化工作。主要功能包括:
1. 设置处理器ID,初始化早期的调试对象。
2. 禁用本地中断。
3. 标记引导CPU(boot_cpu_init),初始化页地址映射表(page_address_init,用于高端内存),打印内核横幅(linux_banner)。
4. 初始化早期的安全性设置,解析命令行参数。
5. 初始化内存管理,建立zone列表。
6. 初始化跳转标签,解析内核启动参数。
7. 初始化日志缓冲区,虚拟文件系统缓存。
8. 初始化陷阱和内存管理子系统。
9. 初始化调度器,时钟,RCU等。
10. 初始化工作队列,软中断,高精度定时器等。
11. 初始化堆栈保护,熵源,打印启动选项。
12. 完成slab分配器的后期初始化(kmem_cache_init_late),初始化控制台(console_init)。
13. 检查initrd地址范围。
14. 继续进行剩余的初始化工作。
通过这些关键步骤,Linux内核从启动进入到可以运行进程的环境,完成了从硬件初始化到启动内核的重要工作。

这段代码是Linux内核启动过程中的一部分，函数名为start_kernel。它是一个C函数，而不是汇编实现；函数前的asmlinkage只是调用约定相关的标记，它由体系结构相关的早期启动代码调用。这个函数主要完成以下任务：

初始化各种硬件设备和内存管理；
初始化进程管理相关的数据结构；
设置中断处理机制；
初始化调度器；
启用全局中断；
调用arch_call_rest_init()继续完成剩余的初始化，引导线程最终进入idle循环。
整个过程涉及到很多子任务的完成，包括启动内核、初始化内存、设置中断处理、初始化调度器等。这些任务的完成需要按照一定的顺序进行，以确保系统能够正常启动。

/*
 * rest_init() - final stage of boot, quoted from Linux 5.16.2.
 *
 * Creates two kernel threads, first one running kernel_init() and then one
 * running kthreadd(), pins the first to the current CPU, records the second
 * in kthreadd_task, sets system_state to SYSTEM_SCHEDULING, signals the
 * kthreadd_done completion, calls schedule_preempt_disabled() once and
 * finally calls cpu_startup_entry(CPUHP_ONLINE).
 */
noinline void __ref rest_init(void)
{
    struct task_struct *tsk;
    int pid;
    rcu_scheduler_starting();
    /*
     * We need to spawn init first so that it obtains pid 1, however
     * the init task will end up wanting to create kthreads, which, if
     * we schedule it before we create kthreadd, will OOPS.
     */
    /* NOTE(review): the kernel_thread() result is used as a pid without an error check. */
    pid = kernel_thread(kernel_init, NULL, CLONE_FS);
    /*
     * Pin init on the boot CPU. Task migration is not properly working
     * until sched_init_smp() has been run. It will set the allowed
     * CPUs for init to the non isolated CPUs.
     */
    rcu_read_lock();
    /*
     * NOTE(review): tsk is dereferenced without a NULL check; presumably
     * the lookup cannot fail for a thread created just above -- confirm.
     */
    tsk = find_task_by_pid_ns(pid, &init_pid_ns);
    tsk->flags |= PF_NO_SETAFFINITY;
    set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
    rcu_read_unlock();
    numa_default_policy();
    /* Second thread: kthreadd.  Unlike init, no CPU pinning is applied to it here. */
    pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
    rcu_read_lock();
    kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
    rcu_read_unlock();
    /*
     * Enable might_sleep() and smp_processor_id() checks.
     * They cannot be enabled earlier because with CONFIG_PREEMPTION=y
     * kernel_thread() would trigger might_sleep() splats. With
     * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
     * already, but it's stuck on the kthreadd_done completion.
     */
    system_state = SYSTEM_SCHEDULING;
    /* kthreadd_task is set: release whoever waits on kthreadd_done (the init task, per the comment above). */
    complete(&kthreadd_done);
    /*
     * The boot idle thread must execute schedule()
     * at least once to get things moving:
     */
    schedule_preempt_disabled();
    /* Call into cpu_idle with preempt disabled */
    cpu_startup_entry(CPUHP_ONLINE);
}

这段代码主要完成内核剩余初始化工作的最后阶段,主要功能如下:
1. 调用rcu_scheduler_starting(),通知RCU子系统调度器已经启动。
2. 通过kernel_thread创建init进程,获得PID 1,并将其锁定在启动CPU上。
3. 通过kernel_thread创建kthreadd内核线程。
4. 将系统状态设置为SYSTEM_SCHEDULING,表示开始正式调度。
5. 调用complete(&kthreadd_done),通知等待在该完成量上的init线程:kthreadd已经创建完毕,可以继续执行。
6. 执行一次schedule(),启动调度器,开始调度线程。
7. 调用cpu_startup_entry(CPUHP_ONLINE),引导线程在禁用抢占的状态下进入idle循环。
至此,Linux内核启动过程基本完成,初始化工作全部就绪,可以正式进入多任务正常运行状态。
init进程和kthreadd线程负责后续创建和管理其他内核线程与用户进程。

这段代码是一个名为rest_init的函数，它是一个非内联函数（noinline），没有返回值（void）。该函数用于初始化系统，并执行一系列任务调度和设置。

函数的主要步骤如下：

通知RCU子系统调度器即将开始运行（rcu_scheduler_starting()）。
创建内核线程（kernel_thread）以运行内核初始化函数（kernel_init）。
将内核初始化任务固定在引导CPU上（rcu_read_lock()、find_task_by_pid_ns()、set_cpus_allowed_ptr()、rcu_read_unlock()）。
设置默认的NUMA策略（numa_default_policy()）。
创建并启动kthreadd线程（kernel_thread）。
查找kthreadd任务（find_task_by_pid_ns()）。
启用might_sleep()和smp_processor_id()检查（system_state = SYSTEM_SCHEDULING）。
触发完成量kthreadd_done，唤醒等待它的init线程（complete(&kthreadd_done)）。
调用schedule_preempt_disabled()函数。
在禁用抢占的状态下进入CPU idle循环（cpu_startup_entry(CPUHP_ONLINE)）。
这段代码的作用是在系统启动时进行一些初始化操作，包括创建内核线程、设置CPU亲和性、启用调度等。它确保了系统的正常运行和任务的正确调度。

这段代码是Linux内核的初始化函数rest_init()。该函数会在系统引导过程中被调用，用于初始化进程调度和CPU相关的设置。

首先，函数调用rcu_scheduler_starting()，通知RCU（Read-Copy-Update）子系统调度器即将开始运行。

接下来，通过kernel_thread()函数创建一个运行kernel_init()的内核线程。之所以最先创建它，是为了让它获得PID 1；它起初是内核线程，之后才会成为第一个用户空间进程（init）。随后要把它固定在引导CPU上，因为在sched_init_smp()运行之前，任务迁移还不能正常工作。

然后，通过find_task_by_pid_ns()找到kernel_init()进程的task_struct结构体，并设置其PF_NO_SETAFFINITY标志，表示不允许该进程改变其运行的CPU。然后，使用set_cpus_allowed_ptr()函数将kernel_init()进程绑定到当前CPU上。

接着，调用numa_default_policy()函数，用于设置NUMA（Non-Uniform Memory Access，非一致内存访问）的默认策略。

再次使用kernel_thread()函数创建一个内核线程kthreadd（与init不同，这里并没有把它绑定到引导CPU上）。

然后，通过find_task_by_pid_ns()找到kthreadd进程的task_struct结构体。

接下来，设置系统状态为SYSTEM_SCHEDULING，表示系统开始进行进程调度。

然后，调用complete(&kthreadd_done)函数，表示kthreadd已经创建完毕，从而唤醒等待在kthreadd_done上的init线程。

接着，当前的引导线程（即boot idle线程）调用schedule_preempt_disabled()主动让出CPU，至少执行一次调度，使前面创建的init和kthreadd线程得以运行。

最后，调用cpu_startup_entry(CPUHP_ONLINE)函数，引导线程由此在禁用抢占的状态下进入idle循环。

这是一个初始化函数，它用于在Linux内核启动过程中初始化和配置一些任务和线程。让我为你逐行解释一下这个函数：

noinline void __ref rest_init(void)：定义了一个名为rest_init的函数，该函数没有返回值（void），且被标记为noinline，这意味着编译器不应尝试将函数的调用替换为函数体。__ref并不是防止函数被优化掉，而是告诉构建时的section mismatch检查：该函数对init段代码或数据的引用是有意为之，不需要告警。
struct task_struct *tsk; int pid;：声明了两个变量，一个是指向task_struct结构体的指针tsk，另一个是整型变量pid。task_struct是Linux内核用于表示进程/线程的数据结构。
rcu_scheduler_starting();：通知RCU（Read-Copy Update）子系统调度器即将开始运行。RCU是一种并发控制机制，用于读取共享数据时避免阻塞。
pid = kernel_thread(kernel_init, NULL, CLONE_FS);：创建一个新的内核线程，它将运行kernel_init函数。kernel_thread函数创建一个新的进程并执行指定的函数。这里使用CLONE_FS标志，表示新线程与父线程共享文件系统信息。
rcu_read_lock(); 和 rcu_read_unlock();：开始和结束RCU读锁的操作。RCU读锁用于在并发环境中保护对数据的读取，不会阻塞读取操作。
tsk = find_task_by_pid_ns(pid, &init_pid_ns);：通过PID在指定的命名空间中查找对应的任务。
tsk->flags |= PF_NO_SETAFFINITY;：设置任务的PF_NO_SETAFFINITY标志，这表示不允许修改该任务的CPU亲和性。
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));：设置任务可以运行在的CPU集合，这里设置为当前CPU。
numa_default_policy();：设置NUMA（Non-Uniform Memory Access）策略为默认策略。NUMA是一种内存体系结构：处理器访问不同内存节点的开销不同。
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);：再次调用kernel_thread创建一个新的线程，这次运行的是kthreadd函数，并使用CLONE_FS | CLONE_FILES标志，表示新线程与父线程共享文件系统和文件描述符。
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);：通过PID查找新创建的kthreadd任务。
system_state = SYSTEM_SCHEDULING;：设置系统状态为调度状态。
complete(&kthreadd_done);：唤醒等待在完成量kthreadd_done上的任务（即init线程），表明kthreadd已经创建完毕。
schedule_preempt_disabled();：调用调度函数，执行上下文切换。按源码注释，boot idle线程必须至少执行一次schedule()，才能让前面创建的线程开始运行。
cpu_startup_entry(CPUHP_ONLINE);：启动CPU的入口点，该函数将执行CPU的初始化和设置，并最终导致CPU运行idle任务。
总的来说，这个函数的主要目的是在Linux内核启动过程中初始化和配置一些关键的任务和线程，如init任务和kthreadd任务，以及相关的设置和配置。

【linux内核】start_kernel和rest_init

猜你喜欢