Kernel: 2.6.10-rc2 Finished: 01/01/05 /* * Activate the first processor. */ asmlinkage void __init start_kernel(void) { char * command_line; extern struct kernel_param __start___param[], __stop___param[]; /* * Interrupts are still disabled. Do necessary setups, then * enable them */ lock_kernel(); /* 给kernel上锁 */ page_address_init(); /* 在配置highmem才作工作 */ printk(linux_banner); /* 打印kernel版本信息 */ setup_arch(&command_line); /* 设置体系结构相关信息,包括页面映射,acpi等 */ setup_per_cpu_areas(); /* 设置smp中每个cpu区域偏移量信息 */ /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. 设置引导cpu在工作状态 */ smp_prepare_boot_cpu(); /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. */ sched_init(); /* runqueue设置 */ build_all_zonelists(); /* 建立分配策略 */ page_alloc_init(); /* hotplug CPU设置 */ printk("Kernel command line: %s/n", saved_command_line); parse_early_param(); parse_args("Booting kernel", command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); /* 对传入内核参数作分析,并作相应设置 */ sort_main_extable(); /* 异常处理调用函数表排序 */ trap_init(); /* 重新设置中断向量表 */ rcu_init(); /* 初始化RCU(Read-Copy Update),主要是一个per_cpu_rcu_tasklet */ init_IRQ(); /* 中断服务队列初始化,但没有具体中断处理函数入口,在request_irq()向系统注册 */ pidhash_init(); /* pidhash表初始化,共5个,是不是每个表中保存不同类型pid? */ init_timers(); /* 初始化一个per_cpu_tvec_bases队列,并设置TIMER_SOFTIRQ */ softirq_init(); /* 初始化软中断和tasklet */ time_init(); /* 硬件时钟及其中断初始化 */ /* * HACK ALERT! This is early. We're enabling the console before * we've done PCI setups etc, and console_init() must be aware of * this. But we do want output early, in case something goes wrong. 
*/ console_init(); if (panic_later) panic(panic_later, panic_param); profile_init(); /* profile设置 */ local_irq_enable(); /* 开中断 */ #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && initrd_start < min_low_pfn << PAGE_SHIFT) { printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " "disabling it./n",initrd_start,min_low_pfn << PAGE_SHIFT); initrd_start = 0; } #endif vfs_caches_init_early(); /* 初始化dentry和inode缓冲队列的hash表 */ mem_init(); /* 最后内存初始化,释放前边标志为保留的所有页面 */ kmem_cache_init(); /* slab初始化 */ numa_policy_init(); /* ?????????????????????? */ if (late_time_init) late_time_init(); calibrate_delay(); /* 计算BogoMIPS */ pidmap_init(); /* 初始化pid位图 */ pgtable_cache_init(); /* pgd,pmd slab初始化 */ prio_tree_init(); /* 初始化index_bits_to_maxindex,For (struct page)->mapping->i_map*/ anon_vma_init(); /* anon_vma slab初始化,用于对rmap支持 */ #ifdef CONFIG_X86 if (efi_enabled) efi_enter_virtual_mode(); #endif fork_init(num_physpages); /* 计算系统最大安全进程数,设置当前进程最大进程数 */ proc_caches_init(); /* 其他slab初始化 */ buffer_init(); /* buffer head初始化 */ unnamed_dev_init(); /* ?????what is idr????? */ security_init(); /* security 初始化 */ vfs_caches_init(num_physpages); /* **vfs需要的cache初始化** */ radix_tree_init(); /* radix_tree初始化,该功能主要加速look up dirty or writeback pages */ signals_init(); /* 创建sigqueue slab */ /* rootfs populating might need page-writeback */ page_writeback_init(); /* 计算当前系统vm-radio等,设置是否需要回写操作 */ #ifdef CONFIG_PROC_FS proc_root_init(); /* proc文件系统初始化,并根据配置建立相应的目录和文件 */ #endif check_bugs(); acpi_early_init(); /* before LAPIC and SMP init */ /* Do the rest non-__init'ed, we're now alive */ rest_init(); /* 建立init进程 */ } /* arch/i386/kernel/setup.c */ /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures * for initialization. Note, the efi init code path is determined by the * global efi_enabled. 
This allows the same kernel image to be used on existing * systems (with a traditional BIOS) as well as on EFI systems. * 检测是否是通过EFI引导kernel.如果是,将通过efi导入memmap, systab等,因此用此数据 * 结构进行初始化。 * Note: efi初始化路径是在全觉efi_enabled决定的(是否配置efi_enable?)。 */ void __init setup_arch(char **cmdline_p) { unsigned long max_low_pfn; memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); pre_setup_arch_hook(); /* 执行某些体系结构相关的hook程序, i386是空 */ early_cpu_init(); /* 设置获取的cpu信息 */ /* * FIXME: This isn't an official loader_type right * now but does currently work with elilo. * If we were configured as an EFI kernel, check to make * sure that we were loaded correctly from elilo and that * the system table is valid. If not, then initialize normally. */ #ifdef CONFIG_EFI if ((LOADER_TYPE == 0x50) && EFI_SYSTAB) efi_enabled = 1; #endif /* 从setup中取得BIOS自检后取得的信息,复制到内核内存空间中(原来保存在一个临时页面中) */ ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); drive_info = DRIVE_INFO; screen_info = SCREEN_INFO; edid_info = EDID_INFO; apm_info.bios = APM_BIOS_INFO; ist_info = IST_INFO; saved_videomode = VIDEO_MODE; if( SYS_DESC_TABLE.length != 0 ) { MCA_bus = SYS_DESC_TABLE.table[3] &0x2; machine_id = SYS_DESC_TABLE.table[0]; machine_submodel_id = SYS_DESC_TABLE.table[1]; BIOS_revision = SYS_DESC_TABLE.table[2]; } aux_device_present = AUX_DEVICE_INFO; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); #endif ARCH_SETUP /* x86系列没有任何的动作 */ if (efi_enabled) efi_init(); else { printk(KERN_INFO "BIOS-provided physical RAM map:/n"); print_memory_map(machine_specific_memory_setup()); /* 处理内存图,最后保存在e820中 */ } copy_edd(); /* 复制增强磁盘参数(来之setup自检信息),实验性质,CONFIG_EDD */ if (!MOUNT_ROOT_RDONLY) root_mountflags &= ~MS_RDONLY; init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; init_mm.brk = init_pg_tables_end + 
PAGE_OFFSET; code_resource.start = virt_to_phys(_text); code_resource.end = virt_to_phys(_etext)-1; data_resource.start = virt_to_phys(_etext); data_resource.end = virt_to_phys(_edata)-1; parse_cmdline_early(cmdline_p); /* 分析引导时用户提供的启动参数(例如mem=xxx,acpi=xx,and so on) */ max_low_pfn = setup_memory(); /* 为页面映射作基础工作(生成map) */ /* * NOTE: before this point _nobody_ is allowed to allocate 到现在依然不可以用bootmem内存分配器来 * any memory using the bootmem allocator. Although the 分配内存,在执行paging_init()以前必须 * alloctor is now initialised only the first 8Mb of the kernel 用alloc_bootmem_low_pages()来分配内存 * virtual address space has been mapped. All allocations before * paging_init() has completed must use the alloc_bootmem_low_pages() * variant (which allocates DMA'able memory) and care must be taken * not to exceed the 8Mb limit. */ #ifdef CONFIG_SMP smp_alloc_memory(); /* AP processor realmode stacks in low memory 为启动smp其他cpu分配内存 */ #endif paging_init(); /* 页面信息初始化 */ /* * NOTE: at this point the bootmem allocator is fully available. */ #ifdef CONFIG_EARLY_PRINTK { char *s = strstr(*cmdline_p, "earlyprintk="); if (s) { extern void setup_early_printk(char *); setup_early_printk(s); printk("early console enabled/n"); } } #endif dmi_scan_machine(); /* DMI=Desktop Management Interface */ #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(*cmdline_p); /* 检测APIC(高级可编程中断器) */ #endif if (efi_enabled) efi_map_memmap(); /* * Parse the ACPI tables for possible boot-time SMP configuration. 
*/ acpi_boot_init(); #ifdef CONFIG_X86_LOCAL_APIC if (smp_found_config) get_smp_config(); #endif register_memory(max_low_pfn); /* 对系统I/O资源生成资源树 */ #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) conswitchp = &vga_con; #elif defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; #endif #endif } /* arch/i386/kernel/cpu/common.c */ void __init early_cpu_init(void) { /* 目前支持9中x386系列cpu,分别赋值给cpu_devs */ intel_cpu_init(); /* Intel CPU结构赋值 */ cyrix_init_cpu(); nsc_init_cpu(); amd_init_cpu(); centaur_init_cpu(); transmeta_init_cpu(); rise_init_cpu(); nexgen_init_cpu(); umc_init_cpu(); early_cpu_detect(); /* 检测cpu信息,并将检测得到信息给boot_cpu_data */ #ifdef CONFIG_DEBUG_PAGEALLOC /* pse is not compatible with on-the-fly unmapping, * disable it even if the cpus claim to support it. */ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); disable_pse = 1; #endif } /* arch/i386/kernel/setup.c */ static void __init print_memory_map(char *who) { int i; for (i = 0; i < e820.nr_map; i++) { printk(" %s: %016Lx - %016Lx ", who, e820.map[i].addr, e820.map[i].addr + e820.map[i].size); switch (e820.map[i].type) { case E820_RAM: printk("(usable)/n"); break; case E820_RESERVED: printk("(reserved)/n"); break; case E820_ACPI: printk("(ACPI data)/n"); break; case E820_NVS: printk("(ACPI NVS)/n"); break; default: printk("type %lu/n", e820.map[i].type); break; } } } /* arch/i386/kernel/setup.c */ static void __init parse_cmdline_early (char ** cmdline_p) { char c = ' ', *to = command_line, *from = saved_command_line; int len = 0; int userdef = 0; /* Save unparsed command line copy for /proc/cmdline */ saved_command_line[COMMAND_LINE_SIZE-1] = '/0'; for (;;) { /* * "mem=nopentium" disables the 4MB page tables. * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM * to , overriding the bios size. * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from * to +, overriding the bios size. 
* * HPA tells me bootloaders need to parse mem=, so no new * option should be mem= [also see Documentation/i386/boot.txt] */ if (c == ' ' && !memcmp(from, "mem=", 4)) { if (to != command_line) to--; if (!memcmp(from+4, "nopentium", 9)) { from += 9+4; clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); disable_pse = 1; } else { /* If the user specifies memory size, we * limit the BIOS-provided memory map to * that size. exactmap can be used to specify * the exact map. mem=number can be used to * trim the existing memory map. */ unsigned long long mem_size; mem_size = memparse(from+4, &from); limit_regions(mem_size); userdef=1; } } if (c == ' ' && !memcmp(from, "memmap=", 7)) { if (to != command_line) to--; if (!memcmp(from+7, "exactmap", 8)) { from += 8+7; e820.nr_map = 0; userdef = 1; } else { /* If the user specifies memory size, we * limit the BIOS-provided memory map to * that size. exactmap can be used to specify * the exact map. mem=number can be used to * trim the existing memory map. */ unsigned long long start_at, mem_size; mem_size = memparse(from+7, &from); if (*from == '@') { start_at = memparse(from+1, &from); add_memory_region(start_at, mem_size, E820_RAM); } else if (*from == '#') { start_at = memparse(from+1, &from); add_memory_region(start_at, mem_size, E820_ACPI); } else if (*from == '$') { start_at = memparse(from+1, &from); add_memory_region(start_at, mem_size, E820_RESERVED); } else { limit_regions(mem_size); userdef=1; } } } #ifdef CONFIG_X86_SMP /* * If the BIOS enumerates physical processors before logical, * maxcpus=N at enumeration-time can be used to disable HT. 
*/ else if (!memcmp(from, "maxcpus=", 8)) { extern unsigned int maxcpus; maxcpus = simple_strtoul(from + 8, NULL, 0); } #endif #ifdef CONFIG_ACPI_BOOT /* "acpi=off" disables both ACPI table parsing and interpreter */ else if (!memcmp(from, "acpi=off", 8)) { disable_acpi(); } /* acpi=force to over-ride black-list */ else if (!memcmp(from, "acpi=force", 10)) { acpi_force = 1; acpi_ht = 1; acpi_disabled = 0; } /* acpi=strict disables out-of-spec workarounds */ else if (!memcmp(from, "acpi=strict", 11)) { acpi_strict = 1; } /* Limit ACPI just to boot-time to enable HT */ else if (!memcmp(from, "acpi=ht", 7)) { if (!acpi_force) disable_acpi(); acpi_ht = 1; } /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ else if (!memcmp(from, "pci=noacpi", 10)) { acpi_disable_pci(); } /* "acpi=noirq" disables ACPI interrupt routing */ else if (!memcmp(from, "acpi=noirq", 10)) { acpi_noirq_set(); } else if (!memcmp(from, "acpi_sci=edge", 13)) acpi_sci_flags.trigger = 1; else if (!memcmp(from, "acpi_sci=level", 14)) acpi_sci_flags.trigger = 3; else if (!memcmp(from, "acpi_sci=high", 13)) acpi_sci_flags.polarity = 1; else if (!memcmp(from, "acpi_sci=low", 12)) acpi_sci_flags.polarity = 3; #ifdef CONFIG_X86_IO_APIC else if (!memcmp(from, "acpi_skip_timer_override", 24)) acpi_skip_timer_override = 1; #endif #ifdef CONFIG_X86_LOCAL_APIC /* disable IO-APIC */ else if (!memcmp(from, "noapic", 6)) disable_ioapic_setup(); #endif /* CONFIG_X86_LOCAL_APIC */ #endif /* CONFIG_ACPI_BOOT */ /* * highmem=size forces highmem to be exactly 'size' bytes.使用用户定义的highmem大小 * This works even on boxes that have no highmem otherwise.即使配置内核没有选择此选项 * This also works to reduce highmem size on bigger boxes.如果选择此选项也可能减少hignmem大小 */ if (c == ' ' && !memcmp(from, "highmem=", 8)) highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; /* * vmalloc=size forces the vmalloc area to be exactly 'size' * bytes. This can be used to increase (or decrease) the * vmalloc area - the default is 128m. 
用户指定vmalloc大小代替缺省128m */ if (c == ' ' && !memcmp(from, "vmalloc=", 8)) __VMALLOC_RESERVE = memparse(from+8, &from); c = *(from++); if (!c) break; if (COMMAND_LINE_SIZE <= ++len) break; *(to++) = c; } *to = '/0'; *cmdline_p = command_line; if (userdef) { printk(KERN_INFO "user-defined physical RAM map:/n"); print_memory_map("user"); } } static unsigned long __init setup_memory(void) { unsigned long bootmap_size, start_pfn, max_low_pfn; /* * partially used pages are not usable - thus * we are rounding upwards: */ start_pfn = PFN_UP(init_pg_tables_end); find_max_pfn(); max_low_pfn = find_max_low_pfn(); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) { highstart_pfn = max_low_pfn; } printk(KERN_NOTICE "%ldMB HIGHMEM available./n", pages_to_mb(highend_pfn - highstart_pfn)); #endif printk(KERN_NOTICE "%ldMB LOWMEM available./n", pages_to_mb(max_low_pfn)); /* * Initialize the boot-time allocator (with low memory only): */ bootmap_size = init_bootmem(start_pfn, max_low_pfn); /* 设置此区间页面为保留,好像结果在node_data[0]->bdata */ register_bootmem_low_pages(max_low_pfn); /* 设置所有可以使用内存页面位图 */ /* * Reserve the bootmem bitmap itself as well. We do this in two * steps (first step was init_bootmem()) because this catches * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); /* 保留内核在内存中的映像 */ /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ reserve_bootmem(0, PAGE_SIZE); /* 保留物理页面0, 主要是和启动有关的信息以及bios信息 */ /* reserve EBDA region, it's a 4K region */ reserve_ebda_region(); /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent PCI prefetch into it (errata #56). Usually the page is reserved anyways, unless you have no PS/2 mouse plugged in. 
*/ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 == 6) reserve_bootmem(0xa0000 - 4096, 4096); #ifdef CONFIG_SMP /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ reserve_bootmem(PAGE_SIZE, PAGE_SIZE); /* 在SMP系统中需要使用 */ #endif #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. */ acpi_reserve_bootmem(); #endif #ifdef CONFIG_X86_FIND_SMP_CONFIG /* * Find and reserve possible boot-time SMP configuration: */ find_smp_config(); #endif #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { reserve_bootmem(INITRD_START, INITRD_SIZE); initrd_start = INITRD_START ? INITRD_START + PAGE_OFFSET : 0; initrd_end = initrd_start+INITRD_SIZE; } else { printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)/ndisabling initrd/n", INITRD_START + INITRD_SIZE, max_low_pfn << PAGE_SHIFT); initrd_start = 0; } } #endif return max_low_pfn; } /* arch/i386/mm/init.c */ /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. * 已经有8MB内存在head.S中映射完成 * This routines also unmaps the page at virtual kernel address 0, so * that we can trap those pesky NULL-reference errors in the kernel. */ void __init paging_init(void) { #ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) printk("NX (Execute Disable) protection: active/n"); #endif pagetable_init(); /* 修改系统空间页面表信息,原来在系统setup时已经设置好,但都是空的 */ load_cr3(swapper_pg_dir); #ifdef CONFIG_X86_PAE /* * We will bail out later - printk doesn't work right now so * the user would just see a hanging kernel. 
*/ if (cpu_has_pae) set_in_cr4(X86_CR4_PAE); #endif __flush_tlb_all(); /* 刷新mmu */ kmap_init(); /* highmem使用内存设定 */ zone_sizes_init(); /* 内存初始化 pgdat_list->zone */ } static void __init pagetable_init (void) { unsigned long vaddr; pgd_t *pgd_base = swapper_pg_dir; #ifdef CONFIG_X86_PAE /* 用三级页面映射表(Physical Address Extension) */ int i; /* Init entries of the first-level page table to the zero page */ for (i = 0; i < PTRS_PER_PGD; i++) /* PTRS_PER_PGD=4 */ set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); #endif /* Enable PSE if available(Page Size Extensions)4MB页面表 */ if (cpu_has_pse) { set_in_cr4(X86_CR4_PSE); } /* Enable PGE if available (PTE Global Bit)*/ if (cpu_has_pge) { set_in_cr4(X86_CR4_PGE); __PAGE_KERNEL |= _PAGE_GLOBAL; __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; } kernel_physical_mapping_init(pgd_base); /* 系统空间映射(0xC0000000..=>0-max_low_pfn) */ remap_numa_kva(); /* 重新初始化numa的内核虚拟地址空间???? */ /* * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): 固定使用的地址 */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; /* 在内存高地址区域 */ page_table_range_init(vaddr, 0, pgd_base); /* 页面表固定地址初始化,包括acpi地址等 */ permanent_kmaps_init(pgd_base); /* 固定地址初始化(pkmap),此地址干什么用?????是不是用作highmem分配使用 */ #ifdef CONFIG_X86_PAE /* * Add low memory identity-mappings - SMP needs it when * starting up on an AP from real-mode. In the non-PAE * case we already have these mappings through head.S. * All user-space mappings are explicitly cleared after * SMP startup. 
*/ pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; #endif } /* * This maps the physical memory to kernel virtual address space, a total * of max_low_pfn pages, by creating page tables starting from address * PAGE_OFFSET.(映射物理内存到系统空间虚拟地址,共max_low_pfn页面,从0xc0000000地址开始) */ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) { unsigned long pfn; pgd_t *pgd; pmd_t *pmd; pte_t *pte; int pgd_idx, pmd_idx, pte_ofs; pgd_idx = pgd_index(PAGE_OFFSET); /* 映射开始地址是系统空间 */ pgd = pgd_base + pgd_idx; pfn = 0; for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { pmd = one_md_table_init(pgd); /* 初始化二级目录表 */ if (pfn >= max_low_pfn) continue; for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; /* Map with big pages if possible, otherwise create normal page tables. */ if (cpu_has_pse) { /* 4MB页面表初始化,如果用此,将没有第三级页面 */ unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; if (is_kernel_text(address) || is_kernel_text(address2)) set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); else set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); pfn += PTRS_PER_PTE; } else { pte = one_page_table_init(pmd); for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { if (is_kernel_text(address)) set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); else set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); } } } } } void __init sched_init(void) { runqueue_t *rq; int i, j, k; /* 初始化每个cpu运行队列 */ for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->active = rq->arrays; /* 活动队列 */ rq->expired = rq->arrays + 1; /* 过期队列 */ rq->best_expired_prio = MAX_PRIO; /* 优先级最低 */ #ifdef CONFIG_SMP rq->sd = &sched_domain_dummy; rq->cpu_load = 0; /* cpu负载 */ rq->active_balance = 0; /* ???? */ rq->push_cpu = 0; /* ???? 
*/ rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); #endif atomic_set(&rq->nr_iowait, 0); for (j = 0; j < 2; j++) { array = rq->arrays + j; for (k = 0; k < MAX_PRIO; k++) { INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); } // delimiter for bitsearch __set_bit(MAX_PRIO, array->bitmap); } } /* * The boot idle thread does lazy MMU switching as well: */ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); /* 设置idel进程,并将runqueue中curr指向该进程 */ } void __init trap_init(void) /* 中断向量重新设置(在初始化时设置指向ignore_int) */ { #ifdef CONFIG_EISA if (isa_readl(0x0FFFD9) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { EISA_bus = 1; } #endif #ifdef CONFIG_X86_LOCAL_APIC init_apic_mappings(); #endif set_trap_gate(0,÷_error); /* 陷阱门设置 */ set_intr_gate(1,&debug); /* 中断门设置 */ set_intr_gate(2,&nmi); set_system_intr_gate(3, &int3); /* int3-5 can be called from all */ set_system_gate(4,&overflow); set_system_gate(5,&bounds); set_trap_gate(6,&invalid_op); set_trap_gate(7,&device_not_available); set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS); set_trap_gate(9,&coprocessor_segment_overrun); set_trap_gate(10,&invalid_TSS); set_trap_gate(11,&segment_not_present); set_trap_gate(12,&stack_segment); set_trap_gate(13,&general_protection); set_intr_gate(14,&page_fault); set_trap_gate(15,&spurious_interrupt_bug); set_trap_gate(16,&coprocessor_error); set_trap_gate(17,&alignment_check); #ifdef CONFIG_X86_MCE set_trap_gate(18,&machine_check); #endif set_trap_gate(19,&simd_coprocessor_error); set_system_gate(SYSCALL_VECTOR,&system_call); /* 系统调用中断设置 */ /* * Should be a barrier for any external CPU state. 
*/ cpu_init(); /* 重新装入gdt,ldt */ trap_init_hook(); /* do nothing on i386 */ } void __init init_IRQ(void) { int i; /* all the set up before the call gates are initialised */ pre_intr_init_hook(); /* 中断请求队列初始化 */ /* * Cover the whole vector space, no vector can escape 设置中断向量 * us. (some of these will be overridden and become * 'special' SMP interrupts) */ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { int vector = FIRST_EXTERNAL_VECTOR + i; if (i >= NR_IRQS) break; if (vector != SYSCALL_VECTOR) set_intr_gate(vector, interrupt[i]); } /* setup after call gates are initialised (usually add in * the architecture specific gates) 在系统调用初始化完毕后特殊设置,和结构相关 */ intr_init_hook(); /* * Set the clock to HZ Hz, we already have a valid * vector now: 设置时钟hz */ setup_pit_timer(); /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. */ if (boot_cpu_data.hard_math && !cpu_has_fpu) setup_irq(FPU_IRQ, &fpu_irq); irq_ctx_init(smp_processor_id()); } void __init mem_init(void) { extern int ppro_with_ram_bug(void); /* 检测pentium是否是有bug的cpu */ int codesize, reservedpages, datasize, initsize; int tmp; int bad_ppro; #ifndef CONFIG_DISCONTIGMEM if (!mem_map) BUG(); #endif bad_ppro = ppro_with_ram_bug(); #ifdef CONFIG_HIGHMEM /* check that fixmap and pkmap do not overlap 确认fixmap和pkmap没有重叠 */ if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { printk(KERN_ERR "fixmap and kmap areas overlap - this will crash/n"); printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh/n", PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); BUG(); } #endif set_max_mapnr_init(); /* 设置highmem区域 */ #ifdef CONFIG_HIGHMEM high_memory = (void *) __va(highstart_pfn * PAGE_SIZE); #else high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); #endif /* this will put all low memory onto the freelists,根据页面位图释放内存中所有可供动态分配的页面 */ totalram_pages += __free_all_bootmem(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) /* * Only count reserved RAM pages */ if 
(page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; set_highmem_pages_init(bad_ppro); codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); /* 初始化kcore_mem,应该是实际内存? */ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); /* 虚拟内存初始化 */ printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)/n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); #ifdef CONFIG_X86_PAE if (!cpu_has_pae) panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); #endif if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); /* * Subtle. SMP is doing it's boot stuff late (because it has to * fork idle threads) - but it also needs low mappings for the * protected-mode entry to work. We zap these entries only after * the WP-bit has been tested. */ #ifndef CONFIG_SMP zap_low_mappings(); #endif } /* Initialisation. * Called after the gfp() functions have been enabled, and before smp_init(). */ void __init kmem_cache_init(void) { size_t left_over; struct cache_sizes *sizes; struct cache_names *names; /* * Fragmentation(分裂) resistance(阻力) on low memory - only use bigger * page orders on machines with more than 32MB of memory. */ if (num_physpages > (32 << 20) >> PAGE_SHIFT) /* 系统有多于32MB内存 */ slab_break_gfp_order = BREAK_GFP_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: * 1) initialize the cache_cache cache: it contains the kmem_cache_t * structures of all caches, except cache_cache itself: cache_cache * is statically allocated. 
* Initially an __init data area is used for the head array, it's * replaced with a kmalloc allocated array at the end of the bootstrap. * 2) Create the first kmalloc cache. * The kmem_cache_t for the new cache is allocated normally. An __init * data area is used for the head array. * 3) Create the remaining kmalloc caches, with minimally sized head arrays. * 4) Replace the __init data head arrays for cache_cache and the first * kmalloc cache with kmalloc allocated arrays. * 5) Resize the head arrays of the kmalloc caches to their final sizes. */ /* 1) create the cache_cache */ init_MUTEX(&cache_chain_sem); /* 初始化cache链表信号量 */ INIT_LIST_HEAD(&cache_chain); /* 初始化cache链表 */ list_add(&cache_cache.next, &cache_chain); /* 是不是把自己加入到队列头???? */ cache_cache.colour_off = cache_line_size(); /* 128 */ cache_cache.array[smp_processor_id()] = &initarray_cache.cache; cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, &left_over, &cache_cache.num); if (!cache_cache.num) BUG(); cache_cache.colour = left_over/cache_cache.colour_off; cache_cache.colour_next = 0; cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + sizeof(struct slab), cache_line_size()); /* 2+3) create the kmalloc caches */ sizes = malloc_sizes; names = cache_names; while (sizes->cs_size) { /* For performance, all the general caches are L1 aligned. * This should be particularly beneficial on SMP boxes, as it * eliminates "false sharing". * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. */ sizes->cs_cachep = kmem_cache_create(names->name, sizes->cs_size, ARCH_KMALLOC_MINALIGN, (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); /* Inc off-slab bufctl limit until the ceiling is hit. 
*/ if (!(OFF_SLAB(sizes->cs_cachep))) { offslab_limit = sizes->cs_size-sizeof(struct slab); offslab_limit /= sizeof(kmem_bufctl_t); } sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), NULL, NULL); sizes++; names++; } /* 4) Replace the bootstrap head arrays */ { void * ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); local_irq_disable(); BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); cache_cache.array[smp_processor_id()] = ptr; local_irq_enable(); ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); local_irq_disable(); BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep), sizeof(struct arraycache_init)); malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; local_irq_enable(); } /* 5) resize the head arrays to their final sizes */ { kmem_cache_t *cachep; down(&cache_chain_sem); list_for_each_entry(cachep, &cache_chain, next) enable_cpucache(cachep); /* 激活cpu缓存 */ up(&cache_chain_sem); } /* Done! */ g_cpucache_up = FULL; /* Register a cpu startup notifier callback * that initializes ac_data for all new cpus */ register_cpu_notifier(&cpucache_notifier); /* The reap timers are started later, with a module init call: * That part of the kernel is not yet operational. 
*/ } void __init pidmap_init(void) { int i; pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); set_bit(0, pidmap_array->page); atomic_dec(&pidmap_array->nr_free); /* * Allocate PID 0, and hash it via all PID types: */ for (i = 0; i < PIDTYPE_MAX; i++) /* 将当前进程加入到hash表中.pid,pgid,tgid,sid */ attach_pid(current, i, 0); } /* * We need to finalize in a non-__init function or else race conditions * between the root thread and the init thread may cause start_kernel to * be reaped by free_initmem before the root thread has proceeded to * cpu_idle. * * gcc-3.4 accidentally inlines this function, so use noinline. */ static void noinline rest_init(void) __releases(kernel_lock) { kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND); /* 启动init内核进程 */ numa_default_policy(); unlock_kernel(); cpu_idle(); } static int init(void * unused) { lock_kernel(); /* * Tell the world that we're going to be the grim * reaper of innocent orphaned children. 所有进程的父进程 * * We don't want people to have to make incorrect * assumptions about where in the task array this * can be found. */ child_reaper = current; /* Sets up cpus_possible() */ smp_prepare_cpus(max_cpus); /*主cpu会依次启动各个从cpu。见smp_boot_cpus->do_boot_cpu()*/ do_pre_smp_initcalls(); /* 启动migration_thread,ksoftirqd等CPU进程 */ fixup_cpu_present_map(); smp_init(); /* 主要设置APIC */ sched_init_smp(); /* * Do this before initcalls, because some drivers want to access * firmware files. */ populate_rootfs(); /* 生成initrd文件 */ do_basic_setup(); /* * check if there is an early userspace init. If yes, let it do all * the work */ if (sys_access((const char __user *) "/init", 0) == 0) execute_command = "/init"; else prepare_namespace(); /* 装载initrd,安装模块,mount根文件系统 */ /* * Ok, we have completed the initial bootup, and * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. 
*/ free_initmem(); unlock_kernel(); system_state = SYSTEM_RUNNING; numa_default_policy(); if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) printk("Warning: unable to open an initial console./n"); (void) sys_dup(0); (void) sys_dup(0); /* * We try each of these until one succeeds. * * The Bourne shell can be used instead of init if we are * trying to recover a really broken machine. */ if (execute_command) run_init_process(execute_command); run_init_process("/sbin/init"); run_init_process("/etc/init"); run_init_process("/bin/init"); run_init_process("/bin/sh"); panic("No init found. Try passing init= option to kernel."); }
Linux boot code analysis (kernel 2.6.10-rc2)
Source: blog.csdn.net/enlaihe/article/details/7916027