linux启动代码分析

Kernel: 2.6.10-rc2
Finished: 01/01/05


/*
 *	Activate the first processor.
 */

/*
 * start_kernel - architecture-independent boot entry point.
 *
 * Entered with interrupts disabled. Takes the big kernel lock, performs
 * the architecture setup, brings up the core subsystems in the order
 * listed below, enables interrupts part-way through, and finally calls
 * rest_init(), which spawns the init thread.
 */
asmlinkage void __init start_kernel(void)
{
	char * command_line;
	extern struct kernel_param __start___param[], __stop___param[];
/*
 * Interrupts are still disabled. Do necessary setups, then
 * enable them
 */
	lock_kernel();		/* take the big kernel lock */
	page_address_init();	/* only does work when highmem is configured */
	printk(linux_banner);	/* print the kernel version banner */
	setup_arch(&command_line); /* arch-specific setup: page mappings, ACPI, ... */
	setup_per_cpu_areas();	/* set up the per-CPU area offsets (SMP) */

	/*
	 * Mark the boot cpu "online" so that it can call console drivers in
	 * printk() and can access its per-cpu storage.
	 */
	smp_prepare_boot_cpu();

	/*
	 * Set up the scheduler prior starting any interrupts (such as the
	 * timer interrupt). Full topology setup happens at smp_init()
	 * time - but meanwhile we still have a functioning scheduler.
	 */
	sched_init();	/* runqueue setup */
	build_all_zonelists();	/* build the allocation policy (zonelists) */
	page_alloc_init();	/* CPU hotplug setup */
	/* "\n" restored: the extracted text had a literal "/n" here. */
	printk("Kernel command line: %s\n", saved_command_line);
	parse_early_param();
	parse_args("Booting kernel", command_line, __start___param,
		   __stop___param - __start___param,
		   &unknown_bootoption);	/* parse the kernel parameters and apply them */
	sort_main_extable();	/* sort the exception table */
	trap_init();		/* re-install the trap/interrupt vectors */
	rcu_init();		/* RCU (Read-Copy Update) init; per the original note, mainly a per-CPU tasklet */
	init_IRQ();		/* interrupt setup; actual handlers are registered later via request_irq() */
	pidhash_init();		/* pid hash tables; NOTE(review): presumably one table per pid type - confirm */
	init_timers();		/* per the original note: per-CPU tvec_bases plus TIMER_SOFTIRQ */
	softirq_init();		/* softirq and tasklet init */
	time_init();		/* hardware clock and its interrupt */

	/*
	 * HACK ALERT! This is early. We're enabling the console before
	 * we've done PCI setups etc, and console_init() must be aware of
	 * this. But we do want output early, in case something goes wrong.
	 */
	console_init();
	if (panic_later)
		panic(panic_later, panic_param);
	profile_init();		/* profiling setup */
	local_irq_enable();	/* interrupts on from here */
#ifdef CONFIG_BLK_DEV_INITRD
	if (initrd_start && !initrd_below_start_ok &&
			initrd_start < min_low_pfn << PAGE_SHIFT) {
		printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
		    "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);
		initrd_start = 0;
	}
#endif
	vfs_caches_init_early();	/* dentry and inode cache hash tables */
	mem_init();			/* final memory init: hands the bootmem pages to the page allocator */
	kmem_cache_init();		/* slab allocator init */
	numa_policy_init();		/* NOTE(review): not annotated in the original; presumably NUMA policy setup */
	if (late_time_init)
		late_time_init();
	calibrate_delay();		/* compute BogoMIPS */
	pidmap_init();			/* pid bitmap init */
	pgtable_cache_init();		/* pgd/pmd slab caches */
	prio_tree_init();		/* per the original note: index_bits_to_maxindex, for (struct page)->mapping->i_map */
	anon_vma_init();		/* anon_vma slab cache, used for rmap support */
#ifdef CONFIG_X86
	if (efi_enabled)
		efi_enter_virtual_mode();
#endif
	fork_init(num_physpages);	/* per the original note: derives the maximum safe number of tasks */
	proc_caches_init();		/* remaining slab caches */
	buffer_init();			/* buffer head init */
	unnamed_dev_init();		/* NOTE(review): original annotator was unsure; appears to involve idr - confirm */
	security_init();		/* security framework init */
	vfs_caches_init(num_physpages);	/* caches needed by the VFS */
	radix_tree_init();		/* radix tree init; mainly speeds up lookup of dirty or writeback pages */
	signals_init();			/* sigqueue slab cache */
	/* rootfs populating might need page-writeback */
	page_writeback_init();		/* per the original note: computes the VM ratios that drive writeback */
#ifdef CONFIG_PROC_FS
	proc_root_init();		/* procfs init; creates directories and files according to the config */
#endif
	check_bugs();

	acpi_early_init(); /* before LAPIC and SMP init */

	/* Do the rest non-__init'ed, we're now alive */
	rest_init();			/* spawns the init thread */
}

/* arch/i386/kernel/setup.c */
/*
 * Determine if we were loaded by an EFI loader.  If so, then we have also been
 * passed the efi memmap, systab, etc., so we should use these data structures
 * for initialization.  Note, the efi init code path is determined by the
 * global efi_enabled. This allows the same kernel image to be used on existing
 * systems (with a traditional BIOS) as well as on EFI systems.
 *
 * setup_arch - i386 architecture-specific boot setup.
 * @cmdline_p: out parameter; parse_cmdline_early() stores the address of
 *             the parsed command line buffer through it.
 *
 * Copies the boot parameters gathered by the setup code into kernel
 * variables, builds the physical memory map, initialises the bootmem
 * allocator and paging, and then probes DMI/APIC/ACPI/SMP configuration.
 */
void __init setup_arch(char **cmdline_p)
{
	unsigned long max_low_pfn;

	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
	pre_setup_arch_hook();	/* arch-specific hook; per the original note it is empty on i386 */
	early_cpu_init();	/* record the detected CPU information */

	/*
	 * FIXME: This isn't an official loader_type right
	 * now but does currently work with elilo.
	 * If we were configured as an EFI kernel, check to make
	 * sure that we were loaded correctly from elilo and that
	 * the system table is valid.  If not, then initialize normally.
	 */
#ifdef CONFIG_EFI
	if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
		efi_enabled = 1;
#endif
	/*
	 * Copy the information collected by the setup code (BIOS probing
	 * results) into kernel variables; per the original note it was
	 * previously held in a temporary page.
	 */
	ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
	drive_info = DRIVE_INFO;
	screen_info = SCREEN_INFO;
	edid_info = EDID_INFO;
	apm_info.bios = APM_BIOS_INFO;
	ist_info = IST_INFO;
	saved_videomode = VIDEO_MODE;
	if( SYS_DESC_TABLE.length != 0 ) {
		MCA_bus = SYS_DESC_TABLE.table[3] &0x2;
		machine_id = SYS_DESC_TABLE.table[0];
		machine_submodel_id = SYS_DESC_TABLE.table[1];
		BIOS_revision = SYS_DESC_TABLE.table[2];
	}
	aux_device_present = AUX_DEVICE_INFO;

#ifdef CONFIG_BLK_DEV_RAM
	rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
	rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
	rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
#endif
	ARCH_SETUP	/* per the original note: expands to nothing on x86 */
	if (efi_enabled)
		efi_init();
	else {
		/* "\n" restored: the extracted text had a literal "/n" here. */
		printk(KERN_INFO "BIOS-provided physical RAM map:\n");
		print_memory_map(machine_specific_memory_setup()); /* builds the memory map, which ends up in e820 */
	}

	copy_edd();	/* copy the enhanced disk drive data from the setup probe (CONFIG_EDD, experimental per the original note) */

	if (!MOUNT_ROOT_RDONLY)
		root_mountflags &= ~MS_RDONLY;
	init_mm.start_code = (unsigned long) _text;
	init_mm.end_code = (unsigned long) _etext;
	init_mm.end_data = (unsigned long) _edata;
	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;

	code_resource.start = virt_to_phys(_text);
	code_resource.end = virt_to_phys(_etext)-1;
	data_resource.start = virt_to_phys(_etext);
	data_resource.end = virt_to_phys(_edata)-1;

	parse_cmdline_early(cmdline_p);	/* early boot options such as mem=, memmap=, acpi= */

	max_low_pfn = setup_memory();	/* groundwork for page mapping: sets up the bootmem allocator */

	/*
	 * NOTE: before this point _nobody_ is allowed to allocate
	 * any memory using the bootmem allocator.  Although the
	 * alloctor is now initialised only the first 8Mb of the kernel
	 * virtual address space has been mapped.  All allocations before
	 * paging_init() has completed must use the alloc_bootmem_low_pages()
	 * variant (which allocates DMA'able memory) and care must be taken
	 * not to exceed the 8Mb limit.
	 */

#ifdef CONFIG_SMP
	smp_alloc_memory(); /* AP processor realmode stacks in low memory */
#endif
	paging_init();	/* page table setup */

	/*
	 * NOTE: at this point the bootmem allocator is fully available.
	 */

#ifdef CONFIG_EARLY_PRINTK
	{
		char *s = strstr(*cmdline_p, "earlyprintk=");
		if (s) {
			extern void setup_early_printk(char *);

			setup_early_printk(s);
			printk("early console enabled\n");
		}
	}
#endif


	dmi_scan_machine(); /* DMI=Desktop Management Interface */

#ifdef CONFIG_X86_GENERICARCH
	generic_apic_probe(*cmdline_p);	/* probe the APIC (Advanced Programmable Interrupt Controller) */
#endif
	if (efi_enabled)
		efi_map_memmap();

	/*
	 * Parse the ACPI tables for possible boot-time SMP configuration.
	 */
	acpi_boot_init();

#ifdef CONFIG_X86_LOCAL_APIC
	if (smp_found_config)
		get_smp_config();
#endif

	register_memory(max_low_pfn);	/* per the original note: builds the resource tree for system I/O resources */

#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
	if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
		conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
	conswitchp = &dummy_con;
#endif
#endif
}

/* arch/i386/kernel/cpu/common.c */
/*
 * early_cpu_init - run the per-vendor CPU init hooks, then detect the
 * boot CPU.
 *
 * Calls one init routine per supported x86 vendor and then
 * early_cpu_detect(). With CONFIG_DEBUG_PAGEALLOC the PSE feature bit is
 * cleared afterwards.
 */
void __init early_cpu_init(void)
{
	/*
	 * Nine x86-family CPU vendors are handled here; per the original
	 * annotation each call fills in an entry of cpu_devs.
	 */
	intel_cpu_init();	/* Intel CPU descriptor */
	cyrix_init_cpu();
	nsc_init_cpu();
	amd_init_cpu();
	centaur_init_cpu();
	transmeta_init_cpu();
	rise_init_cpu();
	nexgen_init_cpu();
	umc_init_cpu();
	early_cpu_detect();	/* detect the CPU; per the original note the result goes into boot_cpu_data */

#ifdef CONFIG_DEBUG_PAGEALLOC
	/* pse is not compatible with on-the-fly unmapping,
	 * disable it even if the cpus claim to support it.
	 */
	clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
	disable_pse = 1;
#endif
}

/* arch/i386/kernel/setup.c */
/*
 * print_memory_map - log every entry of the e820 memory map.
 * @who: label printed in front of each entry (e.g. the map's origin).
 *
 * Each line shows the entry's start address, its end address
 * (addr + size) and a readable name for the entry type; unknown types
 * are printed numerically.
 */
static void __init print_memory_map(char *who)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		printk(" %s: %016Lx - %016Lx ", who,
			e820.map[i].addr,
			e820.map[i].addr + e820.map[i].size);
		/* "\n" restored below: the extracted text had a literal "/n". */
		switch (e820.map[i].type) {
		case E820_RAM:	printk("(usable)\n");
				break;
		case E820_RESERVED:
				printk("(reserved)\n");
				break;
		case E820_ACPI:
				printk("(ACPI data)\n");
				break;
		case E820_NVS:
				printk("(ACPI NVS)\n");
				break;
		default:	printk("type %lu\n", e820.map[i].type);
				break;
		}
	}
}

/* arch/i386/kernel/setup.c */
/*
 * parse_cmdline_early - scan the boot command line for options that must
 * take effect before the memory map is finalised.
 * @cmdline_p: out parameter; set to command_line on return.
 *
 * Walks saved_command_line, acts on mem=, memmap=, maxcpus=, the ACPI/APIC
 * switches, highmem= and vmalloc=, and copies the characters it scans into
 * command_line (stepping back one character when a mem=/memmap= option is
 * recognised). Copying stops at the terminating NUL or once
 * COMMAND_LINE_SIZE characters have been consumed. If the user altered the
 * memory map, the resulting map is printed.
 */
static void __init parse_cmdline_early (char ** cmdline_p)
{
	char c = ' ', *to = command_line, *from = saved_command_line;
	int len = 0;
	int userdef = 0;

	/*
	 * Save unparsed command line copy for /proc/cmdline.
	 * '\0' restored: the extracted text had '/0', a multi-character
	 * constant that does not NUL-terminate the buffer.
	 */
	saved_command_line[COMMAND_LINE_SIZE-1] = '\0';

	for (;;) {
		/*
		 * "mem=nopentium" disables the 4MB page tables.
		 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
		 * to <mem>, overriding the bios size.
		 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
		 * <start> to <start>+<mem>, overriding the bios size.
		 *
		 * HPA tells me bootloaders need to parse mem=, so no new
		 * option should be mem=  [also see Documentation/i386/boot.txt]
		 */
		if (c == ' ' && !memcmp(from, "mem=", 4)) {
			if (to != command_line)
				to--;
			if (!memcmp(from+4, "nopentium", 9)) {
				from += 9+4;
				clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
				disable_pse = 1;
			} else {
				/* If the user specifies memory size, we
				 * limit the BIOS-provided memory map to
				 * that size. exactmap can be used to specify
				 * the exact map. mem=number can be used to
				 * trim the existing memory map.
				 */
				unsigned long long mem_size;

				mem_size = memparse(from+4, &from);
				limit_regions(mem_size);
				userdef=1;
			}
		}

		if (c == ' ' && !memcmp(from, "memmap=", 7)) {
			if (to != command_line)
				to--;
			if (!memcmp(from+7, "exactmap", 8)) {
				from += 8+7;
				e820.nr_map = 0;
				userdef = 1;
			} else {
				/* If the user specifies memory size, we
				 * limit the BIOS-provided memory map to
				 * that size. exactmap can be used to specify
				 * the exact map. mem=number can be used to
				 * trim the existing memory map.
				 */
				unsigned long long start_at, mem_size;

				mem_size = memparse(from+7, &from);
				if (*from == '@') {
					start_at = memparse(from+1, &from);
					add_memory_region(start_at, mem_size, E820_RAM);
				} else if (*from == '#') {
					start_at = memparse(from+1, &from);
					add_memory_region(start_at, mem_size, E820_ACPI);
				} else if (*from == '$') {
					start_at = memparse(from+1, &from);
					add_memory_region(start_at, mem_size, E820_RESERVED);
				} else {
					limit_regions(mem_size);
					userdef=1;
				}
			}
		}

		/*
		 * NOTE: the "else if" chain below hangs off the memmap= test
		 * above; which arms exist depends on the config options.
		 */
#ifdef  CONFIG_X86_SMP
		/*
		 * If the BIOS enumerates physical processors before logical,
		 * maxcpus=N at enumeration-time can be used to disable HT.
		 */
		else if (!memcmp(from, "maxcpus=", 8)) {
			extern unsigned int maxcpus;

			maxcpus = simple_strtoul(from + 8, NULL, 0);
		}
#endif

#ifdef CONFIG_ACPI_BOOT
		/* "acpi=off" disables both ACPI table parsing and interpreter */
		else if (!memcmp(from, "acpi=off", 8)) {
			disable_acpi();
		}

		/* acpi=force to over-ride black-list */
		else if (!memcmp(from, "acpi=force", 10)) {
			acpi_force = 1;
			acpi_ht = 1;
			acpi_disabled = 0;
		}

		/* acpi=strict disables out-of-spec workarounds */
		else if (!memcmp(from, "acpi=strict", 11)) {
			acpi_strict = 1;
		}

		/* Limit ACPI just to boot-time to enable HT */
		else if (!memcmp(from, "acpi=ht", 7)) {
			if (!acpi_force)
				disable_acpi();
			acpi_ht = 1;
		}

		/* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
		else if (!memcmp(from, "pci=noacpi", 10)) {
			acpi_disable_pci();
		}
		/* "acpi=noirq" disables ACPI interrupt routing */
		else if (!memcmp(from, "acpi=noirq", 10)) {
			acpi_noirq_set();
		}

		else if (!memcmp(from, "acpi_sci=edge", 13))
			acpi_sci_flags.trigger =  1;

		else if (!memcmp(from, "acpi_sci=level", 14))
			acpi_sci_flags.trigger = 3;

		else if (!memcmp(from, "acpi_sci=high", 13))
			acpi_sci_flags.polarity = 1;

		else if (!memcmp(from, "acpi_sci=low", 12))
			acpi_sci_flags.polarity = 3;

#ifdef CONFIG_X86_IO_APIC
		else if (!memcmp(from, "acpi_skip_timer_override", 24))
			acpi_skip_timer_override = 1;
#endif

#ifdef CONFIG_X86_LOCAL_APIC
		/* disable IO-APIC */
		else if (!memcmp(from, "noapic", 6))
			disable_ioapic_setup();
#endif /* CONFIG_X86_LOCAL_APIC */
#endif /* CONFIG_ACPI_BOOT */

		/*
		 * highmem=size forces highmem to be exactly 'size' bytes.
		 * This works even on boxes that have no highmem otherwise.
		 * This also works to reduce highmem size on bigger boxes.
		 */
		if (c == ' ' && !memcmp(from, "highmem=", 8))
			highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;

		/*
		 * vmalloc=size forces the vmalloc area to be exactly 'size'
		 * bytes. This can be used to increase (or decrease) the
		 * vmalloc area - the default is 128m.
		 */
		if (c == ' ' && !memcmp(from, "vmalloc=", 8))
			__VMALLOC_RESERVE = memparse(from+8, &from);

		c = *(from++);
		if (!c)
			break;
		if (COMMAND_LINE_SIZE <= ++len)
			break;
		*(to++) = c;
	}
	*to = '\0';	/* terminate the parsed copy ('/0' in the extracted text) */
	*cmdline_p = command_line;
	if (userdef) {
		printk(KERN_INFO "user-defined physical RAM map:\n");
		print_memory_map("user");
	}
}
/*
 * setup_memory - initialise the boot-time (bootmem) allocator for low
 * memory and reserve the regions that must not be handed out.
 *
 * Returns max_low_pfn as reported by find_max_low_pfn().
 */
static unsigned long __init setup_memory(void)
{
	unsigned long bootmap_size, start_pfn, max_low_pfn;

	/*
	 * partially used pages are not usable - thus
	 * we are rounding upwards:
	 */
	start_pfn = PFN_UP(init_pg_tables_end);

	find_max_pfn();

	max_low_pfn = find_max_low_pfn();

	/* "\n" restored in the messages below ("/n" in the extracted text). */
#ifdef CONFIG_HIGHMEM
	highstart_pfn = highend_pfn = max_pfn;
	if (max_pfn > max_low_pfn) {
		highstart_pfn = max_low_pfn;
	}
	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
		pages_to_mb(highend_pfn - highstart_pfn));
#endif
	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
			pages_to_mb(max_low_pfn));
	/*
	 * Initialize the boot-time allocator (with low memory only):
	 */
	/* Per the original note: marks the range reserved; state presumably lives in node_data[0]->bdata - confirm. */
	bootmap_size = init_bootmem(start_pfn, max_low_pfn);

	register_bootmem_low_pages(max_low_pfn);	/* mark all usable low-memory pages in the bitmap */

	/*
	 * Reserve the bootmem bitmap itself as well. We do this in two
	 * steps (first step was init_bootmem()) because this catches
	 * the (very unlikely) case of us accidentally initializing the
	 * bootmem allocator with an invalid RAM area.
	 */
	reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
			 bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));	/* covers HIGH_MEMORY up to the end of the bootmem bitmap (the kernel image, per the original note) */

	/*
	 * reserve physical page 0 - it's a special BIOS page on many boxes,
	 * enabling clean reboots, SMP operation, laptop functions.
	 */
	reserve_bootmem(0, PAGE_SIZE);

	/* reserve EBDA region, it's a 4K region */
	reserve_ebda_region();

	/* could be an AMD 768MPX chipset. Reserve a page  before VGA to prevent
	   PCI prefetch into it (errata #56). Usually the page is reserved anyways,
	   unless you have no PS/2 mouse plugged in. */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
	    boot_cpu_data.x86 == 6)
	     reserve_bootmem(0xa0000 - 4096, 4096);

#ifdef CONFIG_SMP
	/*
	 * But first pinch a few for the stack/trampoline stuff
	 * FIXME: Don't need the extra page at 4K, but need to fix
	 * trampoline before removing it. (see the GDT stuff)
	 */
	reserve_bootmem(PAGE_SIZE, PAGE_SIZE);	/* needed on SMP systems */
#endif
#ifdef CONFIG_ACPI_SLEEP
	/*
	 * Reserve low memory region for sleep support.
	 */
	acpi_reserve_bootmem();
#endif
#ifdef CONFIG_X86_FIND_SMP_CONFIG
	/*
	 * Find and reserve possible boot-time SMP configuration:
	 */
	find_smp_config();
#endif

#ifdef CONFIG_BLK_DEV_INITRD
	if (LOADER_TYPE && INITRD_START) {
		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
			reserve_bootmem(INITRD_START, INITRD_SIZE);
			initrd_start =
				INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
			initrd_end = initrd_start+INITRD_SIZE;
		}
		else {
			printk(KERN_ERR "initrd extends beyond end of memory "
			    "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
			    INITRD_START + INITRD_SIZE,
			    max_low_pfn << PAGE_SHIFT);
			initrd_start = 0;
		}
	}
#endif
	return max_low_pfn;
}


/* arch/i386/mm/init.c */
/*
 * paging_init() sets up the page tables - note that the first 8MB are
 * already mapped by head.S.
 *
 * This routines also unmaps the page at virtual kernel address 0, so
 * that we can trap those pesky NULL-reference errors in the kernel.
 */
void __init paging_init(void)
{
#ifdef CONFIG_X86_PAE
	set_nx();
	if (nx_enabled)
		/* "\n" restored: the extracted text had a literal "/n". */
		printk("NX (Execute Disable) protection: active\n");
#endif

	/*
	 * Fill in the kernel-space page tables. Per the original note they
	 * were set up earlier during boot but left empty.
	 */
	pagetable_init();

	load_cr3(swapper_pg_dir);

#ifdef CONFIG_X86_PAE
	/*
	 * We will bail out later - printk doesn't work right now so
	 * the user would just see a hanging kernel.
	 */
	if (cpu_has_pae)
		set_in_cr4(X86_CR4_PAE);
#endif
	__flush_tlb_all();	/* flush the TLB after switching page tables */

	kmap_init();	/* highmem mapping setup */
	zone_sizes_init(); /* memory zone setup (pgdat_list->zone per the original note) */
}

/*
 * pagetable_init - build the kernel page tables rooted at swapper_pg_dir.
 *
 * Enables PSE/PGE in CR4 when the CPU has them, maps low physical memory
 * into kernel space, then creates the page-table structure for the fixmap
 * and permanent kmap areas.
 */
static void __init pagetable_init (void)
{
	unsigned long vaddr;
	pgd_t *pgd_base = swapper_pg_dir;

#ifdef CONFIG_X86_PAE	/* three-level page tables (Physical Address Extension) */
	int i;
	/* Init entries of the first-level page table to the zero page */
	for (i = 0; i < PTRS_PER_PGD; i++)	/* PTRS_PER_PGD is 4 here, per the original annotation */
		set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
#endif

	/* Enable PSE if available (Page Size Extensions, i.e. 4MB pages) */
	if (cpu_has_pse) {
		set_in_cr4(X86_CR4_PSE);
	}

	/* Enable PGE if available (PTE Global Bit)*/
	if (cpu_has_pge) {
		set_in_cr4(X86_CR4_PGE);
		__PAGE_KERNEL |= _PAGE_GLOBAL;
		__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
	}

	kernel_physical_mapping_init(pgd_base);	/* kernel-space mapping: PAGE_OFFSET.. => pfn 0..max_low_pfn */
	remap_numa_kva(); /* NOTE(review): original annotator unsure; presumably re-maps the NUMA kernel virtual area */

	/*
	 * Fixed mappings, only the page table structure has to be
	 * created - mappings will be set by set_fixmap():
	 */
	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; /* per the original note: located at the high end of the address space */
	page_table_range_init(vaddr, 0, pgd_base); /* page tables for the fixed addresses (ACPI etc. per the original note) */

	permanent_kmaps_init(pgd_base);  /* pkmap area; NOTE(review): presumably used for highmem mappings - confirm */

#ifdef CONFIG_X86_PAE
	/*
	 * Add low memory identity-mappings - SMP needs it when
	 * starting up on an AP from real-mode. In the non-PAE
	 * case we already have these mappings through head.S.
	 * All user-space mappings are explicitly cleared after
	 * SMP startup.
	 */
	pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
#endif
}

/*
 * This maps the physical memory to kernel virtual address space, a total
 * of max_low_pfn pages, by creating page tables starting from address
 * PAGE_OFFSET.
 *
 * @pgd_base: page global directory to populate.
 *
 * With PSE each PMD entry maps one large page and no PTE level is
 * created; otherwise a PTE table is allocated per PMD entry and filled
 * page by page. Pages covering kernel text get the *_EXEC protections.
 */
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
{
	unsigned long pfn;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	int pgd_idx, pmd_idx, pte_ofs;

	pgd_idx = pgd_index(PAGE_OFFSET);	/* mapping starts at the base of kernel space */
	pgd = pgd_base + pgd_idx;
	pfn = 0;

	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
		pmd = one_md_table_init(pgd);	/* set up the second-level table */
		if (pfn >= max_low_pfn)
			continue;
		for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
			unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;

			/* Map with big pages if possible, otherwise create normal page tables. */
			if (cpu_has_pse) {	/* large pages: no third-level table is used */
				unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;

				if (is_kernel_text(address) || is_kernel_text(address2))
					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
				else
					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
				pfn += PTRS_PER_PTE;
			} else {
				pte = one_page_table_init(pmd);

				/*
				 * 'address' must track 'pfn': previously it stayed
				 * at the first page of the PMD range, so every PTE
				 * in the table got the protection chosen for that
				 * first page.
				 */
				for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
				     pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
						if (is_kernel_text(address))
							set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
						else
							set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
				}
			}
		}
	}
}


/*
 * sched_init - prepare every CPU's runqueue and turn the current thread
 * into the boot CPU's idle thread.
 */
void __init sched_init(void)
{
	int cpu;

	/* Set up the runqueue of each possible CPU slot. */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		runqueue_t *rq = cpu_rq(cpu);
		int which;

		spin_lock_init(&rq->lock);
		rq->active = &rq->arrays[0];		/* active queue */
		rq->expired = &rq->arrays[1];		/* expired queue */
		rq->best_expired_prio = MAX_PRIO;	/* lowest priority */

#ifdef CONFIG_SMP
		rq->sd = &sched_domain_dummy;
		rq->cpu_load = 0;			/* CPU load */
		rq->active_balance = 0;
		rq->push_cpu = 0;
		rq->migration_thread = NULL;
		INIT_LIST_HEAD(&rq->migration_queue);
#endif
		atomic_set(&rq->nr_iowait, 0);

		/* Empty both priority arrays. */
		for (which = 0; which < 2; which++) {
			prio_array_t *array = &rq->arrays[which];
			int prio;

			for (prio = 0; prio < MAX_PRIO; prio++) {
				INIT_LIST_HEAD(&array->queue[prio]);
				__clear_bit(prio, array->bitmap);
			}
			/* delimiter for bitsearch */
			__set_bit(MAX_PRIO, array->bitmap);
		}
	}

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current);

	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
	init_idle(current, smp_processor_id());
}

/*
 * trap_init - install the CPU exception gates and the system call gate,
 * then run cpu_init().
 *
 * Per the original annotation, the vectors were all pointed at ignore_int
 * during early boot and are re-installed here.
 */
void __init trap_init(void)
{
#ifdef CONFIG_EISA
	if (isa_readl(0x0FFFD9) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
		EISA_bus = 1;
	}
#endif

#ifdef CONFIG_X86_LOCAL_APIC
	init_apic_mappings();
#endif

	/*
	 * "&divide_error" restored: the extracted text had the HTML entity
	 * "&divide" decoded into a division sign, which does not compile.
	 */
	set_trap_gate(0,&divide_error);	/* trap gate */
	set_intr_gate(1,&debug);	/* interrupt gate */
	set_intr_gate(2,&nmi);
	set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
	set_system_gate(4,&overflow);
	set_system_gate(5,&bounds);
	set_trap_gate(6,&invalid_op);
	set_trap_gate(7,&device_not_available);
	set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
	set_trap_gate(9,&coprocessor_segment_overrun);
	set_trap_gate(10,&invalid_TSS);
	set_trap_gate(11,&segment_not_present);
	set_trap_gate(12,&stack_segment);
	set_trap_gate(13,&general_protection);
	set_intr_gate(14,&page_fault);
	set_trap_gate(15,&spurious_interrupt_bug);
	set_trap_gate(16,&coprocessor_error);
	set_trap_gate(17,&alignment_check);
#ifdef CONFIG_X86_MCE
	set_trap_gate(18,&machine_check);
#endif
	set_trap_gate(19,&simd_coprocessor_error);

	set_system_gate(SYSCALL_VECTOR,&system_call);	/* system call entry */

	/*
	 * Should be a barrier for any external CPU state.
	 */
	cpu_init();	/* per the original note: reloads the GDT and LDT */

	trap_init_hook(); /* do nothing on i386 */
}

/*
 * init_IRQ - install interrupt gates for the external vectors, run the
 * architecture hooks, program the PIT and set up the FPU IRQ if needed.
 */
void __init init_IRQ(void)
{
	int i;

	/* all the set up before the call gates are initialised */
	pre_intr_init_hook();	/* per the original note: initialises the interrupt request queues */

	/*
	 * Cover the whole vector space, no vector can escape
	 * us. (some of these will be overridden and become
	 * 'special' SMP interrupts)
	 */
	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
		int vector = FIRST_EXTERNAL_VECTOR + i;
		if (i >= NR_IRQS)
			break;
		/* the system call vector keeps the gate it already has */
		if (vector != SYSCALL_VECTOR) 
			set_intr_gate(vector, interrupt[i]);
	}

	/* setup after call gates are initialised (usually add in
	 * the architecture specific gates)
	 */
	intr_init_hook();

	/*
	 * Set the clock to HZ Hz, we already have a valid
	 * vector now:
	 */
	setup_pit_timer();

	/*
	 * External FPU? Set up irq13 if so, for
	 * original braindamaged IBM FERR coupling.
	 */
	if (boot_cpu_data.hard_math && !cpu_has_fpu)
		setup_irq(FPU_IRQ, &fpu_irq);

	irq_ctx_init(smp_processor_id());
}


/*
 * mem_init - finish memory initialisation.
 *
 * Releases the bootmem-managed low memory to the page allocator, counts
 * reserved RAM pages, initialises highmem pages, registers the kcore
 * regions and prints the memory summary line.
 */
void __init mem_init(void)
{
	extern int ppro_with_ram_bug(void);	/* presumably detects an affected Pentium Pro - confirm */
	int codesize, reservedpages, datasize, initsize;
	int tmp;
	int bad_ppro;

#ifndef CONFIG_DISCONTIGMEM
	if (!mem_map)
		BUG();
#endif

	bad_ppro = ppro_with_ram_bug();

	/* "\n" restored in the messages below ("/n" in the extracted text). */
#ifdef CONFIG_HIGHMEM
	/* check that fixmap and pkmap do not overlap */
	if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
		printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
		printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
				PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
		BUG();
	}
#endif

	set_max_mapnr_init();	/* per the original note: sets up the highmem region */

#ifdef CONFIG_HIGHMEM
	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE);
#else
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
#endif

	/* this will put all low memory onto the freelists */
	totalram_pages += __free_all_bootmem();

	reservedpages = 0;
	for (tmp = 0; tmp < max_low_pfn; tmp++)
		/*
		 * Only count reserved RAM pages
		 */
		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
			reservedpages++;

	set_highmem_pages_init(bad_ppro);

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);	/* register low memory with kcore */
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);	/* register the vmalloc range with kcore */

	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		num_physpages << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10,
		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
	       );

#ifdef CONFIG_X86_PAE
	if (!cpu_has_pae)
		panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
#endif
	if (boot_cpu_data.wp_works_ok < 0)
		test_wp_bit();

	/*
	 * Subtle. SMP is doing it's boot stuff late (because it has to
	 * fork idle threads) - but it also needs low mappings for the
	 * protected-mode entry to work. We zap these entries only after
	 * the WP-bit has been tested.
	 */
#ifndef CONFIG_SMP
	zap_low_mappings();
#endif
}


/* Initialisation.
 * Called after the gfp() functions have been enabled, and before smp_init().
 */
/*
 * kmem_cache_init - bootstrap the slab allocator.
 *
 * Sets up the statically allocated cache_cache, creates the general
 * (kmalloc) caches from malloc_sizes/cache_names, replaces the bootstrap
 * head arrays with kmalloc'ed ones, enables the per-CPU caches for every
 * cache on cache_chain, and registers the CPU notifier.
 */
void __init kmem_cache_init(void)
{
	size_t left_over;
	struct cache_sizes *sizes;
	struct cache_names *names;

	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory.
	 */
	if (num_physpages > (32 << 20) >> PAGE_SHIFT)	/* more than 32MB of RAM */
		slab_break_gfp_order = BREAK_GFP_ORDER_HI;

	
	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
	 *    structures of all caches, except cache_cache itself: cache_cache
	 *    is statically allocated.
	 *    Initially an __init data area is used for the head array, it's
	 *    replaced with a kmalloc allocated array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The kmem_cache_t for the new cache is allocated normally. An __init
	 *    data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
	 * 4) Replace the __init data head arrays for cache_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Resize the head arrays of the kmalloc caches to their final sizes.
	 */

	/* 1) create the cache_cache */
	init_MUTEX(&cache_chain_sem);	/* semaphore protecting the cache chain */
	INIT_LIST_HEAD(&cache_chain);	/* the list of all caches */
	list_add(&cache_cache.next, &cache_chain);	/* cache_cache becomes the first entry on the chain */
	cache_cache.colour_off = cache_line_size();	/* 128 according to the original annotation - TODO confirm */
	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;

	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());

	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
				&left_over, &cache_cache.num);
	if (!cache_cache.num)
		BUG();

	cache_cache.colour = left_over/cache_cache.colour_off;
	cache_cache.colour_next = 0;
	cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
				sizeof(struct slab), cache_line_size());

	/* 2+3) create the kmalloc caches */
	sizes = malloc_sizes;
	names = cache_names;

	/* the size table is terminated by an entry with cs_size == 0 */
	while (sizes->cs_size) {
		/* For performance, all the general caches are L1 aligned.
		 * This should be particularly beneficial on SMP boxes, as it
		 * eliminates "false sharing".
		 * Note for systems short on memory removing the alignment will
		 * allow tighter packing of the smaller caches. */
		sizes->cs_cachep = kmem_cache_create(names->name,
			sizes->cs_size, ARCH_KMALLOC_MINALIGN,
			(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

		/* Inc off-slab bufctl limit until the ceiling is hit. */
		if (!(OFF_SLAB(sizes->cs_cachep))) {
			offslab_limit = sizes->cs_size-sizeof(struct slab);
			offslab_limit /= sizeof(kmem_bufctl_t);
		}

		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
			sizes->cs_size, ARCH_KMALLOC_MINALIGN,
			(ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
			NULL, NULL);

		sizes++;
		names++;
	}
	/* 4) Replace the bootstrap head arrays */
	{
		void * ptr;
		
		/* allocate outside the irq-off section, swap inside it */
		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
		local_irq_disable();
		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
		memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
		cache_cache.array[smp_processor_id()] = ptr;
		local_irq_enable();
	
		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
		local_irq_disable();
		BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
		memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
				sizeof(struct arraycache_init));
		malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
		local_irq_enable();
	}

	/* 5) resize the head arrays to their final sizes */
	{
		kmem_cache_t *cachep;
		down(&cache_chain_sem);
		list_for_each_entry(cachep, &cache_chain, next)
			enable_cpucache(cachep);	/* enable the per-CPU cache */
		up(&cache_chain_sem);
	}

	/* Done! */
	g_cpucache_up = FULL;

	/* Register a cpu startup notifier callback
	 * that initializes ac_data for all new cpus
	 */
	register_cpu_notifier(&cpucache_notifier);
	

	/* The reap timers are started later, with a module init call:
	 * That part of the kernel is not yet operational.
	 */
}

/*
 * pidmap_init - allocate the first pid bitmap page, mark PID 0 as used
 * and attach the current task to PID 0 for every pid type.
 */
void __init pidmap_init(void)
{
	int type;

	pidmap_array[0].page = (void *)get_zeroed_page(GFP_KERNEL);
	set_bit(0, pidmap_array[0].page);
	atomic_dec(&pidmap_array[0].nr_free);

	/*
	 * Allocate PID 0, and hash it via all PID types
	 * (pid, pgid, tgid, sid per the original annotation):
	 */
	type = 0;
	while (type < PIDTYPE_MAX) {
		attach_pid(current, type, 0);
		type++;
	}
}

/*
 * We need to finalize in a non-__init function or else race conditions
 * between the root thread and the init thread may cause start_kernel to
 * be reaped by free_initmem before the root thread has proceeded to
 * cpu_idle.
 *
 * gcc-3.4 accidentally inlines this function, so use noinline.
 */

/*
 * rest_init - final, non-__init stage of boot: spawn the init thread,
 * drop the big kernel lock and enter the idle loop.
 */
static void noinline rest_init(void)
	__releases(kernel_lock)
{
	kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND); /* start the init kernel thread */
	numa_default_policy();
	unlock_kernel();	/* releases the lock named in the __releases annotation above */
 	cpu_idle();
} 

/*
 * init - body of the init kernel thread started by rest_init().
 * @unused: not used.
 *
 * Brings up SMP, populates rootfs, runs the basic setup, frees the
 * __init memory, opens the console on fds 0-2 and then tries to run a
 * user-space init program. Panics if none can be started.
 */
static int init(void * unused)
{
	lock_kernel();
	/*
	 * Tell the world that we're going to be the grim
	 * reaper of innocent orphaned children.
	 *
	 * We don't want people to have to make incorrect
	 * assumptions about where in the task array this
	 * can be found.
	 */
	child_reaper = current;

	/* Sets up cpus_possible() */
	smp_prepare_cpus(max_cpus); /* per the original note: the boot CPU starts each secondary CPU in turn, see smp_boot_cpus -> do_boot_cpu() */

	do_pre_smp_initcalls();	/* per the original note: starts per-CPU threads such as migration_thread and ksoftirqd */

	fixup_cpu_present_map();
	smp_init();	/* per the original note: mainly APIC setup */
	sched_init_smp();

	/*
	 * Do this before initcalls, because some drivers want to access
	 * firmware files.
	 */
	populate_rootfs();	/* per the original note: unpacks the initrd contents */

	do_basic_setup();

	/*
	 * check if there is an early userspace init.  If yes, let it do all
	 * the work
	 */
	if (sys_access((const char __user *) "/init", 0) == 0)
		execute_command = "/init";
	else
		prepare_namespace();	/* per the original note: loads the initrd, installs modules, mounts the root fs */

	/*
	 * Ok, we have completed the initial bootup, and
	 * we're essentially up and running. Get rid of the
	 * initmem segments and start the user-mode stuff..
	 */
	free_initmem();
	unlock_kernel();
	system_state = SYSTEM_RUNNING;
	numa_default_policy();

	if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
		/* "\n" restored: the extracted text had a literal "/n". */
		printk("Warning: unable to open an initial console.\n");

	/* duplicate fd 0 twice (presumably yielding fds 1 and 2) */
	(void) sys_dup(0);
	(void) sys_dup(0);

	/*
	 * We try each of these until one succeeds.
	 *
	 * The Bourne shell can be used instead of init if we are
	 * trying to recover a really broken machine.
	 */

	if (execute_command)
		run_init_process(execute_command);

	run_init_process("/sbin/init");
	run_init_process("/etc/init");
	run_init_process("/bin/init");
	run_init_process("/bin/sh");

	panic("No init found.  Try passing init= option to kernel.");
}

猜你喜欢

转载自blog.csdn.net/enlaihe/article/details/7916027