Linux memory management: Bootmem takes the lead


Articles in this series:

  • Linux memory management: Bootmem takes the lead — the Bootmem boot-time memory allocator
  • Linux Memory Management: The Buddy System is long overdue — the Buddy System memory allocator
  • Linux memory management: Slab makes its debut — the Slab memory allocator

This is the first article in the source code analysis column.

The column is divided into four major modules for analysis: memory management, device management, system startup, and other parts.

Memory management is covered in three parts: Bootmem, Buddy System, and Slab. This article mainly explains the Bootmem startup process.


The Bootmem allocator is the boot-time allocator that manages physical memory during the early stages of Linux startup, providing physical memory allocation and deallocation. As the first real memory allocator in the kernel initialization process, it serves the kernel's early initialization activities and lays the groundwork for creating the Buddy allocator. Once physical memory management is handed over to the Buddy allocator, Bootmem's mission is complete, and the kernel officially switches to the Buddy allocator to manage system physical memory.

Reference environment

  • Source code version: Linux 2.6.30.4
  • Editor: VSCode
  • Architecture: ARM/Linux

Skipping the earlier steps, this article focuses on the bootmem initialization process. Before it runs, the location of the kernel page table swapper_pg_dir has already been determined.

void __init paging_init(struct machine_desc *mdesc)
{
	// ..
	bootmem_init();

bootmem_init

void __init bootmem_init(void)
{
	struct meminfo *mi = &meminfo;  // get the memory information

Memory information is obtained from meminfo; struct meminfo describes each memory bank:

struct meminfo {
	int nr_banks;  // number of banks
	struct membank bank[NR_BANKS];  // array of banks
};

And each bank is a struct membank:

struct membank {
	unsigned long start; // start address (physical)
	unsigned long size;  // size
	int           node;  // node the bank belongs to
};

There is also a step initrd_node = check_initrd(mi); that finds the node holding the initial RAM disk. Its use comes up later, so it is not explained here.

	for_each_node(node) {
		unsigned long end_pfn = bootmem_init_node(node, mi);

The nodes are traversed with for_each_node, which is defined as

#define for_each_node(node)	   for_each_node_state(node, N_POSSIBLE)

#define for_each_node_state(__node, __state) \
	for_each_node_mask((__node), node_states[__state])

#define for_each_node_mask(node, mask)			\
	if (!nodes_empty(mask))				\
		for ((node) = 0; (node) < 1; (node)++)

The macro expands to

if (!nodes_empty(node_states[N_POSSIBLE])) {
	for ((node) = 0; (node) < 1; (node)++) {
		// ...
	}
}

node_states is defined as follows; we will come back to it later.

enum node_states {
	N_POSSIBLE,		/* The node could become online at some point */
	N_ONLINE,		/* The node is online */
	N_NORMAL_MEMORY,	/* The node has regular memory */
#ifdef CONFIG_HIGHMEM
	N_HIGH_MEMORY,		/* The node has regular or high memory */
#else
	N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
	N_CPU,		/* The node has one or more cpus */
	NR_NODE_STATES
};

bootmem_init_node

		// the node's end_pfn is obtained here
		unsigned long end_pfn = bootmem_init_node(node, mi);
	pg_data_t *pgdat; // describes one node

pg_data_t represents a node.

	for_each_nodebank(i, mi, node)

The macro definition expands to

#define for_each_nodebank(iter,mi,no)			\
	for (iter = 0; iter < (mi)->nr_banks; iter++)	\
		if ((mi)->bank[iter].node == no)

for (i = 0; i < (mi)->nr_banks; i++) {
	if ((mi)->bank[i].node == node) {
		// find the banks belonging to this node (node numbering starts at 0)
	}
}

	struct membank *bank = &mi->bank[i]; // get this bank

Get the starting and ending page frame numbers (derived from physical addresses):

		start = bank_pfn_start(bank); // starting page frame number
		end = bank_pfn_end(bank);     // ending page frame number

If you have time, you can take a look at these two macros

#define bank_pfn_start(bank)	__phys_to_pfn((bank)->start)
#define bank_pfn_end(bank)	__phys_to_pfn((bank)->start + (bank)->size)

#define	__phys_to_pfn(paddr)	((paddr) >> PAGE_SHIFT)

The right shift by PAGE_SHIFT deserves a note: shifting an address right by PAGE_SHIFT yields the page frame number of the page containing that address.

Another approach uses PAGE_MASK: ANDing an address with PAGE_MASK clears the in-page offset bits, leaving the page-aligned base address.

/* PAGE_SHIFT determines the page size */
#define PAGE_SHIFT		12
#define PAGE_SIZE		(1UL << PAGE_SHIFT)
#define PAGE_MASK		(~(PAGE_SIZE-1))
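To make the arithmetic concrete, here is a small user-space sketch (reusing the PAGE_SHIFT = 12 definitions above; the address value is invented for illustration) showing both ways of splitting an address:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define __phys_to_pfn(paddr) ((paddr) >> PAGE_SHIFT)

int main(void)
{
	unsigned long paddr = 0x30001A38UL;  /* hypothetical physical address */

	/* shifting right by PAGE_SHIFT drops the 12 offset bits: the PFN */
	printf("pfn       = 0x%lx\n", __phys_to_pfn(paddr));  /* 0x30001 */
	/* ANDing with PAGE_MASK keeps the page base, clearing the offset */
	printf("page base = 0x%lx\n", paddr & PAGE_MASK);     /* 0x30001000 */
	/* the in-page offset is the complementary part */
	printf("offset    = 0x%lx\n", paddr & ~PAGE_MASK);    /* 0xA38 */
	return 0;
}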

map_memory_bank

	map_memory_bank(bank);

That is, mapping the bank into the kernel page tables:

static inline void map_memory_bank(struct membank *bank)
{
#ifdef CONFIG_MMU
	struct map_desc map;

	map.pfn = bank_pfn_start(bank);
	map.virtual = __phys_to_virt(bank_phys_start(bank));
	map.length = bank_phys_size(bank);
	map.type = MT_MEMORY;

	create_mapping(&map);
#endif
}

The map_desc structure is:

struct map_desc {
	unsigned long virtual; // starting virtual address
	unsigned long pfn;     // starting page frame number
	unsigned long length;  // length
	unsigned int type;     // bank type
};

Among them, map.virtual is obtained via:

#define __phys_to_virt(x)	((x) - PHYS_OFFSET + PAGE_OFFSET)

#define bank_phys_start(bank)	(bank)->start

create_mapping

This performs the actual mapping:

void __init create_mapping(struct map_desc *md)
{
	unsigned long phys, addr, length, end;
	const struct mem_type *type;
	pgd_t *pgd;

This guards against two situations: a virtual address in the user region (0-3G) that is not the interrupt vector table, and a mapping whose memory type is IO (MT_DEVICE) or ROM (MT_ROM) but whose virtual address lands in the kernel low memory mapping region (3G to 3G+768MB):

	if ((md->type == MT_DEVICE || md->type == MT_ROM) &&
	    md->virtual >= PAGE_OFFSET && md->virtual < VMALLOC_END) {
		printk(KERN_WARNING "BUG: mapping for 0x%08llx at 0x%08lx "
		       "overlaps vmalloc space\n",
		       __pfn_to_phys((u64)md->pfn), md->virtual);
	}

The check md->pfn >= 0x100000 tests whether the page frame number lies at or beyond the 4GB boundary (0x100000 frames * 4KB = 4GB); that case takes the 36-bit mapping path and is not covered here.

	type = &mem_types[md->type]; // get the memory type

	if (md->pfn >= 0x100000) {
		create_36bit_mapping(md, type);
		return;
	}

addr is the virtual address, phys the physical address, and length the mapping length. (Note: md->virtual & PAGE_MASK is used here, yielding the page-aligned virtual address, not a page number.)

	addr = md->virtual & PAGE_MASK; // page-aligned virtual address
	phys = (unsigned long)__pfn_to_phys(md->pfn);
	length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));

Get the page directory entry for this virtual address in the kernel page directory swapper_pg_dir:

	pgd = pgd_offset_k(addr);
	end = addr + length; // end of the mapped range
#define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)

#define pgd_offset(mm, addr)	((mm)->pgd+pgd_index(addr))

#define pgd_index(addr)		((addr) >> PGDIR_SHIFT) // index into the page directory

Populate the page directory table for mapping

	do {
		unsigned long next = pgd_addr_end(addr, end); // next 2MB-aligned boundary
		// update the directory entries for this bank according to the
		// memory type saved in the type variable
		alloc_init_section(pgd, addr, next, phys, type);

		phys += next - addr;
		addr = next;
	} while (pgd++, addr != end); // fetch the next 2MB chunk until end is reached

alloc_init_section

One section is 1MB (SECTION_SHIFT = 20), while each pgd entry covers 2MB, i.e. two sections:

#define pgd_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;	\
	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
})

#define PGDIR_SIZE		(1UL << PGDIR_SHIFT) // 2MB
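To see pgd_addr_end in action, here is a user-space sketch (PGDIR_SHIFT = 21 so that PGDIR_SIZE = 2MB, matching the comment above; the address range is invented):

#include <stdio.h>

#define PGDIR_SHIFT 21                     /* 2MB per pgd step */
#define PGDIR_SIZE  (1UL << PGDIR_SHIFT)
#define PGDIR_MASK  (~(PGDIR_SIZE - 1))

/* same logic as the kernel macro: next 2MB boundary, capped at end */
static unsigned long pgd_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	unsigned long addr = 0xC0000000UL;  /* PAGE_OFFSET */
	unsigned long end  = 0xC0500000UL;  /* a 5MB mapping */

	while (addr != end) {
		unsigned long next = pgd_addr_end(addr, end);
		printf("chunk 0x%08lx - 0x%08lx\n", addr, next);
		addr = next;  /* yields 2MB, 2MB, then the final 1MB */
	}
	return 0;
}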

The mapping itself follows:

#define	pmd_offset(a, b)	((void *)0)

#define __pmd(x)        (x)

#define SECTION_SIZE		(1UL << SECTION_SHIFT)  
#define SECTION_SHIFT		20

static void __init alloc_init_section(pgd_t *pgd, unsigned long addr,
				      unsigned long end, unsigned long phys,
				      const struct mem_type *type)
{
	pmd_t *pmd = pmd_offset(pgd, addr);

	if (((addr | end | phys) & ~SECTION_MASK) == 0) {
		pmd_t *p = pmd;

		if (addr & SECTION_SIZE)
			pmd++;

		do {
			*pmd = __pmd(phys | type->prot_sect);  // section entry: physical base | attributes
			phys += SECTION_SIZE; // advance the physical address by one section (1MB)
		} while (pmd++, addr += SECTION_SIZE, addr != end);

		flush_pmd_entry(p);
	} else {
		/*
		 * No need to loop; pte's aren't interested in the
		 * individual L1 entries.
		 */
		alloc_init_pte(pmd, addr, end, __phys_to_pfn(phys), type);
	}
}

Thus each bank of a node gets a one-to-one (linear, flat-model) mapping:

		map_memory_bank(bank);
	}

Back in bootmem_init_node, the next step is:

		start = bank_pfn_start(bank); // starting page frame number
		end = bank_pfn_end(bank);     // ending page frame number

		if (start_pfn > start)
			start_pfn = start;
		if (end_pfn < end)
			end_pfn = end;

This yields the node's lowest starting and highest ending page frame numbers.

bootmem_bootmap_pages

	boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);	

Here end_pfn - start_pfn is the number of page frames spanned by the node.

static unsigned long __init bootmap_bytes(unsigned long pages)
{
	unsigned long bytes = (pages + 7) / 8;  // one bit per page frame

	return ALIGN(bytes, sizeof(long));
}

unsigned long __init bootmem_bootmap_pages(unsigned long pages) // how many pages the bitmap occupies
{
	unsigned long bytes = bootmap_bytes(pages);

	return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
}

The macro is defined as

#define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)

#define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))

#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)

#define PAGE_SHIFT		12
#define PAGE_SIZE		(1UL << PAGE_SHIFT)

The function computes the number of bytes needed for a bitmap with one bit per page frame, rounded up to sizeof(long) (i.e. 4-byte units), and then converts that byte count into whole pages.

Macro expansion

unsigned long __init bootmem_bootmap_pages(unsigned long pages)
{
	unsigned long bytes = bootmap_bytes(pages);
	// round up to whole pages
	return (((bytes)+((typeof(bytes))((1UL << 12))-1))&~((typeof(bytes))((1UL << 12))-1)) >> 12;
}

static unsigned long __init bootmap_bytes(unsigned long pages)
{
	unsigned long bytes = (pages + 7) / 8;

	return (((bytes)+((typeof(bytes))(sizeof(long))-1))&~((typeof(bytes))(sizeof(long))-1));
}

The expression looks complicated, but the takeaway is simple: one bitmap page (4KB) holds 4 * 1024 * 8 = 32768 bits, so it can track 32768 page frames.
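As a sanity check, here is the same arithmetic in a standalone sketch (the 64MB figure is just an example):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* one bit per page frame, rounded up to sizeof(long) */
static unsigned long bootmap_bytes(unsigned long pages)
{
	unsigned long bytes = (pages + 7) / 8;
	return (bytes + sizeof(long) - 1) & ~(sizeof(long) - 1);
}

/* then rounded up to whole pages */
static unsigned long bootmem_bootmap_pages(unsigned long pages)
{
	return (bootmap_bytes(pages) + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long pages = (64UL << 20) >> PAGE_SHIFT;  /* 64MB = 16384 frames */

	printf("bitmap bytes = %lu\n", bootmap_bytes(pages));         /* 2048 */
	printf("bitmap pages = %lu\n", bootmem_bootmap_pages(pages)); /* 1 */
	return 0;
}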

find_bootmap_pfn

With the number of bootmap pages computed by bootmem_bootmap_pages, the next step is deciding where to place those pages.

static unsigned int __init
find_bootmap_pfn(int node, struct meminfo *mi, unsigned int bootmap_pages)
{
	unsigned int start_pfn, i, bootmap_pfn;

start_pfn refers to the first page frame after _end, the end of the kernel image; __pa converts a virtual address to a physical one.

	start_pfn   = PAGE_ALIGN(__pa(_end)) >> PAGE_SHIFT;
	bootmap_pfn = 0;
#define __pa(x)			__virt_to_phys((unsigned long)(x))
#define __va(x)			((void *)__phys_to_virt((unsigned long)(x)))

Let’s look at the specific process below

	for_each_nodebank(i, mi, node) {
		struct membank *bank = &mi->bank[i];
		unsigned int start, end;

		start = bank_pfn_start(bank);  // the bank's starting page frame number
		end   = bank_pfn_end(bank);    // the bank's ending page frame number

		if (end < start_pfn) // bank lies entirely below the kernel image
			continue;

		if (start < start_pfn)
			start = start_pfn;  // the start must lie beyond the kernel (bss) page frame

		if (end <= start)
			continue;

		if (end - start >= bootmap_pages) {
			bootmap_pfn = start; // the bank must have room for the bootmap pages
			break;
		}
	}

	if (bootmap_pfn == 0)
		BUG();

	return bootmap_pfn;
}

The specific requirements are:

  • the starting position must lie after the kernel image (the bss section), and
  • the bank must have at least bootmap_pages free page frames.

Such a spot is usually easy to find.

node_set_online

There is not much to this one: it simply sets the current node's state to ONLINE.

	/*
	 * Initialise the bootmem allocator for this node, handing the
	 * memory banks over to bootmem.
	 */
	node_set_online(node);

The relevant functions and macros are defined as

#define node_set_online(node)	   node_set_state((node), N_ONLINE)

static inline void node_set_state(int node, enum node_states state)
{
	__node_set(node, &node_states[state]);
}

static inline void __node_set(int node, volatile nodemask_t *dstp)
{
	set_bit(node, dstp->bits);
}

#define set_bit(nr,p)			ATOMIC_BITOP_LE(set_bit,nr,p)

#define	ATOMIC_BITOP_LE(name,nr,p)		\
	(__builtin_constant_p(nr) ?		\
	 ____atomic_##name(nr, p) :		\
	 _##name##_le(nr,p))

#define	ATOMIC_BITOP_BE(name,nr,p)		\
	(__builtin_constant_p(nr) ?		\
	 ____atomic_##name(nr, p) :		\
	 _##name##_be(nr,p))
#else
#define ATOMIC_BITOP_LE(name,nr,p)	_##name##_le(nr,p)
#define ATOMIC_BITOP_BE(name,nr,p)	_##name##_be(nr,p)
#endif

static inline void ____atomic_set_bit(unsigned int bit, volatile unsigned long *p)
{
	unsigned long flags;
	unsigned long mask = 1UL << (bit & 31);

	p += bit >> 5;

	raw_local_irq_save(flags);
	*p |= mask;
	raw_local_irq_restore(flags);
}
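The index math deserves a note: bit >> 5 picks the 32-bit word containing the bit, and bit & 31 the position inside that word. A minimal user-space sketch of the same math (without the IRQ save/restore, and assuming 32-bit words as on this ARM target):

#include <stdio.h>

/* same indexing as ____atomic_set_bit, minus the locking */
static void set_bit_demo(unsigned int bit, unsigned long *p)
{
	unsigned long mask = 1UL << (bit & 31);  /* position within the word */

	p += bit >> 5;                           /* which 32-bit word */
	*p |= mask;
}

int main(void)
{
	unsigned long bits[2] = { 0, 0 };

	set_bit_demo(3, bits);   /* word 0, bit 3 */
	set_bit_demo(33, bits);  /* word 1, bit 1 */
	printf("bits[0]=0x%lx bits[1]=0x%lx\n", bits[0], bits[1]);  /* 0x8 0x2 */
	return 0;
}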

The node states are the node_states values shown earlier (N_POSSIBLE, N_ONLINE, N_NORMAL_MEMORY, N_HIGH_MEMORY, N_CPU).

NODE_DATA

	pgdat = NODE_DATA(node);

where the macro is defined as

/*
 * Return a pointer to the node data for node n.
 */
#define NODE_DATA(nid)		(&discontig_node_data[nid])

discontig_node_data is an array of node descriptors, so NODE_DATA returns the corresponding node descriptor of type pg_data_t:

pg_data_t discontig_node_data[MAX_NUMNODES] = {
  { .bdata = &bootmem_node_data[0] },
  { .bdata = &bootmem_node_data[1] },
  { .bdata = &bootmem_node_data[2] },
  { .bdata = &bootmem_node_data[3] },
#if MAX_NUMNODES == 16
  { .bdata = &bootmem_node_data[4] },
  { .bdata = &bootmem_node_data[5] },
  { .bdata = &bootmem_node_data[6] },
  { .bdata = &bootmem_node_data[7] },
  { .bdata = &bootmem_node_data[8] },
  { .bdata = &bootmem_node_data[9] },
  { .bdata = &bootmem_node_data[10] },
  { .bdata = &bootmem_node_data[11] },
  { .bdata = &bootmem_node_data[12] },
  { .bdata = &bootmem_node_data[13] },
  { .bdata = &bootmem_node_data[14] },
  { .bdata = &bootmem_node_data[15] },
#endif
};

The pg_data_t type is as follows:

typedef struct pglist_data {
	struct zone node_zones[MAX_NR_ZONES]; // the node's zones
	struct zonelist node_zonelists[MAX_ZONELISTS]; // fallback zone lists
	int nr_zones; // number of zones
#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
	struct page *node_mem_map;  // points to this node's first entry in the page array
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
	struct page_cgroup *node_page_cgroup;
#endif
#endif
	struct bootmem_data *bdata; // bootmem bookkeeping for this node
#ifdef CONFIG_MEMORY_HOTPLUG
	/*
	 * Must be held any time you expect node_start_pfn, node_present_pages
	 * or node_spanned_pages stay constant.  Holding this will also
	 * guarantee that any pfn_valid() stays that way.
	 *
	 * Nests above zone->lock and zone->size_seqlock.
	 */
	spinlock_t node_size_lock; 
#endif
	unsigned long node_start_pfn; // starting page frame number
	unsigned long node_present_pages; /* total number of physical pages */
	unsigned long node_spanned_pages; /* total pages including holes */
	int node_id; // node number
	wait_queue_head_t kswapd_wait; // wait queue for the swap daemon
	struct task_struct *kswapd; // task_struct pointer of the swap daemon
	int kswapd_max_order; // used by the swap subsystem to size free areas
} pg_data_t;

init_bootmem_node

Initialize the member variables of the node descriptor

	init_bootmem_node(pgdat, boot_pfn, start_pfn, end_pfn);

bdata points to a bootmem_data structure that manages the bitmap. After bdata is linked into the list, all bitmap bits are set to in-use.

typedef struct bootmem_data {
	unsigned long node_min_pfn;  // the node's starting page frame number
	unsigned long node_low_pfn;  // the node's last page frame number
	void *node_bootmem_map;      // the management bitmap
	unsigned long last_end_off;  // offset of the last allocation
	unsigned long hint_idx;      // hint for the next page to try
	struct list_head list;       // list linkage
} bootmem_data_t;

That is:

unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
				unsigned long startpfn, unsigned long endpfn)
{
	return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
}

static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
	unsigned long mapstart, unsigned long start, unsigned long end)
{
	unsigned long mapsize;

	mminit_validate_memmodel_limits(&start, &end);
	bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));  // set up the member fields
	bdata->node_min_pfn = start;
	bdata->node_low_pfn = end;
	link_bootmem(bdata);   // insert this bootmem node into the list

	/*
	 * Initially all pages are reserved - setup_arch() has to
	 * register free RAM areas explicitly.
	 */
	mapsize = bootmap_bytes(end - start);
	// set the whole management bitmap node_bootmem_map to 0xff,
	// i.e. mark every page as in use: no free pages yet
	memset(bdata->node_bootmem_map, 0xff, mapsize);

	bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
		bdata - bootmem_node_data, start, mapstart, end, mapsize);

	return mapsize;
}

static void __init link_bootmem(bootmem_data_t *bdata)
{
	struct list_head *iter;

	list_for_each(iter, &bdata_list) {
		bootmem_data_t *ent;

		ent = list_entry(iter, bootmem_data_t, list);
		if (bdata->node_min_pfn < ent->node_min_pfn)
			break;
	}
	list_add_tail(&bdata->list, iter);
}

One spot deserves attention: phys_to_virt(PFN_PHYS(mapstart)). mapstart (boot_pfn) is a physical page frame number, and since #define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT), the macro first yields the starting physical address of that frame, which phys_to_virt then converts into a kernel virtual address.
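A sketch of that conversion chain, with PHYS_OFFSET/PAGE_OFFSET values typical of an ARM board of this era (both are assumptions; the real values are platform-specific):

#include <stdio.h>

#define PAGE_SHIFT  12
#define PHYS_OFFSET 0x30000000UL  /* assumed RAM base (board-specific) */
#define PAGE_OFFSET 0xC0000000UL  /* kernel virtual base */

#define PFN_PHYS(x)     ((unsigned long)(x) << PAGE_SHIFT)
#define phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET)

int main(void)
{
	unsigned long boot_pfn = 0x30200;  /* hypothetical bitmap placement */
	unsigned long phys = PFN_PHYS(boot_pfn);  /* 0x30200000 */
	unsigned long virt = phys_to_virt(phys);  /* 0xC0200000 */

	printf("pfn 0x%lx -> phys 0x%lx -> virt 0x%lx\n", boot_pfn, phys, virt);
	return 0;
}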

free_bootmem_node

	for_each_nodebank(i, mi, node) {
		struct membank *bank = &mi->bank[i];
		free_bootmem_node(pgdat, bank_phys_start(bank), bank_phys_size(bank));
		memory_present(node, bank_pfn_start(bank), bank_pfn_end(bank));
	}

Previously the node's management bitmap node_bootmem_map was set entirely to 1; here the bits covering each bank are cleared to 0, marking those pages free.

void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
			      unsigned long size)
{
	unsigned long start, end;

	start = PFN_UP(physaddr);
	end = PFN_DOWN(physaddr + size);

	mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
}

reserve_bootmem_node

When memory allocation begins, the pages holding the management bitmap itself must be excluded from allocation.

The corresponding bits in the management bitmap pgdat->bdata->node_bootmem_map are set to 1, marking them unavailable (these are the bootmap pages themselves).

This resolves the classic "chicken-and-egg" problem: the allocator's own bitmap has to be reserved through the bitmap it lives in.

	reserve_bootmem_node(pgdat, boot_pfn << PAGE_SHIFT,
			     boot_pages << PAGE_SHIFT, BOOTMEM_DEFAULT);

The purpose of boot_pfn << PAGE_SHIFT and boot_pages << PAGE_SHIFT should be clear by now: together they determine the start address and the end address, with everything converted into byte addresses.

One page is 4KB, so boot_pages << PAGE_SHIFT = boot_pages * 2^12 bytes = boot_pages * 4KB, giving the size in bytes.

#define BOOTMEM_DEFAULT		0

The function body is:

int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
				 unsigned long size, int flags)
{
	unsigned long start, end;

	start = PFN_DOWN(physaddr);
	end = PFN_UP(physaddr + size);

	return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
}

The macro is defined as

// round the address up to the next page frame boundary
#define PFN_UP(x)	(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
// round the address down to a page frame boundary
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

#define BOOTMEM_DEFAULT		0
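Note the asymmetry between the two callers: free_bootmem_node rounds the start up and the end down, so only wholly contained pages are freed, while reserve_bootmem_node rounds the start down and the end up, so any partially covered page stays reserved. A small sketch of the rounding (the byte range is invented):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

#define PFN_UP(x)   (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)  /* round up */
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)                  /* round down */

int main(void)
{
	unsigned long addr = 0x30000800UL, size = 0x2000UL;  /* unaligned range */

	/* freeing shrinks to fully contained pages */
	printf("free:    pfn 0x%lx..0x%lx\n", PFN_UP(addr), PFN_DOWN(addr + size));
	/* reserving grows to cover every touched page */
	printf("reserve: pfn 0x%lx..0x%lx\n", PFN_DOWN(addr), PFN_UP(addr + size));
	return 0;
}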

One more detour before moving on.

reserve_node_zero

Reserving the special regions of node 0.

		/*
		 * Reserve any special node zero regions.
		 */
		if (node == 0)
			reserve_node_zero(NODE_DATA(node));
void __init reserve_node_zero(pg_data_t *pgdat)
{
	unsigned long res_size = 0;

In node 0's management bitmap, this function marks the regions occupied by the kernel image and its page directory as in use.

The kernel image location is set to 1 (in use):

	reserve_bootmem_node(pgdat, __pa(_stext), _end - _stext,
			BOOTMEM_DEFAULT);

The page directory table location is set to 1 (in use):

		reserve_bootmem_node(pgdat, PHYS_OFFSET, res_size,
				BOOTMEM_DEFAULT);

Note that this is where swapper_pg_dir, the kernel page directory, lives.

bootmem_reserve_initrd

Excluding the initial RAM disk (initrd) region.

		if (node == initrd_node)
			bootmem_reserve_initrd(node);

The function is as follows, marking the pages backing the RAM disk area as in use:

static void __init bootmem_reserve_initrd(int node)
{
#ifdef CONFIG_BLK_DEV_INITRD
	pg_data_t *pgdat = NODE_DATA(node);
	int res;

	res = reserve_bootmem_node(pgdat, phys_initrd_start,
			     phys_initrd_size, BOOTMEM_EXCLUSIVE);

	if (res == 0) {
		initrd_start = __phys_to_virt(phys_initrd_start);
		initrd_end = initrd_start + phys_initrd_size;
	} else {
		printk(KERN_ERR
			"INITRD: 0x%08lx+0x%08lx overlaps in-use "
			"memory region - disabling initrd\n",
			phys_initrd_start, phys_initrd_size);
	}
#endif
}

		if (end_pfn > memend_pfn)
			memend_pfn = end_pfn;

This tracks the largest ending page frame number across all nodes.

bootmem_free_node

	/*
	 * Now free memory in each node - free_area_init_node needs
	 * the sparse mem_map arrays initialized by sparse_init()
	 * for memmap_init_zone(), otherwise all PFNs are invalid.
	 */
	for_each_node(node)
		bootmem_free_node(node, mi);

Let's analyze the function body:

	pg_data_t *pgdat = NODE_DATA(node); // get the node descriptor
	int i;

	start_pfn = pgdat->bdata->node_min_pfn; // starting page frame
	end_pfn = pgdat->bdata->node_low_pfn;   // ending page frame

	// initialize zone_size (the sizes of the node's zones) and
	// zhole_size (the sizes of the holes contained in each zone)
	memset(zone_size, 0, sizeof(zone_size));
	memset(zhole_size, 0, sizeof(zhole_size));

A node is divided into zones, mainly ZONE_DMA, ZONE_NORMAL and ZONE_HIGHMEM.

	zone_size[0] = end_pfn - start_pfn;
	zhole_size[0] = zone_size[0];

All of the node's page frames are recorded in zone_size[0]. The zone_size array holds the page counts of the different zone types; the zhole_size array holds the corresponding numbers of hole pages.

Both zone_size and zhole_size have 3 elements; for each zone they record the current node's page frame count and hole count respectively.

	for_each_nodebank(i, mi, node)
		zhole_size[0] -= bank_pfn_size(&mi->bank[i]);

This loop subtracts each bank's page frame count (bank_pfn_size) from zhole_size[0]. A bank itself cannot contain holes, so what remains after subtracting all banks is the number of hole pages.
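A worked example with invented numbers: suppose the node spans 0x8000 page frames but its two banks cover only 0x4000 and 0x2000 frames; the remaining 0x2000 frames are holes:

#include <stdio.h>

int main(void)
{
	unsigned long zone_size = 0x8000;               /* end_pfn - start_pfn */
	unsigned long bank_pfns[] = { 0x4000, 0x2000 }; /* hypothetical banks */
	unsigned long zhole_size = zone_size;
	int i;

	/* banks contain no holes, so whatever they don't cover is a hole */
	for (i = 0; i < 2; i++)
		zhole_size -= bank_pfns[i];

	printf("spanned=0x%lx holes=0x%lx\n", zone_size, zhole_size);  /* 0x8000, 0x2000 */
	return 0;
}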

	// node is the node number
	// zhole_size is the number of hole pages
	// zone_size is the total page count including holes
	// start_pfn is the starting page frame number
	free_area_init_node(node, zone_size, start_pfn, zhole_size);

free_area_init_node

This function initializes the pg_data_t, zone and page data of every node in the (NUMA) system.

Let's look at the zone structure first:

struct zone {
	unsigned long		pages_min, pages_low, pages_high; // watermarks used by the page allocator
	unsigned long		lowmem_reserve[MAX_NR_ZONES];  // reserves protecting lower zones from overcommit

#ifdef CONFIG_NUMA
	int node;
	unsigned long		min_unmapped_pages;
	unsigned long		min_slab_pages;
	// one small page cache per CPU, bypassing the buddy system
	// so that single pages can be allocated quickly
	struct per_cpu_pageset	*pageset[NR_CPUS];
#else
	struct per_cpu_pageset	pageset[NR_CPUS];
#endif
	spinlock_t		lock;
#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t		span_seqlock;
#endif
	struct free_area	free_area[MAX_ORDER];  // backbone of the buddy system: tracks free pages

#ifndef CONFIG_SPARSEMEM
	unsigned long		*pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
	ZONE_PADDING(_pad1_)

	spinlock_t		lru_lock;	
	struct {
		struct list_head list;
		unsigned long nr_scan;
	} lru[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;

	unsigned long		pages_scanned;	   /* since last reclaim */
	unsigned long		flags;		   /* zone flags, see below */

	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];

	int prev_priority;

	unsigned int inactive_ratio;


	ZONE_PADDING(_pad2_)

	wait_queue_head_t	* wait_table;
	unsigned long		wait_table_hash_nr_entries;
	unsigned long		wait_table_bits;

	struct pglist_data	*zone_pgdat;
	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long		zone_start_pfn;  // the zone's first page frame number

	unsigned long		spanned_pages;	/* total pages including holes */
	unsigned long		present_pages;	/* actual pages, holes excluded */

	const char		*name;
} ____cacheline_internodealigned_in_smp;

struct free_area {
	struct list_head	free_list[MIGRATE_TYPES];
	unsigned long		nr_free;
};

There is an important set of macro definitions worth pulling out first:

#define MIGRATE_UNMOVABLE     0
#define MIGRATE_RECLAIMABLE   1
#define MIGRATE_MOVABLE       2
#define MIGRATE_RESERVE       3
#define MIGRATE_ISOLATE       4 /* can't allocate from here */
#define MIGRATE_TYPES         5

Let’s look at the function body

void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
		unsigned long node_start_pfn, unsigned long *zholes_size)
{
	pg_data_t *pgdat = NODE_DATA(nid);  // get the node descriptor pgdat

	pgdat->node_id = nid; // set the node number
	pgdat->node_start_pfn = node_start_pfn; // set the starting page frame number
	// compute node_spanned_pages (total pages including holes) and
	// node_present_pages (pages excluding holes)
	calculate_node_totalpages(pgdat, zones_size, zholes_size);

	alloc_node_mem_map(pgdat); // point node_mem_map at this node's first entry in the page array
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
		nid, (unsigned long)pgdat,
		(unsigned long)pgdat->node_mem_map);
#endif
	/* initialize the structures of each of the node's zones [DMA, NORMAL, HIGH] */
	free_area_init_core(pgdat, zones_size, zholes_size); // fill in the zone structures
}

alloc_node_mem_map and calculate_node_totalpages are worth a brief look; they are relatively easy to understand.

static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long realtotalpages, totalpages = 0;
	enum zone_type i;

	for (i = 0; i < MAX_NR_ZONES; i++)
		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
								zones_size);
	pgdat->node_spanned_pages = totalpages;

	realtotalpages = totalpages;
	for (i = 0; i < MAX_NR_ZONES; i++)
		realtotalpages -=
			zone_absent_pages_in_node(pgdat->node_id, i,
								zholes_size);
	pgdat->node_present_pages = realtotalpages;
	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
							realtotalpages);
}
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
	/* Skip empty nodes */
	if (!pgdat->node_spanned_pages)
		return;

#ifdef CONFIG_FLAT_NODE_MEM_MAP
	if (!pgdat->node_mem_map) {
		unsigned long size, start, end;
		struct page *map;

		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
		end = ALIGN(end, MAX_ORDER_NR_PAGES);
		size =  (end - start) * sizeof(struct page);
		map = alloc_remap(pgdat->node_id, size);
		if (!map)
			map = alloc_bootmem_node(pgdat, size);
		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
	}
#ifndef CONFIG_NEED_MULTIPLE_NODES
	/*
	 * With no DISCONTIG (discontiguous memory model), the global mem_map is just set as node 0's
	 */
	if (pgdat == NODE_DATA(0)) {
		mem_map = NODE_DATA(0)->node_mem_map;
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
			mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
	}
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}

Why does pgdat->node_mem_map need the final adjustment? Because the allocated map starts at the MAX_ORDER-aligned page frame (start), so map + (pgdat->node_start_pfn - start) makes node_mem_map point at the entry for the node's actual first page frame.
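The offset arithmetic in miniature (MAX_ORDER = 11 as below, so MAX_ORDER_NR_PAGES = 1024; the node_start_pfn value is invented):

#include <stdio.h>

#define MAX_ORDER          11
#define MAX_ORDER_NR_PAGES (1UL << (MAX_ORDER - 1))  /* 1024 frames */

int main(void)
{
	unsigned long node_start_pfn = 0x30120;  /* hypothetical, unaligned */

	/* the array is allocated from the aligned-down start... */
	unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
	/* ...so node_mem_map must skip the entries before the real start */
	unsigned long skip = node_start_pfn - start;

	printf("array base pfn 0x%lx, node_mem_map = map + %lu\n", start, skip);
	return 0;  /* prints: array base pfn 0x30000, node_mem_map = map + 288 */
}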

free_area_init_core

The heavy lifting of initializing the zone data structures is done by free_area_init_core:

	/* initialize the structures of each of the node's zones [DMA, NORMAL, HIGH] */
	free_area_init_core(pgdat, zones_size, zholes_size); // fill in the zone structures
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	enum zone_type j;
	int nid = pgdat->node_id;
	unsigned long zone_start_pfn = pgdat->node_start_pfn;
	int ret;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, realsize, memmap_pages;
		enum lru_list l;
		// size is the number of page frames in the zone, holes included
		size = zone_spanned_pages_in_node(nid, j, zones_size);
		// realsize is the page frame count excluding holes
		realsize = size - zone_absent_pages_in_node(nid, j,
								zholes_size);

The premise here is that the page frame counts of each zone were already set at an earlier stage.

So when were the zone page frame counts set?

In other words, size is obtained here, but where does its value come from?

	size = zone_spanned_pages_in_node(nid, j, zones_size);	

Take a closer look

static unsigned long __meminit zone_spanned_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long *ignored)
{
	unsigned long node_start_pfn, node_end_pfn;
	unsigned long zone_start_pfn, zone_end_pfn;

	/* Get the start and end of the node and zone */
	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
	// get the zone's starting and ending page frames
	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
	// carve out the part belonging to the movable zone
	adjust_zone_range_for_zone_movable(nid, zone_type,
				node_start_pfn, node_end_pfn,
				&zone_start_pfn, &zone_end_pfn);

	/* Check that this node has pages within the zone's required range */
	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
		return 0;

	/* Move the zone boundaries inside the node if necessary */
	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
	zone_start_pfn = max(zone_start_pfn, node_start_pfn);

	/* Return the spanned pages */
	return zone_end_pfn - zone_start_pfn;
}
#else
static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long *zones_size)
{
	return zones_size[zone_type];
}

So zone_start_pfn and zone_end_pfn are read from the two arrays arch_zone_lowest_possible_pfn[] and arch_zone_highest_possible_pfn[].

As for those two arrays:

  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];

They are initialized along the path zone_sizes_init() -> free_area_init_nodes():

static void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

	/* get the page counts of the three zones */
	max_zone_pfns[ZONE_DMA] =
		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
#endif

	free_area_init_nodes(max_zone_pfns);
}

void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
	unsigned long nid;
	int i;

	/* Sort early_node_map as initialisation assumes it is sorted */
	sort_node_map();  /* sort all nodes by starting page frame number */

	/* Record where the boundaries of the three zones are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));
	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
	for (i = 1; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE) /* ZONE_MOVABLE is not handled here */
			continue;
		/* each zone starts where the previous zone ends */
		arch_zone_lowest_possible_pfn[i] =
			arch_zone_highest_possible_pfn[i-1];
		arch_zone_highest_possible_pfn[i] =
			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
	}
	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
 
	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes(zone_movable_pfn);
 
	/* Print out the zone ranges */
	printk("Zone PFN ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		printk("  %-8s %0#10lx -> %0#10lx\n",
				zone_names[i],
				arch_zone_lowest_possible_pfn[i],
				arch_zone_highest_possible_pfn[i]);
	}
 
	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	printk("Movable zone start PFN for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
	}
 
	/* Print out the early_node_map[] */
	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
	for (i = 0; i < nr_nodemap_entries; i++)
		printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
						early_node_map[i].start_pfn,
						early_node_map[i].end_pfn);
 
	/* Initialise every node */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();
	for_each_online_node(nid) {  /* iterate over every node */
		pg_data_t *pgdat = NODE_DATA(nid);
		/* initialize the node */
		free_area_init_node(nid, NULL,
				find_min_pfn_for_node(nid), NULL);
 
		/* Any memory on that node */
		if (pgdat->node_present_pages)
			node_set_state(nid, N_HIGH_MEMORY);
		check_for_regular_memory(pgdat);
	}
}

Back to initializing the struct zone members:

		nr_all_pages += realsize;

		zone->spanned_pages = size;
		zone->present_pages = realsize;
#ifdef CONFIG_NUMA
		zone->node = nid;
		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
						/ 100;
		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
		zone->name = zone_names[j];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
		zone_seqlock_init(zone);
		zone->zone_pgdat = pgdat;

		zone->prev_priority = DEF_PRIORITY;

In addition:

		zone_pcp_init(zone);

Its main job is to initialize the zone's pageset member:

static __meminit void zone_pcp_init(struct zone *zone)
{
	int cpu;
	unsigned long batch = zone_batchsize(zone);

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
#ifdef CONFIG_NUMA
		/* Early boot. Slab allocator not functional yet */
		zone_pcp(zone, cpu) = &boot_pageset[cpu]; // boot_pageset is used during early system initialization
		setup_pageset(&boot_pageset[cpu],0);
#else
		setup_pageset(zone_pcp(zone,cpu), batch);
#endif
	}
	if (zone->present_pages)
		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
			zone->name, zone->present_pages, batch);
}

If only a single page frame is needed, it can be served straight from the per-CPU cache without going through the buddy system, improving allocation efficiency; when the per-CPU cache has no free frame, batch page frames are pulled from the buddy system into the cache. The page frames are linked into struct list_head list.

static struct per_cpu_pageset boot_pageset[NR_CPUS];

struct per_cpu_pages {
	int count;		/* number of pages in the list */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */
	struct list_head list;	/* the list of pages */
};

struct per_cpu_pageset {
	struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
	s8 expire;
#endif
#ifdef CONFIG_SMP
	s8 stat_threshold;
	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
} ____cacheline_aligned_in_smp;

This becomes easier once the per-CPU cache is understood. What is a per-CPU cache, and how does the kernel find each CPU's private copy to allocate from? Some background concepts first.

Hot and cold pages: the buddy system in the Linux kernel's physical memory management introduces the notion of hot and cold pages. A cold page is a free page that is no longer in the CPU cache (generally the L2 cache); a hot page is a free page still in the cache. Hot and cold are relative to the CPU cache, and every zone initializes a per_cpu_pageset of hot and cold pages for each CPU.

Per-CPU variables mainly address concurrent variable access on SMP and NUMA systems. When a per-CPU variable is created, every processor in the system gets its own copy; the copy can also live in that processor's cache, giving better performance under frequent updates.

A per-CPU variable in the Linux kernel is essentially an array: each array element corresponds to one processor, and each processor uses only its own copy. A schematic sketch follows.
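The sketch below illustrates the idea only (it is not the kernel's actual implementation, which uses dedicated per-CPU data sections and offsets): conceptually, a per-CPU variable behaves like an array indexed by the current CPU id.

#include <stdio.h>

#define NR_CPUS 4

/* one slot per processor; each CPU only ever touches its own slot */
struct per_cpu_counter {
	long count;
	char pad[64 - sizeof(long)];  /* pad to a cache line to avoid false sharing */
};

static struct per_cpu_counter counters[NR_CPUS];

/* stand-in for smp_processor_id(); the real kernel derives this per CPU */
static int fake_cpu_id = 2;

static void this_cpu_inc(void)
{
	counters[fake_cpu_id].count++;  /* no lock needed: the slot is private */
}

int main(void)
{
	this_cpu_inc();
	this_cpu_inc();
	printf("cpu %d count = %ld\n", fake_cpu_id, counters[fake_cpu_id].count);
	return 0;
}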

Finally, this:

        /* set pgdat->nr_zones and zone->zone_start_pfn,
         * initialize zone->free_area,
         * and initialize the zone->wait_table members
         */
		ret = init_currently_empty_zone(zone, zone_start_pfn,
						size, MEMMAP_EARLY);

The function body is

__meminit int init_currently_empty_zone(struct zone *zone,
					unsigned long zone_start_pfn,
					unsigned long size,
					enum memmap_context context)
{
	struct pglist_data *pgdat = zone->zone_pgdat; // the node descriptor
	int ret;
	ret = zone_wait_table_init(zone, size);
	if (ret)
		return ret;
	// record the node's zone count and set the zone's starting page frame number
	pgdat->nr_zones = zone_idx(zone) + 1; 

	zone->zone_start_pfn = zone_start_pfn;

	mminit_dprintk(MMINIT_TRACE, "memmap_init",
			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
			pgdat->node_id,
			(unsigned long)zone_idx(zone),
			zone_start_pfn, (zone_start_pfn + size));
	// initialize the free_list lists, setting nr_free = 0
	zone_init_free_lists(zone);

	return 0;
}

static void __meminit zone_init_free_lists(struct zone *zone)
{
	int order, t;
	for_each_migratetype_order(order, t) {
		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
		zone->free_area[order].nr_free = 0;
	}
}

Related macros and their expansion

#define for_each_migratetype_order(order, type) \
	for (order = 0; order < MAX_ORDER; order++) \
		for (type = 0; type < MIGRATE_TYPES; type++)

static inline void INIT_LIST_HEAD(struct list_head *list)
{
	list->next = list;
	list->prev = list;
}

#define MIGRATE_TYPES         5

#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif

// macro expansion
static void __meminit zone_init_free_lists(struct zone *zone)
{
	int order, t;
    for (order = 0; order < MAX_ORDER; order++)
    {
        for (t = 0; t < MIGRATE_TYPES; t++)
        {
            INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
            zone->free_area[order].nr_free = 0;
        }
    }
}

Finally, let's look at the memmap_init function. It walks the page array that is mapped 1:1 to the page frames, initializing each page's struct members and setting the PG_reserved bit in page->flags.

		// size: number of page frames in the zone
		// nid: node id
		// zone_start_pfn: the zone's starting page frame number
		memmap_init(size, nid, j, zone_start_pfn);

The structure of each page is

struct page {
	unsigned long flags;		/* atomic flags, some possibly
					 * updated asynchronously */
	atomic_t _count;		// usage count; 0 means unused
	union {
		atomic_t _mapcount;	// count of page table entries currently mapping the page
		struct {		/* SLUB */
			u16 inuse;
			u16 objects;
		};
	};
	union {
	    struct {
			unsigned long private;	
			struct address_space *mapping;	
	    };
#if USE_SPLIT_PTLOCKS
	    spinlock_t ptl;
#endif
	    struct kmem_cache *slab;	/* SLUB: Pointer to slab */
	    struct page *first_page;	/* Compound tail pages */
	};
	union {
		pgoff_t index;		/* Our offset within mapping. */
		void *freelist;		/* SLUB: freelist req. slab lock */
	};
	struct list_head lru;		/* Pageout list, eg. active_list
					 * protected by zone->lru_lock !
					 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
	unsigned long debug_flags;	/* Use atomic bitops on this */
#endif
};

The function body is

#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif

enum memmap_context {
	MEMMAP_EARLY,
	MEMMAP_HOTPLUG,
};

void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn, enum memmap_context context)
{
	struct page *page;
	unsigned long end_pfn = start_pfn + size;
	unsigned long pfn;
	struct zone *z;

	if (highest_memmap_pfn < end_pfn - 1) // highest_memmap_pfn is a global variable
		highest_memmap_pfn = end_pfn - 1;

	z = &NODE_DATA(nid)->node_zones[zone]; // get the address of the corresponding zone
	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (context == MEMMAP_EARLY) {
			if (!early_pfn_valid(pfn))
				continue;
			if (!early_pfn_in_nid(pfn, nid))
				continue;
		}
		page = pfn_to_page(pfn);   // get the struct page for this page frame
		set_page_links(page, zone, nid, pfn);
		mminit_verify_page_links(page, zone, nid, pfn);
		init_page_count(page);
		reset_page_mapcount(page);
		SetPageReserved(page); // set PG_reserved in the page's flags
		if ((z->zone_start_pfn <= pfn)
		    && (pfn < z->zone_start_pfn + z->spanned_pages)
		    && !(pfn & (pageblock_nr_pages - 1)))
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);

		INIT_LIST_HEAD(&page->lru); // initialize the page's lru list head
#ifdef WANT_PAGE_VIRTUAL
		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
		if (!is_highmem_idx(zone))
			set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
	}
}

A few of these helper functions are worth a closer look.

set_page_links sets the zone-, node- and section-related bits in page->flags:

static inline void set_page_links(struct page *page, enum zone_type zone,
	unsigned long node, unsigned long pfn)
{
	set_page_zone(page, zone);
	set_page_node(page, node);
	set_page_section(page, pfn_to_section_nr(pfn));
}
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}
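The packing works by carving bit fields out of the top of page->flags. A sketch with invented shift/mask values (the real ZONES_PGSHIFT/ZONES_MASK depend on the kernel configuration):

#include <stdio.h>

/* invented layout: 2 zone bits at the top of a 32-bit flags word */
#define ZONES_WIDTH   2
#define ZONES_PGSHIFT (32 - ZONES_WIDTH)
#define ZONES_MASK    ((1UL << ZONES_WIDTH) - 1)

static void set_page_zone_demo(unsigned long *flags, unsigned long zone)
{
	*flags &= ~(ZONES_MASK << ZONES_PGSHIFT);        /* clear the old field */
	*flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;  /* store the new zone */
}

static unsigned long page_zonenum_demo(unsigned long flags)
{
	return (flags >> ZONES_PGSHIFT) & ZONES_MASK;    /* read it back */
}

int main(void)
{
	unsigned long flags = 0;

	set_page_zone_demo(&flags, 1);  /* e.g. ZONE_NORMAL */
	printf("zone = %lu\n", page_zonenum_demo(flags));
	return 0;
}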

init_page_count sets page->_count = 1; reset_page_mapcount sets page->_mapcount = -1.

static inline void init_page_count(struct page *page)
{
	atomic_set(&page->_count, 1); // set page->_count = 1
}
static inline void reset_page_mapcount(struct page *page)
{
	atomic_set(&(page)->_mapcount, -1);
}

One question: why is page->_count set to 1 here rather than 0?

Because _count = 0 would mark the page as free; at this point the pages are reserved and no mapping has been created yet, so they must not appear free.

When _count is 0, the page is free or about to be released. When _count is greater than 0, the page has been allocated and is in use by the kernel, and will not be released for the time being.

_count is incremented in many situations, not only when the page is referenced.

Note the difference between _count and _mapcount: _mapcount is comparatively simple, counting how many process page table entries map the page.
See also section 12.2, "The difference between _count and _mapcount".

The kernel's physical memory management mechanisms mainly comprise the buddy system, the slab cache, and the vmalloc mechanism.

Both the buddy algorithm and the slab cache allocate physical memory from the directly mapped region, while the vmalloc mechanism allocates physical memory via the high memory mapping region.

bootmem allocator

This API list follows *Professional Linux Kernel Architecture* by Wolfgang Mauerer.

The interface to the kernel is in include/linux/bootmem.h:

#define alloc_bootmem(x) \
	__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_nopanic(x) \
	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_pages(x) \
	__alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_pages_nopanic(x) \
	__alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_node(pgdat, x) \
	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_pages_node(pgdat, x) \
	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))

#define alloc_bootmem_low(x) \
	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
#define alloc_bootmem_low_pages(x) \
	__alloc_bootmem_low(x, PAGE_SIZE, 0)
#define alloc_bootmem_low_pages_node(pgdat, x) \
	__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
extern void *__alloc_bootmem(unsigned long size,
			     unsigned long align,
			     unsigned long goal);
extern void *__alloc_bootmem_nopanic(unsigned long size,
				     unsigned long align,
				     unsigned long goal);
extern void *__alloc_bootmem_node(pg_data_t *pgdat,
				  unsigned long size,
				  unsigned long align,
				  unsigned long goal);
extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
				  unsigned long size,
				  unsigned long align,
				  unsigned long goal);
extern void *__alloc_bootmem_low(unsigned long size,
				 unsigned long align,
				 unsigned long goal);
extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
				      unsigned long size,
				      unsigned long align,
				      unsigned long goal);
extern unsigned long init_bootmem_node(pg_data_t *pgdat,
				       unsigned long freepfn,
				       unsigned long startpfn,
				       unsigned long endpfn);
extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);

extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
extern unsigned long free_all_bootmem(void);

extern void free_bootmem_node(pg_data_t *pgdat,
			      unsigned long addr,
			      unsigned long size);
extern void free_bootmem(unsigned long addr, unsigned long size);
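To close, a hedged sketch of what an early-boot call site looks like (kernel-internal code, not a standalone program; the function name and buffer sizes are illustrative):

#include <linux/bootmem.h>

static void __init example_early_alloc(void)
{
	/* 256 bytes, SMP_CACHE_BYTES-aligned; panics on failure */
	void *buf = alloc_bootmem(256);

	/* one whole page, PAGE_SIZE-aligned */
	void *page = alloc_bootmem_pages(PAGE_SIZE);

	/* ... use the memory for early setup ... */

	/* returning memory clears the matching bitmap bits again */
	free_bootmem(__pa(buf), 256);
	(void)page;
}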
