Linux Memory Management: The Memory Reclaim Mechanism

[Abstract]

[Body] Key data structures

[Body] Initialization of key resources (including pgdat, struct zone, etc.)

[Body] When memory reclaim is triggered

[Body] The memory reclaim process

[Summary]

[Other]




【Abstract】

This article describes the Linux memory reclaim mechanism.

【Body】Key data structures

1 membank/meminfo

struct meminfo describes the memory of the whole system; a meminfo holds up to NR_BANKS membank entries.

struct meminfo {
int nr_banks;
struct membank bank[NR_BANKS];//NR_BANKS=8
};

Each membank is initialized by arm_add_memory(): setup_arch()->setup_machine_fdt()->arm_add_memory();

struct membank {
phys_addr_t start;
phys_addr_t size;
unsigned int highmem;
};
2 memblock/memblock_type
struct memblock_region {
phys_addr_t base;
phys_addr_t size;
};
struct memblock_type {
unsigned long cnt;/* number of regions */
unsigned long max;/* size of the allocated array */
phys_addr_t total_size;/* size of all regions */
struct memblock_region *regions;
};
struct memblock {
phys_addr_t current_limit;
struct memblock_type memory;
struct memblock_type reserved;
};
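To make the bookkeeping concrete, below is a minimal, self-contained sketch (not the kernel implementation; region merging, array resizing and error handling are omitted) of how regions accumulate in a memblock_type-style array:

#include <stdio.h>

typedef unsigned long long phys_addr_t;

struct region { phys_addr_t base, size; };

struct region_array {
	unsigned long cnt;         /* number of regions */
	phys_addr_t total_size;    /* size of all regions */
	struct region regions[8];  /* fixed capacity, enough for the sketch */
};

/* Append a region; the real memblock_add_region also merges adjacent regions. */
static int region_add(struct region_array *t, phys_addr_t base, phys_addr_t size)
{
	if (t->cnt >= 8)
		return -1;
	t->regions[t->cnt].base = base;
	t->regions[t->cnt].size = size;
	t->cnt++;
	t->total_size += size;
	return 0;
}

int main(void)
{
	struct region_array memory = { 0 };

	/* mirrors the example used in this article: one bank at 0x200000, 110M long */
	region_add(&memory, 0x200000ULL, 0x6e00000ULL);
	printf("regions=%lu total=0x%llx\n", memory.cnt, memory.total_size);
	return 0;
}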
Initialization: arm_memblock_init->memblock_add->memblock_add_region->  type->regions[0].base = mi->bank[i].start
               arm_memblock_init->memblock_reserve->memblock_add_region->
void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
{
	int i;

	for (i = 0; i < mi->nr_banks; i++)
		memblock_add(mi->bank[i].start, mi->bank[i].size);

	/* Register the kernel text, kernel data and initrd with memblock. */
#ifdef CONFIG_XIP_KERNEL
	memblock_reserve(__pa(_sdata), _end - _sdata);
#else
	memblock_reserve(__pa(_stext), _end - _stext);
#endif

	arm_mm_memblock_reserve();
	arm_dt_memblock_reserve();

	/* reserve any platform specific memblock areas */
	if (mdesc->reserve)
		mdesc->reserve();

	/*
	 * reserve memory for DMA contiguous allocations,
	 * must come from DMA area inside low memory
	 */
	dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));

	arm_memblock_steal_permitted = false;
	memblock_allow_resize();
     /*
	memblock_dump(&memblock.memory, "memory");
	memblock_dump(&memblock.reserved, "reserved");
     */
	memblock_dump_all();
}

3 pglist_data/zone/lruvec

pg_data_t *pgdat = NODE_DATA(0); NODE_DATA(0) gives access to the ZONE_NORMAL zone, from which zone information can be read,

e.g. the amount of usable memory: zone->managed_pages

typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES];
struct zonelist node_zonelists[MAX_ZONELISTS];
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
#endif
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
    range, including holes */
int node_id;
nodemask_t reclaim_nodes;/* Nodes allowed to reclaim from */
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd;/* Protected by lock_memory_hotplug() */
int kswapd_max_order;
enum zone_type classzone_idx;
} pg_data_t;
pg_data_t *pgdat=NODE_DATA(0);
When the kernel manages memory, e.g. allocating with alloc_pages or reclaiming with shrink_zone, it works through the global pg_data_t.
The zonelist leads to the individual zones (ZONE_NORMAL, ZONE_DMA, etc.), and each zone manages its pages through the lruvec embedded in struct zone.
The lruvec contains the following types of lists:

enum lru_list {
LRU_INACTIVE_ANON = LRU_BASE,
LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
LRU_UNEVICTABLE,
NR_LRU_LISTS
};
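In the kernel headers LRU_BASE is 0, LRU_ACTIVE is 1 and LRU_FILE is 2, so the enum encodes the list index arithmetically and the right list for a page can be computed from its "file-backed" and "active" state. A small standalone sketch of that arithmetic (it mirrors the idea behind the kernel's page_lru(); it is not the kernel code):

#include <stdio.h>

/* same values as the kernel's enum in include/linux/mmzone.h */
enum { LRU_BASE = 0, LRU_ACTIVE = 1, LRU_FILE = 2, LRU_UNEVICTABLE = 4, NR_LRU_LISTS = 5 };

/* pick the LRU list index for a page from three page-state flags */
static int lru_index(int file, int active, int unevictable)
{
	if (unevictable)
		return LRU_UNEVICTABLE;
	return LRU_BASE + (file ? LRU_FILE : 0) + (active ? LRU_ACTIVE : 0);
}

int main(void)
{
	static const char *name[NR_LRU_LISTS] = {
		"inactive_anon", "active_anon",
		"inactive_file", "active_file", "unevictable"
	};

	printf("%s\n", name[lru_index(1, 1, 0)]);  /* active_file   */
	printf("%s\n", name[lru_index(0, 0, 0)]);  /* inactive_anon */
	printf("%s\n", name[lru_index(0, 0, 1)]);  /* unevictable   */
	return 0;
}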

【Body】Initialization of key resources (including pgdat, struct zone, etc.)

1 meminfo and membank initialization

During boot, the meminfo and membank structures are initialized from the bootargs parameters:

bootargs = console=ttyS0,115200 mem=110M root=/dev/mtdblock1 rootfstype=squashfs init=/linuxrc

Example: the fdt specifies a system memory start address of 0x200000 and a size of 110M = 0x6e00000;

Note: both atags and fdt configure the start address and size of system memory, but the final memory size is taken from the mem= parameter in bootargs.

1.1 How a membank is initialized:

int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
{
struct membank *bank = &meminfo.bank[meminfo.nr_banks];
u64 aligned_start;

if (meminfo.nr_banks >= NR_BANKS) {
printk(KERN_CRIT "NR_BANKS too low, "
"ignoring memory at 0x%08llx\n", (long long)start);
return -EINVAL;
}

/*
* Ensure that start/size are aligned to a page boundary.
* Size is appropriately rounded down, start is rounded up.
*/
size -= start & ~PAGE_MASK;
aligned_start = PAGE_ALIGN(start);

#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT
if (aligned_start > ULONG_MAX) {
printk(KERN_CRIT "Ignoring memory at 0x%08llx outside "
      "32-bit physical address space\n", (long long)start);
return -EINVAL;
}

if (aligned_start + size > ULONG_MAX) {
printk(KERN_CRIT "Truncating memory at 0x%08llx to fit in "
"32-bit physical address space\n", (long long)start);
/*
* To ensure bank->start + bank->size is representable in
* 32 bits, we use ULONG_MAX as the upper limit rather than 4GB.
* This means we lose a page after masking.
*/
size = ULONG_MAX - aligned_start;
}
#endif

if (aligned_start < PHYS_OFFSET) {
if (aligned_start + size <= PHYS_OFFSET) {
pr_info("Ignoring memory below PHYS_OFFSET: 0x%08llx-0x%08llx\n",
aligned_start, aligned_start + size);
return -EINVAL;
}

pr_info("Ignoring memory below PHYS_OFFSET: 0x%08llx-0x%08llx\n",
aligned_start, (u64)PHYS_OFFSET);

size -= PHYS_OFFSET - aligned_start;
aligned_start = PHYS_OFFSET;
}

/* start address of system memory as specified in atags or fdt */
bank->start = aligned_start;
/* size of system memory as specified in atags or fdt; it can be overridden by the mem= parameter in bootargs */
bank->size = size & ~(phys_addr_t)(PAGE_SIZE - 1);

/*
* Check whether this memory region has non-zero size or
* invalid node number.
*/
if (bank->size == 0)
return -EINVAL;
/* 
The atags/fdt bank corresponds to meminfo.bank[0]; after a membank has been initialized,
nr_banks is incremented, so the next membank added goes into meminfo.bank[1].
*/
meminfo.nr_banks++;
return 0;
}
1.2 When membanks are initialized:

<1> While parsing atags or fdt, meminfo.bank[0] is created from the start address and size of system memory

void __init early_init_dt_add_memory_arch(u64 base, u64 size)
{
/* Create meminfo.bank[0] from the system memory start and size given by the fdt,
   e.g. bank.start = 0x200000; bank.size = 0x6e00000 (110M).
   The size may later be overridden by the mem= parameter in bootargs.
*/
arm_add_memory(base,size);
}
<2> When the mem= parameter in bootargs (e.g. mem=118M) is parsed:
static int __init early_mem(char *p)
{
	static int usermem __initdata = 0;
	phys_addr_t size;
	phys_addr_t start;
	char *endp;

	/*
	 * If the user specifies memory size, we
	 * blow away any automatically generated
	 * size.
	 */
	if (usermem == 0) {
		usermem = 1;
     /* Note: a membank was already added while parsing the fdt, so meminfo.nr_banks = 1 here.
        Resetting meminfo.nr_banks to 0 means the size given by the mem= parameter in bootargs
        lands in meminfo.bank[0], i.e. it overrides the size specified by the fdt.
      */
		meminfo.nr_banks = 0;
	}

	start = PHYS_OFFSET;
	size  = memparse(p, &endp);
	if (*endp == '@')
		start = memparse(endp + 1, NULL);

	arm_add_memory(start, size);

	return 0;
}
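For reference, here is a standalone sketch of the size parsing that memparse performs on a string such as "110M@0x200000" (simplified: the kernel helper handles the suffixes shown here but is not limited to this exact format):

#include <stdio.h>
#include <stdlib.h>

/* simplified memparse: a number with an optional K/M/G suffix */
static unsigned long long parse_size(const char *s, char **endp)
{
	unsigned long long val = strtoull(s, endp, 0);

	switch (**endp) {
	case 'G': case 'g': val <<= 10; /* fall through */
	case 'M': case 'm': val <<= 10; /* fall through */
	case 'K': case 'k': val <<= 10; (*endp)++; break;
	}
	return val;
}

int main(void)
{
	/* same shape as the mem= handling in early_mem(): size[@start] */
	const char *p = "110M@0x200000";
	char *endp;
	unsigned long long size = parse_size(p, &endp);
	unsigned long long start = 0;

	if (*endp == '@')
		start = parse_size(endp + 1, &endp);

	printf("start=0x%llx size=0x%llx\n", start, size);  /* 0x200000, 0x6e00000 */
	return 0;
}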

2 Initialization of pgdat and struct zone: setup_arch()->paging_init()->bootmem_init()->

void __init bootmem_init(void)
{
	unsigned long min, max_low, max_high;

	max_low = max_high = 0;

/* From meminfo.bank[0] created above, derive the physical page frame numbers:
min = 0x200; max_low = 0x6e00; max_high = 0x6e00 */
	find_limits(&min, &max_low, &max_high);

	arm_bootmem_init(min, max_low);
	/*
	 * Sparsemem tries to allocate bootmem in memory_present(),
	 * so must be done after the fixed reservations
	 */
	arm_memory_present();


	/*
	 * sparse_init() needs the bootmem allocator up and running.
	 */
	sparse_init();


	/*
	 * Now free the memory - free_area_init_node needs
	 * the sparse mem_map arrays initialized by sparse_init()
	 * for memmap_init_zone(), otherwise all PFNs are invalid.
	 */
	arm_bootmem_free(min, max_low, max_high);


	/*
	 * This doesn't seem to be used by the Linux memory manager any
	 * more, but is used by ll_rw_block.  If we can get rid of it, we
	 * also get rid of some of the stuff above as well.
	 *
	 * Note: max_low_pfn and max_pfn reflect the number of _pages_ in
	 * the system, not the maximum PFN.
	 */
	max_low_pfn = max_low - PHYS_PFN_OFFSET;
        /* number of physical pages in the system: 0x6c00 = 0x6e00 - 0x200 */
	max_pfn = max_high - PHYS_PFN_OFFSET;
}

setup_arch()->paging_init()->bootmem_init()->arm_bootmem_init()

static void __init arm_bootmem_init(unsigned long start_pfn,
	unsigned long end_pfn)
{
	struct memblock_region *reg;
	unsigned int boot_pages;
	phys_addr_t bitmap;
	pg_data_t *pgdat;

	/*
	 * Allocate the bootmem bitmap page.  This must be in a region
	 * of memory which has already been mapped.
	 */
	boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
				__pfn_to_phys(end_pfn));

	/*
	 * Initialise the bootmem allocator, handing the
	 * memory banks over to bootmem.
	 */
	node_set_online(0);
	pgdat = NODE_DATA(0);
	init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);

	/* Free the lowmem regions from memblock into bootmem. */
	for_each_memblock(memory, reg) {
           /* page frame numbers: start = 0x200; end = 0x6e00 */
		unsigned long start = memblock_region_memory_base_pfn(reg);
		unsigned long end = memblock_region_memory_end_pfn(reg);

		if (end >= end_pfn)
			end = end_pfn;
		if (start >= end)
			break;

		free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
	}

	/* Reserve the lowmem memblock reserved regions in bootmem. */
	for_each_memblock(reserved, reg) {
		unsigned long start = memblock_region_reserved_base_pfn(reg);
		unsigned long end = memblock_region_reserved_end_pfn(reg);

		if (end >= end_pfn)
			end = end_pfn;
		if (start >= end)
			break;

		reserve_bootmem(__pfn_to_phys(start),
			        (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT);
	}
}
setup_arch()->paging_init()->bootmem_init()->arm_bootmem_free()
static void __init arm_bootmem_free(unsigned long min, unsigned long max_low,
	unsigned long max_high)
{
	unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
	struct memblock_region *reg;

	/*
	 * initialise the zones.
	 */
	memset(zone_size, 0, sizeof(zone_size));

	/*
	 * The memory size has already been determined.  If we need
	 * to do anything fancy with the allocation of this memory
	 * to the zones, now is the time to do it.
	 */
	/*
	 * Index 0 is ZONE_NORMAL: the number of physical pages of system memory,
	 * i.e. 0x6c00 = 0x6e00 - 0x200; zone_size[1] = 0 is ZONE_MOVABLE.
	 */
	zone_size[0] = max_low - min;

	/*
	 * Calculate the size of the holes.
	 *  holes = node_size - sum(bank_sizes)
	 */
	memcpy(zhole_size, zone_size, sizeof(zhole_size));
	for_each_memblock(memory, reg) {
		unsigned long start = memblock_region_memory_base_pfn(reg);
		unsigned long end = memblock_region_memory_end_pfn(reg);

		if (start < max_low) {
			unsigned long low_end = min(end, max_low);
			zhole_size[0] -= low_end - start;
		}
#ifdef CONFIG_HIGHMEM
		if (end > max_low) {
			unsigned long high_start = max(start, max_low);
			zhole_size[ZONE_HIGHMEM] -= end - high_start;
		}
#endif
	}

#ifdef CONFIG_ZONE_DMA
	/*
	 * Adjust the sizes according to any special requirements for
	 * this machine type.
	 */
	if (arm_dma_zone_size)
		arm_adjust_dma_zone(zone_size, zhole_size,
			arm_dma_zone_size >> PAGE_SHIFT);
#endif

	free_area_init_node(0, zone_size, min, zhole_size);
}
setup_arch()->paging_init()->bootmem_init()->arm_bootmem_free()->free_area_init_node():
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
		unsigned long node_start_pfn, unsigned long *zholes_size)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	/* pg_data_t should be reset to zero when it's allocated */
	WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);

     /*nid = 0*/
	pgdat->node_id = nid;
     /* page frame number of the start of system memory: 0x200 */
	pgdat->node_start_pfn = node_start_pfn;
	init_zone_allows_reclaim(nid);
     /* system memory size in pages: 0x6e00 - 0x200 pages */
	calculate_node_totalpages(pgdat, zones_size, zholes_size);

	alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
		nid, (unsigned long)pgdat,
		(unsigned long)pgdat->node_mem_map);
#endif
	free_area_init_core(pgdat, zones_size, zholes_size);
}
setup_arch()->paging_init()->bootmem_init()->arm_bootmem_free()->free_area_init_node()->free_area_init_core():

free_area_init_core() initializes the struct zone entries in pgdat->node_zones.

static void __paginginit free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	enum zone_type j;
	int nid = pgdat->node_id;
	unsigned long zone_start_pfn = pgdat->node_start_pfn;
	int ret;

	pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
	spin_lock_init(&pgdat->numabalancing_migrate_lock);
	pgdat->numabalancing_migrate_nr_pages = 0;
	pgdat->numabalancing_migrate_next_window = jiffies;
#endif
	init_waitqueue_head(&pgdat->kswapd_wait);
	init_waitqueue_head(&pgdat->pfmemalloc_wait);
	pgdat_page_cgroup_init(pgdat);

     /* MAX_NR_ZONES = 2; index 0 is ZONE_NORMAL, index 1 is ZONE_MOVABLE */
	for (j = 0; j < MAX_NR_ZONES; j++) {
           /* initialize struct zone: pgdat->node_zones */
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, realsize, freesize, memmap_pages;
           /* System memory size in pages:
              zone_names[0] = "Normal";  zone_size[0] = 0x6c00 = 0x6e00 - 0x200; zholes_size[0] = 0;
              zone_names[1] = "Movable"; zone_size[1] = 0; zholes_size[1] = 0;
              dma_reserve = 0;
            */
		size = zone_spanned_pages_in_node(nid, j, zones_size);
           /* system memory size in pages: 0x6c00 */
		realsize = freesize = size - zone_absent_pages_in_node(nid, j,
								zholes_size);

		/*
		 * Adjust freesize so that it accounts for how much memory
		 * is used by this zone for memmap. This affects the watermark
		 * and per-cpu initialisations
		 */
           /* pages consumed by the page descriptors: 0x6c00 * sizeof(struct page) >> PAGE_SHIFT */
		memmap_pages = calc_memmap_size(size, realsize);
		if (freesize >= memmap_pages) {
			freesize -= memmap_pages;
			if (memmap_pages)
				printk(KERN_DEBUG
				       "  %s zone: %lu pages used for memmap\n",
				       zone_names[j], memmap_pages);
		} else
			printk(KERN_WARNING
				"  %s zone: %lu pages exceeds freesize %lu\n",
				zone_names[j], memmap_pages, freesize);

		/* Account for reserved pages */
		if (j == 0 && freesize > dma_reserve) {
			freesize -= dma_reserve;
			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
					zone_names[0], dma_reserve);
		}

		if (!is_highmem_idx(j))
			nr_kernel_pages += freesize;
		/* Charge for highmem memmap if there are enough kernel pages */
		else if (nr_kernel_pages > memmap_pages * 2)
			nr_kernel_pages -= memmap_pages;
		nr_all_pages += freesize;
		/* system memory size in pages: 0x6c00 */
		zone->spanned_pages = size;
		zone->present_pages = realsize;
		/* Note: this is re-initialized later in mm_init->mem_init->free_all_bootmem.
		 * Set an approximate value for lowmem here, it will be adjusted
		 * when the bootmem allocator frees pages into the buddy system.
		 * And all highmem pages will be managed by the buddy system.
		 */
		zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
		zone->node = nid;
		zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
						/ 100;
		zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
		zone->name = zone_names[j];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
		zone_seqlock_init(zone);
		zone->zone_pgdat = pgdat;

		zone_pcp_init(zone);
                /* initialize zone->lruvec: LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, etc. */
		lruvec_init(&zone->lruvec);
		if (!size)
			continue;

		set_pageblock_order();
		setup_usemap(pgdat, zone, zone_start_pfn, size);
		ret = init_currently_empty_zone(zone, zone_start_pfn,
						size, MEMMAP_EARLY);
		BUG_ON(ret);
                /* initialize the page descriptors */
		memmap_init(size, nid, j, zone_start_pfn);
		zone_start_pfn += size;
	}
}

Ways to look up a struct zone, for example the ZONE_NORMAL zone:

Method 1: struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL]; where nid = 0 and ZONE_NORMAL = 0;

Method 2: struct zone *zone = page_zone(page); finds the struct zone a given page belongs to;

Note: zone.managed_pages is the number of pages the system is allowed to use; the zone watermarks are checked against it (see zone_watermark_ok below).

zone.managed_pages is re-initialized in mm_init->mem_init->free_all_bootmem, and that value is the one used from then on.
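On a running system the per-zone counters and watermarks can be inspected without any kernel changes; the hedged helper below simply filters /proc/zoneinfo (a crude substring filter, assuming a 3.x-style /proc layout that includes a "managed" line):

#include <stdio.h>
#include <string.h>

/* print the Node/zone headers, the min/low/high watermarks and the managed counts */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/zoneinfo", "r");

	if (!f) {
		perror("/proc/zoneinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "Node") || strstr(line, "min ") ||
		    strstr(line, "low ") || strstr(line, "high ") ||
		    strstr(line, "managed"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}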

memmap_init is an alias for memmap_init_zone:

void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn, enum memmap_context context)
{
	struct page *page;
	unsigned long end_pfn = start_pfn + size;
	unsigned long pfn;
	struct zone *z;

	if (highest_memmap_pfn < end_pfn - 1)
		highest_memmap_pfn = end_pfn - 1;
     /*pgdat->node_zones[0]:ZONE_NORMAL*/
	z = &NODE_DATA(nid)->node_zones[zone];
	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		/*
		 * There can be holes in boot-time mem_map[]s
		 * handed to this function.  They do not
		 * exist on hotplugged memory.
		 */
		if (context == MEMMAP_EARLY) {
			if (!early_pfn_valid(pfn))
				continue;
			if (!early_pfn_in_nid(pfn, nid))
				continue;
		}
		page = pfn_to_page(pfn);
		set_page_links(page, zone, nid, pfn);
		mminit_verify_page_links(page, zone, nid, pfn);
           /* page->_count is initialized to 1, i.e. the page is treated as allocated. Later during boot,
              free_all_bootmem->__free_one_page releases all system memory and page->_count drops to 0.
            */
		init_page_count(page);
           /* set page->_mapcount to -1 */
		page_mapcount_reset(page);
		page_nid_reset_last(page);
		SetPageReserved(page);
		/*
		 * Mark the block movable so that blocks are reserved for
		 * movable at startup. This will force kernel allocations
		 * to reserve their blocks rather than leaking throughout
		 * the address space during boot when many long-lived
		 * kernel allocations are made. Later some blocks near
		 * the start are marked MIGRATE_RESERVE by
		 * setup_zone_migrate_reserve()
		 *
		 * bitmap is created for zone's valid pfn range. but memmap
		 * can be created for invalid pages (for alignment)
		 * check here not to call set_pageblock_migratetype() against
		 * pfn out of zone.
		 */
		if ((z->zone_start_pfn <= pfn)
		    && (pfn < zone_end_pfn(z))
		    && !(pfn & (pageblock_nr_pages - 1)))
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);


		INIT_LIST_HEAD(&page->lru);
	}
}
【Body】When memory reclaim is triggered

1 First, all system memory is released once during boot:

mm_init->mem_init->free_all_bootmem->free_all_bootmem_core->free_hot_cold_page/__free_pages_ok

static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
	struct page *page;
	unsigned long start, end, pages, count = 0;

	if (!bdata->node_bootmem_map)
		return 0;

	start = bdata->node_min_pfn;
	end = bdata->node_low_pfn;

     /* start: PFN of the beginning of physical memory, 0x200; end PFN of physical memory: 0x6e00 */
	while (start < end) {
		unsigned long *map, idx, vec;
		unsigned shift;

		map = bdata->node_bootmem_map;
		idx = start - bdata->node_min_pfn;
		shift = idx & (BITS_PER_LONG - 1);
		/*
		 * vec holds at most BITS_PER_LONG map bits,
		 * bit 0 corresponds to start.
		 */
		vec = ~map[idx / BITS_PER_LONG];

		if (shift) {
			vec >>= shift;
			if (end - start >= BITS_PER_LONG)
				vec |= ~map[idx / BITS_PER_LONG + 1] <<
					(BITS_PER_LONG - shift);
		}
		/*
		 * If we have a properly aligned and fully unreserved
		 * BITS_PER_LONG block of pages in front of us, free
		 * it in one go.
		 */
		if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
			int order = ilog2(BITS_PER_LONG);

			__free_pages_bootmem(pfn_to_page(start), order);
			count += BITS_PER_LONG;
			start += BITS_PER_LONG;
		} else {
			unsigned long cur = start;

			start = ALIGN(start + 1, BITS_PER_LONG);
			while (vec && cur != start) {
				if (vec & 1) {
					page = pfn_to_page(cur);
					__free_pages_bootmem(page, 0);
					count++;
				}
				vec >>= 1;
				++cur;
			}
		}
	}

	page = virt_to_page(bdata->node_bootmem_map);
	pages = bdata->node_low_pfn - bdata->node_min_pfn;
	pages = bootmem_bootmap_pages(pages);
	count += pages;
	while (pages--)
		__free_pages_bootmem(page++, 0);

	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);

	return count;
}

2 Reclaim triggered during alloc_pages when free memory runs short:

(See also the companion article "linux内存管理之伙伴系统管理" on the buddy system.)

alloc_pages(gfp_mask,order)->alloc_pages_node(numa_node_id(),gfp_mask,order)->

__alloc_pages(gfp_mask,order,node_zonelist(nid,gfp_mask))->__alloc_pages_nodemask

/* zonelist = node_zonelist(nid,gfp_mask) = NODE_DATA(nid)->node_zonelists;
   where nid = 0; pg_data_t *pgdat = NODE_DATA(0) */
struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, nodemask_t *nodemask)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
struct mem_cgroup *memcg = NULL;

gfp_mask &= gfp_allowed_mask;

lockdep_trace_alloc(gfp_mask);

might_sleep_if(gfp_mask & __GFP_WAIT);

if (should_fail_alloc_page(gfp_mask, order))
return NULL;

/*
* Check the zones suitable for the gfp_mask contain at least one
* valid zone. It's possible to have an empty zonelist as a result
* of GFP_THISNODE and a memoryless node
*/
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;

/*
* Will only have any effect when __GFP_KMEMCG is set.  This is
* verified in the (always inline) callee
*/
if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
return NULL;

retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();

/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
if (!preferred_zone)
goto out;

/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,preferred_zone, migratetype);
if (unlikely(!page)) {
/*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
*/
gfp_mask = memalloc_noio_flags(gfp_mask);
/* free memory is tight: get_page_from_freelist has failed */
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
}

trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
/*
* When updating a task's mems_allowed, it is possible to race with
* parallel threads in such a way that an allocation can fail while
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;

memcg_kmem_commit_charge(page, memcg, order);

return page;
}

alloc_pages(gfp_mask,order)->alloc_pages_node(numa_node_id(),gfp_mask,order)->

__alloc_pages(gfp_mask,order,node_zonelist(nid,gfp_mask))->__alloc_pages_nodemask->

get_page_from_freelist->zone_reclaim->shrink_zone

static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0;/* set if using zonelist_cache */
int did_zlc_setup = 0;/* just call zlc_setup() one time */

classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
 * Scan the zonelist, looking for a zone with enough free pages
 * (some checks in the original loop body are omitted in this excerpt).
 */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) {
if ((alloc_flags & ALLOC_WMARK_LOW) &&
   (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
goto this_zone_full;

BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
int ret;

mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
/* zone_watermark_ok returns 0 when free memory is insufficient, 1 when there is enough */
if (zone_watermark_ok(zone, order, mark,
   classzone_idx, alloc_flags))
goto try_this_zone;

if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
* by the cpuset.
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}

if (zone_reclaim_mode == 0 ||
   !zone_allows_reclaim(preferred_zone, zone))
goto this_zone_full;

if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
/* zone_reclaim performs the reclaim. When zone_watermark_ok returns 0, free memory is insufficient,
so memory is reclaimed via zone_reclaim->shrink_zone. If free memory is still insufficient afterwards,
__alloc_pages_nodemask->__alloc_pages_slowpath->wake_all_kswapd wakes the kswapd daemon to keep
freeing memory: kswapd->balance_pgdat->shrink_zone
*/
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough: is there now enough free memory? */
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;

if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
   ret == ZONE_RECLAIM_SOME)
goto this_zone_full;

continue;
}
}

try_this_zone:
/* Allocate memory: alloc_pages->__alloc_pages_nodemask->get_page_from_freelist->buffered_rmqueue->rmqueue_bulk:
splits larger buddies (e.g. a 4-page block into two 2-page blocks) when the requested order is exhausted */
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
if (page)
break;
this_zone_full:
if (IS_ENABLED(CONFIG_NUMA))
zlc_mark_zone_full(zonelist, z);
}

if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
}

if (page)
page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

return page;
}
During allocation, alloc_pages->get_page_from_freelist uses zone_watermark_ok to check whether there are enough free pages; if not, it reclaims via zone_reclaim->shrink_zone. If that still fails:

__alloc_pages_slowpath->__alloc_pages_direct_reclaim->__perform_reclaim->try_to_free_pages->shrink_zones->shrink_zone

bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
     int classzone_idx, int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,zone_page_state(z, NR_FREE_PAGES));
}
/*
 * Return true if free pages are above 'mark'. This takes into account the order
 * of the allocation.
 */
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
     int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages may go negative - that's OK */
long min = mark;
long lowmem_reserve = z->lowmem_reserve[classzone_idx];
int o;
long free_cma = 0;

free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
/* return false (0) when free memory is insufficient */
if (free_pages - free_cma <= min + lowmem_reserve)
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;

/* Require fewer higher order pages to be free */
min >>= 1;
/* return false (0) when free memory is insufficient */
if (free_pages <= min)
return false;
}
return true;
}
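The check can be read in isolation: an order-N allocation passes only if, after discounting the pages sitting in blocks smaller than order N, enough free pages remain above the (progressively halved) watermark plus the lowmem reserve. A standalone sketch of the same logic with made-up free-list numbers:

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

/* simplified __zone_watermark_ok: no CMA and no ALLOC_HIGH/ALLOC_HARDER scaling */
static bool watermark_ok(long free_pages, int order, long mark,
			 long lowmem_reserve, const long nr_free[MAX_ORDER])
{
	long min = mark;
	int o;

	free_pages -= (1 << order) - 1;
	if (free_pages <= min + lowmem_reserve)
		return false;

	for (o = 0; o < order; o++) {
		/* pages sitting in blocks of order o cannot satisfy this request */
		free_pages -= nr_free[o] << o;
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* made-up buddy state: nr_free[o] free blocks of order o */
	long nr_free[MAX_ORDER] = { 400, 40, 4, 1 };
	long free_pages = 400 + 40 * 2 + 4 * 4 + 1 * 8;   /* 504 pages in total */

	printf("order-0 ok: %d\n", watermark_ok(free_pages, 0, 339, 0, nr_free)); /* 1 */
	printf("order-3 ok: %d\n", watermark_ok(free_pages, 3, 339, 0, nr_free)); /* 0: too few large blocks */
	return 0;
}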

Initialization of zone->watermark and zone->lowmem_reserve: init_per_zone_wmark_min->setup_per_zone_wmarks

/*
 * Initialise min_free_kbytes.
 *
 * For small machines we want it small (128k min).  For large machines
 * we want it large (64MB max).  But it is not linear, because network
 * bandwidth does not increase linearly with machine size.  We use
 *
 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
 *
 * 16MB: 512k
 * 32MB: 724k
 * 64MB: 1024k
 * 128MB: 1448k
 * 256MB: 2048k
 * 512MB: 2896k
 * 1024MB: 4096k
 * 2048MB: 5792k
 * 4096MB: 8192k
 * 8192MB: 11584k
 * 16384MB: 16384k
 */
int __meminit init_per_zone_wmark_min(void)
{
unsigned long lowmem_kbytes;
/* Example: system memory is 118M (mem=118M in bootargs).
zone->managed_pages = 28764 pages, the number of pages the system may allocate.
lowmem_kbytes = 28764 * 4 = 115056 kbytes.
*/
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
/*
min_free_kbytes = 1356 kbytes */
min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
if (min_free_kbytes < 128)
min_free_kbytes = 128;
if (min_free_kbytes > 65536)
min_free_kbytes = 65536;
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
setup_per_zone_inactive_ratio();
return 0;
}
module_init(init_per_zone_wmark_min)
Setting zone->watermark and zone->lowmem_reserve via proc:

min_free_kbytes_sysctl_handler->setup_per_zone_wmarks

# cat /proc/sys/vm/min_free_kbytes
1356
# cat /proc/sys/vm/lowmem_reserve_ratio
32
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;

/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
lowmem_pages += zone->managed_pages;
}

for_each_zone(zone) {
u64 tmp;

spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone->managed_pages;
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
* need highmem pages, so cap pages_min to a small
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
* deltas controls asynch page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;

min_pages = zone->managed_pages / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
zone->watermark[WMARK_MIN] = min_pages;
} else {
/*
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
zone->watermark[WMARK_MIN] = tmp;
}

/* Example: system memory is 118M (mem=118M in bootargs).
zone->managed_pages = 28764 pages, the number of pages the system may allocate.
lowmem_kbytes = 28764 * 4 = 115056 kbytes;
min_free_kbytes = 1356 kbytes;
min_wmark_pages(zone) = zone->watermark[WMARK_MIN] = tmp = min_free_kbytes / 4 = 339 pages;
zone->watermark[WMARK_LOW]  = 339 + 339/4 = 423;
zone->watermark[WMARK_HIGH] = 339 + 339/2 = 508;
*/
zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);

setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}

/* update totalreserve_pages */
calculate_totalreserve_pages();
}
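The arithmetic above can be checked in isolation. The standalone sketch below reproduces the computation for the 118M example (28764 managed pages, 4K pages); the [128, 65536] clamp and the square-root rule follow the kernel code quoted above:

#include <stdio.h>
#include <math.h>

int main(void)
{
	unsigned long managed_pages = 28764;              /* zone->managed_pages           */
	unsigned long lowmem_kbytes = managed_pages * 4;  /* 4K pages -> 115056 kbytes     */
	unsigned long min_free_kbytes =
		(unsigned long)sqrt((double)lowmem_kbytes * 16);  /* int_sqrt(lowmem * 16) */

	if (min_free_kbytes < 128)
		min_free_kbytes = 128;
	if (min_free_kbytes > 65536)
		min_free_kbytes = 65536;

	/* pages_min = min_free_kbytes >> (PAGE_SHIFT - 10) = min_free_kbytes / 4 */
	unsigned long wmark_min  = min_free_kbytes / 4;
	unsigned long wmark_low  = wmark_min + wmark_min / 4;
	unsigned long wmark_high = wmark_min + wmark_min / 2;

	printf("min_free_kbytes=%lu\n", min_free_kbytes);           /* 1356        */
	printf("min=%lu low=%lu high=%lu\n",
	       wmark_min, wmark_low, wmark_high);                   /* 339 423 508 */
	return 0;
}

(Compile with -lm for sqrt; with a single lowmem zone, tmp in __setup_per_zone_wmarks equals pages_min, which is why the plain division reproduces the kernel's result.)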

3 wake_all_kswapd wakes the kswapd daemon to free memory: kswapd->balance_pgdat->shrink_zone

When get_page_from_freelist fails to allocate, alloc_pages(gfp_mask,order)->alloc_pages_node(numa_node_id(),gfp_mask,order)->

__alloc_pages(gfp_mask,order,node_zonelist(nid,gfp_mask))->__alloc_pages_nodemask->

__alloc_pages_slowpath->wake_all_kswapd wakes the reclaim machinery.

【Body】The memory reclaim process

The page reclaim discussed here is dynamic reclaim: when free memory runs short, the system frees memory that is currently in use to replenish the free pool and satisfy allocations.
1. First, a quick look at the three channels through which memory is released.
1-1>. Release when a user process exits. On exit the process calls do_exit, which eventually calls free_pagetables. That function walks the vmas, finds the physical pages behind the virtual addresses and frees them. As discussed earlier, a user process's virtual address ranges are organized in a red-black tree.
1-2>. Explicit release. In a driver, memory allocated with alloc_pages() is freed with free_pages(), much like malloc and free in C. The calls must be paired, otherwise memory leaks (see the sketch after this list).
1-3>. Release by the memory reclaim path, on demand. Unlike the first two channels this is dynamic: when memory runs short, the system automatically frees some in-use memory according to its policies and returns it to the buddy system for reuse.
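As a reminder of channel 1-2>, here is a minimal kernel-module-style sketch of the paired alloc_pages()/__free_pages() usage (a sketch only: nothing useful is done with the pages and error handling is minimal):

#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *pages;
static const unsigned int order = 2;          /* 2^2 = 4 contiguous pages */

static int __init demo_init(void)
{
	pages = alloc_pages(GFP_KERNEL, order);   /* channel 1-2>: explicit allocation */
	if (!pages)
		return -ENOMEM;
	pr_info("allocated %u pages at pfn %lu\n", 1U << order, page_to_pfn(pages));
	return 0;
}

static void __exit demo_exit(void)
{
	__free_pages(pages, order);               /* must be paired, or the pages leak */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");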
2. Which pages are reclaimed
User-process pages are allocated through page faults, and pages allocated through page faults can be reclaimed. They fall into two classes: file pages (file cache) and anonymous pages (anonymous cache). A file page corresponds to a file on backing storage; an anonymous page has no backing file, for example a user process's stack. These two kinds of pages are the targets of memory reclaim.
Reclaim uses an algorithm that approximates LRU. Pages near the front of an LRU list are active, pages near the tail are inactive. Why only approximate? 1. Pages are not strictly reordered by recency; once on the list they stay put, except that during reclaim some pages may be moved from the tail back towards the front. 2. On top of LRU, Linux adds a Referenced flag. This Referenced-flag variant of approximate LRU is sometimes called the Second-Chance Algorithm (see the sketch after the structures below).
void lruvec_init(struct lruvec *lruvec)
{
enum lru_list lru;
memset(lruvec, 0, sizeof(struct lruvec));

for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
}
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
};
enum lru_list {
LRU_INACTIVE_ANON = LRU_BASE,
LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
LRU_UNEVICTABLE,
NR_LRU_LISTS
};
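To illustrate the second-chance idea in isolation, here is a small standalone sketch (not kernel code): every page carries a referenced bit; the scan clears the bit and gives the page one more trip around the list, and only evicts a page whose bit was already clear:

#include <stdio.h>

#define NPAGES 4

struct fake_page { int id; int referenced; };

/* One pass of a clock-style scan: a referenced page gets a second chance
 * (its bit is cleared and it is skipped); an unreferenced page is evicted. */
static int second_chance_evict(struct fake_page *pages, int n, int *hand)
{
	for (;;) {
		struct fake_page *p = &pages[*hand];

		*hand = (*hand + 1) % n;
		if (p->referenced)
			p->referenced = 0;   /* second chance */
		else
			return p->id;        /* evict this one */
	}
}

int main(void)
{
	struct fake_page pages[NPAGES] = { {0, 1}, {1, 0}, {2, 1}, {3, 1} };
	int hand = 0;

	printf("evict page %d\n", second_chance_evict(pages, NPAGES, &hand));  /* page 1 */
	printf("evict page %d\n", second_chance_evict(pages, NPAGES, &hand));  /* page 0 */
	return 0;
}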
4 Page fault handling

A user process's physical pages are all allocated through page faults, and pages allocated through page faults can be reclaimed. They fall into two classes: file pages (file cache) and anonymous pages (anonymous cache). The main page-fault handling function is shown below:

int handle_pte_fault(struct mm_struct *mm,
       struct vm_area_struct *vma, unsigned long address,
       pte_t *pte, pmd_t *pmd, unsigned int flags)
{
 pte_t entry;
 spinlock_t *ptl;
 entry = *pte;
 /* page not present: the present bit is clear */
 if (!pte_present(entry)) {
  if (pte_none(entry)) {
   if (vma->vm_ops) {
    if (likely(vma->vm_ops->fault))
     return do_linear_fault(mm, vma, address,
      pte, pmd, flags, entry);
   }
    /* anonymous page fault handling: do_anonymous_page, analyzed below */
   return do_anonymous_page(mm, vma, address,
       pte, pmd, flags);
  }
  if (pte_file(entry)){
    /* file page fault handling (nonlinear mapping) */
    return do_nonlinear_fault(mm, vma, address,
                 pte, pmd, flags, entry);
               }
  return do_swap_page(mm, vma, address,
     pte, pmd, flags, entry);
 }
 if (pte_numa(entry))
  return do_numa_page(mm, vma, address, entry, pte, pmd);
 ptl = pte_lockptr(mm, pmd);
 spin_lock(ptl);
 if (unlikely(!pte_same(*pte, entry)))
  goto unlock;
 if (flags & FAULT_FLAG_WRITE) {
  if (!pte_write(entry))
   return do_wp_page(mm, vma, address,
     pte, pmd, ptl, entry);
  entry = pte_mkdirty(entry);
 }
 entry = pte_mkyoung(entry);
 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
  update_mmu_cache(vma, address, pte);
 } else {
  /*
   * This is needed only for protection faults but the arch code
   * is not yet telling us if this is a protection fault or not.
   * This still avoids useless tlb flushes for .text page faults
   * with threads.
   */
  if (flags & FAULT_FLAG_WRITE)
   flush_tlb_fix_spurious_fault(vma, address);
 }
unlock:
 pte_unmap_unlock(pte, ptl);
 return 0;
}
4.1 Anonymous page allocation

Anonymous page fault handling: handle_pte_fault->do_anonymous_page()

static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
  unsigned long address, pte_t *page_table, pmd_t *pmd,
  unsigned int flags)
{
 struct page *page;
 spinlock_t *ptl;
 pte_t entry;
 pte_unmap(page_table);
 /* Check if we need to add a guard page to the stack */
 if (check_stack_guard_page(vma, address) < 0)
  return VM_FAULT_SIGBUS;
 /* Use the zero-page for reads */
 if (!(flags & FAULT_FLAG_WRITE)) {
  entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
      vma->vm_page_prot));
  page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
  if (!pte_none(*page_table))
   goto unlock;
  goto setpte;
 }
 /* Allocate our own private page. */
 if (unlikely(anon_vma_prepare(vma)))
  goto oom;
 /* allocate a physical page */
 page = alloc_zeroed_user_highpage_movable(vma, address);
 if (!page)
  goto oom;
 /*
  * The memory barrier inside __SetPageUptodate makes sure that
  * preceeding stores to the page contents become visible before
  * the set_pte_at() write.
  */
 /* set page->flags; the macro is defined in page-flags.h */
 __SetPageUptodate(page);
 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
  goto oom_free_page;
 entry = mk_pte(page, vma->vm_page_prot);
 if (vma->vm_flags & VM_WRITE)
  entry = pte_mkwrite(pte_mkdirty(entry));
 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 if (!pte_none(*page_table))
  goto release;
 inc_mm_counter_fast(mm, MM_ANONPAGES);
/*
 Set page->flags, page->mapping, page->index, etc.
 Add the newly allocated page to the LRU_ACTIVE_ANON or LRU_UNEVICTABLE list.
*/
 page_add_new_anon_rmap(page, vma, address);
setpte:
 set_pte_at(mm, address, page_table, entry);
 /* No need to invalidate - it was non-present before */
 update_mmu_cache(vma, address, page_table);
unlock:
 pte_unmap_unlock(page_table, ptl);
 return 0;
release:
 mem_cgroup_uncharge_page(page);
 page_cache_release(page);
 goto unlock;
oom_free_page:
 page_cache_release(page);
oom:
 return VM_FAULT_OOM;
}
handle_pte_fault->do_anonymous_page->page_add_new_anon_rmap
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	SetPageSwapBacked(page);
	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */

	if (!PageTransHuge(page))
		__inc_zone_page_state(page, NR_ANON_PAGES);
	else
		__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
	__page_set_anon_rmap(page, vma, address, 1);
      /*
         Check whether this range has been locked with mlock(start, len);
         locked memory is never freed dynamically by the kswapd reclaim path.
      */
	if (!mlocked_vma_newpage(vma, page))
     /* add the page to LRU_ACTIVE_ANON */
		lru_cache_add_lru(page, LRU_ACTIVE_ANON);
	else
     /* add the page to the LRU_UNEVICTABLE list; dynamic reclaim (shrink_zone) does not free locked memory */
		add_page_to_unevictable_list(page);
}
handle_pte_fault->do_wp_page-> page_move_anon_rmap
void page_move_anon_rmap(struct page *page,struct vm_area_struct *vma, unsigned long address)
{
struct anon_vma *anon_vma = vma->anon_vma;
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
}
Anonymous vma initialization: anon_vma_alloc()
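From user space the anonymous path above can be exercised directly: the first write to a MAP_ANONYMOUS mapping faults into do_anonymous_page (the page lands on LRU_ACTIVE_ANON), and mlock'ing the range keeps its pages on the unevictable list so shrink_zone will not reclaim them. A small runnable example:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;

	/* anonymous, private mapping: no backing file */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* the first write to each page faults into do_anonymous_page() */
	memset(buf, 0xa5, len);

	/* lock the range: its pages stay on the unevictable LRU and are not reclaimed */
	if (mlock(buf, len) != 0)
		perror("mlock");

	printf("mapped and locked %zu bytes at %p\n", len, (void *)buf);

	munlock(buf, len);
	munmap(buf, len);
	return 0;
}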

4.2 File page allocation

1) A file can be read directly with the read system call, or its descriptor can be mmap'd; in the mmap case the filesystem-level mmap hook registers the fault handler for file pages (note that this is not the anonymous page fault handler).

Taking the squashfs filesystem as an example:

const struct file_operations generic_ro_fops={
    .llseek = generic_file_llseek,
    .read = do_sync_read,
    .aio_read = generic_file_aio_read, -- direct read path
    .mmap = generic_file_readonly_mmap, -- faulting path, read via filemap_fault
    .splice_read = generic_file_splice_read,
}

2) mmap of a file: generic_file_readonly_mmap->generic_file_mmap

const struct vm_operations_struct generic_file_vm_ops = {
 .fault  = filemap_fault,
 .page_mkwrite = filemap_page_mkwrite,
 .remap_pages = generic_file_remap_pages,
};
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
  return -EINVAL;
 return generic_file_mmap(file, vma);
}
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
 struct address_space *mapping = file->f_mapping;
 if (!mapping->a_ops->readpage)
  return -ENOEXEC;
 file_accessed(file);
/*  const struct vm_operations_struct generic_file_vm_ops = {
      .fault = filemap_fault, // called on a page fault
   }
*/
 vma->vm_ops = &generic_file_vm_ops;
 return 0;
}
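A matching user-space example for the file path: mmap'ing a file read-only goes through generic_file_readonly_mmap, and the first access to the mapping faults into filemap_fault, which brings the data into the page cache (the file name below is just a placeholder, any readable file works):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/etc/hostname";   /* placeholder file */
	struct stat st;
	int fd = open(path, O_RDONLY);

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0) {
		perror(path);
		return 1;
	}

	/* the filesystem's mmap hook installs generic_file_vm_ops on this vma */
	char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* the first read of the mapping faults into filemap_fault() */
	printf("first byte of %s: 0x%02x\n", path, (unsigned char)p[0]);

	munmap(p, st.st_size);
	close(fd);
	return 0;
}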
3) Reading a file with read: the system call path is generic_file_aio_read->do_generic_file_read().

Reading a file with mmap: the page fault path is handle_pte_fault->do_linear_fault->__do_fault->filemap_fault

int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
 int error;
 struct file *file = vma->vm_file;
 struct address_space *mapping = file->f_mapping;
 struct file_ra_state *ra = &file->f_ra;
 struct inode *inode = mapping->host;
 pgoff_t offset = vmf->pgoff;
 struct page *page;
 pgoff_t size;
 int ret = 0;
 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 if (offset >= size)
  return VM_FAULT_SIGBUS;
 /*
  * Do we have something in the page cache already?
  */
 page = find_get_page(mapping, offset);
 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
  /*
   * We found the page, so try async readahead before
   * waiting for the lock.
   */
  /* Call paths:
   ->page_cache_async_readahead->ondemand_readahead->__do_page_cache_readahead->
     read_pages->squashfs_readpage
   ->page_cache_async_readahead->ondemand_readahead->ra_submit->__do_page_cache_readahead->
     read_pages->squashfs_readpage
 */
  do_async_mmap_readahead(vma, ra, file, page, offset);
 } else if (!page) {
  /* No page in the page cache at all */
   /* Call path:
     ->ra_submit->__do_page_cache_readahead->read_pages->squashfs_readpage
   */
  do_sync_mmap_readahead(vma, ra, file, offset);
                print_readpage_flag = 0;
  count_vm_event(PGMAJFAULT);
  mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
  ret = VM_FAULT_MAJOR;
retry_find:
  page = find_get_page(mapping, offset);
  if (!page)
   goto no_cached_page;
 }
 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
  page_cache_release(page);
  return ret | VM_FAULT_RETRY;
 }
 /* Did it get truncated? */
 if (unlikely(page->mapping != mapping)) {
  unlock_page(page);
  put_page(page);
  goto retry_find;
 }
 VM_BUG_ON(page->index != offset);
 /*
  * We have a locked page in the page cache, now we need to check
  * that it's up-to-date. If not, it is going to be due to an error.
  */
 if (unlikely(!PageUptodate(page)))
  goto page_not_uptodate;
 /*
  * Found the page and have a reference on it.
  * We must recheck i_size under page lock.
  */
 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 if (unlikely(offset >= size)) {
  unlock_page(page);
  page_cache_release(page);
  return VM_FAULT_SIGBUS;
 }
 vmf->page = page;
 return ret | VM_FAULT_LOCKED;
no_cached_page:
 /*
  * We're only likely to ever get here if MADV_RANDOM is in
  * effect.
  */
 error = page_cache_read(file, offset);
 /*
  * The page we want has now been added to the page cache.
  * In the unlikely event that someone removed it in the
  * meantime, we'll just come back here and read it again.
  */
 if (error >= 0)
  goto retry_find;
 /*
  * An error return from page_cache_read can result if the
  * system is low on memory, or a problem occurs while trying
  * to schedule I/O.
  */
 if (error == -ENOMEM)
  return VM_FAULT_OOM;
 return VM_FAULT_SIGBUS;
page_not_uptodate:
 /*
  * Umm, take care of errors if the page isn't up-to-date.
  * Try to re-read it _once_. We do this synchronously,
  * because there really aren't any performance issues here
  * and we need to check for errors.
  */
 ClearPageError(page);
 /* Call path: ->squashfs_readpage */
 error = mapping->a_ops->readpage(file, page);
 if (!error) {
  wait_on_page_locked(page);
  if (!PageUptodate(page))
   error = -EIO;
 }
 page_cache_release(page);
 if (!error || error == AOP_TRUNCATED_PAGE)
  goto retry_find;
 /* Things didn't work out. Return zero to tell the mm layer so. */
 shrink_readahead_size_eio(file, ra);
 return VM_FAULT_SIGBUS;
}

From the code of filemap_fault we can see that do_sync_mmap_readahead, do_async_mmap_readahead and page_cache_read all end up reading the file through squashfs_readpage.

For example, when memory is tight the system may reclaim the pages backing a process's shared libraries; when a library is touched again it has to be brought back in through filemap_fault, in most cases loading the code segment into DRAM via do_sync_mmap_readahead.

The three functions do_sync_mmap_readahead, do_async_mmap_readahead and page_cache_read are analyzed below. All three read paths may allocate file pages, and the allocated file pages

must be added to the LRU lists via add_to_page_cache_lru. do_sync_mmap_readahead and do_async_mmap_readahead are implemented similarly, so they are described together:

Whichever path is taken, file page allocation looks like the following (do_generic_file_read does not call page_cache_read directly, but contains a similar add_to_page_cache_lru step):

filemap_fault->page_cache_async_readahead->ondemand_readahead->__do_page_cache_readahead->
     read_pages->squashfs_readpage
filemap_fault->page_cache_async_readahead->ondemand_readahead->ra_submit->__do_page_cache_readahead->
     read_pages->squashfs_readpage
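These readahead paths can also be nudged from user space: madvise(MADV_WILLNEED) on a mapped file range asks the kernel to read the pages ahead of time through the same page-cache readahead machinery, so later faults find the pages already cached. A short example (the file name is a placeholder):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/etc/hostname";   /* placeholder, any readable file */
	struct stat st;
	int fd = open(path, O_RDONLY);

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0) {
		perror(path);
		return 1;
	}

	char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* ask for readahead of the whole range; later accesses should not block on I/O */
	if (madvise(p, st.st_size, MADV_WILLNEED) != 0)
		perror("madvise");

	printf("byte 0 after the readahead hint: 0x%02x\n", (unsigned char)p[0]);

	munmap(p, st.st_size);
	close(fd);
	return 0;
}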

4) File page allocation in __do_page_cache_readahead:

static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read,
			unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	if (isize == 0)
		goto out;

	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
	/*
	 * Preallocate as many pages as we will need.
	 */
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;

		if (page_offset > end_index)
			break;

		rcu_read_lock();
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		rcu_read_unlock();
		if (page)
			continue;

           /* allocate a file page; later read_pages->add_to_page_cache_lru adds it to the LRU lists */
		page = page_cache_alloc_readahead(mapping);
		if (!page)
			break;
		page->index = page_offset;
		list_add(&page->lru, &page_pool);
		if (page_idx == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ret++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}
__do_page_cache_readahead->read_pages
static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	struct blk_plug plug;
	unsigned page_idx;
	int ret;

	blk_start_plug(&plug);

	if (mapping->a_ops->readpages) {
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		/* Clean up the remaining pages */
		put_pages_list(pages);
		goto out;
	}

	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_to_page(pages);
		list_del(&page->lru);
		if (!add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL)) {
  
			mapping->a_ops->readpage(filp, page);
		}
		page_cache_release(page);
	}
	ret = 0;

out:
	blk_finish_plug(&plug);

	return ret;
}
5) File page allocation in squashfs_readpage (see also the article "linux文件读取过程" on the Linux file read path):
static int squashfs_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
	int bytes, i, offset = 0, sparse = 0;
	struct squashfs_cache_entry *buffer = NULL;
	void *pageaddr;
	int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
	int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
	int start_index = page->index & ~mask;
	int end_index = start_index | mask;
	int file_end = i_size_read(inode) >> msblk->block_log;
    
	if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
					PAGE_CACHE_SHIFT))
		goto out;

	if (index < file_end || squashfs_i(inode)->fragment_block ==
					SQUASHFS_INVALID_BLK) {
		/*
		 * Reading a datablock from disk.  Need to read block list
		 * to get location and block size.
		 */
		u64 block = 0;
		int bsize = read_blocklist(inode, index, &block);
		if (bsize < 0)
			goto error_out;
   
		if (bsize == 0) { /* hole */
			bytes = index == file_end ?
				(i_size_read(inode) & (msblk->block_size - 1)) :
				 msblk->block_size;
			sparse = 1;
		} else {
			/*
			 * Read and decompress datablock.
			 */
			buffer = squashfs_get_datablock(inode->i_sb,
								block, bsize);
			if (buffer->error) {
                     squashfs_file_lookup(inode,file->f_dentry,0);
				ERROR("Unable to read page0 ,block %llx, size %x"
					"\n", block, bsize);
				squashfs_cache_put(buffer);
				goto error_out;
			}
			bytes = buffer->length;
		}
	} else {
                /*
                * Datablock is stored inside a fragment (tail-end packed
                * block).
                */
	        if(print_readpage_flag)
			printk("read page, fragment block %llx, size %x\n",
				squashfs_i(inode)->fragment_block,
				squashfs_i(inode)->fragment_size);


		buffer = squashfs_get_fragment(inode->i_sb,
				squashfs_i(inode)->fragment_block,
				squashfs_i(inode)->fragment_size);


		if (buffer->error) {
			ERROR("Unable to read page, block %llx, size %x\n",
				squashfs_i(inode)->fragment_block,
				squashfs_i(inode)->fragment_size);
			squashfs_cache_put(buffer);
			goto error_out;
		}
		bytes = i_size_read(inode) & (msblk->block_size - 1);
		offset = squashfs_i(inode)->fragment_offset;
	}

	/*
	 * Loop copying datablock into pages.  As the datablock likely covers
	 * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
	 * grab the pages from the page cache, except for the page that we've
	 * been called to fill.
	 */
	for (i = start_index; i <= end_index && bytes > 0; i++,
			bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
		struct page *push_page;
		int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);

		TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
          /* Allocate file pages; grab_cache_page_nowait->add_to_page_cache_lru adds them to the LRU lists.
             squashfs_readpage reads a whole logical block at a time while its argument is a single page,
             so the remaining pages of the block have to be allocated here.
          */
		push_page = (i == page->index) ? page :
			grab_cache_page_nowait(page->mapping, i);

		if (!push_page)
			continue;

		if (PageUptodate(push_page))
			goto skip_page;

		pageaddr = kmap_atomic(push_page);
		squashfs_copy_data(pageaddr, buffer, offset, avail);
		memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
		kunmap_atomic(pageaddr);
		flush_dcache_page(push_page);
		SetPageUptodate(push_page);
skip_page:
		unlock_page(push_page);
		if (i != page->index)
			page_cache_release(push_page);
	}

	if (!sparse)
		squashfs_cache_put(buffer);

	return 0;

error_out:
	SetPageError(page);
out:
	pageaddr = kmap_atomic(page);
	memset(pageaddr, 0, PAGE_CACHE_SIZE);
	kunmap_atomic(pageaddr);
	flush_dcache_page(page);
	if (!PageError(page))
		SetPageUptodate(page);
	unlock_page(page);

	return 0;
}
squashfs_readpage->grab_cache_page_nowait
struct page *
grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_get_page(mapping, index);

	if (page) {
		if (trylock_page(page))
			return page;
		page_cache_release(page);
		return NULL;
	}
     /* allocate a file page; add_to_page_cache_lru below adds it to the LRU lists */
	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
		page_cache_release(page);
		page = NULL;
	}
	return page;
}

6) File page allocation in filemap_fault->page_cache_read:

do_linear_fault->__do_fault->filemap_fault->page_cache_read

static int page_cache_read(struct file *file, pgoff_t offset)
{
 struct address_space *mapping = file->f_mapping;
 struct page *page; 
 int ret;
 do {
   /* allocate memory via alloc_pages */
  page = page_cache_alloc_cold(mapping);
  if (!page)
   return -ENOMEM;
/*
 Set page->flags, page->mapping, page->index, etc.
 Add the newly allocated page to the LRU_INACTIVE_FILE list.
*/
  ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
  if (ret == 0)
   ret = mapping->a_ops->readpage(file, page);
  else if (ret == -EEXIST)
   ret = 0; /* losing race to add is OK */
  page_cache_release(page);
 } while (ret == AOP_TRUNCATED_PAGE);
  
 return ret;
}

do_linear_fault->__do_fault->filemap_fault->page_cache_read->add_to_page_cache_lru()

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
    pgoff_t offset, gfp_t gfp_mask)
{
 int ret;
 /* sets PG_locked; calls add_to_page_cache_locked */
 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
 if (ret == 0)
  /* add the page to the LRU_INACTIVE_FILE list */
  lru_cache_add_file(page);
 return ret;
}

add_to_page_cache_locked sets page->mapping = file->f_mapping and page->index = offset;

Note: in do_dentry_open, f_mapping = inode->i_mapping;

inode->i_mapping is initialized:

1> when a file is created, in do_last->lookup_open->vfs_create->ubifs_create->ubifs_new_inode;

2> for an existing file, in do_last->lookup_open->lookup_real->ubifs_lookup->ubifs_iget.

add_to_page_cache_lru->add_to_page_cache->add_to_page_cache_locked is the counterpart of __delete_from_page_cache;

int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
  pgoff_t offset, gfp_t gfp_mask)
{
 int error;
 VM_BUG_ON(!PageLocked(page));
 VM_BUG_ON(PageSwapBacked(page));
 error = mem_cgroup_cache_charge(page, current->mm,
     gfp_mask & GFP_RECLAIM_MASK);
 if (error)
  goto out;
 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 if (error == 0) {
  page_cache_get(page);
   /* set page->mapping = file->f_mapping */
  page->mapping = mapping;
   /* offset is the page index of the file offset being accessed */
  page->index = offset;
  spin_lock_irq(&mapping->tree_lock);
  error = radix_tree_insert(&mapping->page_tree, offset, page);
  if (likely(!error)) {
   mapping->nrpages++;
   __inc_zone_page_state(page, NR_FILE_PAGES);
   spin_unlock_irq(&mapping->tree_lock);
   trace_mm_filemap_add_to_page_cache(page);
  } else {
   page->mapping = NULL;
   /* Leave page->index set: truncation relies upon it */
   spin_unlock_irq(&mapping->tree_lock);
   mem_cgroup_uncharge_cache_page(page);
   page_cache_release(page);
  }
  radix_tree_preload_end();
 } else
  mem_cgroup_uncharge_cache_page(page);
out:
 return error;
}

7) File page allocation in do_generic_file_read

For details see the post "linux文件系统实现原理简述": http://write.blog.csdn.net/postedit/71249365 ;


5 The memory reclaim code:

During alloc_pages there are two main points at which memory is reclaimed:
Point 1: get_page_from_freelist->zone_reclaim->shrink_zone, where zone_watermark_ok decides whether zone_reclaim is needed;

Point 2: __alloc_pages_slowpath->wake_all_kswapd wakes the reclaim machinery: kswapd->balance_pgdat->shrink_zone;

During alloc_pages, if get_page_from_freelist fails, the kswapd daemon is woken to reclaim memory.

Either way, reclaim ultimately goes through a function like shrink_zone.

shrink_zone->shrink_lruvec
/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;

/* work out how many pages of each of the NR_LRU_LISTS lists to scan */
get_scan_count(lruvec, sc, nr);

blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(lru) {
if (nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
nr[lru] -= nr_to_scan;

nr_reclaimed += shrink_list(lru, nr_to_scan,
   lruvec, sc);
}
}
/*
* On large memory systems, scan >> priority can become
* really large. This is fine for the starting priority;
* we want to put equal scanning pressure on each zone.
* However, if the VM has a harder time of freeing pages,
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/
if (nr_reclaimed >= nr_to_reclaim &&
   sc->priority < DEF_PRIORITY)
break;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;

/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio (here inactive_anon_is_low(lruvec) = 0).
*/
if (inactive_anon_is_low(lruvec))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
  sc, LRU_ACTIVE_ANON);

throttle_vm_writeout(sc->gfp_mask);
}
shrink_lruvec->shrink_list()
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
/* Return value of inactive_list_is_low:
for file pages: true if the LRU_ACTIVE_FILE list holds more pages than the LRU_INACTIVE_FILE list, otherwise false;
for anonymous pages it stays false here (e.g. when there is no swap);
*/
if (inactive_list_is_low(lruvec, lru))
/* move pages from the active list to the inactive list */
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
/* the path that actually reclaims pages:
shrink_inactive_list->shrink_page_list
*/
return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}
When there are not enough pages on the inactive list, shrink_active_list is called; it moves pages from the active list to the inactive list.

shrink_lruvec->shrink_list()->shrink_active_list()

static void shrink_active_list(unsigned long nr_to_scan,
      struct lruvec *lruvec,
      struct scan_control *sc,
      enum lru_list lru)
{
unsigned long nr_taken;
unsigned long nr_scanned;
unsigned long vm_flags;
LIST_HEAD(l_hold);/* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned long nr_rotated = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct zone *zone = lruvec_zone(lruvec);    
        unsigned long vm_oflags = 0;
        static int age = 0;
        int puttoinactive = 0;
        lru_add_drain();

if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;

spin_lock_irq(&zone->lru_lock);

nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
    &nr_scanned, sc, isolate_mode, lru);
if (global_reclaim(sc))
zone->pages_scanned += nr_scanned;

reclaim_stat->recent_scanned[file] += nr_taken;

__count_zone_vm_events(PGREFILL, zone, nr_scanned);
__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
spin_unlock_irq(&zone->lru_lock);

while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);

if (unlikely(!page_evictable(page))) {
putback_lru_page(page);
continue;
}

if (unlikely(buffer_heads_over_limit)) {
if (page_has_private(page) && trylock_page(page)) {
if (page_has_private(page))
try_to_release_page(page, 0);
unlock_page(page);
}
}
/* 
page_referenced returns 0: the page should go onto the inactive list; if for some special reason certain
pages must not be reclaimed, this is a convenient place to special-case them;
page_referenced returns non-zero: if the file page holds executable code it is put back on the active list,
so code pages get more chances to stay in memory;
 */
if (page_referenced(page, 0, sc->target_mem_cgroup,
   &vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure.  Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
/* an executable file-backed page */
if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);
continue;
}
}

ClearPageActive(page); /* we are de-activating */
/* add the page to the inactive list */
list_add(&page->lru, &l_inactive);
}

/*
* Move pages back to the lru list.
*/
spin_lock_irq(&zone->lru_lock);
/*
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated.  This
* helps balance scan pressure between file and anonymous pages in
* get_scan_ratio.
*/
reclaim_stat->recent_rotated[file] += nr_rotated;

move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);

free_hot_cold_page_list(&l_hold, 1);
}

shrink_active_list moves pages onto the inactive list;

the next step is the path that actually reclaims pages:

shrink_inactive_list->shrink_page_list

static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
    struct scan_control *sc, enum lru_list lru)
{
LIST_HEAD(page_list);
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
unsigned long nr_dirty = 0;
unsigned long nr_writeback = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct zone *zone = lruvec_zone(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);

/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
return SWAP_CLUSTER_MAX;
}

lru_add_drain();

if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;

spin_lock_irq(&zone->lru_lock);

nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
    &nr_scanned, sc, isolate_mode, lru);

__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);

if (global_reclaim(sc)) {
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
else
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
}
spin_unlock_irq(&zone->lru_lock);

if (nr_taken == 0)
return 0;

/* this is where pages are actually reclaimed */
nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
&nr_dirty, &nr_writeback, false);

spin_lock_irq(&zone->lru_lock);

reclaim_stat->recent_scanned[file] += nr_taken;

if (global_reclaim(sc)) {
if (current_is_kswapd())
__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
      nr_reclaimed);
else
__count_zone_vm_events(PGSTEAL_DIRECT, zone,
      nr_reclaimed);
}

putback_inactive_pages(lruvec, &page_list);

__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);

spin_unlock_irq(&zone->lru_lock);

free_hot_cold_page_list(&page_list, 1);

/*
* If reclaim is isolating dirty pages under writeback, it implies
* that the long-lived page allocation rate is exceeding the page
* laundering rate. Either the global limits are not being effective
* at throttling processes due to the page distribution throughout
* zones or there is heavy usage of a slow backing device. The
* only option is to throttle from reclaim context which is not ideal
* as there is no guarantee the dirtying process is throttled in the
* same way balance_dirty_pages() manages.
*
* This scales the number of dirty pages that must be under writeback
* before throttling depending on priority. It is a simple backoff
* function that has the most effect in the range DEF_PRIORITY to
* DEF_PRIORITY-2, which is the range where reclaim is considered to be
* in trouble.
*
* DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
* DEF_PRIORITY-1  50% must be PageWriteback
* DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
* ...
* DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
*                     isolated page is PageWriteback
*/
if (nr_writeback && nr_writeback >=
(nr_taken >> (DEF_PRIORITY - sc->priority)))
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
nr_scanned, nr_reclaimed,
sc->priority,
trace_shrink_flags(file));
return nr_reclaimed;
}
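The PGSCAN_KSWAPD/PGSCAN_DIRECT and PGSTEAL_* events counted above, as well as the LRU list sizes maintained by shrink_active_list/shrink_inactive_list, are exported through /proc/vmstat. The following user-space sketch (added here for illustration, not part of the original post; the field names are those found on 3.x kernels and may differ on other versions) prints the relevant counters so the effect of reclaim can be observed:

#include <stdio.h>
#include <string.h>

/* Print reclaim-related counters from /proc/vmstat. */
int main(void)
{
	static const char *keys[] = {
		"nr_active_anon", "nr_inactive_anon",
		"nr_active_file", "nr_inactive_file",
		"pgscan_kswapd_normal", "pgscan_direct_normal",
		"pgsteal_kswapd_normal", "pgsteal_direct_normal",
	};
	char name[64];
	unsigned long long val;
	unsigned int i;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("fopen /proc/vmstat");
		return 1;
	}
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
			if (strcmp(name, keys[i]) == 0)
				printf("%-24s %llu\n", name, val);
	}
	fclose(f);
	return 0;
}

Running it before and after a reclaim pass shows pages moving from the active to the inactive lists and the scan/steal counters increasing.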
Once a page has been unmapped and, if necessary, written back, shrink_page_list() detaches it from its address_space:

shrink_inactive_list->shrink_page_list()->__remove_mapping()
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));

spin_lock_irq(&mapping->tree_lock);
/*
* The non racy check for a busy page.
*
* Must be careful with the order of the tests. When someone has
* a ref to the page, it may be possible that they dirty it then
* drop the reference. So if PageDirty is tested before page_count
* here, then the following race may occur:
*
* get_user_pages(&page);
* [user mapping goes away]
* write_to(page);
* !PageDirty(page)    [good]
* SetPageDirty(page);
* put_page(page);
* !page_count(page)   [good, discard it]
*
* [oops, our write_to data is lost]
*
* Reversing the order of the tests ensures such a situation cannot
* escape unnoticed. The smp_rmb is needed to ensure the page->flags
* load is not satisfied before that of page->_count.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
*/
if (!page_freeze_refs(page, 2))
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_unfreeze_refs(page, 2);
goto cannot_free;
}

if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
spin_unlock_irq(&mapping->tree_lock);
swapcache_free(swap, page);
} else {
void (*freepage)(struct page *);

freepage = mapping->a_ops->freepage;

__delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);

if (freepage != NULL)
freepage(page);
}

return 1;

cannot_free:
spin_unlock_irq(&mapping->tree_lock);
return 0;
}
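The critical step above is page_freeze_refs(page, 2): the page may only be detached if exactly two references remain (the page cache and the isolating code), and the check must atomically drop the count to 0 so that nobody can take a new reference in between. A minimal user-space model of that idea (an illustration added here, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>

/* Model of page_freeze_refs(): succeed only if the reference count is
 * exactly 'expected', atomically replacing it with 0 ("frozen"). */
static bool freeze_refs(atomic_int *refcount, int expected)
{
	int old = expected;

	return atomic_compare_exchange_strong(refcount, &old, 0);
}

/* Model of page_unfreeze_refs(): give the references back. */
static void unfreeze_refs(atomic_int *refcount, int expected)
{
	atomic_store(refcount, expected);
}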

For file-backed pages the page is removed from the page cache:

__remove_mapping->__delete_from_page_cache()

void __delete_from_page_cache(struct page *page)
{
 struct address_space *mapping = page->mapping;
 trace_mm_filemap_delete_from_page_cache(page);
 /*
  * if we're uptodate, flush out into the cleancache, otherwise
  * invalidate any existing cleancache entries.  We can't leave
  * stale data around in the cleancache once our page is gone
  */
 if (PageUptodate(page) && PageMappedToDisk(page))
  cleancache_put_page(page);
 else
  cleancache_invalidate_page(mapping, page);
 /* remove the page from the page-cache radix tree; this undoes the insertion done in add_to_page_cache_locked */
 radix_tree_delete(&mapping->page_tree, page->index);
 page->mapping = NULL;
 /* Leave page->index set: truncation lookup relies upon it */
 mapping->nrpages--;
 __dec_zone_page_state(page, NR_FILE_PAGES);
 if (PageSwapBacked(page))
  __dec_zone_page_state(page, NR_SHMEM);
 BUG_ON(page_mapped(page));
 /*
  * Some filesystems seem to re-dirty the page even after
  * the VM has canceled the dirty bit (eg ext3 journaling).
  *
  * Fix it up by doing a final dirty accounting check after
  * having removed the page entirely.
  */
 if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
  dec_zone_page_state(page, NR_FILE_DIRTY);
  dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 }
}
【Summary】
1 Initialization of the system memory information (including pgdat and struct zone):
paging_init->bootmem_init->arm_bootmem_init
paging_init->bootmem_init->arm_bootmem_free->free_area_init_node
Allocate memory for the page descriptor array (mem_map):
free_area_init_node->alloc_node_mem_map(pgdat)->alloc_bootmem_node_nopanic->alloc_bootmem_bdata->__reserve
Initialize the page descriptors and the struct zone array node_zones:
free_area_init_node->free_area_init_core->memmap_init_zone
Initialize node_zonelists:
start_kernel->build_all_zonelists()
Hand all page frames over to the buddy allocator at the end of initialization; each page->lru is added to the matching per-order free list:
mem_init->free_all_bootmem->free_all_bootmem_core-> ... ->__free_one_page:
list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
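The per-order free lists populated here can be inspected at run time through /proc/buddyinfo. The sketch below (added for illustration) parses it; each data column is the number of free blocks of order 0, 1, ... in that zone, i.e. the length of zone->free_area[order].free_list:

#include <stdio.h>

/* Sum count << order over all columns to get a zone's free pages. */
int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/buddyinfo", "r");

	if (!f) {
		perror("fopen /proc/buddyinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		char zone[32];
		int node, consumed, n, order = 0;
		unsigned long count, free_pages = 0;
		const char *p = line;

		if (sscanf(p, "Node %d, zone %31s%n", &node, zone, &consumed) != 2)
			continue;
		p += consumed;
		while (sscanf(p, "%lu%n", &count, &n) == 1) {
			free_pages += count << order;
			order++;
			p += n;
		}
		printf("node %d, zone %-8s: %lu free pages\n",
		       node, zone, free_pages);
	}
	fclose(f);
	return 0;
}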
2 Allocating memory: alloc_pages->__alloc_pages_nodemask->get_page_from_freelist->buffered_rmqueue->rmqueue_bulk. When the requested order cannot be satisfied directly, a larger buddy block is split (for example a 4-page block is split into two 2-page blocks); this happens when allocation from the current order's free list fails.
rmqueue_bulk takes pages from the zone->free_area free lists and links them onto the per-CPU pcp->lists.
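As a minimal illustration of this allocation path (a sketch added here, not code from the post), a kernel module could request an order-2 block from the buddy system and hand it back:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>

/* Allocate 2^2 = 4 contiguous pages from the buddy allocator and free
 * them again; a NULL return means the watermark check in
 * get_page_from_freelist could not be satisfied even after reclaim. */
static int buddy_alloc_demo(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);

	if (!page)
		return -ENOMEM;

	/* GFP_KERNEL pages come from lowmem, so page_address() is valid */
	memset(page_address(page), 0, 4 * PAGE_SIZE);

	__free_pages(page, 2);	/* the block returns to zone->free_area[2] */
	return 0;
}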
3 The zone->watermark values are set in __setup_per_zone_wmarks.
During allocation, alloc_pages->zone_watermark_ok compares the zone's free pages against the watermarks to decide whether the request can be satisfied.
If there are not enough free pages, zone_reclaim->shrink_zone is tried first; if that still fails, direct reclaim is entered:
__alloc_pages_slowpath->__alloc_pages_direct_reclaim->__perform_reclaim->try_to_free_pages->shrink_zones->shrink_zone
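A sketch of how the three watermarks relate to min_free_kbytes, assuming the 3.x __setup_per_zone_wmarks() formula and ignoring the per-zone scaling by managed_pages (so this is an approximation for a single-zone system):

/* min_free_kbytes is distributed across zones in proportion to
 * zone->managed_pages; with 4KB pages (PAGE_SHIFT = 12): */
static void wmark_sketch(unsigned long min_free_kbytes)
{
	unsigned long pages_min  = min_free_kbytes >> (12 - 10);  /* KB -> pages */
	unsigned long wmark_min  = pages_min;
	unsigned long wmark_low  = wmark_min + (pages_min >> 2);  /* min + 25% */
	unsigned long wmark_high = wmark_min + (pages_min >> 1);  /* min + 50% */

	/* zone_watermark_ok() compares free pages against one of these;
	 * kswapd is woken when free pages drop below 'low' and goes back
	 * to sleep once they rise above 'high'. */
	(void)wmark_min; (void)wmark_low; (void)wmark_high;
}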

ps: the buddy system maintains 11 free lists holding blocks of 2^0, 2^1, ..., 2^10 pages, so the largest contiguous allocation is 1024 pages = 4MB (with 4KB pages).

4 Pages allocated through page faults
1) File-backed (mapped) pages, page->mapping = file->f_mapping:
filemap_fault->add_to_page_cache_lru->add_to_page_cache_locked, which sets page->mapping = mapping; page->index = offset;
2) Anonymous pages: handle_pte_fault->do_wp_page->page_move_anon_rmap
3) Anonymous vma initialization: anon_vma_alloc()
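A user-space sketch (added for illustration, not from the post) that exercises both fault paths: the first read of a file-backed mapping goes through filemap_fault, while the first write to an anonymous mapping allocates an anonymous page:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* File-backed mapping: the first access of 'p' faults into filemap_fault. */
	int fd = open("/etc/hostname", O_RDONLY);	/* any readable file will do */
	char *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p != MAP_FAILED)
		printf("first byte of the file page: %c\n", p[0]);

	/* Anonymous mapping: the first write faults into the anonymous path
	 * (handle_pte_fault->do_anonymous_page). */
	char *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (a != MAP_FAILED)
		memset(a, 0x5a, 4096);

	if (p != MAP_FAILED)
		munmap(p, 4096);
	if (a != MAP_FAILED)
		munmap(a, 4096);
	if (fd >= 0)
		close(fd);
	return 0;
}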

5 Dynamic memory reclaim mainly targets pages that were allocated through page faults.

1) Reclaim triggered by alloc_pages when free memory runs low:
   try_to_free_pages->shrink_zone;
   zone_reclaim->shrink_zone;
shrink_zone -> shrink_lruvec -> shrink_list->shrink_inactive_list->shrink_page_list
shrink_zone -> shrink_lruvec -> shrink_list->shrink_active_list
2) Reclaim by the kswapd daemon: kswapd->balance_pgdat->shrink_zone
kswapd_init->kswapd_run->kthread_run(kswapd, pgdat, "kswapd%d", nid)
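To watch these paths fire, memory pressure can be generated from user space. The sketch below (sizes are arbitrary assumptions, not from the post) keeps allocating and touching anonymous memory until kswapd and, eventually, direct reclaim have to run; stop it with Ctrl-C before the OOM killer steps in:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CHUNK (16UL * 1024 * 1024)	/* 16MB per step, arbitrary */

int main(void)
{
	unsigned long total = 0;

	for (;;) {
		char *p = malloc(CHUNK);
		if (!p)
			break;
		/* touching every byte forces real page allocation, driving
		 * free memory below the zone watermarks */
		memset(p, 0x5a, CHUNK);
		total += CHUNK;
		printf("allocated %lu MB\n", total >> 20);
		sleep(1);	/* give kswapd time to react */
	}
	return 0;
}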
6 The real page reclaim path
shrink_page_list->try_to_unmap_anon:
For anonymous pages, the anon_vma is stored in the page descriptor's mapping field. When the lowest bit of mapping is 0, it holds the address_space of a file-backed page; when the bit is set, it holds the anon_vma of an anonymous page (see the sketch after this list):
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
An anonymous page may be mapped by the vmas of several processes; only after every process has dropped it (when _mapcount falls back to -1) can it be removed from the active/inactive lists and its frame freed via free_hot_page. page->_mapcount is set up in copy_one_pte->page_dup_rmap(page). try_to_unmap_anon walks the vmas and unmaps the page from each one in turn.
Notes on the LRU state machine:
1) shrink_active_list runs when the inactive list has become smaller than the active list, in order to shrink the active list; move_active_pages_to_lru then does list_move(&page->lru, &lruvec->lists[lru]) to put the pages back on the target LRU list.
2) mark_page_accessed moves a page from the inactive list towards the active list using a second-chance scheme:
inactive,referenced=0 -> inactive,referenced=1 -> active,referenced=0 -> active,referenced=1
3) page_referenced moves a page from the active list towards the inactive list; each scan by the reclaim algorithm ages the page by one step.
4) User-process pages are all allocated through page faults (handle_pte_fault) and are all reclaimable; they fall into file-backed pages and anonymous pages.
5) handle_pte_fault (do_swap_page swaps pages in, pageout swaps them out; it is called when the page is not present, i.e. the present bit is cleared). Anonymous pages are added to the active LRU list; file pages are added to the inactive LRU list.
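A sketch of how the low bit of page->mapping is interpreted, written as a simplified model of the kernel's PageAnon()/page_anon_vma() helpers (the my_ prefix marks these as illustrative, not kernel functions):

#include <linux/mm_types.h>
#include <linux/rmap.h>

#define MY_PAGE_MAPPING_ANON	1UL

/* Low bit clear: mapping points to the file's address_space.
 * Low bit set:   mapping points to the page's anon_vma. */
static inline bool my_page_is_anon(struct page *page)
{
	return ((unsigned long)page->mapping & MY_PAGE_MAPPING_ANON) != 0;
}

static inline struct anon_vma *my_page_anon_vma(struct page *page)
{
	unsigned long m = (unsigned long)page->mapping;

	if (!(m & MY_PAGE_MAPPING_ANON))
		return NULL;	/* file-backed page */
	return (struct anon_vma *)(m & ~MY_PAGE_MAPPING_ANON);
}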

【Miscellaneous】

1 A way to trigger dynamic memory reclaim repeatedly; under memory pressure even code (text) pages may be reclaimed:

static int wake_all_kswapd_debug(void)
{
	int j;

	/* trigger reclaim once every two seconds, 30 times in total */
	for (j = 0; j < 30; j++) {
		/* order 8 means 256 contiguous pages, i.e. 1MB;
		 * NODE_DATA(0) selects node 0, whose zonelists cover ZONE_NORMAL */
		wake_all_kswapd(8, NODE_DATA(0)->node_zonelists, 0, 0);
		msleep(2000);
	}
	return 0;
}
2 How to tell that the code segment or shared libraries of the "test" process are being re-read from disk: add a print inside filemap_fault().

int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	...
	if (strncmp(current->comm, "test", 4) == 0) {
		printk("[%s]filemap_fault:%s\n",
		       current->comm, file->f_path.dentry->d_iname);
	}
	...
}

3 To measure how many file pages are needed to load a shared library into RAM, count allocations at the file-page allocation sites identified above.

File pages are added: in general the system turns newly allocated pages into file pages via add_to_page_cache_lru, so a counter can be bumped right after it:

1> do_generic_file_read allocates file pages; count after do_generic_file_read->add_to_page_cache_lru;

2> count after filemap_fault->page_cache_read->add_to_page_cache_lru;

3> count after filemap_fault->__do_page_cache_readahead->read_pages->add_to_page_cache_lru;

4> count after do_generic_file_read/filemap_fault->(mapping->a_ops->readpage = squashfs_readpage)->squashfs_readpage->grab_cache_page_nowait; note that the page passed in as a parameter must not be counted again;

File pages are removed:

count after shrink_page_list->remove_mapping->__remove_mapping->__delete_from_page_cache.
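A minimal sketch of such a counter (an assumed debug patch, not upstream code): a global atomic incremented after each add_to_page_cache_lru()/grab_cache_page_nowait() site listed above and decremented after __delete_from_page_cache():

#include <linux/atomic.h>
#include <linux/printk.h>

/* Debug-only counter of file pages seen by the instrumented call sites. */
static atomic_long_t filepage_dbg_count = ATOMIC_LONG_INIT(0);

static inline void filepage_dbg_inc(const char *site)
{
	long n = atomic_long_inc_return(&filepage_dbg_count);

	if ((n & 0xff) == 0)	/* rate-limit the output a little */
		printk("file pages: %ld (last site: %s)\n", n, site);
}

static inline void filepage_dbg_dec(void)
{
	atomic_long_dec(&filepage_dbg_count);
}

For example, filepage_dbg_inc("page_cache_read") would be placed right after the add_to_page_cache_lru() call in page_cache_read(), and filepage_dbg_dec() right after __delete_from_page_cache() in __remove_mapping().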


4 Memory-related sysctl tuning:

vm.min_free_kbytes = 409600
vm.vfs_cache_pressure = 200   // sysctl_vfs_cache_pressure in the kernel

vm.swappiness = 40

Raising min_free_kbytes keeps more physical memory free at all times, which makes sudden bursts of paging less likely. swappiness defaults to 60 and controls how willing the kernel is to swap out anonymous pages relative to dropping page cache; the value of 40 used here makes the kernel prefer reclaiming page cache over swapping. vfs_cache_pressure defaults to 100; increasing it makes the kernel reclaim the dentry and inode caches more aggressively. The point of tuning these three parameters is to have the system reclaim caches steadily during normal operation instead of having to do large amounts of paging all at once.
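These values are normally set with sysctl or /etc/sysctl.conf; equivalently, a small C helper (a sketch added here, assuming the usual /proc/sys/vm layout) can write them through procfs:

#include <stdio.h>

/* Write one value into /proc/sys/vm/<name>; returns 0 on success. */
static int write_vm_tunable(const char *name, const char *value)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/vm/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(value, f);
	fclose(f);
	return 0;
}

int main(void)
{
	write_vm_tunable("min_free_kbytes", "409600");
	write_vm_tunable("vfs_cache_pressure", "200");
	write_vm_tunable("swappiness", "40");
	return 0;
}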


Reposted from blog.csdn.net/eleven_xiy/article/details/75195490