【Linux Kernel Study Notes III】Memory Management - Pages (page)

1. Pages

    System memory is divided into many blocks of fixed size, also known as page frames; the page frame is the smallest unit in which the system manages physical memory. The kernel keeps struct page as small as possible, because on a modern machine physical memory consists of a huge number of page frames: even a slight growth of struct page is multiplied into a large amount of physical memory spent just on storing all the page descriptors. For example, with 4 KB pages and 384 MB of main memory, about 100,000 pages are needed.
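
To make the overhead concrete, here is a quick back-of-the-envelope calculation as a standalone demo (the 64-byte sizeof(struct page) is an assumed figure, typical of 64-bit builds, not a fixed value):

#include <stdio.h>

int main(void)
{
	unsigned long mem_bytes = 384UL << 20;	/* 384 MB of main memory */
	unsigned long page_size = 4UL << 10;	/* 4 KB pages */
	unsigned long page_desc = 64;		/* assumed sizeof(struct page) */
	unsigned long nr_pages  = mem_bytes / page_size;

	/* 98304 page frames -- roughly the 100,000 quoted above */
	printf("page frames: %lu\n", nr_pages);
	/* 6144 KB of RAM consumed just by the struct page descriptors */
	printf("memmap size: %lu KB\n", nr_pages * page_desc >> 10);
	return 0;
}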

     Each physical page frame is described by a struct page, defined in <kernel/include/linux/mm_types.h> as follows:

/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 *
 * The objects in struct page are organized in double word blocks in
 * order to allows us to use atomic double word operations on portions
 * of struct page. That is currently only used by slub but the arrangement
 * allows the use of atomic double word operations on the flags/mapping
 * and lru list pointers also.
 */
struct page {
	/* First double word block */
	unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	union {
		struct address_space *mapping;	/* If low bit clear, points to
						 * inode address_space, or NULL.
						 * If page mapped as anonymous
						 * memory, low bit is set, and
						 * it points to anon_vma object:
						 * see PAGE_MAPPING_ANON below.
						 */
		void *s_mem;			/* slab first object */
	};

	/* Second double word */
	struct {
		union {
			pgoff_t index;		/* Our offset within mapping. */
			void *freelist;		/* sl[aou]b first free object */
		};

		union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
	defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
			/* Used for cmpxchg_double in slub */
			unsigned long counters;
#else
			/*
			 * Keep _count separate from slub cmpxchg_double data.
			 * As the rest of the double word is protected by
			 * slab_lock but _count is not.
			 */
			unsigned counters;
#endif

			struct {

				union {
					/*
					 * Count of ptes mapped in
					 * mms, to show when page is
					 * mapped & limit reverse map
					 * searches.
					 *
					 * Used also for tail pages
					 * refcounting instead of
					 * _count. Tail pages cannot
					 * be mapped and keeping the
					 * tail page _count zero at
					 * all times guarantees
					 * get_page_unless_zero() will
					 * never succeed on tail
					 * pages.
					 */
					atomic_t _mapcount;

					struct { /* SLUB */
						unsigned inuse:16;
						unsigned objects:15;
						unsigned frozen:1;
					};
					int units;	/* SLOB */
				};
				atomic_t _count;		/* Usage count, see below. */
			};

			unsigned int active;	/* SLAB */
		};
	};

	/* Third double word block */
	union {
		struct list_head lru;	/* Pageout list, eg. active_list
					 * protected by zone->lru_lock !
					 * Can be used as a generic list
					 * by the page owner.
					 */
		struct {		/* slub per cpu partial pages */
			struct page *next;	/* Next partial slab */
#ifdef CONFIG_64BIT
			int pages;	/* Nr of partial slabs left */
			int pobjects;	/* Approximate # of objects */
#else
			short int pages;
			short int pobjects;
#endif
		};

		struct slab *slab_page; /* slab fields */
		struct rcu_head rcu_head;	/* Used by SLAB
						 * when destroying via RCU
						 */
		/* First tail page of compound page */
		struct {
			compound_page_dtor *compound_dtor;
			unsigned long compound_order;
		};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
		pgtable_t pmd_huge_pte; /* protected by page->ptl */
#endif
	};

	/* Remainder is not double word aligned */
	union {
		unsigned long private;		/* Mapping-private opaque data:
					 	 * usually used for buffer_heads
						 * if PagePrivate set; used for
						 * swp_entry_t if PageSwapCache;
						 * indicates order in the buddy
						 * system if PG_buddy is set.
						 */
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
		spinlock_t *ptl;
#else
		spinlock_t ptl;
#endif
#endif
		struct kmem_cache *slab_cache;	/* SL[AU]B: Pointer to slab */
		struct page *first_page;	/* Compound tail pages */
	};

#ifdef CONFIG_MEMCG
	struct mem_cgroup *mem_cgroup;
#endif

	/*
	 * On machines where all RAM is mapped into kernel address space,
	 * we can simply calculate the virtual address. On machines with
	 * highmem some memory is mapped into kernel virtual memory
	 * dynamically, so we need a place to store that address.
	 * Note that this field could be 16 bits on x86 ... ;)
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef CONFIG_KMEMCHECK
	/*
	 * kmemcheck wants to track the status of each byte in a page; this
	 * is a pointer to such a status block. NULL if not tracked.
	 */
	void *shadow;
#endif

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
	int _last_cpupid;
#endif
};

The main fields of struct page are described below:

  • flags: flags describing the state of the page, defined in <kernel/include/linux/page-flags.h> as follows:
enum pageflags {
	PG_locked,		/* Page is locked. Don't touch. */
	PG_error,
	PG_referenced,
	PG_uptodate,
	PG_dirty,
	PG_lru,
	PG_active,
	PG_slab,
	PG_owner_priv_1,	/* Owner use. If pagecache, fs may use*/
	PG_arch_1,
	PG_reserved,
	PG_private,		/* If pagecache, has fs-private data */
	PG_private_2,		/* If pagecache, has fs aux data */
	PG_writeback,		/* Page is under writeback */
#ifdef CONFIG_PAGEFLAGS_EXTENDED
	PG_head,		/* A head page */
	PG_tail,		/* A tail page */
#else
	PG_compound,		/* A compound page */
#endif
	PG_swapcache,		/* Swap page: swp_entry_t in private */
	PG_mappedtodisk,	/* Has blocks allocated on-disk */
	PG_reclaim,		/* To be reclaimed asap */
	PG_swapbacked,		/* Page is backed by RAM/swap */
	PG_unevictable,		/* Page is "unevictable"  */
#ifdef CONFIG_MMU
	PG_mlocked,		/* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
	PG_uncached,		/* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
	PG_hwpoison,		/* hardware poisoned page. Don't touch */
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	PG_compound_lock,
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
	PG_young,
	PG_idle,
#endif
	__NR_PAGEFLAGS,

	/* Filesystems */
	PG_checked = PG_owner_priv_1,

	/* Two page bits are conscripted by FS-Cache to maintain local caching
	 * state.  These bits are set on pages belonging to the netfs's inodes
	 * when those inodes are being locally cached.
	 */
	PG_fscache = PG_private_2,	/* page backed by cache */

	/* XEN */
	/* Pinned in Xen as a read-only pagetable page. */
	PG_pinned = PG_owner_priv_1,
	/* Pinned as part of domain save (see xen_mm_pin_all()). */
	PG_savepinned = PG_dirty,
	/* Has a grant mapping of another (foreign) domain's page. */
	PG_foreign = PG_owner_priv_1,

	/* SLOB */
	PG_slob_free = PG_private,
};

The table below lists some commonly used flags and their meanings:

Flag               Meaning
PG_locked          The page is locked, e.g. it is involved in a disk I/O operation
PG_error           An I/O error occurred while transferring the page
PG_referenced      The page has just been accessed
PG_uptodate        Set when a read operation completes, unless a disk I/O error occurred
PG_dirty           The page has been modified
PG_lru             The page is on the active or inactive list
PG_active          The page is on the active page list
PG_slab            The page frame is part of a slab
PG_arch_1          Architecture-specific page state bit; generic code guarantees it is cleared when the page first enters the page cache
PG_reserved        The page is reserved for the kernel or is unusable
PG_private         Set when the private member of the page is non-NULL; see the description of private below
PG_writeback       The page is being written back to disk
PG_compound        The page frame is handled through the compound-page (extended paging) mechanism
PG_swapcache       The page belongs to the swap cache
PG_mappedtodisk    All data in the page frame corresponds to blocks allocated on disk
PG_reclaim         The page has been marked to be written to disk so that it can be reclaimed
PG_swapbacked      The page is backed by RAM/swap
PG_unevictable     The page is locked, cannot be swapped out, and appears on the LRU_UNEVICTABLE list; this covers pages used by ramdisk or ramfs, SHM_LOCKed pages, and mlock()ed pages
PG_mlocked         The page is locked into a VMA, typically because a memory range was locked with the mlock() system call

The macros for testing, setting, and clearing these flags are listed below; they are all defined in <kernel/include/linux/page-flags.h>:

/*
 * Macros to create function definitions for page flags
 */
#define TESTPAGEFLAG(uname, lname)					\
static inline int Page##uname(const struct page *page)			\
			{ return test_bit(PG_##lname, &page->flags); }

#define SETPAGEFLAG(uname, lname)					\
static inline void SetPage##uname(struct page *page)			\
			{ set_bit(PG_##lname, &page->flags); }

#define CLEARPAGEFLAG(uname, lname)					\
static inline void ClearPage##uname(struct page *page)			\
			{ clear_bit(PG_##lname, &page->flags); }

#define __SETPAGEFLAG(uname, lname)					\
static inline void __SetPage##uname(struct page *page)			\
			{ __set_bit(PG_##lname, &page->flags); }

#define __CLEARPAGEFLAG(uname, lname)					\
static inline void __ClearPage##uname(struct page *page)		\
			{ __clear_bit(PG_##lname, &page->flags); }

#define TESTSETFLAG(uname, lname)					\
static inline int TestSetPage##uname(struct page *page)			\
		{ return test_and_set_bit(PG_##lname, &page->flags); }

#define TESTCLEARFLAG(uname, lname)					\
static inline int TestClearPage##uname(struct page *page)		\
		{ return test_and_clear_bit(PG_##lname, &page->flags); }

#define __TESTCLEARFLAG(uname, lname)					\
static inline int __TestClearPage##uname(struct page *page)		\
		{ return __test_and_clear_bit(PG_##lname, &page->flags); }

#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname)		\
	SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)

#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname)		\
	__SETPAGEFLAG(uname, lname)  __CLEARPAGEFLAG(uname, lname)

#define TESTSCFLAG(uname, lname)					\
	TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)

#define TESTPAGEFLAG_FALSE(uname)					\
static inline int Page##uname(const struct page *page) { return 0; }

#define SETPAGEFLAG_NOOP(uname)						\
static inline void SetPage##uname(struct page *page) {  }

#define CLEARPAGEFLAG_NOOP(uname)					\
static inline void ClearPage##uname(struct page *page) {  }

#define __CLEARPAGEFLAG_NOOP(uname)					\
static inline void __ClearPage##uname(struct page *page) {  }

#define TESTSETFLAG_FALSE(uname)					\
static inline int TestSetPage##uname(struct page *page) { return 0; }

#define TESTCLEARFLAG_FALSE(uname)					\
static inline int TestClearPage##uname(struct page *page) { return 0; }

#define __TESTCLEARFLAG_FALSE(uname)					\
static inline int __TestClearPage##uname(struct page *page) { return 0; }

#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname)			\
	SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)

#define TESTSCFLAG_FALSE(uname)						\
	TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
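
These templates are instantiated once per flag further down in the same header. For example, PAGEFLAG(Dirty, dirty) produces the PageDirty/SetPageDirty/ClearPageDirty trio sketched below, so callers never deal with bit numbers directly (example_writeback() is a hypothetical caller for illustration, not kernel code):

/* PAGEFLAG(Dirty, dirty) expands to the equivalent of: */
static inline int PageDirty(const struct page *page)
		{ return test_bit(PG_dirty, &page->flags); }
static inline void SetPageDirty(struct page *page)
		{ set_bit(PG_dirty, &page->flags); }
static inline void ClearPageDirty(struct page *page)
		{ clear_bit(PG_dirty, &page->flags); }

/* hypothetical caller: test the flag, act, then clear it */
static void example_writeback(struct page *page)
{
	if (PageDirty(page)) {
		/* ... write the page contents back to disk ... */
		ClearPageDirty(page);
	}
}
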
  • mapping:
  1. When page->mapping == NULL, the page belongs to the swap cache; when an address space is needed, the swap area's address space, swapper_space, is used.
  2. When page->mapping != NULL and bit[0] == 0, the page belongs to the page cache or a file mapping, and mapping points to the inode's address_space.
  3. When page->mapping != NULL and bit[0] != 0, the page is anonymously mapped: the PAGE_MAPPING_ANON bit (bit[0]) is set, and page->mapping points to a struct anon_vma object.
  4. When page->mapping != NULL and both bit[0] != 0 and bit[1] != 0, the page is an anonymous page in a VM_MERGEABLE area; with CONFIG_KSM enabled, the PAGE_MAPPING_KSM bit (bit[1]) is set together with PAGE_MAPPING_ANON, and page->mapping then points not to an anon_vma but to the private data structure that KSM associates with the merged page.

     The kernel defines some APIs to query the state encoded in the mapping pointer, as shown below:

/*
 * On an anonymous page mapped into a user virtual memory area,
 * page->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
 *
 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
 * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
 * and then page->mapping points, not to an anon_vma, but to a private
 * structure which KSM associates with that merged page.  See ksm.h.
 *
 * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
 *
 * Please note that, confusingly, "page_mapping" refers to the inode
 * address_space which maps the page from disk; whereas "page_mapped"
 * refers to user virtual address space into which the page is mapped.
 */
#define PAGE_MAPPING_ANON	1
#define PAGE_MAPPING_KSM	2
#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)

/* Test the anon_vma bit of page->mapping; if it is set, page->mapping points
 * to the page's anon_vma private data
 */
static inline int PageAnon(struct page *page)
{
	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

#ifdef CONFIG_KSM
/*
 * A KSM page is one of those write-protected "shared pages" or "merged pages"
 * which KSM maps into multiple mms, wherever identical anonymous page content
 * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
 * anon_vma, but to that page's node of the stable tree.
 */
/* Test both the anon_vma bit and the KSM bit of page->mapping; if both are
 * set, page->mapping points to the page's KSM private data
 */
static inline int PageKsm(struct page *page)
{
	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
#else
TESTPAGEFLAG_FALSE(Ksm)
#endif
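
Going the other way, the low flag bits must be masked off before the pointer can be dereferenced. Below is a simplified sketch of the logic behind the kernel's page_anon_vma() helper (the real one also handles compound pages):

struct anon_vma *page_anon_vma(struct page *page)
{
	unsigned long mapping = (unsigned long)page->mapping;

	/* must be anonymous, and not additionally a KSM page */
	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		return NULL;
	/* clear bit[0]/bit[1] to recover the real anon_vma pointer */
	return (struct anon_vma *)(mapping & ~PAGE_MAPPING_FLAGS);
}
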
  • s_mem: points to the first object in the slab.
  • index: this field has two uses, and its meaning depends on the state of the page. If the page is part of a file mapping, index is the page's offset within the file. If the page is part of the swap cache, it is the offset within the address_space of the swap area (swapper_space). In addition, when a block of pages is freed for a particular process, the order of the freed block is stored in index; this is set in __free_pages_ok().
  • freelist: the first free object of the slab the page belongs to.
  • _mapcount: page-mapping counter: the number of page-table entries that map the page, i.e. how many processes share it. Its initial value is -1; if the page is mapped by exactly one process's page tables, the value is 0. If the page sits in the buddy system, the value is PAGE_BUDDY_MAPCOUNT_VALUE (-128), and the kernel tests for that value to decide whether a page belongs to the buddy system (see the sketch after this list).
  • inuse:16: the number of objects already allocated in the current slab.
  • objects:15: the total number of objects in the current slab.
  • frozen:1: when set, the slab is frozen: only the owning (active) CPU may allocate from it, while other CPUs may only free objects back to it.
  • _count: reference count: the number of references to the page held by the kernel. Whoever operates on the page increments it first and decrements it when done. When it reaches 0, nothing references the page anymore, so the page can be unmapped, which is useful during memory reclaim (see the sketch after this list).
  • active: used by SLAB to manage the active objects in the slab page.
  • lru: for SLAB, lru links the page into the cache's slab lists, such as partial and free.
  • private: a pointer to "private" data that the virtual memory core ignores; depending on the page's use, it can point to different things. In most cases it associates the page with a data buffer. The concrete meaning of the private pointer is determined by the usage scenario:
    a. If PG_private is set, private points to a struct buffer_head.
    b. If PG_compound is set, it points to a struct page (the head page of the compound page).
    c. If PG_swapcache is set, private stores the page's location in the swap area as a swp_entry_t.
    d. If _mapcount == PAGE_BUDDY_MAPCOUNT_VALUE, the page sits in the buddy system and private stores the order of the buddy block.
  • slab_cache: points to the slab cache (kmem_cache) this page is associated with.
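
Here is the sketch referred to in the _mapcount and _count entries above: simplified versions of the accessors in <kernel/include/linux/mm.h> (debug assertions omitted) showing how the two counters are read and how the buddy-system marker is tested:

/* number of kernel references to the page */
static inline int page_count(struct page *page)
{
	return atomic_read(&compound_head(page)->_count);
}

/* number of page-table entries mapping the page (stored as count - 1) */
static inline int page_mapcount(struct page *page)
{
	return atomic_read(&page->_mapcount) + 1;
}

/* a page whose _mapcount equals PAGE_BUDDY_MAPCOUNT_VALUE (-128) is in
 * the buddy system */
static inline int PageBuddy(struct page *page)
{
	return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
}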

2. Page Initialization

    After the system initializes the zones, it immediately sets up and initializes the pages associated with each zone. As described in 【Linux Kernel Study Notes II】Memory Management - Zones (zone), free_area_init_core() fills each zone with its related information; in the course of doing so it calls memmap_init() to set up and initialize the pages and to set the PG_reserved bit of every page.
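
Unless the architecture overrides it, memmap_init() is only a thin wrapper around memmap_init_zone(), defined in <kernel/mm/page_alloc.c>:

#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif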

        memmap_init() thus sets up and initializes the pages of a given zone through memmap_init_zone(); the code is as follows:

/*
 * Initially all pages are reserved - free ones are freed
 * up by free_all_bootmem() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 */
/**
 * Set PG_reserved on the zone's page frames, marking the pages unusable for
 * now, and initialize the other fields of each page frame.
 */
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn, enum memmap_context context)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long end_pfn = start_pfn + size;
	unsigned long pfn;
	struct zone *z;
	unsigned long nr_initialised = 0;

	if (highest_memmap_pfn < end_pfn - 1)
		highest_memmap_pfn = end_pfn - 1;

	z = &pgdat->node_zones[zone];
	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		/*
		 * There can be holes in boot-time mem_map[]s
		 * handed to this function.  They do not
		 * exist on hotplugged memory.
		 */
		if (context == MEMMAP_EARLY) {
			if (!early_pfn_valid(pfn))
				continue;
			if (!early_pfn_in_nid(pfn, nid))
				continue;
			if (!update_defer_init(pgdat, pfn, end_pfn,
						&nr_initialised))
				break;
		}

		/*
		 * Mark the block movable so that blocks are reserved for
		 * movable at startup. This will force kernel allocations
		 * to reserve their blocks rather than leaking throughout
		 * the address space during boot when many long-lived
		 * kernel allocations are made. Later some blocks near
		 * the start are marked MIGRATE_RESERVE by
		 * setup_zone_migrate_reserve()
		 *
		 * bitmap is created for zone's valid pfn range. but memmap
		 * can be created for invalid pages (for alignment)
		 * check here not to call set_pageblock_migratetype() against
		 * pfn out of zone.
		 */
		if (!(pfn & (pageblock_nr_pages - 1))) {
			/* get the struct page for this pfn */
			struct page *page = pfn_to_page(pfn);
			/* initialize its fields */
			__init_single_page(page, pfn, zone, nid);
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
		} else {
			__init_single_pfn(pfn, zone, nid);
		}
	}
}

         In memmap_init_zone(), all the pages in the zone's address range are set up and initialized in a loop: pfn_to_page() yields the struct page for a pfn, and __init_single_page() initializes it. The code of __init_single_page() is shown below:

static void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	/* link the page to its zone and node */
	set_page_links(page, zone, nid, pfn);
	init_page_count(page);
	page_mapcount_reset(page);
	page_cpupid_reset_last(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
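
The two counter initializers called here are one-liners; simplified sketches matching the field descriptions in section 1:

static inline void init_page_count(struct page *page)
{
	atomic_set(&page->_count, 1);		/* one initial reference */
}

static inline void page_mapcount_reset(struct page *page)
{
	atomic_set(&page->_mapcount, -1);	/* not mapped by any page table yet */
}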

3. Mapping Between Pages, Zones, and Nodes

       In the previous section the kernel already mapped each page to its zone and node through set_page_links(). In this mapping, the kernel stores the page's zone and node in bit fields of struct page's flags member. page_zone() then returns the zone a given page belongs to.

       Below are some of the APIs for the mapping between pages, zones, and nodes:

static inline struct zone *page_zone(const struct page *page)
{
	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}

static inline void set_page_zone(struct page *page, enum zone_type zone)
{
	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}

static inline void set_page_node(struct page *page, unsigned long node)
{
	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}
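
For completeness, set_page_links() seen in __init_single_page() earlier simply combines the two setters above, and matching getters read the fields back out of flags. The sketches below follow <kernel/include/linux/mm.h>, with the sparse-memory section handling elided:

static inline void set_page_links(struct page *page, enum zone_type zone,
	unsigned long node, unsigned long pfn)
{
	set_page_zone(page, zone);
	set_page_node(page, node);
}

/* read the zone index back out of page->flags */
static inline enum zone_type page_zonenum(const struct page *page)
{
	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}

/* read the node id back out of page->flags */
static inline int page_to_nid(const struct page *page)
{
	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
}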

       NODE_DATA accesses the global node table. On a UMA system there is only a single node, contig_page_data, so NODE_DATA points directly to the global contig_page_data regardless of the node id nid. NODE_DATA is defined in <kernel/include/linux/mmzone.h> as follows:

#ifndef CONFIG_NEED_MULTIPLE_NODES

extern struct pglist_data contig_page_data;
#define NODE_DATA(nid)		(&contig_page_data)

        On a NUMA system, all nodes are stored in the node_data array, and NODE_DATA simply indexes this array by node id. Taking x86-32 as an example, NODE_DATA is defined in <kernel/arch/x86/include/asm/mmzone_32.h> as follows:

#ifdef CONFIG_NUMA
extern struct pglist_data *node_data[];
#define NODE_DATA(nid)	(node_data[nid])
#endif /* CONFIG_NUMA */

Reposted from blog.csdn.net/wyy4045/article/details/82114115