Let's not talk about old-fashioned memory management methods here. The kernel used in this article is the Linux 2.6.x.x series.
Memory management? Memory management! Whether in the subsystems of older operating systems or in modern ones, it is extremely complex and huge. Everything stays essentially the same, yet staring at the source code can still make you lose your bearings. This article builds on the previous article about the virtual address space. Of course it is not realistic to list the source code in full, so just compare it with your own prior understanding; if you have the time and energy to work through the source code completely, all the better!
The article on the virtual address space explains things from an architectural point of view. For more detail, see: A long article will take you through the entire process of Linux dynamic linking. Dynamic linking is the ultimate application of Glibc, virtual addresses (vir addr) and dynamic linking itself!
Compared with early versions such as Linux 0.11 and 0.99, 2.6.x.x is a huge change, and much of the later kernel code follows this version. Linux 3.0 also has its own magic, the device tree, but in terms of how transformative it is I think it is not as good as 2.6, so I chose 2.6 for this analysis.
P.S.: After watching 369's Sword Demon today, all I can say is that every generation produces its own talents!
Bibliography:
- "In-depth Analysis of Linux Architecture"
- "In-depth Understanding of the Linux Kernel"
- "Analysis of Linux Source Code"
- "Linux Kernel Design and Implementation"
Reference articles:
- Why must the Linux kernel be mapped to all physical memory?
- What is a virtual address space? Explained from an architectural perspective
- Linux memory management slab 1: slab principle (+ buddy system)
- slab coloring
Memory organization in NUMA model
For explanations of NUMA and UMA, see: What is a virtual address space? Explained from an architectural perspective
Introduction: memory is organized in three levels: the node (node), the memory domain (zone), and the page (page frame). Each memory bank can loosely be regarded as one node.
enum zone_type {
#ifdef CONFIG_ZONE_DMA
ZONE_DMA, // 0-16MB
#endif
#ifdef CONFIG_ZONE_DMA32
ZONE_DMA32, // addressable with 32-bit address words; only used on 64-bit systems, empty on 32-bit
#endif
ZONE_NORMAL, // 16MB-896MB
#ifdef CONFIG_HIGHMEM
ZONE_HIGHMEM, // above 896MB (high memory)
#endif
ZONE_MOVABLE,
MAX_NR_ZONES // end marker
};
Each memory domain (zone) is associated with an array that organizes the physical memory pages belonging to that zone.
Each memory node (node) is kept on a singly linked list so that the kernel can traverse them.
For performance reasons, when allocating memory for a process the kernel always tries to do so on the NUMA node associated with the CPU the process is currently running on. Sometimes, however, that node's memory is exhausted; for such cases each node provides a fallback list (struct zonelist) containing the other nodes (and their zones) that can be used instead of the current node. The further back an entry is in the list, the less suitable it is for the allocation.
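As a rough sketch of how the allocator walks such a fallback list (loosely following get_page_from_freelist() in mm/page_alloc.c, with many details omitted): the zonelist is a NULL-terminated array of zone pointers ordered from most to least preferred, and the first zone whose watermark check passes is used.
/* Simplified sketch of zonelist traversal during an allocation. */
struct zone **z;
struct page *page = NULL;

for (z = zonelist->zones; *z != NULL; z++) {
        if (!zone_watermark_ok(*z, order, (*z)->pages_low, 0, 0))
                continue;                      /* not enough free pages in this zone */
        page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
        if (page)
                break;                         /* success: allocated from this zone */
}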
Node: node
The node data structure
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES]; // data structures of the node's memory domains (zones)
struct zonelist node_zonelists[MAX_ZONELISTS]; // fallback list of nodes and their zones
int nr_zones; // number of different zones in this node
#ifdef CONFIG_FLAT_NODE_MEM_MAP
struct page *node_mem_map; // describes all physical pages of the node, across all of its zones
#endif
struct bootmem_data *bdata; // the boot-time bootmem allocator
#ifdef CONFIG_MEMORY_HOTPLUG
spinlock_t node_size_lock;
#endif
unsigned long node_start_pfn; // logical number of the first page frame of this NUMA node, usually 0
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total span of the physical page range, holes included */
int node_id; // global node ID, usually 0
wait_queue_head_t kswapd_wait; // wait queue of the swap daemon, used when page frames are swapped out of the node
struct task_struct *kswapd; // pointer to the kswapd daemon responsible for this node
int kswapd_max_order; // used by the page-swapping subsystem; defines the size of the area to be freed
} pg_data_t;
The kernel maintains a bitmap, the global node_states array, that records the state of each node. The related helpers static inline void node_set_state(int node, enum node_states state) and static inline void node_clear_state(int node, enum node_states state) set and clear node states; they can be found in nodemask.h.
Memory domain: zone
The zone data structure
struct zone {
unsigned long pages_min, pages_low, pages_high; // watermarks used when pages are swapped out; if memory runs short, pages can be written to disk. These three members influence the behaviour of the swap daemon
unsigned long lowmem_reserve[MAX_NR_ZONES]; // a number of pages reserved per zone for critical allocations that must not fail under any circumstances
#ifdef CONFIG_NUMA
int node;
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
struct per_cpu_pageset *pageset[NR_CPUS]; // implements the per-CPU hot/cold page frame lists
#else
struct per_cpu_pageset pageset[NR_CPUS];
#endif
spinlock_t lock;
#ifdef CONFIG_MEMORY_HOTPLUG
seqlock_t span_seqlock;
#endif
struct free_area free_area[MAX_ORDER]; // the buddy system
#ifndef CONFIG_SPARSEMEM
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
ZONE_PADDING(_pad1_) // pages in the zone are sorted by activity; an active page is one that is accessed frequently
spinlock_t lru_lock;
struct list_head active_list; // collection of active pages
struct list_head inactive_list; // collection of inactive pages
unsigned long nr_scan_active; // number of active pages to scan during reclaim
unsigned long nr_scan_inactive; // number of inactive pages to scan during reclaim
unsigned long pages_scanned; // how many pages were scanned unsuccessfully since the last page was swapped out
unsigned long flags; /* current state of the zone */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; // statistics for the zone
int prev_priority; // priority with which the zone was scanned in the last scan operation
ZONE_PADDING(_pad2_)
wait_queue_head_t * wait_table; // wait queues for processes waiting for a page to become available
unsigned long wait_table_hash_nr_entries;
unsigned long wait_table_bits;
struct pglist_data *zone_pgdat;
unsigned long zone_start_pfn; // index of the zone's first page frame
unsigned long spanned_pages; /* total number of pages spanned by the zone, holes included */
unsigned long present_pages; /* number of pages actually present (excluding holes) */
const char *name; // name of the zone
} ____cacheline_internodealigned_in_smp;
Memory watermarks
The concept of the memory watermarks:
- If more pages are free than pages_high, the state of the zone is ideal.
- If the number of free pages falls below pages_low, the kernel starts swapping pages out to disk (kswapd is woken up).
- If the number of free pages falls below pages_min, memory pressure is acute: page reclaim becomes urgent, since the minimum reserve of free pages in the zone is no longer met.
# Check the zone watermark size
root@huawei linux-version # cat /proc/sys/vm/min_free_kbytes [0]
67584
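As a hedged sketch of how these three values steer the allocator's behaviour (not a verbatim kernel function; the real checks live in zone_watermark_ok() and the kswapd wakeup paths):
/* Hypothetical sketch: reaction of the kernel to the three watermarks. */
unsigned long free = zone_page_state(zone, NR_FREE_PAGES);

if (free < zone->pages_min) {
        /* acute shortage: the allocating task has to reclaim memory synchronously */
} else if (free < zone->pages_low) {
        wakeup_kswapd(zone, order);    /* start writing pages out in the background */
} else if (free > zone->pages_high) {
        /* plenty of free pages: the zone is in its ideal state, kswapd may sleep */
}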
The watermark values in the data structure are filled in by init_per_zone_pages_min(), a function invoked by the kernel during boot; it does not need to be called explicitly.
/*
* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
* min_free_kbytes = sqrt(lowmem_kbytes * 16)
*
* which yields
*
* 16MB: 512k
* 32MB: 724k
* 64MB: 1024k
* 128MB: 1448k
* 256MB: 2048k
* 512MB: 2896k
* 1024MB: 4096k
* 2048MB: 5792k
* 4096MB: 8192k
* 8192MB: 11584k
* 16384MB: 16384k
*/
static int __init init_per_zone_pages_min(void)
{
// ...
setup_per_zone_pages_min(); // set pages_min, pages_low, pages_high
setup_per_zone_lowmem_reserve(); // set lowmem_reserve
return 0;
}
module_init(init_per_zone_pages_min)
Hot and cold pages
The pageset member of struct zone is used to implement the hot-n-cold allocator. A page is called hot when it is still in a CPU cache, so its data can be accessed much faster than if it had to be fetched from memory; a cold page is not in the cache. On multiprocessor systems each CPU has one or more caches of its own, so the hot/cold lists must be managed per CPU.
struct zone {
// ...
struct per_cpu_pageset pageset[NR_CPUS];
// ...
};
struct per_cpu_pageset {
struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
// ...
} ____cacheline_aligned_in_smp;
struct per_cpu_pages {
int count; /* number of pages on the list */
int high; /* upper watermark for the number of pages */
int batch; /* chunk size when adding/removing several pages at once */
struct list_head list; /* linked list of the pages */
};
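A condensed sketch of how an order-0 allocation is served from these lists (simplified from buffered_rmqueue(); cold allocations use pcp[1], hot allocations pcp[0], and the migrate-type argument is omitted here for brevity):
/* Condensed sketch: hand out a single page frame from the per-CPU list. */
struct per_cpu_pages *pcp = &zone_pcp(zone, cpu)->pcp[cold];   /* 0: hot, 1: cold */
struct page *page;

if (!pcp->count)        /* list empty: refill 'batch' pages from the buddy system */
        pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list);
page = list_entry(pcp->list.next, struct page, lru);           /* take the first page */
list_del(&page->lru);
pcp->count--;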
Initialize memory management
It mainly includes the following parts:
- On many CPUs, the memory model appropriate for the Linux kernel must be set up explicitly (typically by switching to protected mode and probing registers; this is done by architecture-specific assembly code and is not covered here)
- The data structures for memory management must be created, along with many other things
- During system startup an additional, simplified form of memory management (the bootmem allocator) is used
Create data structure
Initialization of the relevant data structures starts in start_kernel(), the routine that runs once the kernel has been loaded and that activates the individual subsystems. Since memory management is a central part of the kernel, it is initialized right after the architecture-specific setup step, which detects memory and determines how it is distributed in the system.
At that point a pg_data_t instance has already been created for each node in the system, holding information such as how much memory the node has and how it is distributed over the zones.
start_kernel()
--> setup_arch() // architecture-specific setup; one of its tasks is to initialize the boot (bootmem) allocator
--> setup_per_cpu_areas() // initialize per-CPU areas: create a copy of the per-CPU data for every CPU in the system
--> build_all_zonelists() // set up the node and zone data structures
--> mem_init() // disable the bootmem allocator and migrate to the real memory management: the buddy system
--> kmem_cache_init() // initialize the kernel's allocator for small memory areas: slab
--> setup_per_cpu_pageset() // also sets the limits of the hot-n-cold allocator and allocates the hot/cold page lists of the first processor
build_all_zonelists
void build_all_zonelists(void)
{
set_zonelist_order();
if (system_state == SYSTEM_BOOTING) {
__build_all_zonelists(NULL);
cpuset_init_current_mems_allowed();
} else {
stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
}
vm_total_pages = nr_free_pagecache_pages();
if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
page_group_by_mobility_disabled = 1;
else
page_group_by_mobility_disabled = 0;
printk("Built %i zonelists in %s order, mobility grouping %s. "
"Total pages: %ld\n",
num_online_nodes(),
zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
printk("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}
static int __build_all_zonelists(void *dummy)
{
int nid;
for_each_online_node(nid) {
// iterate over all online (active) nodes
pg_data_t *pgdat = NODE_DATA(nid); // get the node and its information
build_zonelists(pgdat); // pgdat contains all existing information about the node's memory; build the allocation fallback hierarchy from it
build_zonelist_cache(pgdat);
}
return 0;
}
Architecture-specific settings
Kernel layout in memory
The first 4 KiB is the first page frame; it is generally skipped because it is usually reserved for the BIOS.
The next area, up to 640 KiB, would be usable in principle, but it is not used for loading the kernel either. The area immediately following it is reserved by the system and is used to map various ROMs; data cannot be written to the area where ROMs are mapped.
The kernel is always loaded into a contiguous memory area. If the kernel image were to be loaded starting at 4 KiB, it would therefore have to be smaller than 640 KiB.
To avoid these problems, on IA-32 the kernel uses 0x100000 (1 MiB) as its start address.
The memory occupied by the kernel is divided into several segments:
- _text and _etext are the start and end addresses of the text segment, which contains the compiled kernel code
- The data segment lies between _etext and _edata and holds most of the kernel's variables
- Initialization data that is no longer needed after the kernel has booted is kept in the last segment, from _edata to _end
Each time the kernel is compiled, a System.map file is generated and stored in the source tree.
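The segment boundaries mentioned above are provided by the linker as symbols that C code can reference (declared in asm/sections.h). A hedged sketch; print_kernel_layout is a hypothetical helper, not an actual kernel function:
/* Section boundary symbols provided by the linker script. */
extern char _text[], _etext[], _edata[], _end[];

/* Hypothetical helper: report the size of each kernel segment. */
static void __init print_kernel_layout(void)
{
        printk("text: %p - %p (%lu KiB)\n", _text, _etext,
               (unsigned long)(_etext - _text) >> 10);
        printk("data: %p - %p (%lu KiB)\n", _etext, _edata,
               (unsigned long)(_edata - _etext) >> 10);
        printk("init/bss: %p - %p\n", _edata, _end);
}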
machine_specific_memory_setup: creates a list of the memory regions occupied by the system and of the free memory regions. The memory map provided by the BIOS supplies the various memory regions in this case.
If the BIOS does not provide this information (which may happen on older machines), the kernel itself generates a table marking the memory between 0 and 640 KiB and the memory above 1 MiB as usable.
parse_cmdline_early: parses the command line. The interesting parameters here are mem=XXX[KkmM], highmem=XXX[kKmM] and memmap=XXX[KkmM]" "@XXX[KkmM]. If the value computed by the kernel or supplied by the BIOS is wrong, the administrator can use them to override the amount of available memory or to delimit memory regions manually.
setup_memory
: Determines the number of physical memory pages available (per node). Initialize the bootmem allocator. Allocate various memory areas.
paging_init
: Initialize kernel page tables and enable memory paging. Responsible for establishing page tables that can only be used by the kernel and cannot be accessed by user space.
zone_sizes_init: initializes the pgdat_t instances of all nodes in the system, using the add_active_range() and free_area_init_nodes() functions.
For the initialization of the paging mechanism, you can see: High-end memory HighMem in the Linux kernel space
A question arises here: why must the kernel be mapped to all of physical memory?
In the current kernel design, the kernel decides which physical memory is used by which process, so the kernel must have control over all physical memory. (I do not find this entirely convincing, but it does seem to make sense; something to study slowly.)
paging_init() runs through the following steps:
pagetable_init: initializes the system's page tables, using swapper_pg_dir as the basis.
kernel_physical_mapping_init: maps the physical memory pages (at most the first 896 MiB) into the virtual address space starting at PAGE_OFFSET. Next, the memory areas for the fixed-map entries and the persistent kernel mappings are set up; again this means filling the page tables with the appropriate values.
After pagetable_init has finished the page table setup, the cr3 register is set to point to the global page directory (swapper_pg_dir). At this point the new page tables become active.
__flush_tlb_all: because the TLB cache entries still contain address data set up during boot, they must be flushed at this point as well.
kmap_init: initializes the global variable kmap_pte. When pages from the high memory zone are mapped into the kernel address space, this variable holds the page table entry for the corresponding area. In addition, the address of the first fixed-map memory area used for high-memory kernel mappings is stored in the global variable kmem_vstart.
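The direct mapping established here is also what makes the cheap conversions between kernel virtual and physical addresses possible. A simplified sketch for IA-32 without PAE (the real definitions live in asm/page.h and are valid only for lowmem below 896 MiB):
/* Simplified sketch of the IA-32 direct-mapping conversion macros. */
#define PAGE_OFFSET  0xC0000000UL                                  /* start of the kernel mapping */
#define __pa(x)      ((unsigned long)(x) - PAGE_OFFSET)            /* virtual -> physical */
#define __va(x)      ((void *)((unsigned long)(x) + PAGE_OFFSET))  /* physical -> virtual */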
Initialization of hot and cold caches
Register memory active area
Memory management during startup process
During the boot process, although memory management has not yet been initialized, the kernel still needs memory to create various data structures. The bootmem allocator is used to allocate memory in this early boot phase.
The allocator manages its pages with a bitmap that has as many bits as there are physical pages in the system: a bit set to 1 marks a used page, a bit set to 0 a free page.
When memory needs to be allocated, the allocator scans the bitmap bit by bit until it finds a position offering enough contiguous free pages, the so-called first-best or first-fit position.
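A minimal sketch of the first-fit idea (not the kernel's actual implementation, which additionally honours alignment, a preferred goal address and merging with the previous allocation):
/* Hypothetical sketch: find the first run of 'nr' contiguous free pages
 * in a bootmem-style bitmap where a set bit means "page is in use". */
static long first_fit(unsigned long *bitmap, long total_pages, long nr)
{
        long i, run = 0;

        for (i = 0; i < total_pages; i++) {
                if (test_bit(i, bitmap))
                        run = 0;               /* page in use: restart the run */
                else if (++run == nr)
                        return i - nr + 1;     /* found nr contiguous free pages */
        }
        return -1;                             /* no suitable area found */
}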
typedef struct bootmem_data {
unsigned long node_boot_start; // number of the first page in the system, 0 on most architectures
unsigned long node_low_pfn; // number of the last page of the directly managed physical address space, i.e. the end of ZONE_NORMAL
void *node_bootmem_map; // pointer to the memory area holding the allocation bitmap
unsigned long last_offset; // offset of the end of the last allocation within its page (allows small allocations to be merged)
unsigned long last_pos; // number of the page last allocated
unsigned long last_success; // position in the bitmap of the last successful allocation; new allocations start searching here
struct list_head list;
} bootmem_data_t;
On NUMA machines, one bootmem allocator is registered per node; if the physical address space is riddled with holes, a bootmem allocator can also be registered for each contiguous memory region.
A new boot allocator is registered with init_bootmem_core. All registered allocators are kept on a linked list whose head is the global variable bdata_list.
static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
unsigned long mapstart, unsigned long start, unsigned long end)
{
bootmem_data_t *bdata = pgdat->bdata;
unsigned long mapsize;
bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
bdata->node_boot_start = PFN_PHYS(start);
bdata->node_low_pfn = end;
link_bootmem(bdata);
mapsize = get_mapsize(bdata);
memset(bdata->node_bootmem_map, 0xff, mapsize);
return mapsize;
}
On UMA systems only a single bootmem_data_t instance is needed, namely contig_bootmem_data. It is associated with contig_page_data through the bdata member:
#ifndef CONFIG_NEED_MULTIPLE_NODES
static bootmem_data_t contig_bootmem_data;
struct pglist_data contig_page_data = {
.bdata = &contig_bootmem_data };
EXPORT_SYMBOL(contig_page_data);
#endif
Taking IA-32 as an example:
setup_memory analyzes the detected memory regions to find the highest page frame number in the low-memory region. The global variable max_low_pfn holds the number of the highest mappable page; the kernel reports the amount of memory found in the boot log.
Because handling high memory would be too cumbersome for the bootmem allocator, high memory is not managed by it.
void __init setup_bootmem_allocator(unsigned long free_pfn)
{
unsigned long bootmap_size;
bootmap_size = init_bootmem_node(NODE_DATA(0), free_pfn,
min_low_pfn, max_low_pfn);
// ...
register_bootmem_low_pages();
node_set_online(0);
reserve_bootmem(__MEMORY_START+PAGE_SIZE,
(PFN_PHYS(free_pfn)+bootmap_size+PAGE_SIZE-1)-__MEMORY_START);
reserve_bootmem(__MEMORY_START, PAGE_SIZE);
// ...
}
Interface to the kernel
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
extern void reserve_bootmem(unsigned long addr, unsigned long size);
#define alloc_bootmem(x) \
__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low(x) \
__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
#define alloc_bootmem_pages(x) \
__alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low_pages(x) \
__alloc_bootmem_low(x, PAGE_SIZE, 0)
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
extern unsigned long free_all_bootmem(void);
extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
extern void *__alloc_bootmem_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal);
extern unsigned long init_bootmem_node(pg_data_t *pgdat,
unsigned long freepfn,
unsigned long startpfn,
unsigned long endpfn);
extern void reserve_bootmem_node(pg_data_t *pgdat,
unsigned long physaddr,
unsigned long size);
extern void free_bootmem_node(pg_data_t *pgdat,
unsigned long addr,
unsigned long size);
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
#define alloc_bootmem_node(pgdat, x) \
__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_pages_node(pgdat, x) \
__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low_pages_node(pgdat, x) \
__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
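A hedged usage sketch of this interface (the concrete call sites differ per architecture; bitmap_bytes is a placeholder): early boot code typically grabs memory for tables or bitmaps, reserves regions it must not touch, and finally hands everything over to the buddy system:
/* Hypothetical early-boot usage of the bootmem interface. */
unsigned long *bitmap;

bitmap = alloc_bootmem(bitmap_bytes);    /* small, cache-line aligned allocation */
reserve_bootmem(0xA0000, 0x60000);       /* keep hands off a ROM/adapter region */
/* ... */
totalram_pages += free_all_bootmem();    /* later: return all remaining pages to the buddy system */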
Physical memory management
Structure of the Buddy System
Each physical memory page (page frame) in the system corresponds to a struct page instance. Each memory domain is associated with a struct zone instance, which holds the central array used to manage the buddy data:
struct zone {
...
/*
* free areas of different lengths
*/
struct free_area free_area[MAX_ORDER];
...
};
struct free_area {
struct list_head free_list[MIGRATE_TYPES]; // lists used to link the free page blocks
unsigned long nr_free; // number of free page blocks in this area
};
The order is a very important term in the buddy system: it describes the unit in which memory is allocated. The size of a memory block is 2^order pages, where order runs from 0 to MAX_ORDER.
/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
free_area[]
The index of each element in the array is also interpreted as its order; it specifies how many page frames the contiguous memory areas on the corresponding list contain. The areas on list 0 consist of a single page (2^0 = 1), list 1 manages areas of two pages (2^1 = 2), list 2 areas of four pages, and so on.
The two buddies of a pair need not be linked to each other by the allocator. If a memory area is split into two halves during allocation, the kernel automatically puts the unused half on the corresponding free list. If, at some point in the future, both areas are free again because their memory has been released, the kernel can tell from their addresses whether they are buddies and merge them. The small amount of administration this requires is a major advantage of the buddy system.
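Whether two blocks are buddies can be decided purely arithmetically from the page frame number. A sketch of the classic check (the kernel's __page_is_buddy()/__find_combined_index() logic is slightly more involved):
/* Sketch: the buddy of the block starting at page frame 'pfn' with the
 * given order differs from it only in bit 'order'. */
static unsigned long buddy_pfn(unsigned long pfn, unsigned int order)
{
        return pfn ^ (1UL << order);
}

/* After merging, the combined block of order+1 starts at the lower pfn. */
static unsigned long combined_pfn(unsigned long pfn, unsigned int order)
{
        return pfn & ~(1UL << order);
}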
Memory management based on the buddy system focuses on a single zone of a single node, for example the DMA or the high-memory zone. However, the buddy systems of all zones and nodes are connected via the fallback allocation lists: when the preferred zone or node cannot satisfy an allocation request, first another zone of the same node is tried, and only then another node, until the request can be satisfied.
root@huawei linux-version # cat /proc/buddyinfo [0]
Node 0, zone DMA 1 0 0 1 2 1 2 1 2 2 2
Node 0, zone DMA32 4145 7315 3523 965 567 140 31 10 4 0 0
Node 0, zone Normal 451 540 303 66 12 50 2 0 0 0 0
Avoiding fragmentation
After the system has been running for a long time, a lot of memory fragmentation builds up.
Many modern CPUs offer the possibility of huge pages, which are much larger than ordinary pages and can be beneficial for memory-intensive programs: with larger pages the translation lookaside buffer (TLB) has to handle fewer entries, which lowers the probability of TLB misses.
The general approach taken is anti-fragmentation, i.e. trying to prevent fragmentation from arising in the first place.
Generally, the kernel divides allocated memory pages into three types:
- Unmovable pages: have a fixed location in memory and cannot be moved elsewhere. Most memory allocated by the core kernel falls into this category
- Reclaimable pages: cannot be moved directly, but can be deleted and their contents regenerated from some source
- Movable pages: can be moved anywhere. Pages belonging to user-space applications fall into this category; they are mapped through page tables, so if they are copied to a new location the page table entries can be updated accordingly without the application noticing anything
Movable pages can simply be migrated. Separate lists are kept for unmovable and reclaimable pages; when a contiguous area is needed, space on the reclaimable list can be freed temporarily and the contiguous memory handed out afterwards.
Initially, memory is not divided into areas of different mobility; these areas form at runtime.
#define MIGRATE_UNMOVABLE 0 // unmovable
#define MIGRATE_RECLAIMABLE 1 // reclaimable
#define MIGRATE_MOVABLE 2 // movable
#define MIGRATE_RESERVE 3 // if an allocation from the lists with a specific mobility fails, memory can be allocated from MIGRATE_RESERVE
#define MIGRATE_ISOLATE 4 /* can't allocate from here */
#define MIGRATE_TYPES 5 // number of migrate types; does not denote a real area
Adjustments to the buddy system
struct free_area {
// split into separate lists, one per MIGRATE_TYPES entry
struct list_head free_list[MIGRATE_TYPES]; // lists used to link the free page blocks
unsigned long nr_free; // number of free page blocks in this area
};
If the kernel cannot satisfy an allocation request for a given migration type, a fallback list specifies which migration types should be tried next:
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
[MIGRATE_UNMOVABLE] = {
MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
[MIGRATE_RECLAIMABLE] = {
MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
[MIGRATE_MOVABLE] = {
MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
[MIGRATE_RESERVE] = {
MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
};
Although the page mobility grouping feature is always compiled into the kernel, it only makes sense if there is enough memory in the system to allocate linked lists corresponding to multiple migration types. Since each migration list should have an appropriate amount of memory, the kernel needs to define the concept of "appropriate".
This notion is provided via the two global variables pageblock_order and pageblock_nr_pages. The first denotes an allocation order that the kernel regards as "large", and pageblock_nr_pages is the number of pages corresponding to that order.
// huge pages supported
#define pageblock_order HUGETLB_PAGE_ORDER
// huge pages not supported
#define pageblock_order (MAX_ORDER-1)
If the lists of the individual migrate types cannot each hold reasonably large contiguous memory areas, page migration provides no benefit, so the kernel turns the feature off when too little memory is available. This is checked in build_all_zonelists, the function that initializes the zone lists: if not enough memory is available, the global variable page_group_by_mobility_disabled is set to 1, otherwise to 0.
Details of an individual memory allocation are specified via the allocation mask. The kernel provides two flags to declare the allocated memory as movable (__GFP_MOVABLE) or reclaimable (__GFP_RECLAIMABLE); if neither flag is set, the allocated memory is assumed to be unmovable.
// convert allocation flags into a migrate type
static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
if (unlikely(page_group_by_mobility_disabled))
return MIGRATE_UNMOVABLE;
/* Group based on mobility */
return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
((gfp_flags & __GFP_RECLAIMABLE) != 0);
}
During initialization, the kernel automatically ensures that, for the pageblock-sized groupings of each zone, enough space is allocated in pageblock_flags to store NR_PAGEBLOCK_BITS bits per grouping. Currently, representing the migrate type of a contiguous memory region requires 3 bits:
// helper macro for defining bit ranges
#define PB_range(name, required_bits) \
name, name ## _end = (name + required_bits) - 1
enum pageblock_bits {
PB_range(PB_migrate, 3), /* 3 bits required for migrate types */
NR_PAGEBLOCK_BITS
};
set_pageblock_migratetype
is responsible for setting the migrate type of a memory region whose first page is page:
static void set_pageblock_migratetype(struct page *page, int migratetype)
{
set_pageblock_flags_group(page, (unsigned long)migratetype,
PB_migrate, PB_migrate_end);
}
void set_pageblock_flags_group(struct page *page, unsigned long flags,
int start_bitidx, int end_bitidx)
{
struct zone *zone;
unsigned long *bitmap;
unsigned long pfn, bitidx;
unsigned long value = 1;
zone = page_zone(page);
pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
if (flags & value)
__set_bit(bitidx + start_bitidx, bitmap);
else
__clear_bit(bitidx + start_bitidx, bitmap);
}
Viewing the related lists:
root@huawei linux-version # cat /proc/pagetypeinfo [130]
Page block order: 9
Pages per block: 512
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 1 0 0 1 2 1 1 0 1 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 1 2
Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 1 1 1 1 0
Node 0, zone DMA, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type CMA 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA32, type Unmovable 580 543 454 128 79 6 1 1 0 0 0
Node 0, zone DMA32, type Movable 2740 2510 1927 740 505 121 20 9 5 0 0
Node 0, zone DMA32, type Reclaimable 685 1556 1241 346 180 46 8 2 1 0 0
Node 0, zone DMA32, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA32, type CMA 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Unmovable 469 256 171 13 2 0 0 0 0 0 0
Node 0, zone Normal, type Movable 49 79 8 2 3 33 4 0 0 0 0
Node 0, zone Normal, type Reclaimable 3 79 102 42 12 7 4 1 0 0 0
Node 0, zone Normal, type HighAtomic 41 37 21 7 2 0 0 0 0 0 0
Node 0, zone Normal, type CMA 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Isolate 0 0 0 0 0 0 0 0 0 0 0
Number of blocks type Unmovable Movable Reclaimable HighAtomic CMA Isolate
Node 0, zone DMA 1 5 2 0 0 0
Node 0, zone DMA32 51 1296 181 0 0 0
Node 0, zone Normal 52 438 21 1 0 0
During the initialization of the memory subsystem, memmap_init_zone handles the page instances of a zone and initially marks all pages as movable:
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
* handed to this function. They do not
* exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
}
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
reset_page_mapcount(page);
SetPageReserved(page);
if ((pfn & (pageblock_nr_pages-1)))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
}
Because relatively few movable memory areas are allocated during startup, the allocator is very likely to pick the largest available areas for boot-time allocations and convert them from the movable to the unmovable list. Since the areas allocated this way have maximum length, no fragmentation is introduced into the movable memory.
All in all, this approach prevents memory allocated by the kernel during startup (which is often never freed for the entire runtime of the system) from being scattered across physical memory, and thus keeps other types of memory allocation free of fragmentation.
Another way of reducing memory fragmentation is the virtual movable memory domain ZONE_MOVABLE; see: What is a virtual address space? Explained from an architectural perspective
Initialize memory domain and node data structure
The architecture-specific code provides the following information during startup:
- The page frame boundaries of each memory domain in the system, stored in the max_zone_pfn array
- The distribution of page frames across the nodes, stored in the global variable early_node_map
free_area_init_nodes first has to compute, from the zone boundaries given in zone_max_pfn and zone_min_pfn, the lowest and highest page frame numbers usable by each zone. Two global arrays are used to store this information:
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
unsigned long nid;
enum zone_type i;
/* Sort early_node_map as initialisation assumes it is sorted */
sort_node_map();
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
for (i = 1; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
arch_zone_lowest_possible_pfn[i] =
arch_zone_highest_possible_pfn[i-1];
arch_zone_highest_possible_pfn[i] =
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
}
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes(zone_movable_pfn);
/* Print out the zone ranges */
printk("Zone PFN ranges:\n");
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
printk(" %-8s %8lu -> %8lu\n",
zone_names[i],
arch_zone_lowest_possible_pfn[i],
arch_zone_highest_possible_pfn[i]);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
printk("Movable zone start PFN for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
}
/* Print out the early_node_map[] */
printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
for (i = 0; i < nr_nodemap_entries; i++)
printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
early_node_map[i].start_pfn,
early_node_map[i].end_pfn);
/* Initialise every node */
setup_nr_node_ids();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid, pgdat, NULL,
find_min_pfn_for_node(nid), NULL);
/* Any memory on that node */
if (pgdat->node_present_pages)
node_set_state(nid, N_HIGH_MEMORY);
check_for_regular_memory(pgdat);
}
}
After the zone boundaries have been determined, free_area_init_nodes creates the data structures for each zone separately by calling free_area_init_node:
void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long node_start_pfn,
unsigned long *zholes_size)
{
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
calculate_node_totalpages(pgdat, zones_size, zholes_size); // sums the pages of the individual zones to compute the node's total number of pages
alloc_node_mem_map(pgdat); // responsible for initializing a simple but very important data structure: the node's page (struct page) array
free_area_init_core(pgdat, zones_size, zholes_size);
}
By default the code aligns the memory map to the maximum allocation order of the buddy system.
The kernel keeps track of the number of pages in the system using two global variables: nr_kernel_pages counts all consistently (directly) mapped pages, while nr_all_pages also includes the high-memory pages.
Generally speaking, the buddy system deals with external fragmentation, while the slab mechanism deals with internal fragmentation.
Internal fragmentation: memory that has been allocated by the kernel but cannot be used, for example the unused remainder of a page handed out for a much smaller object.
External fragmentation: fragmentation caused by frequent allocation and release of page frames, for example when only isolated free page frames remain and a request for a large contiguous range of page frames cannot be satisfied.
slab allocator
The slab layer is the kernel's caching mechanism: it manages memory in terms of objects, and at its core the actual physical pages are still allocated by the buddy system.
It serves two purposes:
- providing fine-grained memory areas for kernel allocations
- acting as a cache, mainly for objects that are frequently allocated and released
The memory of a cache is divided into multiple slabs, each slab consisting of one or more contiguous page frames. These page frames hold both the allocated objects and the still-free objects.
The basic idea of the slab allocator is to first obtain a single physical page, or a group of contiguous physical pages, from the buddy (page) allocator and then split the whole into many equally sized small memory units to satisfy requests for small memory areas; of course these small units also have to be managed efficiently so that allocation remains extremely fast. (In-depth Linux Device Driver Kernel Mechanisms)
Objects of the same type are grouped into one cache. Whenever such an object is requested, the slab allocator hands out a unit of that size from one of its slab lists; when the object is released, it is stored back on that list instead of being returned directly to the buddy system, which avoids internal fragmentation.
For objects that are frequently allocated and released, the slab allocator thus keeps the freed memory blocks on an internal list rather than returning them immediately to the buddy system; the next request for an object of this class is served from the most recently freed blocks.
Alternative allocators
Although the slab allocator works well for many possible workloads, there are situations in which it does not deliver optimal performance:
- slob allocator: built around a simple linked list of memory blocks and uses an equally simple first-fit algorithm when allocating memory (intended for tiny embedded systems)
- slub allocator: packs page frames into groups and manages these groups via unused fields in struct page, trying to keep the memory overhead as small as possible (intended for large computer systems)
View all cache activity
root@huawei linux-version # cat /proc/slabinfo [0]
slabinfo - version: 2.1
# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
au_finfo 0 0 192 21 1 : tunables 0 0 0 : slabdata 0 0 0
au_icntnr 0 0 832 19 4 : tunables 0 0 0 : slabdata 0 0 0
au_dinfo 0 0 192 21 1 : tunables 0 0 0 : slabdata 0 0 0
ovl_inode 816 1127 688 23 4 : tunables 0 0 0 : slabdata 49 49 0
nf_conntrack 132 132 320 12 1 : tunables 0 0 0 : slabdata 11 11 0
ext4_groupinfo_4k 336 336 144 28 1 : tunables 0 0 0 : slabdata 12 12 0
btrfs_delayed_node 0 0 312 13 1 : tunables 0 0 0 : slabdata 0 0 0
btrfs_ordered_extent 0 0 416 19 2 : tunables 0 0 0 : slabdata 0 0 0
btrfs_inode 0 0 1168 14 4 : tunables 0 0 0 : slabdata 0 0 0
fsverity_info 0 0 248 16 1 : tunables 0 0 0 : slabdata 0 0 0
ip6-frags 0 0 184 22 1 : tunables 0 0 0 : slabdata 0 0 0
PINGv6 0 0 1216 13 4 : tunables 0 0 0 : slabdata 0 0 0
RAWv6 130 130 1216 13 4 : tunables 0 0 0 : slabdata 10 10 0
UDPv6 168 168 1344 12 4 : tunables 0 0 0 : slabdata 14 14 0
tw_sock_TCPv6 144 144 248 16 1 : tunables 0 0 0 : slabdata 9 9 0
request_sock_TCPv6 0 0 304 13 1 : tunables 0 0 0 : slabdata 0 0 0
...
How the slab allocator is implemented:
Each cache is responsible for exactly one object type (for example struct task_struct instances) or provides a general-purpose buffer. The number of slabs in each cache varies with the number of pages used, the object length, and the number of objects being managed.
All caches in the system are kept on a doubly linked list, which gives the kernel a way to traverse all caches in sequence.
Slab structure: a slab is divided into management data and object data, plus the colouring space, and so on.
struct slab {
struct list_head list; // doubly linked list
unsigned long colouroff; // colour offset
void *s_mem; /* including colour offset */
unsigned int inuse; /* num of objs active in slab */
kmem_bufctl_t free;
unsigned short nodeid;
};
In some cases the length of a slab's memory area (minus the management head) is not divisible by the object length. The kernel then has some excess memory available, which it uses to give each slab a colour in the form of an offset: the individual slabs of a cache are given different offsets, so that their data lands in different cache lines. As a result the free memory at the beginning and the end of a slab differs from slab to slab.
static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
{
page->lru.next = (struct list_head *)cache;
}
static inline struct kmem_cache *page_get_cache(struct page *page)
{
page = compound_head(page);
BUG_ON(!PageSlab(page));
return (struct kmem_cache *)page->lru.next;
}
static inline void page_set_slab(struct page *page, struct slab *slab)
{
page->lru.prev = (struct list_head *)slab;
}
static inline struct slab *page_get_slab(struct page *page)
{
BUG_ON(!PageSlab(page));
return (struct slab *)page->lru.prev;
}
The kernel also sets the PG_slab flag on every physical memory page allocated to the slab allocator.
Colouring refers to the relationship between the slab cache and the hardware cache. Take slab objects of the same size: even though they sit at different physical addresses, the rules by which the cache maps addresses make it quite likely that both addresses are assigned to the same cache line. What are the consequences if software, for instance, reads these two objects repeatedly?
The corresponding cache line has to be refilled again and again, while the remaining cache lines stay under-used, because a given cache line can only hold addresses at particular address offsets.
Slabs are allocated through the page allocator, so their unit is the page size and each slab usually contains some unused free space. By shifting the start of each slab by a different offset, the objects inside different slabs also receive different address offsets, and objects from different slabs can therefore be loaded into different cache lines.
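A sketch of the colouring step performed each time a new slab is grown (simplified from cache_grow(); colour_next lives in the per-node kmem_list3 in this kernel version, while colour and colour_off belong to struct kmem_cache shown below):
/* Simplified sketch: every new slab gets the next colour, i.e. a
 * different offset of its first object, in units of colour_off. */
offset = l3->colour_next;                  /* colour of this new slab */
if (++l3->colour_next >= cachep->colour)   /* wrap after 'colour' possibilities */
        l3->colour_next = 0;
offset *= cachep->colour_off;              /* convert the colour into a byte offset */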
Each cache is represented by an instance of the kmem_cache structure:
struct kmem_cache {
/* 1) per-CPU data, touched on every allocation/free */
struct array_cache *array[NR_CPUS];
/* 2) tunable cache parameters, protected by cache_chain_mutex */
unsigned int batchcount; // number of objects to fetch from the cache's slabs when the per-CPU list is empty
unsigned int limit; // maximum number of objects kept on the per-CPU list; if exceeded, the kernel returns batchcount objects to the slabs
unsigned int shared;
unsigned int buffer_size; // length of the objects managed in this cache
u32 reciprocal_buffer_size;
/* 3) touched by the back end on every allocation and free */
unsigned int flags; /* flag register defining the global properties of the cache */
unsigned int num; /* number of objects per slab */
/* 4) cache growth/shrinking */
/* number of pages per slab, as a base-2 logarithm (order) */
unsigned int gfporder;
/* forced GFP flags, e.g. GFP_DMA */
gfp_t gfpflags;
size_t colour; /* colouring range of the cache */
unsigned int colour_off; /* colour offset */
struct kmem_cache *slabp_cache;
unsigned int slab_size;
unsigned int dflags; /* dynamic flags */
/* constructor */
void (*ctor)(struct kmem_cache *, void *); // constructor called when an object is created
/* 5) cache creation/removal */
const char *name; // string holding the name of the cache; used when listing the available caches in /proc/slabinfo
struct list_head next; // standard list element keeping all kmem_cache instances on the global list cache_chain
/* 6) statistics */
#if STATS
unsigned long num_active;
unsigned long num_allocations;
unsigned long high_mark;
unsigned long grown;
unsigned long reaped;
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
unsigned long node_frees;
unsigned long node_overflow;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss;
// ...
struct kmem_list3 *nodelists[MAX_NUMNODES]; // one array entry per possible memory node in the system
};
`slab` initialization process
The main issue here is a "chicken and egg" problem. ("In-depth Linux Kernel Architecture")
To initialize the slab data structures, the kernel needs a number of memory blocks that are much smaller than a full page, and such blocks are best allocated with kmalloc; but kmalloc itself can only be used once the slab system is up and running.
kmem_cache_init
is the function used to initialize the slab allocator. It is called during the kernel initialization phase in start_kernel, once the buddy system has been enabled:
void __init kmem_cache_init(void);
asmlinkage void __init start_kernel(void)
{
// ...
vfs_caches_init_early();
cpuset_init_early();
mem_init();
kmem_cache_init();
setup_per_cpu_pageset();
numa_policy_init();
// ...
}
On a multiprocessor system, however, the booting CPU is already running at this point while the other CPUs have not yet been initialized. kmem_cache_init therefore uses a multi-step process to activate the slab allocator step by step:
- kmem_cache_init creates the first slab cache in the system, to provide memory for instances of kmem_cache itself. To this end the kernel mostly uses static data created at compile time; a static data structure serves as the per-CPU array. The name of this cache is cache_cache.
- kmem_cache_init then initializes the general caches that serve as a source of memory for kmalloc. To handle the bootstrap problem, the kernel uses the variable g_cpucache_up, which can take one of four values (NONE, PARTIAL_AC, PARTIAL_L3, FULL) to reflect the state of kmalloc initialization. While the smallest kmalloc cache is being initialized, a static variable is again used for the per-CPU cache data. The state in g_cpucache_up is then set to PARTIAL_AC, meaning that array_cache instances can be allocated at once. If the initialized size is also big enough to allocate kmem_list3 instances, the state immediately moves on to PARTIAL_L3; otherwise this only happens after the next larger cache has been initialized. The per-CPU data of the remaining kmalloc caches can now be created with kmalloc itself, as arraycache_init instances that only need the smallest kmalloc memory area.
- In the final step of kmem_cache_init, all statically instantiated members of the data structures used so far are replaced with versions allocated dynamically via kmalloc. The state of g_cpucache_up is now FULL, indicating that the slab allocator is ready for use.
(This description is taken from "In-depth Linux Kernel Architecture".)
There is another way of describing this, which I think is better: see "Detailed explanation of the slab mechanism (4): slab initialization".
Three steps:
- The first cache, cache_cache, is created by initializing a global variable. Note that all caches hang on the linked list cache_chain, and cache_cache is the first node of that list. Once there is a cache built around the "rule" (object size) of struct kmem_cache, further kmem_cache instances can be allocated from slabs; this lays the foundation for creating caches with other "rules", such as struct arraycache_init and struct kmem_list3.
- Next, the 20 general-purpose caches with object sizes from 32 bytes up to 4194304 bytes are created one after another; these are the so-called ordinary (kmalloc) caches. Note how the global variable marking the initialization progress, g_cpucache_up, changes during this phase: NONE -> PARTIAL_AC -> PARTIAL_L3, as described in detail above.
- Finally, the structures that were originally emulated with global variables, struct arraycache_init and struct kmem_list3 (initarray_cache and initkmem_list3 respectively), are replaced by ones requested via kmalloc. At this point slab initialization is complete and other modules can conveniently obtain the corresponding physical memory via kmalloc; the initialization-progress variable g_cpucache_up is set to EARLY.
Later, kmem_cache_init_late, called from start_kernel, sets the initialization-progress variable g_cpucache_up to FULL, which completes the initialization.
Related slab APIs
Create a new slab cache:
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(struct kmem_cache *, void *))
allocate specific objects
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
return __cache_alloc(cachep, flags, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_alloc);
Release object
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
unsigned long flags;
local_irq_save(flags);
debug_check_no_locks_freed(objp, obj_size(cachep));
__cache_free(cachep, objp);
local_irq_restore(flags);
}
EXPORT_SYMBOL(kmem_cache_free);
Destroy cache
static void __kmem_cache_destroy(struct kmem_cache *cachep)
{
int i;
struct kmem_list3 *l3;
for_each_online_cpu(i)
kfree(cachep->array[i]);
/* NUMA: free the list3 structures */
for_each_online_node(i) {
l3 = cachep->nodelists[i];
if (l3) {
kfree(l3->shared);
free_alien_cache(l3->alien);
kfree(l3);
}
}
kmem_cache_free(&cache_cache, cachep);
}
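Putting the calls above together, a typical (hypothetical) user of the slab API looks roughly like this; struct foo, foo_cache and the helper functions are placeholder names:
/* Hypothetical example of the usual slab usage pattern. */
struct foo {
        int a;
        int b;
};
static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
        /* one cache per object type; NULL: no constructor */
        foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                                      0, SLAB_HWCACHE_ALIGN, NULL);
        return foo_cache ? 0 : -ENOMEM;
}

static void foo_use(void)
{
        struct foo *obj = kmem_cache_alloc(foo_cache, GFP_KERNEL);
        if (obj) {
                /* ... use the object ... */
                kmem_cache_free(foo_cache, obj);       /* back onto the slab lists */
        }
}

static void foo_exit(void)
{
        kmem_cache_destroy(foo_cache);   /* only valid once all objects have been freed */
}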
Finished, scatter flowers!