Linux memory management: (3) kmalloc, vmalloc, malloc, mmap

1. kmalloc

The kmalloc() function, widely used in the kernel, is implemented on top of the slab mechanism. In a way that mirrors the buddy system, slab caches are created for power-of-two object sizes such as 16, 32, 64 and 128 bytes, and the system creates the corresponding slab descriptors kmalloc-16, kmalloc-32, kmalloc-64 and so on; this happens at boot in the create_kmalloc_caches() function. For example, to allocate a small block of 30 bytes you can call kmalloc(30, GFP_KERNEL), and the system will hand out an object from the kmalloc-32 slab descriptor.
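For a concrete feel of the API, here is a minimal sketch of how a driver might use kmalloc()/kfree(); the struct my_dev type and its fields are hypothetical, made up only for this example:

#include <linux/slab.h>
#include <linux/gfp.h>

struct my_dev {
	int id;
	char name[24];
};

static struct my_dev *my_dev_create(void)
{
	/* sizeof(*dev) is 28 bytes here, so the object comes from the
	 * kmalloc-32 slab cache (or a nearby size class, depending on
	 * the kernel configuration) */
	struct my_dev *dev = kmalloc(sizeof(*dev), GFP_KERNEL);

	if (!dev)
		return NULL;
	dev->id = 0;
	return dev;
}

static void my_dev_destroy(struct my_dev *dev)
{
	kfree(dev);	/* returns the object to its slab cache */
}

Because the object comes from a slab cache built on physically contiguous memory, the returned address is physically contiguous as well.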

static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	...
	// kmalloc_index() looks up which slab cache (kmalloc-16, kmalloc-32, ...) serves this size
	index = kmalloc_index(size);
	...
	return kmem_cache_alloc_trace(
				kmalloc_caches[kmalloc_type(flags)][index],
				flags, size);
	...
}
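To make the size-to-cache mapping concrete, the following stand-alone user-space sketch simply rounds a request up to the next power-of-two size class starting at 16 bytes. It is only an approximation of kmalloc_index(); the real kernel function also handles special caches such as kmalloc-96 and kmalloc-192.

#include <stdio.h>

/* Round a request up to the next power-of-two size class,
 * starting from the 16-byte cache listed above. Illustrative only. */
static size_t size_class(size_t size)
{
	size_t cls = 16;

	while (cls < size)
		cls <<= 1;
	return cls;
}

int main(void)
{
	size_t sizes[] = { 8, 30, 100, 500 };

	for (int i = 0; i < 4; i++)
		printf("kmalloc(%zu) -> kmalloc-%zu\n",
		       sizes[i], size_class(sizes[i]));
	return 0;
}

Running it prints, for example, "kmalloc(30) -> kmalloc-32", matching the 30-byte example above.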

2. vmalloc

kmalloc() is based on the slab allocator. A slab cache is built on a large block of physically contiguous memory, so its objects are also physically contiguous. But what if the kernel does not need physically contiguous memory and only requires the block to be contiguous in kernel virtual address space? This is where vmalloc() comes in handy.
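A minimal sketch of typical vmalloc()/vfree() usage for a large, virtually contiguous buffer follows; the buffer name and the 4 MiB size are made up for illustration:

#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/errno.h>

#define LOG_BUF_SIZE (4 * 1024 * 1024)	/* 4 MiB, for illustration */

static void *log_buf;

static int log_buf_init(void)
{
	/* Virtually contiguous, but the backing physical pages may be
	 * scattered, so this buffer must not be handed directly to a
	 * device for DMA */
	log_buf = vmalloc(LOG_BUF_SIZE);
	if (!log_buf)
		return -ENOMEM;
	memset(log_buf, 0, LOG_BUF_SIZE);
	return 0;
}

static void log_buf_exit(void)
{
	vfree(log_buf);
}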

The kernel virtual address area mapped by the vmalloc() function is shown in the figure below:

[Figure: layout of the vmalloc area in the kernel virtual address space]

The process of allocating memory by vmalloc() is shown in the figure below:

[Figure: flow of memory allocation by vmalloc()]

With that high-level picture of how vmalloc() allocates memory in mind, the rest of this section follows the flow chart and focuses on the source code:

vmalloc->__vmalloc_node_flags->__vmalloc_node

static void *__vmalloc_node(unsigned long size, unsigned long align,
			    gfp_t gfp_mask, pgprot_t prot,
			    int node, const void *caller)
{
	// VMALLOC_START is the start of the vmalloc area; it begins where the
	// kernel module area ends (MODULES_END)
	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
				gfp_mask, prot, 0, node, caller);
}

vmalloc->__vmalloc_node_flags->__vmalloc_node->__vmalloc_node_range

void *__vmalloc_node_range(unsigned long size, unsigned long align,
			unsigned long start, unsigned long end, gfp_t gfp_mask,
			pgprot_t prot, unsigned long vm_flags, int node,
			const void *caller)
{
	...
	// vmalloc() allocations are rounded up to page size, so it is suited
	// to large memory blocks
	size = PAGE_ALIGN(size);
	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
		goto fail;

	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
				vm_flags, start, end, node, gfp_mask, caller);
	if (!area)
		goto fail;

	// Allocate physical pages and map them into the vm_struct area;
	// returns the start address of the vm_struct area
	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
	if (!addr)
		return NULL;
	...
}

vmalloc->__vmalloc_node_flags->__vmalloc_node->__vmalloc_node_range->__get_vm_area_node

static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long flags, unsigned long start,
		unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
	struct vmap_area *va;
	struct vm_struct *area;

	// Make sure we are not in interrupt context, because vmalloc() may
	// sleep during allocation
	BUG_ON(in_interrupt());
	// Align to page size once more
	size = PAGE_ALIGN(size);
	if (unlikely(!size))
		return NULL;

	// If the vmalloc area is being allocated for IOREMAP, align to 128
	// pages by default
	if (flags & VM_IOREMAP)
		align = 1ul << clamp_t(int, get_count_order_long(size),
				       PAGE_SHIFT, IOREMAP_MAX_ORDER);

	// Allocate a vm_struct data structure to describe this vmalloc area
	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!area))
		return NULL;

	// If flags does not contain VM_NO_GUARD, allocate one extra page to
	// serve as a guard page
	if (!(flags & VM_NO_GUARD))
		size += PAGE_SIZE;

	// Allocate a vmap area: search the vmalloc region for a free hole that
	// is large enough and not in use; once a suitable hole is found,
	// __insert_vmap_area() registers it in the red-black tree
	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(area);
		return NULL;
	}

	// Fill in the vm_struct area and return this vm_struct data structure
	setup_vmalloc_vm(area, va, flags, caller);

	return area;
}

vmalloc->__vmalloc_node_flags->__vmalloc_node->__vmalloc_node_range->__vmalloc_area_node

static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
				 pgprot_t prot, int node)
{
	...
	// Set the __GFP_HIGHMEM modifier: when the gfp_mask does not require
	// allocating from a DMA zone, __GFP_HIGHMEM is set so that highmem is
	// used preferentially for the backing pages
	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
					0 :
					__GFP_HIGHMEM;

	// Work out how many pages the vm_struct area covers
	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
	// area->pages stores the struct page pointers of the allocated pages
	array_size = (nr_pages * sizeof(struct page *));

	area->nr_pages = nr_pages;
	/* Please note that the recursion is strictly bounded. */
	if (array_size > PAGE_SIZE) {
		pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
				PAGE_KERNEL, node, area->caller);
	} else {
		pages = kmalloc_node(array_size, nested_gfp, node);
	}
	area->pages = pages;
	if (!area->pages) {
		remove_vm_area(area->addr);
		kfree(area);
		return NULL;
	}

	// Loop over all area->nr_pages pages and call alloc_page() for each
	// one to allocate the actual physical page. Because every page is
	// allocated individually, the physical pages backing a vmalloc()
	// allocation may not be contiguous.
	for (i = 0; i < area->nr_pages; i++) {
		struct page *page;

		if (node == NUMA_NO_NODE)
			page = alloc_page(alloc_mask|highmem_mask);
		else
			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);

		if (unlikely(!page)) {
			/* Successfully allocated i pages, free them in __vunmap() */
			area->nr_pages = i;
			goto fail;
		}
		area->pages[i] = page;
		if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
			cond_resched();
	}

	// Set up the page table mappings for the area
	if (map_vm_area(area, prot, pages))
		goto fail;
	// Return the start address of the vm_struct area
	return area->addr;

...
}

3. malloc

The malloc() function is a core function provided by the C standard library. After some bookkeeping of its own, the C library calls the Linux brk system call. The space managed by brk lies between the top of the data segment (end_data) and the bottom of the user stack, so dynamically allocated memory starts at the process's end_data. Each allocation pushes the boundary address upward by a certain amount, and both the kernel and the process record the current boundary address. The ARM64 process address space layout is shown in the figure below:

[Figure: ARM64 process address space layout]
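Before looking at the kernel side, a small user-space experiment makes the moving boundary visible: sbrk(0) reports the current program break, and it moves up once malloc() has to grow the heap. This assumes a typical glibc setup, where requests below the mmap threshold (128 KiB by default) are served from the brk heap rather than through mmap:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	void *before = sbrk(0);		/* current program break */
	void *p = malloc(100 * 1024);	/* below the mmap threshold: served via brk */
	void *after = sbrk(0);

	printf("break before: %p\n", before);
	printf("break after : %p (moved by %ld bytes)\n",
	       after, (long)((char *)after - (char *)before));

	free(p);
	return 0;
}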

The flow chart of the brk system call is shown below:

[Figure: flow of the brk system call]

To make the process more concrete, it is explained below around the source code, following the flow chart:

__do_sys_brk

// After macro expansion the function is named __do_sys_brk
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
	unsigned long retval;
	unsigned long newbrk, oldbrk, origbrk;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *next;
	unsigned long min_brk;
	bool populate;
	bool downgraded = false;
	LIST_HEAD(uf);

	// Take the mm->mmap_sem read-write semaphore as a writer, because the
	// process address space is about to be modified
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;

	// The memory descriptor's brk member records the current boundary of
	// the dynamic allocation area
	origbrk = mm->brk;

...
	// The memory descriptor's start_brk member records the start address
	// of the dynamic allocation area
	min_brk = mm->start_brk;
#endif
	if (brk < min_brk)
		goto out;

	...

	// newbrk is the new boundary requested by brk: the current boundary of
	// the dynamic allocation area plus the amount of memory the user
	// process asked for
	newbrk = PAGE_ALIGN(brk);
	// oldbrk is the current boundary of the dynamic allocation area
	oldbrk = PAGE_ALIGN(mm->brk);
	// Check whether the boundary actually needs to move
	if (oldbrk == newbrk) {
		mm->brk = brk;
		goto success;
	}

	// If the new boundary is below the old one, the process is releasing
	// space; call __do_munmap() to free that part of the address space
	if (brk <= mm->brk) {
		...
		ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
		...
	}

	// find_vma() looks up a VMA using the old boundary address to check
	// whether an existing VMA already overlaps the requested range; if a
	// VMA covering the range starting at the old boundary is found, that
	// address space is already in use and the request fails
	next = find_vma(mm, oldbrk);
	if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
		goto out;

	// No existing VMA covers the range, so call do_brk_flags() to allocate
	// a new VMA
	if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
		goto out;
	// Record the requested brk in mm->brk so the next brk call knows the
	// current boundary address
	mm->brk = brk;

success:
	populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
	// Release the mm->mmap_sem semaphore
	if (downgraded)
		up_read(&mm->mmap_sem);
	else
		up_write(&mm->mmap_sem);
	userfaultfd_unmap_complete(mm, &uf);
	if (populate)
		// Allocate physical memory immediately (VM_LOCKED is set)
		mm_populate(oldbrk, newbrk - oldbrk);
	// Return the brk address of this request
	return brk;

...
}

__do_sys_brk->do_brk_flags

// Allocate a VMA
// addr: the old boundary address
// len: the amount of memory requested
// flags: flags passed in for the allocation
// uf: internal temporary list
static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
{
	...
	// The flags parameter passed in is usually 0; here it is extended with
	// VM_DATA_DEFAULT_FLAGS (the VMA gets the VM_READ and VM_WRITE attributes)
	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

	// get_unmapped_area() searches the process address space for a usable
	// linear address range and returns the start of an unmapped region
	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
	if (offset_in_page(error))
		return error;

	error = mlock_future_check(mm, mm->def_flags, len);
	if (error)
		return error;

	// find_vma_links() walks the process's VMA red-black tree and, based
	// on addr, finds the most suitable insertion point; rb_link ends up
	// pointing at the rb_left or rb_right pointer of that node.
	// A return value of 0 means a suitable insertion point was found.
	// A return value of -ENOMEM means the range overlaps an existing VMA,
	// in which case do_munmap() is called to release the overlapping range.
	while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
			      &rb_parent)) {
		if (do_munmap(mm, addr, len, uf))
			return -ENOMEM;
	}

	...

	// vma_merge() checks whether the new range can be merged with a VMA
	// adjacent to addr; if it cannot, a new VMA covering [addr, addr+len]
	// has to be created
	vma = vma_merge(mm, prev, addr, addr + len, flags,
			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
	if (vma)
		goto out;

	// vma_merge() could not merge with an existing VMA, so create a new one
	vma = vm_area_alloc(mm);
	if (!vma) {
		vm_unacct_memory(len >> PAGE_SHIFT);
		return -ENOMEM;
	}

	...
	// vm_get_page_prot() derives the PTE attributes from the flags value
	vma->vm_page_prot = vm_get_page_prot(flags);
	// Add the newly created VMA to the mm->mmap list and the red-black tree
	vma_link(mm, vma, prev, rb_link, rb_parent);
...
}

4. mmap

The mmap/munmap system calls are used for:

  • allocating memory, reading and writing large files, and mapping dynamic library files in user programs
  • sharing memory between multiple processes

The mmap/munmap functions are declared as follows:

#include <sys/mman.h>

void *mmap(void *addr, size_t length, int prot, int flags,
           int fd, off_t offset);
int munmap(void *addr, size_t length);
  • addr: the starting address of the mapping in the process address space; for portability it is usually set to NULL so that the kernel picks a suitable address
  • length: the size of the mapping in the process address space
  • prot: the access permissions of the mapped area (read, write, execute)
  • flags: the properties of the mapping, such as shared mapping or private mapping
  • fd: for a file mapping, the file descriptor of the open file
  • offset: for a file mapping, the offset into the file where the mapping starts
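Putting these parameters together, here is a minimal user-space sketch that maps a file read-only and prints its contents; the path /etc/hostname is just an example, any readable file will do:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY);	/* any readable, non-empty file */
	struct stat st;

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;

	/* addr = NULL: let the kernel pick the address; PROT_READ + MAP_PRIVATE:
	 * a read-only private file mapping starting at offset 0 */
	char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	fwrite(p, 1, st.st_size, stdout);	/* the file contents appear at p */

	munmap(p, st.st_size);
	close(fd);
	return 0;
}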

The code framework of the mmap mechanism in the Linux kernel is very similar to that of brk. Its implementation flow in the kernel is shown in the figure below:

[Figure: implementation flow of the mmap mechanism in the Linux kernel]
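For the shared-memory use listed above, a shared anonymous mapping created before fork() is visible to both parent and child; a minimal sketch:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/wait.h>

int main(void)
{
	/* MAP_SHARED | MAP_ANONYMOUS: not backed by a file (fd = -1), and
	 * writes are visible to every process sharing the mapping */
	char *shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (shared == MAP_FAILED)
		return 1;

	if (fork() == 0) {
		strcpy(shared, "hello from the child");	/* child writes */
		return 0;
	}

	wait(NULL);					/* parent waits, then reads */
	printf("parent sees: %s\n", shared);
	munmap(shared, 4096);
	return 0;
}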
