2.7 Linux Memory Management: Physical Page Allocation

When a process needs contiguous physical pages, the allocation is done through alloc_pages().
The function is defined in two places, mm/numa.c and include/linux/mm.h.
The NUMA and UMA allocation functions never coexist: which one is compiled depends on whether CONFIG_DISCONTIGMEM is selected.

1. The NUMA allocation function: CONFIG_DISCONTIGMEM selected
The compilation condition is "discontiguous memory" (CONFIG_DISCONTIGMEM), not CONFIG_NUMA, although CONFIG_NUMA does affect the code.
alloc_pages(int gfp_mask, unsigned long order)
gfp_mask: selects which allocation strategy to use
order: requests 2^order pages
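As a quick illustration (a minimal sketch of my own, not from the original text; demo_alloc is just an illustrative name), a 2.4-era caller could request a block of four pages and later release it like this, GFP_KERNEL being one of the standard gfp_mask policies:

    #include <linux/mm.h>    /* alloc_pages(), __free_pages() */

    static void demo_alloc(void)
    {
        /* request 2^2 = 4 contiguous physical pages; GFP_KERNEL may sleep */
        struct page *page = alloc_pages(GFP_KERNEL, 2);

        if (!page)
            return;                 /* allocation failed */

        /* ... use the pages ... */

        __free_pages(page, 2);      /* return all four pages */
    }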

The NUMA version of alloc_pages():
If CONFIG_NUMA is defined, allocation starts from the current node; otherwise pgdat_list is picked up and the pg_data_t nodes are traversed.
Each allocation starts from a different node in turn (round robin), in the hope of balancing the load across the nodes.
On each node, alloc_pages_pgdat() is tried.
==================== mm/numa.c 43 43 ====================
43  #ifdef CONFIG_DISCONTIGMEM
==================== mm/numa.c 91 128 ====================
91  /*
92   * This can be refined. Currently, tries to do round robin, instead
93   * should do concentratic circle search, starting from current node.
94   */
95  struct page * alloc_pages(int gfp_mask, unsigned long order)
96  {
97      struct page *ret = 0;
98      pg_data_t *start, *temp;
99  #ifndef CONFIG_NUMA
100     unsigned long flags;
101     static pg_data_t *next = 0;
102 #endif
103
104     if (order >= MAX_ORDER)
105         return NULL;
106 #ifdef CONFIG_NUMA
107     temp = NODE_DATA(numa_node_id());
108 #else
109     spin_lock_irqsave(&node_lock, flags);
110     if (!next) next = pgdat_list;
111     temp = next;
112     next = next->node_next;
113     spin_unlock_irqrestore(&node_lock, flags);
114 #endif
115     start = temp;
116     while (temp) {
117         if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
118             return(ret);
119         temp = temp->node_next;
120     }
121     temp = pgdat_list;
122     while (temp != start) {
123         if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
124             return(ret);
125         temp = temp->node_next;
126     }
127     return(0);
128 }
1.2 The alloc_pages_pgdat() function:
Under both NUMA and UMA the actual work ends up in the same function, __alloc_pages(), which is described in detail in the UMA section below.
gfp_mask acts as an index into the node_zonelists array.
==================== mm/numa.c 85 89 ====================
85  static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
86      unsigned long order)
87  {
88      return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);
89  }

2. The UMA allocation function: CONFIG_DISCONTIGMEM not selected
This version is compiled only when CONFIG_DISCONTIGMEM is undefined.
Clearly, under UMA there is only one pg_data_t node, namely contig_page_data, so no traversal is needed.
The actual allocation is carried out by __alloc_pages().
==================== include/linux/mm.h 343 352 ====================
343 #ifndef CONFIG_DISCONTIGMEM
344 static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
345 {
346     /*
347      * Gets optimized away by the compiler.
348      */
349     if (order >= MAX_ORDER)
350         return NULL;
351     return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);
352 }
2.1 The __alloc_pages() function:
This function performs the actual allocation.
zonelist is the list of zones belonging to the contig_page_data node (under UMA); a suitable zone on this list must supply the physical pages.
The zonelist_t structure records the concrete allocation strategy, i.e. gfp_mask, which is really a set of flag bits.
memory_pressure measures the pressure on page management: it is increased when pages are allocated and decreased when pages are returned.
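For reference, the 2.4 zonelist_t (in include/linux/mmzone.h) looks roughly like this; the zones array is a NULL-terminated list of fallback zones, tried in order:

    typedef struct zonelist_struct {
        zone_t *zones[MAX_NR_ZONES + 1];   /* NULL delimited */
        int gfp_mask;                      /* the strategy's flag bits */
    } zonelist_t;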

If the request is for a single page, the caller is allowed to wait for completion (__GFP_WAIT), and the request is not made for management purposes (PF_MEMALLOC is not set), then direct_reclaim is set to 1, meaning pages may be reclaimed directly from the zone's "inactive clean" pages. Generally these pages are not contiguous with one another, which is why they are offered only to single-page requests, and their contents have already been written out to the swap device (the swap partition).
If a page shortage is then detected, the kswapd and bdflush threads are woken up to free some space.
==================== mm/page_alloc.c 270 315 ====================
270 /*
271  * This is the 'heart' of the zoned buddy allocator:
272  */
273 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
274 {
275     zone_t **zone;
276     int direct_reclaim = 0;
277     unsigned int gfp_mask = zonelist->gfp_mask;
278     struct page * page;
279
280     /*
281      * Allocations put pressure on the VM subsystem.
282      */
283     memory_pressure++;
284
285     /*
286      * (If anyone calls gfp from interrupts nonatomically then it
287      * will sooner or later tripped up by a schedule().)
288      *
289      * We are falling back to lower-level zones if allocation
290      * in a higher zone fails.
291      */
292
293     /*
294      * Can we take pages directly from the inactive_clean
295      * list?
296      */
        /* a single page is requested and waiting is allowed */
297     if (order == 0 && (gfp_mask & __GFP_WAIT) &&
298         !(current->flags & PF_MEMALLOC))
299         direct_reclaim = 1;
300
301     /*
302      * If we are about to get low on free pages and we also have
303      * an inactive page shortage, wake up kswapd.
304      */
305     if (inactive_shortage() > inactive_target / 2 && free_shortage())
306         wakeup_kswapd(0);
307     /*
308      * If we are about to get low on free pages and cleaning
309      * the inactive_dirty pages would fix the situation,
310      * wake up bdflush.
311      */
312     else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
313             && nr_inactive_dirty_pages >= freepages.high)
314         wakeup_bdflush(0);
315


Next all the zones in the node's zonelist are traversed (in practice there are only three zones). rmqueue() tries to take a block of contiguous pages from a zone; when a zone's free pages run too low, kreclaimd is woken up to help reclaim pages.
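The three zones are, roughly as defined in 2.4's include/linux/mmzone.h:

    #define ZONE_DMA        0   /* low memory usable for ISA DMA */
    #define ZONE_NORMAL     1   /* directly mapped kernel memory */
    #define ZONE_HIGHMEM    2   /* memory beyond the direct mapping */
    #define MAX_NR_ZONES    3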
316 try_again:
317     /*
318      * First, see if we have any zones with lots of free memory.
319      *
320      * We allocate free memory first because it doesn't contain
321      * any data ... DUH!
322      */
323     zone = zonelist->zones;
        /* walk the NULL-terminated zone list */
324     for (;;) {
325         zone_t *z = *(zone++);
326         if (!z)
327             break;
328         if (!z->size)
329             BUG();
330
331         if (z->free_pages >= z->pages_low) {
332             page = rmqueue(z, order);    /* allocate from the buddy lists */
333             if (page)
334                 return page;
335         } else if (z->free_pages < z->pages_min &&
336                 waitqueue_active(&kreclaimd_wait)) {
337             wake_up_interruptible(&kreclaimd_wait);
338         }
339     }

If all three zones fail, two relaxations are considered:
1) lower the "watermark" the zones are required to maintain;
2) count the "inactive clean pages" cached in each zone as allocatable.
PAGES_HIGH and PAGES_LOW are watermark levels passed to __alloc_pages_limit(), which retries the allocation under the relaxed criterion (the lower the watermark we are prepared to accept, the tighter memory is). If even that fails, memory is genuinely short.
(Kernels from 2.6 onward rework this path and introduce new parameters such as migratetype, describing a page's migration type.)
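__alloc_pages_limit() is not quoted in this note, so here is a simplified sketch of its 2.4 logic (my reconstruction; consult mm/page_alloc.c for the exact code): it walks the same zonelist, picks the watermark named by the limit argument, and counts free + inactive-clean pages as the allocatable pool:

    static struct page * __alloc_pages_limit(zonelist_t *zonelist,
            unsigned long order, int limit, int direct_reclaim)
    {
        zone_t **zone = zonelist->zones;

        for (;;) {
            zone_t *z = *(zone++);
            unsigned long water_mark;

            if (!z)
                break;

            /* pick the watermark this pass must stay above */
            switch (limit) {
            default:
            case PAGES_MIN:
                water_mark = z->pages_min;
                break;
            case PAGES_LOW:
                water_mark = z->pages_low;
                break;
            case PAGES_HIGH:
                water_mark = z->pages_high;
            }

            if (z->free_pages + z->inactive_clean_pages > water_mark) {
                struct page *page = NULL;

                /* reclaim a clean page directly when that is allowed
                 * and the truly free pages are running low */
                if (direct_reclaim && z->free_pages < z->pages_min + 8)
                    page = reclaim_page(z);
                /* otherwise take a block from the buddy lists */
                if (!page)
                    page = rmqueue(z, order);
                if (page)
                    return page;
            }
        }
        return NULL;    /* found nothing */
    }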
341     /*
342      * Try to allocate a page from a zone with a HIGH
343      * amount of free + inactive_clean pages.
344      *
345      * If there is a lot of activity, inactive_target
346      * will be high and we'll have a good chance of
347      * finding a page using the HIGH limit.
348      */
349     page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
350     if (page)
351         return page;
352
353     /*
354      * Then try to allocate a page from a zone with more
355      * than zone->pages_low free + inactive_clean pages.
356      *
357      * When the working set is very large and VM activity
358      * is low, we're most likely to have our allocation
359      * succeed here.
360      */
361     page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
362     if (page)
363         return page;
364

Pages are now very scarce in every zone:
1) Wake up the kernel thread kswapd so it can try to swap out some pages. If gfp_mask says the caller would rather wait than fail (__GFP_WAIT), let the system schedule once so the current process yields to others; this gives kswapd a chance to run immediately.
2) Meanwhile other processes may release pages, and yielding also slows down the rate of allocation requests. Finally, __alloc_pages_limit() is run once more with PAGES_MIN.
Of course, this too may fail.
365     /*
366      * OK, none of the zones on our zonelist has lots
367      * of pages free.
368      *
369      * We wake up kswapd, in the hope that kswapd will
370      * resolve this situation before memory gets tight.
371      *
372      * We also yield the CPU, because that:
373      * - gives kswapd a chance to do something
374      * - slows down allocations, in particular the
375      *   allocations from the fast allocator that's
376      *   causing the problems ...
377      * - ... which minimises the impact the "bad guys"
378      *   have on the rest of the system
379      * - if we don't have __GFP_IO set, kswapd may be
380      *   able to free some memory we can't free ourselves
381      */
382     wakeup_kswapd(0);
383     if (gfp_mask & __GFP_WAIT) {
384         __set_current_state(TASK_RUNNING);
385         current->policy |= SCHED_YIELD;
386         schedule();
387     }
388
389     /*
390      * After waking up kswapd, we try to allocate a page
391      * from any zone which isn't critical yet.
392      *
393      * Kswapd should, in most situations, bring the situation
394      * back to normal in no time.
395      */
396     page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
397     if (page)
398         return page;
399

If this fails again, we must look at who is asking for the pages. If it is kswapd or kreclaimd, they are themselves "memory allocation workers": they request pages "on official business", which is more important than an ordinary process's request. Such processes have the PF_MEMALLOC bit set in the flags field of their task_struct; for ordinary processes it is 0.

The failure can have two causes:
1) the number of allocatable pages really is too small;
2) the total is not small, but no free block of the requested size exists. In that case there are often many single pages on a zone's inactive_clean_pages list, and reclaiming them may allow larger blocks to be pieced together. Pages on the inactive_dirty_pages list can also be made clean and reclaimable by writing their contents out to the swap device or to a file.

When __free_page() releases a page it coalesces free pages into the largest possible blocks, so after reclaiming each page we call rmqueue() to see whether the request can now be satisfied.

While page_launder() runs, the current process's PF_MEMALLOC bit is set to 1, giving it the "official business" privilege: page_launder() itself may need to allocate some temporary working pages, which could fail without that privilege.

If we still fail, kswapd is woken up again and the process requesting the pages sleeps until kswapd completes a round of work and wakes it; for a single-page request, a goto then jumps back to the try_again label at the top of __alloc_pages(). The alternative is to call try_to_free_pages() directly, a function normally invoked by kswapd.

If we get here "on official business", or as an ordinary process that has tried everything and did not jump back to try_again only because the request was for a multi-page block, we fall through to the final phase below.
400     /*
401      * Damn, we didn't succeed.
402      *
403      * This can be due to 2 reasons:
404      * - we're doing a higher-order allocation
405      *     --> move pages to the free list until we succeed
406      * - we're /really/ tight on memory
407      *     --> wait on the kswapd waitqueue until memory is freed
408      */
409     if (!(current->flags & PF_MEMALLOC)) {
410         /*
411          * Are we dealing with a higher order allocation?
412          *
413          * Move pages from the inactive_clean to the free list
414          * in the hope of creating a large, physically contiguous
415          * piece of free memory.
416          */
417         if (order > 0 && (gfp_mask & __GFP_WAIT)) {
418             zone = zonelist->zones;
419             /* First, clean some dirty pages. */
420             current->flags |= PF_MEMALLOC;
421             page_launder(gfp_mask, 1);
422             current->flags &= ~PF_MEMALLOC;
423             for (;;) {
424                 zone_t *z = *(zone++);
425                 if (!z)
426                     break;
427                 if (!z->size)
428                     continue;
429                 while (z->inactive_clean_pages) {
430                     struct page * page;
431                     /* Move one page to the free list. */
432                     page = reclaim_page(z);
433                     if (!page)
434                         break;
435                     __free_page(page);
436                     /* Try if the allocation succeeds. */
437                     page = rmqueue(z, order);
438                     if (page)
439                         return page;
440                 }
441             }
442         }
443         /*
444          * When we arrive here, we are really tight on memory.
445          *
446          * We wake up kswapd and sleep until kswapd wakes us
447          * up again. After that we loop back to the start.
448          *
449          * We have to do this because something else might eat
450          * the memory kswapd frees for us and we need to be
451          * reliable. Note that we don't loop back for higher
452          * order allocations since it is possible that kswapd
453          * simply cannot free a large enough contiguous area
454          * of memory *ever*.
455          */
456         if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
457             wakeup_kswapd(1);
458             memory_pressure++;
459             if (!order)
460                 goto try_again;
461         /*
462          * If __GFP_IO isn't set, we can't wait on kswapd because
463          * kswapd just might need some IO locks /we/ are holding ...
464          *
465          * SUBTLE: The scheduling point above makes sure that
466          * kswapd does get the chance to free memory we can't
467          * free ourselves...
468          */
469         } else if (gfp_mask & __GFP_WAIT) {
470             try_to_free_pages(gfp_mask);
471             memory_pressure++;
472             if (!order)
473                 goto try_again;
474         }
475
476     }
477

The earlier calls to __alloc_pages_limit() still held something back:
even with PAGES_MIN, the criterion for allocating was that a zone's pool of allocatable pages stay above z->pages_min, so some "savings" were kept in reserve.
To cope with the emergency, the time has come to "spend the last of our savings", so we press on below.
478     /*
479      * Final phase: allocate anything we can!
480      *
481      * Higher order allocations, GFP_ATOMIC allocations and
482      * recursive allocations (PF_MEMALLOC) end up here.
483      *
484      * Only recursive allocations can use the very last pages
485      * in the system, otherwise it would be just too easy to
486      * deadlock the system...
487      */
488     zone = zonelist->zones;
489     for (;;) {
490         zone_t *z = *(zone++);
491         struct page * page = NULL;
492         if (!z)
493             break;
494         if (!z->size)
495             BUG();
496
497         /*
498          * SUBTLE: direct_reclaim is only possible if the task
499          * becomes PF_MEMALLOC while looping above. This will
500          * happen when the OOM killer selects this task for
501          * instant execution...
502          */
503         if (direct_reclaim) {
504             page = reclaim_page(z);
505             if (page)
506                 return page;
507         }
508
509         /* XXX: is pages_min/4 a good amount to reserve for this? */
510         if (z->free_pages < z->pages_min / 4 &&
511                 !(current->flags & PF_MEMALLOC))
512             continue;
513         page = rmqueue(z, order);
514         if (page)
515             return page;
516     }
517
518     /* No luck.. */
519     printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
520     return NULL;
521 }
If this fails yet again, something is wrong with the system itself.
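Most kernel code does not call alloc_pages() directly but goes through thin wrappers. In the 2.4 tree, __get_free_pages() (mm/page_alloc.c) is roughly:

    unsigned long __get_free_pages(int gfp_mask, unsigned long order)
    {
        struct page * page;

        page = alloc_pages(gfp_mask, order);
        if (!page)
            return 0;
        /* hand back the kernel virtual address of the block */
        return page_address(page);
    }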

2.2 The rmqueue() function: takes 2^order pages from a given zone.
zone->free_area is an array indexed by order, and each element carries a free_list of free blocks of that size. A list may be empty, i.e. the current order has no free block, in which case the request is satisfied by splitting a block from a higher-order free_area.
We look in the free_area for the requested order and unlink a block from its page list; the unlinking must not be interrupted, so it is protected with spin_lock_irqsave().
The rmqueue() code below is the 2.4 version; 2.6 and later rewrote it (see mm/page_alloc.c).
memlist_entry() yields the page structure heading the free block we need, and memlist_del() removes that block from the free_area's list.
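For reference, free_area_t (include/linux/mmzone.h, 2.4) is roughly:

    typedef struct free_area_struct {
        struct list_head  free_list;  /* blocks of 2^order pages */
        unsigned int     *map;        /* buddy bitmap for this order */
    } free_area_t;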
==================== mm/page_alloc.c 172 211 ====================
172 static struct page * rmqueue(zone_t *zone, unsigned long order)
173 {
174     free_area_t * area = zone->free_area + order;
175     unsigned long curr_order = order;
176     struct list_head *head, *curr;
177     unsigned long flags;
178     struct page *page;
179
180     spin_lock_irqsave(&zone->lock, flags);
181     do {
182         head = &area->free_list;
183         curr = memlist_next(head);
184
185         if (curr != head) {
186             unsigned int index;
187
188             page = memlist_entry(curr, struct page, list);
189             if (BAD_RANGE(zone,page))
190                 BUG();
191             memlist_del(curr);
192             index = (page - mem_map) - zone->offset;
193             MARK_USED(index, curr_order, area);
194             zone->free_pages -= 1 << order;
195             /* split a larger block down to the requested order */
196             page = expand(zone, page, index, order, curr_order, area);
197             spin_unlock_irqrestore(&zone->lock, flags);
198
199             set_page_count(page, 1);
200             if (BAD_RANGE(zone,page))
201                 BUG();
202             DEBUG_ADD_PAGE
203             return page;
204         }
205         curr_order++;
206         area++;
207     } while (curr_order < MAX_ORDER);
208     spin_unlock_irqrestore(&zone->lock, flags);
209
210     return NULL;
211 }
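The memlist_* names and MARK_USED are simple macros in 2.4's mm/page_alloc.c, roughly:

    #define memlist_entry     list_entry
    #define memlist_del       list_del
    #define memlist_next(x)   ((x)->next)
    #define memlist_add_head  list_add

    /* toggle the buddy bit covering this pair of 2^order blocks */
    #define MARK_USED(index, order, area) \
        __change_bit((index) >> (1+(order)), (area)->map)

Each bit in area->map covers a buddy pair; toggling it on every allocation and free lets the free path tell whether a block's buddy is also free and the two can be merged.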
2.3 The expand() function
Splits a block found in a higher-order free_area and returns the unused halves to lower-order free_areas.
high is the order of the block actually found and low is the requested order; the block is halved repeatedly, putting the lower half back on the next free_area down each time, until the remaining piece has size 2^low. For example, with high = 2 and low = 0, a four-page block (pages 0..3) is split so that pages 0..1 go back as an order-1 block, page 2 goes back as an order-0 block, and page 3 is returned to the caller.
==================== mm/page_alloc.c 150 169 ====================
150 static inline struct page * expand (zone_t *zone, struct page *page,
151     unsigned long index, int low, int high, free_area_t * area)
152 {
153     unsigned long size = 1 << high;
154
155     while (high > low) {
156         if (BAD_RANGE(zone,page))
157             BUG();
158         area--;
159         high--;
160         size >>= 1;
161         memlist_add_head(&(page)->list, &(area)->free_list);
162         MARK_USED(index, high, area);
163         index += size;
164         page += size;
165     }
166     if (BAD_RANGE(zone,page))
167         BUG();
168     return page;
169 }


Reposted from blog.csdn.net/zhangshuaiisme/article/details/75808007