linux内核工程师 3.03节 Linux伙伴系统(三)--分配页

前面已经介绍了伙伴系统的原理和Linux伙伴系统的数据结构，现在来看伙伴系统是如何来分配页面的。实际上，伙伴系统分配页面的算法并不复杂，但是由于考虑到分配内存时要尽量减少碎片的产生(涉及迁移机制)以及当内存不足时需要采取各种更为积极的手段，使得内核分配页面的相关函数完整地分析起来比较复杂庞大。在这里，我们只关注分配时最一般的情况，而其他情况的处理在以后单独拿出来讨论。

我们从__alloc_pages_nodemask()这个函数开始分析，所有的分配页面的函数最终都会落到这个函数上面，它是伙伴系统的入口。

[cpp]view plain copy
/*
 * __alloc_pages_nodemask - entry point of the buddy allocator.
 *
 * @gfp_mask: allocation flags; masked with gfp_allowed_mask before use.
 * @order:    the request is for a block of 2^order pages.
 * @zonelist: candidate zones to allocate from.
 * @nodemask: node restriction handed on to the zonelist helpers.
 *
 * Returns the page produced by the fast path (get_page_from_freelist) or,
 * if that fails, by __alloc_pages_slowpath.  Returns NULL early when
 * should_fail_alloc_page() fires, when the zonelist has no first zone, or
 * when no preferred zone can be found.
 */
<span style="font-size:12px;">struct page *  
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,  
            struct zonelist *zonelist, nodemask_t *nodemask)  
{  
    /* Derive from gfp_mask the highest zone index this request may use. */  
    enum zone_type high_zoneidx = gfp_zone(gfp_mask);  
    struct zone *preferred_zone;  
    struct page *page;  
    /* Derive from gfp_mask the migrate type of the pages to allocate. */  
    int migratetype = allocflags_to_migratetype(gfp_mask);  
  
    gfp_mask &= gfp_allowed_mask;  
  
    lockdep_trace_alloc(gfp_mask);  
  
    might_sleep_if(gfp_mask & __GFP_WAIT);  
  
    if (should_fail_alloc_page(gfp_mask, order))  
        return NULL;  
  
    /* 
     * Check the zones suitable for the gfp_mask contain at least one 
     * valid zone. It's possible to have an empty zonelist as a result 
     * of GFP_THISNODE and a memoryless node 
     */  
    if (unlikely(!zonelist->_zonerefs->zone))  
        return NULL;  
  
    /* The preferred zone is used for statistics later */  
    /*
     * Look up the first zone in the zonelist that fits high_zoneidx and
     * nodemask and store it in preferred_zone.
     * NOTE(review): the exact matching rule lives in first_zones_zonelist(),
     * which is not shown here.
     */  
    first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);  
    if (!preferred_zone)  
        return NULL;  
  
    /* First allocation attempt */  
    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,  
            zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,  
            preferred_zone, migratetype);  
    if (unlikely(!page))  
        /*
         * The first attempt failed: retry through the slow path.
         * Per the original author's note this includes waking the page-out
         * daemon and similar measures (not visible in this excerpt).
         */  
        page = __alloc_pages_slowpath(gfp_mask, order,  
                zonelist, high_zoneidx, nodemask,  
                preferred_zone, migratetype);  
  
    trace_mm_page_alloc(page, order, gfp_mask, migratetype);  
    return page;  
}</span>  

首先要做的就是找到指定的分配管理区，管理区的编号保存在high_zoneidx中
然后就是尝试第一次分配，流程是：从指定的管理区开始扫描管理区-->找到空闲内存充足的管理区-->从指定的迁移类型链表中分配内存-->如果在指定迁移类型中找不到，则到其他的迁移类型中去寻找
如果第一次分配在各个管理区都找不到可以满足分配的内存，那么说明管理区的内存确实已经不够了，于是开始启用一条慢速的途径来分配，包括尝试去换出一些不经常使用的页等等。内核会在这次分配中表现得更为积极，其中的细节涉及到了其他一些复杂的东西，以后再做分析

[cpp]view plain copy
/*
 * get_page_from_freelist - scan the zonelist and allocate from the first
 * zone that passes the cpuset and watermark checks.
 *
 * Walks the zones of @zonelist limited by @high_zoneidx and @nodemask.
 * For each zone it optionally consults the zonelist cache (NUMA builds),
 * applies the cpuset check when ALLOC_CPUSET is set, checks the watermark
 * selected by @alloc_flags unless ALLOC_NO_WATERMARKS is set, and finally
 * calls buffered_rmqueue().  Returns the allocated page, or NULL if no
 * zone yielded one (after a second scan with the zonelist cache disabled,
 * when that cache had been active).
 */
static struct page *  
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,  
        struct zonelist *zonelist, int high_zoneidx, int alloc_flags,  
        struct zone *preferred_zone, int migratetype)  
{  
    struct zoneref *z;  
    struct page *page = NULL;  
    int classzone_idx;  
    struct zone *zone;  
    nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */  
    int zlc_active = 0;     /* set if using zonelist_cache */  
    int did_zlc_setup = 0;      /* just call zlc_setup() one time */  
  
    /* Index of the preferred zone; passed to the watermark checks below. */  
    classzone_idx = zone_idx(preferred_zone);  
zonelist_scan:  
    /* 
     * Scan zonelist, looking for a zone with enough free. 
     * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 
     */  
    /*
     * Iterate from the chosen zone until a zone with enough free space is
     * found.  Example from the original article: if high_zoneidx is
     * ZONE_HIGHMEM the order is HIGHMEM --> NORMAL --> DMA; if it is
     * ZONE_NORMAL the order is NORMAL --> DMA.
     * NOTE(review): the actual order is whatever the zonelist encodes.
     */  
    for_each_zone_zonelist_nodemask(zone, z, zonelist,  
                        high_zoneidx, nodemask) {  
        if (NUMA_BUILD && zlc_active &&  
            !zlc_zone_worth_trying(zonelist, z, allowednodes))  
                continue;  
  
        /* With ALLOC_CPUSET set, skip zones rejected by the cpuset check. */  
        if ((alloc_flags & ALLOC_CPUSET) &&  
            !cpuset_zone_allowed_softwall(zone, gfp_mask))  
                goto try_next_zone;  
  
        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);  
        if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {  
            unsigned long mark;  
            int ret;  
              
            /*
             * alloc_flags selects which of the zone's watermarks to use
             * (min / low / high).  Per the original note, the allocation
             * is only allowed if free memory stays at or above that mark
             * afterwards; the check itself is in zone_watermark_ok().
             */  
            mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];  
  
            /* Watermark is fine: allocate from this zone. */  
            if (zone_watermark_ok(zone, order, mark,  
                    classzone_idx, alloc_flags))  
                goto try_this_zone;  
  
            /* Zone reclaim disabled: give up on this zone. */  
            if (zone_reclaim_mode == 0)  
                goto this_zone_full;  
  
            /*
             * Try to reclaim pages from this zone.  The original article
             * describes this part as NUMA-specific.
             */  
            ret = zone_reclaim(zone, gfp_mask, order);  
            switch (ret) {  
            case ZONE_RECLAIM_NOSCAN:/* no reclaim was performed */  
                /* did not scan */  
                goto try_next_zone;  
            case ZONE_RECLAIM_FULL:  /* no reclaimable pages were found */  
                /* scanned but unreclaimable */  
                goto this_zone_full;  
            default:  
                /* did we reclaim enough */  
                if (!zone_watermark_ok(zone, order, mark,  
                        classzone_idx, alloc_flags))  
                    goto this_zone_full;  
            }  
        }  
  
try_this_zone:/* allocate a block of 2^order pages from this zone */  
        page = buffered_rmqueue(preferred_zone, zone, order,  
                        gfp_mask, migratetype);  
        if (page)  
            break;  
this_zone_full:  
        if (NUMA_BUILD)  
            zlc_mark_zone_full(zonelist, z);  
try_next_zone:  
        if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {  
            /* 
             * we do zlc_setup after the first zone is tried but only 
             * if there are multiple nodes make it worthwhile 
             */  
            allowednodes = zlc_setup(zonelist, alloc_flags);  
            zlc_active = 1;  
            did_zlc_setup = 1;  
        }  
    }  
  
    if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {  
        /* Disable zlc cache for second zonelist scan */  
        zlc_active = 0;  
        goto zonelist_scan;  
    }  
    return page;  
}  

从指定的管理区开始按照zonelist中定义的顺序来遍历管理区
如果该管理区的水位线正常，则调用buffered_rmqueue()在该管理区中分配
如果管理区的水位线过低，且zone_reclaim_mode不为0，则会调用zone_reclaim()尝试回收页面(主要针对NUMA架构)；如果zone_reclaim_mode为0，则直接跳过该管理区，继续尝试下一个管理区

[cpp]view plain copy
/*
 * buffered_rmqueue - take a block of 2^order pages out of @zone.
 *
 * order == 0: served from the current CPU's per-cpu page lists, refilled
 *             from the buddy lists via rmqueue_bulk() when empty.
 * order  > 0: served directly from the buddy lists via __rmqueue() under
 *             zone->lock.
 *
 * @preferred_zone is only passed to zone_statistics().  Returns the page,
 * or NULL if nothing could be allocated.  If prep_new_page() returns
 * non-zero the whole allocation is retried from the top.
 */
<span style="font-size:12px;">static inline  
struct page *buffered_rmqueue(struct zone *preferred_zone,  
            struct zone *zone, int order, gfp_t gfp_flags,  
            int migratetype)  
{  
    unsigned long flags;  
    struct page *page;  
    int cold = !!(gfp_flags & __GFP_COLD);  
    int cpu;  
  
again:  
    cpu  = get_cpu();  
    if (likely(order == 0)) {/* order 0: a single page is requested */  
        struct per_cpu_pages *pcp;  
        struct list_head *list;  
  
        pcp = &zone_pcp(zone, cpu)->pcp;/* per-cpu page set of this CPU */  
        list = &pcp->lists[migratetype];/* list for the requested migrate type */  
        local_irq_save(flags);  
  
        /*
         * Empty list: nothing to hand out, so ask rmqueue_bulk() for
         * pcp->batch order-0 pages and add however many it returns to
         * pcp->count.  If the list is still empty afterwards, fail.
         */  
        if (list_empty(list)) {  
            pcp->count += rmqueue_bulk(zone, 0,  
                    pcp->batch, list,  
                    migratetype, cold);  
            if (unlikely(list_empty(list)))  
                goto failed;  
        }  
  
        if (cold)/* cold page wanted: take it from the tail of the list */  
            page = list_entry(list->prev, struct page, lru);  
        else     /* hot page wanted: take it from the head of the list */  
            page = list_entry(list->next, struct page, lru);  
          
        list_del(&page->lru);  
        pcp->count--;  
    } else {  
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {  
            /* 
             * __GFP_NOFAIL is not to be used in new code. 
             * 
             * All __GFP_NOFAIL callers should be fixed so that they 
             * properly detect and handle allocation failures. 
             * 
             * We most definitely don't want callers attempting to 
             * allocate greater than order-1 page units with 
             * __GFP_NOFAIL. 
             */  
            WARN_ON_ONCE(order > 1);  
        }  
        spin_lock_irqsave(&zone->lock, flags);  
        /* Pick a suitable block from the zone's buddy free lists. */  
        page = __rmqueue(zone, order, migratetype);  
        /*
         * Plain spin_unlock(): interrupts stay disabled until the
         * local_irq_restore(flags) further down (or at "failed").
         */  
        spin_unlock(&zone->lock);  
        if (!page)  
            goto failed;  
        __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));  
    }  
  
    __count_zone_vm_events(PGALLOC, zone, 1 << order);  
    zone_statistics(preferred_zone, zone);  
    local_irq_restore(flags);  
    put_cpu();  
  
    VM_BUG_ON(bad_range(zone, page));  
    /* Non-zero from prep_new_page(): discard this page and try again. */  
    if (prep_new_page(page, order, gfp_flags))  
        goto again;  
    return page;  
  
failed:  
    local_irq_restore(flags);  
    put_cpu();  
    return NULL;  
}  
</span>  

该函数分两种情况进行处理，一种是只要求分配单个页框，另一种是要求分配多个连续页框
对于单个页面，内核选择从每CPU页框高速缓存中分配，它的核心描述结构也是MIGRATE_TYPES个链表，只不过链表中的元素都是单个页。这些页分为热页和冷页，所谓热页就是还处在CPU高速缓存中的页，相反，冷页就是不存在于高速缓存中的页。对于单个页框的申请，分配热页可以提高效率。需要注意的是，越靠近链表头的页越热，越靠近链表尾的页越冷，因为每次释放单个页框的时候，页框是插入到链表的头部的，也就是说靠近头部的页框是最近才释放的，因此最有可能存在于高速缓存当中
对于连续的页框分配，通过调用__rmqueue()来完成分配

[cpp]view plain copy
/*
 * __rmqueue - remove a block of 2^order pages from the zone's buddy lists.
 *
 * Tries the requested migrate type first via __rmqueue_smallest(), then
 * __rmqueue_fallback(), and as a last resort retries with
 * MIGRATE_RESERVE.  Returns the page, or NULL if every attempt failed.
 * The "_locked" trace call suggests the caller holds the zone lock.
 */
<span style="font-size:12px;">static struct page *__rmqueue(struct zone *zone, unsigned int order,  
                        int migratetype)  
{  
    struct page *page;  
  
retry_reserve:  
      
    page = __rmqueue_smallest(zone, order, migratetype);  
  
    /*
     * Allocation failed and the migrate type is not MIGRATE_RESERVE
     * (if it already is MIGRATE_RESERVE there is no other migrate type
     * left to fall back to).
     */  
    if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {  
        page = __rmqueue_fallback(zone, order, migratetype);  
  
        /* 
         * Use MIGRATE_RESERVE rather than fail an allocation. goto 
         * is used because __rmqueue_smallest is an inline function 
         * and we want just one call site 
         */  
        if (!page) {  
            migratetype = MIGRATE_RESERVE;  
            goto retry_reserve;  
        }  
    }  
  
    trace_mm_page_alloc_zone_locked(page, order, migratetype);  
    return page;  
}  
</span>  

首先按照指定的迁移类型，调用__rmqueue_smallest()来分配对应的内存块,该函数是伙伴系统的算法体现
如果分配失败，则说明指定的迁移类型中没有充足的内存来满足分配，这时就要按fallbacks中定义的顺序从其他的迁移链表中寻找了，__rmqueue_fallback()函数较为复杂，体现了利用迁移类型来避免碎片的思想，后面单独拿出来分析

[cpp]view plain copy
/*
 * __rmqueue_smallest - core of the buddy search for one migrate type.
 *
 * Starting at @order and going up to MAX_ORDER - 1, takes the first block
 * found on zone->free_area[current_order].free_list[@migratetype],
 * unlinks it, and lets expand() give back the unused part when the block
 * is larger than requested.  Returns the first page of the block, or NULL
 * if every list for this migrate type is empty.
 */
static inline  
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,  
                        int migratetype)  
{  
    unsigned int current_order;  
    struct free_area * area;  
    struct page *page;  
  
    /* Find a page of the appropriate size in the preferred list */  
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {  
  
        /* free_area for the order currently being examined */  
        area = &(zone->free_area[current_order]);  
  
        /* Nothing on this order's list for the migrate type: go one order up. */  
        if (list_empty(&area->free_list[migratetype]))  
            continue;  
          
        /* Page descriptor of the first page of the first suitable block. */  
        page = list_entry(area->free_list[migratetype].next,  
                            struct page, lru);  
        list_del(&page->lru);  
        rmv_page_order(page);/* per the original note: resets page's private field to 0 */  
        area->nr_free--;         /* one block fewer on this order */  
          
        /* Split the block (only does work when current_order > order). */  
        expand(zone, page, order, current_order, area, migratetype);  
        return page;  
    }  
  
    return NULL;  
}  

[cpp]view plain copy
   
  
/*
 * expand - split a block of order @high down to order @low.
 *
 * @page is the first page of a block of 2^high pages and @area is the
 * free_area for order @high.  On every iteration the block is halved: the
 * upper half (starting at page[size]) is put on the free list of the next
 * lower order for @migratetype, and the lower half is kept for further
 * splitting.  When the loop ends, @page heads a block of order @low.
 */
static inline void expand(struct zone *zone, struct page *page,  
    int low, int high, struct free_area *area,  
    int migratetype)  
{  
    unsigned long size = 1 << high;/* number of page frames in a block of order high */  
  
    /*
     * The request was for order low but the block found has order high.
     * While high > low, split the block and add the split-off buddy to the
     * free list of the next lower order.
     */  
    while (high > low) {  
        area--;/* step to the free_area of the next lower order */  
        high--;/* one split performed */  
        size >>= 1;/* each split halves the size */  
        VM_BUG_ON(bad_range(zone, &page[size]));  
  
        /*
         * page[size] is the first page descriptor of the split-off buddy;
         * insert it at the head of the lower order's free list.
         */  
        list_add(&page[size].lru, &area->free_list[migratetype]);  
        area->nr_free++;/* one more block on this order */  
        set_page_order(&page[size], high);/* per the original note: stores high in the private field */  
    }  
}  
  
   

只需要注意一点：一个块可以由块首起始页对应的描述符和order(size)来定位，因此只需要将一个块的第一个页描述符链入相应的链表就可以了。

linux内核工程师 3.03节 Linux伙伴系统(三)--分配页

猜你喜欢