存储管理(二)--学习《Linux内核源代码情景分析》第二章(方便理解,内容在注释中)

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/tyhaotingdege/article/details/79787290

        2.7 物理页面的分配

        分配若干页面时,分配页面用于DMA(direct memory access)当然应该是连续的,其实出于物理存储空间质地一致性考虑,内存页面都是连续分配的。

        分配若干页面时可调用alloc_pages( )来完成,实际上内核出于物理存储空间质地是否一致的考虑,会有两个alloc_pages( ),具体使用哪一个由条件编译选项CONFIG_DISCONTIGMEM来决定:

43      #ifdef CONFIG_DISCONTIGMEM          //用于NUMA(non-uniform memory assess非均匀介质),这里是广义的NUMA,表示地址不连续的物理空间 及 质地不均匀的物理空间       

91      /*

92       * This can be refined. Currently, tries to do round robin, instead

93       * should do concentratic circle search, starting from current node.

94       */

95      struct page * alloc_pages(int gfp_mask, unsigned long order)           //gfp_mask是整数,表示分配策略。order表示要求分配的物理页面值,为2的order次方

96      

97             struct page *ret = 0;

98             pg_data_t *start, *temp;      //将质地均匀且地址连续的物理空间称之为一个节点:pg_data_t

99              #ifndef CONFIG_NUMA     

100                 unsigned long flags;

101                 static pg_data_t *next = 0;

102              #endif

103            

104             if (order >= MAX_ORDER)

105             return NULL;

106              #ifdef CONFIG_NUMA          //在NUMA结构中,可通过 NODE_DATA(numa_node_id())找到cpu所在节点的 pg_data_t队列

107                 temp = NODE_DATA(numa_node_id());

108              #else                         //在UMA结构但是物理空间非连续,也有个pg_data_t队列pgdat_list,分配页面时轮流查询各个节点,以求各个节点负载平衡。

109                 spin_lock_irqsave(&node_lock, flags);

110                 if (!next) next = pgdat_list;

111                 temp = next;

112                 next = next->node_next;

113                 spin_unlock_irqrestore(&node_lock, flags);

114              #endif

115             start = temp;

116             while (temp) {            //从pg_data_t 队列头找到尾,各节点尝试分配页面

117                 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))

118                 return(ret);

119                 temp = temp->node_next;

120             }

121             temp = pgdat_list;

122             while (temp != start) {               //pg_data_t对列尾找到头,各节点尝试分配页面

123                 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))          //alloc_pages_pgdat下面介绍

124                 return(ret);

125                 temp = temp->node_next;

126             }

127         return(0);

128      }

        alloc_pages_pgdat( ):

85      static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,

86     unsigned long order)

87      {

88             return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);        //gfp_mask为数组下标,下边介绍

89      }

        和下边代码表示的UMA结构中的 alloc_pages 对比,可以发现UMA 结构中只有一个节点 contig_page_data:

343      #ifndef CONFIG_DISCONTIGMEM           //与上述NUMA结构预编译宏相反,所以只会有一个被编译

344      static inline struct page * alloc_pages(int gfp_mask, unsigned long order)

345      {

346             /*

347              * Gets optimized away by the compiler.

348              */

349             if (order >= MAX_ORDER)

350             return NULL;

351             return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);         //下面介绍

352      }

        __alloc_pages( ):

[alloc_pages()>__alloc_pages()]

270      /*

271       * This is the 'heart' of the zoned buddy allocator:

272       */

273      struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)       //两个参数:分配策略、需要分配的物理页面

274      {

275             zone_t **zone;

276             int direct_reclaim = 0;

277             unsigned int gfp_mask = zonelist->gfp_mask;

278             struct page * page;

279            

280             /*

281              * Allocations put pressure on the VM subsystem.

282              */

283             memory_pressure++;        //页面压力,很形象,分配页面时增加,归还时减少

284            

285             /*

286              * (If anyone calls gfp from interrupts nonatomically then it

287              * will sooner or later tripped up by a schedule().)

288              *

289              * We are falling back to lower-level zones if allocation

290              * in a higher zone fails.

291              */

292            

293             /*

294              * Can we take pages directly from the inactive_clean

295              * list?

296              */

297             if (order == 0 && (gfp_mask & __GFP_WAIT) &&         

298             !(current->flags & PF_MEMALLOC))                             //依次表示:单个页面、等待分配完成、不是用于管理目的,若满足将局部变量 direct_reclaim 置 1

299                 direct_reclaim = 1;         //表示页面短缺可以从该节点“干净不活跃页面”队列中回收页面。通常回收的页面不能和空闲页面一般连续成块,所以分配一个页面才如此

300            

301             /*

302              * If we are about to get low on free pages and we also have

303              * an inactive page shortage, wake up kswapd.

304              */

305             if (inactive_shortage() > inactive_target / 2 && free_shortage())

306                 wakeup_kswapd(0);               //可分配页面短缺是唤醒该进程腾出一些页面

307             /*

308              * If we are about to get low on free pages and cleaning

309              * the inactive_dirty pages would fix the situation,

310              * wake up bdflush.

311              */

312             else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()

313             && nr_inactive_dirty_pages >= freepages.high)

314                 wakeup_bdflush(0);              //可分配页面短缺是唤醒该进程腾出一些页面

315                

        我们继续往下看如何分配连续的页面:

[alloc_pages()>__alloc_pages()]

316      try_again:

317     /*

318      * First, see if we have any zones with lots of free memory.

319      *

320      * We allocate free memory first because it doesn't contain

321      * any data ... DUH!

322      */

323     zone = zonelist->zones;

324     for (;;) {                           //在分配策略规定的所有管理区内循环

325             zone_t *z = *(zone++);

326             if (!z)

327                 break;

328             if (!z->size)

329                 BUG();

330             

331             if (z->free_pages >= z->pages_low) {               //各个管理区的空闲页面总数大于设定的最低点则进入

332                     page = rmqueue(z, order);               //试图从管理区中分配若干连续页面,下文介绍

333                     if (page)

334                         return page;

335             } else if (z->free_pages < z->pages_min &&

336             waitqueue_active(&kreclaimd_wait)) {                //管理区的空闲页面总述小于设定的最低点 && 有进程kreclaimd_wait在等待队列中睡眠

337                     wake_up_interruptible(&kreclaimd_wait);             //唤醒kreclaimd_wait进程

338             }

339     }

340     

        rmqueue( ):

[alloc_pages()>__alloc_pages()>rmqueue()]

172      static struct page * rmqueue(zone_t *zone, unsigned long order)

173      {

174             free_area_t * area = zone->free_area + order;          //一个管理区有很多空闲队列用free_area数组表示,area指针即指向所需大小的空闲队列队列头

175             unsigned long curr_order = order;

176             struct list_head *head, *curr;

177             unsigned long flags;

178             struct page *page;

179            

180             spin_lock_irqsave(&zone->lock, flags);             //不允许干扰

181             do {

182                 head = &area->free_list;

183                 curr = memlist_next(head);

184                

185                 if (curr != head) {

186                     unsigned int index;

187                    

188                     page = memlist_entry(curr, struct page, list);       //从非空空闲队列取第一个page结构元素

189                     if (BAD_RANGE(zone,page))

190                         BUG();

191                     memlist_del(curr);               //将memlist_entry取到的page元素从队列中删除掉

192                     index = (page - mem_map) - zone->offset;           //管理区的起始页面号

193                     MARK_USED(index, curr_order, area);

194                     zone->free_pages -= 1 << order;  

195                    

196                     page = expand(zone, page, index, order, curr_order, area);            //将分配到的大块物理页面块除去所需页面后剩余部分分解成小块链接到相应队列。下边介绍

197                     spin_unlock_irqrestore(&zone->lock, flags);

198                    

199                     set_page_count(page, 1);                      //page的count += 1。

200                     if (BAD_RANGE(zone,page))

201                         BUG();

202                     DEBUG_ADD_PAGE

203                     return page;

204                 }

205                 curr_order++;

206                 area++;

207             } while (curr_order < MAX_ORDER);

208             spin_unlock_irqrestore(&zone->lock, flags);

209            

210             return NULL;

211      }

        函数expand( ):

[alloc_pages()>__alloc_pages()>rmqueue()>expand()]

150      static inline struct page * expand (zone_t *zone, struct page *page,                //分配页面是按照2的次方来分

151      unsigned long index, int low, int high, free_area_t * area)                       //这里low表示需要的页面order。high表示当前空闲队列的curr_order,可分配的物理页面(2的 curr_order 次方个)

152      { 

153             unsigned long size = 1 << high;         //表示好多个页面

154            

155             while (high > low) {               //只有当 需要的页面 == 可分配的物理页面 时才会跳出循环返回page

156                     if (BAD_RANGE(zone,page))

157                         BUG();

158                     area--;

159                     high--;

160                     size >>= 1;              //size = size \ 2

161                     memlist_add_head(&(page)->list, &(area)->free_list);

162                     MARK_USED(index, high, area);

163                     index += size;

164                     page += size;

165             }

166             if (BAD_RANGE(zone,page))

167                 BUG();

168             return page;

169      }

        就这样,rmqueue( )一直向curr_order大的空闲队列扫描。如果rmqueue( )最后失败,则__alloc_pages( )将通过for循环尝试分配策略规定的下个管理区,直到成功或遇见NULL而最终失败。如果rmqueue( )成功__alloc_pages( )将返回page指针,指向分配的页面块的第一个页面,并将该page的count置为1。

        如果尝试分配策略规定的所有管理区都失败了,还有两种方式分配页面:一是降低管理区最低的页面“水位”,二是将缓冲在管理区的“不活跃干净页面”考虑进来。继续alloc_pages( )

[alloc_pages()>__alloc_pages()]

341         /*

342          * Try to allocate a page from a zone with a HIGH

343          * amount of free + inactive_clean pages.

344          *

345          * If there is a lot of activity, inactive_target

346          * will be high and we'll have a good chance of

347          * finding a page using the HIGH limit.

348          */

349         page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);              //先使用PAGES_HIGH

350         if (page)

351             return page;

352        

353         /*

354          * Then try to allocate a page from a zone with more

355          * than zone->pages_low free + inactive_clean pages.

356          *

357          * When the working set is very large and VM activity

358          * is low, we're most likely to have our allocation

359          * succeed here.

360          */

361         page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);                      //再使用PAGES_LOW

362         if (page)

363             return page;

364        

        __alloc_pages_limit( ),内容是对应上述两种方式分配页面:

[alloc_pages()>__alloc_pages()>__alloc_pages_limit()]

213          #define PAGES_MIN   0

214          #define PAGES_LOW   1

215          #define PAGES_HIGH  2

216        

217          /*

218           * This function does the dirty work for __alloc_pages

219           * and is separated out to keep the code size smaller.

220           * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)

221           */

222          static struct page * __alloc_pages_limit(zonelist_t *zonelist,

223         unsigned long order, int limit, int direct_reclaim)

224          {

225                 zone_t **zone = zonelist->zones;

226                

227                 for (;;) {

228                         zone_t *z = *(zone++);

229                         unsigned long water_mark;

230                        

231                         if (!z)

232                             break;

233                         if (!z->size)

234                             BUG();

235                        

236                         /*

237                          * We allocate if the number of free + inactive_clean

238                          * pages is above the watermark.

239                          */

240                         switch (limit) {                 //更改最低管理区最低页面“水位”

241                             default:

242                             case PAGES_MIN:

243                             water_mark = z->pages_min;

244                             break;

245                             case PAGES_LOW:

246                             water_mark = z->pages_low;

247                             break;

248                             case PAGES_HIGH:

249                             water_mark = z->pages_high;

250                         }

251                        

252                         if (z->free_pages + z->inactive_clean_pages > water_mark) {

253                             struct page *page = NULL;

254                             /* If possible, reclaim a page directly. */

255                             if (direct_reclaim && z->free_pages < z->pages_min + 8)           //满足条件则进入,回收页面

256                                 page = reclaim_page(z);            //从 inactive_clean_list( ) 中回收页面,在“页面定期换出”节末尾讲解

257                             /* If that fails, fall back to rmqueue. */

258                             if (!page)

259                                 page = rmqueue(z, order);

260                             if (page)

261                                 return page;

262                         }

263                 }

264                

265                 /* Found nothing. */

266                 return NULL;

267          }

        如果还是不行,我们还是不能放弃,继续alloc_pages( ):

[alloc_pages()>__alloc_pages()]

365         /*

366          * OK, none of the zones on our zonelist has lots

367          * of pages free.

368          *

369          * We wake up kswapd, in the hope that kswapd will

370          * resolve this situation before memory gets tight.

371          *

372          * We also yield the CPU, because that:

373          * - gives kswapd a chance to do something

374          * - slows down allocations, in particular the

375          *   allocations from the fast allocator that's

376          *   causing the problems ...

377          * - ... which minimises the impact the "bad guys"

378          *   have on the rest of the system

379          * - if we don't have __GFP_IO set, kswapd may be

380          *   able to free some memory we can't free ourselves

381          */

382         wakeup_kswapd(0);          //调用kswapd进程尝试分配页面

383         if (gfp_mask & __GFP_WAIT) {            //如果很执着,分配不到页面宁可等待。进入if语句表示:为其他进程让路,因为其它进程可能释放些内存,并且让kswapd进程可能立即就被调用。

384             __set_current_state(TASK_RUNNING);

385             current->policy |= SCHED_YIELD;

386             schedule();

387         }

388        

389         /*

390          * After waking up kswapd, we try to allocate a page

391          * from any zone which isn't critical yet.

392          *

393          * Kswapd should, in most situations, bring the situation

394          * back to normal in no time.

395          */

396         page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);             //尝试以PAGES_MIN参数调用__alloc_pages_limit( )

397         if (page)

398             return page;

399

        假如还是失败了呢?我们就要先看是哪个进程在要求分配页面了,如果是 kswapd 或 kreclaimd 进程,它们比较重要,这类进程的 task_struct.flags 字段的PF_MEMALLOC标志位为1,我们先看PF_MEMALLOC标志位为0的情况:

[alloc_pages()>__alloc_pages()]

400         /*

401          * Damn, we didn't succeed.

402          *

403          * This can be due to 2 reasons:                   两种可能:可分配页面总数太少、总数不少但是物理块大小不满足

404          * - we're doing a higher-order allocation

405          *   --> move pages to the free list until we succeed

406          * - we're /really/ tight on memory

407          *   --> wait on the kswapd waitqueue until memory is freed

408          */

409         if (!(current->flags & PF_MEMALLOC)) {

410         /*

411          * Are we dealing with a higher order allocation?

412          *

413          * Move pages from the inactive_clean to the free list

414          * in the hope of creating a large, physically contiguous

415          * piece of free memory.

416          */

417         if (order > 0 && (gfp_mask & __GFP_WAIT)) {

418         zone = zonelist->zones;

419         /* First, clean some dirty pages. */

420         current->flags |= PF_MEMALLOC;

421         page_launder(gfp_mask, 1);                //将脏页面洗净

422         current->flags &= ~PF_MEMALLOC;      //不把PF_MEMALLOC设置为1就有可能使得函数在409-476行内递归

423         for (;;) {          //主要任务是在各个管理区回收和释放“干净”页面

424             zone_t *z = *(zone++);

425             if (!z)

426             break;

427             if (!z->size)

428             continue;

429             while (z->inactive_clean_pages) {             //回收和释放“干净”页面核心部分

430                     struct page * page;

431                     /* Move one page to the free list. */

432                     page = reclaim_page(z);

433                     if (!page)

434                         break;

435                     __free_page(page);              //释放页面时会把空闲页面拼装成尽可能大的页面块

436                     /* Try if the allocation succeeds. */

437                     page = rmqueue(z, order);     //每回收一个页面都调用一次试试看能否成功分配页面了

438                     if (page)

439                         return page;

440                 }

441             }

442         }

443         /*

444          * When we arrive here, we are really tight on memory.

445          *

446          * We wake up kswapd and sleep until kswapd wakes us

447          * up again. After that we loop back to the start.

448          *

449          * We have to do this because something else might eat

450          * the memory kswapd frees for us and we need to be

451          * reliable. Note that we don't loop back for higher

452          * order allocations since it is possible that kswapd

453          * simply cannot free a large enough contiguous area

454          * of memory *ever*.

455          */

456         if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {                  //回收了页面还是不够,就只能是可分配页面不够了

457                 wakeup_kswapd(1);

458                 memory_pressure++;

459                 if (!order)                //如果是单个分配单个页面就回到__alloc_page( )开头处

460                     goto try_again;

461                 /*

462                  * If __GFP_IO isn't set, we can't wait on kswapd because

463                  * kswapd just might need some IO locks /we/ are holding ...

464                  *

465                  * SUBTLE: The scheduling point above makes sure that

466                  * kswapd does get the chance to free memory we can't

467                  * free ourselves...

468                  */

469                 } else if (gfp_mask & __GFP_WAIT) {

470                         try_to_free_pages(gfp_mask);            //其实也是kswapd( )进程调用一个的函数

471                         memory_pressure++;

472                         if (!order)

473                         goto try_again;

474                 }

475        

476        }

477        

        如果是PF_MEMALLOC == 1或者是仍无法分配内存,那我们就到了不惜代价的时候了,因为前边在调用__alloc_pages_limit( )实际上是有保留的,例如管理区可分配页面“水位”高于z -> pages_min,继续看__alloc_pages( ):

[alloc_pages()>__alloc_pages()]

478         /*

479          * Final phase: allocate anything we can!

480          *

481          * Higher order allocations, GFP_ATOMIC allocations and

482          * recursive allocations (PF_MEMALLOC) end up here.

483          *

484          * Only recursive allocations can use the very last pages

485          * in the system, otherwise it would be just too easy to

486          * deadlock the system...

487          */

488         zone = zonelist->zones;

489         for (;;) {

490                 zone_t *z = *(zone++);

491                 struct page * page = NULL;

492                 if (!z)

493                     break;

494                 if (!z->size)

495                     BUG();

496                

497                 /*

498                  * SUBTLE: direct_reclaim is only possible if the task

499                  * becomes PF_MEMALLOC while looping above. This will

500                  * happen when the OOM killer selects this task for

501                  * instant execution...93

502                  */

503                 if (direct_reclaim) {

504                     page = reclaim_page(z);

505                     if (page)

506                     return page;

507                 }

508                

509                 /* XXX: is pages_min/4 a good amount to reserve for this? */

510                 if (z->free_pages < z->pages_min / 4 &&

511                 !(current->flags & PF_MEMALLOC))

512                     continue;

513                 page = rmqueue(z, order);

514                 if (page)

515                     return page;

516                 }

517                

518                 /* No luck.. */

519                 printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);

520                 return NULL;

521          }

        2.8物理页面的定期换出

        linux内核中专门设置了定期换出页面的线程kswapd。线程和进程的区别是:线程无自己独立的地址空间,使用的是内核的空间,除了Kswapd以外中断服务程序也是这样的。还有kswapd的代码是静态链接在内核上的

        kswapd代码在mm/vmscan.c中,先看它的建立:

1146         static int __init kswapd_init(void)

1147         {

1148                 printk("Starting kswapd v1.8\n");

1149                 swap_setup();         //根据物理内存大小定义一个全局量page_cluster,这个参数表示一次性从磁盘读取到内存的数量,所以要视内存实际大小而定。

1150                 kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);      //创建线程kswapd

1151                 kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);         //创建线程kreclaimd

1152                 return 0;

1153         }

        swap_setup():

[kswapd_init()>swap_setup()]

293             /*

294              * Perform any setup for the swap system

295              */

296             void __init swap_setup(void)

297             {

298                    /* Use a smaller cluster for memory <16MB or <32MB */

299                    if (num_physpages < ((16 * 1024 * 1024) >> PAGE_SHIFT))

300                    page_cluster = 2;

301                    else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT))

302                    page_cluster = 3;

303                    else

304                    page_cluster = 4;

305             }

        在函数 kswapd_init 中创建了两个线程,这里只讲解kswapd线程。我们假定线程kswapd已创建成功,那么将从函数kswapd()开始执行:

==================== mm/vmscan.c 947 1046 ====================

947          /*

948           * The background pageout daemon, started as a kernel thread

949           * from the init process.

950           *

951           * This basically trickles out pages so that we have _some_

952           * free memory available even if there is no other activity

953           * that frees anything up. This is needed for things like routing

954           * etc, where we otherwise might have all activity going on in

955           * asynchronous contexts that cannot page things out.

956           *

957           * If there are applications that are active memory-allocators

958           * (most normal use), this basically shouldn't matter.

959           */

960          int kswapd(void *unused)

961          {

962                 struct task_struct *tsk = current;            

963                 

964                 tsk->session = 1;

965                 tsk->pgrp = 1;

966                 strcpy(tsk->comm, "kswapd");

967                 sigfillset(&tsk->blocked);

968                 kswapd_task = tsk;

969                 

970                 /*

971                  * Tell the memory management that we're a "memory allocator",

972                  * and that if we need more memory we should get access to it

973                  * regardless (see "__alloc_pages()"). "kswapd" should

974                  * never get caught in the normal page freeing logic.

975                  *

976                  * (Kswapd normally doesn't need memory anyway, but sometimes

977                  * you need a small amount of memory in order to be able to

978                  * page out something else, and this flag essentially protects

979                  * us from recursively trying to free more memory as we're

980                  * trying to free the first piece of memory in the first place).

981                  */

982                 tsk->flags |= PF_MEMALLOC;

983                 

984                 /*

985                  * Kswapd main loop.

986                  */

987                 for (;;) {

988                             static int recalc = 0;

989                             

990                             /* If needed, try to free some memory. */

991                             if (inactive_shortage() || free_shortage()) {         

                                  /*下文介绍 inactive_shortage()。inactive_shortage()内存中可分配和周转的物理页面是否空缺。free_shortage()用来检测某个管理区直接可供分配页面是否小于最低水平。*/

992                                     int wait = 0;

993                                     /* Do we need to do some synchronous flushing? */

994                                     if (waitqueue_active(&kswapd_done))         //查看kswapd_done 队列中是否有函数在等待并将结果作为参数传给do_try_to_free_pages,内核中设备驱动的底层函数将挂入这个队列,下文介绍

995                                             wait = 1;

996                                     do_try_to_free_pages(GFP_KSWAPD, wait);             //设法换出或释放若干页面

997                             }

998                             

999                             /*

1000                              * Do some (very minimal) background scanning. This

1001                              * will scan all pages on the active list once

1002                              * every minute. This clears old referenced bits

1003                              * and moves unused pages to the inactive list.

1004                              */

1005                             refill_inactive_scan(6, 0);

1006                            

1007                             /* Once a second, recalculate some VM stats. */

1008                             if (time_after(jiffies, recalc + HZ)) {

1009                                     recalc = jiffies;

1010                                     recalculate_vm_stats();

1011                             }

1012                            

1013                             /*

1014                              * Wake up everybody waiting for free memory

1015                              * and unplug the disk queue.

1016                              */

1017                             wake_up_all(&kswapd_done);

1018                             run_task_queue(&tq_disk);

1019                            

1020                             /*

1021                              * We go to sleep if either the free page shortage

1022                              * or the inactive page shortage is gone. We do this

1023                              * because:

1024                              * 1) we need no more free pages   or

1025                              * 2) the inactive pages need to be flushed to disk,

1026                              *    it wouldn't help to eat CPU time now ...

1027                              *

1028                              * We go to sleep for one second, but if it's needed

1029                              * we'll be woken up earlier...

1030                              */

1031                             if (!free_shortage() || !inactive_shortage()) {

1032                                     interruptible_sleep_on_timeout(&kswapd_wait, HZ);           //进入睡眠,HZ表示一秒钟多少个时钟中断,这里也就是1秒唤醒一次。当然有时候内核会不到1秒就唤醒一次,比如分配不到页面时。

1033                                     /*

1034                                      * If we couldn't free enough memory, we see if it was

1035                                      * due to the system just not having enough memory.

1036                                      * If that is the case, the only solution is to kill

1037                                      * a process (the alternative is enternal deadlock).

1038                                      *

1039                                      * If there still is enough memory around, we just loop

1040                                      * and try free some more memory...

1041                                      */

1042                             } else if (out_of_memory()) {

1043                                     oom_kill();

1044                             }

1045                     }

1046         }

        kswapd线程可分为两部分:一部分在页面短缺才进行,目的是断开页面映射,使之从活跃状态变为不活跃状态。二部分是将不活跃脏页面写入交换设备变为不活跃干净页面,或者进而回收一些空闲页面。

        先看第一部分,for循环中 inactive_shortage() 表示内存中可分配和周转的物理页面是否空缺:

==================== mm/vmscan.c 805 822 ====================

[kswapd()>inactive_shortage()]

805          /*

806           * How many inactive pages are we short?

807           */

808          int inactive_shortage(void)

809          {

810                 int shortage = 0;

811                

812                 shortage += freepages.high;     //系统应该维持的可分配页面量为freepages.high、inactive_target之和

813                 shortage += inactive_target;

814                 shortage -= nr_free_pages();         //nr_free_pages() 计算多少空闲页面

815                 shortage -= nr_inactive_clean_pages();

816                 shortage -= nr_inactive_dirty_pages;

817                

818                 if (shortage > 0)

819                     return shortage;

820                

821                 return 0;

822          }

        do_try_to_free_pages()之前要判断kswapd_done 队列中是否有函数在等待,这个任务由waitqueue_active()完成:

==================== include/linux/wait.h 152 161 ====================

[kswapd()>waitqueue_active()]

152          static inline int waitqueue_active(wait_queue_head_t *q)

153          {

154                  #if WAITQUEUE_DEBUG

155                 if (!q)

156                 WQ_BUG();

157                 CHECK_MAGIC_WQHEAD(q);

158                  #endif

159                

160                 return !list_empty(&q->task_list);

161          }

        do_try_to_free_pages():

==================== mm/vmscan.c 907 941 ====================

[kswapd()>do_try_to_free_pages()]

907          static int do_try_to_free_pages(unsigned int gfp_mask, int user)

908          {

909                 int ret = 0;

910                

911                 /*

912                  * If we're low on free pages, move pages from the

913                  * inactive_dirty list to the inactive_clean list.

914                  *

915                  * Usually bdflush will have pre-cleaned the pages

916                  * before we get around to moving them to the other

917                  * list, so this is a relatively cheap operation.

918                  */

919                 if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +

920                 nr_inactive_clean_pages())

921                         ret += page_launder(gfp_mask, user);         //把不活跃状态的脏页面洗净,使它们变为立即可分配状态。见下文介绍

922                

923                 /*

924                  * If needed, we move pages from the active list

925                  * to the inactive list. We also "eat" pages from

926                  * the inode and dentry cache whenever we do this.

927                  */

928                 if (free_shortage() || inactive_shortage()) {

929                         shrink_dcache_memory(6, gfp_mask);    //在讲解page_launder()将介绍将介绍

930                         shrink_icache_memory(6, gfp_mask);      //在讲解page_launder()将介绍将介绍

931                         ret += refill_inactive(gfp_mask, user);    //很重要,在讲解完page_launder()后将介绍

932                 } else {

933                         /*

934                          * Reclaim unused slab cache memory.

935                          */

936                         kmem_cache_reap(gfp_mask);          //在讲解完page_launder()后将介绍

937                         ret = 1;

938                 }

939                

940                 return ret;

941          }

        page_launder():

==================== mm/vmscan.c 465 670 ====================

[kswapd()>do_try_to_free_pages()>page_launder()]

465          /**

466           * page_launder - clean dirty inactive pages, move to inactive_clean list

467           * @gfp_mask: what operations we are allowed to do

468           * @sync: should we wait synchronously for the cleaning of pages

469           *

470           * When this function is called, we are most likely low on free +

471           * inactive_clean pages. Since we want to refill those pages as

472           * soon as possible, we'll make two loops over the inactive list,

473           * one to move the already cleaned pages to the inactive_clean lists

474           * and one to (often asynchronously) clean the dirty inactive pages.

475           *

476           * In situations where kswapd cannot keep up, user processes will

477           * end up calling this function. Since the user process needs to

478           * have a page before it can continue with its allocation, we'll

479           * do synchronous page flushing in that case.

480           *

481           * This code is heavily inspired by the FreeBSD source code. Thanks

482           * go out to Matthew Dillon.

483           */

484          #define MAX_LAUNDER (4 * (1 << page_cluster))

485          int page_launder(int gfp_mask, int sync)

486          {

                      /*launder_loop表示不活跃脏页面的扫描次数,扫描第一遍时为0,扫描第二遍时为1。cleaned_pages是累计洗净的页面数。maxlaunder是为了不重复扫描不活跃脏页面而设置的计数*/

487                 int launder_loop, maxscan, cleaned_pages, maxlaunder;      

488                 int can_get_io_locks;

489                 struct list_head * page_lru;

490                 struct page * page;

491                

492                 /*

493                  * We can only grab the IO locks (eg. for flushing dirty

494                  * buffers to disk) if __GFP_IO is set.

495                  */

496                 can_get_io_locks = gfp_mask & __GFP_IO;

497                

498                 launder_loop = 0;

499                 maxlaunder = 0;

500                 cleaned_pages = 0;

501                

502                  dirty_page_rescan:

503                 spin_lock(&pagemap_lru_lock);

504                 maxscan = nr_inactive_dirty_pages;

505                 while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&          //扫描不活跃脏页面

506                 maxscan-- > 0) {

507                         page = list_entry(page_lru, struct page, lru);

508                        

509                         /* Wrong page on list?! (list corruption, should not happen) */

510                         if (!PageInactiveDirty(page)) {               //检查Page_Inactive_Dirty 标志位是否为1,否则该页面本就不应该存在这个队列中,就进入语句将其删除

511                                 printk("VM: page_launder, wrong page on list.\n");

512                                 list_del(page_lru);

513                                 nr_inactive_dirty_pages--;

514                                 page->zone->inactive_dirty_pages--;

515                                 continue;

516                         }

517                        

518                         /* Page is or was in use?  Move it to the active list. */

519                         if (PageTestandClearReferenced(page) || page->age > 0 ||              //检测几个条件才能最终判断页面是否属于不活跃干净页面,如果是用于模拟磁盘的页面也不能换出

520                         (!page->buffers && page_count(page) > 1) ||

521                         page_ramdisk(page)) {

522                                 del_page_from_inactive_dirty_list(page);

523                                 add_page_to_active_list(page);

524                                 continue;

525                         }

526                        

527                         /*

528                          * The page is locked. IO in progress?

529                          * Move it to the back of the list.

530                          */

531                         if (TryLockPage(page)) {        //锁住的页面,移置不活跃脏队列尾

532                                 list_del(page_lru);

533                                 list_add(page_lru, &inactive_dirty_list);

534                                 continue;

535                         }

536                        

537                         /*

538                          * Dirty swap-cache page? Write it out if

539                          * last copy..

540                          */

541                         if (PageDirty(page)) {          //脏页面标志位为1则进入

542                                 int (*writepage)(struct page *) = page->mapping->a_ops->writepage;          //页面写操作函数     

543                                 int result;

544                                

545                                 if (!writepage)                  //没页面写操作函数,就将页面放入 page_active 队列

546                                         goto page_active;

547                                

548                                 /* First time through? Move it to the back of the list */      //第一次扫描将页面放入对列尾

549                                 if (!launder_loop) {

550                                         list_del(page_lru);

551                                         list_add(page_lru, &inactive_dirty_list);

552                                         UnlockPage(page);

553                                         continue;

554                                 }

555                                

556                                 /* OK, do a physical asynchronous write to swap.  */          //第二次扫描将页面写入交换设备

557                                 ClearPageDirty(page);             //将PG_dirty标志位清0,这样若页面写出期间,程序再次进入page_launder也不会写出2次

558                                 page_cache_get(page);            //递增页面使用计数

559                                 spin_unlock(&pagemap_lru_lock);

560                                

561                                 result = writepage(page);           //写出到交换设备

562                                 page_cache_release(page);           //递减页面使用计数

563                                

564                                 /* And re-start the thing.. */

565                                 spin_lock(&pagemap_lru_lock);

566                                 if (result != 1)

567                                         continue;

568                                 /* writepage refused to do anything */

569                                 set_page_dirty(page);

570                                 goto page_active;

571                         }

572                        

573                         /*

574                          * If the page has buffers, try to free the buffer mappings

575                          * associated with this page. If we succeed we either free

576                          * the page (in case it was a buffercache only page) or we

577                          * move the page to the inactive_clean list.

578                          *

579                          * On the first round, we should free all previously cleaned

580                          * buffer pages

581                          */

582                         if (page->buffers) {           //到这里页面肯定不是脏的了。判断是否是文件读写的页面

583                                 int wait, clearedbuf;

584                                 int freed_page = 0;

585                                 /*

586                                  * Since we might be doing disk IO, we have to

587                                  * drop the spinlock and take an extra reference

588                                  * on the page so it doesn't go away from under us.

589                                  */

590                                 del_page_from_inactive_dirty_list(page);

591                                 page_cache_get(page);

592                                 spin_unlock(&pagemap_lru_lock);

593                                

594                                 /* Will we do (asynchronous) IO? */

595                                 if (launder_loop && maxlaunder == 0 && sync)

596                                         wait = 2; /* Synchrounous IO */

597                                 else if (launder_loop && maxlaunder-- > 0)

598                                         wait = 1; /* Async IO */

599                                 else

600                                 wait = 0; /* No IO */

601                                

602                                 /* Try to free the page buffers. */

603                                 clearedbuf = try_to_free_buffers(page, wait);        //释放这种页面,并将页面使用计数减1     

604                                

605                                 /*

606                                  * Re-take the spinlock. Note that we cannot

607                                  * unlock the page yet since we're still

608                                  * accessing the page_struct here...

609                                  */

610                                 spin_lock(&pagemap_lru_lock);

611                                

612                                 /* The buffers were not freed. */

613                                 if (!clearedbuf) {

614                                         add_page_to_inactive_dirty_list(page);

615                                        

616                                         /* The page was only in the buffer cache. */

617                                 } else if (!page->mapping) {

618                                         atomic_dec(&buffermem_pages);

619                                         freed_page = 1;

620                                         cleaned_pages++;

621                                         

622                                         /* The page has more users besides the cache and us. */

623                                 } else if (page_count(page) > 2) {

624                                         add_page_to_active_list(page);

625                                        

626                                         /* OK, we "created" a freeable page. */

627                                 } else /* page->mapping && page_count(page) == 2 */ {

628                                         add_page_to_inactive_clean_list(page);

629                                         cleaned_pages++;

630                                 }

631                                

632                                 /*

633                                  * Unlock the page and drop the extra reference.

634                                  * We can only do it here because we ar accessing

635                                  * the page struct above.

636                                  */

637                                 UnlockPage(page);

638                                 page_cache_release(page);

639                                

640                                 /*

641                                  * If we're freeing buffer cache pages, stop when

642                                  * we've got enough free memory.

643                                  */

644                                 if (freed_page && !free_shortage())

645                                         break;

646                                 continue;

647                         } else if (page->mapping && !PageDirty(page)) {            //页面不是脏的,且其page->mapping指向某个address_space结构,表示已经洗净

648                                 /*

649                                  * If a page had an extra reference in

650                                  * deactivate_page(), we will find it here.

651                                  * Now the page is really freeable, so we

652                                  * move it to the inactive_clean list.

653                                  */

654                                 del_page_from_inactive_dirty_list(page);

655                                 add_page_to_inactive_clean_list(page);

656                                 UnlockPage(page);

657                                 cleaned_pages++;

658                         } else {                   //无法处理页面,放回page_active队列

659                                  page_active:

660                                 /*

661                                  * OK, we don't know what to do with the page.

662                                  * It's no use keeping it here, so we move it to

663                                  * the active list.

664                                  */

665                                 del_page_from_inactive_dirty_list(page);

666                                 add_page_to_active_list(page);

667                                 UnlockPage(page);

668                         }

669         }

670         spin_unlock(&pagemap_lru_lock);

        这里补充:页面的使用计数在分配时记为1,之后不管是“文件读写缓冲”还是“进程使用”,页面使用计数都会++。页面用作文件读写缓冲时是不建立映射的,所以也有可能是不活跃状态。但是进程使用该页面时一定是活跃状态。

        上文说过launder_loop表示不活跃脏页面的扫描次数 ,并在launder_loop 为1时将页面转出,继续阅读page_launder就可以知道:

==================== mm/vmscan.c 671 697 ====================

[kswapd()>do_try_to_free_pages()>page_launder()]

671                

672                 /*

673                  * If we don't have enough free pages, we loop back once

674                  * to queue the dirty pages for writeout. When we were called

675                  * by a user process (that /needs/ a free page) and we didn't

676                  * free anything yet, we wait synchronously on the writeout of

677                  * MAX_SYNC_LAUNDER pages.

678                  *

679                  * We also wake up bdflush, since bdflush should, under most

680                  * loads, flush out the dirty pages before we have to wait on

681                  * IO.

682                  */

683                 if (can_get_io_locks && !launder_loop && free_shortage()) {       //空闲页面是否短缺、参数gfp_mask中__GFP_IO标识位是否为1

684                         launder_loop = 1;       

685                         /* If we cleaned pages, never do synchronous IO. */

686                         if (cleaned_pages)

687                         sync = 0;

688                         /* We only do a few "out of order" flushes. */

689                         maxlaunder = MAX_LAUNDER;

690                         /* Kflushd takes care of the rest. */

691                         wakeup_bdflush(0);

692                         goto dirty_page_rescan;           //跳到502行

693                 }

694                

695                 /* Return the number of pages moved to the inactive_clean list. */

696                 return cleaned_pages;

697          }

        继续看 do_try_to_free_pages() 没读完的代码,此时若可分配的物理页面还是不足,就应该从以下4个方面来回收:

        shrink_dcache_memory(6, gfp_mask);    //用于回收打开文件时建立,但在关闭文件后作为后备未立即回收的 dentry(代表目录项 )数据结构

        shrink_icache_memory(6, gfp_mask);     //用于回收打开文件时建立,但在关闭文件后作为后备未立即回收的 inode(代表文件索引节点)数据结构

        ret += refill_inactive(gfp_mask, user);     //下文详解

        kmem_cache_reap(gfp_mask);        //回收内核运行中动态分配的数据结构,这种结构采用slab管理机制,这个管理机制在后续章节介绍

        refill_inactive ():

==================== mm/vmscan.c 824 905 ====================

[kswapd()>do_try_to_free_pages()>refill_inactive()]

824          /*

825           * We need to make the locks finer granularity, but right

826           * now we need this so that we can do page allocations

827           * without holding the kernel lock etc.

828           *

829           * We want to try to free "count" pages, and we want to

830           * cluster them so that we get good swap-out behaviour.

831           *

832           * OTOH, if we're a user process (and not kswapd), we

833           * really care about latency. In that case we don't try

834           * to free too many pages.

835           */

836          static int refill_inactive(unsigned int gfp_mask, int user)      //user是 kswapd() 里994行传下来的参数,表示kswapd_done队列中是否有等待函数,这决定回收页面是否能慢慢来

837          {

838                 int priority, count, start_count, made_progress;

839                

840                 count = inactive_shortage() + free_shortage();

841                 if (user)

842                         count = (1 << page_cluster);

843                 start_count = count;

844                

845                 /* Always trim SLAB caches when memory gets low. */

846                 kmem_cache_reap(gfp_mask);      //回收slab机制管理的空闲物理页面

847                

848                 priority = 6;

849                 do {

850                         made_progress = 0;

851                        

                             /*need_resched:这个标志位是为强制调度设置,每当cpu进行系统调用或中断后从系统返回到用户,就会检查一次这个标志。但是kswapd是个内核线程,也就是说永不会返回用户空间,只有靠自己检查自己了*/

852                         if (current->need_resched) {         //current 表示当前进程的task_struct结构体,应该是个全局变量。如果其need_resched为1表示某中断服务程序要求调度

853                                 __set_current_state(TASK_RUNNING);        //把本进程状态设置为 TASK_RUNNING,表示继续运行

854                                 schedule();          //让内核对该中断进行一次调度

855                         }

856                        

857                         while (refill_inactive_scan(priority, 1)) {        //循环中主要做的事1,扫描活跃页面,试图将部分转换为不活跃,下文介绍

858                                 made_progress = 1;

859                                 if (--count <= 0)

860                                     goto done;

861                         }

862                        

863                         /*

864                          * don't be too light against the d/i cache since

865                               * refill_inactive() almost never fail when there's

866                               * really plenty of memory free.

867                          */

868                         shrink_dcache_memory(priority, gfp_mask);        //还得试试回收 inode 和 dentry 数据结构

869                         shrink_icache_memory(priority, gfp_mask);

870                        

871                         /*

872                          * Then, try to page stuff out..

873                          */

874                         while (swap_out(priority, gfp_mask)) {      //循环中主要做的事2,扫描某个进程,从其映射表中试图找出可以转入不活跃状态的页面,下文介绍

875                                 made_progress = 1;

876                                 if (--count <= 0)

877                                         goto done;

878                         }

879                        

880                         /*

881                          * If we either have enough free memory, or if

882                          * page_launder() will be able to make enough

883                          * free memory, then stop.

884                          */

885                         if (!inactive_shortage() || !free_shortage())

886                                 goto done;

887                        

888                         /*

889                          * Only switch to a lower "priority" if we

890                          * didn't make any useful progress in the

891                          * last loop.

892                          */

893                         if (!made_progress)

894                                 priority--;

895                 } while (priority >= 0);    //priority不断减小表示优先级不断增大,也即是回收页面的力度不断增加

896                

897                 /* Always end on a refill_inactive.., may sleep... */

898                 while (refill_inactive_scan(0, 1)) {

899                         if (--count <= 0)

900                                 goto done;

901                 }

902

903                  done:

904                 return (count < start_count);

905          }

        refill_inactive_scan( ):

==================== mm/vmscan.c 699 769 ====================

699          /**

700           * refill_inactive_scan - scan the active list and find pages to deactivate

701           * @priority: the priority at which to scan

702           * @oneshot: exit after deactivating one page

703           *

704           * This function will scan a portion of the active list to find

705           * unused pages, those pages will then be moved to the inactive list.

706           */

707          int refill_inactive_scan(unsigned int priority, int oneshot)

708          {

709                 struct list_head * page_lru;

710                 struct page * page;

711                 int maxscan, page_active = 0;   //maxscan控制扫描页面数

712                 int ret = 0;

713                

714                 /* Take the lock while messing with the list... */

715                 spin_lock(&pagemap_lru_lock);

716                 maxscan = nr_active_pages >> priority;      //扫描页面数由优先级来决定的,优先级为0时才是对整个活跃页面进行扫描

717                 while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {      

718                         page = list_entry(page_lru, struct page, lru);

719                        

720                         /* Wrong page on list?! (list corruption, should not happen) */

721                         if (!PageActive(page)) {        //判断页面是否是活跃页面

722                                 printk("VM: refill_inactive, wrong page on list.\n");   

723                                 list_del(page_lru);

724                                 nr_active_pages--;

725                                 continue;

726                         }

727                        

728                         /* Do aging on the pages. */

729                         if (PageTestandClearReferenced(page)) {        //判断页面近期是否受到访问,如果减少页面寿命后为0表示页面已经寿命耗尽

730                                 age_page_up_nolock(page);

731                                 page_active = 1;

732                         } else {

733                                 age_page_down_ageonly(page);

734                                 /*

735                                  * Since we don't hold a reference on the page

736                                  * ourselves, we have to do our test a bit more

737                                  * strict then deactivate_page(). This is needed

738                                  * since otherwise the system could hang shuffling

739                                  * unfreeable pages from the active list to the

740                                  * inactive_dirty list and back again...

741                                  *

742                                  * SUBTLE: we can have buffer pages with count 1.

743                                  */

744                                 if (page->age == 0 && page_count(page) <=      //光是寿命耗尽还不能将其转不活跃。如果页面不作文件读写缓冲但是count>1表示有进程映射,不能转入,等swap_out断开映射才有可能

745                                 (page->buffers ? 2 : 1)) {       //page->buffers表示文件缓冲区

746                                         deactivate_page_nolock(page);

747                                         page_active = 0;

748                                 } else {

749                                         page_active = 1;

750                                 }

751                         }

752                         /*

753                          * If the page is still on the active list, move it

754                          * to the other end of the list. Otherwise it was

755                          * deactivated by age_page_down and we exit successfully.

756                          */

757                         if (page_active || PageActive(page)) {       //对于还不能转入不活跃页面的队列就移到队列尾

758                                 list_del(page_lru);

759                                 list_add(page_lru, &active_list);

760                         } else {

761                                 ret = 1;

762                                 if (oneshot)

763                                 break;

764                         }

765                 }

766                 spin_unlock(&pagemap_lru_lock);

767                

768                 return ret;

769          }

        swap_out()。先讲解一个必要的知识点:内核中不是每个建立了映射的页面都在内存中,有磁盘映射的页面可能在磁盘中,所以“驻内页面”是所有建立了映射的页面的一个子集,其大小为mm->rss。

==================== mm/vmscan.c 297 378 ====================

[kswapd()>do_try_to_free_pages()>refill_inactive()>swap_out()]

297          /*

298           * Select the task with maximal swap_cnt and try to swap out a page.

299           * N.B. This function returns only 0 or 1.  Return values != 1 from

300           * the lower level routines result in continued processing.

301           */

302          #define SWAP_SHIFT 5

303          #define SWAP_MIN 8

304        

305          static int swap_out(unsigned int priority, int gfp_mask)

306          {

307                     int counter;

308                     int __ret = 0;

309                    

310                     /*

311                      * We make one or two passes through the task list, indexed by

312                      * assign = {0, 1}:

313                      *   Pass 1: select the swappable task with maximal RSS that has

314                      *         not yet been swapped out.

315                      *   Pass 2: re-assign rss swap_cnt values, then select as above.

316                      *

317                      * With this approach, there's no need to remember the last task

318                      * swapped out.  If the swap-out fails, we clear swap_cnt so the

319                      * task won't be selected again until all others have been tried.

320                      *

321                      * Think of swap_cnt as a "shadow rss" - it tells us which process

322                      * we want to page out (always try largest first).

323                      */

324                     counter = (nr_threads << SWAP_SHIFT) >> priority;     //其值由内核中进程、线程的数量和优先级决定,最大值为32*nr_threads,表示换出页面的决心。

325                     if (counter < 1)

326                             counter = 1;

327                    

328                     for (; counter >= 0; counter--) {

329                             struct list_head *p;

330                             unsigned long max_cnt = 0;

331                             struct mm_struct *best = NULL;

332                             int assign = 0;

333                             int found_task = 0;

334                             select:

335                             spin_lock(&mmlist_lock);

336                             p = init_mm.mmlist.next;       //内核中所有进程在一个循环队列中,init_mm.mmlist是内核中运行的第一个进程,是其他进程的祖宗

337                             for (; p != &init_mm.mmlist; p = p->next) {      //遍历所有进程,找到 mm->swap_cnt 最大的进程,其中swap_cnt表示进程中未被检查的页面数

338                                     struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);

339                                     if (mm->rss <= 0)  

340                                             continue;

341                                     found_task++;

342                                     /* Refresh swap_cnt? */

343                                     if (assign == 1) {          //将mm->rss右移SWAP_SHIFT后赋给mm->swap_cnt

344                                             mm->swap_cnt = (mm->rss >> SWAP_SHIFT);   

345                                             if (mm->swap_cnt < SWAP_MIN)

346                                                     mm->swap_cnt = SWAP_MIN;

347                                     }

348                                     if (mm->swap_cnt > max_cnt) {        //找到swap_cnt值最大的进程

349                                             max_cnt = mm->swap_cnt;

350                                             best = mm;

351                                     }

352                             }

353                            

354                             /* Make sure it doesn't disappear */

355                             if (best)

356                                     atomic_inc(&best->mm_users);      //递增mm_struct中的使用计数,使这个数据结构多一个用户而不被释放

357                             spin_unlock(&mmlist_lock);

358                            

359                             /*

360                              * We have dropped the tasklist_lock, but we

361                              * know that "mm" still exists: we are running

362                              * with the big kernel lock, and exit_mm()

363                              * cannot race with us.

364                              */

365                             if (!best) {

366                                     if (!assign && found_task > 0) {

367                                             assign = 1;

368                                             goto select;

369                                     }

370                                     break;

371                             } else {

372                                     __ret = swap_out_mm(best, gfp_mask);         //将找到的进程的部分页面换出,下文介绍

373                                     mmput(best);        //还原mm_struct中的使用计数

374                                     break;

375                             }

376                     }

377                     return __ret;

378          }

        swap_out_mm( ):

==================== mm/vmscan.c 257 295 ====================

[kswapd()>do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()]

257          static int swap_out_mm(struct mm_struct * mm, int gfp_mask)

258          {

259                 int result = 0;

260                 unsigned long address;

261                 struct vm_area_struct* vma;

262                

263                 /*

264                  * Go through process' page directory.

265                  */

266                

267                 /*

268                  * Find the proper vm-area after freezing the vma chain

269                  * and ptes.

270                  */

271                 spin_lock(&mm->page_table_lock);

272                 address = mm->swap_address;                //进程起始的虚拟地址

273                 vma = find_vma(mm, address);                    //在mm所表示的进程的虚存区间列表中,找到第一个vm_area_struct->vm_end > address的vma

274                 if (vma) {

275                         if (address < vma->vm_start)

276                                 address = vma->vm_start;

277                        

278                         for (;;) {                                                  //尝试该进程中每一个vma,看能否从中换出页面来。

279                                 result = swap_out_vma(mm, vma, address, gfp_mask);                      //遍历该vma中每个页面,试图换出页面的函数,成功返回1,否则又尝试下一个虚存区间

280                                 if (result)

281                                         goto out_unlock;

282                                 vma = vma->vm_next;

283                                 if (!vma)

284                                         break;

285                                 address = vma->vm_start;

286                         }

287                 }

288                 /* Reset to 0 when we reach the end of address space */

289                 mm->swap_address = 0;     //进程中每个虚存区间都尝试后将其置0

290                 mm->swap_cnt = 0;

291                

292         out_unlock:

293                 spin_unlock(&mm->page_table_lock);

294                 return result;

295          }

        从 swap_out_vma( )一层一层的看代码,swap_out_vma( ) -->swap_out_pgd( ) -->swap_out_pmd( ) -->try_to_swap_out( ),我们直接来看最关键的 try_to_swap_out( ):换出页面表项pte指向的内存页面(用紫色表示该函数)。

==================== mm/vmscan.c 27 56 ====================

[kswapd()>do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()>swap_out_pmd()>try_to_swap_out()]

27          /*

28           * The swap-out functions return 1 if they successfully

29           * threw something out, and we got a free page. It returns

30           * zero if it couldn't do anything, and any other value

31           * indicates it decreased rss, but the page was shared.

32           *

33           * NOTE! If it sleeps, it *must* return 1 to make sure we

34           * don't continue with the swap-out. Otherwise we may be

35           * using a process that no longer actually exists (it might

36           * have died while we slept).

37           */

38          static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address,

              pte_t * page_table, int gfp_mask)

39          {

40                 pte_t pte;

41                 swp_entry_t entry;

42                 struct page * page;

43                 int onlist;

44                

45                 pte = *page_table;               //将函数传入的页面表项参数读给pte

46                 if (!pte_present(pte))            //判断pte所指向的页面是否在内存中

47                         goto out_failed;

48                 page = pte_page(pte);         //将页面表项内容转换为指向物理页面的指针

                    /* #define VALID_PAGE(page) ((page - mem_map) < max_mapnr),判断page在mem_map数组中的下标是否小于max_mapnr,若不小于则表示该page不在mem_map管辖范围内,是无效页面。PageReserved()为真表示页面保留,不允许换出 */

49                 if ((!VALID_PAGE(page)) || PageReserved(page))    

50                         goto out_failed;

51                

52                 if (!mm->swap_cnt)

53                         return 1;

54                

55                 mm->swap_cnt--;                 //未考察的页面减1

56                

        上述中如果pte_present( )判断的结果是错误的,将返回上层函数跳到下个页面,当该页面表中页面走完后返回 上上层函数 跳到下个页面表,同理页面表走完后,就跳到下个虚存区间。

        上述判断完成后,就得到一个具体的页面,下面就是对这个页面进行具体考察,继续阅读try_to_swap_out( ):

==================== mm/vmscan.c 57 74 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()>swap_out_pmd()>try_to_swap_out()]

57         onlist = PageActive(page);         //#define PageActive(page) test_bit(PG_active, &(page)->flags)。检查页面PG_active标志位,判断页面是否活跃

58         /* Don't look at this pte if it's been accessed recently. */

59         if (ptep_test_and_clear_young(page_table)) {                   //页面能否换出取决于页面最近是否受到访问(年轻),该函数就是判断页面是否“年轻”,“年轻”则进入,见下文介绍

60                 age_page_up(page);                //这个页面若是在不活跃对列中则增加页面“观察后换出”的时间,下文介绍

61                 goto out_failed;

62         }

63         if (!onlist)               //页面不“年轻”并且处于不活跃状态,进入处理

64         /* The page is still mapped, so it can't be freeable... */

65                 age_page_down_ageonly(page);                   //不能因为页面“不年轻”就将其立马换出,要待查看时间过后才能将页面换出(页面寿命耗尽),该函数就是减少页面寿命,下文介绍

66        

67         /*

68          * If the page is in active use by us, or if the page

69          * is in active use by others, don't unmap it or

70          * (worse) start unneeded IO.

71          */

72         if (page->age > 0)                  //页面寿命未耗尽

73                 goto out_failed;

74        

        ptep_test_and_clear_young():

==================== include/asm-i386/pgtable.h 285 285 ====================

285        static inline  int ptep_test_and_clear_young(pte_t *ptep) { return test_and_clear_bit(_PAGE_BIT_ACCESSED, ptep); }

        介绍下上述 _PAGE_BIT_ACCESSED 这个页表项标志位:对于已经建立映射的虚拟地址,当访问到对应的物理页面时内存映射机制就自动将该标志位置为1。所以当 test_and_clear_bit() 中的 pte_young() 函数返回1则表示,上一次调用 swap_out_mm() 至今页面至少被访问一次,不能换出。判断完成后将这个标志位置0,为下次检查这个标志位做准备。ptep_test_and_clear_young() 函数内还有 SetPageReferenced() 函数:如果页面还活跃就将 PG_reference 标志位置1,表示受到访问的信息转移到 page 数据结构中。

        age_page_up():

==================== mm/swap.c 125 138 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()>age_page_up()]

125          void age_page_up(struct page * page)

126          {

127                 /*

128                  * We're dealing with an inactive page, move the page

129                  * to the active list.

130                  */

131                 if (!page->age)

132                         activate_page(page);

133                

134                 /* The actual page aging bit */

135                 page->age += PAGE_AGE_ADV;

136                 if (page->age > PAGE_AGE_MAX)

137                         page->age = PAGE_AGE_MAX;

138          }

        这里先解决个问题:为什么页面是有映射的但是却在60行age_page_up() 的注释中看到可能在不活跃队列中呢?答案是:页面因缺页异常而恢复一个不活跃页面的映射时不是马上就将其加入活跃页面队列中,而将这项工作留给前边看到的 page_launder() 来处理,当系统比较闲时来处理,所以是有可能产生上述问题的现象的。

        接着介绍上文的函数 age_page_down_ageonly():

==================== mm/swap.c 103 110 ====================

103          /*

104           * We use this (minimal) function in the case where we

105           * know we can't deactivate the page (yet).

106           */

107          void age_page_down_ageonly(struct page * page)

108          {

109                 page->age /= 2;

110          }

        到了这里的页面原则上已经是可换出对象了,继续阅读代码:

==================== mm/vmscan.c 75 108 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()>swap_out_pmd()>try_to_swap_out()]

75         if (TryLockPage(page))     //需要互斥操作,将页面锁住   #define TryLockPage(page)  test_and_set_bit(PG_locked, &(page)->flags),若返回值为1表示页面已经先被其他进程锁住了

76                 goto out_failed;

77         

78         /* From this point on, the odds are that we're going to

79          * nuke this pte, so read and clear the pte.  This hook

80          * is needed on CPUs which update the accessed and dirty

81          * bits in hardware.

82          */

83         pte = ptep_get_and_clear(page_table);     //再读一次页面表项内容并把表项内容清0,再读一次可保证信息正确,因为存在多核处理器的情况,页面表项随时可能有变化

84         flush_tlb_page(vma, address);

85         

86         /*

87          * Is the page already in the swap cache? If so, then

88          * we can just drop our reference to it without doing

89          * any IO - it's already up-to-date on disk.

90          *

91          * Return 0, as we didn't actually free any real

92          * memory, and we should just continue our scan.

93          */

94         if (PageSwapCache(page)) {                    //#define PageSwapCache(page) test_bit(PG_swap_cache, &(page)->flags),PG_swap_cache标志位为1表示page结构swapper_space中

95                 entry.val = page->index;                 //index是个32位索引项,是指向交换设备上映射的指针

96                 if (pte_dirty(pte))                       //在swapper_space中的页面也有干净和脏页面,如果是脏页面则进入语句

97                         set_page_dirty(page);                //将页面转入脏页面队列

98                  set_swap_pte:

99                 swap_duplicate(entry);                     //对索引项进行检测并将相应的盘上页面使用计数递增,下文介绍

100                 set_pte(page_table, swp_entry_to_pte(entry));                  //页面表项由内存映射改为盘上映射

101                  drop_pte:                //到了这里我们尝试的进程的内存页面映射就已经断开了,下面试试该页面能不能转为不活跃页面

102                 UnlockPage(page);

103                 mm->rss--;

104                 deactivate_page(page);                     //试图将页面置为不活跃页面   下文介绍

105                 page_cache_release(page);                      //递减页面使用计数,释放使用计数为0的页面

106                  out_failed:

107                 return 0;             //处理完这个页面后要返回零,这样swap_out_mm才能依次考虑这个进程的所有页面

108         }

        swap_duplicate():

==================== mm/swapfile.c 820 871 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()>swap_duplicate()]

820          /*

821           * Verify that a swap entry is valid and increment its swap map count.

822           * Kernel_lock is held, which guarantees existance of swap device.

823           *

824           * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as

825           * "permanent", but will be reclaimed by the next swapoff.

826           */

827          int swap_duplicate(swp_entry_t entry)         //以前讲过,页面在盘上则页面表项pte_t 即变为 swp_entry_t 指向盘上页面

828          {

829                     struct swap_info_struct * p;    

830                     unsigned long offset, type;

831                     int result = 0;

832                     

833                     /* Swap entry 0 is illegal */

834                     if (!entry.val)

835                             goto out;

836                     type = SWP_TYPE(entry);

837                     if (type >= nr_swapfiles)

838                             goto bad_file;

839                     p = type + swap_info;

840                     offset = SWP_OFFSET(entry);     

841                     if (offset >= p->max)

842                             goto bad_offset;

843                     if (!p->swap_map[offset])    //如果盘上页面映射建立,则该页面在此数组的相应位置就有着该页面的“共享计数”

844                             goto bad_unused;

845                     /*

846                      * Entry is valid, so increment the map count.

847                      */

848                     swap_device_lock(p);

849                     if (p->swap_map[offset] < SWAP_MAP_MAX)       //递增后的“共享计数”不能大于 SWAP_MAP_MAX

850                             p->swap_map[offset]++;

851                     else {

852                             static int overflow = 0;

853                             if (overflow++ < 5)

854                             printk("VM: swap entry overflow\n");

855                             p->swap_map[offset] = SWAP_MAP_MAX;

856                     }

857                     swap_device_unlock(p);

858                     result = 1;

859                      out:

860                     return result;

861                     

862                      bad_file:

863                     printk("Bad swap file entry %08lx\n", entry.val);

864                     goto out;

865                      bad_offset:

866                     printk("Bad swap offset entry %08lx\n", entry.val);

867                     goto out;

868                      bad_unused:

869                     printk("Unused swap offset entry in swap_dup %08lx\n", entry.val);

870                     goto out;

871          }

        deactivate_page,将页面置为不活跃页面:

==================== mm/swap.c 189 194 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()>deactivate_page()]

189            void deactivate_page(struct page * page)

190            {

191                    spin_lock(&pagemap_lru_lock);

192                    deactivate_page_nolock(page);

193                    spin_unlock(&pagemap_lru_lock);

194            }

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()>deactivate_page()>deactivate_page_nolock()]

154          /**

155           * (de)activate_page - move pages from/to active and inactive lists

156           * @page: the page we want to move

157           * @nolock - are we already holding the pagemap_lru_lock?

158           *

159           * Deactivate_page will move an active page to the right

160           * inactive list, while activate_page will move a page back

161           * from one of the inactive lists to the active list. If

162           * called on a page which is not on any of the lists, the

163           * page is left alone.

164           */

165          void deactivate_page_nolock(struct page * page)

166          {

167                     /*

168                      * One for the cache, one for the extra reference the

169                      * caller has and (maybe) one for the buffers.

170                      *

171                      * This isn't perfect, but works for just about everything.

172                      * Besides, as long as we don't move unfreeable pages to the

173                      * inactive_clean list it doesn't need to be perfect...

174                      */

175                     int maxcount = (page->buffers ? 3 : 2);  //页面空闲时使用计数为0,分配后为1,之后进程映射或文件读写都会使其递增;文件读写不建立映射,但能否将页面转入不活跃队列只取决于有无进程映射,所以需要判断是否

                                                                                       有文件读写。所以当page->buffers非0时,maxcount = 3,此时计数不超过maxcount表示之前断开的已是最后一个进程映射

176                     page->age = 0;   

177                     ClearPageReferenced(page);

178                    

179                     /*

180                      * Don't touch it if it's not on the active list.

181                      * (some pages aren't on any list at all)

182                      */

183                     if (PageActive(page) && page_count(page) <= maxcount && !page_ramdisk(page)) {    //前两个条件不多说,第三个条件是用作模拟硬盘的 ramdisk 页面永远不能是活跃页面

184                             del_page_from_active_list(page);      //脱离活跃队列,下文介绍

185                             add_page_to_inactive_dirty_list(page);    //加入脏不活跃队列,下文介绍

186                     }

187          }

       内核中只有脏不活跃队列,但是每个管理区都有个干净不活跃队列,加入不活跃队列的页面都是先加入脏不活跃队列中:add_page_to_inactive_dirty_list

==================== include/linux/swap.h 234 240 ====================

234          #define del_page_from_active_list(page) { \

235                 list_del(&(page)->lru); \

236                 ClearPageActive(page); \              //设置page结构中PG_active标志位为0

237                 nr_active_pages--; \

238                 DEBUG_ADD_PAGE \

239                 ZERO_PAGE_BUG \

240          }

==================== include/linux/swap.h 217 224 ====================

217          #define add_page_to_inactive_dirty_list(page) { \                

218                     DEBUG_ADD_PAGE \                                                  

219                     ZERO_PAGE_BUG \                                                   

220                     SetPageInactiveDirty(page); \                 //设置page结构中PG_inactive_dirty标志位为1                    

221                     list_add(&(page)->lru, &inactive_dirty_list); \                   

222                     nr_inactive_dirty_pages++; \                                      

223                     page->zone->inactive_dirty_pages++; \                             

224          }                                                                

        上述两个函数都为改变页面的使用计数,回到try_to_swap_out函数第105行,递减页面使用计数:

==================== include/linux/pagemap.h 34 34 ====================

34          #define page_cache_release(x) __free_page(x)

==================== include/linux/mm.h 379 379 ====================

379      #define __free_page(page) __free_pages((page), 0)

==================== mm/page_alloc.c 549 553 ====================

549          void __free_pages(struct page *page, unsigned long order)

550          {

551                 if (!PageReserved(page) && put_page_testzero(page))        //put_page_testzero(),页面使用计数减1,并判断使用计数是否为0

552                 __free_pages_ok(page, order);          //将该页面释放

553          }

==================== include/linux/mm.h 152 152 ====================

152         #define put_page_testzero(p) atomic_dec_and_test(&(p)->count)

        到这里应该纳闷了,要是页面没在swapper_space队列中呢?继续阅读try_to_swap_out代码:

==================== mm/vmscan.c 110 157 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()]

110                 /*

111                  * Is it a clean page? Then it must be recoverable

112                  * by just paging it in again, and we can just drop

113                  * it..

114                  *

115                  * However, this won't actually free any real

116                  * memory, as the page will just be in the page cache

117                  * somewhere, and as such we should just continue

118                  * our scan.

119                  *

120                  * Basically, this just makes it possible for us to do

121                  * some real work in the future in "refill_inactive()".

122                  */

123                 flush_cache_page(vma, address);

124                 if (!pte_dirty(pte))                                  //static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; },通过页面表项判断是否干净页面。

125                         goto drop_pte;                      //解除该页面映射

126                

127                 /*

128                  * Ok, it's really dirty. That means that

129                  * we should either create a new swap cache

130                  * entry for it, or we should write it back

131                  * to its own backing store.

132                  */

133                 if (page->mapping) {          //是否通过mmap()建立的映射,mapping指向相应的address_space结构

134                         set_page_dirty(page);           //到达这里的页面都是脏页面。下文介绍

135                         goto drop_pte;

136                 }

137                

138                 /*

139                  * This is a dirty, swappable page.  First of all,

140                  * get a suitable swap entry for it, and make sure

141                  * we have the swap cache set up to associate the

142                  * page with that swap entry.

143                  */

144                 entry = get_swap_page();                      //到了这里的页面已经过4层条件筛选了,这样的页面要为之分配一个盘上页面:#define  get_swap_page()  __get_swap_page(1)

145                 if (!entry.val)                                       //分配盘上页面失败

146                         goto out_unlock_restore;    /* No swap space left */

147                

148                 /* Add it to the swap cache and mark it dirty */

149                 add_to_swap_cache(page, entry);           //页面链入swapper_space队列及活跃队列中

150                 set_page_dirty(page);                            //将页面转入不活跃脏页面

151                 goto set_swap_pte;

152                

153                  out_unlock_restore:

154                 set_pte(page_table, pte);

155                 UnlockPage(page);

156                 return 0;

157          }

        set_page_dirty( ):

==================== include/linux/mm.h 187 191 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()>set_page_dirty()]

187          static inline void set_page_dirty(struct page * page)

188          {

189                 if (!test_and_set_bit(PG_dirty, &page->flags))           //将page->flags中的PG_dirty标志位置1,若该位原先为0(页面刚变脏)则进入

190                         __set_page_dirty(page);                     //页面移至该文件映射的脏队列

191           }

==================== mm/filemap.c 134 147 ====================

134                  /*

135                   * Add a page to the dirty page list.

136                   */

137              void __set_page_dirty(struct page *page)                       

138              {

139                     struct address_space *mapping = page->mapping;

140                    

141                     spin_lock(&pagecache_lock);

142                     list_del(&page->list);

143                     list_add(&page->list, &mapping->dirty_pages);

144                     spin_unlock(&pagecache_lock);

145                    

146                     mark_inode_dirty_pages(mapping->host);

147                }

        补充:

                    实际页面写出是前边介绍 page_launder( ) 的事。

                    refill_inactive( )如果一直未找到页面转入不活跃队列会重新再循环一次,新的一次循环中可能有的页面就“老化”而满足要求了,但是要是一直不能找到呢?就会调用oom_kill杀掉这个进程。

猜你喜欢

转载自blog.csdn.net/tyhaotingdege/article/details/79787290