存储管理(二)--学习《Linux内核源代码情景分析》第二章(方便理解,内容在注释中)

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/tyhaotingdege/article/details/79787290

        2.7 物理页面的分配

        分配若干页面时,分配页面用于DMA(direct memory access)当然应该是连续的,其实出于物理存储空间质地一致性考虑,内存页面都是连续分配的。

        分配若干页面时可调用alloc_pages( )来完成,实际上内核出于物理存储空间质地是否一致的考虑,会有两个alloc_pages( ),具体使用哪一个由条件编译选项CONFIG_DISCONTIGMEM来决定:

43      #ifdef CONFIG_DISCONTIGMEM          //用于NUMA(non-uniform memory assess非均匀介质),这里是广义的NUMA,表示地址不连续的物理空间 及 质地不均匀的物理空间       

91      /*

92       * This can be refined. Currently, tries to do round robin, instead

93       * should do concentratic circle search, starting from current node.

94       */

95      struct page * alloc_pages(int gfp_mask, unsigned long order)           //gfp_mask是整数,表示分配策略。order表示要求分配的物理页面值,为2的order次方

96      

97             struct page *ret = 0;

98             pg_data_t *start, *temp;      //将质地均匀且地址连续的物理空间称之为一个节点:pg_data_t

99              #ifndef CONFIG_NUMA     

100                 unsigned long flags;

101                 static pg_data_t *next = 0;

102              #endif

103            

104             if (order >= MAX_ORDER)

105             return NULL;

106              #ifdef CONFIG_NUMA          //在NUMA结构中,可通过 NODE_DATA(numa_node_id())找到cpu所在节点的 pg_data_t队列

107                 temp = NODE_DATA(numa_node_id());

108              #else                         //在UMA结构但是物理空间非连续,也有个pg_data_t队列pgdat_list,分配页面时轮流查询各个节点,以求各个节点负载平衡。

109                 spin_lock_irqsave(&node_lock, flags);

110                 if (!next) next = pgdat_list;

111                 temp = next;

112                 next = next->node_next;

113                 spin_unlock_irqrestore(&node_lock, flags);

114              #endif

115             start = temp;

116             while (temp) {            //从pg_data_t 队列头找到尾,各节点尝试分配页面

117                 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))

118                 return(ret);

119                 temp = temp->node_next;

120             }

121             temp = pgdat_list;

122             while (temp != start) {               //pg_data_t对列尾找到头,各节点尝试分配页面

123                 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))          //alloc_pages_pgdat下面介绍

124                 return(ret);

125                 temp = temp->node_next;

126             }

127         return(0);

128      }

        alloc_pages_pgdat( ):

85      static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,

86     unsigned long order)

87      {

88             return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);        //gfp_mask为数组下标,下边介绍

89      }

        和下边代码表示的UMA结构中的 alloc_pages 对比,可以发现UMA 结构中只有一个节点 contig_page_data:

343      #ifndef CONFIG_DISCONTIGMEM           //与上述NUMA结构预编译宏相反,所以只会有一个被编译

344      static inline struct page * alloc_pages(int gfp_mask, unsigned long order)

345      {

346             /*

347              * Gets optimized away by the compiler.

348              */

349             if (order >= MAX_ORDER)

350             return NULL;

351             return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);         //下面介绍

352      }

        __alloc_pages( ):

[alloc_pages()>__alloc_pages()]

270      /*

271       * This is the 'heart' of the zoned buddy allocator:

272       */

273      struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)       //两个参数:分配策略、需要分配的物理页面

274      {

275             zone_t **zone;

276             int direct_reclaim = 0;

277             unsigned int gfp_mask = zonelist->gfp_mask;

278             struct page * page;

279            

280             /*

281              * Allocations put pressure on the VM subsystem.

282              */

283             memory_pressure++;        //页面压力,很形象,分配页面时增加,归还时减少

284            

285             /*

286              * (If anyone calls gfp from interrupts nonatomically then it

287              * will sooner or later tripped up by a schedule().)

288              *

289              * We are falling back to lower-level zones if allocation

290              * in a higher zone fails.

291              */

292            

293             /*

294              * Can we take pages directly from the inactive_clean

295              * list?

296              */

297             if (order == 0 && (gfp_mask & __GFP_WAIT) &&         

298             !(current->flags & PF_MEMALLOC))                             //依次表示:单个页面、等待分配完成、不是用于管理目的,若满足将局部变量 direct_reclaim 置 1

299                 direct_reclaim = 1;         //表示页面短缺可以从该节点“干净不活跃页面”队列中回收页面。通常回收的页面不能和空闲页面一般连续成块,所以分配一个页面才如此

300            

301             /*

302              * If we are about to get low on free pages and we also have

303              * an inactive page shortage, wake up kswapd.

304              */

305             if (inactive_shortage() > inactive_target / 2 && free_shortage())

306                 wakeup_kswapd(0);               //可分配页面短缺是唤醒该进程腾出一些页面

307             /*

308              * If we are about to get low on free pages and cleaning

309              * the inactive_dirty pages would fix the situation,

310              * wake up bdflush.

311              */

312             else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()

313             && nr_inactive_dirty_pages >= freepages.high)

314                 wakeup_bdflush(0);              //可分配页面短缺是唤醒该进程腾出一些页面

315                

        我们继续往下看如何分配连续的页面:

[alloc_pages()>__alloc_pages()]

316      try_again:

317     /*

318      * First, see if we have any zones with lots of free memory.

319      *

320      * We allocate free memory first because it doesn't contain

321      * any data ... DUH!

322      */

323     zone = zonelist->zones;

324     for (;;) {                           //在分配策略规定的所有管理区内循环

325             zone_t *z = *(zone++);

326             if (!z)

327                 break;

328             if (!z->size)

329                 BUG();

330             

331             if (z->free_pages >= z->pages_low) {               //各个管理区的空闲页面总数大于设定的最低点则进入

332                     page = rmqueue(z, order);               //试图从管理区中分配若干连续页面,下文介绍

333                     if (page)

334                         return page;

335             } else if (z->free_pages < z->pages_min &&

336             waitqueue_active(&kreclaimd_wait)) {                //管理区的空闲页面总述小于设定的最低点 && 有进程kreclaimd_wait在等待队列中睡眠

337                     wake_up_interruptible(&kreclaimd_wait);             //唤醒kreclaimd_wait进程

338             }

339     }

340     

        rmqueue( ):

[alloc_pages()>__alloc_pages()>rmqueue()]

172      static struct page * rmqueue(zone_t *zone, unsigned long order)

173      {

174             free_area_t * area = zone->free_area + order;          //一个管理区有很多空闲队列用free_area数组表示,area指针即指向所需大小的空闲队列队列头

175             unsigned long curr_order = order;

176             struct list_head *head, *curr;

177             unsigned long flags;

178             struct page *page;

179            

180             spin_lock_irqsave(&zone->lock, flags);             //不允许干扰

181             do {

182                 head = &area->free_list;

183                 curr = memlist_next(head);

184                

185                 if (curr != head) {

186                     unsigned int index;

187                    

188                     page = memlist_entry(curr, struct page, list);       //从非空空闲队列取第一个page结构元素

189                     if (BAD_RANGE(zone,page))

190                         BUG();

191                     memlist_del(curr);               //将memlist_entry取到的page元素从队列中删除掉

192                     index = (page - mem_map) - zone->offset;           //管理区的起始页面号

193                     MARK_USED(index, curr_order, area);

194                     zone->free_pages -= 1 << order;  

195                    

196                     page = expand(zone, page, index, order, curr_order, area);            //将分配到的大块物理页面块除去所需页面后剩余部分分解成小块链接到相应队列。下边介绍

197                     spin_unlock_irqrestore(&zone->lock, flags);

198                    

199                     set_page_count(page, 1);                      //page的count += 1。

200                     if (BAD_RANGE(zone,page))

201                         BUG();

202                     DEBUG_ADD_PAGE

203                     return page;

204                 }

205                 curr_order++;

206                 area++;

207             } while (curr_order < MAX_ORDER);

208             spin_unlock_irqrestore(&zone->lock, flags);

209            

210             return NULL;

211      }

        函数expand( ):

[alloc_pages()>__alloc_pages()>rmqueue()>expand()]

150      static inline struct page * expand (zone_t *zone, struct page *page,                //分配页面是按照2的次方来分

151      unsigned long index, int low, int high, free_area_t * area)                       //这里low表示需要的页面order。high表示当前空闲队列的curr_order,可分配的物理页面(2的 curr_order 次方个)

152      { 

153             unsigned long size = 1 << high;         //表示好多个页面

154            

155             while (high > low) {               //只有当 需要的页面 == 可分配的物理页面 时才会跳出循环返回page

156                     if (BAD_RANGE(zone,page))

157                         BUG();

158                     area--;

159                     high--;

160                     size >>= 1;              //size = size \ 2

161                     memlist_add_head(&(page)->list, &(area)->free_list);

162                     MARK_USED(index, high, area);

163                     index += size;

164                     page += size;

165             }

166             if (BAD_RANGE(zone,page))

167                 BUG();

168             return page;

169      }

        就这样,rmqueue( )一直向curr_order大的空闲队列扫描。如果rmqueue( )最后失败,则__alloc_pages( )将通过for循环尝试分配策略规定的下个管理区,直到成功或遇见NULL而最终失败。如果rmqueue( )成功__alloc_pages( )将返回page指针,指向分配的页面块的第一个页面,并将该page的count置为1。

        如果尝试分配策略规定的所有管理区都失败了,还有两种方式分配页面:一是降低管理区最低的页面“水位”,二是将缓冲在管理区的“不活跃干净页面”考虑进来。继续alloc_pages( )

[alloc_pages()>__alloc_pages()]

341         /*

342          * Try to allocate a page from a zone with a HIGH

343          * amount of free + inactive_clean pages.

344          *

345          * If there is a lot of activity, inactive_target

346          * will be high and we'll have a good chance of

347          * finding a page using the HIGH limit.

348          */

349         page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);              //先使用PAGES_HIGH

350         if (page)

351             return page;

352        

353         /*

354          * Then try to allocate a page from a zone with more

355          * than zone->pages_low free + inactive_clean pages.

356          *

357          * When the working set is very large and VM activity

358          * is low, we're most likely to have our allocation

359          * succeed here.

360          */

361         page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);                      //再使用PAGES_LOW

362         if (page)

363             return page;

364        

        __alloc_pages_limit( ),内容是对应上述两种方式分配页面:

[alloc_pages()>__alloc_pages()>__alloc_pages_limit()]

213          #define PAGES_MIN   0

214          #define PAGES_LOW   1

215          #define PAGES_HIGH  2

216        

217          /*

218           * This function does the dirty work for __alloc_pages

219           * and is separated out to keep the code size smaller.

220           * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)

221           */

222          static struct page * __alloc_pages_limit(zonelist_t *zonelist,

223         unsigned long order, int limit, int direct_reclaim)

224          {

225                 zone_t **zone = zonelist->zones;

226                

227                 for (;;) {

228                         zone_t *z = *(zone++);

229                         unsigned long water_mark;

230                        

231                         if (!z)

232                             break;

233                         if (!z->size)

234                             BUG();

235                        

236                         /*

237                          * We allocate if the number of free + inactive_clean

238                          * pages is above the watermark.

239                          */

240                         switch (limit) {                 //更改最低管理区最低页面“水位”

241                             default:

242                             case PAGES_MIN:

243                             water_mark = z->pages_min;

244                             break;

245                             case PAGES_LOW:

246                             water_mark = z->pages_low;

247                             break;

248                             case PAGES_HIGH:

249                             water_mark = z->pages_high;

250                         }

251                        

252                         if (z->free_pages + z->inactive_clean_pages > water_mark) {

253                             struct page *page = NULL;

254                             /* If possible, reclaim a page directly. */

255                             if (direct_reclaim && z->free_pages < z->pages_min + 8)           //满足条件则进入,回收页面

256                                 page = reclaim_page(z);            //从 inactive_clean_list( ) 中回收页面,在“页面定期换出”节末尾讲解

257                             /* If that fails, fall back to rmqueue. */

258                             if (!page)

259                                 page = rmqueue(z, order);

260                             if (page)

261                                 return page;

262                         }

263                 }

264                

265                 /* Found nothing. */

266                 return NULL;

267          }

        如果还是不行,我们还是不能放弃,继续alloc_pages( ):

[alloc_pages()>__alloc_pages()]

365         /*

366          * OK, none of the zones on our zonelist has lots

367          * of pages free.

368          *

369          * We wake up kswapd, in the hope that kswapd will

370          * resolve this situation before memory gets tight.

371          *

372          * We also yield the CPU, because that:

373          * - gives kswapd a chance to do something

374          * - slows down allocations, in particular the

375          *   allocations from the fast allocator that's

376          *   causing the problems ...

377          * - ... which minimises the impact the "bad guys"

378          *   have on the rest of the system

379          * - if we don't have __GFP_IO set, kswapd may be

380          *   able to free some memory we can't free ourselves

381          */

382         wakeup_kswapd(0);          //调用kswapd进程尝试分配页面

383         if (gfp_mask & __GFP_WAIT) {            //如果很执着,分配不到页面宁可等待。进入if语句表示:为其他进程让路,因为其它进程可能释放些内存,并且让kswapd进程可能立即就被调用。

384             __set_current_state(TASK_RUNNING);

385             current->policy |= SCHED_YIELD;

386             schedule();

387         }

388        

389         /*

390          * After waking up kswapd, we try to allocate a page

391          * from any zone which isn't critical yet.

392          *

393          * Kswapd should, in most situations, bring the situation

394          * back to normal in no time.

395          */

396         page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);             //尝试以PAGES_MIN参数调用__alloc_pages_limit( )

397         if (page)

398             return page;

399

        假如还是失败了呢?我们就要先看是哪个进程在要求分配页面了,如果是 kswapd 或 kreclaimd 进程,它们比较重要,这类进程的 task_struct.flags 字段的PF_MEMALLOC标志位为1,我们先看PF_MEMALLOC标志位为0的情况:

[alloc_pages()>__alloc_pages()]

400         /*

401          * Damn, we didn't succeed.

402          *

403          * This can be due to 2 reasons:                   两种可能:可分配页面总数太少、总数不少但是物理块大小不满足

404          * - we're doing a higher-order allocation

405          *   --> move pages to the free list until we succeed

406          * - we're /really/ tight on memory

407          *   --> wait on the kswapd waitqueue until memory is freed

408          */

409         if (!(current->flags & PF_MEMALLOC)) {

410         /*

411          * Are we dealing with a higher order allocation?

412          *

413          * Move pages from the inactive_clean to the free list

414          * in the hope of creating a large, physically contiguous

415          * piece of free memory.

416          */

417         if (order > 0 && (gfp_mask & __GFP_WAIT)) {

418         zone = zonelist->zones;

419         /* First, clean some dirty pages. */

420         current->flags |= PF_MEMALLOC;

421         page_launder(gfp_mask, 1);                //将脏页面洗净

422         current->flags &= ~PF_MEMALLOC;      //不把PF_MEMALLOC设置为1就有可能使得函数在409-476行内递归

423         for (;;) {          //主要任务是在各个管理区回收和释放“干净”页面

424             zone_t *z = *(zone++);

425             if (!z)

426             break;

427             if (!z->size)

428             continue;

429             while (z->inactive_clean_pages) {             //回收和释放“干净”页面核心部分

430                     struct page * page;

431                     /* Move one page to the free list. */

432                     page = reclaim_page(z);

433                     if (!page)

434                         break;

435                     __free_page(page);              //释放页面时会把空闲页面拼装成尽可能大的页面块

436                     /* Try if the allocation succeeds. */

437                     page = rmqueue(z, order);     //每回收一个页面都调用一次试试看能否成功分配页面了

438                     if (page)

439                         return page;

440                 }

441             }

442         }

443         /*

444          * When we arrive here, we are really tight on memory.

445          *

446          * We wake up kswapd and sleep until kswapd wakes us

447          * up again. After that we loop back to the start.

448          *

449          * We have to do this because something else might eat

450          * the memory kswapd frees for us and we need to be

451          * reliable. Note that we don't loop back for higher

452          * order allocations since it is possible that kswapd

453          * simply cannot free a large enough contiguous area

454          * of memory *ever*.

455          */

456         if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {                  //回收了页面还是不够,就只能是可分配页面不够了

457                 wakeup_kswapd(1);

458                 memory_pressure++;

459                 if (!order)                //如果是单个分配单个页面就回到__alloc_page( )开头处

460                     goto try_again;

461                 /*

462                  * If __GFP_IO isn't set, we can't wait on kswapd because

463                  * kswapd just might need some IO locks /we/ are holding ...

464                  *

465                  * SUBTLE: The scheduling point above makes sure that

466                  * kswapd does get the chance to free memory we can't

467                  * free ourselves...

468                  */

469                 } else if (gfp_mask & __GFP_WAIT) {

470                         try_to_free_pages(gfp_mask);            //其实也是kswapd( )进程调用一个的函数

471                         memory_pressure++;

472                         if (!order)

473                         goto try_again;

474                 }

475        

476        }

477        

        如果是PF_MEMALLOC == 1或者是仍无法分配内存,那我们就到了不惜代价的时候了,因为前边在调用__alloc_pages_limit( )实际上是有保留的,例如管理区可分配页面“水位”高于z -> pages_min,继续看__alloc_pages( ):

[alloc_pages()>__alloc_pages()]

478         /*

479          * Final phase: allocate anything we can!

480          *

481          * Higher order allocations, GFP_ATOMIC allocations and

482          * recursive allocations (PF_MEMALLOC) end up here.

483          *

484          * Only recursive allocations can use the very last pages

485          * in the system, otherwise it would be just too easy to

486          * deadlock the system...

487          */

488         zone = zonelist->zones;

489         for (;;) {

490                 zone_t *z = *(zone++);

491                 struct page * page = NULL;

492                 if (!z)

493                     break;

494                 if (!z->size)

495                     BUG();

496                

497                 /*

498                  * SUBTLE: direct_reclaim is only possible if the task

499                  * becomes PF_MEMALLOC while looping above. This will

500                  * happen when the OOM killer selects this task for

501                  * instant execution...93

502                  */

503                 if (direct_reclaim) {

504                     page = reclaim_page(z);

505                     if (page)

506                     return page;

507                 }

508                

509                 /* XXX: is pages_min/4 a good amount to reserve for this? */

510                 if (z->free_pages < z->pages_min / 4 &&

511                 !(current->flags & PF_MEMALLOC))

512                     continue;

513                 page = rmqueue(z, order);

514                 if (page)

515                     return page;

516                 }

517                

518                 /* No luck.. */

519                 printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);

520                 return NULL;

521          }

        2.8物理页面的定期换出

        linux内核中专门设置了定期换出页面的线程kswapd。线程和进程的区别是:线程无自己独立的地址空间,使用的是内核的空间,除了Kswapd以外中断服务程序也是这样的。还有kswapd的代码是静态链接在内核上的

        kswapd代码在mm/vmscan.c中,先看它的建立:

1146         static int __init kswapd_init(void)

1147         {

1148                 printk("Starting kswapd v1.8\n");

1149                 swap_setup();         //根据物理内存大小定义一个全局量page_cluster,这个参数表示一次性从磁盘读取到内存的数量,所以要视内存实际大小而定。

1150                 kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);      //创建线程kswapd

1151                 kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);         //创建线程kreclaimd

1152                 return 0;

1153         }

        swap_setup():

[kswapd_init()>swap_setup()]

293             /*

294              * Perform any setup for the swap system

295              */

296             void __init swap_setup(void)

297             {

298                    /* Use a smaller cluster for memory <16MB or <32MB */

299                    if (num_physpages < ((16 * 1024 * 1024) >> PAGE_SHIFT))

300                    page_cluster = 2;

301                    else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT))

302                    page_cluster = 3;

303                    else

304                    page_cluster = 4;

305             }

        在函数 kswapd_init 中创建了两个线程,这里只讲解kswapd线程。我们假定线程kswapd已创建成功,那么将从函数kswapd()开始执行:

==================== mm/vmscan.c 947 1046 ====================

947          /*

948           * The background pageout daemon, started as a kernel thread

949           * from the init process.

950           *

951           * This basically trickles out pages so that we have _some_

952           * free memory available even if there is no other activity

953           * that frees anything up. This is needed for things like routing

954           * etc, where we otherwise might have all activity going on in

955           * asynchronous contexts that cannot page things out.

956           *

957           * If there are applications that are active memory-allocators

958           * (most normal use), this basically shouldn't matter.

959           */

960          int kswapd(void *unused)

961          {

962                 struct task_struct *tsk = current;            

963                 

964                 tsk->session = 1;

965                 tsk->pgrp = 1;

966                 strcpy(tsk->comm, "kswapd");

967                 sigfillset(&tsk->blocked);

968                 kswapd_task = tsk;

969                 

970                 /*

971                  * Tell the memory management that we're a "memory allocator",

972                  * and that if we need more memory we should get access to it

973                  * regardless (see "__alloc_pages()"). "kswapd" should

974                  * never get caught in the normal page freeing logic.

975                  *

976                  * (Kswapd normally doesn't need memory anyway, but sometimes

977                  * you need a small amount of memory in order to be able to

978                  * page out something else, and this flag essentially protects

979                  * us from recursively trying to free more memory as we're

980                  * trying to free the first piece of memory in the first place).

981                  */

982                 tsk->flags |= PF_MEMALLOC;

983                 

984                 /*

985                  * Kswapd main loop.

986                  */

987                 for (;;) {

988                             static int recalc = 0;

989                             

990                             /* If needed, try to free some memory. */

991                             if (inactive_shortage() || free_shortage()) {         

                                  /*下文介绍 inactive_shortage()。inactive_shortage()内存中可分配和周转的物理页面是否空缺。free_shortage()用来检测某个管理区直接可供分配页面是否小于最低水平。*/

992                                     int wait = 0;

993                                     /* Do we need to do some synchronous flushing? */

994                                     if (waitqueue_active(&kswapd_done))         //查看kswapd_done 队列中是否有函数在等待并将结果作为参数传给do_try_to_free_pages,内核中设备驱动的底层函数将挂入这个队列,下文介绍

995                                             wait = 1;

996                                     do_try_to_free_pages(GFP_KSWAPD, wait);             //设法换出或释放若干页面

997                             }

998                             

999                             /*

1000                              * Do some (very minimal) background scanning. This

1001                              * will scan all pages on the active list once

1002                              * every minute. This clears old referenced bits

1003                              * and moves unused pages to the inactive list.

1004                              */

1005                             refill_inactive_scan(6, 0);

1006                            

1007                             /* Once a second, recalculate some VM stats. */

1008                             if (time_after(jiffies, recalc + HZ)) {

1009                                     recalc = jiffies;

1010                                     recalculate_vm_stats();

1011                             }

1012                            

1013                             /*

1014                              * Wake up everybody waiting for free memory

1015                              * and unplug the disk queue.

1016                              */

1017                             wake_up_all(&kswapd_done);

1018                             run_task_queue(&tq_disk);

1019                            

1020                             /*

1021                              * We go to sleep if either the free page shortage

1022                              * or the inactive page shortage is gone. We do this

1023                              * because:

1024                              * 1) we need no more free pages   or

1025                              * 2) the inactive pages need to be flushed to disk,

1026                              *    it wouldn't help to eat CPU time now ...

1027                              *

1028                              * We go to sleep for one second, but if it's needed

1029                              * we'll be woken up earlier...

1030                              */

1031                             if (!free_shortage() || !inactive_shortage()) {

1032                                     interruptible_sleep_on_timeout(&kswapd_wait, HZ);           //进入睡眠,HZ表示一秒钟多少个时钟中断,这里也就是1秒唤醒一次。当然有时候内核会不到1秒就唤醒一次,比如分配不到页面时。

1033                                     /*

1034                                      * If we couldn't free enough memory, we see if it was

1035                                      * due to the system just not having enough memory.

1036                                      * If that is the case, the only solution is to kill

1037                                      * a process (the alternative is enternal deadlock).

1038                                      *

1039                                      * If there still is enough memory around, we just loop

1040                                      * and try free some more memory...

1041                                      */

1042                             } else if (out_of_memory()) {

1043                                     oom_kill();

1044                             }

1045                     }

1046         }

        kswapd线程可分为两部分:一部分在页面短缺才进行,目的是断开页面映射,使之从活跃状态变为不活跃状态。二部分是将不活跃脏页面写入交换设备变为不活跃干净页面,或者进而回收一些空闲页面。

        先看第一部分,for循环中 inactive_shortage() 表示内存中可分配和周转的物理页面是否空缺:

==================== mm/vmscan.c 805 822 ====================

[kswapd()>inactive_shortage()]

805          /*

806           * How many inactive pages are we short?

807           */

808          int inactive_shortage(void)

809          {

810                 int shortage = 0;

811                

812                 shortage += freepages.high;     //系统应该维持的可分配页面量为freepages.high、inactive_target之和

813                 shortage += inactive_target;

814                 shortage -= nr_free_pages();         //nr_free_pages() 计算多少空闲页面

815                 shortage -= nr_inactive_clean_pages();

816                 shortage -= nr_inactive_dirty_pages;

817                

818                 if (shortage > 0)

819                     return shortage;

820                

821                 return 0;

822          }

        do_try_to_free_pages()之前要判断kswapd_done 队列中是否有函数在等待,这个任务由waitqueue_active()完成:

==================== include/linux/wait.h 152 161 ====================

[kswapd()>waitqueue_active()]

152          static inline int waitqueue_active(wait_queue_head_t *q)

153          {

154                  #if WAITQUEUE_DEBUG

155                 if (!q)

156                 WQ_BUG();

157                 CHECK_MAGIC_WQHEAD(q);

158                  #endif

159                

160                 return !list_empty(&q->task_list);

161          }

        do_try_to_free_pages():

==================== mm/vmscan.c 907 941 ====================

[kswapd()>do_try_to_free_pages()]

907          static int do_try_to_free_pages(unsigned int gfp_mask, int user)

908          {

909                 int ret = 0;

910                

911                 /*

912                  * If we're low on free pages, move pages from the

913                  * inactive_dirty list to the inactive_clean list.

914                  *

915                  * Usually bdflush will have pre-cleaned the pages

916                  * before we get around to moving them to the other

917                  * list, so this is a relatively cheap operation.

918                  */

919                 if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +

920                 nr_inactive_clean_pages())

921                         ret += page_launder(gfp_mask, user);         //把不活跃状态的脏页面洗净,使它们变为立即可分配状态。见下文介绍

922                

923                 /*

924                  * If needed, we move pages from the active list

925                  * to the inactive list. We also "eat" pages from

926                  * the inode and dentry cache whenever we do this.

927                  */

928                 if (free_shortage() || inactive_shortage()) {

929                         shrink_dcache_memory(6, gfp_mask);    //在讲解page_launder()将介绍将介绍

930                         shrink_icache_memory(6, gfp_mask);      //在讲解page_launder()将介绍将介绍

931                         ret += refill_inactive(gfp_mask, user);    //很重要,在讲解完page_launder()后将介绍

932                 } else {

933                         /*

934                          * Reclaim unused slab cache memory.

935                          */

936                         kmem_cache_reap(gfp_mask);          //在讲解完page_launder()后将介绍

937                         ret = 1;

938                 }

939                

940                 return ret;

941          }

        page_launder():

==================== mm/vmscan.c 465 670 ====================

[kswapd()>do_try_to_free_pages()>page_launder()]

465          /**

466           * page_launder - clean dirty inactive pages, move to inactive_clean list

467           * @gfp_mask: what operations we are allowed to do

468           * @sync: should we wait synchronously for the cleaning of pages

469           *

470           * When this function is called, we are most likely low on free +

471           * inactive_clean pages. Since we want to refill those pages as

472           * soon as possible, we'll make two loops over the inactive list,

473           * one to move the already cleaned pages to the inactive_clean lists

474           * and one to (often asynchronously) clean the dirty inactive pages.

475           *

476           * In situations where kswapd cannot keep up, user processes will

477           * end up calling this function. Since the user process needs to

478           * have a page before it can continue with its allocation, we'll

479           * do synchronous page flushing in that case.

480           *

481           * This code is heavily inspired by the FreeBSD source code. Thanks

482           * go out to Matthew Dillon.

483           */

484          #define MAX_LAUNDER (4 * (1 << page_cluster))

485          int page_launder(int gfp_mask, int sync)

486          {

                      /*launder_loop表示不活跃脏页面的扫描次数,扫描第一遍时为0,扫描第二遍时为1。cleaned_pages是累计洗净的页面数。maxlaunder是为了不重复扫描不活跃脏页面而设置的计数*/

487                 int launder_loop, maxscan, cleaned_pages, maxlaunder;      

488                 int can_get_io_locks;

489                 struct list_head * page_lru;

490                 struct page * page;

491                

492                 /*

493                  * We can only grab the IO locks (eg. for flushing dirty

494                  * buffers to disk) if __GFP_IO is set.

495                  */

496                 can_get_io_locks = gfp_mask & __GFP_IO;

497                

498                 launder_loop = 0;

499                 maxlaunder = 0;

500                 cleaned_pages = 0;

501                

502                  dirty_page_rescan:

503                 spin_lock(&pagemap_lru_lock);

504                 maxscan = nr_inactive_dirty_pages;

505                 while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&          //扫描不活跃脏页面

506                 maxscan-- > 0) {

507                         page = list_entry(page_lru, struct page, lru);

508                        

509                         /* Wrong page on list?! (list corruption, should not happen) */

510                         if (!PageInactiveDirty(page)) {               //检查Page_Inactive_Dirty 标志位是否为1,否则该页面本就不应该存在这个队列中,就进入语句将其删除

511                                 printk("VM: page_launder, wrong page on list.\n");

512                                 list_del(page_lru);

513                                 nr_inactive_dirty_pages--;

514                                 page->zone->inactive_dirty_pages--;

515                                 continue;

516                         }

517                        

518                         /* Page is or was in use?  Move it to the active list. */

519                         if (PageTestandClearReferenced(page) || page->age > 0 ||              //检测几个条件才能最终判断页面是否属于不活跃干净页面,如果是用于模拟磁盘的页面也不能换出

520                         (!page->buffers && page_count(page) > 1) ||

521                         page_ramdisk(page)) {

522                                 del_page_from_inactive_dirty_list(page);

523                                 add_page_to_active_list(page);

524                                 continue;

525                         }

526                        

527                         /*

528                          * The page is locked. IO in progress?

529                          * Move it to the back of the list.

530                          */

531                         if (TryLockPage(page)) {        //锁住的页面,移置不活跃脏队列尾

532                                 list_del(page_lru);

533                                 list_add(page_lru, &inactive_dirty_list);

534                                 continue;

535                         }

536                        

537                         /*

538                          * Dirty swap-cache page? Write it out if

539                          * last copy..

540                          */

541                         if (PageDirty(page)) {          //脏页面标志位为1则进入

542                                 int (*writepage)(struct page *) = page->mapping->a_ops->writepage;          //页面写操作函数     

543                                 int result;

544                                

545                                 if (!writepage)                  //没页面写操作函数,就将页面放入 page_active 队列

546                                         goto page_active;

547                                

548                                 /* First time through? Move it to the back of the list */      //第一次扫描将页面放入对列尾

549                                 if (!launder_loop) {

550                                         list_del(page_lru);

551                                         list_add(page_lru, &inactive_dirty_list);

552                                         UnlockPage(page);

553                                         continue;

554                                 }

555                                

556                                 /* OK, do a physical asynchronous write to swap.  */          //第二次扫描将页面写入交换设备

557                                 ClearPageDirty(page);             //将PG_dirty标志位清0,这样若页面写出期间,程序再次进入page_launder也不会写出2次

558                                 page_cache_get(page);            //递增页面使用计数

559                                 spin_unlock(&pagemap_lru_lock);

560                                

561                                 result = writepage(page);           //写出到交换设备

562                                 page_cache_release(page);           //递减页面使用计数

563                                

564                                 /* And re-start the thing.. */

565                                 spin_lock(&pagemap_lru_lock);

566                                 if (result != 1)

567                                         continue;

568                                 /* writepage refused to do anything */

569                                 set_page_dirty(page);

570                                 goto page_active;

571                         }

572                        

573                         /*

574                          * If the page has buffers, try to free the buffer mappings

575                          * associated with this page. If we succeed we either free

576                          * the page (in case it was a buffercache only page) or we

577                          * move the page to the inactive_clean list.

578                          *

579                          * On the first round, we should free all previously cleaned

580                          * buffer pages

581                          */

582                         if (page->buffers) {           //到这里页面肯定不是脏的了。判断是否是文件读写的页面

583                                 int wait, clearedbuf;

584                                 int freed_page = 0;

585                                 /*

586                                  * Since we might be doing disk IO, we have to

587                                  * drop the spinlock and take an extra reference

588                                  * on the page so it doesn't go away from under us.

589                                  */

590                                 del_page_from_inactive_dirty_list(page);

591                                 page_cache_get(page);

592                                 spin_unlock(&pagemap_lru_lock);

593                                

594                                 /* Will we do (asynchronous) IO? */

595                                 if (launder_loop && maxlaunder == 0 && sync)

596                                         wait = 2; /* Synchrounous IO */

597                                 else if (launder_loop && maxlaunder-- > 0)

598                                         wait = 1; /* Async IO */

599                                 else

600                                 wait = 0; /* No IO */

601                                

602                                 /* Try to free the page buffers. */

603                                 clearedbuf = try_to_free_buffers(page, wait);        //释放这种页面,并将页面使用计数减1     

604                                

605                                 /*

606                                  * Re-take the spinlock. Note that we cannot

607                                  * unlock the page yet since we're still

608                                  * accessing the page_struct here...

609                                  */

610                                 spin_lock(&pagemap_lru_lock);

611                                

612                                 /* The buffers were not freed. */

613                                 if (!clearedbuf) {

614                                         add_page_to_inactive_dirty_list(page);

615                                        

616                                         /* The page was only in the buffer cache. */

617                                 } else if (!page->mapping) {

618                                         atomic_dec(&buffermem_pages);

619                                         freed_page = 1;

620                                         cleaned_pages++;

621                                         

622                                         /* The page has more users besides the cache and us. */

623                                 } else if (page_count(page) > 2) {

624                                         add_page_to_active_list(page);

625                                        

626                                         /* OK, we "created" a freeable page. */

627                                 } else /* page->mapping && page_count(page) == 2 */ {

628                                         add_page_to_inactive_clean_list(page);

629                                         cleaned_pages++;

630                                 }

631                                

632                                 /*

633                                  * Unlock the page and drop the extra reference.

634                                  * We can only do it here because we ar accessing

635                                  * the page struct above.

636                                  */

637                                 UnlockPage(page);

638                                 page_cache_release(page);

639                                

640                                 /*

641                                  * If we're freeing buffer cache pages, stop when

642                                  * we've got enough free memory.

643                                  */

644                                 if (freed_page && !free_shortage())

645                                         break;

646                                 continue;

647                         } else if (page->mapping && !PageDirty(page)) {            //页面不是脏的,且其page->mapping指向某个address_space结构,表示已经洗净

648                                 /*

649                                  * If a page had an extra reference in

650                                  * deactivate_page(), we will find it here.

651                                  * Now the page is really freeable, so we

652                                  * move it to the inactive_clean list.

653                                  */

654                                 del_page_from_inactive_dirty_list(page);

655                                 add_page_to_inactive_clean_list(page);

656                                 UnlockPage(page);

657                                 cleaned_pages++;

658                         } else {                   //无法处理页面,放回page_active队列

659                                  page_active:

660                                 /*

661                                  * OK, we don't know what to do with the page.

662                                  * It's no use keeping it here, so we move it to

663                                  * the active list.

664                                  */

665                                 del_page_from_inactive_dirty_list(page);

666                                 add_page_to_active_list(page);

667                                 UnlockPage(page);

668                         }

669         }

670         spin_unlock(&pagemap_lru_lock);

        这里补充:页面的使用计数在分配时记为1,之后不管是“文件读写缓冲”还是“进程使用”,页面使用计数都会++。页面用作文件读写缓冲时是不建立映射的,所以也有可能是不活跃状态。但是进程使用该页面时一定是活跃状态。

        上文说过launder_loop表示不活跃脏页面的扫描次数 ,并在launder_loop 为1时将页面转出,继续阅读page_launder就可以知道:

==================== mm/vmscan.c 671 697 ====================

[kswapd()>do_try_to_free_pages()>page_launder()]

671                

672                 /*

673                  * If we don't have enough free pages, we loop back once

674                  * to queue the dirty pages for writeout. When we were called

675                  * by a user process (that /needs/ a free page) and we didn't

676                  * free anything yet, we wait synchronously on the writeout of

677                  * MAX_SYNC_LAUNDER pages.

678                  *

679                  * We also wake up bdflush, since bdflush should, under most

680                  * loads, flush out the dirty pages before we have to wait on

681                  * IO.

682                  */

683                 if (can_get_io_locks && !launder_loop && free_shortage()) {       //空闲页面是否短缺、参数gfp_mask中__GFP_IO标识位是否为1

684                         launder_loop = 1;       

685                         /* If we cleaned pages, never do synchronous IO. */

686                         if (cleaned_pages)

687                         sync = 0;

688                         /* We only do a few "out of order" flushes. */

689                         maxlaunder = MAX_LAUNDER;

690                         /* Kflushd takes care of the rest. */

691                         wakeup_bdflush(0);

692                         goto dirty_page_rescan;           //跳到502行

693                 }

694                

695                 /* Return the number of pages moved to the inactive_clean list. */

696                 return cleaned_pages;

697          }

        继续看 do_try_to_free_pages() 没读完的代码,此时若可分配的物理页面还是不足,就应该从以下4个方面来回收:

        shrink_dcache_memory(6, gfp_mask);    //用于回收打开文件时建立,但在关闭文件后作为后备未立即回收的 dentry(代表目录项 )数据结构

        shrink_icache_memory(6, gfp_mask);     //用于回收打开文件时建立,但在关闭文件后作为后备未立即回收的 inode(代表文件索引节点)数据结构

        ret += refill_inactive(gfp_mask, user);     //下文详解

        kmem_cache_reap(gfp_mask);        //回收内核运行中动态分配的数据结构,这种结构采用slab管理机制,这个管理机制在后续章节介绍

        refill_inactive ():

==================== mm/vmscan.c 824 905 ====================

[kswapd()>do_try_to_free_pages()>refill_inactive()]

824          /*

825           * We need to make the locks finer granularity, but right

826           * now we need this so that we can do page allocations

827           * without holding the kernel lock etc.

828           *

829           * We want to try to free "count" pages, and we want to

830           * cluster them so that we get good swap-out behaviour.

831           *

832           * OTOH, if we're a user process (and not kswapd), we

833           * really care about latency. In that case we don't try

834           * to free too many pages.

835           */

836          static int refill_inactive(unsigned int gfp_mask, int user)      //user是 kswapd() 里994行传下来的参数,表示kswapd_done队列中是否有等待函数,这决定回收页面是否能慢慢来

837          {

838                 int priority, count, start_count, made_progress;

839                

840                 count = inactive_shortage() + free_shortage();

841                 if (user)

842                         count = (1 << page_cluster);

843                 start_count = count;

844                

845                 /* Always trim SLAB caches when memory gets low. */

846                 kmem_cache_reap(gfp_mask);      //回收slab机制管理的空闲物理页面

847                

848                 priority = 6;

849                 do {

850                         made_progress = 0;

851                        

                             /*need_resched:这个标志位是为强制调度设置,每当cpu进行系统调用或中断后从系统返回到用户,就会检查一次这个标志。但是kswapd是个内核线程,也就是说永不会返回用户空间,只有靠自己检查自己了*/

852                         if (current->need_resched) {         //current 表示当前进程的task_struct结构体,应该是个全局变量。如果其need_resched为1表示某中断服务程序要求调度

853                                 __set_current_state(TASK_RUNNING);        //把本进程状态设置为 TASK_RUNNING,表示继续运行

854                                 schedule();          //让内核对该中断进行一次调度

855                         }

856                        

857                         while (refill_inactive_scan(priority, 1)) {        //循环中主要做的事1,扫描活跃页面,试图将部分转换为不活跃,下文介绍

858                                 made_progress = 1;

859                                 if (--count <= 0)

860                                     goto done;

861                         }

862                        

863                         /*

864                          * don't be too light against the d/i cache since

865                               * refill_inactive() almost never fail when there's

866                               * really plenty of memory free.

867                          */

868                         shrink_dcache_memory(priority, gfp_mask);        //还得试试回收 inode 和 dentry 数据结构

869                         shrink_icache_memory(priority, gfp_mask);

870                        

871                         /*

872                          * Then, try to page stuff out..

873                          */

874                         while (swap_out(priority, gfp_mask)) {      //循环中主要做的事2,扫描某个进程,从其映射表中试图找出可以转入不活跃状态的页面,下文介绍

875                                 made_progress = 1;

876                                 if (--count <= 0)

877                                         goto done;

878                         }

879                        

880                         /*

881                          * If we either have enough free memory, or if

882                          * page_launder() will be able to make enough

883                          * free memory, then stop.

884                          */

885                         if (!inactive_shortage() || !free_shortage())

886                                 goto done;

887                        

888                         /*

889                          * Only switch to a lower "priority" if we

890                          * didn't make any useful progress in the

891                          * last loop.

892                          */

893                         if (!made_progress)

894                                 priority--;

895                 } while (priority >= 0);    //priority不断减小表示优先级不断增大,也即是回收页面的力度不断增加

896                

897                 /* Always end on a refill_inactive.., may sleep... */

898                 while (refill_inactive_scan(0, 1)) {

899                         if (--count <= 0)

900                                 goto done;

901                 }

902

903                  done:

904                 return (count < start_count);

905          }

        refill_inactive_scan( ):

==================== mm/vmscan.c 699 769 ====================

699          /**

700           * refill_inactive_scan - scan the active list and find pages to deactivate

701           * @priority: the priority at which to scan

702           * @oneshot: exit after deactivating one page

703           *

704           * This function will scan a portion of the active list to find

705           * unused pages, those pages will then be moved to the inactive list.

706           */

707          int refill_inactive_scan(unsigned int priority, int oneshot)

708          {

709                 struct list_head * page_lru;

710                 struct page * page;

711                 int maxscan, page_active = 0;   //maxscan控制扫描页面数

712                 int ret = 0;

713                

714                 /* Take the lock while messing with the list... */

715                 spin_lock(&pagemap_lru_lock);

716                 maxscan = nr_active_pages >> priority;      //扫描页面数由优先级来决定的,优先级为0时才是对整个活跃页面进行扫描

717                 while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {      

718                         page = list_entry(page_lru, struct page, lru);

719                        

720                         /* Wrong page on list?! (list corruption, should not happen) */

721                         if (!PageActive(page)) {        //判断页面是否是活跃页面

722                                 printk("VM: refill_inactive, wrong page on list.\n");   

723                                 list_del(page_lru);

724                                 nr_active_pages--;

725                                 continue;

726                         }

727                        

728                         /* Do aging on the pages. */

729                         if (PageTestandClearReferenced(page)) {        //判断页面近期是否受到访问,如果减少页面寿命后为0表示页面已经寿命耗尽

730                                 age_page_up_nolock(page);

731                                 page_active = 1;

732                         } else {

733                                 age_page_down_ageonly(page);

734                                 /*

735                                  * Since we don't hold a reference on the page

736                                  * ourselves, we have to do our test a bit more

737                                  * strict then deactivate_page(). This is needed

738                                  * since otherwise the system could hang shuffling

739                                  * unfreeable pages from the active list to the

740                                  * inactive_dirty list and back again...

741                                  *

742                                  * SUBTLE: we can have buffer pages with count 1.

743                                  */

744                                 if (page->age == 0 && page_count(page) <=      //光是寿命耗尽还不能将其转不活跃。如果页面不作文件读写缓冲但是count>1表示有进程映射,不能转入,等swap_out断开映射才有可能

745                                 (page->buffers ? 2 : 1)) {       //page->buffers表示文件缓冲区

746                                         deactivate_page_nolock(page);

747                                         page_active = 0;

748                                 } else {

749                                         page_active = 1;

750                                 }

751                         }

752                         /*

753                          * If the page is still on the active list, move it

754                          * to the other end of the list. Otherwise it was

755                          * deactivated by age_page_down and we exit successfully.

756                          */

757                         if (page_active || PageActive(page)) {       //对于还不能转入不活跃页面的队列就移到队列尾

758                                 list_del(page_lru);

759                                 list_add(page_lru, &active_list);

760                         } else {

761                                 ret = 1;

762                                 if (oneshot)

763                                 break;

764                         }

765                 }

766                 spin_unlock(&pagemap_lru_lock);

767                

768                 return ret;

769          }

        swap_out()。先讲解一个必要的知识点:内核中不是每个建立了映射的页面都在内存中,有磁盘映射的页面可能在磁盘中,所以“驻内页面”是所有建立了映射的页面的一个子集,其大小为mm->rss。

==================== mm/vmscan.c 297 378 ====================

[kswapd()>do_try_to_free_pages()>refill_inactive()>swap_out()]

297          /*

298           * Select the task with maximal swap_cnt and try to swap out a page.

299           * N.B. This function returns only 0 or 1.  Return values != 1 from

300           * the lower level routines result in continued processing.

301           */

302          #define SWAP_SHIFT 5

303          #define SWAP_MIN 8

304        

305          static int swap_out(unsigned int priority, int gfp_mask)

306          {

307                     int counter;

308                     int __ret = 0;

309                    

310                     /*

311                      * We make one or two passes through the task list, indexed by

312                      * assign = {0, 1}:

313                      *   Pass 1: select the swappable task with maximal RSS that has

314                      *         not yet been swapped out.

315                      *   Pass 2: re-assign rss swap_cnt values, then select as above.

316                      *

317                      * With this approach, there's no need to remember the last task

318                      * swapped out.  If the swap-out fails, we clear swap_cnt so the

319                      * task won't be selected again until all others have been tried.

320                      *

321                      * Think of swap_cnt as a "shadow rss" - it tells us which process

322                      * we want to page out (always try largest first).

323                      */

324                     counter = (nr_threads << SWAP_SHIFT) >> priority;     //其值由内核中进程、线程的数量和优先级决定,最大值为32*nr_threads,表示换出页面的决心。

325                     if (counter < 1)

326                             counter = 1;

327                    

328                     for (; counter >= 0; counter--) {

329                             struct list_head *p;

330                             unsigned long max_cnt = 0;

331                             struct mm_struct *best = NULL;

332                             int assign = 0;

333                             int found_task = 0;

334                             select:

335                             spin_lock(&mmlist_lock);

336                             p = init_mm.mmlist.next;       //内核中所有进程在一个循环队列中,init_mm.mmlist是内核中运行的第一个进程,是其他进程的祖宗

337                             for (; p != &init_mm.mmlist; p = p->next) {      //遍历所有进程,找到 mm->swap_cnt 最大的进程,其中swap_cnt表示进程中未被检查的页面数

338                                     struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);

339                                     if (mm->rss <= 0)  

340                                             continue;

341                                     found_task++;

342                                     /* Refresh swap_cnt? */

343                                     if (assign == 1) {          //将mm->rss右移SWAP_SHIFT后赋给mm->swap_cnt

344                                             mm->swap_cnt = (mm->rss >> SWAP_SHIFT);   

345                                             if (mm->swap_cnt < SWAP_MIN)

346                                                     mm->swap_cnt = SWAP_MIN;

347                                     }

348                                     if (mm->swap_cnt > max_cnt) {        //找到swap_cnt值最大的进程

349                                             max_cnt = mm->swap_cnt;

350                                             best = mm;

351                                     }

352                             }

353                            

354                             /* Make sure it doesn't disappear */

355                             if (best)

356                                     atomic_inc(&best->mm_users);      //递增mm_struct中的使用计数,使这个数据结构多一个用户而不被释放

357                             spin_unlock(&mmlist_lock);

358                            

359                             /*

360                              * We have dropped the tasklist_lock, but we

361                              * know that "mm" still exists: we are running

362                              * with the big kernel lock, and exit_mm()

363                              * cannot race with us.

364                              */

365                             if (!best) {

366                                     if (!assign && found_task > 0) {

367                                             assign = 1;

368                                             goto select;

369                                     }

370                                     break;

371                             } else {

372                                     __ret = swap_out_mm(best, gfp_mask);         //将找到的进程的部分页面换出,下文介绍

373                                     mmput(best);        //还原mm_struct中的使用计数

374                                     break;

375                             }

376                     }

377                     return __ret;

378          }

        swap_out_mm( ):

==================== mm/vmscan.c 257 295 ====================

[kswapd()>do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()]

257          static int swap_out_mm(struct mm_struct * mm, int gfp_mask)

258          {

259                 int result = 0;

260                 unsigned long address;

261                 struct vm_area_struct* vma;

262                

263                 /*

264                  * Go through process' page directory.

265                  */

266                

267                 /*

268                  * Find the proper vm-area after freezing the vma chain

269                  * and ptes.

270                  */

271                 spin_lock(&mm->page_table_lock);

272                 address = mm->swap_address;                //进程起始的虚拟地址

273                 vma = find_vma(mm, address);                    //在mm所表示的进程的虚存区间列表中,找到第一个vm_area_struct->vm_end > address的vma

274                 if (vma) {

275                         if (address < vma->vm_start)

276                                 address = vma->vm_start;

277                        

278                         for (;;) {                                                  //尝试该进程中每一个vma,看能否从中换出页面来。

279                                 result = swap_out_vma(mm, vma, address, gfp_mask);                      //遍历该vma中每个页面,试图换出页面的函数,成功返回1,否则又尝试下一个虚存区间

280                                 if (result)

281                                         goto out_unlock;

282                                 vma = vma->vm_next;

283                                 if (!vma)

284                                         break;

285                                 address = vma->vm_start;

286                         }

287                 }

288                 /* Reset to 0 when we reach the end of address space */

289                 mm->swap_address = 0;     //进程中每个虚存区间都尝试后将其置0

290                 mm->swap_cnt = 0;

291                

292         out_unlock:

293                 spin_unlock(&mm->page_table_lock);

294                 return result;

295          }

        从 swap_out_vma( )一层一层的看代码,swap_out_vma( ) -->swap_out_pgd( ) -->swap_out_pmd( ) -->try_to_swap_out( ),我们直接来看最关键的 try_to_swap_out( ):换出页面表项pte指向的内存页面(用紫色表示该函数)。

==================== mm/vmscan.c 27 56 ====================

[kswapd()>do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()>swap_out_pmd()>try_to_swap_out()]

27          /*

28           * The swap-out functions return 1 if they successfully

29           * threw something out, and we got a free page. It returns

30           * zero if it couldn't do anything, and any other value

31           * indicates it decreased rss, but the page was shared.

32           *

33           * NOTE! If it sleeps, it *must* return 1 to make sure we

34           * don't continue with the swap-out. Otherwise we may be

35           * using a process that no longer actually exists (it might

36           * have died while we slept).

37           */

38          static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address,

              pte_t * page_table, int gfp_mask)

39          {

40                 pte_t pte;

41                 swp_entry_t entry;

42                 struct page * page;

43                 int onlist;

44                

45                 pte = *page_table;               //将函数传入的页面表项参数读给pte

46                 if (!pte_present(pte))            //判断pte所指向的页面是否在内存中

47                         goto out_failed;

48                 page = pte_page(pte);         //将页面表项内容转换为指向物理页面的指针

                    /* #define VALID_PAGE(page) ((page - mem_map) < max_mapnr),判断page在mem_map数组中的下标是否小于max_mapnr,若不小于则表示该page不在mem_map管辖范围内,是无效页面。PageReserved()为真表示页面保留,不允许换出 */

49                 if ((!VALID_PAGE(page)) || PageReserved(page))    

50                         goto out_failed;

51                

52                 if (!mm->swap_cnt)

53                         return 1;

54                

55                 mm->swap_cnt--;                 //未考察的页面减1

56                

        上述中如果pte_present( )判断的结果是错误的,将返回上层函数跳到下个页面,当该页面表中页面走完后返回 上上层函数 跳到下个页面表,同理页面表走完后,就跳到下个虚存区间。

        上述判断完成后,就得到一个具体的页面,下面就是对这个页面进行具体考察,继续阅读try_to_swap_out( ):

==================== mm/vmscan.c 57 74 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()>swap_out_pmd()>try_to_swap_out()]

57         onlist = PageActive(page);         //#define PageActive(page) test_bit(PG_active, &(page)->flags)。检查页面PG_active标志位,判断页面是否活跃

58         /* Don't look at this pte if it's been accessed recently. */

59         if (ptep_test_and_clear_young(page_table)) {                   //页面能否换出取决于页面最近是否受到访问(年轻),该函数就是判断页面是否“年轻”,“年轻”则进入,见下文介绍

60                 age_page_up(page);                //这个页面若是在不活跃对列中则增加页面“观察后换出”的时间,下文介绍

61                 goto out_failed;

62         }

63         if (!onlist)               //页面不“年轻”并且处于不活跃状态,进入处理

64         /* The page is still mapped, so it can't be freeable... */

65                 age_page_down_ageonly(page);                   //不能因为页面“不年轻”就将其立马换出,要待查看时间过后才能将页面换出(页面寿命耗尽),该函数就是减少页面寿命,下文介绍

66        

67         /*

68          * If the page is in active use by us, or if the page

69          * is in active use by others, don't unmap it or

70          * (worse) start unneeded IO.

71          */

72         if (page->age > 0)                  //页面寿命未耗尽

73                 goto out_failed;

74        

        ptep_test_and_clear_young():

==================== include/asm-i386/pgtable.h 285 285 ====================

285        static inline  int ptep_test_and_clear_young(pte_t *ptep) { return test_and_clear_bit(_PAGE_BIT_ACCESSED, ptep); }

        介绍下上述 _PAGE_BIT_ACCESSED 这个页表项标志位:对于已经建立映射的虚拟地址,当访问到对应的物理页面时内存映射机制就自动将该标志位置为1。所以当 test_and_clear_bit() 中的 pte_young() 函数返回1则表示,上一次调用 swap_out_mm() 至今页面至少被访问一次,不能换出。判断完成后将这个标志位置0,为下次检查这个标志位做准备。ptep_test_and_clear_young() 函数内还有 SetPageReferenced() 函数:如果页面还活跃就将 PG_reference 标志位置1,表示受到访问的信息转移到 page 数据结构中。

        age_page_up():

==================== mm/swap.c 125 138 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()>age_page_up()]

125          void age_page_up(struct page * page)

126          {

127                 /*

128                  * We're dealing with an inactive page, move the page

129                  * to the active list.

130                  */

131                 if (!page->age)

132                         activate_page(page);

133                

134                 /* The actual page aging bit */

135                 page->age += PAGE_AGE_ADV;

136                 if (page->age > PAGE_AGE_MAX)

137                         page->age = PAGE_AGE_MAX;

138          }

        这里先解决个问题:为什么页面是有映射的但是却在60行age_page_up() 的注释中看到可能在不活跃队列中呢?答案是:页面因缺页异常而恢复一个不活跃页面的映射时不是马上就将其加入活跃页面队列中,而将这项工作留给前边看到的 page_launder() 来处理,当系统比较闲时来处理,所以是有可能产生上述问题的现象的。

        接着介绍上文的函数 age_page_down_ageonly():

==================== mm/swap.c 103 110 ====================

103          /*

104           * We use this (minimal) function in the case where we

105           * know we can't deactivate the page (yet).

106           */

107          void age_page_down_ageonly(struct page * page)

108          {

109                 page->age /= 2;

110          }

        到了这里的页面原则上已经是可换出对象了,继续阅读代码:

==================== mm/vmscan.c 75 108 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()>swap_out_pmd()>try_to_swap_out()]

75         if (TryLockPage(page))     //需要互斥操作,将页面锁住   #define TryLockPage(page)  test_and_set_bit(PG_locked, &(page)->flags),若返回值为1表示页面已经先被其他进程锁住了

76                 goto out_failed;

77         

78         /* From this point on, the odds are that we're going to

79          * nuke this pte, so read and clear the pte.  This hook

80          * is needed on CPUs which update the accessed and dirty

81          * bits in hardware.

82          */

83         pte = ptep_get_and_clear(page_table);     //再读一次页面表项内容并把表项内容清0,再读一次可保证信息正确,因为存在多核处理器的情况,页面表项随时可能有变化

84         flush_tlb_page(vma, address);

85         

86         /*

87          * Is the page already in the swap cache? If so, then

88          * we can just drop our reference to it without doing

89          * any IO - it's already up-to-date on disk.

90          *

91          * Return 0, as we didn't actually free any real

92          * memory, and we should just continue our scan.

93          */

94         if (PageSwapCache(page)) {                    //#define PageSwapCache(page) test_bit(PG_swap_cache, &(page)->flags),PG_swap_cache标志位为1表示page结构swapper_space中

95                 entry.val = page->index;                 //index是个32位索引项,是指向交换设备上映射的指针

96                 if (pte_dirty(pte))                       //在swapper_space中的页面也有干净和脏页面,如果是脏页面则进入语句

97                         set_page_dirty(page);                //将页面转入脏页面队列

98                  set_swap_pte:

99                 swap_duplicate(entry);                     //对索引项进行检测并将相应的盘上页面使用计数递增,下文介绍

100                 set_pte(page_table, swp_entry_to_pte(entry));                  //页面表项由内存映射改为盘上映射

101                  drop_pte:                //到了这里我们尝试的进程的内存页面映射就已经断开了,下面试试该页面能不能转为不活跃页面

102                 UnlockPage(page);

103                 mm->rss--;

104                 deactivate_page(page);                     //试图将页面置为不活跃页面   下文介绍

105                 page_cache_release(page);                      //递减页面使用计数,释放使用计数为0的页面

106                  out_failed:

107                 return 0;             //处理完这个页面后要返回零,这样swap_out_mm才能依次考虑这个进程的所有页面

108         }

        swap_duplicate():

==================== mm/swapfile.c 820 871 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()>swap_duplicate()]

820          /*

821           * Verify that a swap entry is valid and increment its swap map count.

822           * Kernel_lock is held, which guarantees existance of swap device.

823           *

824           * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as

825           * "permanent", but will be reclaimed by the next swapoff.

826           */

827          int swap_duplicate(swp_entry_t entry)         //以前讲过,页面在盘上则页面表项pte_t 即变为 swp_entry_t 指向盘上页面

828          {

829                     struct swap_info_struct * p;    

830                     unsigned long offset, type;

831                     int result = 0;

832                     

833                     /* Swap entry 0 is illegal */

834                     if (!entry.val)

835                             goto out;

836                     type = SWP_TYPE(entry);

837                     if (type >= nr_swapfiles)

838                             goto bad_file;

839                     p = type + swap_info;

840                     offset = SWP_OFFSET(entry);     

841                     if (offset >= p->max)

842                             goto bad_offset;

843                     if (!p->swap_map[offset])    //如果盘上页面映射建立,则该页面在此数组的相应位置就有着该页面的“共享计数”

844                             goto bad_unused;

845                     /*

846                      * Entry is valid, so increment the map count.

847                      */

848                     swap_device_lock(p);

849                     if (p->swap_map[offset] < SWAP_MAP_MAX)       //递增后的“共享计数”不能大于 SWAP_MAP_MAX

850                             p->swap_map[offset]++;

851                     else {

852                             static int overflow = 0;

853                             if (overflow++ < 5)

854                             printk("VM: swap entry overflow\n");

855                             p->swap_map[offset] = SWAP_MAP_MAX;

856                     }

857                     swap_device_unlock(p);

858                     result = 1;

859                      out:

860                     return result;

861                     

862                      bad_file:

863                     printk("Bad swap file entry %08lx\n", entry.val);

864                     goto out;

865                      bad_offset:

866                     printk("Bad swap offset entry %08lx\n", entry.val);

867                     goto out;

868                      bad_unused:

869                     printk("Unused swap offset entry in swap_dup %08lx\n", entry.val);

870                     goto out;

871          }

        deactivate_page,将页面置为不活跃页面:

==================== mm/swap.c 189 194 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()>deactivate_page()]

189            void deactivate_page(struct page * page)

190            {

191                    spin_lock(&pagemap_lru_lock);

192                    deactivate_page_nolock(page);

193                    spin_unlock(&pagemap_lru_lock);

194            }

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()>deactivate_page()>deactivate_page_nolock()]

154          /**

155           * (de)activate_page - move pages from/to active and inactive lists

156           * @page: the page we want to move

157           * @nolock - are we already holding the pagemap_lru_lock?

158           *

159           * Deactivate_page will move an active page to the right

160           * inactive list, while activate_page will move a page back

161           * from one of the inactive lists to the active list. If

162           * called on a page which is not on any of the lists, the

163           * page is left alone.

164           */

165          void deactivate_page_nolock(struct page * page)

166          {

167                     /*

168                      * One for the cache, one for the extra reference the

169                      * caller has and (maybe) one for the buffers.

170                      *

171                      * This isn't perfect, but works for just about everything.

172                      * Besides, as long as we don't move unfreeable pages to the

173                      * inactive_clean list it doesn't need to be perfect...

174                      */

175                     int maxcount = (page->buffers ? 3 : 2);  //页面空闲时使用计数为0,分配后为1,之后进程映射或文件读写都会使其递增;文件读写不建立映射,但能否将页面转入不活跃队列只取决于有无进程映射,所以需要判断是否

                                                                                       有文件读写。所以当page->buffers非0时,maxcount = 3,此时计数不超过maxcount表示之前断开的已是最后一个进程映射

176                     page->age = 0;   

177                     ClearPageReferenced(page);

178                    

179                     /*

180                      * Don't touch it if it's not on the active list.

181                      * (some pages aren't on any list at all)

182                      */

183                     if (PageActive(page) && page_count(page) <= maxcount && !page_ramdisk(page)) {    //前两个条件不多说,第三个条件是用作模拟硬盘的 ramdisk 页面永远不能是活跃页面

184                             del_page_from_active_list(page);      //脱离活跃队列,下文介绍

185                             add_page_to_inactive_dirty_list(page);    //加入脏不活跃队列,下文介绍

186                     }

187          }

       内核中只有脏不活跃队列,但是每个管理区都有个干净不活跃队列,加入不活跃队列的页面都是先加入脏不活跃队列中:add_page_to_inactive_dirty_list

==================== include/linux/swap.h 234 240 ====================

234          #define del_page_from_active_list(page) { \

235                 list_del(&(page)->lru); \

236                 ClearPageActive(page); \              //设置page结构中PG_active标志位为0

237                 nr_active_pages--; \

238                 DEBUG_ADD_PAGE \

239                 ZERO_PAGE_BUG \

240          }

==================== include/linux/swap.h 217 224 ====================

217          #define add_page_to_inactive_dirty_list(page) { \                

218                     DEBUG_ADD_PAGE \                                                  

219                     ZERO_PAGE_BUG \                                                   

220                     SetPageInactiveDirty(page); \                 //设置page结构中PG_inactive_dirty标志位为1                    

221                     list_add(&(page)->lru, &inactive_dirty_list); \                   

222                     nr_inactive_dirty_pages++; \                                      

223                     page->zone->inactive_dirty_pages++; \                             

224          }                                                                

        上述两个函数都为改变页面的使用计数,回到try_to_swap_out函数第105行,递减页面使用计数:

==================== include/linux/pagemap.h 34 34 ====================

34          #define page_cache_release(x) __free_page(x)

==================== include/linux/mm.h 379 379 ====================

379      #define __free_page(page) __free_pages((page), 0)

==================== mm/page_alloc.c 549 553 ====================

549          void __free_pages(struct page *page, unsigned long order)

550          {

551                 if (!PageReserved(page) && put_page_testzero(page))        //put_page_testzero(),页面使用计数减1,并判断使用计数是否为0

552                 __free_pages_ok(page, order);          //将该页面释放

553          }

==================== include/linux/mm.h 152 152 ====================

152         #define put_page_testzero(p) atomic_dec_and_test(&(p)->count)

        到这里应该纳闷了,要是页面没在swapper_space队列中呢?继续阅读try_to_swap_out代码:

==================== mm/vmscan.c 110 157 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()]

110                 /*

111                  * Is it a clean page? Then it must be recoverable

112                  * by just paging it in again, and we can just drop

113                  * it..

114                  *

115                  * However, this won't actually free any real

116                  * memory, as the page will just be in the page cache

117                  * somewhere, and as such we should just continue

118                  * our scan.

119                  *

120                  * Basically, this just makes it possible for us to do

121                  * some real work in the future in "refill_inactive()".

122                  */

123                 flush_cache_page(vma, address);

124                 if (!pte_dirty(pte))                                  //static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; },通过页面表项判断是否干净页面。

125                         goto drop_pte;                      //解除该页面映射

126                

127                 /*

128                  * Ok, it's really dirty. That means that

129                  * we should either create a new swap cache

130                  * entry for it, or we should write it back

131                  * to its own backing store.

132                  */

133                 if (page->mapping) {          //是否通过mmap()建立的映射,mapping指向相应的address_space结构

134                         set_page_dirty(page);           //到达这里的页面都是脏页面。下文介绍

135                         goto drop_pte;

136                 }

137                

138                 /*

139                  * This is a dirty, swappable page.  First of all,

140                  * get a suitable swap entry for it, and make sure

141                  * we have the swap cache set up to associate the

142                  * page with that swap entry.

143                  */

144                 entry = get_swap_page();                      //到了这里的页面已经过4层条件筛选了,这样的页面要为之分配一个盘上页面:#define  get_swap_page()  __get_swap_page(1)

145                 if (!entry.val)                                       //分配盘上页面失败

146                         goto out_unlock_restore;    /* No swap space left */

147                

148                 /* Add it to the swap cache and mark it dirty */

149                 add_to_swap_cache(page, entry);           //页面链入swapper_space队列及活跃队列中

150                 set_page_dirty(page);                            //将页面转入不活跃脏页面

151                 goto set_swap_pte;

152                

153                  out_unlock_restore:

154                 set_pte(page_table, pte);

155                 UnlockPage(page);

156                 return 0;

157          }

        set_page_dirty( ):

==================== include/linux/mm.h 187 191 ====================

[kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()

>swap_out_pmd()>try_to_swap_out()>set_page_dirty()]

187          static inline void set_page_dirty(struct page * page)

188          {

189                 if (!test_and_set_bit(PG_dirty, &page->flags))           //将page->flags中的PG_dirty标志位置1,若该位原先为0(页面刚变脏)则进入

190                         __set_page_dirty(page);                     //页面移至该文件映射的脏队列

191           }

==================== mm/filemap.c 134 147 ====================

134                  /*

135                   * Add a page to the dirty page list.

136                   */

137              void __set_page_dirty(struct page *page)                       

138              {

139                     struct address_space *mapping = page->mapping;

140                    

141                     spin_lock(&pagecache_lock);

142                     list_del(&page->list);

143                     list_add(&page->list, &mapping->dirty_pages);

144                     spin_unlock(&pagecache_lock);

145                    

146                     mark_inode_dirty_pages(mapping->host);

147                }

        补充:

                    实际页面写出是前边介绍 page_launder( ) 的事。

                    refill_inactive( )如果一直未找到页面转入不活跃队列会重新再循环一次,新的一次循环中可能有的页面就“老化”而满足要求了,但是要是一直不能找到呢?就会调用oom_kill杀掉这个进程。

猜你喜欢

转载自blog.csdn.net/tyhaotingdege/article/details/79787290