天天看點

2.7 Linux存儲管理-實體頁面配置設定

程序需要的連續的頁面,通過alloc_pages來完成 該函數在mm/numa.c和include/linux/mm.h中都有定義 NUMA和UMA配置設定記憶體的函數是不并存的,根據CONFIG_DISCONTIGMEM的勾選與否選擇其中一個

1. NUMA的配置設定記憶體函數: 選擇CONFIG_DISCONTIGMEM選項 被編譯的條件是“不連續的存儲空間”,而不是CONFIG_NUMA,但是CONFIG_NUMA會對程式造成影響 alloc_pages(int gfp_mask,unsigned long order) gfp_mask:表示采用哪一種配置設定政策 order:申請2^order個頁面

numa的alloc_pages的代碼:

如果定義了NUMA,就需要擷取 pgdat_list,并且需要周遊所有的pg_data_t節點 配置設定時輪流從各個節點開始,并希望各節點負載均衡 在每個節點上使用 alloc_pages_pgdat函數

1

==================== mm/numa.c 43 43 ====================      

2

43  #ifdef CONFIG_DISCONTIGMEM      

3

==================== mm/numa.c 91 128 ====================      

4

91  /*      

5

92  * This can be refined. Currently, tries to do round robin, instead      

6

93  * should do concentratic circle search, starting from current node.      

7

94  */      

8

95  struct page * alloc_pages(int gfp_mask, unsigned long order)      

9

96  {      

10

97     struct page *ret = 0;      

11

98     pg_data_t *start, *temp;      

12

99  #ifndef CONFIG_NUMA      

13

100     unsigned long flags;      

14

101     static pg_data_t *next = 0;      

15

102 #endif      

16

103      

17

104 if (order >= MAX_ORDER)      

18

105     return NULL;      

19

106 #ifdef CONFIG_NUMA      

20

107     temp = NODE_DATA(numa_node_id());      

21

108 #else      

22

109     spin_lock_irqsave(&node_lock, flags);      

23

110     if (!next) next = pgdat_list;      

24

111     temp = next;      

25

112     next = next->node_next;      

26

113     spin_unlock_irqrestore(&node_lock, flags);      

27

114 #endif      

28

115     start = temp;      

29

116 while (temp) {      

30

117     if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))      

31

118         return(ret);      

32

119     temp = temp->node_next;      

33

120 }      

34

121 temp = pgdat_list;      

35

122 while (temp != start) {      

36

123    if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))      

37

124        return(ret);      

38

125     temp = temp->node_next;      

39

126 }      

40

127     return(0);      

41

128 }      

1.2 alloc_pages_pgdat函數: NUMA和UMA機制下都使用了相同的函數,在UMA處在做詳細介紹 gfp_mask相當于node_zonelists數組的下标

1

85  static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,      

2

86 unsigned long order)      

3

87  {      

4

88     return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);      

5

89  }      

2. UMA的配置設定記憶體函數: 不選擇CONFIG_DISCONTIGMEM選項

該函數隻有在CONFIG_DISCONTIGMEM無定義時才編譯 很明顯在UMA結構下 隻有一個pg_data_t節點,也就是 contig_page_data,是以無需周遊 具體的記憶體配置設定過程由__alloc_pages()完成

1

343 #ifndef CONFIG_DISCONTIGMEM      

2

344 static inline struct page * alloc_pages(int gfp_mask, unsigned long order)      

3

345 {      

4

346 /*      

5

347 * Gets optimized away by the compiler.      

6

348 */      

7

349 if (order >= MAX_ORDER)      

8

350     return NULL;      

9

351 return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);      

10

352 }      

2.1 __alloc_pages函數: 該函數完成了記憶體的具體配置設定 zonelist是 contig_page_data節點中的zone清單,要在這些zone裡面找到合适實體頁面進行配置設定 zonelist_t結構中存放了具體的記憶體配置設定政策,也就是gfp_mask,其實是一些标志位 memory_pressure表示頁面管理所受的壓力,配置設定記憶體頁面時增加,歸還記憶體時減少

申請的頁面數為1,而且允許等待完成、不用于管理的目的,則我們将 direct_reclaim設定為1, 表示可以從相應的頁面管理區的“不活躍頁面”中回收,一般而言,這些頁面都不是連接配接成塊的, 是以 提供給了單頁面請求使用,而且這些頁面的内容已經寫出到了交換裝置中(swap分區) 當發現頁面短缺,則需要喚醒kswapd和bdflush線程,騰出空間

1

270 /*      

2

271 * This is the 'heart' of the zoned buddy allocator:      

3

272 */      

4

273 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)      

5

274 {      

6

275     zone_t **zone;      

7

276     int direct_reclaim = 0;      

8

277     unsigned int gfp_mask = zonelist->gfp_mask;      

9

278     struct page * page;      

10

279      

11

280 /*      

12

281 * Allocations put pressure on the VM subsystem.      

13

282 */      

14

283     memory_pressure++;      

15

284      

16

285 /*      

17

286 * (If anyone calls gfp from interrupts nonatomically then it      

18

287 * will sooner or later tripped up by a schedule().)      

19

288 *      

20

289 * We are falling back to lower-level zones if allocation      

21

290 * in a higher zone fails.      

22

291 */      

23

292      

24

293 /*      

25

294 * Can we take pages directly from the inactive_clean      

26

295 * list?      

27

296 */      

28

   //申請的記憶體空間為1頁,且允許等待      

29

297 if (order == 0 && (gfp_mask & __GFP_WAIT) &&      

30

298     !(current->flags & PF_MEMALLOC))      

31

299     direct_reclaim = 1;      

32

300      

33

301 /*      

34

302 * If we are about to get low on free pages and we also have      

35

303 * an inactive page shortage, wake up kswapd.      

36

84      

37

304 */      

38

305 if (inactive_shortage() > inactive_target / 2 && free_shortage())      

39

306     wakeup_kswapd(0);      

40

307 /*      

41

308 * If we are about to get low on free pages and cleaning      

42

309 * the inactive_dirty pages would fix the situation,      

43

310 * wake up bdflush.      

44

311 */      

45

312 else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()      

46

313     && nr_inactive_dirty_pages >= freepages.high)      

47

314     wakeup_bdflush(0);      

48

315      

對pg_data_t節點中的所有zone進行周遊(其實隻有三個zone) rmqueue從管理區中擷取若幹連續的記憶體頁,當記憶體不足時,喚醒 kreclaimd() ,讓其幫助回收頁面

1

316 try_again:      

2

317 /*      

3

318 * First, see if we have any zones with lots of free memory.      

4

319 *      

5

320 * We allocate free memory first because it doesn't contain      

6

321 * any data ... DUH!      

7

322 */      

8

323     zone = zonelist->zones;      

9

   //死循環      

10

324 for (;;) {      

11

325     zone_t *z = *(zone++);      

12

326     if (!z)      

13

327         break;      

14

328     if (!z->size)      

15

329         BUG();      

16

330      

17

331     if (z->free_pages >= z->pages_low) {      

18

332         page = rmqueue(z, order);    //配置設定記憶體      

19

333     if (page)      

20

334         return page;      

21

335     } else if (z->free_pages < z->pages_min &&      

22

336     waitqueue_active(&kreclaimd_wait)) {      

23

337         wake_up_interruptible(&kreclaimd_wait);      

24

338 }      

25

339       

假如三個zone都失敗,要考慮下面的事 1)降低頁面管理區中的“保持水位的要求” 2)把緩存在管理區中的“不活躍幹淨頁面”考慮進去 PAGES_LOW和PAGES_HIGH其實表示了不同的“水位”門檻值,然後使用 __alloc_pages_limit申請記憶體,再次失敗,就說明記憶體真的短缺了 2.6之後的核心引入新的參數(migrate_type),用來表示遷移類型(數值越小說明記憶體越緊張)

1

341 /*      

2

342 * Try to allocate a page from a zone with a HIGH      

3

343 * amount of free + inactive_clean pages.      

4

344 *      

5

345 * If there is a lot of activity, inactive_target      

6

346 * will be high and we'll have a good chance of      

7

347 * finding a page using the HIGH limit.      

8

348 */      

9

349 page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);      

10

350 if (page)      

11

351 return page;      

12

352      

13

353 /*      

14

354 * Then try to allocate a page from a zone with more      

15

355 * than zone->pages_low free + inactive_clean pages.      

16

356 *      

17

357 * When the working set is very large and VM activity      

18

358 * is low, we're most likely to have our allocation      

19

359 * succeed here.      

20

360 */      

21

361 page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);      

22

362 if (page)      

23

363 return page;      

24

364      

zone中的頁面非常短缺, 1)喚醒核心線程 kswapd,讓其設法換出一些頁面;如果gfp_mask表示甯可等待也要申請到記憶體,那就讓系統進行一次排程,並讓目前程序為其他程序讓路,這樣kswapd可能會立即執行 2)其他程序可能會釋放一些頁面,也減緩了要求配置設定頁面的速度,最後以 PAGES_MIN參數再次執行 __alloc_pages_limit,當然還是可能會失敗

1

365 /*      

2

366 * OK, none of the zones on our zonelist has lots      

3

367 * of pages free.      

4

368 *      

5

369 * We wake up kswapd, in the hope that kswapd will      

6

370 * resolve this situation before memory gets tight.      

7

371 *      

8

372 * We also yield the CPU, because that:      

9

373 * - gives kswapd a chance to do something      

10

374 * - slows down allocations, in particular the      

11

375 * allocations from the fast allocator that's      

12

376 * causing the problems ...      

13

377 * - ... which minimises the impact the "bad guys"      

14

378 * have on the rest of the system      

15

379 * - if we don't have __GFP_IO set, kswapd may be      

16

380 * able to free some memory we can't free ourselves      

17

381 */      

18

382 wakeup_kswapd(0);      

19

383 if (gfp_mask & __GFP_WAIT) {      

20

384 __set_current_state(TASK_RUNNING);      

21

385 current->policy |= SCHED_YIELD;      

22

386 schedule();      

23

387 }      

24

388      

25

389 /*      

26

390 * After waking up kswapd, we try to allocate a page      

27

391 * from any zone which isn't critical yet.      

28

392 *      

29

393 * Kswapd should, in most situations, bring the situation      

30

394 * back to normal in no time.      

31

395 */      

32

396 page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);      

33

397 if (page)      

34

398 return page;      

35

399      

如果再次失敗,需要檢視是誰在要求配置設定核心頁面。如果是kswapd和kreclaimd,本身就是“記憶體配置設定工作者”,要求配置設定記憶體頁面的目的是執行公務,這比一般程序更重要 這些程序task_struct結構中的flags字段的PF_MEMALLOC标志位為1,一般程序為0 失敗的原因: 1)可配置設定頁面的數量太少; 2)頁面總量不少,但是要求的頁面塊無法滿足,此時往往有很多單個頁面在管理區的 inactive_clean_pages中,回收的話,有可能拼裝出較大的頁面塊 而在inactive_dirty_pages隊列中,把髒頁面的内容寫到交換裝置上或檔案中,可以使它們變成幹淨頁面 加以回收 __free_page()釋放頁面時,會把空閑頁面拼裝成盡可能大的頁面塊,是以在回收每一個頁面後都 調用一下rmqueue,看看是否滿足要求 在調用 page_launder()期間把目前程序的PF_MEMALLOC标志位設為1,使其有了“執行公務”時的特權 這是因為page_launder也會要求配置設定一些臨時性的工作頁面,若不把PF_MEMALLOC标志位設為1,就可能遞迴地陷入同樣的記憶體短缺困境

還是失敗,就喚醒kswapd,讓要求配置設定頁面的程序等待,由kswapd完成一輪運作後,再喚醒申請頁面的程序 如果申請單個頁面,通過goto語句轉回__alloc_pages開頭處的标号try_again處 另一種方法是直接調用try_to_free_pages,這個函數本來是由kswapd調用的

如果是“執行公務”,或者已經想盡了一切辦法,隻不過因為要求配置設定的是成塊頁面,是以才沒有轉回前面的 标号try_again處

1

400 /*      

2

401 * Damn, we didn't succeed.      

3

402 *      

4

403 * This can be due to 2 reasons:      

5

404 * - we're doing a higher-order allocation      

6

405 * --> move pages to the free list until we succeed      

7

406 * - we're /really/ tight on memory      

8

407 * --> wait on the kswapd waitqueue until memory is freed      

9

408 */      

10

409 if (!(current->flags & PF_MEMALLOC)) {      

11

410 /*      

12

411 * Are we dealing with a higher order allocation?      

13

412 *      

14

413 * Move pages from the inactive_clean to the free list      

15

414 * in the hope of creating a large, physically contiguous      

16

415 * piece of free memory.      

17

416 */      

18

417     if (order > 0 && (gfp_mask & __GFP_WAIT)) {      

19

418         zone = zonelist->zones;      

20

419         /* First, clean some dirty pages. */      

21

420         current->flags |= PF_MEMALLOC;      

22

421         page_launder(gfp_mask, 1);      

23

422         current->flags &= ~PF_MEMALLOC;      

24

423         for (;;) {      

25

424             zone_t *z = *(zone++);      

26

425             if (!z)      

27

426                 break;      

28

427             if (!z->size)      

29

428             continue;      

30

429             while (z->inactive_clean_pages) {      

31

430                 struct page * page;      

32

431                 /* Move one page to the free list. */      

33

432                 page = reclaim_page(z);      

34

433                 if (!page)      

35

434                     break;      

36

435                 __free_page(page);      

37

436                 /* Try if the allocation succeeds. */      

38

437                 page = rmqueue(z, order);      

39

438                 if (page)      

40

439                     return page;      

41

440             }      

42

441         }      

43

442     }      

44

443 /*      

45

444 * When we arrive here, we are really tight on memory.      

46

445 *      

47

446 * We wake up kswapd and sleep until kswapd wakes us      

48

447 * up again. After that we loop back to the start.      

49

448 *      

50

449 * We have to do this because something else might eat      

51

450 * the memory kswapd frees for us and we need to be      

52

451 * reliable. Note that we don't loop back for higher      

53

452 * order allocations since it is possible that kswapd      

54

453 * simply cannot free a large enough contiguous area      

55

454 * of memory *ever*.      

56

455 */      

57

456     if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {      

58

457         wakeup_kswapd(1);      

59

458         memory_pressure++;      

60

459         if (!order)      

61

460             goto try_again;      

62

461 /*      

63

462 * If __GFP_IO isn't set, we can't wait on kswapd because      

64

463 * kswapd just might need some IO locks /we/ are holding ...      

65

464 *      

66

465 * SUBTLE: The scheduling point above makes sure that      

67

466 * kswapd does get the chance to free memory we can't      

68

467 * free ourselves...      

69

468 */      

70

469      } else if (gfp_mask & __GFP_WAIT) {      

71

470             try_to_free_pages(gfp_mask);      

72

471             memory_pressure++;      

73

472             if (!order)      

74

473             goto try_again;      

75

474      }      

76

475      

77

476 }      

78

477      

前面使用 __alloc_pages_limit(),其實還有所保留 我們使用 PAGES_MIN為參數,此時判斷是否可以配置設定的準則是管理區中可配置設定頁面的“水位”高于 z->pages_min,是以還留着一些“老本”以應付緊急情況 現在已經到了“不惜血本”的時候了,繼續下面處理

1

478 /*      

2

479 * Final phase: allocate anything we can!      

3

480 *      

4

481 * Higher order allocations, GFP_ATOMIC allocations and      

5

482 * recursive allocations (PF_MEMALLOC) end up here.      

6

483 *      

7

484 * Only recursive allocations can use the very last pages      

8

485 * in the system, otherwise it would be just too easy to      

9

486 * deadlock the system...      

10

487 */      

11

488 zone = zonelist->zones;      

12

489 for (;;) {      

13

490     zone_t *z = *(zone++);      

14

491     struct page * page = NULL;      

15

492     if (!z)      

16

493         break;      

17

494     if (!z->size)      

18

495         BUG();      

19

496      

20

497 /*      

21

498 * SUBTLE: direct_reclaim is only possible if the task      

22

499 * becomes PF_MEMALLOC while looping above. This will      

23

500 * happen when the OOM killer selects this task for      

24

501 * instant execution...      

25

502 */      

26

503 if (direct_reclaim) {      

27

504     page = reclaim_page(z);      

28

505     if (page)      

29

506         return page;      

30

507 }      

31

508      

32

509 /* XXX: is pages_min/4 a good amount to reserve for this? */      

33

510 if (z->free_pages < z->pages_min / 4 &&      

34

511     !(current->flags & PF_MEMALLOC))      

35

512     continue;      

36

513     page = rmqueue(z, order);      

37

514 if (page)      

38

515     return page;      

39

516 }      

40

517      

41

518     /* No luck.. */      

42

519     printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);      

43

520     return NULL;      

44

521 }      

如果再次失敗,那就是系統有問題

2.2 rmqueue函數: 從指定的zone中,擷取2^order數量的頁面 zone中的free_area是按照order建立的數組,每個free_area裡面有多個free_list, 也可能為空, 也就是目前order中沒有空閑空間,為空時從order更大的free_area中申請 我們要在order對應的free_area中申請記憶體頁面,并将其從page鍊中摘出, 摘鍊的過程不能被打斷,需要 spin_lock_irqsave加鎖 rmqueue函數代碼(2.4版本),2.6以及之後的版本發生了修改(看page_alloc.c檔案) memlist_entry提供了需要的free_list的頭page結構, memlist_del幫我們把free_list從free_area中删除

1

172 static struct page * rmqueue(zone_t *zone, unsigned long order)      

2

173 {      

3

174     free_area_t * area = zone->free_area + order;      

4

175     unsigned long curr_order = order;      

5

176     struct list_head *head, *curr;      

6

177     unsigned long flags;      

7

178     struct page *page;      

8

179      

9

180     spin_lock_irqsave(&zone->lock, flags);      

10

181     do {      

11

182         head = &area->free_list;      

12

183         curr = memlist_next(head);      

13

184      

14

185         if (curr != head) {      

15

186             unsigned int index;      

16

187               

17

188             page = memlist_entry(curr, struct page, list);      

18

189             if (BAD_RANGE(zone,page))      

19

190                 BUG();      

20

191             memlist_del(curr);      

21

192             index = (page - mem_map) - zone->offset;      

22

193             MARK_USED(index, curr_order, area);      

23

194             zone->free_pages -= 1 << order;      

24

195             //用來分解大塊記憶體         

25

196             page = expand(zone, page, index, order, curr_order, area);      

26

197             spin_unlock_irqrestore(&zone->lock, flags);      

27

198      

28

199             set_page_count(page, 1);      

29

200             if (BAD_RANGE(zone,page))      

30

201                 BUG();      

31

202             DEBUG_ADD_PAGE      

32

203             return page;      

33

204         }      

34

205         curr_order++;      

35

206         area++;      

36

207     } while (curr_order < MAX_ORDER);      

37

208     spin_unlock_irqrestore(&zone->lock, flags);      

38

209      

39

210     return NULL;      

40

211 }      

2.3 expand函數 用來将order更大的free_area中free_list進行分割,并存入order更低的free_area high為更大的order,low申請的order,當将high中一個free_list分割到low的大小時,就停止

1

150 static inline struct page * expand (zone_t *zone, struct page *page,      

2

151 unsigned long index, int low, int high, free_area_t * area)      

3

152 {      

4

153     unsigned long size = 1 << high;      

5

154      

6

155     while (high > low) {      

7

156         if (BAD_RANGE(zone,page))      

8

157             BUG();      

9

158         area--;      

10

159         high--;      

11

160         size >>= 1;      

12

161         memlist_add_head(&(page)->list, &(area)->free_list);      

13

162         MARK_USED(index, high, area);      

14

163         index += size;      

15

164         page += size;      

16

165     }      

17

166     if (BAD_RANGE(zone,page))      

18

167         BUG();      

19

168     return page;      

20

169 }      

繼續閱讀