天天看點

2.7 Linux存儲管理-實體頁面配置設定

程序需要的連續的頁面,通過alloc_pages來完成 該函數在mm/numa.c和include/linux/mm.h中都有定義 NUMA和UMA配置設定記憶體的函數是不并存的,根據CONFIG_DISCONTIGMEM的勾選與否選擇其中一個

1. NUMA的配置設定記憶體函數: 選擇CONFIG_DISCONTIGMEM選項 被編譯的條件是“不連續的存儲空間”,而不是CONFIG_NUMA,但是CONFIG_NUMA會對程式造成影響 alloc_pages(int gfp_mask,unsigned long order) gfp_mask:表示采用哪一種配置設定政策 order:申請2^order個頁面

numa的alloc_pages的代碼:

如果定義了NUMA,就需要擷取 pgdat_list,并且需要周遊所有的pg_data_t節點 配置設定時輪流從各個節點開始,并希望各節點負載均衡 在每個節點上使用 alloc_pages_pgdat函數

1

==================== mm/numa.c 43 43 ====================      

2

43  #ifdef CONFIG_DISCONTIGMEM      

3

==================== mm/numa.c 91 128 ====================      

4

91  /*      

5

92  * This can be refined. Currently, tries to do round robin, instead      

6

93  * should do concentratic circle search, starting from current node.      

7

94  */      

8

95  struct page * alloc_pages(int gfp_mask, unsigned long order)      

9

96  {      

10

97     struct page *ret = 0;      

11

98     pg_data_t *start, *temp;      

12

99  #ifndef CONFIG_NUMA      

13

100     unsigned long flags;      

14

101     static pg_data_t *next = 0;      

15

102 #endif      

16

103      

17

104 if (order >= MAX_ORDER)      

18

105     return NULL;      

19

106 #ifdef CONFIG_NUMA      

20

107     temp = NODE_DATA(numa_node_id());      

21

108 #else      

22

109     spin_lock_irqsave(&node_lock, flags);      

23

110     if (!next) next = pgdat_list;      

24

111     temp = next;      

25

112     next = next->node_next;      

26

113     spin_unlock_irqrestore(&node_lock, flags);      

27

114 #endif      

28

115     start = temp;      

29

116 while (temp) {      

30

117     if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))      

31

118         return(ret);      

32

119     temp = temp->node_next;      

33

120 }      

34

121 temp = pgdat_list;      

35

122 while (temp != start) {      

36

123    if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))      

37

124        return(ret);      

38

125     temp = temp->node_next;      

39

126 }      

40

127     return(0);      

41

128 }      

1.2 alloc_pages_pgdat函數: NUMA和UMA機制下都使用了相同的函數,在UMA處在做詳細介紹 gfp_mask相當于node_zonelists數組的下标

1

85  static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,      

2

86 unsigned long order)      

3

87  {      

4

88     return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);      

5

89  }      

2. UMA的配置設定記憶體函數: 不選擇CONFIG_DISCONTIGMEM選項

該函數隻有在CONFIG_DISCONTIGMEM無定義時才編譯 很明顯在UMA結構下 隻有一個pg_data_t節點,也就是 contig_page_data,是以無需周遊 具體的記憶體配置設定過程由__alloc_pages()完成

1

343 #ifndef CONFIG_DISCONTIGMEM      

2

344 static inline struct page * alloc_pages(int gfp_mask, unsigned long order)      

3

345 {      

4

346 /*      

5

347 * Gets optimized away by the compiler.      

6

348 */      

7

349 if (order >= MAX_ORDER)      

8

350     return NULL;      

9

351 return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);      

10

352 }      

2.1 __alloc_pages函數: 該函數完成了記憶體的具體配置設定 zonelist是 contig_page_data節點中的zone清單,要在這些zone裡面找到合适實體頁面進行配置設定 zonelist_t結構中存放了具體的記憶體配置設定政策,也就是gfp_mask,其實是一些标志位 memory_pressure表示頁面管理所受的壓力,配置設定記憶體頁面時增加,歸還記憶體時減少

申請的頁面數為1,而且允許等待完成、不用于管理的目的,則我們将 direct_reclaim設定為1, 表示可以從相應的頁面管理區的“不活躍頁面”中回收,一般而言,這些頁面都不是連接配接成塊的, 是以 提供給了單頁面請求使用,而且這些頁面的内容已經寫出到了交換裝置中(swap分區) 當發現頁面短缺,則需要喚醒kswapd和bdflush線程,騰出空間

1

270 /*      

2

271 * This is the 'heart' of the zoned buddy allocator:      

3

272 */      

4

273 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)      

5

274 {      

6

275     zone_t **zone;      

7

276     int direct_reclaim = 0;      

8

277     unsigned int gfp_mask = zonelist->gfp_mask;      

9

278     struct page * page;      

10

279      

11

280 /*      

12

281 * Allocations put pressure on the VM subsystem.      

13

282 */      

14

283     memory_pressure++;      

15

284      

16

285 /*      

17

286 * (If anyone calls gfp from interrupts nonatomically then it      

18

287 * will sooner or later tripped up by a schedule().)      

19

288 *      

20

289 * We are falling back to lower-level zones if allocation      

21

290 * in a higher zone fails.      

22

291 */      

23

292      

24

293 /*      

25

294 * Can we take pages directly from the inactive_clean      

26

295 * list?      

27

296 */      

28

   //申請的記憶體空間為1頁,且允許等待      

29

297 if (order == 0 && (gfp_mask & __GFP_WAIT) &&      

30

298     !(current->flags & PF_MEMALLOC))      

31

299     direct_reclaim = 1;      

32

300      

33

301 /*      

34

302 * If we are about to get low on free pages and we also have      

35

303 * an inactive page shortage, wake up kswapd.      

36

84      

37

304 */      

38

305 if (inactive_shortage() > inactive_target / 2 && free_shortage())      

39

306     wakeup_kswapd(0);      

40

307 /*      

41

308 * If we are about to get low on free pages and cleaning      

42

309 * the inactive_dirty pages would fix the situation,      

43

310 * wake up bdflush.      

44

311 */      

45

312 else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()      

46

313     && nr_inactive_dirty_pages >= freepages.high)      

47

314     wakeup_bdflush(0);      

48

315      

對pg_data_t節點中的所有zone進行周遊(其實隻有三個zone) rmqueue從管理區中擷取若幹連續的記憶體頁,當記憶體不足時,喚醒 kreclaimd() ,讓其幫助回收頁面

1

316 try_again:      

2

317 /*      

3

318 * First, see if we have any zones with lots of free memory.      

4

319 *      

5

320 * We allocate free memory first because it doesn't contain      

6

321 * any data ... DUH!      

7

322 */      

8

323     zone = zonelist->zones;      

9

   //死循環      

10

324 for (;;) {      

11

325     zone_t *z = *(zone++);      

12

326     if (!z)      

13

327         break;      

14

328     if (!z->size)      

15

329         BUG();      

16

330      

17

331     if (z->free_pages >= z->pages_low) {      

18

332         page = rmqueue(z, order);    //配置設定記憶體      

19

333     if (page)      

20

334         return page;      

21

335     } else if (z->free_pages < z->pages_min &&      

22

336     waitqueue_active(&kreclaimd_wait)) {      

23

337         wake_up_interruptible(&kreclaimd_wait);      

24

338 }      

25

339       

假如三個zone都失敗,要考慮下面的事 1)降低頁面管理區中的“保持水位的要求” 2)把緩存在管理區中的“不活躍幹淨頁面”考慮進去 PAGES_LOW和PAGES_HIGH其實表示了不同的“水位”門檻值,然後使用 __alloc_pages_limit申請記憶體,再次失敗,就說明記憶體真的短缺了 2.6之後的核心引入新的參數(migrate_type),用來表示遷移類型(數值越小說明記憶體越緊張)

1

341 /*      

2

342 * Try to allocate a page from a zone with a HIGH      

3

343 * amount of free + inactive_clean pages.      

4

344 *      

5

345 * If there is a lot of activity, inactive_target      

6

346 * will be high and we'll have a good chance of      

7

347 * finding a page using the HIGH limit.      

8

348 */      

9

349 page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);      

10

350 if (page)      

11

351 return page;      

12

352      

13

353 /*      

14

354 * Then try to allocate a page from a zone with more      

15

355 * than zone->pages_low free + inactive_clean pages.      

16

356 *      

17

357 * When the working set is very large and VM activity      

18

358 * is low, we're most likely to have our allocation      

19

359 * succeed here.      

20

360 */      

21

361 page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);      

22

362 if (page)      

23

363 return page;      

24

364      

zone中的頁面非常短缺, 1)喚醒核心線程 kswapd,讓其設法換出一些頁面;如果gfp_mask表示甯可等待也要申請到記憶體,那就讓系統進行一次排程,並讓目前程序為其他程序讓路,這樣kswapd可能會立即執行 2)其他程序可能會釋放一些頁面,也減緩了要求配置設定頁面的速度,最後以 PAGES_MIN參數再次執行 __alloc_pages_limit,當然還是可能會失敗

1

365 /*      

2

366 * OK, none of the zones on our zonelist has lots      

3

367 * of pages free.      

4

368 *      

5

369 * We wake up kswapd, in the hope that kswapd will      

6

370 * resolve this situation before memory gets tight.      

7

371 *      

8

372 * We also yield the CPU, because that:      

9

373 * - gives kswapd a chance to do something      

10

374 * - slows down allocations, in particular the      

11

375 * allocations from the fast allocator that's      

12

376 * causing the problems ...      

13

377 * - ... which minimises the impact the "bad guys"      

14

378 * have on the rest of the system      

15

379 * - if we don't have __GFP_IO set, kswapd may be      

16

380 * able to free some memory we can't free ourselves      

17

381 */      

18

382 wakeup_kswapd(0);      

19

383 if (gfp_mask & __GFP_WAIT) {      

20

384 __set_current_state(TASK_RUNNING);      

21

385 current->policy |= SCHED_YIELD;      

22

386 schedule();      

23

387 }      

24

388      

25

389 /*      

26

390 * After waking up kswapd, we try to allocate a page      

27

391 * from any zone which isn't critical yet.      

28

392 *      

29

393 * Kswapd should, in most situations, bring the situation      

30

394 * back to normal in no time.      

31

395 */      

32

396 page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);      

33

397 if (page)      

34

398 return page;      

35

399      

如果再次失敗,需要檢視是誰在要求配置設定核心頁面。如果是kswapd和kreclaimd,本身就是“記憶體配置設定工作者”,要求配置設定記憶體頁面的目的是執行公務,這比一般程序更重要 這些程序task_struct結構中的flags字段的PF_MEMALLOC标志位為1,一般程序為0 失敗的原因: 1)可配置設定頁面的數量太少; 2)頁面總量不少,但是要求的頁面塊無法滿足,此時往往有很多單個頁面在管理區的 inactive_clean_pages中,回收的話,有可能拼裝出較大的頁面塊 而在inactive_dirty_pages隊列中,把髒頁面的内容寫到交換裝置上或檔案中,可以使它們變成幹淨頁面 加以回收 __free_page()釋放頁面時,會把空閑頁面拼裝成盡可能大的頁面塊,是以在回收每一個頁面後都 調用一下rmqueue,看看是否滿足要求 在調用 page_launder()期間把目前程序的PF_MEMALLOC标志位設為1,使其有了“執行公務”時的特權 這是因為page_launder也會要求配置設定一些臨時性的工作頁面,若不把PF_MEMALLOC标志位設為1,就可能遞迴地陷入同樣的記憶體短缺困境

還是失敗,就喚醒kswapd,讓要求配置設定頁面的程序等待,由kswapd完成一輪運作後,再喚醒申請頁面的程序 如果申請單個頁面,通過goto語句轉回__alloc_pages開頭處的标号try_again處 另一種方法是直接調用try_to_free_pages,這個函數本來是由kswapd調用的

如果是“執行公務”,或者已經想盡了一切辦法,隻不過因為要求配置設定的是成塊頁面,是以才沒有轉回前面的 标号try_again處

1

400 /*      

2

401 * Damn, we didn't succeed.      

3

402 *      

4

403 * This can be due to 2 reasons:      

5

404 * - we're doing a higher-order allocation      

6

405 * --> move pages to the free list until we succeed      

7

406 * - we're /really/ tight on memory      

8

407 * --> wait on the kswapd waitqueue until memory is freed      

9

408 */      

10

409 if (!(current->flags & PF_MEMALLOC)) {      

11

410 /*      

12

411 * Are we dealing with a higher order allocation?      

13

412 *      

14

413 * Move pages from the inactive_clean to the free list      

15

414 * in the hope of creating a large, physically contiguous      

16

415 * piece of free memory.      

17

416 */      

18

417     if (order > 0 && (gfp_mask & __GFP_WAIT)) {      

19

418         zone = zonelist->zones;      

20

419         /* First, clean some dirty pages. */      

21

420         current->flags |= PF_MEMALLOC;      

22

421         page_launder(gfp_mask, 1);      

23

422         current->flags &= ~PF_MEMALLOC;      

24

423         for (;;) {      

25

424             zone_t *z = *(zone++);      

26

425             if (!z)      

27

426                 break;      

28

427             if (!z->size)      

29

428             continue;      

30

429             while (z->inactive_clean_pages) {      

31

430                 struct page * page;      

32

431                 /* Move one page to the free list. */      

33

432                 page = reclaim_page(z);      

34

433                 if (!page)      

35

434                     break;      

36

435                 __free_page(page);      

37

436                 /* Try if the allocation succeeds. */      

38

437                 page = rmqueue(z, order);      

39

438                 if (page)      

40

439                     return page;      

41

440             }      

42

441         }      

43

442     }      

44

443 /*      

45

444 * When we arrive here, we are really tight on memory.      

46

445 *      

47

446 * We wake up kswapd and sleep until kswapd wakes us      

48

447 * up again. After that we loop back to the start.      

49

448 *      

50

449 * We have to do this because something else might eat      

51

450 * the memory kswapd frees for us and we need to be      

52

451 * reliable. Note that we don't loop back for higher      

53

452 * order allocations since it is possible that kswapd      

54

453 * simply cannot free a large enough contiguous area      

55

454 * of memory *ever*.      

56

455 */      

57

456     if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {      

58

457         wakeup_kswapd(1);      

59

458         memory_pressure++;      

60

459         if (!order)      

61

460             goto try_again;      

62

461 /*      

63

462 * If __GFP_IO isn't set, we can't wait on kswapd because      

64

463 * kswapd just might need some IO locks /we/ are holding ...      

65

464 *      

66

465 * SUBTLE: The scheduling point above makes sure that      

67

466 * kswapd does get the chance to free memory we can't      

68

467 * free ourselves...      

69

468 */      

70

469      } else if (gfp_mask & __GFP_WAIT) {      

71

470             try_to_free_pages(gfp_mask);      

72

471             memory_pressure++;      

73

472             if (!order)      

74

473             goto try_again;      

75

474      }      

76

475      

77

476 }      

78

477      

前面使用 __alloc_pages_limit(),其實還有所保留 我們使用 PAGES_MIN為參數,此時判斷是否可以配置設定的準則是管理區中可配置設定頁面的“水位”高于 z->pages_min,是以還留着一些“老本”以應付緊急情況 現在已經到了“不惜血本”的時候了,繼續下面處理

1

478 /*      

2

479 * Final phase: allocate anything we can!      

3

480 *      

4

481 * Higher order allocations, GFP_ATOMIC allocations and      

5

482 * recursive allocations (PF_MEMALLOC) end up here.      

6

483 *      

7

484 * Only recursive allocations can use the very last pages      

8

485 * in the system, otherwise it would be just too easy to      

9

486 * deadlock the system...      

10

487 */      

11

488 zone = zonelist->zones;      

12

489 for (;;) {      

13

490     zone_t *z = *(zone++);      

14

491     struct page * page = NULL;      

15

492     if (!z)      

16

493         break;      

17

494     if (!z->size)      

18

495         BUG();      

19

496      

20

497 /*      

21

498 * SUBTLE: direct_reclaim is only possible if the task      

22

499 * becomes PF_MEMALLOC while looping above. This will      

23

500 * happen when the OOM killer selects this task for      

24

501 * instant execution...      

25

502 */      

26

503 if (direct_reclaim) {      

27

504     page = reclaim_page(z);      

28

505     if (page)      

29

506         return page;      

30

507 }      

31

508      

32

509 /* XXX: is pages_min/4 a good amount to reserve for this? */      

33

510 if (z->free_pages < z->pages_min / 4 &&      

34

511     !(current->flags & PF_MEMALLOC))      

35

512     continue;      

36

513     page = rmqueue(z, order);      

37

514 if (page)      

38

515     return page;      

39

516 }      

40

517      

41

518     /* No luck.. */      

42

519     printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);      

43

520     return NULL;      

44

521 }      

如果再次失敗,那就是系統有問題

2.2 rmqueue函數: 從指定的zone中,擷取2^order數量的頁面 zone中的free_area是按照order建立的數組,每個free_area裡面有多個free_list, 也可能為空, 也就是目前order中沒有空閑空間,為空時從order更大的free_area中申請 我們要在order對應的free_area中申請記憶體頁面,并将其從page鍊中摘出, 摘鍊的過程不能被打斷,需要 spin_lock_irqsave加鎖 rmqueue函數代碼(2.4版本),2.6以及之後的版本發生了修改(看page_alloc.c檔案) memlist_entry提供了需要的free_list的頭page結構, memlist_del幫我們把free_list從free_area中删除

1

172 static struct page * rmqueue(zone_t *zone, unsigned long order)      

2

173 {      

3

174     free_area_t * area = zone->free_area + order;      

4

175     unsigned long curr_order = order;      

5

176     struct list_head *head, *curr;      

6

177     unsigned long flags;      

7

178     struct page *page;      

8

179      

9

180     spin_lock_irqsave(&zone->lock, flags);      

10

181     do {      

11

182         head = &area->free_list;      

12

183         curr = memlist_next(head);      

13

184      

14

185         if (curr != head) {      

15

186             unsigned int index;      

16

187               

17

188             page = memlist_entry(curr, struct page, list);      

18

189             if (BAD_RANGE(zone,page))      

19

190                 BUG();      

20

191             memlist_del(curr);      

21

192             index = (page - mem_map) - zone->offset;      

22

193             MARK_USED(index, curr_order, area);      

23

194             zone->free_pages -= 1 << order;      

24

195             //用來分解大塊記憶體         

25

196             page = expand(zone, page, index, order, curr_order, area);      

26

197             spin_unlock_irqrestore(&zone->lock, flags);      

27

198      

28

199             set_page_count(page, 1);      

29

200             if (BAD_RANGE(zone,page))      

30

201                 BUG();      

31

202             DEBUG_ADD_PAGE      

32

203             return page;      

33

204         }      

34

205         curr_order++;      

35

206         area++;      

36

207     } while (curr_order < MAX_ORDER);      

37

208     spin_unlock_irqrestore(&zone->lock, flags);      

38

209      

39

210     return NULL;      

40

211 }      

2.3 expand函數 用來将order更大的free_area中free_list進行分割,并存入order更低的free_area high為更大的order,low申請的order,當将high中一個free_list分割到low的大小時,就停止

1

150 static inline struct page * expand (zone_t *zone, struct page *page,      

2

151 unsigned long index, int low, int high, free_area_t * area)      

3

152 {      

4

153     unsigned long size = 1 << high;      

5

154      

6

155     while (high > low) {      

7

156         if (BAD_RANGE(zone,page))      

8

157             BUG();      

9

158         area--;      

10

159         high--;      

11

160         size >>= 1;      

12

161         memlist_add_head(&(page)->list, &(area)->free_list);      

13

162         MARK_USED(index, high, area);      

14

163         index += size;      

15

164         page += size;      

16

165     }      

17

166     if (BAD_RANGE(zone,page))      

18

167         BUG();      

19

168     return page;      

20

169 }      

繼續閱讀