程序需要連續的頁面時,通過alloc_pages來完成分配。該函數在mm/numa.c和mm/page_alloc.h中都有定義。NUMA和UMA的記憶體分配函數不會并存,根據是否勾選CONFIG_DISCONTIGMEM選擇其中一個。
1. NUMA的記憶體分配函數: 選擇CONFIG_DISCONTIGMEM選項。被編譯的條件是“不連續的存儲空間”(CONFIG_DISCONTIGMEM),而不是CONFIG_NUMA,但是CONFIG_NUMA會影響函數内部的代碼。 alloc_pages(int gfp_mask,unsigned long order) gfp_mask:表示采用哪一種分配政策 order:申請2^order個頁面
numa的alloc_pages的代碼:
如果定義了CONFIG_NUMA,就從NODE_DATA(numa_node_id())給出的節點開始;如果沒有定義CONFIG_NUMA,則在node_lock的保護下從pgdat_list中輪流選取起始節點,希望各節點負載均衡。之後從起始節點沿node_next遍歷到鏈表尾,再從pgdat_list回繞到起始節點,在每個節點上調用alloc_pages_pgdat函數,直到分配成功為止
1
==================== mm/numa.c 43 43 ====================
2
43 #ifdef CONFIG_DISCONTIGMEM
3
==================== mm/numa.c 91 128 ====================
4
91 /*
5
92 * This can be refined. Currently, tries to do round robin, instead
6
93 * should do concentratic circle search, starting from current node.
7
94 */
8
95 struct page * alloc_pages(int gfp_mask, unsigned long order)
9
96 {
10
97 struct page *ret = 0;
11
98 pg_data_t *start, *temp;
12
99 #ifndef CONFIG_NUMA
13
100 unsigned long flags;
14
101 static pg_data_t *next = 0;
15
102 #endif
16
103
17
104 if (order >= MAX_ORDER)
18
105 return NULL;
19
106 #ifdef CONFIG_NUMA
20
107 temp = NODE_DATA(numa_node_id());
21
108 #else
22
109 spin_lock_irqsave(&node_lock, flags);
23
110 if (!next) next = pgdat_list;
24
111 temp = next;
25
112 next = next->node_next;
26
113 spin_unlock_irqrestore(&node_lock, flags);
27
114 #endif
28
115 start = temp;
29
116 while (temp) {
30
117 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
31
118 return(ret);
32
119 temp = temp->node_next;
33
120 }
34
121 temp = pgdat_list;
35
122 while (temp != start) {
36
123 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
37
124 return(ret);
38
125 temp = temp->node_next;
39
126 }
40
127 return(0);
41
128 }
1.2 alloc_pages_pgdat函數: NUMA和UMA機制下最終都通過相同的函數__alloc_pages完成分配,在UMA部分再做詳細介紹。gfp_mask相當于node_zonelists數組的下标
1
85 static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
2
86 unsigned long order)
3
87 {
4
88 return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);
5
89 }
2. UMA的配置設定記憶體函數: 不選擇CONFIG_DISCONTIGMEM選項
該函數隻有在CONFIG_DISCONTIGMEM無定義時才編譯。很明顯在UMA結構下隻有一個pg_data_t節點,也就是contig_page_data,所以無需遍歷。具體的記憶體分配過程由__alloc_pages()完成
1
343 #ifndef CONFIG_DISCONTIGMEM
2
344 static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
3
345 {
4
346 /*
5
347 * Gets optimized away by the compiler.
6
348 */
7
349 if (order >= MAX_ORDER)
8
350 return NULL;
9
351 return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);
10
352 }
2.1 __alloc_pages函數: 該函數完成了記憶體的具體分配。zonelist是contig_page_data節點中的zone清單,要在這些zone裡面找到合适的實體頁面進行分配。zonelist_t結構中存放了具體的記憶體分配政策,也就是gfp_mask,其實是一些标志位。memory_pressure表示頁面管理所受的壓力,分配記憶體頁面時增加,歸還記憶體時減少
申請的頁面數為1(order為0),而且允許等待完成(__GFP_WAIT)、不用于管理的目的(目前程序沒有設定PF_MEMALLOC),則我們将direct_reclaim設定為1,表示可以從相應頁面管理區的“不活躍幹淨頁面”(inactive_clean)隊列中回收。一般而言,這些頁面都不是連接成塊的,所以隻提供給單頁面請求使用,而且這些頁面的内容已經寫出到了交換裝置中(swap分區)。當發現頁面短缺時,則需要喚醒kswapd或bdflush線程,騰出空間
1
270 /*
2
271 * This is the 'heart' of the zoned buddy allocator:
3
272 */
4
273 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
5
274 {
6
275 zone_t **zone;
7
276 int direct_reclaim = 0;
8
277 unsigned int gfp_mask = zonelist->gfp_mask;
9
278 struct page * page;
10
279
11
280 /*
12
281 * Allocations put pressure on the VM subsystem.
13
282 */
14
283 memory_pressure++;
15
284
16
285 /*
17
286 * (If anyone calls gfp from interrupts nonatomically then it
18
287 * will sooner or later tripped up by a schedule().)
19
288 *
20
289 * We are falling back to lower-level zones if allocation
21
290 * in a higher zone fails.
22
291 */
23
292
24
293 /*
25
294 * Can we take pages directly from the inactive_clean
26
295 * list?
27
296 */
28
//申請的記憶體空間為1頁,且允許等待
29
297 if (order == 0 && (gfp_mask & __GFP_WAIT) &&
30
298 !(current->flags & PF_MEMALLOC))
31
299 direct_reclaim = 1;
32
300
33
301 /*
34
302 * If we are about to get low on free pages and we also have
35
303 * an inactive page shortage, wake up kswapd.
36
84
37
304 */
38
305 if (inactive_shortage() > inactive_target / 2 && free_shortage())
39
306 wakeup_kswapd(0);
40
307 /*
41
308 * If we are about to get low on free pages and cleaning
42
309 * the inactive_dirty pages would fix the situation,
43
310 * wake up bdflush.
44
311 */
45
312 else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
46
313 && nr_inactive_dirty_pages >= freepages.high)
47
314 wakeup_bdflush(0);
48
315
對pg_data_t節點的zonelist中的所有zone進行遍歷(其實隻有三個zone)。rmqueue從管理區中取得若幹連續的記憶體頁;當某個zone的空閑頁面低于pages_min時,喚醒kreclaimd,讓其幫助回收頁面
1
316 try_again:
2
317 /*
3
318 * First, see if we have any zones with lots of free memory.
4
319 *
5
320 * We allocate free memory first because it doesn't contain
6
321 * any data ... DUH!
7
322 */
8
323 zone = zonelist->zones;
9
//死循環
10
324 for (;;) {
11
325 zone_t *z = *(zone++);
12
326 if (!z)
13
327 break;
14
328 if (!z->size)
15
329 BUG();
16
330
17
331 if (z->free_pages >= z->pages_low) {
18
332 page = rmqueue(z, order); //配置設定記憶體
19
333 if (page)
20
334 return page;
21
335 } else if (z->free_pages < z->pages_min &&
22
336 waitqueue_active(&kreclaimd_wait)) {
23
337 wake_up_interruptible(&kreclaimd_wait);
24
338 }
25
339 }
假如三個zone都失敗,要考慮下面的事: 1)降低對頁面管理區“保持水位”的要求; 2)把緩存在管理區中的“不活躍幹淨頁面”考慮進去。PAGES_HIGH和PAGES_LOW其實表示了不同的水位要求,先後以它們為參數調用__alloc_pages_limit申請記憶體;再次失敗,就說明記憶體真的短缺了。2.6之後的核心引入了新的參數(migratetype),用來表示遷移類型(數值越小說明記憶體越緊張)
1
341 /*
2
342 * Try to allocate a page from a zone with a HIGH
3
343 * amount of free + inactive_clean pages.
4
344 *
5
345 * If there is a lot of activity, inactive_target
6
346 * will be high and we'll have a good chance of
7
347 * finding a page using the HIGH limit.
8
348 */
9
349 page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
10
350 if (page)
11
351 return page;
12
352
13
353 /*
14
354 * Then try to allocate a page from a zone with more
15
355 * than zone->pages_low free + inactive_clean pages.
16
356 *
17
357 * When the working set is very large and VM activity
18
358 * is low, we're most likely to have our allocation
19
359 * succeed here.
20
360 */
21
361 page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
22
362 if (page)
23
363 return page;
24
364
zone中的頁面非常短缺: 1)喚醒核心線程kswapd,讓其設法換出一些頁面;如果gfp_mask表明甯可等待也要申請到記憶體(__GFP_WAIT),那就讓系統進行一次排程,并讓目前程序為其他程序讓路,這樣kswapd可能會立即執行; 2)其他程序可能會釋放一些頁面,同時也減緩了要求分配頁面的速度。最後以PAGES_MIN參數再次執行__alloc_pages_limit,當然還是可能會失敗
1
365 /*
2
366 * OK, none of the zones on our zonelist has lots
3
367 * of pages free.
4
368 *
5
369 * We wake up kswapd, in the hope that kswapd will
6
370 * resolve this situation before memory gets tight.
7
371 *
8
372 * We also yield the CPU, because that:
9
373 * - gives kswapd a chance to do something
10
374 * - slows down allocations, in particular the
11
375 * allocations from the fast allocator that's
12
376 * causing the problems ...
13
377 * - ... which minimises the impact the "bad guys"
14
378 * have on the rest of the system
15
379 * - if we don't have __GFP_IO set, kswapd may be
16
380 * able to free some memory we can't free ourselves
17
381 */
18
382 wakeup_kswapd(0);
19
383 if (gfp_mask & __GFP_WAIT) {
20
384 __set_current_state(TASK_RUNNING);
21
385 current->policy |= SCHED_YIELD;
22
386 schedule();
23
387 }
24
388
25
389 /*
26
390 * After waking up kswapd, we try to allocate a page
27
391 * from any zone which isn't critical yet.
28
392 *
29
393 * Kswapd should, in most situations, bring the situation
30
394 * back to normal in no time.
31
395 */
32
396 page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
33
397 if (page)
34
398 return page;
35
399
如果再次失敗,需要檢視是誰在要求分配記憶體頁面。如果是kswapd和kreclaimd,本身就是“記憶體分配工作者”,要求分配記憶體頁面的目的是執行公務,這比一般程序更重要。這些程序task_struct結構中的flags字段的PF_MEMALLOC标志位為1,一般程序為0。失敗的原因: 1)可分配頁面的數量太少; 2)頁面總量不少,但是要求的頁面塊無法滿足,此時往往有很多單個頁面在管理區的inactive_clean_pages隊列中,回收的話,有可能拼裝出較大的頁面塊;還有一些頁面在inactive_dirty_pages隊列中,把髒頁面的内容寫到交換裝置上或檔案中,可以使它們變成幹淨頁面加以回收。__free_page()釋放頁面時,會把空閑頁面拼裝成盡可能大的頁面塊,所以在回收每一個頁面後都調用一下rmqueue,看看是否滿足要求。在調用page_launder()期間把目前程序的PF_MEMALLOC标志位設為1,使其有了“執行公務”時的特權,這是因為page_launder也會要求分配一些臨時性的工作頁面,不把PF_MEMALLOC标志位設為1,就可能遞歸地再次進入這段回收代碼
還是失敗,就喚醒kswapd,要求分配頁面的程序等待,由kswapd完成一輪運作後,再喚醒申請頁面的程序。如果申請的是單個頁面,就通過goto語句轉到__alloc_pages開頭處的标号try_again處。另一種方法(沒有設定__GFP_IO時)是直接調用try_to_free_pages,這個函數本來是kswapd調用的
如果是“執行公務”,或者雖然已經想盡一切辦法,隻不過因為要求分配的是成塊頁面,所以才沒有轉回前面的标号try_again處,就繼續往下執行
1
400 /*
2
401 * Damn, we didn't succeed.
3
402 *
4
403 * This can be due to 2 reasons:
5
404 * - we're doing a higher-order allocation
6
405 * --> move pages to the free list until we succeed
7
406 * - we're /really/ tight on memory
8
407 * --> wait on the kswapd waitqueue until memory is freed
9
408 */
10
409 if (!(current->flags & PF_MEMALLOC)) {
11
410 /*
12
411 * Are we dealing with a higher order allocation?
13
412 *
14
413 * Move pages from the inactive_clean to the free list
15
414 * in the hope of creating a large, physically contiguous
16
415 * piece of free memory.
17
416 */
18
417 if (order > 0 && (gfp_mask & __GFP_WAIT)) {
19
418 zone = zonelist->zones;
20
419 /* First, clean some dirty pages. */
21
420 current->flags |= PF_MEMALLOC;
22
421 page_launder(gfp_mask, 1);
23
422 current->flags &= ~PF_MEMALLOC;
24
423 for (;;) {
25
424 zone_t *z = *(zone++);
26
425 if (!z)
27
426 break;
28
427 if (!z->size)
29
428 continue;
30
429 while (z->inactive_clean_pages) {
31
430 struct page * page;
32
431 /* Move one page to the free list. */
33
432 page = reclaim_page(z);
34
433 if (!page)
35
434 break;
36
435 __free_page(page);
37
436 /* Try if the allocation succeeds. */
38
437 page = rmqueue(z, order);
39
438 if (page)
40
439 return page;
41
440 }
42
441 }
43
442 }
44
443 /*
45
444 * When we arrive here, we are really tight on memory.
46
445 *
47
446 * We wake up kswapd and sleep until kswapd wakes us
48
447 * up again. After that we loop back to the start.
49
448 *
50
449 * We have to do this because something else might eat
51
450 * the memory kswapd frees for us and we need to be
52
451 * reliable. Note that we don't loop back for higher
53
452 * order allocations since it is possible that kswapd
54
453 * simply cannot free a large enough contiguous area
55
454 * of memory *ever*.
56
455 */
57
456 if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
58
457 wakeup_kswapd(1);
59
458 memory_pressure++;
60
459 if (!order)
61
460 goto try_again;
62
461 /*
63
462 * If __GFP_IO isn't set, we can't wait on kswapd because
64
463 * kswapd just might need some IO locks /we/ are holding ...
65
464 *
66
465 * SUBTLE: The scheduling point above makes sure that
67
466 * kswapd does get the chance to free memory we can't
68
467 * free ourselves...
69
468 */
70
469 } else if (gfp_mask & __GFP_WAIT) {
71
470 try_to_free_pages(gfp_mask);
72
471 memory_pressure++;
73
472 if (!order)
74
473 goto try_again;
75
474 }
76
475
77
476 }
78
477
前面使用__alloc_pages_limit()時,其實還有所保留。我們使用PAGES_MIN為參數,此時判斷是否可以分配的準則是管理區中可分配頁面的“水位”高于z->pages_min,所以還留着一些“老本”,為的是應付緊急情況。現在已經到了“不惜血本”的時候了,繼續下面的處理
1
478 /*
2
479 * Final phase: allocate anything we can!
3
480 *
4
481 * Higher order allocations, GFP_ATOMIC allocations and
5
482 * recursive allocations (PF_MEMALLOC) end up here.
6
483 *
7
484 * Only recursive allocations can use the very last pages
8
485 * in the system, otherwise it would be just too easy to
9
486 * deadlock the system...
10
487 */
11
488 zone = zonelist->zones;
12
489 for (;;) {
13
490 zone_t *z = *(zone++);
14
491 struct page * page = NULL;
15
492 if (!z)
16
493 break;
17
494 if (!z->size)
18
495 BUG();
19
496
20
497 /*
21
498 * SUBTLE: direct_reclaim is only possible if the task
22
499 * becomes PF_MEMALLOC while looping above. This will
23
500 * happen when the OOM killer selects this task for
24
501 * instant execution...
25
502 */
26
503 if (direct_reclaim) {
27
504 page = reclaim_page(z);
28
505 if (page)
29
506 return page;
30
507 }
31
508
32
509 /* XXX: is pages_min/4 a good amount to reserve for this? */
33
510 if (z->free_pages < z->pages_min / 4 &&
34
511 !(current->flags & PF_MEMALLOC))
35
512 continue;
36
513 page = rmqueue(z, order);
37
514 if (page)
38
515 return page;
39
516 }
40
517
41
518 /* No luck.. */
42
519 printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
43
520 return NULL;
44
521 }
如果再次失敗,那就是系統有問題
2.2 rmqueue函數: 從指定的zone中,取得2^order數量的頁面。zone中的free_area是按照order建立的數組,每個free_area裡面有一個free_list隊列,隊列也可能為空,也就是目前order中沒有空閑頁面塊,為空時從order更大的free_area中申請。我們要在order對應的free_area中申請記憶體頁面,并将其從free_list鏈中摘出,摘鏈的過程不能被打斷,需要spin_lock_irqsave加鎖。rmqueue函數代碼(2.4版本),2.6以及之後的版本發生了修改(看page_alloc.c檔案)。memlist_entry由隊列節點得到對應的page結構,memlist_del幫我們把該頁面塊從free_list中删除
1
172 static struct page * rmqueue(zone_t *zone, unsigned long order)
2
173 {
3
174 free_area_t * area = zone->free_area + order;
4
175 unsigned long curr_order = order;
5
176 struct list_head *head, *curr;
6
177 unsigned long flags;
7
178 struct page *page;
8
179
9
180 spin_lock_irqsave(&zone->lock, flags);
10
181 do {
11
182 head = &area->free_list;
12
183 curr = memlist_next(head);
13
184
14
185 if (curr != head) {
15
186 unsigned int index;
16
187
17
188 page = memlist_entry(curr, struct page, list);
18
189 if (BAD_RANGE(zone,page))
19
190 BUG();
20
191 memlist_del(curr);
21
192 index = (page - mem_map) - zone->offset;
22
193 MARK_USED(index, curr_order, area);
23
194 zone->free_pages -= 1 << order;
24
195 //用來分解大塊記憶體
25
196 page = expand(zone, page, index, order, curr_order, area);
26
197 spin_unlock_irqrestore(&zone->lock, flags);
27
198
28
199 set_page_count(page, 1);
29
200 if (BAD_RANGE(zone,page))
30
201 BUG();
31
202 DEBUG_ADD_PAGE
32
203 return page;
33
204 }
34
205 curr_order++;
35
206 area++;
36
207 } while (curr_order < MAX_ORDER);
37
208 spin_unlock_irqrestore(&zone->lock, flags);
38
209
39
210 return NULL;
40
211 }
2.3 expand函數: 用來将從order更大的free_area中取得的頁面塊進行分割,并把分割出的多餘部分存入order更低的free_area的free_list。high為實際取得的頁面塊的order,low為申請的order,當把頁面塊分割到low的大小時,就停止
1
150 static inline struct page * expand (zone_t *zone, struct page *page,
2
151 unsigned long index, int low, int high, free_area_t * area)
3
152 {
4
153 unsigned long size = 1 << high;
5
154
6
155 while (high > low) {
7
156 if (BAD_RANGE(zone,page))
8
157 BUG();
9
158 area--;
10
159 high--;
11
160 size >>= 1;
12
161 memlist_add_head(&(page)->list, &(area)->free_list);
13
162 MARK_USED(index, high, area);
14
163 index += size;
15
164 page += size;
16
165 }
17
166 if (BAD_RANGE(zone,page))
18
167 BUG();
19
168 return page;
20
169 }