
2.7 Linux Memory Management: Physical Page Allocation

When a process needs contiguous physical pages, it obtains them through alloc_pages(). The function is defined twice, in mm/numa.c and in include/linux/mm.h; the NUMA and UMA versions never coexist, and which one gets compiled depends on whether CONFIG_DISCONTIGMEM is selected.

1. The NUMA allocation function, compiled when CONFIG_DISCONTIGMEM is selected. Note that the compile-time condition is really "discontiguous memory" rather than CONFIG_NUMA, although CONFIG_NUMA does change the code path. The prototype is alloc_pages(int gfp_mask, unsigned long order): gfp_mask selects the allocation strategy, and order asks for a block of 2^order pages.
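Before diving into the implementation, here is a minimal, hypothetical caller showing the interface. The helper names are made up for illustration; alloc_pages(), __free_pages() and GFP_KERNEL are the real 2.4-era API:

    #include <linux/mm.h>

    /* hypothetical helper: request 2^2 = 4 contiguous physical pages */
    static struct page *grab_four_pages(void)
    {
        struct page *pages = alloc_pages(GFP_KERNEL, 2);
        if (!pages)
            return NULL;    /* allocation can fail; callers must check */
        return pages;
    }

    /* hypothetical helper: release what grab_four_pages() returned */
    static void drop_four_pages(struct page *pages)
    {
        __free_pages(pages, 2);    /* the order must match the allocation */
    }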

The NUMA version of alloc_pages():

When CONFIG_NUMA is defined the search starts at the current node; otherwise the function walks pgdat_list, starting from a different node on each call (round robin) in the hope of balancing load across the nodes. On each node it tries alloc_pages_pgdat(). (A small user-space sketch of the wrap-around traversal follows the listing.)

    ==================== mm/numa.c 43 43 ====================
    43  #ifdef CONFIG_DISCONTIGMEM
    ==================== mm/numa.c 91 128 ====================
    91  /*
    92   * This can be refined. Currently, tries to do round robin, instead
    93   * should do concentratic circle search, starting from current node.
    94   */
    95  struct page * alloc_pages(int gfp_mask, unsigned long order)
    96  {
    97      struct page *ret = 0;
    98      pg_data_t *start, *temp;
    99  #ifndef CONFIG_NUMA
    100     unsigned long flags;
    101     static pg_data_t *next = 0;
    102 #endif
    103
    104     if (order >= MAX_ORDER)
    105         return NULL;
    106 #ifdef CONFIG_NUMA
    107     temp = NODE_DATA(numa_node_id());
    108 #else
    109     spin_lock_irqsave(&node_lock, flags);
    110     if (!next) next = pgdat_list;
    111     temp = next;
    112     next = next->node_next;
    113     spin_unlock_irqrestore(&node_lock, flags);
    114 #endif
    115     start = temp;
    116     while (temp) {
    117         if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
    118             return(ret);
    119         temp = temp->node_next;
    120     }
    121     temp = pgdat_list;
    122     while (temp != start) {
    123         if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
    124             return(ret);
    125         temp = temp->node_next;
    126     }
    127     return(0);
    128 }
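The two while loops implement the wrap-around: first from the chosen starting node to the end of the list, then from the head of the list up to (but not including) the starting node. A self-contained user-space sketch of just that traversal logic, using toy types rather than kernel code:

    #include <stdio.h>
    #include <stddef.h>

    /* toy stand-in for pg_data_t: a singly linked list of nodes */
    struct node {
        int id;
        int free_pages;
        struct node *next;
    };

    /* stand-in for alloc_pages_pgdat(): succeed if the node has pages */
    static struct node *try_node(struct node *n)
    {
        return n->free_pages > 0 ? n : NULL;
    }

    /* the same two-loop wrap-around as alloc_pages() above */
    static struct node *alloc_round_robin(struct node *head, struct node *start)
    {
        struct node *temp, *hit;

        for (temp = start; temp; temp = temp->next)
            if ((hit = try_node(temp)))
                return hit;
        for (temp = head; temp != start; temp = temp->next)
            if ((hit = try_node(temp)))
                return hit;
        return NULL;    /* every node failed */
    }

    int main(void)
    {
        struct node c = { 2, 5, NULL };
        struct node b = { 1, 0, &c };
        struct node a = { 0, 3, &b };

        /* start mid-list, as the static 'next' pointer would */
        struct node *hit = alloc_round_robin(&a, &b);
        printf("allocated on node %d\n", hit ? hit->id : -1);
        return 0;
    }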

1.2 The alloc_pages_pgdat() function: both the NUMA and the UMA path funnel into the same __alloc_pages(), which is examined in detail under the UMA case below. Note that gfp_mask is used directly as an index into the node's node_zonelists array.

    85  static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
    86      unsigned long order)
    87  {
    88      return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);
    89  }
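To make the indexing concrete, here is a toy user-space model (the names and contents are invented): each possible gfp_mask value selects a precomputed list of zones to try, so strategy selection costs a single pointer addition.

    #include <stdio.h>

    /* toy zonelist: just a description of the fallback order */
    typedef struct {
        const char *fallback;
    } zonelist_t;

    int main(void)
    {
        /* one precomputed zonelist per allocation strategy */
        zonelist_t node_zonelists[4] = {
            { "DMA" },
            { "Normal -> DMA" },
            { "HighMem -> Normal -> DMA" },
            { "Normal -> DMA" },
        };
        int gfp_mask = 2;    /* pretend the low bits encode the strategy */

        /* the same pointer arithmetic as alloc_pages_pgdat() */
        zonelist_t *zl = node_zonelists + gfp_mask;
        printf("zones to try: %s\n", zl->fallback);
        return 0;
    }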

2. The UMA allocation function, compiled when CONFIG_DISCONTIGMEM is not selected.

This version is compiled only when CONFIG_DISCONTIGMEM is undefined. Under UMA there is obviously only one pg_data_t node, contig_page_data, so no traversal is needed. The actual allocation work is done by __alloc_pages().

    343 #ifndef CONFIG_DISCONTIGMEM
    344 static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
    345 {
    346     /*
    347      * Gets optimized away by the compiler.
    348      */
    349     if (order >= MAX_ORDER)
    350         return NULL;
    351     return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);
    352 }

2.1 The __alloc_pages() function, where the actual allocation happens. zonelist is one of the zone lists of the contig_page_data node; suitable physical pages must be found within these zones. The zonelist_t structure encodes the concrete allocation strategy, i.e. gfp_mask, which is really a set of flag bits. memory_pressure measures the pressure on page management: it is incremented when pages are allocated and decremented when they are returned.

If a single page is requested, waiting is allowed, and the request does not come from memory management itself (PF_MEMALLOC clear), direct_reclaim is set to 1, meaning pages may be reclaimed directly from the zone's "inactive clean" list. Such pages are generally not linked into contiguous blocks, so they are offered to single-page requests, and their contents have already been written out to the swap device. When a page shortage is detected, the kswapd and bdflush threads are woken up to free space.

    270 /*
    271  * This is the 'heart' of the zoned buddy allocator:
    272  */
    273 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
    274 {
    275     zone_t **zone;
    276     int direct_reclaim = 0;
    277     unsigned int gfp_mask = zonelist->gfp_mask;
    278     struct page * page;
    279
    280 /*
    281  * Allocations put pressure on the VM subsystem.
    282  */
    283     memory_pressure++;
    284
    285 /*
    286  * (If anyone calls gfp from interrupts nonatomically then it
    287  * will sooner or later tripped up by a schedule().)
    288  *
    289  * We are falling back to lower-level zones if allocation
    290  * in a higher zone fails.
    291  */
    292
    293 /*
    294  * Can we take pages directly from the inactive_clean
    295  * list?
    296  */
        /* exactly one page is requested and waiting is allowed */
    297     if (order == 0 && (gfp_mask & __GFP_WAIT) &&
    298         !(current->flags & PF_MEMALLOC))
    299         direct_reclaim = 1;
    300
    301 /*
    302  * If we are about to get low on free pages and we also have
    303  * an inactive page shortage, wake up kswapd.
    304  */
    305     if (inactive_shortage() > inactive_target / 2 && free_shortage())
    306         wakeup_kswapd(0);
    307 /*
    308  * If we are about to get low on free pages and cleaning
    309  * the inactive_dirty pages would fix the situation,
    310  * wake up bdflush.
    311  */
    312     else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
    313         && nr_inactive_dirty_pages >= freepages.high)
    314         wakeup_bdflush(0);
    315

All the zones of the pg_data_t node are then traversed (in practice there are only three). rmqueue() tries to take a block of contiguous pages from a zone; when a zone runs low on free pages, kreclaimd is woken up to help reclaim pages.

    316 try_again:
    317 /*
    318  * First, see if we have any zones with lots of free memory.
    319  *
    320  * We allocate free memory first because it doesn't contain
    321  * any data ... DUH!
    322  */
    323     zone = zonelist->zones;
        /* loops until it hits the NULL that terminates the zonelist */
    324     for (;;) {
    325         zone_t *z = *(zone++);
    326         if (!z)
    327             break;
    328         if (!z->size)
    329             BUG();
    330
    331         if (z->free_pages >= z->pages_low) {
    332             page = rmqueue(z, order);    /* allocate from this zone */
    333             if (page)
    334                 return page;
    335         } else if (z->free_pages < z->pages_min &&
    336             waitqueue_active(&kreclaimd_wait)) {
    337             wake_up_interruptible(&kreclaimd_wait);
    338         }
    339     }

If all three zones fail, two relaxations are considered: 1) lower the "water level" each zone is required to maintain; 2) count the "inactive clean" pages cached in each zone as well. PAGES_HIGH and PAGES_LOW select these progressively looser criteria for __alloc_pages_limit(), which is then used to retry the allocation; if it fails again, memory really is short. (Kernels from 2.6 onward also introduce a migratetype parameter describing the migration type of pages, but that belongs to the later design.)

    341 /*
    342  * Try to allocate a page from a zone with a HIGH
    343  * amount of free + inactive_clean pages.
    344  *
    345  * If there is a lot of activity, inactive_target
    346  * will be high and we'll have a good chance of
    347  * finding a page using the HIGH limit.
    348  */
    349     page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
    350     if (page)
    351         return page;
    352
    353 /*
    354  * Then try to allocate a page from a zone with more
    355  * than zone->pages_low free + inactive_clean pages.
    356  *
    357  * When the working set is very large and VM activity
    358  * is low, we're most likely to have our allocation
    359  * succeed here.
    360  */
    361     page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
    362     if (page)
    363         return page;
    364

At this point pages are seriously short in every zone, so: 1) wake up kswapd to try to free some pages; if gfp_mask says the caller would rather wait than fail, let the system schedule once, yielding the CPU to other processes so that kswapd may run immediately; 2) other processes may release pages in the meantime, and the yield also slows down the rate of allocation requests. Finally __alloc_pages_limit() is run once more with PAGES_MIN, which of course may still fail.

    365 /*
    366  * OK, none of the zones on our zonelist has lots
    367  * of pages free.
    368  *
    369  * We wake up kswapd, in the hope that kswapd will
    370  * resolve this situation before memory gets tight.
    371  *
    372  * We also yield the CPU, because that:
    373  * - gives kswapd a chance to do something
    374  * - slows down allocations, in particular the
    375  *   allocations from the fast allocator that's
    376  *   causing the problems ...
    377  * - ... which minimises the impact the "bad guys"
    378  *   have on the rest of the system
    379  * - if we don't have __GFP_IO set, kswapd may be
    380  *   able to free some memory we can't free ourselves
    381  */
    382     wakeup_kswapd(0);
    383     if (gfp_mask & __GFP_WAIT) {
    384         __set_current_state(TASK_RUNNING);
    385         current->policy |= SCHED_YIELD;
    386         schedule();
    387     }
    388
    389 /*
    390  * After waking up kswapd, we try to allocate a page
    391  * from any zone which isn't critical yet.
    392  *
    393  * Kswapd should, in most situations, bring the situation
    394  * back to normal in no time.
    395  */
    396     page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
    397     if (page)
    398         return page;
    399

If that fails as well, it matters who is asking for the pages. kswapd and kreclaimd are themselves "memory allocation workers": they request pages in order to do their job, which makes them more important than ordinary processes. Such processes have the PF_MEMALLOC bit set in the flags field of their task_struct; ordinary processes have it clear. There are two possible reasons for the failure: 1) the number of allocatable pages is simply too low; 2) the total is not low, but no block of the requested size can be carved out. In the latter case there are often many single pages sitting on the zones' inactive_clean_pages lists; reclaiming them may allow larger blocks to be pieced together. Pages on the inactive_dirty_pages queue can be turned into clean, reclaimable pages by writing their contents out to the swap device or to files. Since __free_page() merges freed pages into the largest possible blocks, rmqueue() is retried after every reclaimed page to see whether the request can now be met. Around the call to page_launder() the current process's PF_MEMALLOC bit is set to 1, granting it the "on official business" privilege, because page_launder() itself may need to allocate some temporary working pages and would otherwise be refused at this level of shortage.

If it still fails, kswapd is woken again and the requesting process sleeps until kswapd has completed a round of work and wakes it up; for a single-page request, a goto then jumps back to the try_again label at the top of __alloc_pages(). The alternative path calls try_to_free_pages() directly, a function normally invoked by kswapd itself.

If the caller is "on official business" (PF_MEMALLOC set), or every other means has been exhausted, execution falls through; it is only because the request is for a multi-page block that there is no jump back to the try_again label.

    400 /*
    401  * Damn, we didn't succeed.
    402  *
    403  * This can be due to 2 reasons:
    404  * - we're doing a higher-order allocation
    405  *   --> move pages to the free list until we succeed
    406  * - we're /really/ tight on memory
    407  *   --> wait on the kswapd waitqueue until memory is freed
    408  */
    409     if (!(current->flags & PF_MEMALLOC)) {
    410 /*
    411  * Are we dealing with a higher order allocation?
    412  *
    413  * Move pages from the inactive_clean to the free list
    414  * in the hope of creating a large, physically contiguous
    415  * piece of free memory.
    416  */
    417     if (order > 0 && (gfp_mask & __GFP_WAIT)) {
    418         zone = zonelist->zones;
    419         /* First, clean some dirty pages. */
    420         current->flags |= PF_MEMALLOC;
    421         page_launder(gfp_mask, 1);
    422         current->flags &= ~PF_MEMALLOC;
    423         for (;;) {
    424             zone_t *z = *(zone++);
    425             if (!z)
    426                 break;
    427             if (!z->size)
    428                 continue;
    429             while (z->inactive_clean_pages) {
    430                 struct page * page;
    431                 /* Move one page to the free list. */
    432                 page = reclaim_page(z);
    433                 if (!page)
    434                     break;
    435                 __free_page(page);
    436                 /* Try if the allocation succeeds. */
    437                 page = rmqueue(z, order);
    438                 if (page)
    439                     return page;
    440             }
    441         }
    442     }
    443 /*
    444  * When we arrive here, we are really tight on memory.
    445  *
    446  * We wake up kswapd and sleep until kswapd wakes us
    447  * up again. After that we loop back to the start.
    448  *
    449  * We have to do this because something else might eat
    450  * the memory kswapd frees for us and we need to be
    451  * reliable. Note that we don't loop back for higher
    452  * order allocations since it is possible that kswapd
    453  * simply cannot free a large enough contiguous area
    454  * of memory *ever*.
    455  */
    456     if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
    457         wakeup_kswapd(1);
    458         memory_pressure++;
    459         if (!order)
    460             goto try_again;
    461 /*
    462  * If __GFP_IO isn't set, we can't wait on kswapd because
    463  * kswapd just might need some IO locks /we/ are holding ...
    464  *
    465  * SUBTLE: The scheduling point above makes sure that
    466  * kswapd does get the chance to free memory we can't
    467  * free ourselves...
    468  */
    469     } else if (gfp_mask & __GFP_WAIT) {
    470         try_to_free_pages(gfp_mask);
    471         memory_pressure++;
    472         if (!order)
    473             goto try_again;
    474     }
    475
    476     }
    477

The earlier calls to __alloc_pages_limit() were still holding something back: even with PAGES_MIN the criterion was that a zone's free-page "water level" stay above z->pages_min, so a small reserve was kept untouched for emergencies. The emergency has now arrived, and it is time to spend that reserve in the final phase below.

    478 /*
    479  * Final phase: allocate anything we can!
    480  *
    481  * Higher order allocations, GFP_ATOMIC allocations and
    482  * recursive allocations (PF_MEMALLOC) end up here.
    483  *
    484  * Only recursive allocations can use the very last pages
    485  * in the system, otherwise it would be just too easy to
    486  * deadlock the system...
    487  */
    488     zone = zonelist->zones;
    489     for (;;) {
    490         zone_t *z = *(zone++);
    491         struct page * page = NULL;
    492         if (!z)
    493             break;
    494         if (!z->size)
    495             BUG();
    496
    497 /*
    498  * SUBTLE: direct_reclaim is only possible if the task
    499  * becomes PF_MEMALLOC while looping above. This will
    500  * happen when the OOM killer selects this task for
    501  * instant execution...
    502  */
    503         if (direct_reclaim) {
    504             page = reclaim_page(z);
    505             if (page)
    506                 return page;
    507         }
    508
    509         /* XXX: is pages_min/4 a good amount to reserve for this? */
    510         if (z->free_pages < z->pages_min / 4 &&
    511             !(current->flags & PF_MEMALLOC))
    512             continue;
    513         page = rmqueue(z, order);
    514         if (page)
    515             return page;
    516     }
    517
    518     /* No luck.. */
    519     printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
    520     return NULL;
    521 }

If even this fails, something is genuinely wrong with the system. The whole fallback ladder is summarized by the sketch below.
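To recap the ladder in one place, here is a deliberately simplified, self-contained user-space model with toy types and numbers. The real code distinguishes free pages from free + inactive_clean pages and interleaves the reclaim steps described above; this sketch collapses everything into a single per-zone count with a floor that drops pass by pass:

    #include <stdio.h>

    /* toy zone with the three 2.4-style watermarks */
    struct zone {
        const char *name;
        int free_pages;
        int pages_min, pages_low, pages_high;
    };

    /* succeed only if the zone would stay above 'floor' afterwards */
    static int take(struct zone *z, int n, int floor)
    {
        if (z->free_pages - n < floor)
            return 0;
        z->free_pages -= n;
        printf("floor %2d: took %d pages from %s\n", floor, n, z->name);
        return 1;
    }

    /* the floor drops pass by pass, mimicking the PAGES_HIGH ->
     * PAGES_LOW -> PAGES_MIN -> pages_min/4 sequence of __alloc_pages() */
    static int alloc_with_fallback(struct zone *zones, int nz, int n)
    {
        int pass, i;

        for (pass = 0; pass < 4; pass++) {
            for (i = 0; i < nz; i++) {
                struct zone *z = &zones[i];
                int floor = pass == 0 ? z->pages_high :
                            pass == 1 ? z->pages_low  :
                            pass == 2 ? z->pages_min  :
                                        z->pages_min / 4;  /* last resort */
                if (take(z, n, floor))
                    return 1;
            }
            /* the real kernel wakes kswapd/bdflush and may
             * schedule() between passes to let them run */
        }
        return 0;
    }

    int main(void)
    {
        struct zone zones[2] = {
            { "DMA",    18, 8, 16, 24 },
            { "Normal", 20, 8, 16, 24 },
        };
        if (!alloc_with_fallback(zones, 2, 10))
            printf("allocation failed\n");
        return 0;
    }

With these numbers the first two passes fail in both zones, and the PAGES_MIN pass finally succeeds in the DMA zone, which is exactly the "spend the reserve" behaviour described above.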

2.2 The rmqueue() function: takes 2^order pages from the given zone. zone->free_area is an array indexed by order; each element carries a free_list of blocks of that size, which may be empty (no free block of that order), in which case the search moves up to a larger order. The chosen block must be unlinked from its list, and the unlinking must not be interrupted, so spin_lock_irqsave() is taken. The code below is the 2.4 version; it changed in 2.6 and later (see mm/page_alloc.c). memlist_entry() yields the struct page at the head of the free list, and memlist_del() removes it from the free_area.

    172 static struct page * rmqueue(zone_t *zone, unsigned long order)
    173 {
    174     free_area_t * area = zone->free_area + order;
    175     unsigned long curr_order = order;
    176     struct list_head *head, *curr;
    177     unsigned long flags;
    178     struct page *page;
    179
    180     spin_lock_irqsave(&zone->lock, flags);
    181     do {
    182         head = &area->free_list;
    183         curr = memlist_next(head);
    184
    185         if (curr != head) {
    186             unsigned int index;
    187
    188             page = memlist_entry(curr, struct page, list);
    189             if (BAD_RANGE(zone,page))
    190                 BUG();
    191             memlist_del(curr);
    192             index = (page - mem_map) - zone->offset;
    193             MARK_USED(index, curr_order, area);
    194             zone->free_pages -= 1 << order;
    195             /* split a larger block down to the requested order */
    196             page = expand(zone, page, index, order, curr_order, area);
    197             spin_unlock_irqrestore(&zone->lock, flags);
    198
    199             set_page_count(page, 1);
    200             if (BAD_RANGE(zone,page))
    201                 BUG();
    202             DEBUG_ADD_PAGE
    203             return page;
    204         }
    205         curr_order++;
    206         area++;
    207     } while (curr_order < MAX_ORDER);
    208     spin_unlock_irqrestore(&zone->lock, flags);
    209
    210     return NULL;
    211 }

2.3 The expand() function: splits a block taken from a higher-order free_area and returns the unused halves to lower-order free_areas. high is the order of the block that was found, low the order actually requested; splitting stops once the block has been cut down to 2^low pages. (A self-contained user-space model of rmqueue() plus expand() follows the listing.)

    150 static inline struct page * expand (zone_t *zone, struct page *page,
    151     unsigned long index, int low, int high, free_area_t * area)
    152 {
    153     unsigned long size = 1 << high;
    154
    155     while (high > low) {
    156         if (BAD_RANGE(zone,page))
    157             BUG();
    158         area--;
    159         high--;
    160         size >>= 1;
    161         memlist_add_head(&(page)->list, &(area)->free_list);
    162         MARK_USED(index, high, area);
    163         index += size;
    164         page += size;
    165     }
    166     if (BAD_RANGE(zone,page))
    167         BUG();
    168     return page;
    169 }
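As a sanity check of the splitting logic, here is a self-contained user-space model of the rmqueue()/expand() pair, with toy structures and no locking or MARK_USED bitmap: it finds the smallest order with a free block, unlinks it, pushes the front halves back onto lower-order lists, and hands out the tail, just as expand() above does.

    #include <stdio.h>

    #define MAX_ORDER 5

    /* toy free block: the index of its first page, singly linked */
    struct block { unsigned long start; struct block *next; };

    static struct block pool[64];               /* crude block storage */
    static int pool_used;
    static struct block *free_area[MAX_ORDER];  /* one free list per order */

    static void push(int order, unsigned long start)
    {
        struct block *b = &pool[pool_used++];
        b->start = start;
        b->next = free_area[order];
        free_area[order] = b;
    }

    /* expand(): cut a 2^high block down to 2^low pages, pushing the
     * front half onto the next lower list at each step and keeping
     * the tail, like the kernel version above */
    static unsigned long expand_block(unsigned long start, int low, int high)
    {
        unsigned long size = 1UL << high;

        while (high > low) {
            high--;
            size >>= 1;
            push(high, start);   /* first half back on a free list */
            start += size;       /* continue splitting the second half */
        }
        return start;
    }

    /* rmqueue(): search upward for the first non-empty free list */
    static long alloc_order(int order)
    {
        int cur;

        for (cur = order; cur < MAX_ORDER; cur++) {
            struct block *b = free_area[cur];
            if (!b)
                continue;
            free_area[cur] = b->next;    /* unlink, like memlist_del() */
            return (long)expand_block(b->start, order, cur);
        }
        return -1;    /* no block large enough */
    }

    int main(void)
    {
        push(4, 0);    /* seed: one free block of 2^4 = 16 pages at page 0 */
        printf("2 pages at page %ld\n", alloc_order(1));   /* -> 14 */
        printf("2 pages at page %ld\n", alloc_order(1));   /* -> 12 */
        return 0;
    }

The first request splits the 16-page block into free blocks of 8, 4 and 2 pages and returns the final 2-page tail; the second request is then satisfied straight from the order-1 list without any further splitting.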
