
Analysis of the Linux Kernel Memory-Management Function alloc_pages()

Author: 核心中文社群 (Kernel Chinese Community)
http://blog.chinaunix.net/uid-20729583-id-1884604.html

/*
 * The alloc_pages(gfp_mask, order) macro below requests 2^order
 * contiguous page frames.
 */
#define alloc_pages(gfp_mask, order) \
                 alloc_pages_node(numa_node_id(), gfp_mask, order)  
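For orientation, here is a minimal usage sketch (hypothetical module code; alloc_pages(), page_address() and __free_pages() are the standard page-level interface of this kernel generation):

 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/string.h>

 static void demo_alloc(void)
 {
         /* request 2^2 = 4 physically contiguous page frames */
         struct page *page = alloc_pages(GFP_KERNEL, 2);
         if (!page)
                 return;                               /* allocation can fail */

         memset(page_address(page), 0, 4 * PAGE_SIZE); /* lowmem pages are directly mapped */

         __free_pages(page, 2);                        /* order must match the allocation */
 }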


 #define numa_node_id()          (cpu_to_node(raw_smp_processor_id()))
 /* Returns the number of the node containing CPU 'cpu' */
 static inline int cpu_to_node(int cpu)                                                                                 
 {
         return cpu_2_node[cpu];
 }
 int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; /* maps each CPU to its node.
 The { [0 ... NR_CPUS-1] = 0 } form is GCC's range-designator array initializer. __read_mostly is
 a GCC section attribute: rarely-written variables are grouped together at link time, which
 improves cache behaviour on multi-CPU systems. */
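A small userspace sketch of the same range-designator extension (compiles with GCC):

 #include <stdio.h>

 int main(void)
 {
         int tbl[8] = { [0 ... 7] = -1 };    /* every element initialized to -1 */
         printf("%d %d\n", tbl[0], tbl[7]);  /* prints: -1 -1 */
         return 0;
 }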




/* The page-allocation helper. It is fairly involved and touches a number of
   related subsystems, notably the process-management side. */
 static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                                                 unsigned int order)
 {
         if (unlikely(order >= MAX_ORDER))  /* If the request exceeds the largest order the buddy
                 allocator supports, fail immediately. MAX_ORDER is 11, so the largest allocation
                 is 2^10 = 1024 pages; any order above 10 returns NULL. */
                 return NULL;    /* with 4 KB pages that caps a single allocation at 1024 pages = 4 MB */
 
         /* Unknown node is current node */
         if (nid < 0)
                 nid = numa_node_id(); /* expands to cpu_to_node(raw_smp_processor_id());
                                          on this single-CPU, single-node setup the result is 0 */
 
         return __alloc_pages(gfp_mask, order,
                 NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask)); /* This call is the entry
                 into the buddy allocator proper. node_zonelists is an array of struct zonelist,
                 indexed by the zone type that gfp_zone() derives from gfp_mask: ZONE_DMA = 0,
                 ZONE_NORMAL = 1, ZONE_HIGHMEM = 2. So a return value of 0 allocates from ZONE_DMA,
                 1 from ZONE_NORMAL, and 2 from ZONE_HIGHMEM. */
/*
   NODE_DATA(nid) resolves through:
       struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
   MAX_NUMNODES is 1 here, i.e. only a single node is defined.
*/
}
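To make the zone selection concrete, here is a minimal sketch of the gfp_zone() idea (hypothetical SKETCH_* names and flag values, not the kernel's actual implementation): the modifier bits of gfp_mask pick the most-preferred zone.

 /* a minimal sketch, assuming two modifier bits in the mask */
 enum { SKETCH_ZONE_DMA = 0, SKETCH_ZONE_NORMAL = 1, SKETCH_ZONE_HIGHMEM = 2 };

 #define SKETCH_GFP_DMA      0x01u   /* stand-in for __GFP_DMA */
 #define SKETCH_GFP_HIGHMEM  0x02u   /* stand-in for __GFP_HIGHMEM */

 static int sketch_gfp_zone(unsigned int gfp_mask)
 {
         if (gfp_mask & SKETCH_GFP_DMA)
                 return SKETCH_ZONE_DMA;      /* caller needs DMA-capable memory */
         if (gfp_mask & SKETCH_GFP_HIGHMEM)
                 return SKETCH_ZONE_HIGHMEM;  /* caller can use high memory */
         return SKETCH_ZONE_NORMAL;           /* default */
 }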
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
 struct page * fastcall __alloc_pages(gfp_t gfp_mask, unsigned int order,
                 struct zonelist *zonelist)
 {
         const gfp_t wait = gfp_mask & __GFP_WAIT;   /* whether the caller may block (sleep) while waiting for free page frames */
         struct zone **z;                     /* cursor over zonelist->zones, an array of struct zone pointers, hence the double pointer */
         struct page *page;                   /* page descriptor of the allocated block */
         struct reclaim_state reclaim_state;  /* per-task bookkeeping for direct page reclaim */
 /*
  * current->reclaim_state points to one of these when a task is running
  * memory reclaim
  */


         struct task_struct *p = current;     /* the current process */
         int do_retry;                        /* whether to retry the allocation */
         int alloc_flags;                     /* ALLOC_* flags for this attempt */
         int did_some_progress;               /* nonzero if direct reclaim freed anything */
 
         might_sleep_if(wait);        /* debug annotation: this function may sleep when wait is set */
 
         if (should_fail_alloc_page(gfp_mask, order))         /* fault-injection hook: if it decides this allocation should fail, return NULL at once */
                 return NULL;
 
 restart:
         z = zonelist->zones;  /* the list of zones suitable for gfp_mask; z starts at the first, most-preferred zone */
 
         if (unlikely(*z == NULL)) {            /* unlikely() is a branch-prediction hint; if even the first zone pointer is NULL there is nowhere to allocate from */
                 /* Should this ever happen?? */
                 return NULL;
         }                                
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
                                 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);  /* first attempt: take 2^order pages straight from the free lists, checking the pages_low watermark and the cpuset */
 /* prototype for reference:
        get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, int alloc_flags) */
         if (page)
                 goto got_pg;  /* fast path succeeded; otherwise fall through to the slow path */
 
         /*
          * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
          * __GFP_NOWARN set) should not cause reclaim since the subsystem
          * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
          * using a larger set of nodes after it has established that the
          * allowed per node queues are empty and that nodes are
          * over allocated.
          *
          *   #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
          */
         if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)  /* on a NUMA build, a GFP_THISNODE request must not trigger reclaim here, so give up */
                 goto nopage;
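 /* Note on the test above: (gfp_mask & GFP_THISNODE) == GFP_THISNODE is a
  * masked-equality check. It is true only when all three component bits
  * (__GFP_THISNODE, __GFP_NOWARN, __GFP_NORETRY) are set in gfp_mask;
  * a plain (gfp_mask & GFP_THISNODE) would already be true if any single
  * one of them were set. */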
 
         for (z = zonelist->zones; *z; z++)       
                 wakeup_kswapd(*z, order);/* the free lists are low: wake kswapd on every zone in the list for background reclaim */
 /* For reference, wakeup_kswapd():
  *
  * A zone is low on free memory, so wake its kswapd task to service it.
  */
  void wakeup_kswapd(struct zone *zone, int order)
  {
          pg_data_t *pgdat;
  
          if (!populated_zone(zone))  /* populated_zone() is !!(zone->present_pages); present_pages is the zone size in pages, so an empty zone is skipped */
                  return;
  
          pgdat = zone->zone_pgdat;
          if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
                  return;             /* the zone still has enough free memory: nothing to do */
          if (pgdat->kswapd_max_order < order)
                  pgdat->kswapd_max_order = order;
          if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                  return;
          if (!waitqueue_active(&pgdat->kswapd_wait))
                  return;             /* kswapd is not asleep on its wait queue */
          wake_up_interruptible(&pgdat->kswapd_wait);
  }
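The excerpt hinges on zone_watermark_ok(); a simplified sketch of the watermark idea (the real function also accounts for lowmem reserves and per-order free-area checks):

  /* simplified sketch only: would the zone keep at least 'mark' pages free
   * after handing out a 2^order block? */
  static int sketch_watermark_ok(unsigned long free_pages,
                                 unsigned int order, unsigned long mark)
  {
          unsigned long want = 1UL << order;
  
          if (free_pages < want)
                  return 0;                    /* the block does not even fit */
          return (free_pages - want) >= mark;  /* the reserve must survive the cut */
  }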
 
         /*
          * OK, we're below the kswapd watermark and have kicked background
          * reclaim. Now things get more complex, so set up alloc_flags according
          * to how we want to proceed.
          *
          * The caller may dip into page reserves a bit more if the caller
          * cannot run direct reclaim, or if the caller has realtime scheduling
          * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
          * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
          */                       
         alloc_flags = ALLOC_WMARK_MIN;        /* from here on, check against the pages_min watermark */


 /*
  *  #define ALLOC_NO_WATERMARKS     0x01    don't check watermarks at all
  *  #define ALLOC_WMARK_MIN         0x02    use pages_min watermark
  *  #define ALLOC_WMARK_LOW         0x04    use pages_low watermark
  *  #define ALLOC_WMARK_HIGH        0x08    use pages_high watermark
  *  #define ALLOC_HARDER            0x10    try to alloc harder
  *  #define ALLOC_HIGH              0x20    __GFP_HIGH set
  *  #define ALLOC_CPUSET            0x40    check for correct cpuset
  */
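 /* Worked example for the branches below: a GFP_ATOMIC request
  * (GFP_ATOMIC == __GFP_HIGH in this kernel, so __GFP_WAIT is clear and
  * wait == 0) accumulates
  *
  *      alloc_flags  = ALLOC_WMARK_MIN;                  0x02
  *      !wait       -> alloc_flags |= ALLOC_HARDER;      0x10
  *      __GFP_HIGH  -> alloc_flags |= ALLOC_HIGH;        0x20
  *      !wait       -> ALLOC_CPUSET stays clear
  *
  * i.e. alloc_flags == 0x32: check against pages_min, but let the request
  * dig deeper into the reserves than an ordinary allocation. */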
         if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
                 alloc_flags |= ALLOC_HARDER;
         if (gfp_mask & __GFP_HIGH)
                 alloc_flags |= ALLOC_HIGH;
         if (wait)
                 alloc_flags |= ALLOC_CPUSET;
 
         /*
          * Go through the zonelist again. Let __GFP_HIGH and allocations
          * coming from realtime tasks go deeper into reserves.
          *
          * This is the last chance, in general, before the goto nopage.
          * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
          * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
          */
         page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);/* retry the allocation now that kswapd has been kicked and the flags relaxed */
         if (page)
                 goto got_pg;  /* success */
 
         /* This allocation should allow future memory freeing. */
 
 rebalance:
         if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
                         && !in_interrupt()) {
                 /* PF_MEMALLOC (0x00000800) marks a task that is itself allocating on
                  * behalf of reclaim; TIF_MEMDIE (16) marks a task being killed by the
                  * OOM killer. For reference:
                  *
                  *   #define test_thread_flag(flag) \
                  *           test_ti_thread_flag(current_thread_info(), flag)
                  *
                  *   static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
                  *   {
                  *           return test_bit(flag, &ti->flags);
                  *   }
                  */
                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 nofail_alloc:
                         /* go through the zonelist yet again, ignoring mins */
                         page = get_page_from_freelist(gfp_mask, order,
                                 zonelist, ALLOC_NO_WATERMARKS);
                         if (page)
                                 goto got_pg;
                         if (gfp_mask & __GFP_NOFAIL) {
                                 congestion_wait(WRITE, HZ/50);
                                 goto nofail_alloc;
                         }
                 }
                 goto nopage;
         }
 
         /* Atomic allocations - we can't balance anything */
         if (!wait)      /* atomic allocation: we may not sleep to reclaim memory, so give up */
                 goto nopage;   
 
         cond_resched();
 
         /* We now go into synchronous (direct) reclaim */
         cpuset_memory_pressure_bump();
         p->flags |= PF_MEMALLOC;             /* mark the task as allocating for reclaim, so nested allocations do not recurse into reclaim */
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
 
         did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); /* direct reclaim: scan the zones and free what we can */
 
         p->reclaim_state = NULL;
         p->flags &= ~PF_MEMALLOC;
 
         cond_resched();
 
         if (likely(did_some_progress)) {       /* reclaim freed something: try the free lists again */
                 page = get_page_from_freelist(gfp_mask, order,
                                                 zonelist, alloc_flags);
                 if (page)
                         goto got_pg;
         } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { /* reclaim made no progress. If __GFP_FS is clear the kernel may not perform filesystem-dependent operations (so the OOM killer is off limits), and __GFP_NORETRY limits us to a single attempt; only when filesystem work and retries are both allowed do we take the OOM path below. */
                 /*
                  * Go through the zonelist yet one more time, keep
                  * very high watermark here, this is only to catch
                  * a parallel oom killing, we must fail if we're still
                  * under heavy pressure.
                  */
                 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
                                 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
                 if (page)
                         goto got_pg;
 
                 /* The OOM killer will not help higher order allocs so fail */
                 if (order > PAGE_ALLOC_COSTLY_ORDER)
                         goto nopage;
 /*
  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
  * costly to service. That is between allocation orders which should
  * coalesce naturally under reasonable reclaim pressure and those which
  * will not.
  *
  *   #define PAGE_ALLOC_COSTLY_ORDER 3
  */
                out_of_memory(zonelist, gfp_mask, order); /* pick a victim task and kill it to free memory */
                goto restart;                             /* then restart the whole allocation from the top */
        }


        /*
         * Don't let big-order allocations loop unless the caller explicitly
         * requests that.  Wait for some write requests to complete then retry.
         *
         * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
         * <= 3, but that may not be true in other implementations.
         */
        do_retry = 0;
        if (!(gfp_mask & __GFP_NORETRY)) {
                if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
                                                (gfp_mask & __GFP_REPEAT))
                        do_retry = 1;
                if (gfp_mask & __GFP_NOFAIL)
                        do_retry = 1;
        }
        if (do_retry) {
                congestion_wait(WRITE, HZ/50);
                goto rebalance;
        }
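        /* Summary of the retry policy implemented above (as this kernel defines it):
         *   __GFP_NORETRY  -- never loop; fail after the single reclaim pass
         *   __GFP_REPEAT   -- keep retrying; for order <= PAGE_ALLOC_COSTLY_ORDER
         *                     this implementation treats it like __GFP_NOFAIL
         *   __GFP_NOFAIL   -- retry indefinitely; the caller cannot handle failure */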


nopage:
        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
                printk(KERN_WARNING "%s: page allocation failure."
                        " order:%d, mode:0x%x\n",
                        p->comm, order, gfp_mask);
                dump_stack();
/* For reference, the architecture-independent dump_stack():
 *
 *   void dump_stack(void)
 *   {
 *           unsigned long stack;
 *           show_trace(current, NULL, &stack);
 *   }
 *
 *   void show_trace(struct task_struct *task, struct pt_regs *regs,
 *                   unsigned long *stack)
 *   {
 *           show_trace_log_lvl(task, regs, stack, "");
 *   }
 */
                show_mem();/* the allocation has failed: dump a summary of system memory usage */
        }
got_pg:
        return page;
}


EXPORT_SYMBOL(__alloc_pages);           
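To recap the control flow traced above, __alloc_pages() escalates through a fixed ladder of attempts:

1. Fast path: get_page_from_freelist() against the pages_low watermark.
2. Wake kswapd on every candidate zone, then retry against pages_min, with ALLOC_HARDER/ALLOC_HIGH added for atomic and realtime callers.
3. For PF_MEMALLOC or TIF_MEMDIE tasks, retry with ALLOC_NO_WATERMARKS, ignoring the reserves entirely.
4. If the caller may sleep, run direct reclaim via try_to_free_pages() and retry.
5. If reclaim made no progress, either invoke the OOM killer and restart (order <= PAGE_ALLOC_COSTLY_ORDER, __GFP_FS set) or wait in congestion_wait() and loop; otherwise fail at nopage.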