接上一篇
現在分析寫時複制COW,對于寫時複制,首先把握一點就是隻有寫操作時才有可能觸發寫時複制,是以首先總要判斷異常flag是否含有标志FAULT_FLAG_WRITE,然後判斷二級頁表條目值是否含有L_PTE_WRITE标志,這是意味着這個實體頁是否可寫,如果不可寫則說明應該進入寫時複制流程,調用處理函數do_wp_page;
可見,COW的應用場合就是通路映射的頁不可寫,它包括兩種情況,第一種是fork導緻,第二種是如malloc後第一次對他進行讀操作,擷取到的是zero_pfn零頁,當再次寫時需要寫時複制,共同特點都是虛拟位址的二級頁表映射内容在記憶體中,但是對應的頁不可寫,在函數do_wp_page中對于這兩種情況的處理基本相似的;
另外一個應該知道的是,如果該頁隻有一個程序在用,那麼就直接修改這個頁可寫就行了,不要搞COW,總之,不到不得以的情況下是不會進行COW的,這也是核心對于COW使用的原則,就是盡量不使用;
函數do_wp_page源碼如下:
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte)
{
struct page *old_page, *new_page;
pte_t entry;
int reuse = 0, ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
goto reuse;
goto gotten;
}
if (PageAnon(old_page) && !PageKsm(old_page)) {
if (!trylock_page(old_page)) {
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
}
page_cache_release(old_page);
}
reuse = reuse_swap_page(old_page);
unlock_page(old_page);
}
else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
struct vm_fault vmf;
int tmp;
vmf.virtual_address = (void __user *)(address &
PAGE_MASK);
vmf.pgoff = old_page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
vmf.page = old_page;
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(old_page);
if (!old_page->mapping) {
ret = 0;
unlock_page(old_page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(old_page));
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
}
page_mkwrite = 1;
}
dirty_page = old_page;
get_page(dirty_page);
reuse = 1;
}
if (reuse) {
reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, entry);
ret |= VM_FAULT_WRITE;
goto unlock;
}
page_cache_get(old_page);
gotten:
pte_unmap_unlock(page_table, ptl);
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (is_zero_pfn(pte_pfn(orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma, address);
if (!new_page)
goto oom;
}
else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);
}
__SetPageUptodate(new_page);
if ((vma->vm_flags & VM_LOCKED) && old_page) {
lock_page(old_page);
clear_page_mlock(old_page);
unlock_page(old_page);
}
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
inc_mm_counter(mm, anon_rss);
}
} else
inc_mm_counter(mm, anon_rss);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
ptep_clear_flush(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
set_pte_at_notify(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
if (old_page) {
page_remove_rmap(old_page);
}
new_page = old_page;
ret |= VM_FAULT_WRITE;
}
else
mem_cgroup_uncharge_page(new_page);
if (new_page)
page_cache_release(new_page);
if (old_page)
page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
set_page_dirty_balance(dirty_page, page_mkwrite);
}
put_page(dirty_page);
if (page_mkwrite) {
struct address_space *mapping = dirty_page->mapping;
set_page_dirty(dirty_page);
unlock_page(dirty_page);
page_cache_release(dirty_page);
if (mapping) {
balance_dirty_pages_ratelimited(mapping);
}
}
if (vma->vm_file)
file_update_time(vma->vm_file);
}
return ret;
oom_free_new:
page_cache_release(new_page);
oom:
if (old_page) {
if (page_mkwrite) {
unlock_page(old_page);
page_cache_release(old_page);
}
page_cache_release(old_page);
}
return VM_FAULT_OOM;
unwritable_page:
page_cache_release(old_page);
return ret;
}
一級一級傳回,最終傳回到函數__do_page_fault,會根據傳回值fault累計task的相應異常類型次數(maj_flt或min_flt),并最終把fault傳回給函數do_page_fault,釋放信号量mmap_sem,正常情況下就傳回0,缺頁異常處理完畢。