天天看點

核心代碼閱讀(13) - sys_mmap

sys_mmap

/*
 * do_mmap2 - resolve the file descriptor (if any) and perform the mapping.
 *
 * For a non-anonymous request the struct file behind @fd is looked up with
 * fget(); a failed lookup yields -EBADF.  The actual work is delegated to
 * do_mmap_pgoff(), called with current->mm->mmap_sem held.  The file
 * reference taken here is dropped again before returning.
 *
 * Returns whatever do_mmap_pgoff() returned (narrowed through an int, as in
 * the original code), or -EBADF.
 */
static inline long do_mmap2(
        unsigned long addr, unsigned long len,
        unsigned long prot, unsigned long flags,
        unsigned long fd, unsigned long pgoff)
{
        struct file *mapped = NULL;
        int rc;

        /* Strip MAP_EXECUTABLE and MAP_DENYWRITE from the caller's flags. */
        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

        /* Only file-backed requests need a struct file. */
        if (!(flags & MAP_ANONYMOUS)) {
                mapped = fget(fd);
                if (mapped == NULL)
                        return -EBADF;
        }

        down(&current->mm->mmap_sem);
        rc = do_mmap_pgoff(mapped, addr, len, prot, flags, pgoff);
        up(&current->mm->mmap_sem);

        /* Release the reference obtained from fget() above. */
        if (mapped != NULL)
                fput(mapped);

        return rc;
}
1) MAP_ANONYMOUS
   這個flag表示沒有檔案映射,只是用來在指定的位址上分配記憶體。
2) file = fget(fd);
   擷取程序中的file結構。
3) do_mmap_pgoff      

do_mmap_pgoff 映射檔案

/*
 * do_mmap_pgoff - build a vm_area_struct for [addr, addr + len) and link it
 * into current->mm.
 *
 * @file:  backing file, or NULL when there is none
 * @addr:  requested start address (mandatory with MAP_FIXED, otherwise a hint)
 * @len:   length of the mapping in bytes
 * @prot:  protection bits, combined with @flags by vm_flags()
 * @flags: MAP_* flags
 * @pgoff: offset stored in vma->vm_pgoff
 *
 * Returns the start address of the new mapping on success.  On failure a
 * negative errno (-EINVAL, -ENOMEM, or the error reported by
 * deny_write_access(), file->f_op->mmap() or shmem_zero_setup()) is returned
 * through the unsigned long return type.
 */
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
        unsigned long prot, unsigned long flags, unsigned long pgoff)
{
        struct mm_struct * mm = current->mm;
        struct vm_area_struct * vma;
        /* Set once deny_write_access() succeeded; i_writecount is bumped back later. */
        int correct_wcount = 0;
        int error;
        if (flags & MAP_FIXED) {
                /* A fixed address must be page aligned. */
                if (addr & ~PAGE_MASK)
                        return -EINVAL;
        } else {
                /* Let the kernel pick an address; 0 means no room was found. */
                addr = get_unmapped_area(addr, len);
                if (!addr)
                        return -ENOMEM;
        }
        /* Allocate the descriptor first; every later failure frees it via free_vma. */
        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!vma)
                return -ENOMEM;
        vma->vm_mm = mm;
        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;
        /* The low four bits of vm_flags select the page protection. */
        vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
        vma->vm_ops = NULL;
        vma->vm_pgoff = pgoff;
        vma->vm_file = NULL;
        vma->vm_private_data = NULL;
        /* Default error for the checks below. */
        error = -ENOMEM;
        /* Remove whatever is currently mapped in the target range. */
        if (do_munmap(mm, addr, len))
                goto free_vma;
        /* Refuse to grow past the address-space resource limit. */
        if ((mm->total_vm << PAGE_SHIFT) + len
            > current->rlim[RLIMIT_AS].rlim_cur)
                goto free_vma;
        /* Writable, non-shared mappings without MAP_NORESERVE must pass vm_enough_memory(). */
        if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
            !(flags & MAP_NORESERVE)                                 &&
            !vm_enough_memory(len >> PAGE_SHIFT))
                goto free_vma;
        if (file) {
                if (vma->vm_flags & VM_DENYWRITE) {
                        error = deny_write_access(file);
                        if (error)
                                goto free_vma;
                        correct_wcount = 1;
                }
                vma->vm_file = file;
                /* Paired with the fput() on the unmap_and_free_vma path. */
                get_file(file);
                /* Hand the vma to the file's own mmap method. */
                error = file->f_op->mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;
        } else if (flags & MAP_SHARED) {
                /* Shared mapping with no file: set it up through shmem_zero_setup(). */
                error = shmem_zero_setup(vma);
                if (error)
                        goto free_vma;
        }
        /*
         * Re-read flags and start address from the vma rather than trusting
         * the locals -- presumably because the mmap method above was handed
         * the vma and may have modified it (TODO confirm).
         */
        flags = vma->vm_flags;
        addr = vma->vm_start;
        insert_vm_struct(mm, vma);
        if (correct_wcount)
                atomic_inc(&file->f_dentry->d_inode->i_writecount);
        
        /* Account the new pages in the mm. */
        mm->total_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED) {
                /* Locked mappings are faulted in immediately. */
                mm->locked_vm += len >> PAGE_SHIFT;
                make_pages_present(addr, addr + len);
        }
        return addr;
    unmap_and_free_vma:
        /* Reached only when file->f_op->mmap() failed: undo the file setup. */
        if (correct_wcount)
                atomic_inc(&file->f_dentry->d_inode->i_writecount);
        vma->vm_file = NULL;
        fput(file);
        /* Tear down anything already mapped in the vma's range. */
        flush_cache_range(mm, vma->vm_start, vma->vm_end);
        zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
        flush_tlb_range(mm, vma->vm_start, vma->vm_end);
    free_vma:
        kmem_cache_free(vm_area_cachep, vma);
        return error;
    }
1) if (flags & MAP_FIXED)
   MAP_FIXED: 表示映射檔案到程序空間的起始位址必須是addr,如果滿足不了則傳回錯誤。
2) if (addr & ~PAGE_MASK)
   addr必須要page對齊。
3) addr = get_unmapped_area(addr, len);
   如果MAP_FIXED沒有設定,則從程序的位址空間中分配一個空閒的addr。
4) vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
   分配一個vma,每次mmap都會增加一個vma。相比之下,brk在新增區間和已有的vma相鄰而且屬性相同時,會直接合并到已有的vma中。
5) vma->vm_pgoff = pgoff;
   記錄檔案的偏移量到vma中,以供缺頁中斷時讀取檔案。
6) if (do_munmap(mm, addr, len))
   解除addr到addr+len範圍内已經存在的映射。這種情況發生在指定了MAP_FIXED,并且該範圍和之前的映射重疊的時候。
7) goto free_vma;
   TODO
   核心中有很多地方都是先分配資源,然後進一步檢查條件,如果檢查失敗,則釋放資源。
   之所以采用這種看似浪費的操作,是因為分配資源可能會導緻程序切換。如果先檢查條件、再分配資源,一旦在分配資源的過程中發生了程序切換,先前檢查通過的條件可能已經不成立了。
8) vma->vm_file = file;
   設定file
9) error = file->f_op->mmap(file, vma);
   file->f_op->mmap 和具體的檔案系統相關,ext2中對應的是 generic_file_mmap。
10) insert_vm_struct(mm, vma);
    把新的vma插入到目前程序的mm中。
11) if (flags & VM_LOCKED)
    如果設定了VM_LOCKED标記,表示把檔案的内容鎖在記憶體中,此時調用 make_pages_present,把檔案讀進記憶體。      

get_unmapped_area 在使用者程序中分配虛拟位址區間

unsigned long get_unmapped_area(unsigned long addr, unsigned long len)
    {
        struct vm_area_struct * vmm;
        if (len > TASK_SIZE)
                return 0;
        if (!addr)
                addr = TASK_UNMAPPED_BASE;
        addr = PAGE_ALIGN(addr);
        for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
                if (TASK_SIZE - len < addr)
                        return 0;
                if (!vmm || addr + len <= vmm->vm_start)
                        return addr;
                addr = vmm->vm_end;
        }
    }      
1) addr = TASK_UNMAPPED_BASE;
   如果addr為0,則從TASK_SIZE/3=1G的位置開始往上找。也就是說,mmap是從1G開始的。
2) for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next)
   從第一個滿足 vm_end 大于 addr 的vma開始找(即 find_vma 的傳回值)。
3) if (!vmm || addr + len <= vmm->vm_start)
   如果已經沒有後續的vma(vmm為NULL),或者 addr+len <= vmm->vm_start,就找到了一個足夠大的空洞。

generic_file_mmap ext2的mmap

mmap的定義
/*
 * File operation table for ext2 files.  Note the mmap slot: it points at
 * generic_file_mmap, so mapping an ext2 file ends up in that function.
 */
struct file_operations ext2_file_operations = {
        llseek:                ext2_file_lseek,
        read:                generic_file_read,
        write:                generic_file_write,
        ioctl:                ext2_ioctl,
        mmap:                generic_file_mmap,
        open:                ext2_open_file,
        release:        ext2_release_file,
        fsync:                ext2_sync_file,
   };
generic_file_mmap
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
    {
        struct vm_operations_struct * ops;
        struct inode *inode = file->f_dentry->d_inode;
        ops = &file_private_mmap;
        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
                if (!inode->i_mapping->a_ops->writepage)
                        return -EINVAL;
                ops = &file_shared_mmap;
        }
        if (!inode->i_sb || !S_ISREG(inode->i_mode))
                return -EACCES;
        if (!inode->i_mapping->a_ops->readpage)
                return -ENOEXEC;
        UPDATE_ATIME(inode);
        vma->vm_ops = ops;
        return 0;
    }      
1) vma->vm_ops = ops;
   設定vm_ops,這是缺頁的回調函數。
2) ops = &file_private_mmap;
   /* vm_operations used for private file mappings: only nopage is set, to filemap_nopage. */
   static struct vm_operations_struct file_private_mmap = {
       nopage:                filemap_nopage,
   };
3) if (!inode->i_mapping->a_ops->writepage)
   檢查 address_space_operations。
4) address_space
   TODO
   /*
    * Per-object page bookkeeping: three page lists (clean, dirty, locked),
    * the total page count, the operation table used to read and write pages,
    * the owning inode, and the lists of vmas that map the object.
    */
   struct address_space {
    struct list_head        clean_pages;        /* list of clean pages */
    struct list_head        dirty_pages;        /* list of dirty pages */
    struct list_head        locked_pages;        /* list of locked pages */
    unsigned long                nrpages;        /* number of total pages */
    struct address_space_operations *a_ops;        /* methods */
    struct inode                *host;                /* owner: inode, block_device */
    struct vm_area_struct        *i_mmap;        /* list of private mappings */
    struct vm_area_struct        *i_mmap_shared; /* list of shared mappings */
    spinlock_t                i_shared_lock;  /* and spinlock protecting it */
  };
5) address_space_operations
   /*
    * Address-space operation table for ext2.  readpage and writepage are the
    * two slots that generic_file_mmap checks before accepting a mapping.
    */
   struct address_space_operations ext2_aops = {
    readpage: ext2_readpage,
    writepage: ext2_writepage,
    sync_page: block_sync_page,
    prepare_write: ext2_prepare_write,
    commit_write: generic_commit_write,
    bmap: ext2_bmap
   };

make_pages_present 主動觸發缺頁

/*
 * make_pages_present - fault in every page of [addr, end) ahead of time.
 *
 * Looks up the vma for @addr in current->mm and calls handle_mm_fault() once
 * per PAGE_SIZE step, requesting write access when the vma has VM_WRITE set.
 *
 * Returns 0 on success, or -1 when no vma is found for @addr or when
 * handle_mm_fault() returns a negative value.  Calling this with
 * @addr >= @end is a caller bug and triggers BUG().
 */
int make_pages_present(unsigned long addr, unsigned long end)
{
        int write;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct * vma;

        vma = find_vma(mm, addr);
        /*
         * find_vma() yields NULL when no vma ends above addr.  Fail like
         * any other fault error instead of dereferencing a NULL pointer.
         */
        if (!vma)
                return -1;
        write = (vma->vm_flags & VM_WRITE) != 0;
        if (addr >= end)
                BUG();
        do {
                if (handle_mm_fault(mm, vma, addr, write) < 0)
                        return -1;
                addr += PAGE_SIZE;
        } while (addr < end);
        return 0;
}
1) 每隔一個 PAGE_SIZE 調用一次 handle_mm_fault。      
handle_mm_fault -> handle_pte_fault -> do_no_page 主動觸發缺頁
/*
 * do_no_page - obtain a page for @address and install it in *@page_table.
 *
 * When the vma has no nopage operation the work is passed on to
 * do_anonymous_page().  Otherwise the page comes from vma->vm_ops->nopage().
 *
 * Returns the result of do_anonymous_page() in the first case; otherwise
 * 0 when nopage() returned NULL, -1 when it returned NOPAGE_OOM, and 2
 * once the page table entry has been set.
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
        unsigned long address, int write_access, pte_t *page_table)
{
        struct page * new_page;
        pte_t entry;
        if (!vma->vm_ops || !vma->vm_ops->nopage)
                return do_anonymous_page(mm, vma, page_table, write_access, address);
        /*
         * Third argument: 0 for shared mappings, otherwise write_access.
         * filemap_nopage() receives it as "no_share" and returns a private
         * copy of the page when it is non-zero.
         */
        new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
        if (new_page == NULL)        /* no page was available -- SIGBUS */
                return 0;
        if (new_page == NOPAGE_OOM)
                return -1;
        ++mm->rss;
        flush_page_to_ram(new_page);
        flush_icache_page(vma, new_page);
        /* Start from the vma's default protection. */
        entry = mk_pte(new_page, vma->vm_page_prot);
        if (write_access) {
                /* Write fault: make the entry writable and dirty right away. */
                entry = pte_mkwrite(pte_mkdirty(entry));
        } else if (page_count(new_page) > 1 &&
                   !(vma->vm_flags & VM_SHARED))
                /* Read fault on a non-shared mapping whose page has other users: map it read-only. */
                entry = pte_wrprotect(entry);
        set_pte(page_table, entry);
        update_mmu_cache(vma, address, entry);
        return 2;        /* Major fault */
    }
1) new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
   調用mmap的缺頁回調。      

vma->vm_ops->nopage

vm_ops在 generic_file_mmap中已經設定好了。
   ext2檔案系統的設定:      
ops = &file_private_mmap;
       
       /* nopage is the only operation set here; it points at filemap_nopage. */
       static struct vm_operations_struct file_private_mmap = {
           nopage:                filemap_nopage,
       };
是以 nopage最終會進入 filemap_nopage。      

# filemap_nopage 缺頁處理

/*
 * filemap_nopage - nopage handler for file mappings.
 *
 * @area:     the vma the faulting address belongs to
 * @address:  faulting address
 * @no_share: when non-zero, return a freshly allocated copy of the page
 *            instead of the page-cache page itself
 *
 * Looks the page up in the page cache, reading it in when it is missing or
 * not up to date.
 *
 * Returns the page to map, NULL on failure (offset beyond the file for the
 * current mm, or a read that did not produce an up-to-date page), or
 * NOPAGE_OOM when an allocation failed.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
        unsigned long address, int no_share)
{
        int error;
        struct file *file = area->vm_file;
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page *page, **hash, *old_page;
        unsigned long size, pgoff;
        /* Page index in the file: page offset inside the vma plus the vma's own offset. */
        pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
    retry_all:
        /* File size in pages, rounded up. */
        size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        /* Beyond the end of the file is an error only when the vma belongs to the current mm. */
        if ((pgoff >= size) && (area->vm_mm == current->mm))
                return NULL;
        hash = page_hash(mapping, pgoff);
    retry_find:
        /* Is the page already in the page cache? */
        page = __find_get_page(mapping, pgoff, hash);
        if (!page)
                goto no_cached_page;
        /* Found, but its contents may not be valid yet. */
        if (!Page_Uptodate(page))
                goto page_not_uptodate;
    success:
        /* Read ahead when the vma carries the sequential-read hint. */
        if (VM_SequentialReadHint(area))
                nopage_sequential_readahead(area, pgoff, size);
        old_page = page;
        if (no_share) {
                /* Caller wants a private page: copy the cached page into a new one. */
                struct page *new_page = page_cache_alloc();
                if (new_page) {
                        copy_user_highpage(new_page, old_page, address);
                        flush_page_to_ram(new_page);
                } else
                        new_page = NOPAGE_OOM;
                /* The cached page itself is not returned, so release it. */
                page_cache_release(page);
                return new_page;
        }
        flush_page_to_ram(old_page);
        return old_page;
    no_cached_page:
        /*
         * Inside the file and no random-read hint: read a cluster of pages.
         * Otherwise read just the one page.
         */
        if ((pgoff < size) && !VM_RandomReadHint(area))
                error = read_cluster_nonblocking(file, pgoff, size);
        else
                error = page_cache_read(file, pgoff);
        /* The read was started; look the page up again. */
        if (error >= 0)
                goto retry_find;
                
        if (error == -ENOMEM)
                return NOPAGE_OOM;
        return NULL;
    page_not_uptodate:
        lock_page(page);
        /* The page lost its mapping while we waited for the lock: start over. */
        if (!page->mapping) {
                UnlockPage(page);
                page_cache_release(page);
                goto retry_all;
        }
        /* It became up to date in the meantime. */
        if (Page_Uptodate(page)) {
                UnlockPage(page);
                goto success;
        }
        /* First read attempt; a zero return means the read was started, so wait for it. */
        if (!mapping->a_ops->readpage(file, page)) {
                wait_on_page(page);
                if (Page_Uptodate(page))
                        goto success;
        }
        /* Still not up to date: repeat the same checks and retry the read once. */
        lock_page(page);
        if (!page->mapping) {
                UnlockPage(page);
                page_cache_release(page);
                goto retry_all;
        }
        if (Page_Uptodate(page)) {
                UnlockPage(page);
                goto success;
        }
        /* Clear the error flag before the second attempt. */
        ClearPageError(page);
        if (!mapping->a_ops->readpage(file, page)) {
                wait_on_page(page);
                if (Page_Uptodate(page))
                        goto success;
        }
        /* Second attempt failed too: drop our reference and give up. */
        page_cache_release(page);
        return NULL;
    }
1) hash = page_hash(mapping, pgoff);
   page = __find_get_page(mapping, pgoff, hash);
   首先在全局的page_hash_table裡嘗試搜尋pgoff的頁面。
2) if (!Page_Uptodate(page))
   如果找到了,檢查頁面的内容是否是最新的。
3) error = read_cluster_nonblocking(file, pgoff, size);
   如果頁面不在hash裡面,則分配新的實體頁,并從裝置上讀入。把實體頁加入相應的隊列中。
   這個函數會向前預讀一些頁面。      

## page_cache_read 從檔案讀内容到一個頁面

static inline int page_cache_read(struct file * file, unsigned long offset) 
    {
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page **hash = page_hash(mapping, offset);
        struct page *page; 
        spin_lock(&pagecache_lock);
        page = __find_page_nolock(mapping, offset, *hash); 
        spin_unlock(&pagecache_lock);
        if (page)
                return 0;
        page = page_cache_alloc();
        if (!page)
                return -ENOMEM;
        if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
                int error = mapping->a_ops->readpage(file, page);
                page_cache_release(page);
                return error;
        }
        page_cache_free(page);
        return 0;
    }      
1) page = __find_page_nolock(mapping, offset, *hash);
   再次到hash表中看看是否已經有别人把這個頁面讀進來了。
2) if (!add_to_page_cache_unique(page, mapping, offset, hash))
   添加頁面到3個鏈表中。
3) int error = mapping->a_ops->readpage(file, page);
   調用相應檔案系統的接口讀入内容。      

### add_to_page_cache_unique

static int add_to_page_cache_unique(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)
    {
        int err;
        struct page *alias;
        spin_lock(&pagecache_lock);
        alias = __find_page_nolock(mapping, offset, *hash);
        err = 1;
        if (!alias) {
                __add_to_page_cache(page,mapping,offset,hash);
                err = 0;
        }
        spin_unlock(&pagecache_lock);
        return err;
    }
    
    /*
     * __add_to_page_cache - initialise @page and link it into the page cache.
     *
     * The page must not be locked on entry (BUG() otherwise).  Its uptodate,
     * error, dirty, referenced and arch_1 flag bits are cleared, the locked
     * bit is set, a reference is taken, and the page is put on the mapping's
     * queue, the hash queue and the LRU.
     */
    static inline void __add_to_page_cache(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)
    {
        /* Flag bits that must not survive from the page's previous use. */
        const int stale_bits = (1 << PG_uptodate) | (1 << PG_error) |
                               (1 << PG_dirty) | (1 << PG_referenced) |
                               (1 << PG_arch_1);

        if (PageLocked(page))
                BUG();

        page->flags = (page->flags & ~stale_bits) | (1 << PG_locked);
        page_cache_get(page);
        page->index = offset;

        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, hash);
        lru_cache_add(page);
    }
1) add_page_to_inode_queue(mapping, page);
   添加實體頁到 i_mapping的clean_pages中。
2) add_page_to_hash_queue(page, hash);
   添加實體頁到hash表中。
3) lru_cache_add(page);
   添加實體頁到 active_list中      

繼續閱讀