sys_mmap
static inline long do_mmap2(
unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
int error = -EBADF;
struct file * file = NULL;
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
if (!(flags & MAP_ANONYMOUS)) {
file = fget(fd);
if (!file)
goto out;
}
down(¤t->mm->mmap_sem);
error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
up(¤t->mm->mmap_sem);
if (file)
fput(file);
out:
return error;
}
1) MAP_ANONYMOUS
這個flag表示沒有檔案映射,只是用來在指定的位址上配置記憶體。
2) file = fget(fd);
根據fd取得目前程序中對應的file結構。
3) do_mmap_pgoff
do_mmap_pgoff 映射檔案
/*
 * do_mmap_pgoff - create a mapping of [addr, addr + len) in current->mm.
 *
 * @file:  backing file, or NULL for a mapping without a file.
 * @addr:  requested start; must be page aligned when MAP_FIXED is set,
 *         otherwise it is replaced by the result of get_unmapped_area().
 * @len:   length of the mapping in bytes.
 * @prot, @flags: combined into the vma flags through vm_flags().
 * @pgoff: stored in vma->vm_pgoff.
 *
 * Returns the start address of the new mapping, or a negative errno value
 * (converted to unsigned long) on failure.
 */
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma;
/* Set once deny_write_access() has succeeded for this file. */
int correct_wcount = 0;
int error;
/* Pick the start address: caller-supplied (aligned) or a free range. */
if (flags & MAP_FIXED) {
if (addr & ~PAGE_MASK)
return -EINVAL;
} else {
addr = get_unmapped_area(addr, len);
if (!addr)
return -ENOMEM;
}
/* Allocate and fill in the descriptor for the new area. */
vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!vma)
return -ENOMEM;
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;
/* The low four flag bits index the protection table. */
vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
vma->vm_ops = NULL;
vma->vm_pgoff = pgoff;
vma->vm_file = NULL;
vma->vm_private_data = NULL;
/* Every failure up to the file handling below reports -ENOMEM. */
error = -ENOMEM;
/* Remove whatever is currently mapped in the target range. */
if (do_munmap(mm, addr, len))
goto free_vma;
/* Refuse to grow the address space beyond RLIMIT_AS. */
if ((mm->total_vm << PAGE_SHIFT) + len
> current->rlim[RLIMIT_AS].rlim_cur)
goto free_vma;
/* Writable, non-shared mapping without MAP_NORESERVE: check memory. */
if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
!(flags & MAP_NORESERVE) &&
!vm_enough_memory(len >> PAGE_SHIFT))
goto free_vma;
if (file) {
if (vma->vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
correct_wcount = 1;
}
/* The vma holds its own reference on the file from here on. */
vma->vm_file = file;
get_file(file);
/* Let the file's mmap method finish setting up the vma. */
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
} else if (flags & MAP_SHARED) {
/* Shared mapping without a file. */
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
/*
 * Re-read flags and start address from the vma.
 * NOTE(review): presumably because the mmap method may have changed
 * them -- confirm against the individual mmap implementations.
 */
flags = vma->vm_flags;
addr = vma->vm_start;
insert_vm_struct(mm, vma);
/*
 * NOTE(review): presumably balances a write-count adjustment made by
 * deny_write_access() and/or insert_vm_struct(); neither is visible here.
 */
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
/* Account the new pages; locked areas are faulted in immediately. */
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
return addr;
/* Error path after the file was attached to the vma. */
unmap_and_free_vma:
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
vma->vm_file = NULL;
/* Drop the reference taken by get_file() above. */
fput(file);
/* Tear down anything that was set up for the range. */
flush_cache_range(mm, vma->vm_start, vma->vm_end);
zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
flush_tlb_range(mm, vma->vm_start, vma->vm_end);
free_vma:
kmem_cache_free(vm_area_cachep, vma);
return error;
}
1) if (flags & MAP_FIXED)
MAP_FIXED: 表示映射檔案到程序空間的起始位址必須是addr,如果滿足不了則傳回錯誤。
2) if (addr & ~PAGE_MASK)
addr必須要page對齊。
3) addr = get_unmapped_area(addr, len);
如果MAP_FIXED沒有設定,則從程序的位址空間中配置一個addr。
4) vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
配置一個vma,每次mmap都會增加一個vma。而brk在發現新增的vma和已有的vma相鄰且屬性相同時,會進行合併。
5) vma->vm_pgoff = pgoff;
記錄檔案的偏移量到vma中,以供缺頁中斷時讀取檔案。
6) if (do_munmap(mm, addr, len))
解除 [addr, addr+len) 範圍內已有的映射。只有在指定了MAP_FIXED、并且該範圍和之前的映射重疊時才會真正解除;沒有指定MAP_FIXED時,addr來自get_unmapped_area,本身就是一段空洞。
7) goto free_vma;
TODO
核心中有很多地方都是先配置資源,然後進一步檢查條件,如果檢查失敗,則釋放資源。
之所以采用這種看似浪費的操作,是因為配置資源可能導致程序切換。如果先檢查條件、檢查成功後再配置資源,而配置資源的過程中發生了程序切換,就可能導致先前檢查的條件已經不成立了。
8) vma->vm_file = file;
設定file
9) error = file->f_op->mmap(file, vma);
file->f_op->mmap 和具體的檔案系統相關,ext2中對應的是 generic_file_mmap。
10) insert_vm_struct(mm, vma);
把新的vma插入到目前程序的mm中。
11) if (flags & VM_LOCKED)
如果設定了VM_LOCKED标記,表示把檔案的内容鎖在記憶體中,此時調用 make_pages_present,把檔案讀進記憶體。
get_unmapped_area 在使用者程序的位址空間中配置虛擬位址區間
unsigned long get_unmapped_area(unsigned long addr, unsigned long len)
{
struct vm_area_struct * vmm;
if (len > TASK_SIZE)
return 0;
if (!addr)
addr = TASK_UNMAPPED_BASE;
addr = PAGE_ALIGN(addr);
for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
if (TASK_SIZE - len < addr)
return 0;
if (!vmm || addr + len <= vmm->vm_start)
return addr;
addr = vmm->vm_end;
}
}
1) addr = TASK_UNMAPPED_BASE;
如果addr為0,則從TASK_SIZE/3=1G的位置開始往上找。也就是說,mmap是從1G開始的。
2) for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next)
從第一個vm_end大於addr的vma開始找(find_vma傳回的就是這個vma,它的vm_start可能小於等於addr)。
3) if (!vmm || addr + len <= vmm->vm_start)
如果vmm為空(後面已經沒有vma了),或者目前addr+len <= vmm->vm_start,表示找到了一個足夠大的空洞。
generic_file_mmap ext2的mmap
mmap的定義
/*
 * ext2's struct file_operations table: each slot names the handler used
 * for that operation.  mmap is routed to generic_file_mmap.
 */
struct file_operations ext2_file_operations = {
	.llseek		= ext2_file_lseek,
	.read		= generic_file_read,
	.write		= generic_file_write,
	.ioctl		= ext2_ioctl,
	.mmap		= generic_file_mmap,
	.open		= ext2_open_file,
	.release	= ext2_release_file,
	.fsync		= ext2_sync_file,
};
generic_file_mmap
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
struct vm_operations_struct * ops;
struct inode *inode = file->f_dentry->d_inode;
ops = &file_private_mmap;
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
if (!inode->i_mapping->a_ops->writepage)
return -EINVAL;
ops = &file_shared_mmap;
}
if (!inode->i_sb || !S_ISREG(inode->i_mode))
return -EACCES;
if (!inode->i_mapping->a_ops->readpage)
return -ENOEXEC;
UPDATE_ATIME(inode);
vma->vm_ops = ops;
return 0;
}
1) vma->vm_ops = ops;
設定vm_ops,這是缺頁的回調函數。
2) ops = &file_private_mmap;
/* vm_operations for private file mappings: only the nopage hook is set. */
static struct vm_operations_struct file_private_mmap = {
	.nopage	= filemap_nopage,
};
3) if (!inode->i_mapping->a_ops->writepage)
檢查 address_space_operations。
4) address_space
TODO
/*
 * Page-cache bookkeeping for one owner object: the pages grouped by state,
 * the operations table used to read and write them, the owner itself, and
 * the vmas that map it (private and shared lists, guarded by a spinlock).
 */
struct address_space {
struct list_head clean_pages; /* list of clean pages */
struct list_head dirty_pages; /* list of dirty pages */
struct list_head locked_pages; /* list of locked pages */
unsigned long nrpages; /* number of total pages */
struct address_space_operations *a_ops; /* methods */
struct inode *host; /* owner: inode, block_device */
struct vm_area_struct *i_mmap; /* list of private mappings */
struct vm_area_struct *i_mmap_shared; /* list of shared mappings */
spinlock_t i_shared_lock; /* and spinlock protecting it */
};
5) address_space_operations
/*
 * ext2's address_space_operations table: the page-level methods reached
 * through inode->i_mapping->a_ops.
 */
struct address_space_operations ext2_aops = {
	.readpage	= ext2_readpage,
	.writepage	= ext2_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= ext2_prepare_write,
	.commit_write	= generic_commit_write,
	.bmap		= ext2_bmap
};
make_pages_present 主動觸發缺頁
/*
 * make_pages_present - fault in every page of [addr, end).
 *
 * Looks up the vma for @addr once and calls handle_mm_fault() for each
 * PAGE_SIZE step, requesting write access when the vma has VM_WRITE set.
 * Calling it with @addr >= @end triggers BUG().
 *
 * Returns 0 on success, -1 as soon as handle_mm_fault() reports an error.
 */
int make_pages_present(unsigned long addr, unsigned long end)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *area = find_vma(mm, addr);
	int want_write = (area->vm_flags & VM_WRITE) ? 1 : 0;

	if (end <= addr)
		BUG();

	for (;;) {
		if (handle_mm_fault(mm, area, addr, want_write) < 0)
			return -1;

		addr += PAGE_SIZE;
		if (addr >= end)
			break;
	}

	return 0;
}
1) 每隔一個 PAGE_SIZE 調用一次 handle_mm_fault。
handle_mm_fault -> handle_pte_fault -> do_no_page 主動觸發缺頁
/*
 * do_no_page - fill in a missing pte for @address.
 *
 * Without a nopage method on the vma the fault is handed to
 * do_anonymous_page().  Otherwise the method is asked for the page; for
 * non-shared vmas its last argument is @write_access, for shared vmas 0.
 *
 * Returns 0 when no page was available (SIGBUS), -1 when the method
 * returned NOPAGE_OOM, and 2 (major fault) once the pte is installed.
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	struct page *pg;
	pte_t new_pte;
	int no_share;

	if (vma->vm_ops == NULL || vma->vm_ops->nopage == NULL)
		return do_anonymous_page(mm, vma, page_table, write_access, address);

	if (vma->vm_flags & VM_SHARED)
		no_share = 0;
	else
		no_share = write_access;

	pg = vma->vm_ops->nopage(vma, address & PAGE_MASK, no_share);
	if (!pg)
		return 0;	/* no page was available -- SIGBUS */
	if (pg == NOPAGE_OOM)
		return -1;

	mm->rss++;
	flush_page_to_ram(pg);
	flush_icache_page(vma, pg);

	new_pte = mk_pte(pg, vma->vm_page_prot);
	if (write_access) {
		/* Write fault: the pte starts out dirty and writable. */
		new_pte = pte_mkwrite(pte_mkdirty(new_pte));
	} else {
		/* Read fault on a page with other users in a non-shared vma. */
		if (page_count(pg) > 1 && !(vma->vm_flags & VM_SHARED))
			new_pte = pte_wrprotect(new_pte);
	}

	set_pte(page_table, new_pte);
	update_mmu_cache(vma, address, new_pte);
	return 2;	/* Major fault */
}
1) new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
調用mmap的缺頁回調。
vma->vm_ops->nopage
vm_ops在 generic_file_mmap中已經設定好了。
ext2檔案系統的設定:
ops = &file_private_mmap;
/* vm_operations for private file mappings: only the nopage hook is set. */
static struct vm_operations_struct file_private_mmap = {
	.nopage	= filemap_nopage,
};
所以 nopage最終會進入 filemap_nopage。
# filemap_nopage 缺頁處理
/*
 * filemap_nopage - nopage handler for file-backed vmas.
 *
 * @area:     vma in which the fault happened.
 * @address:  faulting address.
 * @no_share: when non-zero, hand back a freshly allocated copy of the
 *            page instead of the page-cache page itself.
 *
 * Looks up the page that backs @address in the page cache, reading it in
 * when it is missing or not up to date.
 *
 * Returns the page, NOPAGE_OOM when an allocation failed, or NULL when
 * the offset lies beyond the end of the file or the page could not be
 * read.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
unsigned long address, int no_share)
{
int error;
struct file *file = area->vm_file;
struct inode *inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
struct page *page, **hash, *old_page;
unsigned long size, pgoff;
/* Page index within the file that corresponds to the faulting address. */
pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
retry_all:
/* File size in pages, rounded up. */
size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
/* Fault past the end of the file from the mm that owns the vma. */
if ((pgoff >= size) && (area->vm_mm == current->mm))
return NULL;
hash = page_hash(mapping, pgoff);
retry_find:
/* NOTE(review): presumably returns the page with a reference held. */
page = __find_get_page(mapping, pgoff, hash);
if (!page)
goto no_cached_page;
if (!Page_Uptodate(page))
goto page_not_uptodate;
success:
/* The page is in the cache and up to date. */
if (VM_SequentialReadHint(area))
nopage_sequential_readahead(area, pgoff, size);
old_page = page;
if (no_share) {
/* Caller wants its own copy: duplicate, then drop the cache page. */
struct page *new_page = page_cache_alloc();
if (new_page) {
copy_user_highpage(new_page, old_page, address);
flush_page_to_ram(new_page);
} else
new_page = NOPAGE_OOM;
page_cache_release(page);
return new_page;
}
flush_page_to_ram(old_page);
return old_page;
no_cached_page:
/*
 * Not in the cache: start a read (a whole cluster unless the vma carries
 * the random-read hint or the offset is past the end), then look again.
 */
if ((pgoff < size) && !VM_RandomReadHint(area))
error = read_cluster_nonblocking(file, pgoff, size);
else
error = page_cache_read(file, pgoff);
if (error >= 0)
goto retry_find;
if (error == -ENOMEM)
return NOPAGE_OOM;
return NULL;
page_not_uptodate:
lock_page(page);
/* The page no longer belongs to a mapping: drop it and start over. */
if (!page->mapping) {
UnlockPage(page);
page_cache_release(page);
goto retry_all;
}
/* It became up to date while we waited for the lock. */
if (Page_Uptodate(page)) {
UnlockPage(page);
goto success;
}
/* First read attempt; a zero return means the read was started. */
if (!mapping->a_ops->readpage(file, page)) {
wait_on_page(page);
if (Page_Uptodate(page))
goto success;
}
/* First attempt failed: repeat the same checks and try once more. */
lock_page(page);
if (!page->mapping) {
UnlockPage(page);
page_cache_release(page);
goto retry_all;
}
if (Page_Uptodate(page)) {
UnlockPage(page);
goto success;
}
/* Clear the error flag before the second read attempt. */
ClearPageError(page);
if (!mapping->a_ops->readpage(file, page)) {
wait_on_page(page);
if (Page_Uptodate(page))
goto success;
}
/* Both attempts failed: give the page back and report failure. */
page_cache_release(page);
return NULL;
}
1) hash = page_hash(mapping, pgoff);
page = __find_get_page(mapping, pgoff, hash);
首先在全局的page_hash_table裡嘗試搜尋pgoff的頁面。
2) if (!Page_Uptodate(page))
如果找到了,檢查頁面的内容是否是最新的。
3) error = read_cluster_nonblocking(file, pgoff, size);
如果頁面不在hash裡面,則配置新的實體頁,并從裝置上讀入,同時把實體頁加入相應的隊列中。
這個函數會向前預讀一些頁面。
## page_cache_read 從檔案讀内容到一個頁面
static inline int page_cache_read(struct file * file, unsigned long offset)
{
struct inode *inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
struct page **hash = page_hash(mapping, offset);
struct page *page;
spin_lock(&pagecache_lock);
page = __find_page_nolock(mapping, offset, *hash);
spin_unlock(&pagecache_lock);
if (page)
return 0;
page = page_cache_alloc();
if (!page)
return -ENOMEM;
if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
int error = mapping->a_ops->readpage(file, page);
page_cache_release(page);
return error;
}
page_cache_free(page);
return 0;
}
1) page = __find_page_nolock(mapping, offset, *hash);
再次到hash表中看看是否已經有别人把這個頁面讀進來了。
2) if (!add_to_page_cache_unique(page, mapping, offset, hash))
添加頁面到3個隊列中:inode(address_space)的頁面隊列、hash隊列和LRU(見下面的 __add_to_page_cache)。
3) int error = mapping->a_ops->readpage(file, page);
調用相應檔案系統的接口讀入内容。
### add_to_page_cache_unique
static int add_to_page_cache_unique(struct page * page,
struct address_space *mapping, unsigned long offset,
struct page **hash)
{
int err;
struct page *alias;
spin_lock(&pagecache_lock);
alias = __find_page_nolock(mapping, offset, *hash);
err = 1;
if (!alias) {
__add_to_page_cache(page,mapping,offset,hash);
err = 0;
}
spin_unlock(&pagecache_lock);
return err;
}
/*
 * __add_to_page_cache - link @page into the page cache at @offset.
 *
 * Clears the uptodate, error, dirty, referenced and arch_1 flag bits,
 * sets the locked bit, takes a reference on the page and adds it to the
 * mapping's inode queue, the hash queue and the LRU.  A page that is
 * already locked triggers BUG().
 */
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	/* State bits that must not survive from the page's previous use. */
	const int stale_bits = (1 << PG_uptodate) | (1 << PG_error) |
			       (1 << PG_dirty) | (1 << PG_referenced) |
			       (1 << PG_arch_1);

	if (PageLocked(page))
		BUG();

	page->flags = (page->flags & ~stale_bits) | (1 << PG_locked);
	page_cache_get(page);
	page->index = offset;

	add_page_to_inode_queue(mapping, page);
	add_page_to_hash_queue(page, hash);
	lru_cache_add(page);
}
1) add_page_to_inode_queue(mapping, page);
添加實體頁到 i_mapping的clean_pages中。
2) add_page_to_hash_queue(page, hash);
添加實體頁到hash表中。
3) lru_cache_add(page);
添加實體頁到 active_list中