Linux缓存机制之页缓存

Linux运用一个功能广泛的缓冲和缓存框架来提高系统的速度。缓冲和缓存利用一部分系统物理内存，确保最重要、最常使用的块设备数据在操作时可直接从主内存获取，而无需从低速设备读取。物理内存还用于存储从块设备读取的数据，使得随后对该数据的访问可直接在物理内存进行，而无需从外部设备再次取用。修改过的数据也不会立即写回设备：内核会综合考虑系统中的多种因素后延迟写回，这在总体上改进了系统的性能。前面分析的部分，例如内存管理的slab缓存，是一个内存到内存的缓存，其目的不是加速对低速设备的操作，而是对现有资源进行更简单、更高效的使用。文件系统的Dentry缓存也用于减少对低速块设备的访问，但它无法推广到通用场合，因为它是专门用于处理单一数据类型的。

内核为块设备提供了两种通用的缓存方案：

1）页缓存，针对以页为单位的所有操作，并考虑了特定体系结构上的页长度。一个主要的例子是内存映射技术。因为其他类型的文件访问也是基于内核中的这一技术实现的，所以页缓存实际上负责了块设备的大部分缓存工作。

2）块缓存，以块为操作单位。在进行I/O操作时，存取的单位是设备的各个块，而不是整个内存页。尽管页长度对所有文件系统都是相同的，但块长度取决于特定的文件系统或其设置。因而，块缓存必须能够处理不同长度的块。

目前用于块传输的标准数据结构已经演变为struct bio。用这种方式进行块传输更为高效，因为它可以合并同一请求中后续的块，加速处理的进行。在许多场合下，页缓存和块缓存是联合使用的。例如，一个缓存的页在写操作期间可以划分为不同的缓冲区，这样可以在更细的粒度下识别出页被修改的部分。好处在于，在将数据写回时，只需要回写被修改的部分，无需将整个页面传输回底层的块设备。

页面缓存结构

[cpp] view plain copy print ?

/*
 * struct address_space - links a set of cached pages to the object they
 * belong to and to the backing store their data comes from.
 *
 * The attribute on the closing line aligns the structure to sizeof(long).
 */
struct address_space {
/* inode this address space is attached to (presumably its owner -- confirm) */
struct inode *host;
/* radix tree of cached pages; add_to_page_cache_locked() inserts into it by page offset */
struct radix_tree_root page_tree;
/* taken with spin_lock_irq() around the page_tree insertion and the nrpages update */
spinlock_t tree_lock;
/* NOTE(review): the i_mmap* members and truncate_count look like memory-mapping bookkeeping -- confirm against mm/ */
unsigned int i_mmap_writable;
struct prio_tree_root i_mmap;
struct list_head i_mmap_nonlinear;
spinlock_t i_mmap_lock;
unsigned int truncate_count;
/* incremented by add_to_page_cache_locked() for every page it inserts successfully */
unsigned long nrpages;
/* NOTE(review): presumably the page index at which writeback resumes -- confirm */
pgoff_t writeback_index;
/* table of operations used to act on this address space's pages */
const struct address_space_operations *a_ops;
unsigned long flags;
/* describes the backing store that supplies the data for the cached pages */
struct backing_dev_info *backing_dev_info;
/* NOTE(review): private_lock presumably guards private_list / assoc_mapping -- confirm */
spinlock_t private_lock;
struct list_head private_list;
struct address_space *assoc_mapping;
} __attribute__((aligned(sizeof(long))));

后备存储信息

[cpp] view plain copy print ?

/*
 * struct backing_dev_info - description of the backing store behind an
 * address space (pointed to by address_space.backing_dev_info).
 */
struct backing_dev_info {
/* list linkage; NOTE(review): presumably a global list of all bdi objects -- confirm */
struct list_head bdi_list;
struct rcu_head rcu_head;
/* NOTE(review): presumably the read-ahead window, counted in pages -- confirm */
unsigned long ra_pages;
unsigned long state;
unsigned int capabilities;
/* optional callback plus the opaque data pointer kept alongside it */
congested_fn *congested_fn;
void *congested_data;
/* callback taking this bdi and a page, plus its opaque data pointer */
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
char *name;
/* one per-CPU counter for each of the NR_BDI_STAT_ITEMS statistics */
struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
struct prop_local_percpu completions;
int dirty_exceeded;
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
/* NOTE(review): the wb* members and work_list look like writeback bookkeeping -- confirm against mm/backing-dev.c */
struct bdi_writeback wb;
spinlock_t wb_lock;
struct list_head wb_list;
unsigned long wb_mask;
unsigned int wb_cnt;
struct list_head work_list;
struct device *dev;
/* the two debugfs dentries exist only when CONFIG_DEBUG_FS is defined */
#ifdef CONFIG_DEBUG_FS
struct dentry *debug_dir;
struct dentry *debug_stats;
#endif
};

下图为地址空间与内核其他部分的关联。

Linux缓存机制之页缓存

内核采用一种通用的地址空间方案，来建立缓存数据与其来源之间的关联。

1）内存中的页分配到每个地址空间。这些页的内容可以由用户进程或内核本身使用各式各样的方法操作。这些数据表示了缓存中的内容；

2）后备存储器struct backing_dev_info指定了填充地址空间中页的数据的来源。地址空间关联到处理器的虚拟地址空间，是由处理器在虚拟内存中管理的一个区域到设备device上对应位置之间的一个映射。

如果访问了虚拟内存中的某个位置，该位置没有关联到物理内存页，内核可根据地址空间结构来找到读取数据的来源。

为支持数据传输，每个地址空间都提供了一组操作，以允许地址空间所涉及的双方进行交互。

地址空间是内核中最关键的数据结构之一，对该数据结构的管理，已经演变为内核面对的最关键的问题之一。页缓存的任务在于，获得一些物理内存页，以加速在块设备上按页为单位执行的操作。

内核使用了基数树来管理与一个地址空间相关的所有页，以便尽可能降低开销。对于基数树的理解在这里就不分析了，后面有空的时候再做分析。

地址空间操作

[cpp] view plain copy print ?

/*
 * struct address_space_operations - table of function pointers through which
 * the pages of an address space are operated on. An address space refers to
 * its table through address_space.a_ops.
 *
 * NOTE(review): the per-member notes below are read off the signatures only;
 * confirm exact contracts against the kernel's VFS documentation.
 */
struct address_space_operations {
/* single-page write and read entry points */
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
void (*sync_page)(struct page *);
/* whole-mapping variant of writepage, driven by a writeback_control */
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
/* multi-page read: takes a list of pages and a page count */
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
/*
 * write_begin hands a page back through *pagep and an opaque cookie through
 * *fsdata; write_end receives a page and an fsdata value again (presumably
 * the same ones -- confirm).
 */
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
int (*write_end)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
/* maps one sector_t to another within the address space */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, gfp_t);
ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*get_xip_mem)(struct address_space *, pgoff_t, int,
void **, unsigned long *);
/* takes two pages; NOTE(review): presumably destination and source -- confirm order */
int (*migratepage) (struct address_space *,
struct page *, struct page *);
int (*launder_page) (struct page *);
int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
};

页面缓存的实现基于基数树，缓存属于内核中性能要求最苛刻的部分之一，而且广泛用于内核的所有子系统，实现也比较简单。举两个例子，其他的暂时不做分析了。

分配页面用于加入地址空间

[cpp] view plain copy print ?

/*
 * page_cache_alloc - allocate a page for an address space.
 * @x: address space whose gfp mask, obtained via mapping_gfp_mask(), is
 *     handed to the allocator.
 *
 * Returns whatever __page_cache_alloc() returns for that mask.
 */
static inline struct page *page_cache_alloc(struct address_space *x)
{
	struct page *page;

	page = __page_cache_alloc(mapping_gfp_mask(x));
	return page;
}

分配完了添加到基数树中

[cpp] view plain copy print ?

/*
 * add_to_page_cache - lock a page and insert it into an address space.
 * @page:     page to insert; it is marked locked before the insertion
 * @mapping:  address space receiving the page
 * @offset:   index passed on to add_to_page_cache_locked()
 * @gfp_mask: allocation flags passed on to add_to_page_cache_locked()
 *
 * Returns 0 on success. On failure the lock bit set here is cleared again
 * and the error from add_to_page_cache_locked() is returned.
 */
static inline int add_to_page_cache(struct page *page,
		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
	int rc;

	__set_page_locked(page);
	rc = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
	if (unlikely(rc)) {
		/* insertion failed: undo the lock bit taken above */
		__clear_page_locked(page);
		return rc;
	}

	return 0;
}

[cpp] view plain copy print ?

/*
 * add_to_page_cache_locked - insert an already-locked page into the page
 * tree of an address space.
 * @page:     page to insert; VM_BUG_ON fires if it is not locked
 * @mapping:  address space receiving the page
 * @offset:   key under which the page is stored in mapping->page_tree
 * @gfp_mask: allocation flags; restricted to GFP_RECLAIM_MASK for the cgroup
 *            charge and stripped of __GFP_HIGHMEM for the radix-tree preload
 *
 * Returns 0 on success, otherwise the error reported by the cgroup charge,
 * the radix-tree preload or the radix-tree insertion. Every failure after
 * the charge uncharges the page again; a failed insertion additionally
 * resets page->mapping and drops the reference taken here.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
		pgoff_t offset, gfp_t gfp_mask)
{
	int err;

	VM_BUG_ON(!PageLocked(page));

	err = mem_cgroup_cache_charge(page, current->mm,
				      gfp_mask & GFP_RECLAIM_MASK);
	if (err)
		return err;

	err = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (err != 0) {
		/* no preload, no insertion: give the charge back */
		mem_cgroup_uncharge_cache_page(page);
		return err;
	}

	/* take a reference and publish mapping/index before the tree sees the page */
	page_cache_get(page);
	page->mapping = mapping;
	page->index = offset;

	spin_lock_irq(&mapping->tree_lock);
	err = radix_tree_insert(&mapping->page_tree, offset, page);
	if (likely(!err)) {
		/* accounting is updated while tree_lock is still held */
		mapping->nrpages++;
		__inc_zone_page_state(page, NR_FILE_PAGES);
		if (PageSwapBacked(page))
			__inc_zone_page_state(page, NR_SHMEM);
		spin_unlock_irq(&mapping->tree_lock);
	} else {
		/* roll back: detach under the lock, then uncharge and drop the ref */
		page->mapping = NULL;
		spin_unlock_irq(&mapping->tree_lock);
		mem_cgroup_uncharge_cache_page(page);
		page_cache_release(page);
	}
	radix_tree_preload_end();

	return err;
}

来源：http://blog.csdn.net/bullbat/article/details/7296988

内核为块设备提供了两种通用的缓存方案：

页面缓存结构

[cpp] view plain copy print ?

/*
 * struct address_space - links a set of cached pages to the object they
 * belong to and to the backing store their data comes from.
 *
 * The attribute on the closing line aligns the structure to sizeof(long).
 */
struct address_space {
/* inode this address space is attached to (presumably its owner -- confirm) */
struct inode *host;
/* radix tree of cached pages; add_to_page_cache_locked() inserts into it by page offset */
struct radix_tree_root page_tree;
/* taken with spin_lock_irq() around the page_tree insertion and the nrpages update */
spinlock_t tree_lock;
/* NOTE(review): the i_mmap* members and truncate_count look like memory-mapping bookkeeping -- confirm against mm/ */
unsigned int i_mmap_writable;
struct prio_tree_root i_mmap;
struct list_head i_mmap_nonlinear;
spinlock_t i_mmap_lock;
unsigned int truncate_count;
/* incremented by add_to_page_cache_locked() for every page it inserts successfully */
unsigned long nrpages;
/* NOTE(review): presumably the page index at which writeback resumes -- confirm */
pgoff_t writeback_index;
/* table of operations used to act on this address space's pages */
const struct address_space_operations *a_ops;
unsigned long flags;
/* describes the backing store that supplies the data for the cached pages */
struct backing_dev_info *backing_dev_info;
/* NOTE(review): private_lock presumably guards private_list / assoc_mapping -- confirm */
spinlock_t private_lock;
struct list_head private_list;
struct address_space *assoc_mapping;
} __attribute__((aligned(sizeof(long))));

后备存储信息

[cpp] view plain copy print ?

/*
 * struct backing_dev_info - description of the backing store behind an
 * address space (pointed to by address_space.backing_dev_info).
 */
struct backing_dev_info {
/* list linkage; NOTE(review): presumably a global list of all bdi objects -- confirm */
struct list_head bdi_list;
struct rcu_head rcu_head;
/* NOTE(review): presumably the read-ahead window, counted in pages -- confirm */
unsigned long ra_pages;
unsigned long state;
unsigned int capabilities;
/* optional callback plus the opaque data pointer kept alongside it */
congested_fn *congested_fn;
void *congested_data;
/* callback taking this bdi and a page, plus its opaque data pointer */
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
char *name;
/* one per-CPU counter for each of the NR_BDI_STAT_ITEMS statistics */
struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
struct prop_local_percpu completions;
int dirty_exceeded;
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
/* NOTE(review): the wb* members and work_list look like writeback bookkeeping -- confirm against mm/backing-dev.c */
struct bdi_writeback wb;
spinlock_t wb_lock;
struct list_head wb_list;
unsigned long wb_mask;
unsigned int wb_cnt;
struct list_head work_list;
struct device *dev;
/* the two debugfs dentries exist only when CONFIG_DEBUG_FS is defined */
#ifdef CONFIG_DEBUG_FS
struct dentry *debug_dir;
struct dentry *debug_stats;
#endif
};

下图为地址空间与内核其他部分的关联。

Linux缓存机制之页缓存

内核采用一种通用的地址空间方案，来建立缓存数据与其来源之间的关联。

1）内存中的页分配到每个地址空间。这些页的内容可以由用户进程或内核本身使用各式各样的方法操作。这些数据表示了缓存中的内容；

如果访问了虚拟内存中的某个位置，该位置没有关联到物理内存页，内核可根据地址空间结构来找到读取数据的来源。

为支持数据传输，每个地址空间都提供了一组操作，以允许地址空间所涉及的双方进行交互。

内核使用了基数树来管理与一个地址空间相关的所有页，以便尽可能降低开销。对于基数树的理解在这里就不分析了，后面有空的时候再做分析。

地址空间操作

[cpp] view plain copy print ?

/*
 * struct address_space_operations - table of function pointers through which
 * the pages of an address space are operated on. An address space refers to
 * its table through address_space.a_ops.
 *
 * NOTE(review): the per-member notes below are read off the signatures only;
 * confirm exact contracts against the kernel's VFS documentation.
 */
struct address_space_operations {
/* single-page write and read entry points */
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
void (*sync_page)(struct page *);
/* whole-mapping variant of writepage, driven by a writeback_control */
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
/* multi-page read: takes a list of pages and a page count */
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
/*
 * write_begin hands a page back through *pagep and an opaque cookie through
 * *fsdata; write_end receives a page and an fsdata value again (presumably
 * the same ones -- confirm).
 */
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
int (*write_end)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
/* maps one sector_t to another within the address space */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, gfp_t);
ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*get_xip_mem)(struct address_space *, pgoff_t, int,
void **, unsigned long *);
/* takes two pages; NOTE(review): presumably destination and source -- confirm order */
int (*migratepage) (struct address_space *,
struct page *, struct page *);
int (*launder_page) (struct page *);
int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
};

分配页面用于加入地址空间

[cpp] view plain copy print ?

/*
 * page_cache_alloc - allocate a page for an address space.
 * @x: address space whose gfp mask, obtained via mapping_gfp_mask(), is
 *     handed to the allocator.
 *
 * Returns whatever __page_cache_alloc() returns for that mask.
 */
static inline struct page *page_cache_alloc(struct address_space *x)
{
	struct page *page;

	page = __page_cache_alloc(mapping_gfp_mask(x));
	return page;
}

分配完了添加到基数树中

[cpp] view plain copy print ?

/*
 * add_to_page_cache - lock a page and insert it into an address space.
 * @page:     page to insert; it is marked locked before the insertion
 * @mapping:  address space receiving the page
 * @offset:   index passed on to add_to_page_cache_locked()
 * @gfp_mask: allocation flags passed on to add_to_page_cache_locked()
 *
 * Returns 0 on success. On failure the lock bit set here is cleared again
 * and the error from add_to_page_cache_locked() is returned.
 */
static inline int add_to_page_cache(struct page *page,
		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
	int rc;

	__set_page_locked(page);
	rc = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
	if (unlikely(rc)) {
		/* insertion failed: undo the lock bit taken above */
		__clear_page_locked(page);
		return rc;
	}

	return 0;
}

[cpp] view plain copy print ?

/*
 * add_to_page_cache_locked - insert an already-locked page into the page
 * tree of an address space.
 * @page:     page to insert; VM_BUG_ON fires if it is not locked
 * @mapping:  address space receiving the page
 * @offset:   key under which the page is stored in mapping->page_tree
 * @gfp_mask: allocation flags; restricted to GFP_RECLAIM_MASK for the cgroup
 *            charge and stripped of __GFP_HIGHMEM for the radix-tree preload
 *
 * Returns 0 on success, otherwise the error reported by the cgroup charge,
 * the radix-tree preload or the radix-tree insertion. Every failure after
 * the charge uncharges the page again; a failed insertion additionally
 * resets page->mapping and drops the reference taken here.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
		pgoff_t offset, gfp_t gfp_mask)
{
	int err;

	VM_BUG_ON(!PageLocked(page));

	err = mem_cgroup_cache_charge(page, current->mm,
				      gfp_mask & GFP_RECLAIM_MASK);
	if (err)
		return err;

	err = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (err != 0) {
		/* no preload, no insertion: give the charge back */
		mem_cgroup_uncharge_cache_page(page);
		return err;
	}

	/* take a reference and publish mapping/index before the tree sees the page */
	page_cache_get(page);
	page->mapping = mapping;
	page->index = offset;

	spin_lock_irq(&mapping->tree_lock);
	err = radix_tree_insert(&mapping->page_tree, offset, page);
	if (likely(!err)) {
		/* accounting is updated while tree_lock is still held */
		mapping->nrpages++;
		__inc_zone_page_state(page, NR_FILE_PAGES);
		if (PageSwapBacked(page))
			__inc_zone_page_state(page, NR_SHMEM);
		spin_unlock_irq(&mapping->tree_lock);
	} else {
		/* roll back: detach under the lock, then uncharge and drop the ref */
		page->mapping = NULL;
		spin_unlock_irq(&mapping->tree_lock);
		mem_cgroup_uncharge_cache_page(page);
		page_cache_release(page);
	}
	radix_tree_preload_end();

	return err;
}

来源：http://blog.csdn.net/bullbat/article/details/7296988

内核为块设备提供了两种通用的缓存方案：

页面缓存结构

[cpp] view plain copy print ?

/*
 * struct address_space - links a set of cached pages to the object they
 * belong to and to the backing store their data comes from.
 *
 * The attribute on the closing line aligns the structure to sizeof(long).
 */
struct address_space {
/* inode this address space is attached to (presumably its owner -- confirm) */
struct inode *host;
/* radix tree of cached pages; add_to_page_cache_locked() inserts into it by page offset */
struct radix_tree_root page_tree;
/* taken with spin_lock_irq() around the page_tree insertion and the nrpages update */
spinlock_t tree_lock;
/* NOTE(review): the i_mmap* members and truncate_count look like memory-mapping bookkeeping -- confirm against mm/ */
unsigned int i_mmap_writable;
struct prio_tree_root i_mmap;
struct list_head i_mmap_nonlinear;
spinlock_t i_mmap_lock;
unsigned int truncate_count;
/* incremented by add_to_page_cache_locked() for every page it inserts successfully */
unsigned long nrpages;
/* NOTE(review): presumably the page index at which writeback resumes -- confirm */
pgoff_t writeback_index;
/* table of operations used to act on this address space's pages */
const struct address_space_operations *a_ops;
unsigned long flags;
/* describes the backing store that supplies the data for the cached pages */
struct backing_dev_info *backing_dev_info;
/* NOTE(review): private_lock presumably guards private_list / assoc_mapping -- confirm */
spinlock_t private_lock;
struct list_head private_list;
struct address_space *assoc_mapping;
} __attribute__((aligned(sizeof(long))));

后备存储信息

[cpp] view plain copy print ?

/*
 * struct backing_dev_info - description of the backing store behind an
 * address space (pointed to by address_space.backing_dev_info).
 */
struct backing_dev_info {
/* list linkage; NOTE(review): presumably a global list of all bdi objects -- confirm */
struct list_head bdi_list;
struct rcu_head rcu_head;
/* NOTE(review): presumably the read-ahead window, counted in pages -- confirm */
unsigned long ra_pages;
unsigned long state;
unsigned int capabilities;
/* optional callback plus the opaque data pointer kept alongside it */
congested_fn *congested_fn;
void *congested_data;
/* callback taking this bdi and a page, plus its opaque data pointer */
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
char *name;
/* one per-CPU counter for each of the NR_BDI_STAT_ITEMS statistics */
struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
struct prop_local_percpu completions;
int dirty_exceeded;
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
/* NOTE(review): the wb* members and work_list look like writeback bookkeeping -- confirm against mm/backing-dev.c */
struct bdi_writeback wb;
spinlock_t wb_lock;
struct list_head wb_list;
unsigned long wb_mask;
unsigned int wb_cnt;
struct list_head work_list;
struct device *dev;
/* the two debugfs dentries exist only when CONFIG_DEBUG_FS is defined */
#ifdef CONFIG_DEBUG_FS
struct dentry *debug_dir;
struct dentry *debug_stats;
#endif
};

下图为地址空间与内核其他部分的关联。

Linux缓存机制之页缓存

内核采用一种通用的地址空间方案，来建立缓存数据与其来源之间的关联。

1）内存中的页分配到每个地址空间。这些页的内容可以由用户进程或内核本身使用各式各样的方法操作。这些数据表示了缓存中的内容；

如果访问了虚拟内存中的某个位置，该位置没有关联到物理内存页，内核可根据地址空间结构来找到读取数据的来源。

为支持数据传输，每个地址空间都提供了一组操作，以允许地址空间所涉及的双方进行交互。

内核使用了基数树来管理与一个地址空间相关的所有页，以便尽可能降低开销。对于基数树的理解在这里就不分析了，后面有空的时候再做分析。

地址空间操作

[cpp] view plain copy print ?

/*
 * struct address_space_operations - table of function pointers through which
 * the pages of an address space are operated on. An address space refers to
 * its table through address_space.a_ops.
 *
 * NOTE(review): the per-member notes below are read off the signatures only;
 * confirm exact contracts against the kernel's VFS documentation.
 */
struct address_space_operations {
/* single-page write and read entry points */
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
void (*sync_page)(struct page *);
/* whole-mapping variant of writepage, driven by a writeback_control */
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
/* multi-page read: takes a list of pages and a page count */
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
/*
 * write_begin hands a page back through *pagep and an opaque cookie through
 * *fsdata; write_end receives a page and an fsdata value again (presumably
 * the same ones -- confirm).
 */
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
int (*write_end)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
/* maps one sector_t to another within the address space */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, gfp_t);
ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*get_xip_mem)(struct address_space *, pgoff_t, int,
void **, unsigned long *);
/* takes two pages; NOTE(review): presumably destination and source -- confirm order */
int (*migratepage) (struct address_space *,
struct page *, struct page *);
int (*launder_page) (struct page *);
int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
};

分配页面用于加入地址空间

[cpp] view plain copy print ?

/*
 * page_cache_alloc - allocate a page for an address space.
 * @x: address space whose gfp mask, obtained via mapping_gfp_mask(), is
 *     handed to the allocator.
 *
 * Returns whatever __page_cache_alloc() returns for that mask.
 */
static inline struct page *page_cache_alloc(struct address_space *x)
{
	struct page *page;

	page = __page_cache_alloc(mapping_gfp_mask(x));
	return page;
}

分配完了添加到基数树中

[cpp] view plain copy print ?

/*
 * add_to_page_cache - lock a page and insert it into an address space.
 * @page:     page to insert; it is marked locked before the insertion
 * @mapping:  address space receiving the page
 * @offset:   index passed on to add_to_page_cache_locked()
 * @gfp_mask: allocation flags passed on to add_to_page_cache_locked()
 *
 * Returns 0 on success. On failure the lock bit set here is cleared again
 * and the error from add_to_page_cache_locked() is returned.
 */
static inline int add_to_page_cache(struct page *page,
		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
	int rc;

	__set_page_locked(page);
	rc = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
	if (unlikely(rc)) {
		/* insertion failed: undo the lock bit taken above */
		__clear_page_locked(page);
		return rc;
	}

	return 0;
}

[cpp] view plain copy print ?

/*
 * add_to_page_cache_locked - insert an already-locked page into the page
 * tree of an address space.
 * @page:     page to insert; VM_BUG_ON fires if it is not locked
 * @mapping:  address space receiving the page
 * @offset:   key under which the page is stored in mapping->page_tree
 * @gfp_mask: allocation flags; restricted to GFP_RECLAIM_MASK for the cgroup
 *            charge and stripped of __GFP_HIGHMEM for the radix-tree preload
 *
 * Returns 0 on success, otherwise the error reported by the cgroup charge,
 * the radix-tree preload or the radix-tree insertion. Every failure after
 * the charge uncharges the page again; a failed insertion additionally
 * resets page->mapping and drops the reference taken here.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
		pgoff_t offset, gfp_t gfp_mask)
{
	int err;

	VM_BUG_ON(!PageLocked(page));

	err = mem_cgroup_cache_charge(page, current->mm,
				      gfp_mask & GFP_RECLAIM_MASK);
	if (err)
		return err;

	err = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (err != 0) {
		/* no preload, no insertion: give the charge back */
		mem_cgroup_uncharge_cache_page(page);
		return err;
	}

	/* take a reference and publish mapping/index before the tree sees the page */
	page_cache_get(page);
	page->mapping = mapping;
	page->index = offset;

	spin_lock_irq(&mapping->tree_lock);
	err = radix_tree_insert(&mapping->page_tree, offset, page);
	if (likely(!err)) {
		/* accounting is updated while tree_lock is still held */
		mapping->nrpages++;
		__inc_zone_page_state(page, NR_FILE_PAGES);
		if (PageSwapBacked(page))
			__inc_zone_page_state(page, NR_SHMEM);
		spin_unlock_irq(&mapping->tree_lock);
	} else {
		/* roll back: detach under the lock, then uncharge and drop the ref */
		page->mapping = NULL;
		spin_unlock_irq(&mapping->tree_lock);
		mem_cgroup_uncharge_cache_page(page);
		page_cache_release(page);
	}
	radix_tree_preload_end();

	return err;
}

Linux缓存机制之页缓存

继续阅读

Apache（You don't have permission to access / on this server.）

debian9升级4.9.0内核到4.19.2内核过程

centOS7 配置 vsftpd 虚拟用户及权限Vsftpd配置虚拟用户及权限

linux-svn卸载与安装

vsftp虚拟多用户多权限一键部署脚本

Ubuntu14.04 LTS下安装mongodb

Nginx服务优化（1）——隐藏版本号、修改用户与组、网页缓存时间、日志切割、连接超时一、隐藏版本号二、修改用户与组三、配置Nginx网页缓存时间四、实现Nginx日志分割五、配置Nginx实现连接超时六、补充关于时间日期的命令

httpd服务的部署、启动、配置和简单优化一、部署二、启动三、配置文件

配置网页内容访问

手动安装Intel network I217-LM网卡的Linux驱动

禁止ubuntu系统弹出报错界面

Ubuntu Linux下Apache的配置文件

samba服务器的功能

【Linux】UDP广播报文接收速率问题

Linux设备模型（中）之上层容器

PowerPC平台 Linux移植三