注:本文分析基于linux-4.18.0-193.14.2.el8_2核心版本,即CentOS 8.2
1 buffer_head
當程序直接讀寫塊裝置時,比如超級塊和索引節點,就需要把塊資料放入記憶體,我們上一篇講page cache是将檔案資料放入記憶體,是以不适用,這時就需要用到塊緩沖區。每個塊緩沖區都對應一個buffer_head類型的緩沖區首部描述符。資料依然存放在page頁面中,隻不過由buffer_head管理,這種情況下page頁面被稱為緩沖區頁。
2 struct buffer_head主要成員變量
/*
 * Abridged view of struct buffer_head (main members only): the descriptor
 * that maps one block on a block device to its in-memory buffer inside a page.
 */
struct buffer_head {
unsigned long b_state; // state flags of this buffer
struct buffer_head *b_this_page;// next buffer_head belonging to the same page
struct page *b_page; // the page this buffer lives in
sector_t b_blocknr; // logical block number, relative to the start of the block device
size_t b_size; /* size of mapping */
char *b_data; // where this buffer's data sits inside the page
struct block_device *b_bdev; // the block device this buffer belongs to
bh_end_io_t *b_end_io; /* I/O completion */
void *b_private; /* reserved for b_end_io */
struct list_head b_assoc_buffers; /* associated with another mapping */
struct address_space *b_assoc_map; /* mapping this buffer is associated with */
atomic_t b_count; /* users using this buffer_head */
};
可見,buffer_head描述的是磁盤block和記憶體buffer之間的映射關系。
3 建立buffer_head
緩沖區頁的使用主要有以下兩種場景:
- 直接讀取塊裝置(super_block或inode)
- 讀寫的檔案頁在磁盤中不相鄰,或者存在檔案洞
3.1 直接讀取塊裝置
我們先來看下第一種情況,我們以ext4檔案系統的mkdir來大概梳理下流程,
ext4_mkdir ->
ext4_new_inode_start_handle ->
__ext4_new_inode -> ------------------------------------------- //為新目錄配置設定inode索引
new_inode ->
ext4_read_inode_bitmap -> --------------------------------- //擷取磁盤的inode位圖
sb_getblk -> ------------------------------------------ //擷取該塊對應的buffer_head(這一步本身不發起磁盤IO)
__getblk_gfp ->
__find_get_block ->
lookup_bh_lru ----------------------------- //在每CPU變量bh_lrus中查找BH
__find_get_block_slow -> ------------------ //bh_lrus沒找到就要到對應的page cache中查找頁面
find_get_page_flags ->
pagecache_get_page -> ------------- //查找page cache
find_get_entry ---------------- //根據bdev->bd_inode->i_mapping位址空間在page cache基樹中查找頁面
page_buffers ------------------ //找到page cache看是否有對應的buffer_head,沒有則傳回NULL
bh_lru_install ---------------------------- //如果有找到,把找到的bh放入每CPU bh_lrus中,提高通路速度
__getblk_slow -> ------------------------------ //每CPU變量bh_lrus和page cache中都沒有找到目标BH,就需要為該塊建立緩沖區頁和buffer_head(這一步隻建立緩沖區,尚未從塊裝置讀取資料)
grow_buffers ->
grow_dev_page ->
find_or_create_page ->
pagecache_get_page -> --------- //根據bdev->bd_inode->i_mapping位址空間在page cache基樹中查找頁面
find_get_entry
__page_cache_alloc -------- //沒找到page cache,建立新頁面
add_to_page_cache_lru ----- //并加入page cache基樹,以及LRU連結清單
alloc_page_buffers -> ------------- //找到或建立的頁面沒有buffer_head,建立新的buffer_head
alloc_buffer_head ->
kmem_cache_zalloc(bh_cachep //在slab中配置設定空閑buffer_head對象
set_bh_page ------------------- //設定buffer的資料指向位址
link_dev_buffers -> --------------- //将page對應的所有buffer_head連成一個環形連結清單
attach_page_buffers ----------- //将buffer關聯到對應的page上,将page->private指向buffer_head
配置設定buffer_head的操作在alloc_page_buffers,前提是page cache沒有對應的buffer_head。
- 配置設定以頁大小為總量,每個塊大小取決于blocksize大小
- 配置設定後将各個buffer_head用b_this_page串成連結清單
- 通過set_bh_page設定各個buffer_head的b_page和b_data:b_page指向所在頁面,b_data等于頁面的虛拟位址加上該buffer在頁内的偏移;對于highmem頁面,b_data隻儲存頁内偏移。
/*
 * Allocate the buffer_heads that cover one page: one buffer_head per
 * `size` bytes, chained together through b_this_page.
 * Returns the head of that chain.
 * This is an excerpt; elided parts are marked "...".
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry)
{
...
head = NULL;
offset = PAGE_SIZE;
// Walk backwards through the page in `size`-byte steps, allocating one
// buffer_head per step. With a 4k page, a 4k block size yields a single
// buffer_head and a 1k block size yields four.
while ((offset -= size) >= 0) {
// allocate one buffer_head structure
bh = alloc_buffer_head(gfp);
if (!bh)
goto no_grow;
// push the new buffer_head onto the front of the chain via b_this_page
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
bh->b_size = size;
set_bh_page(bh, page, offset);// record the page and the data address for this offset
}
...
return head; // head of the chain, i.e. the most recently allocated buffer_head
...
}
/*
 * Bind a buffer_head to its page and record where its data starts.
 *
 * bh:     buffer_head being set up
 * page:   page that holds the buffer's data
 * offset: byte offset of the buffer inside the page (must be < PAGE_SIZE)
 */
void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset)
{
	bh->b_page = page;
	BUG_ON(offset >= PAGE_SIZE);

	/* Ordinary page: data address is the page's address plus the offset. */
	if (!PageHighMem(page)) {
		bh->b_data = page_address(page) + offset;
		return;
	}

	/* High-memory page: only the in-page offset is stored (base 0). */
	bh->b_data = (char *)(0 + offset);
}
alloc_page_buffers傳回上層函數後,還需要link_dev_buffers進一步處理,
- 将buffer_head進一步頭尾相連,變成環形連結清單
- 将buffer關聯到對應的page上,page->private指向buffer_head頭部
/*
 * Close the NULL-terminated b_this_page chain starting at `head` into a
 * ring, then attach that ring to `page`.
 */
static inline void link_dev_buffers(struct page *page, struct buffer_head *head)
{
	struct buffer_head *last = head;

	/* Find the final buffer_head of the chain (the one whose next is NULL). */
	while (last->b_this_page)
		last = last->b_this_page;

	/* Point the last element back at the first to form the ring. */
	last->b_this_page = head;

	/* Associate the buffers with the page. */
	attach_page_buffers(page, head);
}
/*
 * Attach a buffer_head chain to a page: call get_page() on the page, set
 * its private flag, and store `head` in the page's private field.
 */
static inline void attach_page_buffers(struct page *page, struct buffer_head *head)
{
get_page(page);
SetPagePrivate(page); // set PG_private: the page now carries filesystem-private data, i.e. its buffers
set_page_private(page, (unsigned long)head); // page->private now holds the first buffer_head
}
3.2 讀的檔案頁在磁盤中不相鄰
在上篇文章——頁緩存page cache和位址空間address_space中,我們知道如果page cache沒有緩存,會調用readpage去磁盤讀取資料,對于ext4調用的就是ext4_readpage,ext4_readpage其實是對ext4_mpage_readpages的簡單封裝,
/*
 * Excerpt of ext4_mpage_readpages (elided parts are marked "...").
 * For each page it looks up the blocks backing the page with
 * ext4_map_blocks; when the page already has buffer_heads, or its blocks
 * are not adjacent on disk, it jumps to `confused` and reads the page
 * through block_read_full_page.
 */
int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages, bool is_readahead)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
struct inode *inode = mapping->host;
// log2 of the block size, taken from inode->i_blkbits. This is the filesystem
// block size, not the disk sector size: 12 for 4 KiB blocks, 10 for 1 KiB blocks.
const unsigned blkbits = inode->i_blkbits;
// number of blocks per page: with a 4 KiB page, 1 for 4 KiB blocks and 4 for 1 KiB blocks
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits; // block size in bytes (filesystem block size, see blkbits)
...
for (; nr_pages; nr_pages--) {
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
prefetchw(&page->flags);
// the page already has buffer_heads attached: take the block-at-a-time path
if (page_has_buffers(page))
goto confused;
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);// index of this page's first block within the file
last_block = block_in_file + nr_pages * blocks_per_page; // exclusive end of the block range this request could cover
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; // file size in blocks, rounded up
if (last_block > last_block_in_file)
last_block = last_block_in_file; // never go past the end of the file
page_block = 0;
...
// use ext4_map_blocks to look up all the disk blocks this page needs
while (page_block < blocks_per_page) {
if (block_in_file < last_block) {
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file; // number of blocks to map
// look up the mapping for these blocks
if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
...
}
}
...
// If the newly mapped block does not directly follow the previous block
// recorded for this page, the page is not contiguous on disk, so fall back
// to reading it one block at a time. When the block size equals the page
// size there is one block per page, page_block is 0 here, and the check
// never fires.
if (page_block && blocks[page_block-1] != map.m_pblk-1)
goto confused;
for (relative_block = 0; ; relative_block++) {
if (relative_block == map.m_len) {
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
} else if (page_block == blocks_per_page)
break;
blocks[page_block] = map.m_pblk+relative_block;
page_block++;
block_in_file++;
}
}
...
confused:
...
if (!PageUptodate(page))
// read the page one block at a time through buffer_heads
block_read_full_page(page, ext4_get_block);
...
}
...
return 0;
}
然後block_read_full_page就會調用create_empty_buffers建立buffer_head,和上面直接讀取塊裝置一樣,最後讀取磁盤的檔案資料。
3.3 寫操作
對于ext4檔案系統,寫操作都會經過buffer_head,
SYSCALL_DEFINE3(write //write系統調用入口
ksys_write
vfs_write
__vfs_write
new_sync_write
call_write_iter
file->f_op->write_iter
ext4_file_write_iter
__generic_file_write_iter
generic_perform_write
a_ops->write_begin
ext4_write_begin
grab_cache_page_write_begin ->
pagecache_get_page ->
find_get_entry ---------------- //查找page cache
__page_cache_alloc ------------ //沒找到page cache,則配置設定一個page對象
add_to_page_cache_lru --------- //将頁面加入page cache基樹中,同時也加入LRU連結清單(新配置設定的頁面一般先進入inactive連結清單)
__block_write_begin
__block_write_begin_int
create_page_buffers
create_empty_buffers ------ //page沒有對應的buffer,建立新的buffer
alloc_page_buffers
alloc_buffer_head
kmem_cache_zalloc(bh_cachep //從slab緩存中配置設定空閑buffer_head結構
set_bh_page ------- //設定buffer_head資料指向位址
attach_page_buffers
iov_iter_copy_from_user_atomic ---------------- //将資料從使用者空間拷貝到核心空間,也就是page cache上
a_ops->write_end
ext4_write_end
block_write_end
__block_commit_write
mark_buffer_dirty
ext4_update_inode_size ---------------- //更新檔案對應的inode資訊
ext4_mark_inode_dirty ----------------- //标記inode為髒,寫入了資料,需要同步到磁盤
4 删除buffer_head
因為buffer_head和page關聯,是以在回收page的時候會同時回收buffer_head,同樣在上篇文章——頁緩存page cache和位址空間address_space中我們提到drop_caches時,針對ext4檔案系統會調用ext4_releasepage對資源進行釋放,
ext4_releasepage ->
jbd2_journal_try_to_free_buffers ->
try_to_free_buffers ->
drop_buffers ->
__clear_page_buffers //清空page->private的指向
free_buffer_head ->
kmem_cache_free(bh_cachep //将buffer_head釋放回slab緩存
5 結構關系
以寫/home/test.c檔案為例,
- 應用程式打開test.c檔案,并通過fd檔案描述符獲得file結構體
- file結構體中f_inode指向檔案對應的inode結構
- file結構體中f_path的dentry指向檔案對應的dentry對象
- dentry對象中的d_inode也指向該檔案對應的inode結構
- file結構體的f_mapping指向inode的i_mapping
- inode的i_mapping則指向inode内嵌的address_space結構體
- 并且address_space結構體也有一個變量——host,指向對應的inode結構
- address_space結構體的i_pages指向page cache基樹
- page cache基樹的結點中slot指針數組指向page對象
- 此時count=2,即說明test.c檔案在記憶體中有兩個高速緩存頁,page位址儲存在slot指針數組中
- page結構體中的mapping指向該檔案的address_space,也就是file結構體的f_mapping所指向的同一個address_space
- page結構體中的private指向buffer_head的首部
- buffer_head的數量取決于檔案系統的塊大小(blocksize),即page size/blocksize:例如page為4k、blocksize為1k時有4個buffer_head,blocksize為4k時隻有1個
- 所有buffer_head通過b_this_page連接配接,并形成一個環形連結清單
- buffer_head結構體中b_page指向buffer所在的page結構體,而b_data才是指向資料在頁面中位置的虛拟位址