
Tracking down an IO hang caused by bcache writes

  Recently, io hang problems started showing up on production machines running bcache; roughly 10% of the machines in the bcache canary rollout were affected. One of the logs looks like this:

<3>[ ...] INFO: task qierouterproxy:22202 blocked for more than 1200 seconds.
<3>[ ...] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] [aml_sdhc_data_thread] SDHC_ESTA=0x0
<4>[ ...] 
<6>[ ...] qierouterproxy D c09f286c 0 22202 22005 0x00000001
<4>[ ...] [<c09f286c>] (__schedule+0x6a4/0x7e8) from [<c09f2cf0>] (io_schedule+0xa0/0xf8)
<4>[ ...] [<c09f2cf0>] (io_schedule+0xa0/0xf8) from [<c00d5878>] (sleep_on_page+0x8/0x10)
<4>[ ...] [<c00d5878>] (sleep_on_page+0x8/0x10) from [<c09f0c7c>] (__wait_on_bit+0x54/0xbc)
<4>[ ...] [<c09f0c7c>] (__wait_on_bit+0x54/0xbc) from [<c00d56a4>] (wait_on_page_bit+0xbc/0xc4)
<4>[ ...] [<c00d56a4>] (wait_on_page_bit+0xbc/0xc4) from [<c00d5770>] (filemap_fdatawait_range+0x78/0x118)
<4>[ ...] [<c00d5770>] (filemap_fdatawait_range+0x78/0x118) from [<c00d6d44>] (filemap_write_and_wait_range+0x54/0x78)
<4>[ ...] [<c00d6d44>] (filemap_write_and_wait_range+0x54/0x78) from [<c01b45d0>] (ext4_sync_file+0xb8/0x388)
<4>[ ...] [<c01b45d0>] (ext4_sync_file+0xb8/0x388) from [<c014b2cc>] (vfs_fsync+0x3c/0x4c)
<4>[ ...] [<c014b2cc>] (vfs_fsync+0x3c/0x4c) from [<c014b45c>] (do_fsync+0x28/0x50)
<4>[ ...] [<c014b45c>] (do_fsync+0x28/0x50) from [<c000e040>] (ret_fast_syscall+0x0/0x30)
           

  From the log this is a classic D-state timeout. We had already raised the hung-task timeout from the kernel default of 120s to 1200s, and the warning still fired, so something was clearly wrong. The stack shows that the qierouterproxy process issued an fsync() system call and went to sleep in D state on a page's wait queue, waiting for a wakeup that never came.

(1) Reproducing the problem

  The stack above is the scene captured in production. Later experiments showed that writing small files in a directory on the bcache device was fine, but writing a large file and then calling sync or fsync reproduced exactly the same problem. Bisecting the write size quickly narrowed it down: writing 32 pages, i.e. 128 KB, in one go triggers the issue; anything smaller than 128 KB is fine, while 128 KB or more hangs the system.

  The reproduction code is as follows:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/* Reproduction program (referred to as test_32 below):
 * write 32 pages (128 KB) in one go, then fsync. */
#define LEN (4096 * 32)
#define NUM 1

int main(int argc, char *argv[])
{
    int fd = -1;
    char *buf = NULL;
    int ret = -1;
    int i = 0;

    buf = malloc(LEN);
    if (!buf) {
        printf("buf alloc failed!\n");
        return -1;
    }

    /* O_CREAT needs an explicit mode */
    fd = open("test", O_RDWR | O_CREAT, 0644);
    if (-1 == fd)
        printf("open err\n");
    else
        printf("open success!\n");

    for (i = 0; i < NUM; i++) {
        memset(buf, 'b', LEN);
        ret = write(fd, buf, LEN);
        if (ret < 0)
            printf("write failed!\n");
        else
            printf("write success, wrote %d bytes\n", ret);
        fsync(fd);    /* never returns on the affected setup */
    }
    /*
    ret = read(fd, buf, 5);
    if (ret < 0) {
        printf("read failed\n");
    } else {
        printf("actually read %d bytes.\n", ret);
        printf("file content: [%s].\n", buf);
    }
    */
    close(fd);
    free(buf);

    return 0;
}
           

(2) IO path analysis

       The production environment is fairly involved. The user's HDD is usually formatted with NTFS, so we create a loop device on top of NTFS and build a linearly mapped LVM device on the loop device. On machines with both an SSD and an HDD, bcache is used to accelerate the HDD: the LVM device is the backing device, the SSD is the cache device, and finally an ext4 filesystem is created on the bcache device. Putting it together, the write path is roughly: ext4 filesystem -> bcache device -> LVM device -> loop device -> NTFS -> HDD. Quite a long journey.

  Here we mainly look at the ext4 and bcache layers. From top to bottom the write flow is roughly as follows.

  The ext4 write flow consists of two halves: the VFS write and the bdi writeback.

  First half, the VFS write path:

SyS_write
   vfs_write
      do_sync_write
         ext4_file_write       
            generic_file_aio_write
              __generic_file_aio_write
                   generic_file_buffered_write
                      generic_perform_write
           

  The function to focus on is generic_perform_write; its source is:

static ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	/*
	 * Copies from kernel address space cannot fail (NFSD is a big user).
	 */
	if (segment_eq(get_fs(), KERNEL_DS))
		flags |= AOP_FLAG_UNINTERRUPTIBLE;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		offset = (pos & (PAGE_CACHE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		pagefault_disable();
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		pagefault_enable();
		flush_dcache_page(page);
		mark_page_accessed(page);
		blk_associate_page(page);
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		balance_dirty_pages_ratelimited(mapping);
		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}
	} while (iov_iter_count(i));

	return written ? written : status;
}
           

            This is the core function of the first half of the write path. Its job is to copy data from the user buffer into the file's page cache, and it does the following:

         (1) a_ops->write_begin, which on ext4 is ext4_da_write_begin. It checks whether the page already exists in the file's page cache; if not, it allocates one, inserts it into the inode's address_space radix tree, and adds it to the system's inactive file LRU list. For each page to be written it then calls ext4_da_get_block_prep to check whether the physical block backing the file's logical block has been allocated. If not, then because ext4 uses delayed allocation no physical block is allocated at this point: only the corresponding quota is reserved for the file, and the delayed range is recorded in the inode's extent status tree. A delayed range is nevertheless treated as mapped (see the map_bh call in ext4_da_get_block_prep); a simplified sketch of this decision is given after this list.

        (2) iov_iter_copy_from_user_atomic copies the data from the user buffer into the corresponding page of the page cache.

        (3) a_ops->write_end, which on ext4 is ext4_da_write_end. For an appending write, i_size_write updates the file's i_size, and mark_inode_dirty queues the ext4 on-disk inode metadata for writeback.
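
To make step (1) concrete, here is a heavily simplified, hypothetical sketch of that delayed-allocation decision. lookup_mapped_block() and reserve_delayed_range() are invented placeholders, not real kernel functions, and the real ext4_da_get_block_prep handles many more cases; the point is only that the buffer is reserved and flagged delay/mapped without allocating a physical block.

/* Hypothetical sketch only: lookup_mapped_block() and
 * reserve_delayed_range() are invented placeholder helpers. */
static int da_get_block_prep_sketch(struct inode *inode, sector_t lblk,
				    struct buffer_head *bh)
{
	sector_t pblk;

	if (lookup_mapped_block(inode, lblk, &pblk)) {
		map_bh(bh, inode->i_sb, pblk);	/* block already allocated */
		return 0;
	}

	/* Delayed allocation: reserve quota and record the range in the
	 * inode's extent status tree, but allocate no physical block yet.
	 * The buffer is still flagged mapped/new/delay so the write can
	 * proceed; real allocation happens later, at writeback time. */
	reserve_delayed_range(inode, lblk, 1);
	map_bh(bh, inode->i_sb, 0);		/* placeholder block number */
	set_buffer_new(bh);
	set_buffer_delay(bh);
	return 0;
}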

  Second half: the bdi writeback:

kthread
   worker_thread
      process_one_work
          bdi_writeback_workfn
             wb_do_writeback
                __writeback_inodes_wb
                      writeback_sb_inodes
                          __writeback_single_inode
                              do_writepages
                                   ext4_da_writepages
           

  Here the focus is on the implementation of ext4_da_writepages; the source is:

static int ext4_da_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	pgoff_t	index;
	int range_whole = 0;
	handle_t *handle = NULL;
	struct mpage_da_data mpd;
	struct inode *inode = mapping->host;
	int pages_written = 0;
	unsigned int max_pages;
	int range_cyclic, cycled = 1, io_done = 0;
	int needed_blocks, ret = 0;
	long desired_nr_to_write, nr_to_writebump = 0;
	loff_t range_start = wbc->range_start;
	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
	pgoff_t done_index = 0;
	pgoff_t end;
	struct blk_plug plug;

	trace_ext4_da_writepages(inode, wbc);

	/*
	 * No pages to write? This is mainly a kludge to avoid starting
	 * a transaction for special inodes like journal inode on last iput()
	 * because that could violate lock ordering on umount
	 */
	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	/*
	 * If the filesystem has aborted, it is read-only, so return
	 * right away instead of dumping stack traces later on that
	 * will obscure the real source of the problem.  We test
	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
	 * the latter could be true if the filesystem is mounted
	 * read-only, and in that case, ext4_da_writepages should
	 * *never* be called, so if that ever happens, we would want
	 * the stack trace.
	 */
	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
		return -EROFS;

	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
		range_whole = 1;

	range_cyclic = wbc->range_cyclic;
	if (wbc->range_cyclic) {
		index = mapping->writeback_index;
		if (index)
			cycled = 0;
		wbc->range_start = index << PAGE_CACHE_SHIFT;
		wbc->range_end  = LLONG_MAX;
		wbc->range_cyclic = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
	}

	/*
	 * This works around two forms of stupidity.  The first is in
	 * the writeback code, which caps the maximum number of pages
	 * written to be 1024 pages.  This is wrong on multiple
	 * levels; different architectues have a different page size,
	 * which changes the maximum amount of data which gets
	 * written.  Secondly, 4 megabytes is way too small.  XFS
	 * forces this value to be 16 megabytes by multiplying
	 * nr_to_write parameter by four, and then relies on its
	 * allocator to allocate larger extents to make them
	 * contiguous.  Unfortunately this brings us to the second
	 * stupidity, which is that ext4's mballoc code only allocates
	 * at most 2048 blocks.  So we force contiguous writes up to
	 * the number of dirty blocks in the inode, or
	 * sbi->max_writeback_mb_bump whichever is smaller.
	 */
	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
	if (!range_cyclic && range_whole) {
		if (wbc->nr_to_write == LONG_MAX)
			desired_nr_to_write = wbc->nr_to_write;
		else
			desired_nr_to_write = wbc->nr_to_write * 8;
	} else
		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
							   max_pages);
	if (desired_nr_to_write > max_pages)
		desired_nr_to_write = max_pages;

	if (wbc->nr_to_write < desired_nr_to_write) {
		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
		wbc->nr_to_write = desired_nr_to_write;
	}

retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);

	blk_start_plug(&plug);
	while (!ret && wbc->nr_to_write > 0) {

		/*
		 * we  insert one extent at a time. So we need
		 * credit needed for single extent allocation.
		 * journalled mode is currently not supported
		 * by delalloc
		 */
		BUG_ON(ext4_should_journal_data(inode));
		needed_blocks = ext4_da_writepages_trans_blocks(inode);

		/* start a new transaction*/
		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
					    needed_blocks);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
			       "%ld pages, ino %lu; err %d", __func__,
				wbc->nr_to_write, inode->i_ino, ret);
			blk_finish_plug(&plug);
			goto out_writepages;
		}

		/*
		 * Now call write_cache_pages_da() to find the next
		 * contiguous region of logical blocks that need
		 * blocks to be allocated by ext4 and submit them.
		 */
		ret = write_cache_pages_da(handle, mapping,
					   wbc, &mpd, &done_index);
		/*
		 * If we have a contiguous extent of pages and we
		 * haven't done the I/O yet, map the blocks and submit
		 * them for I/O.
		 */
		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
			mpage_da_map_and_submit(&mpd);
			ret = MPAGE_DA_EXTENT_TAIL;
		}
		trace_ext4_da_write_pages(inode, &mpd);
		wbc->nr_to_write -= mpd.pages_written;

		ext4_journal_stop(handle);

		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
			/* commit the transaction which would
			 * free blocks released in the transaction
			 * and try again
			 */
			jbd2_journal_force_commit_nested(sbi->s_journal);
			ret = 0;
		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
			/*
			 * Got one extent now try with rest of the pages.
			 * If mpd.retval is set -EIO, journal is aborted.
			 * So we don't need to write any more.
			 */
			pages_written += mpd.pages_written;
			ret = mpd.retval;
			io_done = 1;
		} else if (wbc->nr_to_write)
			/*
			 * There is no more writeout needed
			 * or we requested for a noblocking writeout
			 * and we found the device congested
			 */
			break;
	}
	blk_finish_plug(&plug);
	if (!io_done && !cycled) {
		cycled = 1;
		index = 0;
		wbc->range_start = index << PAGE_CACHE_SHIFT;
		wbc->range_end  = mapping->writeback_index - 1;
		goto retry;
	}

	/* Update index */
	wbc->range_cyclic = range_cyclic;
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		/*
		 * set the writeback_index so that range_cyclic
		 * mode will write it back later
		 */
		mapping->writeback_index = done_index;

out_writepages:
	wbc->nr_to_write -= nr_to_writebump;
	wbc->range_start = range_start;
	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
	return ret;
}
           

  Before analyzing this function, let's introduce the key structure it operates on:

struct mpage_da_data {
	struct inode *inode;
	sector_t b_blocknr;		/* start block number of extent */
	size_t b_size;			/* size of extent */
	unsigned long b_state;		/* state of the extent */
	unsigned long first_page, next_page;	/* extent of pages */
	struct writeback_control *wbc;
	int io_done;
	int pages_written;
	int retval;
};
           

struct mpage_da_data mainly records the extent that is waiting to be mapped, where:

first_page, next_page: the range to be written back in this round, in units of page offsets within the file.

b_blocknr: the starting logical block number of the extent to be mapped.

b_size: the size of the extent to be mapped.

b_state: the state flags of the bh corresponding to the extent.

io_done: set to 1 once the range [first_page, next_page-1] has been written back.

  Now for the main flow of ext4_da_writepages:

       The body of the function is a while loop: it writes back the region between wbc->range_start and wbc->range_end, handling one [mpd->first_page, mpd->next_page) chunk per iteration. Where do mpd->first_page and mpd->next_page come from?

       Say wbc->range_start to wbc->range_end covers pages 1,2,3,4,[5],6,7,8,9 of the file, where pages 1-4 are dirty, page 5 is clean, and pages 6-9 are dirty. Then ext4_da_writepages loops twice: the first pass handles [mpd->first_page = 1, mpd->next_page - 1 = 4], the second [mpd->first_page = 6, mpd->next_page - 1 = 9]. In other words, mpd->first_page and mpd->next_page delimit the largest run of contiguous dirty pages.
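
This accumulate-then-flush pattern can be illustrated with a small standalone program (purely illustrative user-space C, not ext4 code); with the dirty pattern from the example above it prints the two runs [1, 5) and [6, 10):

#include <stdio.h>

/* Illustrative stand-in for mapping and submitting one contiguous run. */
static void map_and_submit(unsigned long first, unsigned long next)
{
	printf("map and submit pages [%lu, %lu)\n", first, next);
}

int main(void)
{
	/* dirty pattern from the example: pages 1-4 and 6-9 are dirty */
	int dirty[10] = { 0, 1, 1, 1, 1, 0, 1, 1, 1, 1 };
	unsigned long first = 0, next = 0;	/* empty run: first == next */
	unsigned long idx;

	for (idx = 0; idx < 10; idx++) {
		if (!dirty[idx])
			continue;
		if (first == next) {		/* start a new contiguous run */
			first = idx;
			next = idx + 1;
		} else if (idx == next) {	/* page extends the current run */
			next = idx + 1;
		} else {			/* gap: flush the run, start anew */
			map_and_submit(first, next);
			first = idx;
			next = idx + 1;
		}
	}
	if (first != next)			/* flush the final run */
		map_and_submit(first, next);
	return 0;
}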

       (1) write_cache_pages_da accumulates the current run of contiguous dirty pages (recorded in mpd->first_page and mpd->next_page). While doing so it calls mpage_add_bh_to_extent to gather the delayed or unwritten ranges inside that run, recording them in mpd->b_blocknr and mpd->b_size.

       (2) mpage_da_map_and_submit performs the block allocation for the extent recorded in mpd, using ext4_map_blocks: delayed extents get real blocks allocated, while unwritten extents simply get converted to written.

       (3) mpage_da_submit_io submits the bios belonging to the range [mpd->first_page, mpd->next_page - 1]. After the allocation done by mpage_da_map_and_submit, all bhs are now genuinely mapped. Note that ext4_bio_write_page calls io_submit_add_bh to accumulate contiguous IO into an ext4_io_submit.

       Whenever a non-contiguous piece of IO is encountered, ext4_io_submit submits the bio accumulated so far and accumulation starts over. At the end, if io_done is not 1 there is still IO that has not been submitted, and ext4_io_submit is called once more to submit the remaining bio.

       That covers the VFS write; now for the bcache write path.

       As noted above, ext4_da_writepages ultimately submits its IO through ext4_io_submit, which calls submit_bio and enters the generic bio submission path. submit_bio calls generic_make_request, which ends up invoking q->make_request_fn, i.e. the bcache device's cached_dev_make_request, and from that point on we are inside bcache. The concrete steps follow (with a minimal sketch of the dispatch step after the list):

     (1) Enter the bcache device's q->make_request_fn, namely cached_dev_make_request. This function allocates a struct search; the struct closure inside it tracks, via reference counting, the completion of the bio within bcache, and the bio is bound to the struct search. The bio is also cloned. The clone is needed because logical devices like bcache and LVM may modify certain fields of the bio or split it during processing, yet when everything completes at the bottom of the stack a chain of completion callbacks must still run with the original bio as their argument; the original bio therefore has to be preserved, and a cloned bio is submitted instead. Finally request_write is called.

     (2) request_write. Our production bcache runs in writearound mode, so the write goes straight to the backing device: bch_generic_make_request is called to submit the bio, and what is submitted here is the cloned bio.

     (3) bch_generic_make_request mainly deals with bio splitting: if the bio exceeds bch_bio_max_sectors, it is split into pieces of at most bch_bio_max_sectors.

     (4) bch_generic_make_request_hack calls generic_make_request to submit the bio to the LVM device's queue, entering the LVM processing path.
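
For reference, the dispatch step mentioned above boils down to the queue's make_request_fn being called with the bio. A minimal sketch (checks and the current->bio_list recursion handling omitted; the _sketch suffix marks it as illustrative, not the real function):

/* Minimal sketch of the dispatch: generic_make_request() hands the bio to
 * the target queue's make_request_fn -- cached_dev_make_request for the
 * bcache device, dm_request for the LVM device underneath it. */
static void generic_make_request_sketch(struct bio *bio)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);

	q->make_request_fn(q, bio);
}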

(3) Analyzing and locating the bug

      Judging from the hung call stack, the qierouterproxy process called fsync, was switched out inside sleep_on_page, and never got scheduled back.

      Let's walk through the fsync system call. do_fsync calls vfs_fsync, vfs_fsync calls vfs_fsync_range, and vfs_fsync_range calls ext4's ext4_sync_file. Note the arguments, vfs_fsync_range(file, 0, LLONG_MAX, datasync): it waits on all of the file's data. ext4_sync_file calls filemap_write_and_wait_range; let's look at that function:

int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0;

	if (mapping->nrpages) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO) {
			int err2 = filemap_fdatawait_range(mapping,
						lstart, lend);
			if (!err)
				err = err2;
		}
	} else {
		err = filemap_check_errors(mapping);
	}
	return err;
}
           

      As the name filemap_write_and_wait_range suggests, it does two things: write and wait. The call to __filemap_fdatawrite_range writes out all the dirty pages of the address space (ending up in ext4_da_writepages), and the call to filemap_fdatawait_range then waits, one by one, for all those pages to finish writeback.

  Now look at filemap_fdatawait_range:

int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
	struct pagevec pvec;
	int nr_pages;
	int ret2, ret = 0;

	if (end_byte < start_byte)
		goto out;

	pagevec_init(&pvec, 0);
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;
			wait_on_page_writeback(page);
			if (TestClearPageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
out:
	ret2 = filemap_check_errors(mapping);
	if (!ret)
		ret = ret2;

	return ret;
}
           

      This function walks the pages of the inode's address space that are tagged PAGECACHE_TAG_WRITEBACK and calls wait_on_page_writeback on each of them to wait for its writeback to finish. If a page is still under writeback, the process is switched out.
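
For reference, in this kernel generation wait_on_page_writeback is essentially the following inline helper from include/linux/pagemap.h; the waiter sleeps uninterruptibly (D state) and is only woken when something calls end_page_writeback on the page:

/* Sleep until PG_writeback is cleared for this page; end_page_writeback()
 * on the bio completion side clears the bit and wakes the waiters. */
static inline void wait_on_page_writeback(struct page *page)
{
	if (PageWriteback(page))
		wait_on_page_bit(page, PG_writeback);
}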

      Add a pair of printk calls around wait_on_page_writeback (the Before/After lines in the code below):

int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
	struct pagevec pvec;
	int nr_pages;
	int ret2, ret = 0;

	if (end_byte < start_byte)
		goto out;

	pagevec_init(&pvec, 0);
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;
			if((page->mapping)&&(!((unsigned long)page->mapping&0x1))&&(page->mapping->host->i_ino == 13))
				printk("Before wait page index %ld!\n",page->index);
			wait_on_page_writeback(page);
			if((page->mapping)&&(!((unsigned long)page->mapping&0x1))&&(page->mapping->host->i_ino == 13))
				printk("After  wait page index %ld!\n",page->index);
			if (TestClearPageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
out:
	ret2 = filemap_check_errors(mapping);
	if (!ret)
		ret = ret2;

	return ret;
}
           

      Here 13 is the inode number of the file being written.

      Recompile and flash the kernel, then run the reproduction program test_32 in a directory on the bcache device. The system hangs, and the serial console prints:

[  191.456617] Before wait page index 0!
[  191.461711] After  wait page index 0!
[  191.461757] Before wait page index 1!
[  191.464117] After  wait page index 1!
[  191.467853] Before wait page index 2!
[  191.472090] After  wait page index 2!
[  191.475609] Before wait page index 3!
[  191.479335] After  wait page index 3!
[  191.483291] Before wait page index 4!
[  191.487035] After  wait page index 4!
[  191.490799] Before wait page index 5!
[  191.494462] After  wait page index 5!
[  191.498253] Before wait page index 6!
[  191.502042] After  wait page index 6!
[  191.505843] Before wait page index 7!
[  191.509621] After  wait page index 7!
[  191.513391] Before wait page index 8!
[  191.517184] After  wait page index 8!
[  191.521016] Before wait page index 9!
[  191.524775] After  wait page index 9!
[  191.528587] Before wait page index 10!
[  191.532452] After  wait page index 10!
[  191.536670] Before wait page index 11!
[  191.540250] After  wait page index 11!
[  191.544103] Before wait page index 12!
[  191.547983] After  wait page index 12!
[  191.551893] Before wait page index 13!
[  191.555748] After  wait page index 13!
[  191.559674] Before wait page index 31!
           

      Page index 31 has a Before line but no After line, so the fsync system call is hung on page index 31, the last of the 32 pages written. The reason not all of pages [0, 31] appear is that some pages had already finished their IO before fsync was called.

      Why did the page with index 31 never finish writeback, leaving the process waiting on it stuck in D state? bio->bi_end_io is what wakes the processes waiting for a page's IO to complete and clears the page's writeback flag; on ext4, bi_end_io is ext4_end_bio. Its code (with our instrumentation) is:

static void ext4_end_bio(struct bio *bio, int error)
{
	ext4_io_end_t *io_end = bio->bi_private;
	struct inode *inode;
	int i;
	int blocksize;
	sector_t bi_sector = bio->bi_sector;
	struct page *page1 = NULL;
	int need_printk = 0;
	
	if(bio_has_data(bio)&&!PageSwapBacked(page1=bio->bi_io_vec[0].bv_page)&&(page1->mapping)&&(!((unsigned long)page1->mapping&0x1))&&(page1->mapping->host->i_ino == 13))
		need_printk = 1;
	BUG_ON(!io_end);
	inode = io_end->inode;
	blocksize = 1 << inode->i_blkbits;
	bio->bi_private = NULL;
	bio->bi_end_io = NULL;
	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = 0;
	for (i = 0; i < bio->bi_vcnt; i++) {
		struct bio_vec *bvec = &bio->bi_io_vec[i];
		struct page *page = bvec->bv_page;
		struct buffer_head *bh, *head;
		unsigned bio_start = bvec->bv_offset;
		unsigned bio_end = bio_start + bvec->bv_len;
		unsigned under_io = 0;
		unsigned long flags;

		if (!page)
			continue;
		if(need_printk)
			printk("page->index %ld start %d end %d!\n",page->index,bio_start,bio_end);
		if (error) {
			SetPageError(page);
			set_bit(AS_EIO, &page->mapping->flags);
		}
		bh = head = page_buffers(page);
		/*
		 * We check all buffers in the page under BH_Uptodate_Lock
		 * to avoid races with other end io clearing async_write flags
		 */
		local_irq_save(flags);
		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
		do {
			if (bh_offset(bh) < bio_start ||
			    bh_offset(bh) + blocksize > bio_end) {
				if (buffer_async_write(bh))
					under_io++;
				continue;
			}
			clear_buffer_async_write(bh);
			if (error)
				buffer_io_error(bh);
		} while ((bh = bh->b_this_page) != head);
		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
		local_irq_restore(flags);
		if (!under_io){
			end_page_writeback(page);
			if(need_printk)
				printk("inode blocksize %d page index %ld!\n",1<<(inode->i_blkbits),page->index);
		}
	}
	bio_put(bio);

	if (error) {
		io_end->flag |= EXT4_IO_END_ERROR;
		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
			     "(offset %llu size %ld starting block %llu)",
			     inode->i_ino,
			     (unsigned long long) io_end->offset,
			     (long) io_end->size,
			     (unsigned long long)
			     bi_sector >> (inode->i_blkbits - 9));
	}

	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
		ext4_free_io_end(io_end);
		return;
	}

	ext4_add_complete_io(io_end);
}
           

       This function iterates over every page of the bio, examines the page's buffer heads, and handles the page according to their writeback state: once all of a page's bhs have completed writeback, end_page_writeback is called to wake the processes waiting on that page and clear its writeback flag. The need_printk filter at the top of the function and the printk after end_page_writeback are our instrumentation; they only report pages of inode 13 whose writeback has completed.

       Recompile and flash the kernel again, run the test_32 reproduction program on the bcache device, and dmesg shows:

[  214.340375] inode blocksize 4096 page index 0!
[  214.340424] inode blocksize 4096 page index 1!
[  214.343944] inode blocksize 4096 page index 2!
[  214.348572] inode blocksize 4096 page index 3!
[  214.353120] inode blocksize 4096 page index 4!
[  214.357661] inode blocksize 4096 page index 5!
[  214.362261] inode blocksize 4096 page index 6!
[  214.366924] inode blocksize 4096 page index 7!
[  214.371436] inode blocksize 4096 page index 8!
[  214.375958] inode blocksize 4096 page index 9!
[  214.380575] inode blocksize 4096 page index 10!
[  214.385187] inode blocksize 4096 page index 11!
[  214.390170] inode blocksize 4096 page index 12!
[  214.394488] inode blocksize 4096 page index 13!
[  214.399165] inode blocksize 4096 page index 14!
[  214.403803] inode blocksize 4096 page index 15!
[  214.408478] inode blocksize 4096 page index 16!
[  214.413117] inode blocksize 4096 page index 17!
[  214.417773] inode blocksize 4096 page index 18!
[  214.422536] inode blocksize 4096 page index 19!
[  214.427092] inode blocksize 4096 page index 20!
[  214.431816] inode blocksize 4096 page index 21!
[  214.436418] inode blocksize 4096 page index 22!
[  214.441101] inode blocksize 4096 page index 23!
[  214.445736] inode blocksize 4096 page index 24!
[  214.450415] inode blocksize 4096 page index 25!
[  214.455042] inode blocksize 4096 page index 26!
[  214.459744] inode blocksize 4096 page index 27!
[  214.464369] inode blocksize 4096 page index 28!
[  214.469098] inode blocksize 4096 page index 29!
[  214.473705] inode blocksize 4096 page index 30!
           

       test_32 writes 32 pages. The output shows that only pages [0, 30] completed their IO; the last page, index 31, never did, so the process waiting for it was never woken. This matches the earlier prints from filemap_fdatawait_range. The output also shows that ext4's block size is 4096, i.e. each page has exactly one bh.

      Clearly end_page_writeback was never called for the last page. The do/while loop in ext4_end_bio walks all of the page's bhs (here only one): if a bh falls inside the bvec's range, clear_buffer_async_write clears its BH_Async_Write flag; if a bh lies outside the bvec's range and still has BH_Async_Write set, under_io is incremented, meaning the page still has bhs whose writeback has not completed, and end_page_writeback must not be called for that page. BH_Async_Write is set in ext4_bio_write_page, after the extent has been mapped and before the accumulated bio is submitted.

      Let's add more instrumentation to ext4_end_bio and print the range covered by each bvec (the need_printk filter plus the per-bvec printk in the loop; the print after end_page_writeback is now commented out). The instrumented code is:

static void ext4_end_bio(struct bio *bio, int error)
{
	ext4_io_end_t *io_end = bio->bi_private;
	struct inode *inode;
	int i;
	int blocksize;
	sector_t bi_sector = bio->bi_sector;
	struct page *page1 = NULL;
	int need_printk = 0;
	
	if(bio_has_data(bio)&&!PageSwapBacked(page1=bio->bi_io_vec[0].bv_page)&&(page1->mapping)&&(!((unsigned long)page1->mapping&0x1))&&(page1->mapping->host->i_ino == 13))
		need_printk = 1;
	BUG_ON(!io_end);
	inode = io_end->inode;
	blocksize = 1 << inode->i_blkbits;
	bio->bi_private = NULL;
	bio->bi_end_io = NULL;
	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = 0;
	for (i = 0; i < bio->bi_vcnt; i++) {
		struct bio_vec *bvec = &bio->bi_io_vec[i];
		struct page *page = bvec->bv_page;
		struct buffer_head *bh, *head;
		unsigned bio_start = bvec->bv_offset;
		unsigned bio_end = bio_start + bvec->bv_len;
		unsigned under_io = 0;
		unsigned long flags;

		if (!page)
			continue;
		if(need_printk)
			printk("page->index %ld start %d end %d!\n",page->index,bio_start,bio_end);
		if (error) {
			SetPageError(page);
			set_bit(AS_EIO, &page->mapping->flags);
		}
		bh = head = page_buffers(page);
		/*
		 * We check all buffers in the page under BH_Uptodate_Lock
		 * to avoid races with other end io clearing async_write flags
		 */
		local_irq_save(flags);
		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
		do {
			if (bh_offset(bh) < bio_start ||
			    bh_offset(bh) + blocksize > bio_end) {
				if (buffer_async_write(bh))
					under_io++;
				continue;
			}
			clear_buffer_async_write(bh);
			if (error)
				buffer_io_error(bh);
		} while ((bh = bh->b_this_page) != head);
		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
		local_irq_restore(flags);
		if (!under_io){
			end_page_writeback(page);
			//if(need_printk)
			//	printk("inode blocksize %d page index %ld!\n",1<<(inode->i_blkbits),page->index);
		}
	}
	bio_put(bio);

	if (error) {
		io_end->flag |= EXT4_IO_END_ERROR;
		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
			     "(offset %llu size %ld starting block %llu)",
			     inode->i_ino,
			     (unsigned long long) io_end->offset,
			     (long) io_end->size,
			     (unsigned long long)
			     bi_sector >> (inode->i_blkbits - 9));
	}

	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
		ext4_free_io_end(io_end);
		return;
	}

	ext4_add_complete_io(io_end);
}
           

      Recompile and flash the kernel once more, run ./test_32 in the bcache directory, and dmesg shows:

[  175.681627] page->index 0 start 0 end 4096!
[  175.681686] page->index 1 start 0 end 4096!
[  175.684686] page->index 2 start 0 end 4096!
[  175.689027] page->index 3 start 0 end 4096!
[  175.693330] page->index 4 start 0 end 4096!
[  175.697622] page->index 5 start 0 end 4096!
[  175.701956] page->index 6 start 0 end 4096!
[  175.706245] page->index 7 start 0 end 4096!
[  175.710627] page->index 8 start 0 end 4096!
[  175.714875] page->index 9 start 0 end 4096!
[  175.719203] page->index 10 start 0 end 4096!
[  175.723581] page->index 11 start 0 end 4096!
[  175.727991] page->index 12 start 0 end 4096!
[  175.732449] page->index 13 start 0 end 4096!
[  175.736791] page->index 14 start 0 end 4096!
[  175.741231] page->index 15 start 0 end 4096!
[  175.745588] page->index 16 start 0 end 4096!
[  175.750204] page->index 17 start 0 end 4096!
[  175.754370] page->index 18 start 0 end 4096!
[  175.758787] page->index 19 start 0 end 4096!
[  175.763168] page->index 20 start 0 end 4096!
[  175.767566] page->index 21 start 0 end 4096!
[  175.772026] page->index 22 start 0 end 4096!
[  175.776384] page->index 23 start 0 end 4096!
[  175.780831] page->index 24 start 0 end 4096!
[  175.785183] page->index 25 start 0 end 4096!
[  175.789629] page->index 26 start 0 end 4096!
[  175.793983] page->index 27 start 0 end 4096!
[  175.798418] page->index 28 start 0 end 4096!
[  175.802776] page->index 29 start 0 end 4096!
[  175.807161] page->index 30 start 0 end 4096!
[  175.811627] page->index 31 start 3584 end 4096!
           

       Why is the last bvec [3584, 4096), only 512 bytes, instead of [0, 4096) like all the others? According to the earlier analysis of the ext4 write path, writing 32 pages should result in physically contiguous blocks on disk, and with contiguous physical space one bio covers everything, so all 32 bvecs ought to be [0, 4096). A modified bvec made me think of bio splitting: if a bio is split at a point that is not a bvec boundary, the bvec ranges have to be recomputed. In bcache the splitting is done by bch_bio_split, which we will come back to. First let's confirm whether a split actually happens.

       To check whether the bio gets split, I printed the bio size at bcache's entry point and at bcache's exit point (which is LVM's entry point), filtering with the following condition:

if(bio_has_data(bio)&&!PageSwapBacked(page1=bio->bi_io_vec[0].bv_page)&&(page1->mapping)&&(!((unsigned long)page1->mapping&0x1))&&(page1->mapping->host->i_ino == 13))
           

       This prints only the bios issued for the file with inode number 13, keeping the noise out.

       bcache's entry point is the bcache device queue's make_request_fn, namely cached_dev_make_request; LVM's entry point is the LVM device's make_request_fn, namely dm_request. The output is:

[  184.801757] func cached_dev_make_request bio sectors 256!
[  184.805082] func dm_request bio sectors 255!
[  184.809459] func dm_request bio sectors 1!
           

       Clearly a 256-sector bio goes into bcache and comes out split into a 255-sector bio and a 1-sector bio.
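
A quick sanity check ties these numbers to the [3584, 4096) bvec printed earlier (assuming 512-byte sectors and 4096-byte pages):

#include <stdio.h>

int main(void)
{
	const int sector = 512, page = 4096;
	int total = 256 * sector;	/* the bio entering bcache: 32 full pages */
	int split = 255 * sector;	/* first fragment: bch_bio_max_sectors    */
	int tail  = total - split;	/* what is left for the second fragment   */

	printf("tail fragment: %d bytes\n", tail);		/* 512           */
	printf("tail starts at offset %d of page %d\n",
	       split % page, split / page);			/* 3584, page 31 */
	return 0;
}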

       Let's walk through the bio's path inside bcache once more.

       The bio enters bcache's make_request_fn, i.e. cached_dev_make_request. Since it is a write bio, it goes to request_write, which calls bch_generic_make_request. The code of bch_generic_make_request is:

void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
 {
     struct bio_split_hook *s;

     if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
         goto submit;

     if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
         goto submit;

     s = mempool_alloc(p->bio_split_hook, GFP_NOIO);

     s->bio      = bio;
     s->p        = p;
     s->bi_end_io    = bio->bi_end_io;
     s->bi_private   = bio->bi_private;
     bio_get(bio);

     closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL);
     return;
 submit:
     bch_generic_make_request_hack(bio);
 }
           

       If the bio is no larger than bch_bio_max_sectors (here 255 sectors of 512 bytes), bch_generic_make_request_hack is called directly to submit it to the LVM device below. If the bio exceeds 255 sectors, __bch_bio_submit_split is invoked: it uses bch_bio_split to cut the bio into pieces of at most 255 sectors and submits each piece to the LVM device via bch_generic_make_request_hack.
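
       Based on that description, the split loop looks roughly like the sketch below (a paraphrase of the 3.x bcache code, not guaranteed to be verbatim): every fragment gets bch_bio_submit_split_endio as its completion callback, points its bi_private at the split hook's closure, and takes a reference on that closure before being submitted.

/* Rough paraphrase of __bch_bio_submit_split(); details may differ. */
static void __bch_bio_submit_split(struct closure *cl)
{
	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
	struct bio *bio = s->bio, *n;

	do {
		/* Carve off at most bch_bio_max_sectors; on the last pass
		 * bch_bio_split() returns bio itself (the trimmed clone). */
		n = bch_bio_split(bio, bch_bio_max_sectors(bio),
				  GFP_NOIO, s->p->bio_split);

		n->bi_end_io  = bch_bio_submit_split_endio;
		n->bi_private = &s->cl;

		closure_get(&s->cl);
		bch_generic_make_request_hack(n);
	} while (n != bio);

	continue_at(cl, bch_bio_submit_split_done, NULL);
}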

       The relationship between these bios: the original bio (256 sectors), the cloned bio (256 sectors), and the cloned bio then split into two bios of 255 sectors and 1 sector. Looking at bch_bio_split: when the cloned bio is split, a new 255-sector bio is allocated and the clone itself is trimmed down to a 1-sector bio.

       So we end up with three bios: the original bio (256 sectors), the newly allocated 255-sector bio produced by the split, and the original clone, now trimmed to 1 sector.

       The original bio's completion callback is ext4_end_bio. The clone's own callback (request_endio) and private pointer (closure1, the search's closure) are saved aside in the split hook; after the split, both the newly allocated 255-sector bio and the 1-sector clone have bch_bio_submit_split_endio as their completion callback and closure2 (the split hook's closure) as bi_private. Each time one of the two fragments completes, bch_bio_submit_split_endio is called; it does closure_put on closure2, and when the count drops to 0 (i.e. both the 255-sector bio and the clone have completed), closure2->fn is invoked, which is bch_bio_submit_split_done.
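
       The per-fragment completion callback itself, again as a rough paraphrase (details may differ from the exact source): it drops the fragment's reference on closure2, and the last put is what triggers bch_bio_submit_split_done.

/* Rough paraphrase of bch_bio_submit_split_endio(); details may differ. */
static void bch_bio_submit_split_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;		/* closure2 */
	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);

	if (error)
		clear_bit(BIO_UPTODATE, &s->bio->bi_flags);

	bio_put(bio);
	closure_put(cl);	/* last put runs bch_bio_submit_split_done */
}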

       Now look at bch_bio_submit_split_done:

static void bch_bio_submit_split_done(struct closure *cl)
{
    struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);

    s->bio->bi_end_io = s->bi_end_io;
    s->bio->bi_private = s->bi_private;
    bio_endio(s->bio, 0);

    closure_debug_destroy(&s->cl);
    mempool_free(s, s->p->bio_split_hook);
}
           

        Here s->bio is the 1-sector clone bio. The function restores s->bi_end_io (request_endio) as the clone's completion callback and sets the clone's bi_private back to closure1 (it had been closure2). It then calls bio_endio, which runs request_endio; request_endio calls closure_put(closure1), decrementing closure1's count, and when that count reaches 0, closure1->fn is invoked, namely cached_dev_write_complete. That function finally calls the original bio's completion callback, ext4_end_bio. Quite a detour.

       In a nutshell: bcache never submits the bio it is handed. It submits a clone, and if the clone exceeds bch_bio_max_sectors the clone is further split into several pieces. Only after all of these pieces have completed their IO does bcache's callback machinery, from the completion callback of the last piece, invoke the original bio's callback, which in this case is ext4_end_bio.

       Back to the root cause of the io hang: why was the process waiting on the last page never woken, and why was that page's writeback flag never cleared? Let's look at how the original bio is cloned, in search_alloc:

static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
{
    struct bio_vec *bv;
    struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
    memset(s, 0, offsetof(struct search, op.keys));

    __closure_init(&s->cl, NULL);

    s->op.inode     = d->id;
    s->op.c         = d->c;
    s->d            = d;
    s->op.lock      = -1;
    s->task         = current;
    s->orig_bio     = bio;
    s->write        = (bio->bi_rw & REQ_WRITE) != 0;
    s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
    s->op.skip      = (bio->bi_rw & REQ_DISCARD) != 0;
    s->recoverable      = 1;
    s->start_time       = jiffies;
    do_bio_hook(s);

    if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
        bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
        memcpy(bv, bio_iovec(bio),
               sizeof(struct bio_vec) * bio_segments(bio));

        s->bio.bio.bi_io_vec    = bv;
        s->unaligned_bvec   = 1;
    }

    return s;
}
           

      The bio parameter is the original bio, saved in s->orig_bio. Inside do_bio_hook, the clone bio is s->bio.bio and it is a full copy of the original bio. do_bio_hook looks like this:

static void do_bio_hook(struct search *s)
{
    struct bio *bio = &s->bio.bio;
    memcpy(bio, s->orig_bio, sizeof(struct bio));

    bio->bi_end_io      = request_endio;
    bio->bi_private     = &s->cl;
    atomic_set(&bio->bi_cnt, 3);
}
           

      As the memcpy shows, the clone bio is a byte-for-byte copy of the original struct bio, so at this point the original bio and the clone share the same bvec array. This is the root of the problem.

      The if block in search_alloc allocates a separate bvec array for the clone (and copies the original's bvecs into it) only when the original bio's bvecs are not all full pages. In our case the original bio is 256 sectors and every bvec is a full page, so no new bvec array is allocated and the clone keeps sharing the original bio's bvec array.

      The problem is that when the clone bio is split, its 256 sectors are cut at the 255-sector mark into a 255-sector bio and a 1-sector bio, the 1-sector one being the clone itself. The split adjusts the bvec ranges: the clone's last bvec goes from {bv_offset = 0, bv_len = 4096} to {bv_offset = 3584, bv_len = 512}. Because the clone and the original bio share the bvec array, the last bvec of the original bio is modified as well. So when all the IO has completed and the original bio's completion callback finally runs, end_page_writeback() is never called for the last page, and the process waiting on that page hangs forever.
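
The effect can be demonstrated with a trivial standalone program (plain user-space C with deliberately simplified structs, not kernel code): a shallow copy of the bio shares the bi_io_vec pointer, so "splitting" the copy also rewrites what the original bio's completion path will see.

#include <stdio.h>
#include <string.h>

struct fake_bvec { unsigned int bv_offset, bv_len; };
struct fake_bio  { struct fake_bvec *bi_io_vec; unsigned int bi_vcnt; };

int main(void)
{
	struct fake_bvec vecs[32];
	struct fake_bio orig, clone;
	unsigned int i;

	for (i = 0; i < 32; i++)
		vecs[i] = (struct fake_bvec){ .bv_offset = 0, .bv_len = 4096 };
	orig.bi_io_vec = vecs;
	orig.bi_vcnt = 32;

	/* do_bio_hook: shallow struct copy -- bi_io_vec is shared */
	memcpy(&clone, &orig, sizeof(orig));

	/* split at 255 sectors: the clone keeps only the last 512 bytes */
	clone.bi_io_vec[clone.bi_vcnt - 1].bv_offset = 3584;
	clone.bi_io_vec[clone.bi_vcnt - 1].bv_len = 512;

	/* the original bio's last bvec was changed behind its back */
	printf("orig last bvec: offset=%u len=%u\n",
	       orig.bi_io_vec[orig.bi_vcnt - 1].bv_offset,
	       orig.bi_io_vec[orig.bi_vcnt - 1].bv_len);
	return 0;
}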

(4) Fix and conclusion

      Knowing that the root cause is the original bio and the clone sharing one bvec array, search_alloc can be modified as follows:

static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
{
    struct bio_vec *bv;
    struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
    memset(s, 0, offsetof(struct search, op.keys));

    __closure_init(&s->cl, NULL);

    s->op.inode     = d->id;
    s->op.c         = d->c;
    s->d            = d;
    s->op.lock      = -1;
    s->task         = current;
    s->orig_bio     = bio;
    s->write        = (bio->bi_rw & REQ_WRITE) != 0;
    s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
    s->op.skip      = (bio->bi_rw & REQ_DISCARD) != 0;
    s->recoverable      = 1;
    s->start_time       = jiffies;
    do_bio_hook(s);

  //  if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
        bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
        memcpy(bv, bio_iovec(bio),
               sizeof(struct bio_vec) * bio_segments(bio));

        s->bio.bio.bi_io_vec    = bv;
        s->unaligned_bvec   = 1;
    //}

    return s;
}
           

      That is, comment out the if condition (and its closing brace) so that, aligned or not, a separate bvec array is always allocated for the clone bio. The clone then never shares a bvec array with the original bio, so the original can no longer be modified behind its back.

      In logical devices such as bcache and LVM, which redirect or split bios, the original bio is never submitted directly, precisely so that its fields are not modified; only after the entire IO range covered by the original bio has completed does the callback chain invoke the original bio's completion callback. Throughout that processing, the original bio must never be modified.