从系统调用read开始。
其系统调用实现如下,传入的参数是打开的文件描述符、用户态缓冲区和读取长度:
/*
 * read(2) system call entry point.
 *
 * fd:    file descriptor supplied by user space
 * buf:   user-space destination buffer
 * count: number of bytes requested
 *
 * Returns whatever vfs_read() returns, or -EBADF when the descriptor does
 * not resolve to a file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
/* Resolve the descriptor to a struct fd (file pointer plus flags). */
struct fd f = fdget_pos(fd);
/* Default result, returned unchanged when f.file is NULL. */
ssize_t ret = -EBADF;
if (f.file) {
/* Read the current file position into a local variable. */
loff_t pos =
file_pos_read(f.file);
ret =
vfs_read(f.file, buf, count, &pos);
/* Store the position back only when vfs_read() did not report an error. */
if (ret >= 0)
file_pos_write(f.file, pos);
/* Paired with fdget_pos() above. */
fdput_pos(f);
}
return ret;
}
fd是文件描述符,是一个整型值,本质上是一个索引,对应一个打开的文件,关联到一个struct file。
因为入参是文件描述符,所以需要通过fdget_pos(fd)函数(->fdget->fget_light,在current->files中查找)来获取file结构体。
fdget_pos函数返回的是结构体struct fd,其定义如下,包含了file和flags:
/* Result of looking up a file descriptor: the file plus lookup flags. */
struct fd {
/* The struct file the descriptor refers to; NULL-checked by the read() entry point. */
struct file *file;
/* Flag bits returned together with the file; their meaning is not shown in this excerpt. */
unsigned int flags;
};
可以看到其中调用了vfs_read函数,参数是file、用户态缓冲区、读取长度和文件偏移位置。
/*
 * vfs_read - filesystem-independent read entry point.
 *
 * file:  open file to read from
 * buf:   user-space destination buffer
 * count: number of bytes requested
 * pos:   in/out file position
 *
 * Returns the value produced by the file's read method (or by
 * do_sync_read() when only aio_read is provided), or a negative errno:
 *   -EBADF   the file was not opened for reading
 *   -EINVAL  the file has neither a read nor an aio_read method
 *   -EFAULT  the user buffer fails the access_ok() check
 *   or the negative value returned by rw_verify_area().
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
		return -EINVAL;
	/* Data is copied *into* buf, hence the VERIFY_WRITE check. */
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret >= 0) {
		/* A non-negative result replaces the requested length. */
		count = ret;
		if (file->f_op->read)
			ret = file->f_op->read(file, buf, count, pos);
		else
			ret = do_sync_read(file, buf, count, pos);
		if (ret > 0) {
			fsnotify_access(file);
			/* Per-task accounting of bytes read. */
			add_rchar(current, ret);
		}
		/* Per-task accounting of read system calls. */
		inc_syscr(current);
	}

	return ret;
}
先判断文件的模式是否可读,以及其file_operations结构体中是否定义了read或者aio_read,如果不满足则直接返回错误;接着用access_ok检查用户缓冲区,并用rw_verify_area校验读取范围。读取完成后,会更新进程结构中ioac成员记录的IO统计情况。
使用file的f_op函数集,ext4则是结构体ext4_file_operations,定义如下,所以file->f_op->read函数就是do_sync_read函数。
/*
 * File operations table for ext4 regular files.
 * .read is do_sync_read, which in turn dispatches to .aio_read
 * (generic_file_aio_read).
 */
const struct file_operations ext4_file_operations = {
	.llseek		= ext4_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= ext4_file_write,
	.unlocked_ioctl	= ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.mmap		= ext4_file_mmap,
	.open		= ext4_file_open,
	.release	= ext4_release_file,
	.fsync		= ext4_sync_file,
	.splice_read	= generic_file_splice_read,
	.splice_write	= generic_file_splice_write,
	.fallocate	= ext4_fallocate,
};
如果是xfs文件系统如下:
const struct file_operations xfs_file_operations = {
.llseek = xfs_file_llseek,
xfs_file_aio_read,
.aio_write =
xfs_file_aio_write,
xfs_file_splice_read,
xfs_file_splice_write,
.unlocked_ioctl = xfs_file_ioctl,
xfs_file_compat_ioctl,
.mmap = xfs_file_mmap,
.open = xfs_file_open,
xfs_file_release,
.fsync = xfs_file_fsync,
.fallocate =
xfs_file_fallocate,
接下来进入函数do_sync_read,其实现如下:
/*
 * do_sync_read - synchronous read implemented on top of ->aio_read.
 *
 * filp: open file to read from
 * buf:  user-space destination buffer
 * len:  number of bytes requested
 * ppos: in/out file position; updated from the kiocb before returning
 *
 * Returns the result of ->aio_read, or, when the request was queued
 * (-EIOCBQUEUED), the value returned by wait_on_sync_kiocb().
 */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	/* Describe the single user buffer as a one-element iovec. */
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;
	kiocb.ki_nbytes = len;

	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
	/* The request was queued: wait for it and take its final result. */
	if (-EIOCBQUEUED == ret)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
do_sync_read函数中会先初始化kiocb(kernel io control block),然后调用f_op->aio_read函数,在ext4中即generic_file_aio_read。
generic_file_aio_read函数会先调用generic_segment_checks检测要写入的用户缓冲区是否有效,这种检查在整个内核中都非常常见,如果不做检测,很容易因为非法指针而导致系统崩溃。
然后判断是否是直接IO,如果是则会调用retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs);
其中的a_ops是struct address_space_operations,ext4的定义如下,其direct_IO对应的是ext4_direct_IO。
/*
 * Address-space operations table for ext4.
 * The direct-I/O branch of generic_file_aio_read reaches ext4_direct_IO
 * through .direct_IO; the buffered path reads pages through .readpage.
 */
static const struct address_space_operations ext4_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_writepage,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage_range	= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};
另外xfs的如下:
const struct address_space_operations xfs_address_space_operations =
xfs_vm_readpage,
xfs_vm_readpages,
xfs_vm_writepage,
xfs_vm_writepages,
.set_page_dirty =
xfs_vm_set_page_dirty,
xfs_vm_releasepage,
xfs_vm_invalidatepage,
xfs_vm_write_begin,
xfs_vm_write_end,
xfs_vm_bmap,
.direct_IO = xfs_vm_direct_IO,
.migratepage = buffer_migrate_page,
.error_remove_page =
generic_error_remove_page,
如果不是直接IO,则调用函数do_generic_file_read,从磁盘读取请求的页并把它们复制到用户态缓冲区。真正的读操作是通过mapping->a_ops->readpage()来完成的。
address_space对象的readpage负责激活磁盘到页之间的I/O数据传输。ext4的readpage函数是ext4_readpage,它会调用函数mpage_readpage:如果块在磁盘上是连续的,就用单个bio;如果不连续,就用不同的bio描述符来读。
完事之后,由file_read_actor函数负责把页中的数据拷贝到用户态缓冲区中。
submit_bio是一个关键函数,负责根据传递的bio实例创建一个新请求,并使用make_request_fn将请求置于驱动程序的请求队列上。
之后就到块层了,块层会对这些请求进行合并、插入,从而提高性能。