一、vfs_read的调用流程:
二、VFS层调用流程
1、系统调用sys_read会调用到vfs层的__vfs_read接口如下,在vfs层接口会调用到具体的文件系统的
操作接口:
//kernel-4.9/fs/read_write.c
/*
 * VFS-layer read dispatcher: prefer the filesystem's legacy ->read()
 * method; fall back to the iterator-based ->read_iter() path via
 * new_sync_read(); return -EINVAL if the file supports neither.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
loff_t *pos)
{
if (file->f_op->read)
/* legacy per-filesystem read method */
return file->f_op->read(file, buf, count, pos);
else if (file->f_op->read_iter)
/* wrap buf/count in a kiocb + iov_iter and call ->read_iter() */
return new_sync_read(file, buf, count, pos);
else
return -EINVAL;
}
在new_sync_read中会调用到具体的文件系统的读写接口generic_file_read_iter:
/*
 * Bridge from the buffer-based read API to ->read_iter(): builds a kiocb
 * and iov_iter around (buf, len) and invokes the filesystem's read_iter
 * method — generic_file_read_iter for ext4.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
//...... (kiocb/iov_iter setup elided in this excerpt)
ret = filp->f_op->read_iter(&kiocb, &iter);
return ret;
}
2、filp->f_op->read_iter会调用到generic_file_read_iter,generic_file_read_iter这个是所有文件系统通用
的读接口:
//kernel-4.9/fs/ext4/file.c
/*
 * ext4's file_operations: there is no legacy ->read method, so
 * __vfs_read takes the ->read_iter path (generic_file_read_iter).
 */
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = generic_file_read_iter, /* generic page-cache read routine */
.write_iter = ext4_file_write_iter,
//...... (remaining methods elided in this excerpt)
}
3、generic_file_read_iter是读文件的核心函数:
//kernel-4.9/mm/filemap.c
/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
ssize_t retval = 0;
size_t count = iov_iter_count(iter);
/* NOTE(review): excerpt — declarations of 'mapping' and 'data' are elided. */
if (iocb->ki_flags & IOCB_DIRECT) { /* special handling for O_DIRECT reads; studied later */
/* flush and wait on dirty page-cache pages covering the range first */
retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1);
retval = mapping->a_ops->direct_IO(iocb, &data);
if (retval >= 0) {
/* advance file position and iterator by the bytes transferred */
iocb->ki_pos += retval;
iov_iter_advance(iter, retval);
}
}
/* buffered path: read through the page cache */
retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);
}
可以看到在generic_file_read_iter针对数据的读取方式是IOCB_DIRECT还是其他类型进行区别操作,
对于没有添加IOCB_DIRECT标志的read会调用到do_generic_file_read:
//kernel-4.9/mm/filemap.c
/**
 * do_generic_file_read - generic file read routine
 * @filp: the file to read
 * @ppos: current file position
 * @iter: data destination
 * @written: already copied
 * The loff_t position tracks the current read/write offset;
 * shifting it right by PAGE_SHIFT yields the page index.
 **/
static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
struct iov_iter *iter, ssize_t written)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &filp->f_ra;
/* index of the file page this read starts at */
index = *ppos >> PAGE_SHIFT;
/* page index of the previous read (readahead state) */
prev_index = ra->prev_pos >> PAGE_SHIFT;
prev_offset = ra->prev_pos & (PAGE_SIZE-1);
/* last page covered by this read request */
last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
for (;;) {
/* look the page up in the page cache */
find_page:
/* fetch the page from the cache */
page = find_get_page(mapping, index);
if (!page) {
/* page not cached: start synchronous readahead, then retry lookup */
page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
page = find_get_page(mapping, index);
if (unlikely(page == NULL))
goto no_cached_page;
}
/* page is up to date: copy it out to user space */
page_ok:
/*
 * Ok, we have the page, and it's up-to-date, so
 * now we can copy it to user space...
 */
ret = copy_page_to_iter(page, offset, nr, iter);
/* advance offset/index by the bytes copied; carry into the page index */
offset += ret;
index += offset >> PAGE_SHIFT;
offset &= ~PAGE_MASK;
prev_offset = offset;
/* read the page from disk */
readpage:
/*
 * A previous I/O error may have been due to temporary
 * failures, eg. multipath errors.
 * PG_error will be set again if readpage fails.
 */
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
goto page_ok;
......
}
do_generic_file_read中调用到find_get_page在cache中查找page,如果没有找到page,调用
page_cache_sync_readahead对page进行预读取,预读取后find_get_page几乎都能命中page。其中
page_cache_sync_readahead我再继续看下对应的代码:
//kernel-4.9/mm/readahead.c
/*
 * Synchronous readahead entry point: called when the wanted page is
 * absent from the page cache; delegates to ondemand_readahead() with
 * hit_readahead_marker = false.
 */
void page_cache_sync_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
pgoff_t offset, unsigned long req_size)
{
/* do read-ahead */
ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
/*
 * standalone, small random read
 * Read as is, and do not pollute the readahead state.
 */
/* NOTE(review): excerpt — only the small-random-read branch is shown here;
 * lookahead_size is 0, so no readahead marker is set for this case. */
return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
}
继续看__do_page_cache_readahead,根据要读取的page索引和page数量,去查找相应的page,如果
没有则alloc一个新的page。然后调用read_pages继续处理:
/*
 * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
 * the pages first, then submits them all for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 */
int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size)
{
/* last valid page index of the file ('isize' is declared in the full source) */
end_index = ((isize - 1) >> PAGE_SHIFT);
/*
 * Preallocate as many pages as we will need.
 */
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
pgoff_t page_offset = offset + page_idx;
/* check whether the page is already cached (result handling elided) */
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_offset);
rcu_read_unlock();
/* allocate a fresh page-cache page and queue it on page_pool */
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
page->index = page_offset;
list_add(&page->lru, &page_pool);
/* tag the lookahead page so a later hit triggers async readahead */
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
ret++;
}
if (ret)
/* submit every allocated page for I/O in one batch */
read_pages(mapping, filp, &page_pool, ret, gfp_mask);
}
read_pages会调用blk_start_plug和blk_finish_plug进行bio的请求,start plug不会立马去调用到
bio驱动的queue中,而是加入到对应的plug list中,等到finish_plug时会通过submit_io去刷新plug队列上
的请求到驱动的queue进行处理。
/*
 * Submit a batch of page-cache pages for read I/O. The work is bracketed
 * by blk_start_plug()/blk_finish_plug() so the bios built by the
 * filesystem are accumulated and flushed to the driver queue together.
 */
static int read_pages(struct address_space *mapping, struct file *filp,
struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
{
struct blk_plug plug;
unsigned page_idx;
int ret;
blk_start_plug(&plug);
if (mapping->a_ops->readpages) { /* dispatch to the concrete filesystem's (ext4) readpages */
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
goto out;
}
out:
blk_finish_plug(&plug); /* flush the plugged requests to the driver */
}
三、ext4文件系统调用流程
1、ext4文件系统readpages:
//kernel-4.9/fs/ext4/inode.c
/*
 * ext4 address_space_operations: the page-cache <-> disk methods;
 * read_pages() above dispatches into ->readpages from this table.
 */
static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage, /* read a single page */
.readpages = ext4_readpages, /* batched readahead entry point */
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.direct_IO = ext4_direct_IO, /* O_DIRECT path used by generic_file_read_iter */
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
};
/*
 * ext4 ->readpages: hand the page list to ext4_mpage_readpages unless
 * the file stores its data inline in the inode (nothing to read then).
 */
static int
ext4_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
struct inode *inode = mapping->host;
/* If the file has inline data, no need to do readpages. */
if (ext4_has_inline_data(inode))
return 0;
return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
}
在ext4_mpage_readpages中,会构造一个bio,不同的page请求保存在bio的bio_vec不同元素(段)中。
当然很多时候,mpage_readpages多个page会产生多个bio,这个时候read_pages的start/stop plug组合就
起作用了:
/*
 * Build read bios for up to @nr_pages page-cache pages and submit them.
 * Pages whose blocks are contiguous on disk are packed into one bio; a
 * discontiguity forces the pending bio to be submitted and a new one
 * allocated.
 * NOTE(review): excerpt — block mapping and per-page setup are elided,
 * so identifiers like inode/bdev/blocks/blkbits are declared elsewhere.
 */
int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages)
{
struct bio *bio = NULL;
for (; nr_pages; nr_pages--) {
/*
 * This page will go to BIO. Do we need to send this
 * BIO off first?
 */
if (bio && (last_block_in_bio != blocks[0] - 1)) {
submit_and_realloc:
/* disk discontiguity: flush the current bio before starting anew */
ext4_submit_bio_read(bio);
bio = NULL;
}
if (bio == NULL) {
struct fscrypt_ctx *ctx = NULL;
/* per-file encryption: grab a crypto context for the read */
if (ext4_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
ctx = fscrypt_get_ctx(inode, GFP_NOFS);
if (IS_ERR(ctx))
goto set_error_page;
}
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
if (!bio) {
if (ctx)
fscrypt_release_ctx(ctx);
goto set_error_page;
}
/* target device and starting sector (fs block -> 512-byte sectors) */
bio->bi_bdev = bdev;
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io; /* completion unlocks/updates the pages */
bio->bi_private = ctx;
ext4_set_bio_ctx(inode, bio);
bio_set_op_attrs(bio, REQ_OP_READ, 0);
}
BUG_ON(pages && !list_empty(pages));
/* submit any bio still pending when the loop ends */
if (bio)
ext4_submit_bio_read(bio);
return 0;
}
2、ext4_submit_bio_read会调用submit_bio向blk层申请request:
/*
 * Thin wrapper around submit_bio() for ext4 read bios (pre-submission
 * steps are elided in this excerpt).
 */
static void
ext4_submit_bio_read(struct bio *bio)
{
//...... (pre-submission setup elided)
submit_bio(bio);
}
四、blk块设备驱动层:
1、submit_bio是内核标准的提交request方法,基本会调用到q->make_request_fn(q, bio):
//kernel-4.9/block/blk-core.c
/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 *
 * submit_bio() is very similar in purpose to generic_make_request(), and
 * uses that function to do most of the work. Both are fairly rough
 * interfaces; @bio must be presetup and ready for I/O.
 *
 */
blk_qc_t submit_bio(struct bio *bio)
{
/*
 * If it's a regular read/write or a barrier with data attached,
 * go through the normal accounting stuff before submission.
 */
/* (accounting code elided in this excerpt) */
......
return generic_make_request(bio);
}
在generic_make_request中会将bio提交到块设备层的请求队列:
/*
 * Core bio submission: routes @bio to its device queue's make_request_fn.
 * If we are already inside generic_make_request (current->bio_list set),
 * the bio is only queued; the active outer invocation drains the list
 * iteratively, which bounds stack depth for stacked block devices.
 */
blk_qc_t generic_make_request(struct bio *bio)
{
if (current->bio_list) {
/* nested submission: queue it; the active loop below will drain it */
/* NOTE(review): "¤t" below is a mangled "&current" — HTML-entity
 * encoding artifact in this excerpt, not real kernel code. */
bio_list_add(¤t->bio_list[0], bio);
goto out;
}
bio_list_init(&bio_list_on_stack[0]);
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
if (likely(blk_queue_enter(q, false) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
/* hand the bio to the queue's make_request handler */
ret = q->make_request_fn(q, bio);
} else {
bio_io_error(bio);
}
/* drain bios queued by nested submissions, one at a time */
bio = bio_list_pop(&bio_list_on_stack[0]);
} while (bio);
current->bio_list = NULL; /* deactivate */
}
调用queue的request_fn方法把request提交给磁盘驱动进行真正的处理。当前进程是plug的,在
read_pages之后会调用stop plug,将所有request集中交给磁盘驱动处理。
参考资料:
https://www.jianshu.com/p/acdadea07fd8
http://bbs.chinaunix.net/thread-3772486-1-1.html
http://bbs.chinaunix.net/thread-3774478-1-1.html
作者:frank_zyp
您的支持是对博主最大的鼓励,感谢您的认真阅读。
本文无所谓版权,欢迎转载。