一、vfs_read的调用流程:
二、VFS层调用流程
1、系统调用sys_read会调用到vfs层的__vfs_read接口如下,在vfs层接口会调用到具体的文件系统的
操作接口:
//kernel-4.9/fs/read_write.c
/*
 * VFS-layer read dispatcher: prefer the filesystem's legacy ->read()
 * method; fall back to the iterator-based ->read_iter() path via
 * new_sync_read(); return -EINVAL if the file supports neither.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
loff_t *pos)
{
if (file->f_op->read)
/* legacy per-filesystem read method */
return file->f_op->read(file, buf, count, pos);
else if (file->f_op->read_iter)
/* wrap buf/count in a kiocb + iov_iter and call ->read_iter() */
return new_sync_read(file, buf, count, pos);
else
return -EINVAL;
}
在new_sync_read中会调用到具体的文件系统的读写接口generic_file_read_iter:
/*
 * Bridge from the buffer-based read API to ->read_iter(): builds a kiocb
 * and iov_iter around (buf, len) and invokes the filesystem's read_iter
 * method — generic_file_read_iter for ext4.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
//...... (kiocb/iov_iter setup elided in this excerpt)
ret = filp->f_op->read_iter(&kiocb, &iter);
return ret;
}
2、filp->f_op->read_iter会调用到generic_file_read_iter,generic_file_read_iter这个是所有文件系统通用
的读接口:
//kernel-4.9/fs/ext4/file.c
/*
 * ext4's file_operations: there is no legacy ->read method, so
 * __vfs_read takes the ->read_iter path (generic_file_read_iter).
 */
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = generic_file_read_iter, /* generic page-cache read routine */
.write_iter = ext4_file_write_iter,
//...... (remaining methods elided in this excerpt)
}
3、generic_file_read_iter是读文件的核心函数:
//kernel-4.9/mm/filemap.c
/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
ssize_t retval = 0;
size_t count = iov_iter_count(iter);
/* NOTE(review): excerpt — declarations of 'mapping' and 'data' are elided. */
if (iocb->ki_flags & IOCB_DIRECT) { /* special handling for O_DIRECT reads; studied later */
/* flush and wait on dirty page-cache pages covering the range first */
retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1);
retval = mapping->a_ops->direct_IO(iocb, &data);
if (retval >= 0) {
/* advance file position and iterator by the bytes transferred */
iocb->ki_pos += retval;
iov_iter_advance(iter, retval);
}
}
/* buffered path: read through the page cache */
retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);
}
可以看到在generic_file_read_iter针对数据的读取方式是IOCB_DIRECT还是其他类型进行区别操作,
对于没有添加IOCB_DIRECT标志的read会调用到do_generic_file_read:
//kernel-4.9/mm/filemap.c
/**
 * do_generic_file_read - generic file read routine
 * @filp: the file to read
 * @ppos: current file position
 * @iter: data destination
 * @written: already copied
 * The loff_t position tracks the current read/write offset;
 * shifting it right by PAGE_SHIFT yields the page index.
 **/
static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
struct iov_iter *iter, ssize_t written)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &filp->f_ra;
/* index of the file page this read starts at */
index = *ppos >> PAGE_SHIFT;
/* page index of the previous read (readahead state) */
prev_index = ra->prev_pos >> PAGE_SHIFT;
prev_offset = ra->prev_pos & (PAGE_SIZE-1);
/* last page covered by this read request */
last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
for (;;) {
/* look the page up in the page cache */
find_page:
/* fetch the page from the cache */
page = find_get_page(mapping, index);
if (!page) {
/* page not cached: start synchronous readahead, then retry lookup */
page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
page = find_get_page(mapping, index);
if (unlikely(page == NULL))
goto no_cached_page;
}
/* page is up to date: copy it out to user space */
page_ok:
/*
 * Ok, we have the page, and it's up-to-date, so
 * now we can copy it to user space...
 */
ret = copy_page_to_iter(page, offset, nr, iter);
/* advance offset/index by the bytes copied; carry into the page index */
offset += ret;
index += offset >> PAGE_SHIFT;
offset &= ~PAGE_MASK;
prev_offset = offset;
/* read the page from disk */
readpage:
/*
 * A previous I/O error may have been due to temporary
 * failures, eg. multipath errors.
 * PG_error will be set again if readpage fails.
 */
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
goto page_ok;
......
}
do_generic_file_read中调用到find_get_page在cache中查找page,如果没有找到page,调用
page_cache_sync_readahead对page进行预读取,预读取后find_get_page几乎都能命中page。其中
page_cache_sync_readahead我再继续看下对应的代码:
//kernel-4.9/mm/readahead.c
/*
 * Synchronous readahead entry point: called when the wanted page is
 * absent from the page cache; delegates to ondemand_readahead() with
 * hit_readahead_marker = false.
 */
void page_cache_sync_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
pgoff_t offset, unsigned long req_size)
{
/* do read-ahead */
ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
/*
 * standalone, small random read
 * Read as is, and do not pollute the readahead state.
 */
/* NOTE(review): excerpt — only the small-random-read branch is shown here;
 * lookahead_size is 0, so no readahead marker is set for this case. */
return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
}
继续看__do_page_cache_readahead,根据要读取的page索引和page数量,去查找相应的page,如果
没有则alloc一个新的page。然后调用read_pages继续处理:
/*
 * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
 * the pages first, then submits them all for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 */
int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size)
{
/* last valid page index of the file ('isize' is declared in the full source) */
end_index = ((isize - 1) >> PAGE_SHIFT);
/*
 * Preallocate as many pages as we will need.
 */
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
pgoff_t page_offset = offset + page_idx;
/* check whether the page is already cached (result handling elided) */
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_offset);
rcu_read_unlock();
/* allocate a fresh page-cache page and queue it on page_pool */
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
page->index = page_offset;
list_add(&page->lru, &page_pool);
/* tag the lookahead page so a later hit triggers async readahead */
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
ret++;
}
if (ret)
/* submit every allocated page for I/O in one batch */
read_pages(mapping, filp, &page_pool, ret, gfp_mask);
}
read_pages会调用blk_start_plug和blk_finish_plug进行bio的请求,start plug不会立马去调用到
bio驱动的queue中,而是加入到对应的plug list中,等到finish_plug时会通过submit_io去刷新plug队列上
的请求到驱动的queue进行处理。
/*
 * Submit a batch of page-cache pages for read I/O. The work is bracketed
 * by blk_start_plug()/blk_finish_plug() so the bios built by the
 * filesystem are accumulated and flushed to the driver queue together.
 */
static int read_pages(struct address_space *mapping, struct file *filp,
struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
{
struct blk_plug plug;
unsigned page_idx;
int ret;
blk_start_plug(&plug);
if (mapping->a_ops->readpages) { /* dispatch to the concrete filesystem's (ext4) readpages */
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
goto out;
}
out:
blk_finish_plug(&plug); /* flush the plugged requests to the driver */
}
三、ext4文件系统调用流程
1、ext4文件系统readpages:
//kernel-4.9/fs/ext4/inode.c
/*
 * ext4 address_space_operations: the page-cache <-> disk methods;
 * read_pages() above dispatches into ->readpages from this table.
 */
static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage, /* read a single page */
.readpages = ext4_readpages, /* batched readahead entry point */
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.direct_IO = ext4_direct_IO, /* O_DIRECT path used by generic_file_read_iter */
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
};
/*
 * ext4 ->readpages: hand the page list to ext4_mpage_readpages unless
 * the file stores its data inline in the inode (nothing to read then).
 */
static int
ext4_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
struct inode *inode = mapping->host;
/* If the file has inline data, no need to do readpages. */
if (ext4_has_inline_data(inode))
return 0;
return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
}
在ext4_mpage_readpages中,会构造一个bio,不同的page请求保存在bio的bio_vec不同元素(段)中。
当然很多时候,mpage_readpages多个page会产生多个bio,这个时候read_pages的start/stop plug组合就
起作用了:
/*
 * Build read bios for up to @nr_pages page-cache pages and submit them.
 * Pages whose blocks are contiguous on disk are packed into one bio; a
 * discontiguity forces the pending bio to be submitted and a new one
 * allocated.
 * NOTE(review): excerpt — block mapping and per-page setup are elided,
 * so identifiers like inode/bdev/blocks/blkbits are declared elsewhere.
 */
int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages)
{
struct bio *bio = NULL;
for (; nr_pages; nr_pages--) {
/*
 * This page will go to BIO. Do we need to send this
 * BIO off first?
 */
if (bio && (last_block_in_bio != blocks[0] - 1)) {
submit_and_realloc:
/* disk discontiguity: flush the current bio before starting anew */
ext4_submit_bio_read(bio);
bio = NULL;
}
if (bio == NULL) {
struct fscrypt_ctx *ctx = NULL;
/* per-file encryption: grab a crypto context for the read */
if (ext4_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
ctx = fscrypt_get_ctx(inode, GFP_NOFS);
if (IS_ERR(ctx))
goto set_error_page;
}
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
if (!bio) {
if (ctx)
fscrypt_release_ctx(ctx);
goto set_error_page;
}
/* target device and starting sector (fs block -> 512-byte sectors) */
bio->bi_bdev = bdev;
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io; /* completion unlocks/updates the pages */
bio->bi_private = ctx;
ext4_set_bio_ctx(inode, bio);
bio_set_op_attrs(bio, REQ_OP_READ, 0);
}
BUG_ON(pages && !list_empty(pages));
/* submit any bio still pending when the loop ends */
if (bio)
ext4_submit_bio_read(bio);
return 0;
}
2、ext4_submit_bio_read会调用submit_bio向blk层申请request:
/*
 * Thin wrapper around submit_bio() for ext4 read bios (pre-submission
 * steps are elided in this excerpt).
 */
static void
ext4_submit_bio_read(struct bio *bio)
{
//...... (pre-submission setup elided)
submit_bio(bio);
}
四、blk块设备驱动层:
1、submit_bio是内核标准的提交request方法,基本会调用到q->make_request_fn(q, bio):
//kernel-4.9/block/blk-core.c
/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 *
 * submit_bio() is very similar in purpose to generic_make_request(), and
 * uses that function to do most of the work. Both are fairly rough
 * interfaces; @bio must be presetup and ready for I/O.
 *
 */
blk_qc_t submit_bio(struct bio *bio)
{
/*
 * If it's a regular read/write or a barrier with data attached,
 * go through the normal accounting stuff before submission.
 */
/* (accounting code elided in this excerpt) */
......
return generic_make_request(bio);
}
在generic_make_request中会将bio提交到块设备层的请求队列:
/*
 * Core bio submission: routes @bio to its device queue's make_request_fn.
 * If we are already inside generic_make_request (current->bio_list set),
 * the bio is only queued; the active outer invocation drains the list
 * iteratively, which bounds stack depth for stacked block devices.
 */
blk_qc_t generic_make_request(struct bio *bio)
{
if (current->bio_list) {
/* nested submission: queue it; the active loop below will drain it */
/* NOTE(review): "¤t" below is a mangled "&current" — HTML-entity
 * encoding artifact in this excerpt, not real kernel code. */
bio_list_add(¤t->bio_list[0], bio);
goto out;
}
bio_list_init(&bio_list_on_stack[0]);
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
if (likely(blk_queue_enter(q, false) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
/* hand the bio to the queue's make_request handler */
ret = q->make_request_fn(q, bio);
} else {
bio_io_error(bio);
}
/* drain bios queued by nested submissions, one at a time */
bio = bio_list_pop(&bio_list_on_stack[0]);
} while (bio);
current->bio_list = NULL; /* deactivate */
}
调用queue的request_fn方法把request提交给磁盘驱动进行真正的处理。当前进程是plug的,在
read_pages之后会调用stop plug,将所有request集中交给磁盘驱动处理。
参考资料:
https://www.jianshu.com/p/acdadea07fd8
http://bbs.chinaunix.net/thread-3772486-1-1.html
http://bbs.chinaunix.net/thread-3774478-1-1.html
作者:frank_zyp
您的支持是对博主最大的鼓励,感谢您的认真阅读。
本文无所谓版权,欢迎转载。