VFS基础学习笔记 - 6.写文件过程

1. 前言

本专题我们开始学习虚拟文件系统VFS的相关内容。本专题主要参考了《存储技术原理分析》、ULA、ULK的相关内容。本文主要记录写文件的过程。

kernel版本:5.10
FS: minix
平台:arm64

注:
为方便阅读,正文标题采用分级结构标识,每一级用一个"-"表示,如:两级为"|- -",三级为"|- - -"

2. ksys_write

SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count)
	|--ksys_write(fd, buf, count);
		|--struct fd f = fdget_pos(fd);
		|--loff_t pos, *ppos = file_ppos(f.file);
		|  pos = *ppos;
		|  ppos = &pos;
		|--ret = vfs_write(f.file, buf, count, ppos);
		\--if (ret >= 0 && ppos)
				f.file->f_pos = pos;

总体流程与read调用是类似的,write主要是通过fd获取到file文件描述符,更新文件的位置,通过调用vfs_write写入,写入完毕更新文件的位置

 vfs_write(f.file, buf, count, ppos);
	|--对file->f_mode进行检查
	|--rw_verify_area(WRITE, file, pos, count)
	|  //获取超级块的写入权限
	|--file_start_write(file);
	|--if (file->f_op->write)
	|		ret = file->f_op->write(file, buf, count, pos);
	|  else if (file->f_op->write_iter)
	|		ret = new_sync_write(file, buf, count, pos);
	|  else
	|		ret = -EINVAL;
	\--file_end_write(file);
  1. vfs_write首先做一些基本的检查,如:文件是否可写、是否实现操作函数集、是否持有锁等;

  2. 如果file->f_op->write已经实现则执行file->f_op->write函数,否则通过new_sync_write执行write_iter回调函数
    write_iter和write回调至少要实现一个,如果没有定义write回调则一定要定义write_iter回调

new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
	|--struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	|  struct kiocb kiocb;
	|  struct iov_iter iter;
	|--init_sync_kiocb(&kiocb, filp);
	|  kiocb.ki_pos = (ppos ? *ppos : 0);
	|  iov_iter_init(&iter, WRITE, &iov, 1, len);
	|--call_write_iter(filp, &kiocb, &iter);
	|		file->f_op->write_iter(kio, iter);
	|--if (ret > 0 && ppos)
			*ppos = kiocb.ki_pos;

对于minix系统而言,write_iter回调为generic_file_write_iter

ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
	|--struct file *file = iocb->ki_filp;
	|  struct inode *inode = file->f_mapping->host;
	|  // Performs necessary checks before doing a write
	|--generic_write_checks(iocb, from);
	|--if (ret > 0)
	|		ret = __generic_file_write_iter(iocb, from);
	|--if (ret > 0)
			ret = generic_write_sync(iocb, ret);

此处只分析__generic_file_write_iter

ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
	|--struct file *file = iocb->ki_filp;
	|  struct address_space * mapping = file->f_mapping;
	|  struct inode    *inode = mapping->host;
	|--current->backing_dev_info = inode_to_bdi(inode);
	|--file_remove_privs(file);
	|--file_update_time(file)
	|--if (iocb->ki_flags & IOCB_DIRECT)
	|		written = generic_file_direct_write(iocb, from);
	|		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
	|			goto out;
	|		//如果direct_io没有将需要写盘的数据全部写完,就要调用generic_perform_write函数做一次buffer write
	|		status = generic_perform_write(file, from, pos = iocb->ki_pos);
	|		err = filemap_write_and_wait_range(mapping, pos, endbyte);
	|		if (err == 0)
	|			iocb->ki_pos = endbyte + 1;
	|			written += status;
	|			invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, endbyte >> PAGE_SHIFT);
	|  else
	|		written = generic_perform_write(file, from, iocb->ki_pos);
	|		if (likely(written > 0))
	|			iocb->ki_pos += written;
out:
	|--current->backing_dev_info = NULL;

generic_file_direct_write:对直接IO的处理,将直接绕过page cache写入磁盘,当出错或者写完就会直接返回,如果direct_io没有将需要写盘的数据全部写完,就要调用generic_perform_write函数做一次buffer write

generic_perform_write:对buffer io的处理,基于iovec迭代器,一个个的iovec推进,放入page cache

|- -generic_file_direct_write

ssize_t  generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
	|--struct file     *file = iocb->ki_filp;
	|  struct address_space *mapping = file->f_mapping;
	|  struct inode    *inode = mapping->host;
	|  loff_t          pos = iocb->ki_pos;
	|  write_len = iov_iter_count(from);
	|  end = (pos + write_len - 1) >> PAGE_SHIFT;
	|--written = filemap_write_and_wait_range(mapping, pos,pos + write_len - 1);
	|  //发生错误退出
	|  if (written)
	|		goto out;
	|--written = invalidate_inode_pages2_range(mapping,pos >> PAGE_SHIFT, end);
	|	//If a page can not be invalidated, return 0 to fall back to buffered write.
	|   if (written)
	|		if (written == -EBUSY)
	|			return 0;
	|		goto out;
	|--written = mapping->a_ops->direct_IO(iocb, from);
	|  //再次执行invalidate page cache
	|--if (written > 0 && mapping->nrpages &&
	|			invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
	|		dio_warn_stale_pagecache(file);
	|  //更新位置
	|--if (written > 0)
	|		pos += written;
	|		write_len -= written;
	|		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode))
	|			i_size_write(inode, pos);
	|			mark_inode_dirty(inode);
	|		iocb->ki_pos = pos;
out:
	|--return written;
	

|- -generic_perform_write

ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos)
	|--struct address_space *mapping = file->f_mapping;
	|--const struct address_space_operations *a_ops = mapping->a_ops;
	|--do {
			struct page *page;
			unsigned long offset;   /* Offset into pagecache page */
			unsigned long bytes;    /* Bytes to write to page */
			offset = (pos & (PAGE_SIZE - 1));
			bytes = min_t(unsigned long, PAGE_SIZE - offset,iov_iter_count(i));
again:
			//调用块设备的write_begin函数,获取块设备缓存page,如果page不存在则需要分配
			status = a_ops->write_begin(file, mapping, pos, bytes, flags,&page, &fsdata);
			//如果该page存在可写的用户态映射,先flush dcache以保证数据一致性
			if (mapping_writably_mapped(mapping))
				flush_dcache_page(page);
			//将用户要写的用户空间的数据copy到块设备的缓存page中
			copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
			flush_dcache_page(page);
			//write_end函数作用就是标记该page为最新、page bh为dirty,等待在合适的时机被刷到设备中
			status = a_ops->write_end(file, mapping, pos, bytes, copied,page, fsdata);
			copied = status;
			cond_resched();
			//根据完成的字节数向前推进
			iov_iter_advance(i, copied)
			//偏移加上完成的数据长度
			pos += copied;
			//更新写入总数
			written += copied;
			// 此处会触发page cache脏页写回
			balance_dirty_pages_ratelimited(mapping);
		} while (iov_iter_count(i));

参考:块存储:写块设备缓存的代码注释

  1. write_begin:块设备的write_begin函数是blkdev_write_begin, 该函数为块设备的缓存准备page:

  2. iov_iter_copy_from_user_atomic:作用是将用户要写的用户空间的数据copy到块设备的缓存,即page中

  3. write_end:块设备的write_end函数是blkdev_write_end,在第2步骤已将用户数据copy到块设备的缓存page中,blkdev_write_end函数作用就是标记该page为最新、page bh为dirty,等待在合适的时机被刷到设备中:

参考文档

块存储:写块设备缓存的代码注释

おすすめ

転載: blog.csdn.net/jasonactions/article/details/117278534