1. 前言
本专题我们开始学习虚拟文件系统VFS的相关内容。本专题主要参考了《存储技术原理分析》、ULA、ULK的相关内容。本文主要记录写文件的过程。
kernel版本:5.10
FS: minix
平台:arm64
注:
为方便阅读,正文标题采用分级结构标识,每一级用一个"-"表示,如:两级为"|- -",三级为"|- - -"
2. ksys_write
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count)
|--ksys_write(fd, buf, count);
|--struct fd f = fdget_pos(fd);
|--loff_t pos, *ppos = file_ppos(f.file);
| pos = *ppos;
| ppos = &pos;
|--ret = vfs_write(f.file, buf, count, ppos);
\--if (ret >= 0 && ppos)
f.file->f_pos = pos;
总体流程与read调用是类似的,write主要是通过fd获取到对应的struct file对象,取得文件的当前位置,然后调用vfs_write写入,写入完毕后更新文件的位置
vfs_write(f.file, buf, count, ppos);
|--对file->f_mode进行检查
|--rw_verify_area(WRITE, file, pos, count)
| //获取超级块的写入权限
|--file_start_write(file);
|--if (file->f_op->write)
| ret = file->f_op->write(file, buf, count, pos);
| else if (file->f_op->write_iter)
| ret = new_sync_write(file, buf, count, pos);
| else
| ret = -EINVAL;
\--file_end_write(file);
-
vfs_write首先做一些基本的检查,如:文件是否可写、是否实现操作函数集、是否持有锁等;
-
如果file->f_op->write回调已经实现,则执行file->f_op->write函数,否则通过new_sync_write执行write_iter回调函数
write_iter和write回调至少要实现一个,如果没有定义write回调则一定要定义write_iter回调
new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
|--struct iovec iov = {
.iov_base = (void __user *)buf, .iov_len = len };
| struct kiocb kiocb;
| struct iov_iter iter;
|--init_sync_kiocb(&kiocb, filp);
| kiocb.ki_pos = (ppos ? *ppos : 0);
| iov_iter_init(&iter, WRITE, &iov, 1, len);
|--call_write_iter(filp, &kiocb, &iter);
| file->f_op->write_iter(kio, iter);
|--if (ret > 0 && ppos)
*ppos = kiocb.ki_pos;
对于minix文件系统而言,write_iter回调为generic_file_write_iter
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|--struct file *file = iocb->ki_filp;
| struct inode *inode = file->f_mapping->host;
| // Performs necessary checks before doing a write
|--generic_write_checks(iocb, from);
|--if (ret > 0)
| ret = __generic_file_write_iter(iocb, from);
|--if (ret > 0)
ret = generic_write_sync(iocb, ret);
此处只分析__generic_file_write_iter
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|--struct file *file = iocb->ki_filp;
| struct address_space * mapping = file->f_mapping;
| struct inode *inode = mapping->host;
|--current->backing_dev_info = inode_to_bdi(inode);
|--file_remove_privs(file);
|--file_update_time(file)
|--if (iocb->ki_flags & IOCB_DIRECT)
| written = generic_file_direct_write(iocb, from);
| if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
| goto out;
| //如果direct_io没有将需要写盘的数据全部写完,就要调用generic_perform_write函数做一次buffer write
| status = generic_perform_write(file, from, pos = iocb->ki_pos);
| err = filemap_write_and_wait_range(mapping, pos, endbyte);
| if (err == 0)
| iocb->ki_pos = endbyte + 1;
| written += status;
| invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, endbyte >> PAGE_SHIFT);
| else
| written = generic_perform_write(file, from, iocb->ki_pos);
| if (likely(written > 0))
| iocb->ki_pos += written;
out:
|--current->backing_dev_info = NULL;
generic_file_direct_write:对直接IO的处理,将直接绕过page cache写入磁盘,当出错或者写完就会直接返回,如果direct_io没有将需要写盘的数据全部写完,就要调用generic_perform_write函数做一次buffer write
generic_perform_write:对buffer io的处理,基于iovec迭代器,一个个的iovec推进,放入page cache
|- -generic_file_direct_write
ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
|--struct file *file = iocb->ki_filp;
| struct address_space *mapping = file->f_mapping;
| struct inode *inode = mapping->host;
| loff_t pos = iocb->ki_pos;
| write_len = iov_iter_count(from);
| end = (pos + write_len - 1) >> PAGE_SHIFT;
|--written = filemap_write_and_wait_range(mapping, pos,pos + write_len - 1);
| //发生错误退出
| if (written)
| goto out;
|--written = invalidate_inode_pages2_range(mapping,pos >> PAGE_SHIFT, end);
| //If a page can not be invalidated, return 0 to fall back to buffered write.
| if (written)
| if (written == -EBUSY)
| return 0;
| goto out;
|--written = mapping->a_ops->direct_IO(iocb, from);
| //再次执行invalidate page cache
|--if (written > 0 && mapping->nrpages &&
| invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
| dio_warn_stale_pagecache(file);
| //更新位置
|--if (written > 0)
| pos += written;
| write_len -= written;
| if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode))
| i_size_write(inode, pos);
| mark_inode_dirty(inode);
| iocb->ki_pos = pos;
out:
|--return written;
|- -generic_perform_write
ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos)
|--struct address_space *mapping = file->f_mapping;
|--const struct address_space_operations *a_ops = mapping->a_ops;
|--do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset,iov_iter_count(i));
again:
//调用块设备的write_begin函数,获取块设备缓存page,如果page不存在则需要分配
status = a_ops->write_begin(file, mapping, pos, bytes, flags,&page, &fsdata);
//如果该文件被用户空间以可写方式映射,先flush dcache,保证内核与用户映射之间的cache一致性
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
//将用户要写的用户空间的数据copy到块设备的缓存page中
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
flush_dcache_page(page);
//write_end函数作用就是标记该page为最新、page bh为dirty,等待在合适的时机被刷到设备中
status = a_ops->write_end(file, mapping, pos, bytes, copied,page, fsdata);
copied = status;
cond_resched();
//根据完成的字节数向前推进
iov_iter_advance(i, copied);
//偏移加上完成的数据长度
pos += copied;
//更新写入总数
written += copied;
// 此处会检查脏页数量,超过阈值时可能触发page cache脏页写回并对写进程限流
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
参考:块存储:写块设备缓存的代码注释
-
write_begin:块设备的write_begin函数是blkdev_write_begin, 该函数为块设备的缓存准备page:
-
iov_iter_copy_from_user_atomic:作用是将用户要写的用户空间的数据copy到块设备的缓存,即page中
-
write_end:块设备的write_end函数是blkdev_write_end,在第2步骤已将用户数据copy到块设备的缓存page中,blkdev_write_end函数作用就是标记该page为最新、page bh为dirty,等待在合适的时机被刷到设备中: