read()&write()

通过分析内核梳理read()，write()系统调用的调用路径

内核版本：2.6.32

- start with open()

//$KERNEL_ROOT/fs/open.c
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
    long fd;

    /* Where force_o_largefile() holds, O_LARGEFILE is added to every open. */
    flags |= force_o_largefile() ? O_LARGEFILE : 0;

    /* All real work is delegated; AT_FDCWD is passed as the dfd argument. */
    fd = do_sys_open(AT_FDCWD, filename, flags, mode);

    /* avoid REGPARM breakage on x86: */
    asmlinkage_protect(3, fd, filename, flags, mode);
    return fd;
}

这是open调用的入口点，至于系统调用入口的实现，我们前面分析过，书本上也讲得比较清楚，这里就不再赘述。可以看到，这个函数首先用force_o_largefile()做了一次检查，并在条件成立时给flags加上O_LARGEFILE位，然后进一步调用了do_sys_open，并将返回的long送给了asmlinkage_protect，我们接下来看看这三个调用

其中force_o_largefile()是一个宏，定义如下

//$KERNEL_ROOT/include/asm-generic/fcntl.h
/*
 * force_o_largefile() - non-zero when BITS_PER_LONG is not 32; sys_open()
 * then ORs O_LARGEFILE into the flags. The #ifndef guard lets an earlier
 * definition of the macro take precedence over this default.
 */
#ifndef force_o_largefile   //line 45
#define force_o_largefile() (BITS_PER_LONG != 32)
#endif

可以看到这里就是检查位长度的，如果一个long的长度不是32，就需要O_LARGEFILE标识位

– do_sys_open()

//$KERNEL_ROOT/fs/open.c
/*
 * do_sys_open - common backend of the open() system call.
 * @dfd:      directory descriptor forwarded to do_filp_open()
 * @filename: user-space pointer to the path; copied into the kernel by getname()
 * @flags:    open flags, used both for descriptor allocation and the open itself
 * @mode:     creation mode forwarded to do_filp_open()
 *
 * Returns the new file descriptor on success, or a negative error code
 * taken from getname(), get_unused_fd_flags() or do_filp_open().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
    char *tmp = getname(filename);
    int fd = PTR_ERR(tmp);      /* only meaningful if getname() failed */

    if (!IS_ERR(tmp)) {
        fd = get_unused_fd_flags(flags);
        if (fd >= 0) {
            struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
            if (IS_ERR(f)) {
                /* Give the reserved descriptor back and report the error. */
                put_unused_fd(fd);
                fd = PTR_ERR(f);
            } else {
                /* fsnotify_open() takes the dentry, not the struct file. */
                fsnotify_open(f->f_path.dentry);
                fd_install(fd, f);
                trace_do_sys_open(tmp, flags, mode);
            }
        }
        /* The kernel copy of the name is released on every path. */
        putname(tmp);
    }
    return fd;
}

可以看到，这个函数里面完成了主要的工作，并最终可以返回描述符，其中，首先是对filename参数进行了处理，使用了getname函数和PTR_ERR完成了检查，从字面意思上可以看出些端倪：filename前面有个__user表明这个指针是用户空间的，那么可以比较自然地认为这两个函数完成的就是用户空间到内核空间的转换，我们还是来看一下

— getname()

//$KERNEL_ROOT/fs/namei.c
/*
 * getname - copy a user-space path name into a freshly allocated kernel buffer.
 *
 * Returns the kernel buffer on success. On failure returns an ERR_PTR():
 * -ENOMEM when no buffer could be allocated, otherwise the negative code
 * reported by do_getname(). The result (buffer or error pointer) is always
 * handed to audit_getname() before returning.
 */
char * getname(const char __user * filename)
{
    char *buf = __getname();
    char *name;

    if (!buf) {
        name = ERR_PTR(-ENOMEM);
    } else {
        int err = do_getname(filename, buf);

        if (err < 0) {
            /* Copy failed: drop the buffer and encode the error instead. */
            __putname(buf);
            name = ERR_PTR(err);
        } else {
            name = buf;
        }
    }

    audit_getname(name);
    return name;
}

首先用__getname申请一块内核缓冲区（申请失败则返回ERR_PTR(-ENOMEM)），然后使用do_getname完成真正逻辑，出错时释放缓冲区并返回错误指针，最后还用了个audit_getname处理结果

其中ERR_PTR和__getname 都是宏，第一个很简单，就是将long强制类型转换成void*，第二个定义如下

//$KERNEL_ROOT/include/linux/fs.h line 1941
/*
 * __getname_gfp(gfp) allocates one object from the names_cachep slab cache
 * using the given allocation flags; __getname() is the same with GFP_KERNEL.
 * The matching release macro is __putname().
 */
#define __getname_gfp(gfp)  kmem_cache_alloc(names_cachep, (gfp))
#define __getname()     __getname_gfp(GFP_KERNEL)

这个东西搞了半天就是申请了一块空间

__putname，audit_getname比较有意思，首先是__putname

//$KERNEL_ROOT/include/linux/fs.h line 1943
/* __putname() returns a name buffer to the names_cachep slab cache. */
#define __putname(name)     kmem_cache_free(names_cachep, (void *)(name))
/*
 * Without CONFIG_AUDITSYSCALL putname() is just __putname(); with it,
 * putname() is a real function defined elsewhere.
 */
#ifndef CONFIG_AUDITSYSCALL
#define putname(name)   __putname(name)
#else
extern void putname(const char *name);
#endif

完成的工作就是对应上面申请cache空间的释放cache，如果定义了CONFIG_AUDITSYSCALL配置，那么将会进行比较复杂的审计过程，然后又回来调用在本文件中定义的__putname这里不详细说明了

audit_getname同样受CONFIG_AUDITSYSCALL配置的影响：打开该配置时必须走过那一套复杂的审计过程；而去除审计过程（未打开该配置）后，调用的东西竟然是。。。

/* This form of audit_getname() ignores its argument and expands to an empty statement. */
#define audit_getname(n) do { ; } while (0) //$KERNEL_ROOT/include/linux/audit.h line 554

—- do_getname()

//$KERNEL_ROOT/fs/namei.c
/*
 * Filenames are copied into kernel data space before use: this narrows
 * some races, allows extra checking and hopefully speeds things up.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 *
 * Returns 0 when a non-empty name shorter than the copy limit was copied
 * into @page, -ENOENT for an empty name, -ENAMETOOLONG when the copy limit
 * was reached, -EFAULT when @filename lies at or above TASK_SIZE, or the
 * negative value returned by strncpy_from_user().
 */
static int do_getname(const char __user *filename, char *page)
{
    unsigned long limit = PATH_MAX;
    int copied;

    if (!segment_eq(get_fs(), KERNEL_DS)) {
        if ((unsigned long) filename >= TASK_SIZE)
            return -EFAULT;
        /* Never let the copy run past the end of the user address range. */
        if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
            limit = TASK_SIZE - (unsigned long) filename;
    }

    copied = strncpy_from_user(page, filename, limit);
    if (copied < 0)
        return copied;
    if (copied == 0)
        return -ENOENT;
    return (copied < limit) ? 0 : -ENAMETOOLONG;
}

一上来的segment_eq是一个宏，真的就是比较两个参数的seg成员，可读性极差；然后get_fs也是一个宏，定义如下，取的是当前线程的addr_limit，也就是当前允许访问的地址空间上限

//$KERNEL_ROOT/arch/x86/include/asm/uaccess.h line 31
#define get_fs()    (current_thread_info()->addr_limit)

TASK_SIZE这个宏经过多次宏别名到了一个常数，可见其与体系结构相关程度之高。接下来的代码是：如果filename指针本身已经超出了这个范围，直接返回-EFAULT；另一方面，如果从filename到地址空间上限的距离不足PATH_MAX（TASK_SIZE - filename起始 < PATH_MAX），就把最多复制的长度len缩小为这段距离，保证复制不会越过用户地址空间的边界

—– strncpy_from_user()

//$LINUX_ROOT/arch/x86/lib/usercopy_64.c
/*
 * Copy a null terminated string from userspace.
 *
 * __do_strncpy_from_user(dst, src, count, res):
 *   - res starts out equal to count (operand "0"(count)).
 *   - If count is zero the copy loop is skipped entirely.
 *   - Otherwise bytes are moved one at a time (lodsb/stosb) until either a
 *     NUL byte has been stored or count has been decremented to zero.
 *   - At label 1 the remaining count is subtracted from res, so res ends up
 *     as the number of non-NUL bytes copied (or the full count when no NUL
 *     was seen within count bytes).
 *   - The .fixup code at label 3 sets res to -EFAULT; the _ASM_EXTABLE(0b,3b)
 *     entry presumably routes a fault at label 0 there -- confirm against
 *     the exception-table documentation.
 * count is an output operand as well and is modified by the macro.
 */

#define __do_strncpy_from_user(dst,src,count,res)              \
do {                                       \
    long __d0, __d1, __d2;                         \
    might_fault();                             \
    __asm__ __volatile__(                          \
        "   testq %1,%1\n"                     \
        "   jz 2f\n"                       \
        "0: lodsb\n"                       \
        "   stosb\n"                       \
        "   testb %%al,%%al\n"                 \
        "   jz 1f\n"                       \
        "   decq %1\n"                     \
        "   jnz 0b\n"                      \
        "1: subq %1,%0\n"                      \
        "2:\n"                             \
        ".section .fixup,\"ax\"\n"                 \
        "3: movq %5,%0\n"                      \
        "   jmp 2b\n"                      \
        ".previous\n"                          \
        _ASM_EXTABLE(0b,3b)                    \
        : "=&r"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1),    \
          "=&D" (__d2)                         \
        : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
        : "memory");                           \
} while (0)

/*
 * __strncpy_from_user - string copy from user space without an access check.
 * Thin function wrapper around __do_strncpy_from_user(); returns whatever
 * result that macro produces for (dst, src, count).
 */
long
__strncpy_from_user(char *dst, const char __user *src, long count)
{
    long copied;

    __do_strncpy_from_user(dst, src, count, copied);
    return copied;
}
EXPORT_SYMBOL(__strncpy_from_user);

/*
 * strncpy_from_user - checked string copy from user space.
 * Returns -EFAULT when access_ok() rejects the first byte of @src,
 * otherwise the result of __strncpy_from_user().
 */
long
strncpy_from_user(char *dst, const char __user *src, long count)
{
    if (!access_ok(VERIFY_READ, src, 1))
        return -EFAULT;

    return __strncpy_from_user(dst, src, count);
}
EXPORT_SYMBOL(strncpy_from_user);

可以看到，用户空间到内核空间的内容复制最终是由内联汇编实现的，这也比较容易理解因为C语言的名字空间中确实无法表达这种物理空间上的内存读写

— get_unused_fd_flags()

调用getname返回后，根据返回值的有效性，决定下一步如何处理，如果返回值有效，那么将调用get_unused_fd_flags这个宏

/* Allocate a descriptor via alloc_fd() with a start value of 0, passing the flags through. */
#define get_unused_fd_flags(flags) alloc_fd(0, (flags)) //$LINUX_ROOT/include/linux/file.h

//$LINUX_ROOT/fs/file.c
/*
 * allocate a file descriptor, mark it busy.
 *
 * @start: lowest descriptor number the search may begin at
 * @flags: only O_CLOEXEC is examined; it decides whether the descriptor's
 *         close_on_exec bit is set or cleared
 *
 * Returns the allocated descriptor, or the negative error code returned by
 * expand_files(). current->files->file_lock is taken on entry and released
 * before returning.
 */
int alloc_fd(unsigned start, unsigned flags)
{
    struct files_struct *files = current->files;
    unsigned int fd;
    int error;
    struct fdtable *fdt;

    spin_lock(&files->file_lock);
repeat:
    /* Re-read the table on every pass: expand_files() may have replaced it. */
    fdt = files_fdtable(files);
    /* Begin the search at the larger of start and files->next_fd. */
    fd = start;
    if (fd < files->next_fd)
        fd = files->next_fd;

    /* Look for the first clear bit in the open_fds bitmap at or after fd. */
    if (fd < fdt->max_fds)
        fd = find_next_zero_bit(fdt->open_fds->fds_bits,
                       fdt->max_fds, fd);

    /* <0: error, 0: table already large enough, >0: table was expanded. */
    error = expand_files(files, fd);
    if (error < 0)
        goto out;

    /*
     * If we needed to expand the fs array we
     * might have blocked - try again.
     */
    if (error)
        goto repeat;

    /* Advance the next_fd hint past the descriptor being handed out. */
    if (start <= files->next_fd)
        files->next_fd = fd + 1;

    /* Mark the descriptor busy and set or clear its close-on-exec bit. */
    FD_SET(fd, fdt->open_fds);
    if (flags & O_CLOEXEC)
        FD_SET(fd, fdt->close_on_exec);
    else
        FD_CLR(fd, fdt->close_on_exec);
    error = fd;
#if 1
    /* Sanity check */
    if (rcu_dereference(fdt->fd[fd]) != NULL) {
        printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
        rcu_assign_pointer(fdt->fd[fd], NULL);
    }
#endif

out:
    spin_unlock(&files->file_lock);
    return error;
}

看功能，这个函数的作用应该是分配一个文件描述符。首先，函数取出当前进程的表示打开文件的结构体files_struct并上锁，之后调用find_next_zero_bit，从打开文件的位图中找出一个空的描述符

—- find_next_zero_bit()

//$LINUX_ROOT/lib/find_next_bit.c
/*
 * find_next_zero_bit - locate the next clear bit in a bitmap.
 * @addr:   first word of the bitmap
 * @size:   number of bits in the bitmap
 * @offset: bit index at which the search starts
 *
 * Returns the index of the first zero bit at or after @offset, or @size
 * when no zero bit is found.
 *
 * This implementation of find_{first,next}_zero_bit was stolen from
 * Linus' asm-alpha/bitops.h.
 */
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
                 unsigned long offset)
{
    const unsigned long *word = addr + BITOP_WORD(offset);
    unsigned long base = offset & ~(BITS_PER_LONG-1);
    unsigned long bits;
    unsigned long shift;

    if (offset >= size)
        return size;

    /* From here on, size counts the bits remaining from 'base' onwards. */
    size -= base;
    shift = offset % BITS_PER_LONG;
    if (shift != 0) {
        /* Unaligned start: pretend the bits below the offset are set. */
        bits = *word++;
        bits |= ~0UL >> (BITS_PER_LONG - shift);
        if (size < BITS_PER_LONG)
            goto partial_word;
        if (bits != ~0UL)
            goto zero_in_word;
        size -= BITS_PER_LONG;
        base += BITS_PER_LONG;
    }

    /* Scan whole words. */
    for (; size >= BITS_PER_LONG; size -= BITS_PER_LONG, base += BITS_PER_LONG) {
        bits = *word++;
        if (bits != ~0UL)
            goto zero_in_word;
    }
    if (size == 0)
        return base;
    bits = *word;

partial_word:
    /* Mask off the bits beyond the end of the bitmap. */
    bits |= ~0UL << size;
    if (bits == ~0UL)   /* no zero bit left */
        return base + size;
zero_in_word:
    return base + ffz(bits);
}
EXPORT_SYMBOL(find_next_zero_bit);

这里就是纯粹的位操作，从offset开始找出位图中第一个为0的位（也就是第一个空闲的描述符），找不到则返回size，不再进行详细解读

—- expand_files()

要向打开文件表中添加新文件，改变files_struct的工作必不可少，这个函数完成的就是这个功能

/*
 * Expand files.
 * Grows the file structures when the requested descriptor number does not
 * fit the current table and the limits still permit growth.
 * Return <0 error code on error; 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
int expand_files(struct files_struct *files, int nr)
{
    struct fdtable *table = files_fdtable(files);

    /*
     * N.B. For clone tasks sharing a files structure, this test
     * will limit the total number of files that can be opened.
     */
    if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
        return -EMFILE;

    /* Already fits in the current table: nothing to do. */
    if (nr < table->max_fds)
        return 0;

    /* Growth is only attempted below the sysctl_nr_open ceiling. */
    return (nr >= sysctl_nr_open) ? -EMFILE : expand_fdtable(files, nr);
}

从代码来看，第一步检查的是当前进程的资源限制（RLIMIT_NOFILE的软限制rlim_cur），第二步检查是否有必要扩展（nr小于当前max_fds时直接返回0），第三步检查系统级上限sysctl_nr_open是否还允许扩展，全部通过后才会调用expand_fdtable并返回其结果

—– expand_fdtable()

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset, of
 * the given size.
 * Return <0 error code on error; 1 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit.
 *
 * Note that 1 is also returned when another task expanded the table while
 * the lock was dropped and the freshly allocated table was discarded.
 */
static int expand_fdtable(struct files_struct *files, int nr)
    __releases(files->file_lock)
    __acquires(files->file_lock)
{
    struct fdtable *new_fdt, *cur_fdt;

    /* The lock is dropped around the allocation and re-taken afterwards. */
    spin_unlock(&files->file_lock);
    new_fdt = alloc_fdtable(nr);
    spin_lock(&files->file_lock);
    if (!new_fdt)
        return -ENOMEM;
    /*
     * extremely unlikely race - sysctl_nr_open decreased between the check in
     * caller and alloc_fdtable().  Cheaper to catch it here...
     */
    if (unlikely(new_fdt->max_fds <= nr)) {
        free_fdarr(new_fdt);
        free_fdset(new_fdt);
        kfree(new_fdt);
        return -EMFILE;
    }
    /*
     * Check again since another task may have expanded the fd table while
     * we dropped the lock
     */
    cur_fdt = files_fdtable(files);
    if (nr >= cur_fdt->max_fds) {
        /* Continue as planned */
        copy_fdtable(new_fdt, cur_fdt);
        /* Publish the new table through files->fdt. */
        rcu_assign_pointer(files->fdt, new_fdt);
        /* The old table is only freed when larger than NR_OPEN_DEFAULT. */
        if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
            free_fdtable(cur_fdt);
    } else {
        /* Somebody else expanded, so undo our attempt */
        free_fdarr(new_fdt);
        free_fdset(new_fdt);
        kfree(new_fdt);
    }
    return 1;
}

根据函数注释，我们可以了解到这个函数是临界区，要求进入时函数持有锁，为了提高效率，到目前为止内核中看到的同步机制全部是通过自旋锁实现的。

整个函数的执行过程首先是解锁，完成新fdtable的申请，之后将旧fdtable的内容复制到新fdtable中，释放掉旧fdtable

—- alloc_fdtable()

/*
 * alloc_fdtable - allocate a new fdtable able to hold descriptor @nr.
 *
 * Allocates the struct fdtable itself, the array of struct file pointers,
 * and a single buffer that holds both the open_fds and close_on_exec
 * bitmaps back to back. Returns the new table, or NULL when any of the
 * allocations fails (whatever was already allocated is released first).
 */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
    struct fdtable *fdt;
    char *data;

    /*
     * Figure out how many fds we actually want to support in this fdtable.
     * Allocation steps are keyed to the size of the fdarray, since it
     * grows far faster than any of the other dynamic data. We try to fit
     * the fdarray into comfortable page-tuned chunks: starting at 1024B
     * and growing in powers of two from there on.
     */
    nr /= (1024 / sizeof(struct file *));
    nr = roundup_pow_of_two(nr + 1);
    nr *= (1024 / sizeof(struct file *));
    /*
     * Note that this can drive nr *below* what we had passed if sysctl_nr_open
     * had been set lower between the check in expand_files() and here.  Deal
     * with that in caller, it's cheaper that way.
     *
     * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
     * bitmaps handling below becomes unpleasant, to put it mildly...
     */
    if (unlikely(nr > sysctl_nr_open))
        nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

    fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
    if (!fdt)
        goto out;
    fdt->max_fds = nr;
    /* Array of struct file pointers, one slot per descriptor. */
    data = alloc_fdmem(nr * sizeof(struct file *));
    if (!data)
        goto out_fdt;
    fdt->fd = (struct file **)data;
    /* One buffer for both bitmaps: 2 * nr bits, at least L1_CACHE_BYTES. */
    data = alloc_fdmem(max_t(unsigned int,
                 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
    if (!data)
        goto out_arr;
    fdt->open_fds = (fd_set *)data;
    /* close_on_exec starts right after the nr bits of open_fds. */
    data += nr / BITS_PER_BYTE;
    fdt->close_on_exec = (fd_set *)data;
    INIT_RCU_HEAD(&fdt->rcu);
    fdt->next = NULL;

    return fdt;

    /* Error unwinding: release in reverse order of allocation. */
out_arr:
    free_fdarr(fdt);
out_fdt:
    kfree(fdt);
out:
    return NULL;
}

首先，进程释放锁，并通过本函数申请一个新的fdtable，包括申请其中两个指针数据结构的空间，如果过程中发生失败，那么将根据函数执行进度进行不同处理

—- copy_fdtable()

/*
 * Populate a larger fdtable from a smaller one: the old contents are copied
 * over and the additional tail of each array is zero-filled.  Called with
 * the files spinlock held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
    unsigned int kept, fresh;

    BUG_ON(nfdt->max_fds < ofdt->max_fds);

    /* The array of struct file pointers. */
    kept = ofdt->max_fds * sizeof(struct file *);
    fresh = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
    memcpy(nfdt->fd, ofdt->fd, kept);
    memset((char *)(nfdt->fd) + kept, 0, fresh);

    /* The two bitmaps share the same byte counts. */
    kept = ofdt->max_fds / BITS_PER_BYTE;
    fresh = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;

    memcpy(nfdt->open_fds, ofdt->open_fds, kept);
    memcpy(nfdt->close_on_exec, ofdt->close_on_exec, kept);
    memset((char *)(nfdt->open_fds) + kept, 0, fresh);
    memset((char *)(nfdt->close_on_exec) + kept, 0, fresh);
}

这个函数不仅仅是简单的复制：新表比旧表大，旧表的内容用memcpy原样拷贝过来，新表多出来的那部分（fd数组以及open_fds、close_on_exec两个位图的尾部）则用memset清零

这个函数调用结束后，将返回alloc_fd，根据flags设置新的fd中的标识位，最后是完成安全检查，get_unused_fd_flags这个宏就可以返回了

— do_filp_open()

/*
 * do_filp_open - resolve @pathname and return an opened struct file for it,
 * creating the file first when O_CREAT is given and it does not exist.
 * @dfd:       directory descriptor forwarded to path_lookup_open()/path_init()
 * @pathname:  kernel-space path string
 * @open_flag: open flags as passed in by the caller
 * @mode:      creation mode, stored in the open intent and passed to
 *             __open_namei_create()
 * @acc_mode:  access mask handed to may_open(); derived from the flags when 0
 *
 * Returns the struct file on success or an ERR_PTR()-encoded error.
 *
 * Note that the low bits of the passed in "open_flag"
 * are not the same as in the local variable "flag". See
 * open_to_namei_flags() for more details.
 */
struct file *do_filp_open(int dfd, const char *pathname,
        int open_flag, int mode, int acc_mode)
{
    struct file *filp;
    struct nameidata nd;
    int error;
    struct path path;
    struct dentry *dir;
    int count = 0;      /* number of trailing symlinks followed so far */
    int will_write;
    int flag = open_to_namei_flags(open_flag);

    if (!acc_mode)
        acc_mode = MAY_OPEN | ACC_MODE(flag);

    /* O_TRUNC implies we need access checks for write permissions */
    if (flag & O_TRUNC)
        acc_mode |= MAY_WRITE;

    /* Allow the LSM permission hook to distinguish append 
       access from general write access. */
    if (flag & O_APPEND)
        acc_mode |= MAY_APPEND;

    /*
     * The simplest case - just a plain lookup.
     */
    if (!(flag & O_CREAT)) {
        error = path_lookup_open(dfd, pathname, lookup_flags(flag),
                     &nd, flag);
        if (error)
            return ERR_PTR(error);
        goto ok;
    }

    /*
     * Create - we need to know the parent.
     */
    error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
    if (error)
        return ERR_PTR(error);
    error = path_walk(pathname, &nd);
    if (error) {
        if (nd.root.mnt)
            path_put(&nd.root);
        return ERR_PTR(error);
    }
    if (unlikely(!audit_dummy_context()))
        audit_inode(pathname, nd.path.dentry);

    /*
     * We have the parent and last component. First of all, check
     * that we are not asked to creat(2) an obvious directory - that
     * will not do.
     */
    error = -EISDIR;
    if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
        goto exit_parent;

    error = -ENFILE;
    filp = get_empty_filp();
    if (filp == NULL)
        goto exit_parent;
    /* Record the open intent in nd before looking up the last component. */
    nd.intent.open.file = filp;
    nd.intent.open.flags = flag;
    nd.intent.open.create_mode = mode;
    dir = nd.path.dentry;
    nd.flags &= ~LOOKUP_PARENT;
    nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
    if (flag & O_EXCL)
        nd.flags |= LOOKUP_EXCL;
    /* The parent's i_mutex is held across the lookup of the last component. */
    mutex_lock(&dir->d_inode->i_mutex);
    path.dentry = lookup_hash(&nd);
    path.mnt = nd.path.mnt;

do_last:
    /* Entered with dir->d_inode->i_mutex held (also when coming from do_link). */
    error = PTR_ERR(path.dentry);
    if (IS_ERR(path.dentry)) {
        mutex_unlock(&dir->d_inode->i_mutex);
        goto exit;
    }

    if (IS_ERR(nd.intent.open.file)) {
        error = PTR_ERR(nd.intent.open.file);
        goto exit_mutex_unlock;
    }

    /* Negative dentry, just create the file */
    if (!path.dentry->d_inode) {
        /*
         * This write is needed to ensure that a
         * ro->rw transition does not occur between
         * the time when the file is created and when
         * a permanent write count is taken through
         * the 'struct file' in nameidata_to_filp().
         */
        error = mnt_want_write(nd.path.mnt);
        if (error)
            goto exit_mutex_unlock;
        error = __open_namei_create(&nd, &path, flag, mode);
        if (error) {
            mnt_drop_write(nd.path.mnt);
            goto exit;
        }
        filp = nameidata_to_filp(&nd, open_flag);
        if (IS_ERR(filp))
            ima_counts_put(&nd.path,
                       acc_mode & (MAY_READ | MAY_WRITE |
                           MAY_EXEC));
        mnt_drop_write(nd.path.mnt);
        if (nd.root.mnt)
            path_put(&nd.root);
        return filp;
    }

    /*
     * It already exists.
     */
    mutex_unlock(&dir->d_inode->i_mutex);
    audit_inode(pathname, path.dentry);

    error = -EEXIST;
    if (flag & O_EXCL)
        goto exit_dput;

    if (__follow_mount(&path)) {
        error = -ELOOP;
        if (flag & O_NOFOLLOW)
            goto exit_dput;
    }

    error = -ENOENT;
    if (!path.dentry->d_inode)
        goto exit_dput;
    /* An inode with a follow_link operation is handled on the do_link path. */
    if (path.dentry->d_inode->i_op->follow_link)
        goto do_link;

    path_to_nameidata(&path, &nd);
    error = -EISDIR;
    if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
        goto exit;
ok:
    /*
     * Consider:
     * 1. may_open() truncates a file
     * 2. a rw->ro mount transition occurs
     * 3. nameidata_to_filp() fails due to
     *    the ro mount.
     * That would be inconsistent, and should
     * be avoided. Taking this mnt write here
     * ensures that (2) can not occur.
     */
    will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
    if (will_write) {
        error = mnt_want_write(nd.path.mnt);
        if (error)
            goto exit;
    }
    error = may_open(&nd.path, acc_mode, flag);
    if (error) {
        if (will_write)
            mnt_drop_write(nd.path.mnt);
        goto exit;
    }
    filp = nameidata_to_filp(&nd, open_flag);
    if (IS_ERR(filp))
        ima_counts_put(&nd.path,
                   acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
    /*
     * It is now safe to drop the mnt write
     * because the filp has had a write taken
     * on its behalf.
     */
    if (will_write)
        mnt_drop_write(nd.path.mnt);
    if (nd.root.mnt)
        path_put(&nd.root);
    return filp;

    /*
     * Error exits. The labels fall through into each other, so each entry
     * point undoes one more step than the label below it.
     */
exit_mutex_unlock:
    mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
    path_put_conditional(&path, &nd);
exit:
    if (!IS_ERR(nd.intent.open.file))
        release_open_intent(&nd);
exit_parent:
    if (nd.root.mnt)
        path_put(&nd.root);
    path_put(&nd.path);
    return ERR_PTR(error);

do_link:
    error = -ELOOP;
    if (flag & O_NOFOLLOW)
        goto exit_dput;
    /*
     * This is subtle. Instead of calling do_follow_link() we do the
     * thing by hands. The reason is that this way we have zero link_count
     * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
     * After that we have the parent and last component, i.e.
     * we are in the same situation as after the first path_walk().
     * Well, almost - if the last component is normal we get its copy
     * stored in nd->last.name and we will have to putname() it when we
     * are done. Procfs-like symlinks just set LAST_BIND.
     */
    nd.flags |= LOOKUP_PARENT;
    error = security_inode_follow_link(path.dentry, &nd);
    if (error)
        goto exit_dput;
    error = __do_follow_link(&path, &nd);
    if (error) {
        /* Does someone understand code flow here? Or it is only
         * me so stupid? Anathema to whoever designed this non-sense
         * with "intent.open".
         */
        release_open_intent(&nd);
        if (nd.root.mnt)
            path_put(&nd.root);
        return ERR_PTR(error);
    }
    nd.flags &= ~LOOKUP_PARENT;
    if (nd.last_type == LAST_BIND)
        goto ok;
    error = -EISDIR;
    if (nd.last_type != LAST_NORM)
        goto exit;
    if (nd.last.name[nd.last.len]) {
        __putname(nd.last.name);
        goto exit;
    }
    /* Give up with -ELOOP once 32 links have been followed here. */
    error = -ELOOP;
    if (count++==32) {
        __putname(nd.last.name);
        goto exit;
    }
    /* Look up the link target's last component and retry from do_last. */
    dir = nd.path.dentry;
    mutex_lock(&dir->d_inode->i_mutex);
    path.dentry = lookup_hash(&nd);
    path.mnt = nd.path.mnt;
    __putname(nd.last.name);
    goto do_last;
}

这个函数比较长，我们选取其中意义比较重大的函数看一看

—- open_to_namei_flags()

/*
 * Translate the access mode (low two bits) of the sys_open flags
 *  00 - read-only
 *  01 - write-only
 *  10 - read-write
 *  11 - special
 * into the encoding the internal routines (ie open_namei()/follow_link()
 * etc) work with:
 *  00 - no permissions needed
 *  01 - read-permission
 *  10 - write-permission
 *  11 - read-write
 * This is more logical, and also allows the 00 "no perm needed"
 * to be used for symlinks (where the permissions are checked
 * later).
 *
 * The flags are incremented by one unless doing so would wrap the
 * access-mode bits to zero, so the "special" value 11 is left as is.
 */
static inline int open_to_namei_flags(int flag)
{
    return ((flag + 1) & O_ACCMODE) ? flag + 1 : flag;
}

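这个函数获得了真正用于打开文件的标识位：只要flag+1之后的低两位（O_ACCMODE）不为0就执行flag++，于是00/01/10（只读/只写/读写）分别变成01/10/11（读权限/写权限/读写权限），而11保持不变——看似玄学的flag++其实就是完成注释里的这张映射表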

—- path_lookup_open()

如果文件不需要新建，就会进入这个函数，直接打开文件

/**
 * path_lookup_open - lookup a file path with open intent
 * @dfd: the directory to use as base, or AT_FDCWD
 * @name: pointer to file name
 * @lookup_flags: lookup intent flags
 * @nd: pointer to nameidata
 * @open_flags: open intent flags
 *
 * Returns 0 on success, -ENFILE when no empty struct file is available,
 * or the error from the lookup / from the intent's file pointer.
 */
static int path_lookup_open(int dfd, const char *name,
        unsigned int lookup_flags, struct nameidata *nd, int open_flags)
{
    struct file *empty = get_empty_filp();
    int rc;

    if (!empty)
        return -ENFILE;

    nd->intent.open.file = empty;
    nd->intent.open.flags = open_flags;
    nd->intent.open.create_mode = 0;

    rc = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);

    if (!IS_ERR(nd->intent.open.file)) {
        /* The file is still valid: release the intent if the lookup failed. */
        if (rc != 0)
            release_open_intent(nd);
        return rc;
    }

    /* The lookup succeeded but the intent's file holds an error: report it. */
    if (rc == 0) {
        rc = PTR_ERR(nd->intent.open.file);
        path_put(&nd->path);
    }
    return rc;
}

可以看到，这个函数在完成大量的标识位处理后调用了do_path_lookup我们来看一看这个函数

—– do_path_lookup()

/*
 * Run path_init() followed by path_walk() for @name.
 * Returns 0 and nd will be valid on success; returns error, otherwise.
 * Any reference held in nd->root is dropped before returning.
 */
static int do_path_lookup(int dfd, const char *name,
                unsigned int flags, struct nameidata *nd)
{
    int err;

    err = path_init(dfd, name, flags, nd);
    if (err == 0)
        err = path_walk(name, nd);

    /* Audit only successful lookups that ended on a dentry with an inode. */
    if (unlikely(err == 0 && !audit_dummy_context() && nd->path.dentry &&
                nd->path.dentry->d_inode))
        audit_inode(name, nd->path.dentry);

    if (nd->root.mnt != NULL) {
        path_put(&nd->root);
        nd->root.mnt = NULL;
    }
    return err;
}

这个函数也是直接调用了path_init和path_walk，具体内容我们下面分析，然后这里还调用了一个path_put，我们简单看一下

/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 * This is done by calling dput() on @path->dentry and then mntput() on
 * @path->mnt.
 */
void path_put(struct path *path)
{
    dput(path->dentry);
    mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);

这两个函数的具体实现比较复杂，根据分析，应该就是对路径上文件夹和文件的引用计数进行修改

—- path_init()

/*
 * path_init - initialise @nd and choose the starting point of a path walk.
 * @dfd:   directory descriptor used for relative names, or AT_FDCWD
 * @name:  path string; only its first character is examined here
 * @flags: lookup flags stored into nd->flags
 * @nd:    nameidata to initialise
 *
 * The starting point nd->path is taken from one of three places:
 *   - names beginning with '/': nd->root, after set_root(nd)
 *     (presumably set_root() fills in nd->root -- confirm);
 *   - @dfd == AT_FDCWD: the current task's fs->pwd;
 *   - otherwise: the path of the file behind @dfd, which must be a
 *     directory and pass file_permission(file, MAY_EXEC).
 * A reference on the chosen path is taken with path_get().
 *
 * Returns 0 on success, -EBADF when @dfd has no file, -ENOTDIR when it is
 * not a directory, or the error from file_permission().
 */
static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
{
    int retval = 0;
    int fput_needed;
    struct file *file;

    nd->last_type = LAST_ROOT; /* if there are only slashes... */
    nd->flags = flags;
    nd->depth = 0;
    nd->root.mnt = NULL;

    if (*name=='/') {
        set_root(nd);
        nd->path = nd->root;
        path_get(&nd->root);
    } else if (dfd == AT_FDCWD) {
        struct fs_struct *fs = current->fs;
        /* fs->pwd is copied and referenced under fs->lock. */
        read_lock(&fs->lock);
        nd->path = fs->pwd;
        path_get(&fs->pwd);
        read_unlock(&fs->lock);
    } else {
        struct dentry *dentry;

        file = fget_light(dfd, &fput_needed);
        retval = -EBADF;
        if (!file)
            goto out_fail;

        dentry = file->f_path.dentry;

        retval = -ENOTDIR;
        if (!S_ISDIR(dentry->d_inode->i_mode))
            goto fput_fail;

        retval = file_permission(file, MAY_EXEC);
        if (retval)
            goto fput_fail;

        nd->path = file->f_path;
        path_get(&file->f_path);

        /* nd->path holds its own reference now; the file can be released. */
        fput_light(file, fput_needed);
    }
    return 0;

fput_fail:
    fput_light(file, fput_needed);
out_fail:
    return retval;
}

这个函数为路径查找设置起点nd->path：路径以'/'开头时从根目录（nd->root）出发；dfd为AT_FDCWD时从当前工作目录（fs->pwd）出发；否则从dfd对应的目录出发（要求dfd是一个目录并且具有执行权限）

—- path_walk()

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 *
 * The name is consumed one '/'-separated component at a time. Each
 * component is hashed, "." and ".." are handled specially, everything else
 * goes through do_lookup(), and inodes with a follow_link operation are
 * followed. With LOOKUP_PARENT set the walk stops before the last
 * component, which is stored in nd->last / nd->last_type instead.
 */
static int __link_path_walk(const char *name, struct nameidata *nd)
{
    struct path next;
    struct inode *inode;
    int err;
    unsigned int lookup_flags = nd->flags;

    /* Skip leading slashes; a name with nothing else needs no walk. */
    while (*name=='/')
        name++;
    if (!*name)
        goto return_reval;

    inode = nd->path.dentry->d_inode;
    if (nd->depth)
        lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

    /* At this point we know we have a real path component. */
    for(;;) {
        unsigned long hash;
        struct qstr this;
        unsigned int c;

        nd->flags |= LOOKUP_CONTINUE;
        /* Permission check on the directory about to be searched. */
        err = exec_permission_lite(inode);
        if (err)
            break;

        this.name = name;
        c = *(const unsigned char *)name;

        /* Hash the component up to the next '/' or the end of the string. */
        hash = init_name_hash();
        do {
            name++;
            hash = partial_name_hash(c, hash);
            c = *(const unsigned char *)name;
        } while (c && (c != '/'));
        this.len = name - (const char *) this.name;
        this.hash = end_name_hash(hash);

        /* remove trailing slashes? */
        if (!c)
            goto last_component;
        while (*++name == '/');
        if (!*name)
            goto last_with_slashes;

        /*
         * "." and ".." are special - ".." especially so because it has
         * to be able to know about the current root directory and
         * parent relationships.
         */
        if (this.name[0] == '.') switch (this.len) {
            default:
                break;
            case 2: 
                if (this.name[1] != '.')
                    break;
                follow_dotdot(nd);
                inode = nd->path.dentry->d_inode;
                /* fallthrough */
            case 1:
                continue;
        }
        /*
         * See if the low-level filesystem might want
         * to use its own hash..
         */
        if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
            err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
                                &this);
            if (err < 0)
                break;
        }
        /* This does the actual lookups.. */
        err = do_lookup(nd, &this, &next);
        if (err)
            break;

        err = -ENOENT;
        inode = next.dentry->d_inode;
        if (!inode)
            goto out_dput;

        if (inode->i_op->follow_link) {
            err = do_follow_link(&next, nd);
            if (err)
                goto return_err;
            err = -ENOENT;
            inode = nd->path.dentry->d_inode;
            if (!inode)
                break;
        } else
            path_to_nameidata(&next, nd);
        /* An intermediate component must support lookup. */
        err = -ENOTDIR; 
        if (!inode->i_op->lookup)
            break;
        continue;
        /* here ends the main loop */

last_with_slashes:
        lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
        /* Clear LOOKUP_CONTINUE iff it was previously unset */
        nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
        if (lookup_flags & LOOKUP_PARENT)
            goto lookup_parent;
        if (this.name[0] == '.') switch (this.len) {
            default:
                break;
            case 2: 
                if (this.name[1] != '.')
                    break;
                follow_dotdot(nd);
                inode = nd->path.dentry->d_inode;
                /* fallthrough */
            case 1:
                goto return_reval;
        }
        if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
            err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
                                &this);
            if (err < 0)
                break;
        }
        err = do_lookup(nd, &this, &next);
        if (err)
            break;
        inode = next.dentry->d_inode;
        if (follow_on_final(inode, lookup_flags)) {
            err = do_follow_link(&next, nd);
            if (err)
                goto return_err;
            inode = nd->path.dentry->d_inode;
        } else
            path_to_nameidata(&next, nd);
        err = -ENOENT;
        if (!inode)
            break;
        if (lookup_flags & LOOKUP_DIRECTORY) {
            err = -ENOTDIR; 
            if (!inode->i_op->lookup)
                break;
        }
        goto return_base;
lookup_parent:
        /* Stop at the parent: record the last component and classify it. */
        nd->last = this;
        nd->last_type = LAST_NORM;
        if (this.name[0] != '.')
            goto return_base;
        if (this.len == 1)
            nd->last_type = LAST_DOT;
        else if (this.len == 2 && this.name[1] == '.')
            nd->last_type = LAST_DOTDOT;
        else
            goto return_base;
return_reval:
        /*
         * We bypassed the ordinary revalidation routines.
         * We may need to check the cached dentry for staleness.
         */
        if (nd->path.dentry && nd->path.dentry->d_sb &&
            (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
            err = -ESTALE;
            /* Note: we do not d_invalidate() */
            if (!nd->path.dentry->d_op->d_revalidate(
                    nd->path.dentry, nd))
                break;
        }
return_base:
        return 0;
out_dput:
        path_put_conditional(&next, nd);
        break;
    }
    /* Every 'break' above lands here: drop nd->path and return err. */
    path_put(&nd->path);
return_err:
    return err;
}

/*
 * path_walk - reset the current task's total_link_count to 0 and hand the
 * walk over to link_path_walk(), whose result is returned unchanged.
 */
static int path_walk(const char *name, struct nameidata *nd)
{
    current->total_link_count = 0;
    return link_path_walk(name, nd);
}

这两个函数完成的工作，就像注释说明的一样，是路径解析，将代表路径的字符串使用‘/’截断，并检查每个目录是否存在、是否有x权限，并一层层把路径加到nd上

这个函数返回后，对于do_filp_open函数来说，就知道是应该创建文件还是打开现有文件了，剩下的就是根据创建和打开两条路线完成各种检查，如open_will_write_to_fs，may_open等，最后把文件指针filp返回

— fsnotify_open()

/*
 * fsnotify_open - file was opened
 *
 * Reports an FS_OPEN event (with FS_IN_ISDIR added for directories) for
 * the inode behind @dentry to inotify, to the parent via fsnotify_parent(),
 * and to fsnotify().
 */
static inline void fsnotify_open(struct dentry *dentry)
{
    struct inode *target = dentry->d_inode;
    __u32 events = S_ISDIR(target->i_mode) ? (FS_OPEN | FS_IN_ISDIR) : FS_OPEN;

    inotify_inode_queue_event(target, events, 0, NULL, NULL);

    fsnotify_parent(dentry, events);
    fsnotify(target, events, target, FSNOTIFY_EVENT_INODE, NULL, 0);
}

这个函数将“文件被打开”这一事件（FS_OPEN，如果是目录还会带上FS_IN_ISDIR）通过inotify/fsnotify通知出去，通知对象包括文件本身的inode以及它的父目录，供监听这些事件的一方使用

—fd_install()

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * @fd:   descriptor slot to fill in the current task's table
 * @file: file pointer to store in that slot
 */

void fd_install(unsigned int fd, struct file *file)
{
    struct files_struct *files = current->files;
    struct fdtable *fdt;
    /* The slot is checked and written under files->file_lock. */
    spin_lock(&files->file_lock);
    fdt = files_fdtable(files);
    /* The slot must still be empty; anything else is treated as a bug. */
    BUG_ON(fdt->fd[fd] != NULL);
    rcu_assign_pointer(fdt->fd[fd], file);
    spin_unlock(&files->file_lock);
}

这个函数把上面得到的文件指针安装到新分配的文件描述符所对应的fd数组槽位中，在files->file_lock的保护下互斥地完成进程文件描述符到文件指针之间的映射

这些步骤完成之后，open调用的工作基本完成

- read()

read和write调用都定义在fs/read_write.c中

/*
 * read(2) entry point.
 *
 * Looks up the struct file behind @fd, reads through vfs_read() starting at
 * the file's current position, stores the updated position back and drops
 * the lookup reference.  Returns -EBADF when @fd does not resolve to a file,
 * otherwise whatever vfs_read() returned.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    struct file *file;
    loff_t pos;
    ssize_t ret;
    int fput_needed;

    file = fget_light(fd, &fput_needed);
    if (!file)
        return -EBADF;

    /* Work on a local copy of the offset and write it back afterwards. */
    pos = file_pos_read(file);
    ret = vfs_read(file, buf, count, &pos);
    file_pos_write(file, pos);
    fput_light(file, fput_needed);

    return ret;
}

– fget_light()

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 * You can use this only if it is guaranteed that the current task already
 * holds a refcnt to that file. That check has to be done at fget() only
 * and a flag is returned to be passed to the corresponding fput_light().
 * There must not be a cloning between an fget_light/fput_light pair.
 *
 * @fd:          descriptor to look up in current->files
 * @fput_needed: out parameter; set to 1 only when a reference on the file
 *               was taken here, otherwise left at 0
 *
 * Returns the file found for @fd, or NULL if there is none or the
 * reference could not be taken.
 */
struct file *fget_light(unsigned int fd, int *fput_needed)
{
    struct file *file;
    struct files_struct *files = current->files;

    *fput_needed = 0;
    /* files->count == 1: plain lookup, no reference is taken. */
    if (likely((atomic_read(&files->count) == 1))) {
        file = fcheck_files(files, fd);
    } else {
        /* Otherwise look up and take the reference inside an RCU read section. */
        rcu_read_lock();
        file = fcheck_files(files, fd);
        if (file) {
            /* Only report fput_needed when the increment actually succeeded. */
            if (atomic_long_inc_not_zero(&file->f_count))
                *fput_needed = 1;
            else
                /* Didn't get the reference, someone's freed */
                file = NULL;
        }
        rcu_read_unlock();
    }

    return file;
}

作用也是比较明确的，就是利用文件描述符，从当前进程的打开文件表中找出与文件描述符绑定的文件指针。这里分出了两种情况——文件描述符表是否被共享：未共享时直接查表，不增加引用计数；共享时需要在RCU读临界区内对文件的引用计数增1，并通过fput_needed告诉调用者之后需要调用fput()归还引用

– file_pos_read()

这个函数简单到难以置信，完成的就是把文件当前访问到的偏移量读出来

/* Getter for the file's current offset: returns file->f_pos as is. */
static inline loff_t file_pos_read(struct file *file)
{
    const loff_t offset = file->f_pos;

    return offset;
}

思来想去，这个函数存在的意义估计也就是为了增加代码可读性，还为了不影响性能专门写成了`static inline`……

– vfs_read()

这个函数是read中的重头戏，不仅需要利用文件指针，从不知道到底在哪里的文件中读出数据，还需要把这些数据放到用户空间的缓冲区中

/*
 * vfs_read - read from @file into the user-space buffer @buf.
 *
 * @file:  file to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested
 * @pos:   in/out file offset handed to the underlying read method
 *
 * Returns -EBADF if @file lacks FMODE_READ, -EINVAL if it has no f_op or
 * neither a ->read nor an ->aio_read method, -EFAULT if access_ok() rejects
 * @buf, a negative value from rw_verify_area(), or otherwise the result of
 * ->read / do_sync_read().
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_READ))
        return -EBADF;
    if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
        return -EINVAL;
    /* The buffer is checked with VERIFY_WRITE: it is the destination of the read. */
    if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
        return -EFAULT;

    ret = rw_verify_area(READ, file, pos, count);
    if (ret >= 0) {
        /* A non-negative result replaces the requested byte count. */
        count = ret;
        /* Prefer ->read; fall back to do_sync_read() when only ->aio_read exists. */
        if (file->f_op->read)
            ret = file->f_op->read(file, buf, count, pos);
        else
            ret = do_sync_read(file, buf, count, pos);
        /* Access notification and byte accounting only when something was read. */
        if (ret > 0) {
            fsnotify_access(file->f_path.dentry);
            add_rchar(current, ret);
        }
        /* Called for every attempt that got past rw_verify_area(). */
        inc_syscr(current);
    }

    return ret;
}

这个函数上来就是一系列检查，包括文件是否以可读模式打开（FMODE_READ），文件操作表f_op中是否提供了read或aio_read方法，还有用户缓冲区是否可写（access_ok）

然后rw_verify_area检查文件偏移量和想要读取字节数有没有问题，如下

— rw_verify_area()

/*
 * rw_verify_area doesn't like huge counts. We limit
 * them to something that fits in "int" so that others
 * won't have to do range checks all the time.
 */
#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)

/*
 * rw_verify_area - validate an offset/length pair before a read or write.
 *
 * @read_write: READ or WRITE
 * @file:       file being accessed
 * @ppos:       pointer to the starting offset
 * @count:      number of bytes requested
 *
 * Returns -EINVAL when @count or the offset range is negative, a negative
 * value from locks_mandatory_area(), a non-zero value from
 * security_file_permission(), or otherwise @count capped at MAX_RW_COUNT.
 */
int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
{
    struct inode *inode;
    loff_t pos;
    int retval = -EINVAL;

    inode = file->f_path.dentry->d_inode;
    /* Reject counts that would be negative when viewed as ssize_t. */
    if (unlikely((ssize_t) count < 0))
        return retval;
    pos = *ppos;
    /* Reject a negative start offset or a range whose end is negative as loff_t. */
    if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
        return retval;

    /* Mandatory-lock check, only when the inode has locks and mandatory_lock() holds. */
    if (unlikely(inode->i_flock && mandatory_lock(inode))) {
        retval = locks_mandatory_area(
            read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
            inode, file, pos, count);
        if (retval < 0)
            return retval;
    }
    retval = security_file_permission(file,
                read_write == READ ? MAY_READ : MAY_WRITE);
    if (retval)
        return retval;
    /* Success: hand back the (possibly clamped) byte count. */
    return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}

检查完成后，根据文件操作中是否定义了读操作采取不同的方式进行读取

实际上两种方式真正读取的函数都定义在f_op中，do_sync_read使用的函数是aio_read（asynchronous I/O，即异步IO），并在其外面包了一层同步等待，使异步接口表现得像同步读取一样。这些函数的具体定义我就不再发掘了，不光完成了从不同介质中的文件数据读取，而且还完成了内核空间向用户空间的数据传送。实现起来，我认为不是C语言语义能够表示清楚的

file_pos_write这个函数和上面的file_pos_read一样神奇，我就不详细说明了，区别就和getter、setter之间区别一样

– fput_light()

这个和上面的fget_light配对使用：fget_light在文件描述符表被共享时增加了引用计数并把fput_needed置为1，这里就根据这个标志决定是否需要调用fput()把引用还回去

/*
 * Counterpart of fget_light(): calls fput() on @file only when the
 * @fput_needed flag is non-zero; otherwise does nothing.
 */
static inline void fput_light(struct file *file, int fput_needed)
{
    const int drop_ref = (fput_needed != 0);

    if (unlikely(drop_ref)) {
        fput(file);
    }
}

/*
 * fput - decrement @file's f_count and, when atomic_long_dec_and_test()
 * reports true, run the teardown in __fput().
 */
void fput(struct file *file)
{
    /* Nothing more to do unless the decrement-and-test succeeded. */
    if (!atomic_long_dec_and_test(&file->f_count))
        return;

    __fput(file);
}

EXPORT_SYMBOL(fput);

/* __fput is called from task context when aio completion releases the
 * last use of a struct file *.  Do not use otherwise.
 *
 * Tears down @file: sends the close notification, releases eventpoll and
 * flock state, calls the ->fasync and ->release methods when present,
 * frees security/IMA state, drops the cdev, fops and owner-pid references,
 * and finally frees the file and puts its dentry and vfsmount.
 */
void __fput(struct file *file)
{
    /* Saved up front: f_path is cleared and the file freed before these are put. */
    struct dentry *dentry = file->f_path.dentry;
    struct vfsmount *mnt = file->f_path.mnt;
    struct inode *inode = dentry->d_inode;

    might_sleep();

    fsnotify_close(file);
    /*
     * The function eventpoll_release() should be the first called
     * in the file cleanup chain.
     */
    eventpoll_release(file);
    locks_remove_flock(file);

    /* Files flagged FASYNC get a ->fasync(-1, file, 0) call if the method exists. */
    if (unlikely(file->f_flags & FASYNC)) {
        if (file->f_op && file->f_op->fasync)
            file->f_op->fasync(-1, file, 0);
    }
    /* Optional per-file release hook; its return value is not used. */
    if (file->f_op && file->f_op->release)
        file->f_op->release(inode, file);
    security_file_free(file);
    ima_file_free(file);
    /* Character-device inodes with an i_cdev drop that cdev reference. */
    if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
        cdev_put(inode->i_cdev);
    fops_put(file->f_op);
    put_pid(file->f_owner.pid);
    file_kill(file);
    /* Only files opened with FMODE_WRITE give back write access. */
    if (file->f_mode & FMODE_WRITE)
        drop_file_write_access(file);
    file->f_path.dentry = NULL;
    file->f_path.mnt = NULL;
    file_free(file);
    /* Use the saved locals; file must not be touched after file_free(). */
    dput(dentry);
    mntput(mnt);
}

可以看到，一系列函数转发让整个处理过程变得复杂，说明针对不同的情况，需要的操作也是不同的

- write()

和read的唯一区别就是中间完成实际功能的函数是vfs_write，我们简单看一下

/*
 * vfs_write - write from the user-space buffer @buf into @file.
 *
 * @file:  file to write to
 * @buf:   user-space source buffer
 * @count: number of bytes requested
 * @pos:   in/out file offset handed to the underlying write method
 *
 * Returns -EBADF if @file lacks FMODE_WRITE, -EINVAL if it has no f_op or
 * neither a ->write nor an ->aio_write method, -EFAULT if access_ok()
 * rejects @buf, a negative value from rw_verify_area(), or otherwise the
 * result of ->write / do_sync_write().
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_WRITE))
        return -EBADF;
    if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
        return -EINVAL;
    /* The buffer is checked with VERIFY_READ: it is the source of the write. */
    if (unlikely(!access_ok(VERIFY_READ, buf, count)))
        return -EFAULT;

    ret = rw_verify_area(WRITE, file, pos, count);
    if (ret >= 0) {
        /* A non-negative result replaces the requested byte count. */
        count = ret;
        /* Prefer ->write; fall back to do_sync_write() when only ->aio_write exists. */
        if (file->f_op->write)
            ret = file->f_op->write(file, buf, count, pos);
        else
            ret = do_sync_write(file, buf, count, pos);
        /* Modify notification and byte accounting only when something was written. */
        if (ret > 0) {
            fsnotify_modify(file->f_path.dentry);
            add_wchar(current, ret);
        }
        /* Called for every attempt that got past rw_verify_area(). */
        inc_syscw(current);
    }

    return ret;
}

EXPORT_SYMBOL(vfs_write);

看到这些代码的时候我甚至怀疑我自己的眼睛，除了read全都变成了write以外，这个函数和vfs_read在结构上没有任何区别（对应的细节也随之镜像：access_ok检查的是VERIFY_READ，通知用的是fsnotify_modify，统计用的是add_wchar和inc_syscw）。这应该就是vfs设计理念体现在代码上的特点了

- end with close()

当一个文件使用完成，内存资源需要被释放的时候，这个系统调用就发挥作用，本质上是open的逆过程，所以open过程中考虑的很多种情况，这里需要逆向考虑

/*
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 *
 * Returns -EBADF when @fd is out of range or has no file installed;
 * otherwise the result of filp_close(), with the restart codes mapped
 * to -EINTR.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
    struct file * filp;
    struct files_struct *files = current->files;
    struct fdtable *fdt;
    int retval;

    /* All fd-table manipulation below happens under files->file_lock. */
    spin_lock(&files->file_lock);
    fdt = files_fdtable(files);
    if (fd >= fdt->max_fds)
        goto out_unlock;
    filp = fdt->fd[fd];
    if (!filp)
        goto out_unlock;
    /* Empty the slot, clear the close-on-exec bit and give the fd number back. */
    rcu_assign_pointer(fdt->fd[fd], NULL);
    FD_CLR(fd, fdt->close_on_exec);
    __put_unused_fd(files, fd);
    spin_unlock(&files->file_lock);
    /* The file itself is closed after the lock has been dropped. */
    retval = filp_close(filp, files);

    /* can't restart close syscall because file table entry was cleared */
    if (unlikely(retval == -ERESTARTSYS ||
             retval == -ERESTARTNOINTR ||
             retval == -ERESTARTNOHAND ||
             retval == -ERESTART_RESTARTBLOCK))
        retval = -EINTR;

    return retval;

out_unlock:
    spin_unlock(&files->file_lock);
    return -EBADF;
}
EXPORT_SYMBOL(sys_close);

实际上，真正的实现比open简单一些，很多部分我们之前都分析过了，我们只看filp_close这个函数是如何实现功能的

– filp_close()

/*
 * "id" is the POSIX thread ID. We use the
 * files pointer for this..
 *
 * filp_close - flush @filp if it has a ->flush method, run the dnotify and
 * POSIX-lock cleanup for @id, then drop one reference with fput().
 *
 * Returns 0 (after logging an error) when the file count is already zero,
 * otherwise the value returned by ->flush, or 0 if there is no such method.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
    int flush_err = 0;

    if (file_count(filp) == 0) {
        printk(KERN_ERR "VFS: Close: file count is 0\n");
        return 0;
    }

    /* ->flush is optional; only its result is reported to the caller. */
    if (filp->f_op) {
        if (filp->f_op->flush)
            flush_err = filp->f_op->flush(filp, id);
    }

    dnotify_flush(filp, id);
    locks_remove_posix(filp, id);
    fput(filp);

    return flush_err;
}

关闭的过程中会调用文件系统定义的flush函数（如果提供了的话），随后通过fput()放掉引用，open就没有这个福利，我认为原因是open的时候正好是在内存中构建文件指针的过程，而这里只需要释放，所以可以使用文件系统提供的函数指针

小结

整个调用过程还是比较复杂的，不仅仅是打开些什么东西就完事了，对一个文件进行了操作，不仅影响了当前进程，还影响了文件系统的内存表示，其他打开过这个文件的进程，以及文件系统的物理介质。很多的审计和检查过程我认为有待深入研究

read、write之旅