【转】inode缓存与dentry缓存

inode缓存与dentry缓存 ,转载,供学习参考

1. inode缓存

   1: struct inode {
   2:     /* RCU path lookup touches following: */
   3:     umode_t            i_mode;
   4:     uid_t            i_uid;
   5:     gid_t            i_gid;
   6:     const struct inode_operations    *i_op;
   7:     struct super_block    *i_sb;
   8:  
   9:     spinlock_t        i_lock;    /* i_blocks, i_bytes, maybe i_size */
  10:     unsigned int        i_flags;
  11:     unsigned long        i_state;
  12: #ifdef CONFIG_SECURITY
  13:     void            *i_security;
  14: #endif
  15:     struct mutex        i_mutex;
  16:  
  17:  
  18:     unsigned long        dirtied_when;    /* jiffies of first dirtying */
  19:  
  20:     struct hlist_node    i_hash;
  21:     struct list_head    i_wb_list;    /* backing dev IO list */
  22:     struct list_head    i_lru;        /* inode LRU list */
  23:     struct list_head    i_sb_list;
  24:     union {
  25:         struct list_head    i_dentry;
  26:         struct rcu_head        i_rcu;
  27:     };
  28:     unsigned long        i_ino;
  29:     atomic_t        i_count;
  30:     unsigned int        i_nlink;
  31:     dev_t            i_rdev;
  32:     unsigned int        i_blkbits;
  33:     u64            i_version;
  34:     loff_t            i_size;
  35: #ifdef __NEED_I_SIZE_ORDERED
  36:     seqcount_t        i_size_seqcount;
  37: #endif
  38:     struct timespec        i_atime;
  39:     struct timespec        i_mtime;
  40:     struct timespec        i_ctime;
  41:     blkcnt_t        i_blocks;
  42:     unsigned short          i_bytes;
  43:     struct rw_semaphore    i_alloc_sem;
  44:     const struct file_operations    *i_fop;    /* former ->i_op->default_file_ops */
  45:     struct file_lock    *i_flock;
  46:     struct address_space    *i_mapping;
  47:     struct address_space    i_data;
  48: #ifdef CONFIG_QUOTA
  49:     struct dquot        *i_dquot[MAXQUOTAS];
  50: #endif
  51:     struct list_head    i_devices;
  52:     union {
  53:         struct pipe_inode_info    *i_pipe;
  54:         struct block_device    *i_bdev;
  55:         struct cdev        *i_cdev;
  56:     };
  57:  
  58:     __u32            i_generation;
  59:  
  60: #ifdef CONFIG_FSNOTIFY
  61:     __u32            i_fsnotify_mask; /* all events this inode cares about */
  62:     struct hlist_head    i_fsnotify_marks;
  63: #endif
  64:  
  65: #ifdef CONFIG_IMA
  66:     atomic_t        i_readcount; /* struct files open RO */
  67: #endif
  68:     atomic_t        i_writecount;
  69: #ifdef CONFIG_FS_POSIX_ACL
  70:     struct posix_acl    *i_acl;
  71:     struct posix_acl    *i_default_acl;
  72: #endif
  73:     void            *i_private; /* fs or device private pointer */
  74: };

inode可能处于三种状态:

1)unused,里面没有保存有效的内容,可以被复用为新的用途;

2)in use,正在被使用,其成员i_count以及i_nlink一定大于0,此时inode与文件系统或者说设备上的文件相关联,但是自从上次与设备同步后,内容没有发生改变,即不是dirty的;

3)dirty,inode里面的内容已经与文件系统中的文件内容不一致了,即脏了,需要进行文件同步操作。

前两种状态的inode都各自位于一个全局的链表中,而第三种的inode位于super_block结构体中的一个链表中。

先看inode结构体中的一个成员:

struct list_head    i_lru;        /* inode LRU list */

对应着一个全局的链表:

static LIST_HEAD(inode_lru);
static DEFINE_SPINLOCK(inode_lru_lock);

   1: /*
   2:  * Called when we're dropping the last reference
   3:  * to an inode.
   4:  *
   5:  * Call the FS "drop_inode()" function, defaulting to
   6:  * the legacy UNIX filesystem behaviour.  If it tells
   7:  * us to evict inode, do so.  Otherwise, retain inode
   8:  * in cache if fs is alive, sync and evict if fs is
   9:  * shutting down.
  10:  */
  11: static void iput_final(struct inode *inode)
  12: {
  13:     struct super_block *sb = inode->i_sb;
  14:     const struct super_operations *op = inode->i_sb->s_op;
  15:     int drop;
  16:  
  17:     WARN_ON(inode->i_state & I_NEW);
  18:  
  19:     if (op && op->drop_inode)
  20:         drop = op->drop_inode(inode);
  21:     else
  22:         drop = generic_drop_inode(inode);
  23:  
  24:     if (!drop && (sb->s_flags & MS_ACTIVE)) {
  25:         inode->i_state |= I_REFERENCED;
  26:         if (!(inode->i_state & (I_DIRTY|I_SYNC)))
  27:             inode_lru_list_add(inode);
  28:         spin_unlock(&inode->i_lock);
  29:         return;
  30:     }
  31:  
  32:     if (!drop) {
  33:         inode->i_state |= I_WILL_FREE;
  34:         spin_unlock(&inode->i_lock);
  35:         write_inode_now(inode, 1);
  36:         spin_lock(&inode->i_lock);
  37:         WARN_ON(inode->i_state & I_NEW);
  38:         inode->i_state &= ~I_WILL_FREE;
  39:     }
  40:  
  41:     inode->i_state |= I_FREEING;
  42:     inode_lru_list_del(inode);
  43:     spin_unlock(&inode->i_lock);
  44:  
  45:     evict(inode);
  46: }

函数iput_final是在当inode没有被任何地方引用后,即变成了unused状态后,回收inode的机制。

if (op && op->drop_inode)
        drop = op->drop_inode(inode);
    else
        drop = generic_drop_inode(inode);

drop为非0时,表示i_nlink为0,或者inode已不在inode_hashtable的拉链表中(inode_unhashed),即这个inode可以被释放掉;反之drop为0时,inode还需要保留。

   1: /*
   2:  * Normal UNIX filesystem behaviour: delete the
   3:  * inode when the usage count drops to zero, and
   4:  * i_nlink is zero.
   5:  */
   6: int generic_drop_inode(struct inode *inode)
   7: {
   8:     return !inode->i_nlink || inode_unhashed(inode);
   9: }
  10: EXPORT_SYMBOL_GPL(generic_drop_inode);

if (!drop && (sb->s_flags & MS_ACTIVE)) {
        inode->i_state |= I_REFERENCED;
        if (!(inode->i_state & (I_DIRTY|I_SYNC)))
           inode_lru_list_add(inode);
        spin_unlock(&inode->i_lock);
        return;
    }

如果superblock还处于活动状态(设置了MS_ACTIVE标志),就调用inode_lru_list_add将inode添加到unused列表中,即将inode缓存起来。

否则,就先调用write_inode_now写回到磁盘上,再调用inode_lru_list_del将已经缓存下来的inode删除掉,最后调用evict函数将inode彻底删除。

   1: static void inode_lru_list_add(struct inode *inode)
   2: {
   3:     spin_lock(&inode_lru_lock);
   4:     if (list_empty(&inode->i_lru)) {
   5:         list_add(&inode->i_lru, &inode_lru);
   6:         inodes_stat.nr_unused++;
   7:     }
   8:     spin_unlock(&inode_lru_lock);
   9: }

因此inode_lru就是全局的unused inode列表,通过“Least Recently Used”的顺序保存。

此外,操作inode_lru的函数还有prune_icache

   1: /*
   2:  * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
   3:  * temporary list and then are freed outside inode_lru_lock by dispose_list().
   4:  *
   5:  * Any inodes which are pinned purely because of attached pagecache have their
   6:  * pagecache removed.  If the inode has metadata buffers attached to
   7:  * mapping->private_list then try to remove them.
   8:  *
   9:  * If the inode has the I_REFERENCED flag set, then it means that it has been
  10:  * used recently - the flag is set in iput_final(). When we encounter such an
  11:  * inode, clear the flag and move it to the back of the LRU so it gets another
  12:  * pass through the LRU before it gets reclaimed. This is necessary because of
  13:  * the fact we are doing lazy LRU updates to minimise lock contention so the
  14:  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
  15:  * with this flag set because they are the inodes that are out of order.
  16:  */
  17: static void prune_icache(int nr_to_scan)
  18: {
  19:     LIST_HEAD(freeable);
  20:     int nr_scanned;
  21:     unsigned long reap = 0;
  22:  
  23:     down_read(&iprune_sem);
  24:     spin_lock(&inode_lru_lock);
  25:     for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
  26:         struct inode *inode;
  27:  
  28:         if (list_empty(&inode_lru))
  29:             break;
  30:  
  31:         inode = list_entry(inode_lru.prev, struct inode, i_lru);
  32:  
  33:         /*
  34:          * we are inverting the inode_lru_lock/inode->i_lock here,
  35:          * so use a trylock. If we fail to get the lock, just move the
  36:          * inode to the back of the list so we don't spin on it.
  37:          */
  38:         if (!spin_trylock(&inode->i_lock)) {
  39:             list_move(&inode->i_lru, &inode_lru);
  40:             continue;
  41:         }
  42:  
  43:         /*
  44:          * Referenced or dirty inodes are still in use. Give them
  45:          * another pass through the LRU as we canot reclaim them now.
  46:          */
  47:         if (atomic_read(&inode->i_count) ||
  48:             (inode->i_state & ~I_REFERENCED)) {
  49:             list_del_init(&inode->i_lru);
  50:             spin_unlock(&inode->i_lock);
  51:             inodes_stat.nr_unused--;
  52:             continue;
  53:         }
  54:  
  55:         /* recently referenced inodes get one more pass */
  56:         if (inode->i_state & I_REFERENCED) {
  57:             inode->i_state &= ~I_REFERENCED;
  58:             list_move(&inode->i_lru, &inode_lru);
  59:             spin_unlock(&inode->i_lock);
  60:             continue;
  61:         }
  62:         if (inode_has_buffers(inode) || inode->i_data.nrpages) {
  63:             __iget(inode);
  64:             spin_unlock(&inode->i_lock);
  65:             spin_unlock(&inode_lru_lock);
  66:             if (remove_inode_buffers(inode))
  67:                 reap += invalidate_mapping_pages(&inode->i_data,
  68:                                 0, -1);
  69:             iput(inode);
  70:             spin_lock(&inode_lru_lock);
  71:  
  72:             if (inode != list_entry(inode_lru.next,
  73:                         struct inode, i_lru))
  74:                 continue;    /* wrong inode or list_empty */
  75:             /* avoid lock inversions with trylock */
  76:             if (!spin_trylock(&inode->i_lock))
  77:                 continue;
  78:             if (!can_unuse(inode)) {
  79:                 spin_unlock(&inode->i_lock);
  80:                 continue;
  81:             }
  82:         }
  83:         WARN_ON(inode->i_state & I_NEW);
  84:         inode->i_state |= I_FREEING;
  85:         spin_unlock(&inode->i_lock);
  86:  
  87:         list_move(&inode->i_lru, &freeable);
  88:         inodes_stat.nr_unused--;
  89:     }
  90:     if (current_is_kswapd())
  91:         __count_vm_events(KSWAPD_INODESTEAL, reap);
  92:     else
  93:         __count_vm_events(PGINODESTEAL, reap);
  94:     spin_unlock(&inode_lru_lock);
  95:  
  96:     dispose_list(&freeable);
  97:     up_read(&iprune_sem);
  98: }

该函数的作用是在内存压力较大时,通过缩减缓存的inode列表inode_lru以释放出更多的内存。

该函数就是从inode_lru链表的尾部(即最久未被使用的一端,inode_lru.prev)开始取inode出来,做一些简单检查,如果inode还有一些原因需要继续存在于缓存中,就将该inode移回链表的另一端,然后检查下一个inode。

使得inode继续保留的原因包括:无法获取到操作inode中数据的锁i_lock;inode中的数据是脏的;inode的使用计数非0;inode刚刚被引用过等等。

还有一个比较实际的问题:我们看到在调用iput_final时,如果i_nlink非0,并且inode仍保存在inode_hashtable的拉链表中,就将其放到缓存inode_lru中;而在prune_icache时,会检查i_count引用计数是否为0。

这也就是说,如果一个inode对应的磁盘文件已经被删除了,但是还有进程对其进行操作的话,那么它不会被直接删除,而是会保存在缓存中,也就是说对其操作的进程仍然可以对已经缓存下来的数据页面page进行操作。

直到没有进程再对其进行操作了,才有可能被清除出缓存。

inode中有两个链表头元素,分别是i_sb_list和i_wb_list,其中i_sb_list是super_block->s_inodes列表的元素,而i_wb_list是用于维护设备的后备inode列表。

2. dentry缓存

dentry缓存的目的,为了减少对慢速磁盘的访问,每当VFS文件系统对底层的数据进行访问时,都会将访问的结果缓存下来,保存成一个dentry对象。

而且dentry对象的组织与管理,是和inode缓存极其相似的,也有一个hash表,和一个lru队列。

而且当内存压力较大时,也会调用prune_dcache来企图释放lru中优先级较低的dentry项目。

区别在于,inode是不需要维护目录的关系的,但是dentry需要,因此dentry的组织比inode要复杂。

   1: static struct hlist_bl_head *dentry_hashtable __read_mostly;
   2:  

在super_block中

   1: /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
   2:     struct list_head    s_dentry_lru;    /* unused dentry lru */
   3:  

因此,保存dentry全局hash表的数据结构是全局的,而保存dentry缓存的数据结构是存在于super_block数据结构中。

   1: /*
   2:  * dentry_lru_(add|del|move_tail) must be called with d_lock held.
   3:  */
   4: static void dentry_lru_add(struct dentry *dentry)
   5: {
   6:     if (list_empty(&dentry->d_lru)) {
   7:         spin_lock(&dcache_lru_lock);
   8:         list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
   9:         dentry->d_sb->s_nr_dentry_unused++;
  10:         dentry_stat.nr_unused++;
  11:         spin_unlock(&dcache_lru_lock);
  12:     }
  13: }

dentry_lru_add函数用于向dentry缓存中添加一个刚被释放(引用计数降为0)的dentry,它被函数dput调用。

   1: /* 
   2:  * This is dput
   3:  *
   4:  * This is complicated by the fact that we do not want to put
   5:  * dentries that are no longer on any hash chain on the unused
   6:  * list: we'd much rather just get rid of them immediately.
   7:  *
   8:  * However, that implies that we have to traverse the dentry
   9:  * tree upwards to the parents which might _also_ now be
  10:  * scheduled for deletion (it may have been only waiting for
  11:  * its last child to go away).
  12:  *
  13:  * This tail recursion is done by hand as we don't want to depend
  14:  * on the compiler to always get this right (gcc generally doesn't).
  15:  * Real recursion would eat up our stack space.
  16:  */
  17:  
  18: /*
  19:  * dput - release a dentry
  20:  * @dentry: dentry to release 
  21:  *
  22:  * Release a dentry. This will drop the usage count and if appropriate
  23:  * call the dentry unlink method as well as removing it from the queues and
  24:  * releasing its resources. If the parent dentries were scheduled for release
  25:  * they too may now get deleted.
  26:  */
  27: void dput(struct dentry *dentry)
  28: {
  29:     if (!dentry)
  30:         return;
  31:  
  32: repeat:
  33:     if (dentry->d_count == 1)
  34:         might_sleep();
  35:     spin_lock(&dentry->d_lock);
  36:     BUG_ON(!dentry->d_count);
  37:     if (dentry->d_count > 1) {
  38:         dentry->d_count--;
  39:         spin_unlock(&dentry->d_lock);
  40:         return;
  41:     }
  42:  
  43:     if (dentry->d_flags & DCACHE_OP_DELETE) {
  44:         if (dentry->d_op->d_delete(dentry))
  45:             goto kill_it;
  46:     }
  47:  
  48:     /* Unreachable? Get rid of it */
  49:      if (d_unhashed(dentry))
  50:         goto kill_it;
  51:  
  52:     /* Otherwise leave it cached and ensure it's on the LRU */
  53:     dentry->d_flags |= DCACHE_REFERENCED;
  54:     dentry_lru_add(dentry);
  55:  
  56:     dentry->d_count--;
  57:     spin_unlock(&dentry->d_lock);
  58:     return;
  59:  
  60: kill_it:
  61:     dentry = dentry_kill(dentry, 1);
  62:     if (dentry)
  63:         goto repeat;
  64: }
  65: EXPORT_SYMBOL(dput);

所有的dentry实例会形成一个网络,用于反映文件系统的结构。

d_subdirs成员,里面保存着所有的子目录以及该目录下的文件组成的列表。

d_child成员,是该dentry链接到其父目录的dentry节点的锚点。

这两个成员,是构成文件系统的层次结构的基本设施。

if (dentry->d_count == 1)
        might_sleep();

参考:http://yuxu9710108.blog.163.com/blog/static/23751534201011715413404/

might_sleep用于调试:当在原子上下文(atomic context)中执行到可能睡眠的代码路径时,给出警告提示。

分析dput函数的逻辑:

如果dentry的引用计数大于1,那么代表还有其他的地方在使用这个dentry,因此只减少引用计数,直接返回;

如果dentry->d_flags里面设置了DCACHE_OP_DELETE标志,那么调用d_op->d_delete函数指针判断是否需要删除该dentry,若其返回非0,再调用dentry_kill来处理;

【d_op->d_delete与dentry_kill在功能上有什么不同?】

如果在全局的hash表中也已经找不到该dentry了(d_unhashed),那么直接调用dentry_kill来处理;

如果dentry的引用计数为1,而且也不属于上面二种需要调用dentry_kill的情况,那么就将其缓存在super_block的LRU队列中。

我们看一种可能的d_delete的实现

   1: /*
   2:  * This is called from dput() when d_count is going to 0.
   3:  */
   4: static int nfs_dentry_delete(const struct dentry *dentry)
   5: {
   6:     dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
   7:         dentry->d_parent->d_name.name, dentry->d_name.name,
   8:         dentry->d_flags);
   9:  
  10:     /* Unhash any dentry with a stale inode */
  11:     if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
  12:         return 1;
  13:  
  14:     if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
  15:         /* Unhash it, so that ->d_iput() would be called */
  16:         return 1;
  17:     }
  18:     if (!(dentry->d_sb->s_flags & MS_ACTIVE)) {
  19:         /* Unhash it, so that ancestors of killed async unlink
  20:          * files will be cleaned up during umount */
  21:         return 1;
  22:     }
  23:     return 0;
  24:  
  25: }

可见,该函数是进行一些内部的判断,决定是否需要将该dentry从全局的hash表中删除掉。

if (dentry->d_flags & DCACHE_OP_DELETE) {
        if (dentry->d_op->d_delete(dentry))
            goto kill_it;
    }

   1: /*
   2:  * Finish off a dentry we've decided to kill.
   3:  * dentry->d_lock must be held, returns with it unlocked.
   4:  * If ref is non-zero, then decrement the refcount too.
   5:  * Returns dentry requiring refcount drop, or NULL if we're done.
   6:  */
   7: static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
   8:     __releases(dentry->d_lock)
   9: {
  10:     struct inode *inode;
  11:     struct dentry *parent;
  12:  
  13:     inode = dentry->d_inode;
  14:     if (inode && !spin_trylock(&inode->i_lock)) {

猜你喜欢

转载自blog.csdn.net/wangww631/article/details/78542317
今日推荐