Re: [PATCH v2 3/3] vfs: Use per-cpu list for superblock's inode list

From: Jan Kara
Date: Mon Feb 22 2016 - 06:54:25 EST


On Mon 22-02-16 10:18:44, Peter Zijlstra wrote:
> On Mon, Feb 22, 2016 at 08:34:19AM +1100, Dave Chinner wrote:
> > On Fri, Feb 19, 2016 at 04:10:45PM -0500, Waiman Long wrote:
> > > +/*
> > > + * Superblock's inode list iterator function and arguments macros
> > > + */
> > > +#define SB_INODES_ITER_FUNC(name, lock, struct_fields) \
> > > + struct name ## _arg { \
> > > + struct_fields; \
> > > + }; \
> > > + static int name ## _iter(struct pcpu_list_node *_node, \
> > > + struct pcpu_list_node **_pnext, \
> > > + spinlock_t *lock, void *_arg)
> > > +
> > > +#define SB_INODES_ITER_ARGS(name, i, a) \
> > > + struct inode *i = container_of(_node, struct inode, i_sb_list); \
> > > + struct name ## _arg *a = (struct name ## _arg *)_arg
> > > +
> > > +#define SB_INODES_ITER_ARGS_SAFE(name, i, n, a) \
> > > + struct inode *i = container_of(_node, struct inode, i_sb_list); \
> > > + struct inode *n = container_of(*_pnext, struct inode, i_sb_list);\
> > > + struct name ## _arg *a = (struct name ## _arg *)_arg
> > > +
> > > +#define SB_INODES_ITER_SET_PCPU_LIST_NEXT(n) \
> > > + { *_pnext = &(n)->i_sb_list; }
> > > +
> > > +#define SB_INODES_ITER_CALL(name, sb) \
> > > + pcpu_list_iterate(sb->s_inodes, false, NULL, name ## _iter, &arg)
> > > +
> > > +#define SB_INODES_ITER_CALL_SAFE(name, sb, phead) \
> > > + pcpu_list_iterate(sb->s_inodes, true, phead, name ## _iter, &arg)
> > > +
> >
> > No, just no.
>
> Ha! See I was thinking of something more along the lines of the below.
>
> Which is completely untested and incomplete, but does illustrate the
> point.
>
> Also, I think fsnotify_unmount_inodes() (as per mainline) is missing a
> final iput(need_iput) at the very end, but I could be mistaken, that
> code hurts my brain.

I think the code is actually correct, since need_iput always contains "an
inode further down the list than the current inode". Thus we will always go
through another iteration of the loop, which will drop the reference. And the
inode cannot change state to I_FREEING or I_WILL_FREE because we hold a
reference to it. But it is subtle as hell, so I agree that code needs a rewrite.
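To spell the invariant out: below is only a stripped-down sketch of the current
mainline loop, with the locking and MS_ACTIVE details elided and the forward
scan for a pinnable next inode collapsed into an unconditional pin of next_i,
just to show where each extra reference gets dropped:

	struct inode *inode, *next_i, *need_iput = NULL;

	list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
		struct inode *need_iput_tmp;

		/* skip inodes that cannot have watches */
		if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
		    !atomic_read(&inode->i_count))
			continue;	/* need_iput, if set, survives to the next round */

		need_iput_tmp = need_iput;
		need_iput = NULL;

		if (inode != need_iput_tmp)
			__iget(inode);		/* take our own reference */
		else
			need_iput_tmp = NULL;	/* reuse the reference taken earlier */

		/* pin the next inode, if any, so the iput()s below cannot free it */
		if (&next_i->i_sb_list != &sb->s_inodes) {
			__iget(next_i);
			need_iput = next_i;
		}

		iput(need_iput_tmp);	/* drop an earlier pin we did not reuse */

		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
		fsnotify_inode_delete(inode);
		iput(inode);
	}
	/*
	 * No trailing iput(need_iput): need_iput only ever points at an inode
	 * further down the list, so whenever it is non-NULL another iteration
	 * follows, and that iteration drops the reference either as
	 * need_iput_tmp or, once the pinned inode becomes the current one, as
	 * the final iput(inode). The pinned inode cannot be skipped by the
	 * checks at the top because we hold a reference to it.
	 */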

Honza
>
> ---
> fs/block_dev.c | 51 +++++++++++---------------
> fs/drop_caches.c | 32 +++++-----------
> fs/fs-writeback.c | 65 ++++++++--------------------------
> fs/notify/inode_mark.c | 93 ++++++++++---------------------------------------
> fs/quota/dquot.c | 63 ++++++++-------------------------
> fs/super.c | 65 +++++++++++++++++++++++++++++++++-
> include/linux/fs.h | 9 ++--
> 7 files changed, 154 insertions(+), 224 deletions(-)
>
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1862,38 +1862,29 @@ int __invalidate_device(struct block_dev
> }
> EXPORT_SYMBOL(__invalidate_device);
>
> -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
> -{
> - struct inode *inode, *old_inode = NULL;
> +struct iterate_bdevs_data {
> + void (*func)(struct block_device *, void *);
> + void *arg;
> +};
>
> - spin_lock(&blockdev_superblock->s_inode_list_lock);
> - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
> - struct address_space *mapping = inode->i_mapping;
> +static void __iterate_bdevs(struct inode *inode, void *arg)
> +{
> + struct iterate_bdevs_data *ibd = arg;
>
> - spin_lock(&inode->i_lock);
> - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
> - mapping->nrpages == 0) {
> - spin_unlock(&inode->i_lock);
> - continue;
> - }
> - __iget(inode);
> - spin_unlock(&inode->i_lock);
> - spin_unlock(&blockdev_superblock->s_inode_list_lock);
> - /*
> - * We hold a reference to 'inode' so it couldn't have been
> - * removed from s_inodes list while we dropped the
> - * s_inode_list_lock We cannot iput the inode now as we can
> - * be holding the last reference and we cannot iput it under
> - * s_inode_list_lock. So we keep the reference and iput it
> - * later.
> - */
> - iput(old_inode);
> - old_inode = inode;
> + ibd->func(I_BDEV(inode), ibd->arg);
> +}
>
> - func(I_BDEV(inode), arg);
> +static bool __iterate_bdevs_cond(struct inode *inode, void *arg)
> +{
> + return inode->i_mapping->nrpages != 0;
> +}
>
> - spin_lock(&blockdev_superblock->s_inode_list_lock);
> - }
> - spin_unlock(&blockdev_superblock->s_inode_list_lock);
> - iput(old_inode);
> +void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
> +{
> + struct iterate_bdevs_data ibd = {
> + .func = func,
> + .arg = arg,
> + };
> + iterate_inodes_super(blockdev_superblock, &__iterate_bdevs_cond,
> + &__iterate_bdevs, &ibd);
> }
> --- a/fs/drop_caches.c
> +++ b/fs/drop_caches.c
> @@ -13,30 +13,20 @@
> /* A global variable is a bit ugly, but it keeps the code simple */
> int sysctl_drop_caches;
>
> -static void drop_pagecache_sb(struct super_block *sb, void *unused)
> +static void __drop_pagecache_sb(struct inode *inode, void *unused)
> {
> - struct inode *inode, *toput_inode = NULL;
> -
> - spin_lock(&sb->s_inode_list_lock);
> - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
> - spin_lock(&inode->i_lock);
> - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
> - (inode->i_mapping->nrpages == 0)) {
> - spin_unlock(&inode->i_lock);
> - continue;
> - }
> - __iget(inode);
> - spin_unlock(&inode->i_lock);
> - spin_unlock(&sb->s_inode_list_lock);
> + invalidate_mapping_pages(inode->i_mapping, 0, -1);
> +}
>
> - invalidate_mapping_pages(inode->i_mapping, 0, -1);
> - iput(toput_inode);
> - toput_inode = inode;
> +static bool __drop_pagecache_sb_cond(struct inode *inode, void *unused)
> +{
> + return inode->i_mapping->nrpages != 0;
> +}
>
> - spin_lock(&sb->s_inode_list_lock);
> - }
> - spin_unlock(&sb->s_inode_list_lock);
> - iput(toput_inode);
> +static void drop_pagecache_sb(struct super_block *sb, void *unused)
> +{
> + iterate_inodes_super(sb, &__drop_pagecache_sb_cond,
> + &__drop_pagecache_sb, NULL);
> }
>
> int drop_caches_sysctl_handler(struct ctl_table *table, int write,
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -2102,6 +2102,21 @@ void __mark_inode_dirty(struct inode *in
> }
> EXPORT_SYMBOL(__mark_inode_dirty);
>
> +static void __wait_sb_inodes(struct inode *inode, void *arg)
> +{
> + /*
> + * We keep the error status of individual mapping so that
> + * applications can catch the writeback error using fsync(2).
> + * See filemap_fdatawait_keep_errors() for details.
> + */
> + filemap_fdatawait_keep_errors(inode->i_mapping);
> +}
> +
> +static bool __wait_sb_inodes_cond(struct inode *inode, void *arg)
> +{
> + return inode->i_mapping->nrpages != 0;
> +}
> +
> /*
> * The @s_sync_lock is used to serialise concurrent sync operations
> * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
> @@ -2113,8 +2128,6 @@ EXPORT_SYMBOL(__mark_inode_dirty);
> */
> static void wait_sb_inodes(struct super_block *sb)
> {
> - struct inode *inode, *old_inode = NULL;
> -
> /*
> * We need to be protected against the filesystem going from
> * r/o to r/w or vice versa.
> @@ -2122,52 +2135,8 @@ static void wait_sb_inodes(struct super_
> WARN_ON(!rwsem_is_locked(&sb->s_umount));
>
> mutex_lock(&sb->s_sync_lock);
> - spin_lock(&sb->s_inode_list_lock);
> -
> - /*
> - * Data integrity sync. Must wait for all pages under writeback,
> - * because there may have been pages dirtied before our sync
> - * call, but which had writeout started before we write it out.
> - * In which case, the inode may not be on the dirty list, but
> - * we still have to wait for that writeout.
> - */
> - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
> - struct address_space *mapping = inode->i_mapping;
> -
> - spin_lock(&inode->i_lock);
> - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
> - (mapping->nrpages == 0)) {
> - spin_unlock(&inode->i_lock);
> - continue;
> - }
> - __iget(inode);
> - spin_unlock(&inode->i_lock);
> - spin_unlock(&sb->s_inode_list_lock);
> -
> - /*
> - * We hold a reference to 'inode' so it couldn't have been
> - * removed from s_inodes list while we dropped the
> - * s_inode_list_lock. We cannot iput the inode now as we can
> - * be holding the last reference and we cannot iput it under
> - * s_inode_list_lock. So we keep the reference and iput it
> - * later.
> - */
> - iput(old_inode);
> - old_inode = inode;
> -
> - /*
> - * We keep the error status of individual mapping so that
> - * applications can catch the writeback error using fsync(2).
> - * See filemap_fdatawait_keep_errors() for details.
> - */
> - filemap_fdatawait_keep_errors(mapping);
> -
> - cond_resched();
> -
> - spin_lock(&sb->s_inode_list_lock);
> - }
> - spin_unlock(&sb->s_inode_list_lock);
> - iput(old_inode);
> + iterate_inodes_super(sb, &__wait_sb_inodes_cond,
> + &__wait_sb_inodes, NULL);
> mutex_unlock(&sb->s_sync_lock);
> }
>
> --- a/fs/notify/inode_mark.c
> +++ b/fs/notify/inode_mark.c
> @@ -141,6 +141,24 @@ int fsnotify_add_inode_mark(struct fsnot
> return ret;
> }
>
> +static void __fsnotify_umount_inodes(struct inode *inode, void *arg)
> +{
> + /* for each watch, send FS_UNMOUNT and then remove it */
> + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
> + fsnotify_inode_delete(inode);
> +}
> +
> +static bool __fsnotify_umount_cond(struct inode *inode, void *arg)
> +{
> + /*
> + * If i_count is zero, the inode cannot have any watches and
> + * doing an __iget/iput with MS_ACTIVE clear would actually
> + * evict all inodes with zero i_count from icache which is
> + * unnecessarily violent and may in fact be illegal to do.
> + */
> + return atomic_read(&inode->i_count) != 0;
> +}
> +
> /**
> * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
> * @sb: superblock being unmounted.
> @@ -150,77 +168,6 @@ int fsnotify_add_inode_mark(struct fsnot
> */
> void fsnotify_unmount_inodes(struct super_block *sb)
> {
> - struct inode *inode, *next_i, *need_iput = NULL;
> -
> - spin_lock(&sb->s_inode_list_lock);
> - list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
> - struct inode *need_iput_tmp;
> -
> - /*
> - * We cannot __iget() an inode in state I_FREEING,
> - * I_WILL_FREE, or I_NEW which is fine because by that point
> - * the inode cannot have any associated watches.
> - */
> - spin_lock(&inode->i_lock);
> - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
> - spin_unlock(&inode->i_lock);
> - continue;
> - }
> -
> - /*
> - * If i_count is zero, the inode cannot have any watches and
> - * doing an __iget/iput with MS_ACTIVE clear would actually
> - * evict all inodes with zero i_count from icache which is
> - * unnecessarily violent and may in fact be illegal to do.
> - */
> - if (!atomic_read(&inode->i_count)) {
> - spin_unlock(&inode->i_lock);
> - continue;
> - }
> -
> - need_iput_tmp = need_iput;
> - need_iput = NULL;
> -
> - /* In case fsnotify_inode_delete() drops a reference. */
> - if (inode != need_iput_tmp)
> - __iget(inode);
> - else
> - need_iput_tmp = NULL;
> - spin_unlock(&inode->i_lock);
> -
> - /* In case the dropping of a reference would nuke next_i. */
> - while (&next_i->i_sb_list != &sb->s_inodes) {
> - spin_lock(&next_i->i_lock);
> - if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
> - atomic_read(&next_i->i_count)) {
> - __iget(next_i);
> - need_iput = next_i;
> - spin_unlock(&next_i->i_lock);
> - break;
> - }
> - spin_unlock(&next_i->i_lock);
> - next_i = list_next_entry(next_i, i_sb_list);
> - }
> -
> - /*
> - * We can safely drop s_inode_list_lock here because either
> - * we actually hold references on both inode and next_i or
> - * end of list. Also no new inodes will be added since the
> - * umount has begun.
> - */
> - spin_unlock(&sb->s_inode_list_lock);
> -
> - if (need_iput_tmp)
> - iput(need_iput_tmp);
> -
> - /* for each watch, send FS_UNMOUNT and then remove it */
> - fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
> -
> - fsnotify_inode_delete(inode);
> -
> - iput(inode);
> -
> - spin_lock(&sb->s_inode_list_lock);
> - }
> - spin_unlock(&sb->s_inode_list_lock);
> + iterate_inodes_super(sb, &__fsnotify_umount_cond,
> + &__fsnotify_umount_inodes, NULL);
> }
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -920,55 +920,26 @@ static int dqinit_needed(struct inode *i
> return 0;
> }
>
> +static void __add_dquot_ref(struct inode *inode, void *arg)
> +{
> + int type = (int)(long)(arg);
> +
> + __dquot_initialize(inode, type);
> +}
> +
> +static bool __add_dquot_ref_cond(struct inode *inode, void *arg)
> +{
> + int type = (int)(long)(arg);
> +
> + return atomic_read(&inode->i_writecount) && dqinit_needed(inode, type);
> +}
> +
> /* This routine is guarded by dqonoff_mutex mutex */
> static void add_dquot_ref(struct super_block *sb, int type)
> {
> - struct inode *inode, *old_inode = NULL;
> -#ifdef CONFIG_QUOTA_DEBUG
> - int reserved = 0;
> -#endif
> -
> - spin_lock(&sb->s_inode_list_lock);
> - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
> - spin_lock(&inode->i_lock);
> - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
> - !atomic_read(&inode->i_writecount) ||
> - !dqinit_needed(inode, type)) {
> - spin_unlock(&inode->i_lock);
> - continue;
> - }
> - __iget(inode);
> - spin_unlock(&inode->i_lock);
> - spin_unlock(&sb->s_inode_list_lock);
> -
> -#ifdef CONFIG_QUOTA_DEBUG
> - if (unlikely(inode_get_rsv_space(inode) > 0))
> - reserved = 1;
> -#endif
> - iput(old_inode);
> - __dquot_initialize(inode, type);
> -
> - /*
> - * We hold a reference to 'inode' so it couldn't have been
> - * removed from s_inodes list while we dropped the
> - * s_inode_list_lock. We cannot iput the inode now as we can be
> - * holding the last reference and we cannot iput it under
> - * s_inode_list_lock. So we keep the reference and iput it
> - * later.
> - */
> - old_inode = inode;
> - spin_lock(&sb->s_inode_list_lock);
> - }
> - spin_unlock(&sb->s_inode_list_lock);
> - iput(old_inode);
> -
> -#ifdef CONFIG_QUOTA_DEBUG
> - if (reserved) {
> - quota_error(sb, "Writes happened before quota was turned on "
> - "thus quota information is probably inconsistent. "
> - "Please run quotacheck(8)");
> - }
> -#endif
> + /* XXX we lost debug code */
> + iterate_inodes_super(sb, &__add_dquot_ref_cond,
> + &__add_dquot_ref, (void *)(long)type);
> }
>
> /*
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -204,7 +204,7 @@ static struct super_block *alloc_super(s
> INIT_HLIST_NODE(&s->s_instances);
> INIT_HLIST_BL_HEAD(&s->s_anon);
> mutex_init(&s->s_sync_lock);
> - INIT_LIST_HEAD(&s->s_inodes);
> + PCPU_LIST_HEAD_INIT(&s->s_inodes_cpu);
> spin_lock_init(&s->s_inode_list_lock);
>
> if (list_lru_init_memcg(&s->s_dentry_lru))
> @@ -253,6 +253,67 @@ static struct super_block *alloc_super(s
> return NULL;
> }
>
> +/*
> + * Iterate over all the inodes of this superblock.
> + */
> +void iterate_inodes_super(struct super_block *sb,
> + bool (*cond)(struct inode *, void *),
> + void (*func)(struct inode *, void *), void *arg)
> +{
> + struct inode *inode, *n_inode, *old_inode = NULL;
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct list_head *head = &per_cpu_ptr(&sb->s_inodes_cpu, cpu)->list;
> + spinlock_t *lock = &per_cpu_ptr(&sb->s_inodes_cpu, cpu)->lock;
> +
> + spin_lock(lock);
> + list_for_each_entry_safe(inode, n_inode, head, i_sb_list) {
> + struct inode *put_inode;
> +
> + spin_lock(&inode->i_lock);
> + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
> + !cond(inode, arg)) {
> + spin_unlock(&inode->i_lock);
> + continue;
> + }
> +
> + put_inode = old_inode;
> + old_inode = NULL;
> +
> + if (inode != put_inode)
> + __iget(inode);
> + else
> + put_inode = NULL;
> + spin_unlock(&inode->i_lock);
> +
> + while (&n_inode->i_sb_list != head) {
> + spin_lock(&n_inode->i_lock);
> + if (!((n_inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
> + !cond(n_inode, arg))) {
> + __iget(n_inode);
> + old_inode = n_inode;
> + spin_unlock(&n_inode->i_lock);
> + break;
> + }
> + spin_unlock(&n_inode->i_lock);
> + n_inode = list_next_entry(n_inode, i_sb_list);
> + }
> +
> + spin_unlock(lock);
> +
> + iput(put_inode);
> + func(inode, arg);
> + iput(inode);
> + cond_resched();
> +
> + spin_lock(lock);
> + }
> + spin_unlock(lock);
> + }
> + iput(old_inode);
> +}
> +
> /* Superblock refcounting */
>
> /*
> @@ -426,7 +487,7 @@ void generic_shutdown_super(struct super
> if (sop->put_super)
> sop->put_super(sb);
>
> - if (!list_empty(&sb->s_inodes)) {
> + if (!pcpu_list_empty(&sb->s_inodes_cpu)) {
> printk("VFS: Busy inodes after unmount of %s. "
> "Self-destruct in 5 seconds. Have a nice day...\n",
> sb->s_id);
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -648,7 +648,7 @@ struct inode {
> u16 i_wb_frn_history;
> #endif
> struct list_head i_lru; /* inode LRU list */
> - struct list_head i_sb_list;
> + struct pcpu_list_node i_sb_list;
> union {
> struct hlist_head i_dentry;
> struct rcu_head i_rcu;
> @@ -1397,11 +1397,12 @@ struct super_block {
> */
> int s_stack_depth;
>
> - /* s_inode_list_lock protects s_inodes */
> - spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
> - struct list_head s_inodes; /* all inodes */
> + struct pcpu_list_head s_inodes_cpu;
> };
>
> +extern void iterate_inodes_super(struct super_block *sb,
> + bool (*cond)(struct inode *, void *), void (*func)(struct inode *, void *), void *arg);
> +
> extern struct timespec current_fs_time(struct super_block *sb);
>
> /*
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR