Re: [PATCH v2 8/8] change sb_writers to use percpu_rw_semaphore

From: Jan Kara
Date: Thu Aug 13 2015 - 06:49:00 EST


On Tue 11-08-15 19:04:19, Oleg Nesterov wrote:
> We can remove everything from struct sb_writers except frozen
> and add the array of percpu_rw_semaphore's instead.
>
> This patch doesn't remove sb_writers->wait_unfrozen yet, we keep
> it for get_super_thawed(). We will probably remove it later.
>
> This change tries to address the following problems:
>
> - Firstly, __sb_start_write() looks simply buggy. It does
> __sb_end_write() if it sees ->frozen, but if it migrates
> to another CPU before percpu_counter_dec(), sb_wait_write()
> can wrongly succeed if there is another task which holds
> the same "semaphore": sb_wait_write() can miss the result
> of the previous percpu_counter_inc() but see the result
> of this percpu_counter_dec().
>
> - As Dave Hansen reports, it is suboptimal. The trivial
> microbenchmark that writes to a tmpfs file in a loop runs
> 12% faster if we change this code to rely on RCU and kill
> the memory barriers.
>
> - This code doesn't look simple. It would be better to rely
> on the generic locking code.
>
> According to Dave, this change adds the same performance
> improvement.
>
> Note: with this change both freeze_super() and thaw_super() will do
> synchronize_sched_expedited() 3 times. This is just ugly. But:
>
> - This will be "fixed" by the rcu_sync changes we are going
> to merge. After that freeze_super()->percpu_down_write()
> will use synchronize_sched(), and thaw_super() won't use
> synchronize() at all.
>
> This doesn't need any changes in fs/super.c.
>
> - Once we merge rcu_sync changes, we can also change super.c
> so that all wb_write->rw_sem's will share the single ->rss
> in struct sb_writes, then freeze_super() will need only one
> synchronize_sched().
>
> Signed-off-by: Oleg Nesterov <oleg@xxxxxxxxxx>

The patch looks good. You can add:

Reviewed-by: Jan Kara <jack@xxxxxxxx>

Honza

> ---
> fs/super.c | 107 ++++++++++++++--------------------------------------
> include/linux/fs.h | 19 +++------
> 2 files changed, 35 insertions(+), 91 deletions(-)
>
> diff --git a/fs/super.c b/fs/super.c
> index 75436e2..8762997 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work)
> int i;
>
> for (i = 0; i < SB_FREEZE_LEVELS; i++)
> - percpu_counter_destroy(&s->s_writers.counter[i]);
> + percpu_free_rwsem(&s->s_writers.rw_sem[i]);
> kfree(s);
> }
>
> @@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> goto fail;
>
> for (i = 0; i < SB_FREEZE_LEVELS; i++) {
> - if (percpu_counter_init(&s->s_writers.counter[i], 0,
> - GFP_KERNEL) < 0)
> + if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
> + sb_writers_name[i],
> + &type->s_writers_key[i]))
> goto fail;
> - lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
> - &type->s_writers_key[i], 0);
> }
> - init_waitqueue_head(&s->s_writers.wait);
> init_waitqueue_head(&s->s_writers.wait_unfrozen);
> s->s_bdi = &noop_backing_dev_info;
> s->s_flags = flags;
> @@ -1161,47 +1159,10 @@ out:
> */
> void __sb_end_write(struct super_block *sb, int level)
> {
> - percpu_counter_dec(&sb->s_writers.counter[level-1]);
> - /*
> - * Make sure s_writers are updated before we wake up waiters in
> - * freeze_super().
> - */
> - smp_mb();
> - if (waitqueue_active(&sb->s_writers.wait))
> - wake_up(&sb->s_writers.wait);
> - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
> + percpu_up_read(sb->s_writers.rw_sem + level-1);
> }
> EXPORT_SYMBOL(__sb_end_write);
>
> -static int do_sb_start_write(struct super_block *sb, int level, bool wait,
> - unsigned long ip)
> -{
> - if (wait)
> - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip);
> -retry:
> - if (unlikely(sb->s_writers.frozen >= level)) {
> - if (!wait)
> - return 0;
> - wait_event(sb->s_writers.wait_unfrozen,
> - sb->s_writers.frozen < level);
> - }
> -
> - percpu_counter_inc(&sb->s_writers.counter[level-1]);
> - /*
> - * Make sure counter is updated before we check for frozen.
> - * freeze_super() first sets frozen and then checks the counter.
> - */
> - smp_mb();
> - if (unlikely(sb->s_writers.frozen >= level)) {
> - __sb_end_write(sb, level);
> - goto retry;
> - }
> -
> - if (!wait)
> - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip);
> - return 1;
> -}
> -
> /*
> * This is an internal function, please use sb_start_{write,pagefault,intwrite}
> * instead.
> @@ -1209,7 +1170,7 @@ retry:
> int __sb_start_write(struct super_block *sb, int level, bool wait)
> {
> bool force_trylock = false;
> - int ret;
> + int ret = 1;
>
> #ifdef CONFIG_LOCKDEP
> /*
> @@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait)
> int i;
>
> for (i = 0; i < level - 1; i++)
> - if (lock_is_held(&sb->s_writers.lock_map[i])) {
> + if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
> force_trylock = true;
> break;
> }
> }
> #endif
> - ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_);
> + if (wait && !force_trylock)
> + percpu_down_read(sb->s_writers.rw_sem + level-1);
> + else
> + ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
> +
> WARN_ON(force_trylock & !ret);
> return ret;
> }
> @@ -1249,9 +1214,7 @@ EXPORT_SYMBOL(__sb_start_write);
> */
> static void sb_wait_write(struct super_block *sb, int level)
> {
> - s64 writers;
> -
> - rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
> + percpu_down_write(sb->s_writers.rw_sem + level-1);
> /*
> * We are going to return to userspace and forget about this lock, the
> * ownership goes to the caller of thaw_super() which does unlock.
> @@ -1262,24 +1225,18 @@ static void sb_wait_write(struct super_block *sb, int level)
> * this leads to lockdep false-positives, so currently we do the early
> * release right after acquire.
> */
> - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
> -
> - do {
> - DEFINE_WAIT(wait);
> + percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
> +}
>
> - /*
> - * We use a barrier in prepare_to_wait() to separate setting
> - * of frozen and checking of the counter
> - */
> - prepare_to_wait(&sb->s_writers.wait, &wait,
> - TASK_UNINTERRUPTIBLE);
> +static void sb_freeze_unlock(struct super_block *sb)
> +{
> + int level;
>
> - writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
> - if (writers)
> - schedule();
> + for (level = 0; level < SB_FREEZE_LEVELS; ++level)
> + percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
>
> - finish_wait(&sb->s_writers.wait, &wait);
> - } while (writers);
> + for (level = SB_FREEZE_LEVELS; --level >= 0; )
> + percpu_up_write(sb->s_writers.rw_sem + level);
> }
>
> /**
> @@ -1338,20 +1295,14 @@ int freeze_super(struct super_block *sb)
> return 0;
> }
>
> - /* From now on, no new normal writers can start */
> sb->s_writers.frozen = SB_FREEZE_WRITE;
> - smp_wmb();
> -
> /* Release s_umount to preserve sb_start_write -> s_umount ordering */
> up_write(&sb->s_umount);
> -
> sb_wait_write(sb, SB_FREEZE_WRITE);
> + down_write(&sb->s_umount);
>
> /* Now we go and block page faults... */
> - down_write(&sb->s_umount);
> sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
> - smp_wmb();
> -
> sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
>
> /* All writers are done so after syncing there won't be dirty data */
> @@ -1359,7 +1310,6 @@ int freeze_super(struct super_block *sb)
>
> /* Now wait for internal filesystem counter */
> sb->s_writers.frozen = SB_FREEZE_FS;
> - smp_wmb();
> sb_wait_write(sb, SB_FREEZE_FS);
>
> if (sb->s_op->freeze_fs) {
> @@ -1368,7 +1318,7 @@ int freeze_super(struct super_block *sb)
> printk(KERN_ERR
> "VFS:Filesystem freeze failed\n");
> sb->s_writers.frozen = SB_UNFROZEN;
> - smp_wmb();
> + sb_freeze_unlock(sb);
> wake_up(&sb->s_writers.wait_unfrozen);
> deactivate_locked_super(sb);
> return ret;
> @@ -1400,8 +1350,10 @@ int thaw_super(struct super_block *sb)
> return -EINVAL;
> }
>
> - if (sb->s_flags & MS_RDONLY)
> + if (sb->s_flags & MS_RDONLY) {
> + sb->s_writers.frozen = SB_UNFROZEN;
> goto out;
> + }
>
> if (sb->s_op->unfreeze_fs) {
> error = sb->s_op->unfreeze_fs(sb);
> @@ -1413,12 +1365,11 @@ int thaw_super(struct super_block *sb)
> }
> }
>
> -out:
> sb->s_writers.frozen = SB_UNFROZEN;
> - smp_wmb();
> + sb_freeze_unlock(sb);
> +out:
> wake_up(&sb->s_writers.wait_unfrozen);
> deactivate_locked_super(sb);
> -
> return 0;
> }
> EXPORT_SYMBOL(thaw_super);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 6addccc..fb2cb4a 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1,7 +1,6 @@
> #ifndef _LINUX_FS_H
> #define _LINUX_FS_H
>
> -
> #include <linux/linkage.h>
> #include <linux/wait.h>
> #include <linux/kdev_t.h>
> @@ -31,6 +30,7 @@
> #include <linux/percpu-rwsem.h>
> #include <linux/blk_types.h>
> #include <linux/workqueue.h>
> +#include <linux/percpu-rwsem.h>
>
> #include <asm/byteorder.h>
> #include <uapi/linux/fs.h>
> @@ -1247,16 +1247,9 @@ enum {
> #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
>
> struct sb_writers {
> - /* Counters for counting writers at each level */
> - struct percpu_counter counter[SB_FREEZE_LEVELS];
> - wait_queue_head_t wait; /* queue for waiting for
> - writers / faults to finish */
> - int frozen; /* Is sb frozen? */
> - wait_queue_head_t wait_unfrozen; /* queue for waiting for
> - sb to be thawed */
> -#ifdef CONFIG_DEBUG_LOCK_ALLOC
> - struct lockdep_map lock_map[SB_FREEZE_LEVELS];
> -#endif
> + int frozen; /* Is sb frozen? */
> + wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */
> + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
> };
>
> struct super_block {
> @@ -1364,9 +1357,9 @@ void __sb_end_write(struct super_block *sb, int level);
> int __sb_start_write(struct super_block *sb, int level, bool wait);
>
> #define __sb_acquire_write(sb, lev) \
> - rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_)
> + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
> #define __sb_release_write(sb, lev) \
> - rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_)
> + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
>
> /**
> * sb_end_write - drop write access to a superblock
> --
> 1.5.5.1
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/