[PATCH v2 8/8] change sb_writers to use percpu_rw_semaphore

From: Oleg Nesterov
Date: Tue Aug 11 2015 - 13:06:41 EST


We can remove everything from struct sb_writers except frozen
and add the array of percpu_rw_semaphore's instead.

This patch doesn't remove sb_writers->wait_unfrozen yet, we keep
it for get_super_thawed(). We will probably remove it later.

This change tries to address the following problems:

- Firstly, __sb_start_write() looks simply buggy. It does
__sb_end_write() if it sees ->frozen, but if it migrates
to another CPU before percpu_counter_dec(), sb_wait_write()
can wrongly succeed if there is another task which holds
the same "semaphore": sb_wait_write() can miss the result
of the previous percpu_counter_inc() but see the result
of this percpu_counter_dec().

- As Dave Hansen reports, it is suboptimal. The trivial
microbenchmark that writes to a tmpfs file in a loop runs
12% faster if we change this code to rely on RCU and kill
the memory barriers.

- This code doesn't look simple. It would be better to rely
on the generic locking code.

According to Dave, this change adds the same performance
improvement.

Note: with this change both freeze_super() and thaw_super() will do
synchronize_sched_expedited() 3 times. This is just ugly. But:

- This will be "fixed" by the rcu_sync changes we are going
to merge. After that freeze_super()->percpu_down_write()
will use synchronize_sched(), and thaw_super() won't use
synchronize() at all.

This doesn't need any changes in fs/super.c.

- Once we merge rcu_sync changes, we can also change super.c
so that all wb_write->rw_sem's will share the single ->rss
in struct sb_writes, then freeze_super() will need only one
synchronize_sched().

Signed-off-by: Oleg Nesterov <oleg@xxxxxxxxxx>
---
fs/super.c | 107 ++++++++++++++--------------------------------------
include/linux/fs.h | 19 +++------
2 files changed, 35 insertions(+), 91 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 75436e2..8762997 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work)
int i;

for (i = 0; i < SB_FREEZE_LEVELS; i++)
- percpu_counter_destroy(&s->s_writers.counter[i]);
+ percpu_free_rwsem(&s->s_writers.rw_sem[i]);
kfree(s);
}

@@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
goto fail;

for (i = 0; i < SB_FREEZE_LEVELS; i++) {
- if (percpu_counter_init(&s->s_writers.counter[i], 0,
- GFP_KERNEL) < 0)
+ if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
+ sb_writers_name[i],
+ &type->s_writers_key[i]))
goto fail;
- lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
- &type->s_writers_key[i], 0);
}
- init_waitqueue_head(&s->s_writers.wait);
init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
@@ -1161,47 +1159,10 @@ out:
*/
void __sb_end_write(struct super_block *sb, int level)
{
- percpu_counter_dec(&sb->s_writers.counter[level-1]);
- /*
- * Make sure s_writers are updated before we wake up waiters in
- * freeze_super().
- */
- smp_mb();
- if (waitqueue_active(&sb->s_writers.wait))
- wake_up(&sb->s_writers.wait);
- rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+ percpu_up_read(sb->s_writers.rw_sem + level-1);
}
EXPORT_SYMBOL(__sb_end_write);

-static int do_sb_start_write(struct super_block *sb, int level, bool wait,
- unsigned long ip)
-{
- if (wait)
- rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip);
-retry:
- if (unlikely(sb->s_writers.frozen >= level)) {
- if (!wait)
- return 0;
- wait_event(sb->s_writers.wait_unfrozen,
- sb->s_writers.frozen < level);
- }
-
- percpu_counter_inc(&sb->s_writers.counter[level-1]);
- /*
- * Make sure counter is updated before we check for frozen.
- * freeze_super() first sets frozen and then checks the counter.
- */
- smp_mb();
- if (unlikely(sb->s_writers.frozen >= level)) {
- __sb_end_write(sb, level);
- goto retry;
- }
-
- if (!wait)
- rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip);
- return 1;
-}
-
/*
* This is an internal function, please use sb_start_{write,pagefault,intwrite}
* instead.
@@ -1209,7 +1170,7 @@ retry:
int __sb_start_write(struct super_block *sb, int level, bool wait)
{
bool force_trylock = false;
- int ret;
+ int ret = 1;

#ifdef CONFIG_LOCKDEP
/*
@@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait)
int i;

for (i = 0; i < level - 1; i++)
- if (lock_is_held(&sb->s_writers.lock_map[i])) {
+ if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
force_trylock = true;
break;
}
}
#endif
- ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_);
+ if (wait && !force_trylock)
+ percpu_down_read(sb->s_writers.rw_sem + level-1);
+ else
+ ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
+
WARN_ON(force_trylock & !ret);
return ret;
}
@@ -1249,9 +1214,7 @@ EXPORT_SYMBOL(__sb_start_write);
*/
static void sb_wait_write(struct super_block *sb, int level)
{
- s64 writers;
-
- rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
+ percpu_down_write(sb->s_writers.rw_sem + level-1);
/*
* We are going to return to userspace and forget about this lock, the
* ownership goes to the caller of thaw_super() which does unlock.
@@ -1262,24 +1225,18 @@ static void sb_wait_write(struct super_block *sb, int level)
* this leads to lockdep false-positives, so currently we do the early
* release right after acquire.
*/
- rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
-
- do {
- DEFINE_WAIT(wait);
+ percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
+}

- /*
- * We use a barrier in prepare_to_wait() to separate setting
- * of frozen and checking of the counter
- */
- prepare_to_wait(&sb->s_writers.wait, &wait,
- TASK_UNINTERRUPTIBLE);
+static void sb_freeze_unlock(struct super_block *sb)
+{
+ int level;

- writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
- if (writers)
- schedule();
+ for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+ percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);

- finish_wait(&sb->s_writers.wait, &wait);
- } while (writers);
+ for (level = SB_FREEZE_LEVELS; --level >= 0; )
+ percpu_up_write(sb->s_writers.rw_sem + level);
}

/**
@@ -1338,20 +1295,14 @@ int freeze_super(struct super_block *sb)
return 0;
}

- /* From now on, no new normal writers can start */
sb->s_writers.frozen = SB_FREEZE_WRITE;
- smp_wmb();
-
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
up_write(&sb->s_umount);
-
sb_wait_write(sb, SB_FREEZE_WRITE);
+ down_write(&sb->s_umount);

/* Now we go and block page faults... */
- down_write(&sb->s_umount);
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
- smp_wmb();
-
sb_wait_write(sb, SB_FREEZE_PAGEFAULT);

/* All writers are done so after syncing there won't be dirty data */
@@ -1359,7 +1310,6 @@ int freeze_super(struct super_block *sb)

/* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS;
- smp_wmb();
sb_wait_write(sb, SB_FREEZE_FS);

if (sb->s_op->freeze_fs) {
@@ -1368,7 +1318,7 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN;
- smp_wmb();
+ sb_freeze_unlock(sb);
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return ret;
@@ -1400,8 +1350,10 @@ int thaw_super(struct super_block *sb)
return -EINVAL;
}

- if (sb->s_flags & MS_RDONLY)
+ if (sb->s_flags & MS_RDONLY) {
+ sb->s_writers.frozen = SB_UNFROZEN;
goto out;
+ }

if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
@@ -1413,12 +1365,11 @@ int thaw_super(struct super_block *sb)
}
}

-out:
sb->s_writers.frozen = SB_UNFROZEN;
- smp_wmb();
+ sb_freeze_unlock(sb);
+out:
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
-
return 0;
}
EXPORT_SYMBOL(thaw_super);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6addccc..fb2cb4a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1,7 +1,6 @@
#ifndef _LINUX_FS_H
#define _LINUX_FS_H

-
#include <linux/linkage.h>
#include <linux/wait.h>
#include <linux/kdev_t.h>
@@ -31,6 +30,7 @@
#include <linux/percpu-rwsem.h>
#include <linux/blk_types.h>
#include <linux/workqueue.h>
+#include <linux/percpu-rwsem.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -1247,16 +1247,9 @@ enum {
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)

struct sb_writers {
- /* Counters for counting writers at each level */
- struct percpu_counter counter[SB_FREEZE_LEVELS];
- wait_queue_head_t wait; /* queue for waiting for
- writers / faults to finish */
- int frozen; /* Is sb frozen? */
- wait_queue_head_t wait_unfrozen; /* queue for waiting for
- sb to be thawed */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map lock_map[SB_FREEZE_LEVELS];
-#endif
+ int frozen; /* Is sb frozen? */
+ wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */
+ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
};

struct super_block {
@@ -1364,9 +1357,9 @@ void __sb_end_write(struct super_block *sb, int level);
int __sb_start_write(struct super_block *sb, int level, bool wait);

#define __sb_acquire_write(sb, lev) \
- rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_)
+ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
#define __sb_release_write(sb, lev) \
- rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_)
+ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)

/**
* sb_end_write - drop write access to a superblock
--
1.5.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/