[PATCH 4/4] change sb_writers to use percpu_rw_semaphore

From: Oleg Nesterov
Date: Mon Jul 13 2015 - 17:27:46 EST


__sb_start/end_write() can use percpu_down/up_read().
---
fs/super.c | 135 +++++++++++++++++-----------------------------------
include/linux/fs.h | 14 +----
2 files changed, 47 insertions(+), 102 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 94303fc..6e336b8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -147,7 +147,7 @@ static void destroy_super(struct super_block *s)
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
for (i = 0; i < SB_FREEZE_LEVELS; i++)
- percpu_counter_destroy(&s->s_writers.counter[i]);
+ percpu_free_rwsem(s->s_writers.rw_sem + i);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
kfree(s->s_subtype);
@@ -178,14 +178,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
goto fail;

for (i = 0; i < SB_FREEZE_LEVELS; i++) {
- if (percpu_counter_init(&s->s_writers.counter[i], 0,
- GFP_KERNEL) < 0)
- goto fail;
- lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
- &type->s_writers_key[i], 0);
+ __percpu_init_rwsem(&s->s_writers.rw_sem[i],
+ sb_writers_name[i],
+ &type->s_writers_key[i]);
}
- init_waitqueue_head(&s->s_writers.wait);
- init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
INIT_HLIST_NODE(&s->s_instances);
@@ -1146,43 +1142,10 @@ out:
*/
void __sb_end_write(struct super_block *sb, int level)
{
- percpu_counter_dec(&sb->s_writers.counter[level-1]);
- /*
- * Make sure s_writers are updated before we wake up waiters in
- * freeze_super().
- */
- smp_mb();
- if (waitqueue_active(&sb->s_writers.wait))
- wake_up(&sb->s_writers.wait);
- rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+ percpu_up_read(sb->s_writers.rw_sem + level-1);
}
EXPORT_SYMBOL(__sb_end_write);

-#ifdef CONFIG_LOCKDEP
-/*
- * We want lockdep to tell us about possible deadlocks with freezing but
- * it's it bit tricky to properly instrument it. Getting a freeze protection
- * works as getting a read lock but there are subtle problems. XFS for example
- * gets freeze protection on internal level twice in some cases, which is OK
- * only because we already hold a freeze protection also on higher level. Due
- * to these cases we have to tell lockdep we are doing trylock when we
- * already hold a freeze protection for a higher freeze level.
- */
-static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
- unsigned long ip)
-{
- int i;
-
- if (!trylock) {
- for (i = 0; i < level - 1; i++)
- if (lock_is_held(&sb->s_writers.lock_map[i])) {
- trylock = true;
- break;
- }
- }
- rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
-}
-#endif

/*
* This is an internal function, please use sb_start_{write,pagefault,intwrite}
@@ -1190,27 +1153,15 @@ static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
*/
int __sb_start_write(struct super_block *sb, int level, bool wait)
{
-retry:
- if (unlikely(sb->s_writers.frozen >= level)) {
- if (!wait)
- return 0;
- wait_event(sb->s_writers.wait_unfrozen,
- sb->s_writers.frozen < level);
- }
-
-#ifdef CONFIG_LOCKDEP
- acquire_freeze_lock(sb, level, !wait, _RET_IP_);
-#endif
- percpu_counter_inc(&sb->s_writers.counter[level-1]);
/*
- * Make sure counter is updated before we check for frozen.
- * freeze_super() first sets frozen and then checks the counter.
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ * !!!!!!!!! percpu_down_read_trylock() wasn't merged yet !!!!!!!!!!!!!
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
*/
- smp_mb();
- if (unlikely(sb->s_writers.frozen >= level)) {
- __sb_end_write(sb, level);
- goto retry;
- }
+ if (!wait)
+ return 0;
+
+ percpu_down_read(sb->s_writers.rw_sem + level-1);
return 1;
}
EXPORT_SYMBOL(__sb_start_write);
@@ -1227,42 +1178,46 @@ EXPORT_SYMBOL(__sb_start_write);
*/
static void sb_wait_write(struct super_block *sb, int level)
{
- s64 writers;
-
- rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
-
- do {
- DEFINE_WAIT(wait);
-
- /*
- * We use a barrier in prepare_to_wait() to separate setting
- * of frozen and checking of the counter
- */
- prepare_to_wait(&sb->s_writers.wait, &wait,
- TASK_UNINTERRUPTIBLE);
-
- writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
- if (writers)
- schedule();
-
- finish_wait(&sb->s_writers.wait, &wait);
- } while (writers);
+ percpu_down_write(sb->s_writers.rw_sem + level-1);
}

+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ * !!!!!!!!!!! Move this code into kernel/locking/percpu-rwsem.c !!!!!!!!!!!!!!
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+#include "../kernel/locking/rwsem.h"
/* Avoid the warning from lockdep_sys_exit() */
static void sb_lockdep_release(struct super_block *sb)
{
int level;
+ struct percpu_rw_semaphore *sem;
+
+ for (level = 0; level < SB_FREEZE_LEVELS; ++level) {
+ sem = sb->s_writers.rw_sem + level;
+ rwsem_clear_owner(&sem->rw_sem);
+ rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+ }
+}
+
+static void sb_lockdep_acquire(struct super_block *sb)
+{
+ int level;
+ struct percpu_rw_semaphore *sem;

for (level = 0; level < SB_FREEZE_LEVELS; ++level) {
- rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
+ sem = sb->s_writers.rw_sem + level;
+ rwsem_set_owner(&sem->rw_sem); /* unneeded */
+ rwsem_acquire(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
}
}

static void sb_unlock_frozen(struct super_block *sb)
{
- smp_wmb();
- wake_up(&sb->s_writers.wait_unfrozen);
+ int level;
+
+ for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+ percpu_up_write(sb->s_writers.rw_sem + level);
}

/**
@@ -1323,8 +1278,6 @@ int freeze_super(struct super_block *sb)

/* From now on, no new normal writers can start */
sb->s_writers.frozen = SB_FREEZE_WRITE;
- smp_wmb();
-
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
up_write(&sb->s_umount);

@@ -1332,9 +1285,8 @@ int freeze_super(struct super_block *sb)

/* Now we go and block page faults... */
down_write(&sb->s_umount);
- sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
- smp_wmb();

+ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
sb_wait_write(sb, SB_FREEZE_PAGEFAULT);

/* All writers are done so after syncing there won't be dirty data */
@@ -1342,7 +1294,6 @@ int freeze_super(struct super_block *sb)

/* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS;
- smp_wmb();
sb_wait_write(sb, SB_FREEZE_FS);

if (sb->s_op->freeze_fs) {
@@ -1350,8 +1301,8 @@ int freeze_super(struct super_block *sb)
if (ret) {
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
- sb->s_writers.frozen = SB_UNFROZEN;
sb_unlock_frozen(sb);
+ sb->s_writers.frozen = SB_UNFROZEN;
deactivate_locked_super(sb);
return ret;
}
@@ -1386,6 +1337,8 @@ int thaw_super(struct super_block *sb)
if (sb->s_flags & MS_RDONLY)
goto out;

+ sb_lockdep_acquire(sb);
+
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
if (error) {
@@ -1396,9 +1349,9 @@ int thaw_super(struct super_block *sb)
}
}

+ sb_unlock_frozen(sb);
out:
sb->s_writers.frozen = SB_UNFROZEN;
- sb_unlock_frozen(sb);
deactivate_locked_super(sb);

return 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 35ec87e..314e2d0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1,7 +1,6 @@
#ifndef _LINUX_FS_H
#define _LINUX_FS_H

-
#include <linux/linkage.h>
#include <linux/wait.h>
#include <linux/kdev_t.h>
@@ -30,6 +29,7 @@
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/blk_types.h>
+#include <linux/percpu-rwsem.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -1246,16 +1246,8 @@ enum {
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)

struct sb_writers {
- /* Counters for counting writers at each level */
- struct percpu_counter counter[SB_FREEZE_LEVELS];
- wait_queue_head_t wait; /* queue for waiting for
- writers / faults to finish */
- int frozen; /* Is sb frozen? */
- wait_queue_head_t wait_unfrozen; /* queue for waiting for
- sb to be thawed */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map lock_map[SB_FREEZE_LEVELS];
-#endif
+ int frozen; /* Is sb frozen? */
+ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
};

struct super_block {
--
1.5.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/