[PATCH] track number of mnts writing to superblocks

From: Dave Hansen
Date: Wed Jan 02 2008 - 16:51:30 EST



One of the benefits of the r/o bind mount patches is that they
make it explicit when a write to a superblock might occur.
We currently search sb->s_files when remounting rw->ro
to look for writable files. But that search is not
comprehensive, and it is racy. This patch replaces it.

The idea is to keep a bit in each mount saying whether the
mount has any writers on it. When the bit is set the first
time, we also increment a counter in the superblock. That
sb counter is the number of mounts with the bit set, and
thus the number of potential writers.
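
In sketch form, setting the bit for the first time on a
mount boils down to this (simplified from
mark_mnt_has_writer() in the patch below; the per-cpu
bookkeeping and the lock juggling around lock_super()
are elided):

	if (!test_and_set_bit(ilog2(MNT_MAY_HAVE_WRITERS),
			      &mnt->mnt_flags)) {
		/* first writer seen on this mount */
		lock_super(mnt->mnt_sb);
		atomic_inc(&mnt->mnt_sb->__s_mnt_writers);
		unlock_super(mnt->mnt_sb);
	}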

The other problem is that, after we check the number of
writable mounts, we need to exclude all new writers on
those mounts. We do this by requiring that the
superblock's mnt writer count be incremented under
lock_super(), and by holding that lock over the remount
operation. Effectively, this keeps us from *adding* to
the sb's writable mounts during a remount.
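
For reference, a write path brackets its work with the
pair this series already provides; some_fs_write_op()
below is a hypothetical stand-in for whatever actually
dirties the filesystem:

	int do_write_op(struct vfsmount *mnt)
	{
		int err;

		/* sets MNT_MAY_HAVE_WRITERS; fails with -EROFS
		 * if the mount is already read-only */
		err = mnt_want_write(mnt);
		if (err)
			return err;
		err = some_fs_write_op(mnt);
		mnt_drop_write(mnt);
		return err;
	}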

The alternative to doing this is to keep a much simpler
list of mounts for each superblock. I could also code
that up to see what it looks like. Shouldn't be too bad.


Signed-off-by: Dave Hansen <haveblue@xxxxxxxxxx>
---

 linux-2.6.git-dave/fs/file_table.c       |   24 -----
 linux-2.6.git-dave/fs/namespace.c        |  134 +++++++++++++++++++++++------
 linux-2.6.git-dave/fs/super.c            |   61 +++++++++++---
 linux-2.6.git-dave/include/linux/fs.h    |    5 -
 linux-2.6.git-dave/include/linux/mount.h |    3
5 files changed, 163 insertions(+), 64 deletions(-)

diff -puN fs/file_table.c~track_sb_mnt_writers fs/file_table.c
--- linux-2.6.git/fs/file_table.c~track_sb_mnt_writers 2008-01-02 10:49:11.000000000 -0800
+++ linux-2.6.git-dave/fs/file_table.c 2008-01-02 10:49:11.000000000 -0800
@@ -374,30 +374,6 @@ void file_kill(struct file *file)
}
}

-int fs_may_remount_ro(struct super_block *sb)
-{
- struct file *file;
-
- /* Check that no files are currently opened for writing. */
- file_list_lock();
- list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
- struct inode *inode = file->f_path.dentry->d_inode;
-
- /* File with pending delete? */
- if (inode->i_nlink == 0)
- goto too_bad;
-
- /* Writeable file? */
- if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
- goto too_bad;
- }
- file_list_unlock();
- return 1; /* Tis' cool bro. */
-too_bad:
- file_list_unlock();
- return 0;
-}
-
void __init files_init(unsigned long mempages)
{
int n;
diff -puN fs/namespace.c~track_sb_mnt_writers fs/namespace.c
--- linux-2.6.git/fs/namespace.c~track_sb_mnt_writers 2008-01-02 10:49:11.000000000 -0800
+++ linux-2.6.git-dave/fs/namespace.c 2008-01-02 13:39:52.000000000 -0800
@@ -118,7 +118,7 @@ struct mnt_writer {
* If holding multiple instances of this lock, they
* must be ordered by cpu number.
*/
- spinlock_t lock;
+ struct mutex lock;
struct lock_class_key lock_class; /* compiles out with !lockdep */
unsigned long count;
struct vfsmount *mnt;
@@ -130,7 +130,7 @@ static int __init init_mnt_writers(void)
int cpu;
for_each_possible_cpu(cpu) {
struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
- spin_lock_init(&writer->lock);
+ mutex_init(&writer->lock);
lockdep_set_class(&writer->lock, &writer->lock_class);
writer->count = 0;
}
@@ -145,11 +145,52 @@ static void mnt_unlock_cpus(void)

for_each_possible_cpu(cpu) {
cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_unlock(&cpu_writer->lock);
+ mutex_unlock(&cpu_writer->lock);
}
}

-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+static int mark_mnt_has_writer(struct vfsmount *mnt,
+ struct mnt_writer *cpu_writer)
+{
+ /*
+ * Ensure that if people race to set the bit,
+ * only one of them succeeds and goes on to
+ * increment the sb count.
+ */
+ if (test_and_set_bit(ilog2(MNT_MAY_HAVE_WRITERS), &mnt->mnt_flags))
+ return 0;
+ if (cpu_writer == NULL)
+ return 0;
+
+ /*
+ * Our goal here is to get exclusion from a superblock
+ * remount operation (fs_may_remount_ro()). This is
+ * effectively a slow path that we must go through the
+ * first time we set the bit on each mount, but never
+ * again unless the writer counts get coalesced.
+ */
+
+ mutex_unlock(&cpu_writer->lock);
+ lock_super(mnt->mnt_sb);
+
+ atomic_inc(&mnt->mnt_sb->__s_mnt_writers);
+
+ unlock_super(mnt->mnt_sb);
+ mutex_lock(&cpu_writer->lock);
+ return -EAGAIN;
+}
+
+static void __mark_sb_if_writable(struct vfsmount *mnt)
+{
+ int bitnr = ilog2(MNT_MAY_HAVE_WRITERS);
+
+ if (mnt && atomic_read(&mnt->__mnt_writers))
+ mark_mnt_has_writer(mnt, NULL);
+ else if (mnt && test_and_clear_bit(bitnr, &mnt->mnt_flags))
+ atomic_dec(&mnt->mnt_sb->__s_mnt_writers);
+}
+
+static inline void __move_cpu_mnt_count(struct mnt_writer *cpu_writer)
{
if (!cpu_writer->mnt)
return;
@@ -164,7 +205,7 @@ static inline void use_cpu_writer_for_mo
{
if (cpu_writer->mnt == mnt)
return;
- __clear_mnt_count(cpu_writer);
+ __move_cpu_mnt_count(cpu_writer);
cpu_writer->mnt = mnt;
}

@@ -188,20 +229,31 @@ static inline void use_cpu_writer_for_mo
*/
int mnt_want_write(struct vfsmount *mnt)
{
- int ret = 0;
+ int ret;
struct mnt_writer *cpu_writer;

- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ /*
+ * It is OK to not disable preemption here. We
+ * only use the per-cpu vars to reduce cache
+ * effects, and not for any real exclusion.
+ * We use the mutexes for that.
+ */
+ cpu_writer = &__get_cpu_var(mnt_writers);
+ mutex_lock(&cpu_writer->lock);
+again:
+ ret = 0;
if (__mnt_is_readonly(mnt)) {
ret = -EROFS;
goto out;
}
+ ret = mark_mnt_has_writer(mnt, cpu_writer);
+ /* did we hit the slow path and re-do the lock? */
+ if (ret == -EAGAIN)
+ goto again;
use_cpu_writer_for_mount(cpu_writer, mnt);
cpu_writer->count++;
out:
- spin_unlock(&cpu_writer->lock);
- put_cpu_var(mnt_writers);
+ mutex_unlock(&cpu_writer->lock);
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
@@ -213,13 +265,37 @@ static void lock_and_coalesce_cpu_mnt_wr

for_each_possible_cpu(cpu) {
cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
- __clear_mnt_count(cpu_writer);
+ mutex_lock(&cpu_writer->lock);
+ __move_cpu_mnt_count(cpu_writer);
+ /*
+ * __mnt_writers may temporarily hit zero
+ * (and trigger MNT_MAY_HAVE_WRITERS to get
+ * cleared), but it will get set again if
+ * and when another mnt_writer[] has an
+ * entry for that mnt later in this loop.
+ * Holding the locks keeps anyone from
+ * seeing this.
+ */
+ __mark_sb_if_writable(cpu_writer->mnt);
cpu_writer->mnt = NULL;
}
}

/*
+ * These are just external interfaces. I want
+ * to use the long names in here, but leave the
+ * simpler names for external users.
+ */
+void lock_mnt_writers(void)
+{
+ lock_and_coalesce_cpu_mnt_writer_counts();
+}
+void unlock_mnt_writers(void)
+{
+ mnt_unlock_cpus();
+}
+
+/*
* These per-cpu write counts are not guaranteed to have
* matched increments and decrements on any given cpu.
* A file open()ed for write on one cpu and close()d on
@@ -262,23 +338,40 @@ static void handle_write_count_underflow
* performing writes to it. Must be matched with
* mnt_want_write() call above.
*/
+#define MNT_WRITERS_UNDERFLOW_LIMIT 1024
void mnt_drop_write(struct vfsmount *mnt)
{
int must_check_underflow = 0;
struct mnt_writer *cpu_writer;

- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+retry:
+ cpu_writer = &__get_cpu_var(mnt_writers);
+ mutex_lock(&cpu_writer->lock);

use_cpu_writer_for_mount(cpu_writer, mnt);
if (cpu_writer->count > 0) {
cpu_writer->count--;
} else {
- must_check_underflow = 1;
+ /*
+ * Without this check, it is theoretically
+ * possible for a large number of processes
+ * to atomic_dec(__mnt_writers) and cause
+ * it to underflow. Because we are under
+ * the mutex (and there are NR_CPUS
+ * mutexes), this bounds the number of
+ * concurrent decrements, and thus the
+ * underflow past the limit, to NR_CPUS.
+ */
+ if (atomic_read(&mnt->__mnt_writers) <
+ -MNT_WRITERS_UNDERFLOW_LIMIT) {
+ mutex_unlock(&cpu_writer->lock);
+ goto retry;
+ }
atomic_dec(&mnt->__mnt_writers);
+ must_check_underflow = 1;
}

- spin_unlock(&cpu_writer->lock);
+ mutex_unlock(&cpu_writer->lock);
/*
* Logically, we could call this each time,
* but the __mnt_writers cacheline tends to
@@ -286,15 +379,6 @@ void mnt_drop_write(struct vfsmount *mnt
*/
if (must_check_underflow)
handle_write_count_underflow(mnt);
- /*
- * This could be done right after the spinlock
- * is taken because the spinlock keeps us on
- * the cpu, and disables preemption. However,
- * putting it here bounds the amount that
- * __mnt_writers can underflow. Without it,
- * we could theoretically wrap __mnt_writers.
- */
- put_cpu_var(mnt_writers);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

diff -puN include/linux/fs.h~track_sb_mnt_writers include/linux/fs.h
--- linux-2.6.git/include/linux/fs.h~track_sb_mnt_writers 2008-01-02 10:49:11.000000000 -0800
+++ linux-2.6.git-dave/include/linux/fs.h 2008-01-02 10:49:11.000000000 -0800
@@ -1005,6 +1005,9 @@ struct super_block {
#ifdef CONFIG_SECURITY
void *s_security;
#endif
+ atomic_t __s_mnt_writers; /* how many mounts of this sb
+ * might have writers. Only
+ * stable with lock_mnt_writers() */
struct xattr_handler **s_xattr;

struct list_head s_inodes; /* all inodes */
@@ -1650,8 +1653,6 @@ extern const struct file_operations read
extern const struct file_operations write_fifo_fops;
extern const struct file_operations rdwr_fifo_fops;

-extern int fs_may_remount_ro(struct super_block *);
-
#ifdef CONFIG_BLOCK
/*
* return READ, READA, or WRITE
diff -puN include/linux/mount.h~track_sb_mnt_writers include/linux/mount.h
--- linux-2.6.git/include/linux/mount.h~track_sb_mnt_writers 2008-01-02 10:49:11.000000000 -0800
+++ linux-2.6.git-dave/include/linux/mount.h 2008-01-02 10:49:11.000000000 -0800
@@ -33,6 +33,7 @@ struct mnt_namespace;

#define MNT_SHRINKABLE 0x100
#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
+#define MNT_MAY_HAVE_WRITERS 0x400 /* did this ever have a writer? */

#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -80,6 +81,8 @@ static inline struct vfsmount *mntget(st

extern int mnt_want_write(struct vfsmount *mnt);
extern void mnt_drop_write(struct vfsmount *mnt);
+extern void lock_mnt_writers(void);
+extern void unlock_mnt_writers(void);
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
extern void mnt_unpin(struct vfsmount *mnt);
diff -puN fs/super.c~track_sb_mnt_writers fs/super.c
--- linux-2.6.git/fs/super.c~track_sb_mnt_writers 2008-01-02 10:49:11.000000000 -0800
+++ linux-2.6.git-dave/fs/super.c 2008-01-02 13:38:15.000000000 -0800
@@ -574,6 +574,41 @@ static void mark_files_ro(struct super_b
file_list_unlock();
}

+static inline int fs_may_remount_ro(struct super_block *sb)
+{
+ int ret = 1;
+ lock_mnt_writers();
+ if (atomic_read(&sb->__s_mnt_writers))
+ ret = 0;
+ unlock_mnt_writers();
+ return ret;
+}
+
+/*
+ * This unconditionally acquires the sb lock,
+ * even if it returns an error code, and should
+ * not be used generally.
+ *
+ * This must be done to ensure that no new
+ * writers come in on the mounts after the
+ * check is performed.
+ */
+static int __try_to_make_sb_ro(struct super_block *sb, int force)
+{
+ int retval = 0;
+
+ if (force) {
+ mark_files_ro(sb);
+ lock_super(sb);
+ } else {
+ lock_super(sb);
+ /* Make sure there are no mounts that have writers */
+ if (!fs_may_remount_ro(sb))
+ retval = -EBUSY;
+ }
+ return retval;
+}
+
/**
* do_remount_sb - asks filesystem to change mount options.
* @sb: superblock in question
@@ -585,7 +620,7 @@ static void mark_files_ro(struct super_b
*/
int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
- int retval;
+ int retval = 0;

#ifdef CONFIG_BLOCK
if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
@@ -596,23 +631,23 @@ int do_remount_sb(struct super_block *sb
shrink_dcache_sb(sb);
fsync_super(sb);

- /* If we are remounting RDONLY and current sb is read/write,
- make sure there are no rw files opened */
- if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
- if (force)
- mark_files_ro(sb);
- else if (!fs_may_remount_ro(sb))
- return -EBUSY;
- }
-
- if (sb->s_op->remount_fs) {
+ /* Are we remounting RDONLY when current sb is read/write? */
+ if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
+ retval = __try_to_make_sb_ro(sb, force);
+ else
lock_super(sb);
+
+ if (retval)
+ goto out;
+ if (sb->s_op->remount_fs)
retval = sb->s_op->remount_fs(sb, &flags, data);
+out:
+ if (retval) {
unlock_super(sb);
- if (retval)
- return retval;
+ return retval;
}
sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
+ unlock_super(sb);
return 0;
}

_