[PATCH 4/7] fs: break the file_list_lock for sb->s_files

From: Peter Zijlstra
Date: Sun Jan 28 2007 - 07:12:03 EST


Break the protection of sb->s_files out from under the global file_list_lock.

sb->s_files is converted to a lock_list. furthermore to prevent the
lock_list_head of getting too contended with concurrent add operations
the add is buffered in per cpu filevecs.

This would ordinarily require a flush before a delete operation - to ensure
the to be deleted entry is indeed added to the list. This is avoided by
storing a pointer to the filevec location in the not yet used list_head.
This pointer can then be used to clear the filevec entry before its actually
added.

The file_flag mess is a bit unfortunate - this can be removed by also
converting tty->tty_files to a lock_list (TODO).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
---
fs/dquot.c | 10 +-
fs/file_table.c | 170 +++++++++++++++++++++++++++++++++++++++----
fs/open.c | 2
fs/proc/generic.c | 8 --
fs/super.c | 7 -
include/linux/fs.h | 23 +++++
mm/readahead.c | 2
security/selinux/selinuxfs.c | 9 +-
8 files changed, 194 insertions(+), 37 deletions(-)

Index: linux-2.6/fs/dquot.c
===================================================================
--- linux-2.6.orig/fs/dquot.c 2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/dquot.c 2007-01-27 21:07:44.000000000 +0100
@@ -688,23 +688,21 @@ static int dqinit_needed(struct inode *i
/* This routine is guarded by dqonoff_mutex mutex */
static void add_dquot_ref(struct super_block *sb, int type)
{
- struct list_head *p;
+ struct file *filp;

restart:
- file_list_lock();
- list_for_each(p, &sb->s_files) {
- struct file *filp = list_entry(p, struct file, f_u.fu_list);
+ filevec_add_drain_all();
+ lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
struct inode *inode = filp->f_path.dentry->d_inode;
if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) {
struct dentry *dentry = dget(filp->f_path.dentry);
- file_list_unlock();
+ lock_list_for_each_entry_stop(filp, f_u.fu_llist);
sb->dq_op->initialize(inode, type);
dput(dentry);
/* As we may have blocked we had better restart... */
goto restart;
}
}
- file_list_unlock();
}

/* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */
Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c 2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/file_table.c 2007-01-27 21:07:44.000000000 +0100
@@ -113,7 +113,7 @@ struct file *get_empty_filp(void)
goto fail_sec;

tsk = current;
- INIT_LIST_HEAD(&f->f_u.fu_list);
+ INIT_LOCK_LIST_HEAD(&f->f_u.fu_llist);
atomic_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
f->f_uid = tsk->fsuid;
@@ -245,32 +245,175 @@ void put_filp(struct file *file)
}
}

-void file_move(struct file *file, struct list_head *list)
+enum {
+ FILEVEC_SIZE = 15
+};
+
+struct filevec {
+ unsigned long nr;
+ struct file *files[FILEVEC_SIZE];
+};
+
+static DEFINE_PER_CPU(struct filevec, sb_fvec);
+
+static inline unsigned int filevec_size(struct filevec *fvec)
{
- if (!list)
- return;
- file_list_lock();
- list_move(&file->f_u.fu_list, list);
- file_list_unlock();
+ return FILEVEC_SIZE - fvec->nr;
+}
+
+static inline unsigned int filevec_count(struct filevec *fvec)
+{
+ return fvec->nr;
+}
+
+static inline void filevec_reinit(struct filevec *fvec)
+{
+ fvec->nr = 0;
+}
+
+static inline unsigned int filevec_add(struct filevec *fvec, struct file *filp)
+{
+ rcu_assign_pointer(fvec->files[fvec->nr], filp);
+
+ /*
+ * Here we do icky stuff in order to avoid flushing the per cpu filevec
+ * on list removal.
+ *
+ * We store the location on the per cpu filevec in the as of yet unused
+ * fu_llist.next field and toggle bit 0 to indicate we done so. This
+ * allows the removal code to set the filevec entry to NULL, thereby
+ * avoiding the list add.
+ *
+ * Abuse the fu_llist.lock for protection.
+ */
+ spin_lock(&filp->f_u.fu_llist.lock);
+ filp->f_u.fu_llist.next = (void *)&fvec->files[fvec->nr];
+ __set_bit(0, (void *)&filp->f_u.fu_llist.next);
+ spin_unlock(&filp->f_u.fu_llist.lock);
+
+ fvec->nr++;
+ return filevec_size(fvec);
+}
+
+static void __filevec_add(struct filevec *fvec)
+{
+ int i;
+
+ for (i = 0; i < filevec_count(fvec); i++) {
+ struct file *filp;
+
+ /*
+ * see the comment in filevec_add();
+ * need RCU because a concurrent remove might have deleted
+ * the entry from under us.
+ */
+ rcu_read_lock();
+ filp = rcu_dereference(fvec->files[i]);
+ /*
+ * the simple case, its gone - NEXT!
+ */
+ if (!filp) {
+ rcu_read_unlock();
+ continue;
+ }
+
+ spin_lock(&filp->f_u.fu_llist.lock);
+ /*
+ * If the entry really is still there, add it!
+ */
+ if (rcu_dereference(fvec->files[i])) {
+ struct super_block *sb =
+ filp->f_mapping->host->i_sb;
+
+ __lock_list_add(&filp->f_u.fu_llist, &sb->s_files);
+ }
+ spin_unlock(&filp->f_u.fu_llist.lock);
+ rcu_read_unlock();
+ }
+ filevec_reinit(fvec);
+}
+
+static void filevec_add_drain(void)
+{
+ struct filevec *fvec = &get_cpu_var(sb_fvec);
+ if (filevec_count(fvec))
+ __filevec_add(fvec);
+ put_cpu_var(sb_fvec);
}

+static void filevec_add_drain_per_cpu(struct work_struct *dummy)
+{
+ filevec_add_drain();
+}
+
+int filevec_add_drain_all(void)
+{
+ return schedule_on_each_cpu(filevec_add_drain_per_cpu);
+}
+EXPORT_SYMBOL_GPL(filevec_add_drain_all);
+
void file_kill(struct file *file)
{
- if (!list_empty(&file->f_u.fu_list)) {
+ if (file_flag(file, F_SUPERBLOCK)) {
+ void **ptr;
+
+ file_flag_clear(file, F_SUPERBLOCK);
+
+ /*
+ * If bit 0 of the fu_llist.next pointer is set we're still
+ * enqueued on a per cpu filevec, in that case clear the entry
+ * and we're done.
+ */
+ spin_lock(&file->f_u.fu_llist.lock);
+ ptr = (void **)file->f_u.fu_llist.next;
+ if (__test_and_clear_bit(0, (void *)&ptr)) {
+ rcu_assign_pointer(*ptr, NULL);
+ INIT_LIST_HEAD(&file->f_u.fu_llist.head);
+ spin_unlock(&file->f_u.fu_llist.lock);
+ return;
+ }
+ spin_unlock(&file->f_u.fu_llist.lock);
+
+ if (!list_empty(&file->f_u.fu_list))
+ lock_list_del_init(&file->f_u.fu_llist);
+
+ } else if (!list_empty(&file->f_u.fu_list)) {
file_list_lock();
list_del_init(&file->f_u.fu_list);
file_list_unlock();
}
}

+void file_move(struct file *file, struct list_head *list)
+{
+ struct super_block *sb;
+
+ if (!list)
+ return;
+
+ file_kill(file);
+
+ sb = file->f_mapping->host->i_sb;
+ if (list == &sb->s_files.head) {
+ struct filevec *fvec = &get_cpu_var(sb_fvec);
+ file_flag_set(file, F_SUPERBLOCK);
+ if (!filevec_add(fvec, file))
+ __filevec_add(fvec);
+ put_cpu_var(sb_fvec);
+ } else {
+ file_list_lock();
+ list_add(&file->f_u.fu_list, list);
+ file_list_unlock();
+ }
+}
+
int fs_may_remount_ro(struct super_block *sb)
{
- struct list_head *p;
+ struct file *file;

/* Check that no files are currently opened for writing. */
- file_list_lock();
- list_for_each(p, &sb->s_files) {
- struct file *file = list_entry(p, struct file, f_u.fu_list);
+ filevec_add_drain_all();
+ lock_list_for_each_entry(file, &sb->s_files, f_u.fu_llist) {
struct inode *inode = file->f_path.dentry->d_inode;

/* File with pending delete? */
@@ -281,10 +424,9 @@ int fs_may_remount_ro(struct super_block
if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
goto too_bad;
}
- file_list_unlock();
return 1; /* Tis' cool bro. */
too_bad:
- file_list_unlock();
+ lock_list_for_each_entry_stop(file, f_u.fu_llist);
return 0;
}

Index: linux-2.6/fs/open.c
===================================================================
--- linux-2.6.orig/fs/open.c 2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/open.c 2007-01-27 21:07:44.000000000 +0100
@@ -692,7 +692,7 @@ static struct file *__dentry_open(struct
f->f_path.mnt = mnt;
f->f_pos = 0;
f->f_op = fops_get(inode->i_fop);
- file_move(f, &inode->i_sb->s_files);
+ file_move(f, &inode->i_sb->s_files.head);

if (!open && f->f_op)
open = f->f_op->open;
Index: linux-2.6/fs/proc/generic.c
===================================================================
--- linux-2.6.orig/fs/proc/generic.c 2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/proc/generic.c 2007-01-27 21:07:44.000000000 +0100
@@ -549,15 +549,14 @@ static int proc_register(struct proc_dir
*/
static void proc_kill_inodes(struct proc_dir_entry *de)
{
- struct list_head *p;
+ struct file *filp;
struct super_block *sb = proc_mnt->mnt_sb;

/*
* Actually it's a partial revoke().
*/
- file_list_lock();
- list_for_each(p, &sb->s_files) {
- struct file * filp = list_entry(p, struct file, f_u.fu_list);
+ filevec_add_drain_all();
+ lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
struct dentry * dentry = filp->f_path.dentry;
struct inode * inode;
const struct file_operations *fops;
@@ -571,7 +570,6 @@ static void proc_kill_inodes(struct proc
filp->f_op = NULL;
fops_put(fops);
}
- file_list_unlock();
}

static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c 2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/super.c 2007-01-27 21:07:44.000000000 +0100
@@ -67,7 +67,7 @@ static struct super_block *alloc_super(s
}
INIT_LIST_HEAD(&s->s_dirty);
INIT_LIST_HEAD(&s->s_io);
- INIT_LIST_HEAD(&s->s_files);
+ INIT_LOCK_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
@@ -568,12 +568,11 @@ static void mark_files_ro(struct super_b
{
struct file *f;

- file_list_lock();
- list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+ filevec_add_drain_all();
+ lock_list_for_each_entry(f, &sb->s_files, f_u.fu_llist) {
if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f))
f->f_mode &= ~FMODE_WRITE;
}
- file_list_unlock();
}

/**
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h 2007-01-22 22:43:28.000000000 +0100
+++ linux-2.6/include/linux/fs.h 2007-01-27 21:07:44.000000000 +0100
@@ -275,6 +275,7 @@ extern int dir_notify_enable;
#include <linux/cache.h>
#include <linux/kobject.h>
#include <linux/list.h>
+#include <linux/lock_list.h>
#include <linux/radix-tree.h>
#include <linux/prio_tree.h>
#include <linux/init.h>
@@ -709,9 +710,13 @@ struct file {
/*
* fu_list becomes invalid after file_free is called and queued via
* fu_rcuhead for RCU freeing
+ * fu_llist is used for the superblock s_files list; its crucial that
+ * the spinlock contained therein is not clobbered by other uses of
+ * the union.
*/
union {
struct list_head fu_list;
+ struct lock_list_head fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path;
@@ -744,9 +749,25 @@ extern spinlock_t files_lock;
#define file_list_lock() spin_lock(&files_lock);
#define file_list_unlock() spin_unlock(&files_lock);

+/*
+ * steal the upper 8 bits from the read-a-head flags
+ */
+#define F_SHIFT 24
+
+#define F_SUPERBLOCK 0
+
+#define file_flag_set(file, flag) \
+ __set_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+#define file_flag_clear(file, flag) \
+ __clear_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+#define file_flag(file, flag) \
+ test_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+
#define get_file(x) atomic_inc(&(x)->f_count)
#define file_count(x) atomic_read(&(x)->f_count)

+extern int filevec_add_drain_all(void);
+
#define MAX_NON_LFS ((1UL<<31) - 1)

/* Page cache limit. The filesystems should put that into their s_maxbytes
@@ -928,7 +949,7 @@ struct super_block {
struct list_head s_dirty; /* dirty inodes */
struct list_head s_io; /* parked for writeback */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
- struct list_head s_files;
+ struct lock_list_head s_files;

struct block_device *s_bdev;
struct list_head s_instances;
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c 2007-01-22 22:43:29.000000000 +0100
+++ linux-2.6/mm/readahead.c 2007-01-27 21:07:44.000000000 +0100
@@ -69,7 +69,7 @@ static inline void reset_ahead_window(st
static inline void ra_off(struct file_ra_state *ra)
{
ra->start = 0;
- ra->flags = 0;
+ ra->flags &= (~0UL) << F_SHIFT;
ra->size = 0;
reset_ahead_window(ra);
return;
Index: linux-2.6/security/selinux/selinuxfs.c
===================================================================
--- linux-2.6.orig/security/selinux/selinuxfs.c 2007-01-13 21:04:19.000000000 +0100
+++ linux-2.6/security/selinux/selinuxfs.c 2007-01-27 21:10:48.000000000 +0100
@@ -940,7 +940,8 @@ static struct file_operations sel_commit
* fs/proc/generic.c proc_kill_inodes */
static void sel_remove_bools(struct dentry *de)
{
- struct list_head *p, *node;
+ struct list_head *node;
+ struct file *filp;
struct super_block *sb = de->d_sb;

spin_lock(&dcache_lock);
@@ -962,9 +963,8 @@ static void sel_remove_bools(struct dent

spin_unlock(&dcache_lock);

- file_list_lock();
- list_for_each(p, &sb->s_files) {
- struct file * filp = list_entry(p, struct file, f_u.fu_list);
+ filevec_add_drain_all();
+ lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
struct dentry * dentry = filp->f_path.dentry;

if (dentry->d_parent != de) {
@@ -972,7 +972,6 @@ static void sel_remove_bools(struct dent
}
filp->f_op = NULL;
}
- file_list_unlock();
}

#define BOOL_DIR_NAME "booleans"

--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/