[PATCH] vfs: Turn the unlinked inode counter into a percpu counter

From: Andi Kleen
Date: Fri Jul 11 2014 - 16:24:18 EST


From: Andi Kleen <ak@xxxxxxxxxxxxxxx>

The unlinked open inodes super block counter that was added some time ago
unfortunately becomes a very hot cache line in some delete heavy
high IOPs or tmpfs workloads with enough cores, when files
are frequently removed. With TSX it also causes plenty of aborts.

Turn the atomic into a percpu_counter. It's nearly perfectly
suited for a percpu counter because it is only checked
in very slow paths, like remount.

Original commit:

commit 7ada4db88634429f4da690ad1c4eb73c93085f0c
Author: Miklos Szeredi <mszeredi@xxxxxxx>
Date: Mon Nov 21 12:11:32 2011 +0100

vfs: count unlinked inodes

Add a new counter to the superblock that keeps track of unlinked but
not yet deleted inodes.

Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: Miklos Szeredi <mszeredi@xxxxxxx>
Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
---
fs/ext2/super.c | 2 +-
fs/inode.c | 14 ++++++--------
fs/namespace.c | 4 ++--
fs/super.c | 3 +++
include/linux/fs.h | 2 +-
5 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 3750031..ac0ad9a 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1218,7 +1218,7 @@ static int ext2_freeze(struct super_block *sb)
* because we have unattached inodes and thus filesystem is not fully
* consistent.
*/
- if (atomic_long_read(&sb->s_remove_count)) {
+ if (percpu_counter_sum(&sb->s_remove_counters)) {
ext2_sync_fs(sb, 1);
return 0;
}
diff --git a/fs/inode.c b/fs/inode.c
index f96d2a6..4820c35 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -234,10 +234,8 @@ void __destroy_inode(struct inode *inode)
BUG_ON(inode_has_buffers(inode));
security_inode_free(inode);
fsnotify_inode_delete(inode);
- if (!inode->i_nlink) {
- WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
- atomic_long_dec(&inode->i_sb->s_remove_count);
- }
+ if (!inode->i_nlink)
+ percpu_counter_dec(&inode->i_sb->s_remove_counters);

#ifdef CONFIG_FS_POSIX_ACL
if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
@@ -281,7 +279,7 @@ void drop_nlink(struct inode *inode)
WARN_ON(inode->i_nlink == 0);
inode->__i_nlink--;
if (!inode->i_nlink)
- atomic_long_inc(&inode->i_sb->s_remove_count);
+ percpu_counter_inc(&inode->i_sb->s_remove_counters);
}
EXPORT_SYMBOL(drop_nlink);

@@ -297,7 +295,7 @@ void clear_nlink(struct inode *inode)
{
if (inode->i_nlink) {
inode->__i_nlink = 0;
- atomic_long_inc(&inode->i_sb->s_remove_count);
+ percpu_counter_inc(&inode->i_sb->s_remove_counters);
}
}
EXPORT_SYMBOL(clear_nlink);
@@ -317,7 +315,7 @@ void set_nlink(struct inode *inode, unsigned int nlink)
} else {
/* Yes, some filesystems do change nlink from zero to one */
if (inode->i_nlink == 0)
- atomic_long_dec(&inode->i_sb->s_remove_count);
+ percpu_counter_dec(&inode->i_sb->s_remove_counters);

inode->__i_nlink = nlink;
}
@@ -336,7 +334,7 @@ void inc_nlink(struct inode *inode)
{
if (unlikely(inode->i_nlink == 0)) {
WARN_ON(!(inode->i_state & I_LINKABLE));
- atomic_long_dec(&inode->i_sb->s_remove_count);
+ percpu_counter_dec(&inode->i_sb->s_remove_counters);
}

inode->__i_nlink++;
diff --git a/fs/namespace.c b/fs/namespace.c
index 182bc41..20d33d9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -535,7 +535,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
int err = 0;

/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
- if (atomic_long_read(&sb->s_remove_count))
+ if (percpu_counter_sum(&sb->s_remove_counters) != 0)
return -EBUSY;

lock_mount_hash();
@@ -549,7 +549,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
}
}
}
- if (!err && atomic_long_read(&sb->s_remove_count))
+ if (!err && percpu_counter_sum(&sb->s_remove_counters) != 0)
err = -EBUSY;

if (!err) {
diff --git a/fs/super.c b/fs/super.c
index 48377f7..b7a8a45 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -142,6 +142,7 @@ static void destroy_super(struct super_block *s)
list_lru_destroy(&s->s_inode_lru);
for (i = 0; i < SB_FREEZE_LEVELS; i++)
percpu_counter_destroy(&s->s_writers.counter[i]);
+ percpu_counter_destroy(&s->s_remove_counters);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
kfree(s->s_subtype);
@@ -171,6 +172,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
if (security_sb_alloc(s))
goto fail;

+ if (percpu_counter_init(&s->s_remove_counters, 0) < 0)
+ goto fail;
for (i = 0; i < SB_FREEZE_LEVELS; i++) {
if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0)
goto fail;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8780312..0374552 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1240,7 +1240,7 @@ struct super_block {
struct shrinker s_shrink; /* per-sb shrinker handle */

/* Number of inodes with nlink == 0 but still referenced */
- atomic_long_t s_remove_count;
+ struct percpu_counter s_remove_counters;

/* Being remounted read-only */
int s_readonly_remount;
--
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/