[PATCH] kernfs: Change kernfs_rwsem to a per-cpu rwsem.

From: Imran Khan
Date: Sun Jun 19 2022 - 23:46:00 EST


On large systems when few hundred CPUs simulateously acquire kernfs_rwsem,
for reading we see performance degradation due to bouncing of cache line
that contains kernfs_rwsem. Changing kernfs_rwsem into a per-fs, per-cpu
rwsem can fix this degradation.

For example on a system of 384 cores, If I run 200 instances of an
application that is mostly executing following loop:

for (int loop = 0; loop <100 ; loop++)
{
for (int port_num = 1; port_num < 2; port_num++)
{
for (int gid_index = 0; gid_index < 254; gid_index++ )
{
char ret_buf[64], ret_buf_lo[64];
char gid_file_path[1024];

int ret_len;
int ret_fd;
ssize_t ret_rd;

ub4 i, saved_errno;

memset(ret_buf, 0, sizeof(ret_buf));
memset(gid_file_path, 0, sizeof(gid_file_path));

ret_len = snprintf(gid_file_path, sizeof(gid_file_path),
"/sys/class/infiniband/%s/ports/%d/gids/%d",
dev_name,
port_num,
gid_index);

ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC);
if (ret_fd < 0)
{
printf("Failed to open %s\n", gid_file_path);
continue;
}

/* Read the GID */
ret_rd = read(ret_fd, ret_buf, 40);

if (ret_rd == -1)
{
printf("Failed to read from file %s, errno: %u\n",
gid_file_path, saved_errno);

continue;
}

close(ret_fd);
}
}
}

I can see contention around kernfs_rwsem as follows:

- 21.09% 20.95% showgids [kernel.kallsyms] [k] down_read
20.95% __libc_start_main
__GI___libc_open
entry_SYSCALL_64_after_hwframe
do_syscall_64
sys_open
do_sys_open
do_filp_open
- path_openat
- 19.55% link_path_walk
- 10.60% inode_permission
- __inode_permission
- 10.60% kernfs_iop_permission
down_read
- 8.18% walk_component
lookup_fast
- d_revalidate.part.24
- 8.18% kernfs_dop_revalidate
down_read
- 0.76% kernfs_iop_get_link
down_read
- 0.74% may_open
inode_permission
__inode_permission
kernfs_iop_permission
down_read
- 0.66% lookup_fast
d_revalidate.part.24
kernfs_dop_revalidate
down_read

- 16.04% 15.96% showgids [kernel.kallsyms] [k] up_read
15.96% __libc_start_main
__GI___libc_open
entry_SYSCALL_64_after_hwframe
do_syscall_64
sys_open
do_sys_open
do_filp_open
- path_openat
- 15.04% link_path_walk
- 8.36% inode_permission
- __inode_permission
- 8.36% kernfs_iop_permission
up_read
- 6.15% walk_component
lookup_fast
- d_revalidate.part.24
- 6.15% kernfs_dop_revalidate
up_read
- 0.53% kernfs_iop_get_link

Moreover this run of 200 applications take more than 32 secs to finish on
this system.

After changing kernfs_rwsem to a per-cpu rwsem, I can see that contention
for kernfs_rwsem is no longer visible in perf data and the test execution
time has reduced to almost half (17 secs or less from 32 secs or more).

The overhead involving write operations with per-cpu rwsem will be higher
but frequency of creation/deletion of kernfs files is much less than
frequency at which kernfs (cgroup, sysfs) files are read.

Signed-off-by: Imran Khan <imran.f.khan@xxxxxxxxxx>
---

This patch introduces an alternative approach to address problem of kernfs_rwsem
contention discussed in [1]. Since [1] was dealing with 3 global locks i.e.
kernfs_open_file_mutex, kernfs_open_node_lock and kernfs_rwsem and most of the
major changes were around kernfs_rwsem we decided in [2] to sort out
kernfs_open_file_mutex and kernfs_open_node_lock first. These 2 global locks have
now been removed and corresponding patches [3] have been Acked.

As far as kernfs_rwsem is concerned, we are trying to use hashed kernfs_rwsem in
[1]. These changes have not been Acked yet and there may still be some broken
corner cases which I am trying to find/fix at the moment. But in the meanwhile
I tried with per-cpu rwsem for kernfs_rwsem and found that it is giving me
similar improvement as far as my test case is concerned. Compared to hashed
kernfs_rwsem, per-cpu kernfs_rwsem is less intrusive because in worst case we
may incur some performance penalties for some unknown scenarios but it will not
break kernfs flow.

So in this patch I am proposing the idea of using per-cpu kernfs_rwsem, so that
I can get some feedback about this approach as well.

[1]: https://lore.kernel.org/lkml/20220410023719.1752460-1-imran.f.khan@xxxxxxxxxx/
[2]: https://lore.kernel.org/lkml/f2ca9d19-023f-76d9-5c76-6f08ccfbe348@xxxxxxxxxx/
[3]: https://lore.kernel.org/lkml/20220615021059.862643-1-imran.f.khan@xxxxxxxxxx/

fs/kernfs/dir.c | 70 ++++++++++++++++++-------------------
fs/kernfs/file.c | 4 +--
fs/kernfs/inode.c | 16 ++++-----
fs/kernfs/kernfs-internal.h | 4 +--
fs/kernfs/mount.c | 12 +++----
fs/kernfs/symlink.c | 4 +--
6 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 6eca72cfa1f28..678e71856afc0 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -472,7 +472,7 @@ static void kernfs_drain(struct kernfs_node *kn)
lockdep_assert_held_write(&root->kernfs_rwsem);
WARN_ON_ONCE(kernfs_active(kn));

- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);

if (kernfs_lockdep(kn)) {
rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
@@ -491,7 +491,7 @@ static void kernfs_drain(struct kernfs_node *kn)

kernfs_drain_open_files(kn);

- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);
}

/**
@@ -731,7 +731,7 @@ int kernfs_add_one(struct kernfs_node *kn)
bool has_ns;
int ret;

- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);

ret = -EINVAL;
has_ns = kernfs_ns_enabled(parent);
@@ -762,7 +762,7 @@ int kernfs_add_one(struct kernfs_node *kn)
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}

- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);

/*
* Activate the new node unless CREATE_DEACTIVATED is requested.
@@ -776,7 +776,7 @@ int kernfs_add_one(struct kernfs_node *kn)
return 0;

out_unlock:
- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);
return ret;
}

@@ -869,10 +869,10 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
struct kernfs_node *kn;
struct kernfs_root *root = kernfs_root(parent);

- down_read(&root->kernfs_rwsem);
+ percpu_down_read(&root->kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
kernfs_get(kn);
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);

return kn;
}
@@ -894,10 +894,10 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
struct kernfs_node *kn;
struct kernfs_root *root = kernfs_root(parent);

- down_read(&root->kernfs_rwsem);
+ percpu_down_read(&root->kernfs_rwsem);
kn = kernfs_walk_ns(parent, path, ns);
kernfs_get(kn);
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);

return kn;
}
@@ -922,7 +922,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
return ERR_PTR(-ENOMEM);

idr_init(&root->ino_idr);
- init_rwsem(&root->kernfs_rwsem);
+ percpu_init_rwsem(&root->kernfs_rwsem);
INIT_LIST_HEAD(&root->supers);

/*
@@ -1078,12 +1078,12 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
if (parent) {
spin_unlock(&dentry->d_lock);
root = kernfs_root(parent);
- down_read(&root->kernfs_rwsem);
+ percpu_down_read(&root->kernfs_rwsem);
if (kernfs_dir_changed(parent, dentry)) {
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);
return 0;
}
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);
} else
spin_unlock(&dentry->d_lock);

@@ -1095,7 +1095,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)

kn = kernfs_dentry_node(dentry);
root = kernfs_root(kn);
- down_read(&root->kernfs_rwsem);
+ percpu_down_read(&root->kernfs_rwsem);

/* The kernfs node has been deactivated */
if (!kernfs_active(kn))
@@ -1114,10 +1114,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
kernfs_info(dentry->d_sb)->ns != kn->ns)
goto out_bad;

- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);
return 1;
out_bad:
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);
return 0;
}

@@ -1136,7 +1136,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
const void *ns = NULL;

root = kernfs_root(parent);
- down_read(&root->kernfs_rwsem);
+ percpu_down_read(&root->kernfs_rwsem);
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dir->i_sb)->ns;

@@ -1147,7 +1147,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
* create a negative.
*/
if (!kernfs_active(kn)) {
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);
return NULL;
}
inode = kernfs_get_inode(dir->i_sb, kn);
@@ -1162,7 +1162,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
*/
if (!IS_ERR(inode))
kernfs_set_rev(parent, dentry);
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);

/* instantiate and hash (possibly negative) dentry */
return d_splice_alias(inode, dentry);
@@ -1322,7 +1322,7 @@ void kernfs_activate(struct kernfs_node *kn)
struct kernfs_node *pos;
struct kernfs_root *root = kernfs_root(kn);

- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);

pos = NULL;
while ((pos = kernfs_next_descendant_post(pos, kn))) {
@@ -1336,7 +1336,7 @@ void kernfs_activate(struct kernfs_node *kn)
pos->flags |= KERNFS_ACTIVATED;
}

- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);
}

static void __kernfs_remove(struct kernfs_node *kn)
@@ -1420,9 +1420,9 @@ void kernfs_remove(struct kernfs_node *kn)

root = kernfs_root(kn);

- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);
__kernfs_remove(kn);
- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);
}

/**
@@ -1510,7 +1510,7 @@ bool kernfs_remove_self(struct kernfs_node *kn)
bool ret;
struct kernfs_root *root = kernfs_root(kn);

- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);
kernfs_break_active_protection(kn);

/*
@@ -1538,9 +1538,9 @@ bool kernfs_remove_self(struct kernfs_node *kn)
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
break;

- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);
schedule();
- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);
}
finish_wait(waitq, &wait);
WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
@@ -1553,7 +1553,7 @@ bool kernfs_remove_self(struct kernfs_node *kn)
*/
kernfs_unbreak_active_protection(kn);

- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);
return ret;
}

@@ -1579,13 +1579,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
}

root = kernfs_root(parent);
- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);

kn = kernfs_find_ns(parent, name, ns);
if (kn)
__kernfs_remove(kn);

- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);

if (kn)
return 0;
@@ -1613,7 +1613,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
return -EINVAL;

root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);

error = -ENOENT;
if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
@@ -1667,7 +1667,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,

error = 0;
out:
- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);
return error;
}

@@ -1745,7 +1745,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
return 0;

root = kernfs_root(parent);
- down_read(&root->kernfs_rwsem);
+ percpu_down_read(&root->kernfs_rwsem);

if (kernfs_ns_enabled(parent))
ns = kernfs_info(dentry->d_sb)->ns;
@@ -1762,12 +1762,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
file->private_data = pos;
kernfs_get(pos);

- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);
if (!dir_emit(ctx, name, len, ino, type))
return 0;
- down_read(&root->kernfs_rwsem);
+ percpu_down_read(&root->kernfs_rwsem);
}
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);
file->private_data = NULL;
ctx->pos = INT_MAX;
return 0;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e3abfa843879c..068651fbcfa70 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -862,7 +862,7 @@ static void kernfs_notify_workfn(struct work_struct *work)

root = kernfs_root(kn);
/* kick fsnotify */
- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);

list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
struct kernfs_node *parent;
@@ -900,7 +900,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
iput(inode);
}

- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);
kernfs_put(kn);
goto repeat;
}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5daa..6b8d204839ee2 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
int ret;
struct kernfs_root *root = kernfs_root(kn);

- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);
ret = __kernfs_setattr(kn, iattr);
- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);
return ret;
}

@@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return -EINVAL;

root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);
error = setattr_prepare(&init_user_ns, dentry, iattr);
if (error)
goto out;
@@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
setattr_copy(&init_user_ns, inode, iattr);

out:
- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);
return error;
}

@@ -189,12 +189,12 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
struct kernfs_node *kn = inode->i_private;
struct kernfs_root *root = kernfs_root(kn);

- down_read(&root->kernfs_rwsem);
+ percpu_down_read(&root->kernfs_rwsem);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&init_user_ns, inode, stat);
spin_unlock(&inode->i_lock);
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);

return 0;
}
@@ -287,12 +287,12 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
kn = inode->i_private;
root = kernfs_root(kn);

- down_read(&root->kernfs_rwsem);
+ percpu_down_read(&root->kernfs_rwsem);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&init_user_ns, inode, mask);
spin_unlock(&inode->i_lock);
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);

return ret;
}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index eeaa779b929c7..a3c0739db92f9 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -13,7 +13,7 @@
#include <linux/lockdep.h>
#include <linux/fs.h>
#include <linux/mutex.h>
-#include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
#include <linux/xattr.h>

#include <linux/kernfs.h>
@@ -46,7 +46,7 @@ struct kernfs_root {
struct list_head supers;

wait_queue_head_t deactivate_waitq;
- struct rw_semaphore kernfs_rwsem;
+ struct percpu_rw_semaphore kernfs_rwsem;
};

/* +1 to avoid triggering overflow warning when negating it */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index cfa79715fc1a7..e3059de2e05c1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -256,9 +256,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
sb->s_shrink.seeks = 0;

/* get root inode, initialize and unlock it */
- down_read(&kf_root->kernfs_rwsem);
+ percpu_down_read(&kf_root->kernfs_rwsem);
inode = kernfs_get_inode(sb, info->root->kn);
- up_read(&kf_root->kernfs_rwsem);
+ percpu_up_read(&kf_root->kernfs_rwsem);
if (!inode) {
pr_debug("kernfs: could not get root inode\n");
return -ENOMEM;
@@ -346,9 +346,9 @@ int kernfs_get_tree(struct fs_context *fc)
}
sb->s_flags |= SB_ACTIVE;

- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);
list_add(&info->node, &info->root->supers);
- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);
}

fc->root = dget(sb->s_root);
@@ -375,9 +375,9 @@ void kernfs_kill_sb(struct super_block *sb)
struct kernfs_super_info *info = kernfs_info(sb);
struct kernfs_root *root = info->root;

- down_write(&root->kernfs_rwsem);
+ percpu_down_write(&root->kernfs_rwsem);
list_del(&info->node);
- up_write(&root->kernfs_rwsem);
+ percpu_up_write(&root->kernfs_rwsem);

/*
* Remove the superblock from fs_supers/s_instances
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 0ab13824822f7..61c8260a55717 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -116,9 +116,9 @@ static int kernfs_getlink(struct inode *inode, char *path)
struct kernfs_root *root = kernfs_root(parent);
int error;

- down_read(&root->kernfs_rwsem);
+ percpu_down_read(&root->kernfs_rwsem);
error = kernfs_get_target_path(parent, target, path);
- up_read(&root->kernfs_rwsem);
+ percpu_up_read(&root->kernfs_rwsem);

return error;
}

base-commit: 35d872b9ea5b3ad784d7479ea728dcb688df2db7
--
2.30.2