[PATCH v3 2/2] kernfs: Reduce contention around global per-fs kernfs_rwsem.

From: Imran Khan
Date: Thu Jan 13 2022 - 05:43:50 EST


Right now a global per file system based rwsem (kernfs_rwsem)
synchronizes multiple kernfs operations. On a large system with
few hundred CPUs and few hundred applications simultaenously trying
to access sysfs, this results in multiple sys_open(s) contending on
kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate.

- 21.42% 21.34% showgids [kernel.kallsyms] [k] up_read
21.34% __libc_start_main
__GI___libc_open
entry_SYSCALL_64_after_hwframe
do_syscall_64
sys_open
do_sys_open
do_filp_open
- path_openat
- 20.05% link_path_walk
- 9.76% walk_component
lookup_fast
- d_revalidate.part.24
- 9.75% kernfs_dop_revalidate
up_read
- 9.46% inode_permission
- __inode_permission
- 9.46% kernfs_iop_permission
up_read
- 0.83% kernfs_iop_get_link
up_read
- 0.80% lookup_fast
d_revalidate.part.24
kernfs_dop_revalidate
up_read

- 21.31% 21.21% showgids [kernel.kallsyms] [k] down_read
21.21% __libc_start_main
__GI___libc_open
entry_SYSCALL_64_after_hwframe
do_syscall_64
sys_open
do_sys_open
do_filp_open
- path_openat
- 19.78% link_path_walk
- 10.62% inode_permission
- __inode_permission
- 10.62% kernfs_iop_permission
down_read
- 8.45% walk_component
lookup_fast
- d_revalidate.part.24
- 8.45% kernfs_dop_revalidate
down_read
- 0.71% kernfs_iop_get_link
down_read
- 0.72% lookup_fast
- d_revalidate.part.24
- 0.72% kernfs_dop_revalidate
down_read
- 0.71% may_open
inode_permission
__inode_permission
kernfs_iop_permission
down_read

Since permission is specific to a kernfs_node we can use a hashed
lock to access/modify permission. Also use kernfs reference counting
to ensure we are accessing/modifying permissions for an existing
kernfs_node object.

Using this change brings down the above mentioned down_read/up_read
numbers to ~8%, thus indicating that contention around kernfs_rwsem
has reduced to about 1/3rd of earlier value.

Signed-off-by: Imran Khan <imran.f.khan@xxxxxxxxxx>
---
fs/kernfs/dir.c | 8 ++++++++
fs/kernfs/inode.c | 35 +++++++++++++++++++++++++----------
fs/kernfs/kernfs-internal.h | 18 ++++++++++++++++--
3 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index e6d9772ddb4c..37daaffda718 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -720,6 +720,7 @@ int kernfs_add_one(struct kernfs_node *kn)
struct kernfs_node *parent = kn->parent;
struct kernfs_root *root = kernfs_root(parent);
struct kernfs_iattrs *ps_iattr;
+ struct rw_semaphore *rwsem = NULL;
bool has_ns;
int ret;

@@ -748,11 +749,14 @@ int kernfs_add_one(struct kernfs_node *kn)
goto out_unlock;

/* Update timestamps on the parent */
+ rwsem = iattr_rwsem_ptr(parent);
+ down_write(rwsem);
ps_iattr = parent->iattr;
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
+ up_write(rwsem);

up_write(&root->kernfs_rwsem);

@@ -1326,6 +1330,7 @@ void kernfs_activate(struct kernfs_node *kn)
static void __kernfs_remove(struct kernfs_node *kn)
{
struct kernfs_node *pos;
+ struct rw_semaphore *rwsem;

lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);

@@ -1378,8 +1383,11 @@ static void __kernfs_remove(struct kernfs_node *kn)

/* update timestamps on the parent */
if (ps_iattr) {
+ rwsem = iattr_rwsem_ptr(pos->parent);
+ down_write(rwsem);
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
+ up_write(rwsem);
}

kernfs_put(pos);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5da..7a55ad440bf4 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -99,11 +99,15 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
int ret;
- struct kernfs_root *root = kernfs_root(kn);
+ struct rw_semaphore *rwsem = NULL;

- down_write(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ rwsem = iattr_rwsem_ptr(kn);
+ down_write(rwsem);
ret = __kernfs_setattr(kn, iattr);
- up_write(&root->kernfs_rwsem);
+ up_write(rwsem);
+ kernfs_put(kn);
+
return ret;
}

@@ -112,6 +116,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct kernfs_node *kn = inode->i_private;
+ struct rw_semaphore *rwsem = NULL;
struct kernfs_root *root;
int error;

@@ -119,7 +124,9 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return -EINVAL;

root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ rwsem = iattr_rwsem_ptr(kn);
+ down_write(rwsem);
error = setattr_prepare(&init_user_ns, dentry, iattr);
if (error)
goto out;
@@ -132,7 +139,8 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
setattr_copy(&init_user_ns, inode, iattr);

out:
- up_write(&root->kernfs_rwsem);
+ up_write(rwsem);
+ kernfs_put(kn);
return error;
}

@@ -187,14 +195,17 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
{
struct inode *inode = d_inode(path->dentry);
struct kernfs_node *kn = inode->i_private;
- struct kernfs_root *root = kernfs_root(kn);
+ struct rw_semaphore *rwsem = NULL;

- down_read(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ rwsem = iattr_rwsem_ptr(kn);
+ down_read(rwsem);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&init_user_ns, inode, stat);
spin_unlock(&inode->i_lock);
- up_read(&root->kernfs_rwsem);
+ up_read(rwsem);
+ kernfs_put(kn);

return 0;
}
@@ -279,6 +290,7 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
{
struct kernfs_node *kn;
struct kernfs_root *root;
+ struct rw_semaphore *rwsem = NULL;
int ret;

if (mask & MAY_NOT_BLOCK)
@@ -287,12 +299,15 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
kn = inode->i_private;
root = kernfs_root(kn);

- down_read(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ rwsem = iattr_rwsem_ptr(kn);
+ down_read(rwsem);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&init_user_ns, inode, mask);
spin_unlock(&inode->i_lock);
- up_read(&root->kernfs_rwsem);
+ up_read(rwsem);
+ kernfs_put(kn);

return ret;
}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 4bdcf7a71845..f1977d72369a 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -39,17 +39,23 @@ struct kernfs_open_file_mutex {
struct mutex lock;
} ____cacheline_aligned_in_smp;

+struct kernfs_iattr_rwsem {
+ struct rw_semaphore rwsem;
+} ____cacheline_aligned_in_smp;
+
/*
* There's one kernfs_open_file for each open file and one kernfs_open_node
* for each kernfs_node with one or more open files.
*
* kernfs_node->attr.open points to kernfs_open_node. attr.open is
- * protected by open_node_locks[i].
+ * protected by open_node_locks[i].lock.
*
* filp->private_data points to seq_file whose ->private points to
* kernfs_open_file. kernfs_open_files are chained at
- * kernfs_open_node->files, which is protected by open_file_mutex[i].
+ * kernfs_open_node->files, which is protected by open_file_mutex[i].lock.
*
+ * kernfs_node->iattr points to kernfs_node's attributes and is
+ * protected by iattr_rwsem[i].rwsem
* To reduce possible contention in sysfs access, arising due to single
* locks, use an array of locks and use kernfs_node object address as
* hash keys to get the index of these locks.
@@ -58,6 +64,7 @@ struct kernfs_open_file_mutex {
struct kernfs_global_locks {
struct kernfs_open_node_lock open_node_locks[NR_KERNFS_LOCKS];
struct kernfs_open_file_mutex open_file_mutex[NR_KERNFS_LOCKS];
+ struct kernfs_iattr_rwsem iattr_rwsem[NR_KERNFS_LOCKS];
};

static struct kernfs_global_locks kernfs_global_locks;
@@ -76,6 +83,13 @@ static inline struct mutex *open_file_mutex_ptr(struct kernfs_node *kn)
return &kernfs_global_locks.open_file_mutex[index].lock;
}

+static inline struct rw_semaphore *iattr_rwsem_ptr(struct kernfs_node *kn)
+{
+ int index = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+ return &kernfs_global_locks.iattr_rwsem[index].rwsem;
+}
+
struct kernfs_iattrs {
kuid_t ia_uid;
kgid_t ia_gid;
--
2.30.2