[RFC v2 1/1] fs/namespace: defer RCU sync for MNT_DETACH umount

From: Lucas Karpinski
Date: Fri Apr 26 2024 - 15:56:05 EST


Use call_rcu to defer releasing the detached filesystem when calling
namespace_unlock() during a lazy umount.

When detaching (MNT_DETACH) a filesystem, it should not be necessary to
wait for the grace period before completing the syscall. The
expectation that the filesystem is shut down by the time the syscall
returns does not apply in this case.

Calling synchronize_rcu_expedited() has a significant cost on RT
kernels, which default to rcupdate.rcu_normal_after_boot=1. With this
patch, the unmounted struct mounts are instead queued on a separate
list and released once the grace period completes; in the meantime they
are no longer accessible to subsequent syscalls.

Without the patch, on a 6.9.0-rc2-rt kernel:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
0.02756 +- 0.00299 seconds time elapsed ( +- 10.84% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
0.04422 +- 0.00521 seconds time elapsed ( +- 11.79% )

With the patch, on a 6.9.0-rc2-rt kernel:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
0.02852 +- 0.00377 seconds time elapsed ( +- 13.20% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
0.0030812 +- 0.0000524 seconds time elapsed ( +- 1.70% )

Signed-off-by: Alexander Larsson <alexl@xxxxxxxxxx>
Signed-off-by: Eric Chanudet <echanude@xxxxxxxxxx>
Signed-off-by: Lucas Karpinski <lkarpins@xxxxxxxxxx>
---
fs/namespace.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 5a51315c6678..df03fc0d1990 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
static unsigned int mp_hash_mask __ro_after_init;
static unsigned int mp_hash_shift __ro_after_init;

+struct mount_delayed_release {
+ struct rcu_head rcu;
+ struct hlist_head release_list;
+};
+
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
@@ -78,6 +83,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static bool lazy_unlock = false; /* protected by namespace_sem */

struct mount_kattr {
unsigned int attr_set;
@@ -1553,16 +1559,39 @@ int may_umount(struct vfsmount *mnt)

EXPORT_SYMBOL(may_umount);

-static void namespace_unlock(void)
+static void free_mounts(struct hlist_head *mount_list)
{
- struct hlist_head head;
struct hlist_node *p;
struct mount *m;
+
+ hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
+ hlist_del(&m->mnt_umount);
+ mntput(&m->mnt);
+ }
+}
+
+static void delayed_mount_release(struct rcu_head *head)
+{
+ struct mount_delayed_release *drelease =
+ container_of(head, struct mount_delayed_release, rcu);
+
+ free_mounts(&drelease->release_list);
+ kfree(drelease);
+}
+
+static void namespace_unlock(void)
+{
+ HLIST_HEAD(head);
LIST_HEAD(list);
+ bool lazy;
+

hlist_move_list(&unmounted, &head);
list_splice_init(&ex_mountpoints, &list);

+ lazy = lazy_unlock;
+ lazy_unlock = false;
+
up_write(&namespace_sem);

shrink_dentry_list(&list);
@@ -1570,12 +1599,21 @@ static void namespace_unlock(void)
if (likely(hlist_empty(&head)))
return;

- synchronize_rcu_expedited();
+ if (lazy) {
+ struct mount_delayed_release *drelease =
+ kmalloc(sizeof(*drelease), GFP_KERNEL);

- hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
- hlist_del(&m->mnt_umount);
- mntput(&m->mnt);
+ if (unlikely(!drelease))
+ goto out;
+
+ hlist_move_list(&head, &drelease->release_list);
+ call_rcu(&drelease->rcu, delayed_mount_release);
+ return;
}
+
+out:
+ synchronize_rcu_expedited();
+ free_mounts(&head);
}

static inline void namespace_lock(void)
@@ -1798,6 +1836,7 @@ static int do_umount(struct mount *mnt, int flags)
}
out:
unlock_mount_hash();
+ lazy_unlock = flags & MNT_DETACH ? true : false;
namespace_unlock();
return retval;
}
--
2.44.0