[PATCH] [UNTESTED] fs/namespace: defer RCU sync for MNT_DETACH umount

From: Eric Chanudet
Date: Tue Apr 08 2025 - 16:58:34 EST


Defer releasing the detached filesystem in namespace_unlock() during a
lazy umount so that the syscall returns faster.

When requesting MNT_DETACH, the caller does not expect the filesystem
to be fully shut down by the time the syscall returns. Yet this path
calls synchronize_rcu_expedited(), which is expensive on PREEMPT_RT
kernels: they default to rcupdate.rcu_normal_after_boot=1, which
demotes expedited grace periods to normal ones once boot has finished.
Instead, queue the detached struct mounts on a separate list and hand
them to a workqueue that runs after the RCU grace period has elapsed.
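
For context, the deferral builds on the kernel's rcu_work helpers from
<linux/workqueue.h>: queue_rcu_work() waits for an RCU grace period and
then runs the handler on the given workqueue, so the enqueuing task
never blocks. A minimal illustrative sketch of the pattern (hypothetical
names, not the code in the diff below):

  #include <linux/slab.h>
  #include <linux/workqueue.h>

  struct deferred_release {
          struct rcu_work rwork;
          /* payload to free once pre-existing RCU readers are done */
  };

  static void deferred_release_fn(struct work_struct *work)
  {
          struct deferred_release *d =
                  container_of(to_rcu_work(work),
                               struct deferred_release, rwork);

          /* Runs on the workqueue after an RCU grace period has elapsed. */
          kfree(d);
  }

  static void kick_deferred_release(struct deferred_release *d)
  {
          /* Returns immediately; no synchronize_rcu*() on this path. */
          INIT_RCU_WORK(&d->rwork, deferred_release_fn);
          queue_rcu_work(system_unbound_wq, &d->rwork);
  }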

w/o patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
0.02455 +- 0.00107 seconds time elapsed ( +- 4.36% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
0.02555 +- 0.00114 seconds time elapsed ( +- 4.46% )

w/ patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
0.026311 +- 0.000869 seconds time elapsed ( +- 3.30% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
0.003194 +- 0.000160 seconds time elapsed ( +- 5.01% )
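
In short, a lazy umount drops from roughly 25.5ms to 3.2ms (about 8x
faster), while a regular umount stays the same within the noise.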

Signed-off-by: Alexander Larsson <alexl@xxxxxxxxxx>
Signed-off-by: Lucas Karpinski <lkarpins@xxxxxxxxxx>
Signed-off-by: Eric Chanudet <echanude@xxxxxxxxxx>
Link: https://lore.kernel.org/20250408210350.749901-12-echanude@xxxxxxxxxx
Not-Tested-by: Christian Brauner <brauner@xxxxxxxxxx>
Massaged-With-Great-Shame-by: Christian Brauner <brauner@xxxxxxxxxx>
---
fs/namespace.c | 78 +++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 62 insertions(+), 16 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index bc23c0e1fb9d..c36debbc5135 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
static unsigned int mp_hash_mask __ro_after_init;
static unsigned int mp_hash_shift __ro_after_init;

+struct deferred_free_mounts {
+ struct rcu_work rwork;
+ struct hlist_head release_list;
+};
+
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
@@ -77,8 +82,9 @@ static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
-static HLIST_HEAD(unmounted); /* protected by namespace_sem */
-static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static bool defer_unmount; /* protected by namespace_sem */
+static HLIST_HEAD(unmounted); /* protected by namespace_sem */
+static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_SEQLOCK(mnt_ns_tree_lock);

#ifdef CONFIG_FSNOTIFY
@@ -1412,7 +1418,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
return ERR_PTR(err);
}

-static void cleanup_mnt(struct mount *mnt)
+static void __mntput_no_expire(struct mount *mnt, bool cleanup_sync);
+
+static void cleanup_mnt(struct mount *mnt, bool cleanup_sync)
{
struct hlist_node *p;
struct mount *m;
@@ -1428,7 +1436,9 @@ static void cleanup_mnt(struct mount *mnt)
mnt_pin_kill(mnt);
hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
hlist_del(&m->mnt_umount);
- mntput(&m->mnt);
+ if (unlikely(m->mnt_expiry_mark))
+ WRITE_ONCE(m->mnt_expiry_mark, 0);
+ __mntput_no_expire(m, cleanup_sync);
}
fsnotify_vfsmount_delete(&mnt->mnt);
dput(mnt->mnt.mnt_root);
@@ -1439,7 +1449,7 @@ static void cleanup_mnt(struct mount *mnt)

static void __cleanup_mnt(struct rcu_head *head)
{
- cleanup_mnt(container_of(head, struct mount, mnt_rcu));
+ cleanup_mnt(container_of(head, struct mount, mnt_rcu), false /* cleanup_sync */);
}

static LLIST_HEAD(delayed_mntput_list);
@@ -1449,11 +1459,11 @@ static void delayed_mntput(struct work_struct *unused)
struct mount *m, *t;

llist_for_each_entry_safe(m, t, node, mnt_llist)
- cleanup_mnt(m);
+ cleanup_mnt(m, false /* cleanup_sync */);
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);

-static void mntput_no_expire(struct mount *mnt)
+static void __mntput_no_expire(struct mount *mnt, bool cleanup_sync)
{
LIST_HEAD(list);
int count;
@@ -1507,7 +1517,7 @@ static void mntput_no_expire(struct mount *mnt)
unlock_mount_hash();
shrink_dentry_list(&list);

- if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
+ if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL) && !cleanup_sync)) {
struct task_struct *task = current;
if (likely(!(task->flags & PF_KTHREAD))) {
init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
@@ -1518,7 +1528,12 @@ static void mntput_no_expire(struct mount *mnt)
schedule_delayed_work(&delayed_mntput_work, 1);
return;
}
- cleanup_mnt(mnt);
+ cleanup_mnt(mnt, cleanup_sync);
+}
+
+static inline void mntput_no_expire(struct mount *mnt)
+{
+ __mntput_no_expire(mnt, false);
}

void mntput(struct vfsmount *mnt)
@@ -1789,15 +1804,37 @@ static bool need_notify_mnt_list(void)
}
#endif

-static void namespace_unlock(void)
+static void free_mounts(struct hlist_head *mount_list, bool cleanup_sync)
{
- struct hlist_head head;
struct hlist_node *p;
struct mount *m;
+
+ hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
+ hlist_del(&m->mnt_umount);
+ if (unlikely(m->mnt_expiry_mark))
+ WRITE_ONCE(m->mnt_expiry_mark, 0);
+ __mntput_no_expire(m, cleanup_sync);
+ }
+}
+
+static void defer_free_mounts(struct work_struct *work)
+{
+ struct deferred_free_mounts *d;
+
+ d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork);
+ free_mounts(&d->release_list, true /* cleanup_sync */);
+ kfree(d);
+}
+
+static void namespace_unlock(void)
+{
+ HLIST_HEAD(head);
LIST_HEAD(list);
+ bool defer = defer_unmount;

hlist_move_list(&unmounted, &head);
list_splice_init(&ex_mountpoints, &list);
+ defer_unmount = false;

if (need_notify_mnt_list()) {
/*
@@ -1817,12 +1854,19 @@ static void namespace_unlock(void)
if (likely(hlist_empty(&head)))
return;

- synchronize_rcu_expedited();
+ if (defer) {
+ struct deferred_free_mounts *d;

- hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
- hlist_del(&m->mnt_umount);
- mntput(&m->mnt);
+ d = kmalloc(sizeof(*d), GFP_KERNEL);
+ if (d) {
+ hlist_move_list(&head, &d->release_list);
+ INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+ queue_rcu_work(system_unbound_wq, &d->rwork);
+ return;
+ }
}
+ synchronize_rcu_expedited();
+ free_mounts(&head, false /* cleanup_sync */);
}

static inline void namespace_lock(void)
@@ -2044,8 +2088,10 @@ static int do_umount(struct mount *mnt, int flags)

event++;
if (flags & MNT_DETACH) {
- if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
+ if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) {
umount_tree(mnt, UMOUNT_PROPAGATE);
+ defer_unmount = true;
+ }
retval = 0;
} else {
shrink_submounts(mnt);
--
2.47.2

