[PATCH RESENT] fs, ipc: Use an asynchronous version of kern_unmount in IPC

From: Salman Qazi
Date: Mon Mar 04 2019 - 14:49:27 EST


Prior to this patch, the kernel can spend a lot of time blocked in
the following stack trace while releasing an IPC namespace:

[<ffffffffbe5491e3>] __wait_rcu_gp+0x93/0xe0
[<ffffffffbe549418>] synchronize_sched+0x48/0x60
[<ffffffffbe7ae5b3>] kern_unmount+0x3a/0x46
[<ffffffffbe847c02>] mq_put_mnt+0x15/0x17
[<ffffffffbe8481af>] put_ipc_ns+0x36/0x8b

This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
This is done by implementing an asynchronous version of kern_unmount.

Since mntput() may sleep, it cannot be called directly from an RCU
callback; it is deferred to a work queue instead.

Additionally, the callers of mq_put_mnt appear to be safe with it
behaving asynchronously. In particular, put_ipc_ns calls
mq_clear_sbinfo, which makes the inode inaccessible to
mqueue_create by setting s_fs_info to NULL. This appears
to be what prevents access while free_ipc_ns is taking place,
so the unmount should be able to proceed lazily.

Tested: Ran the following program:

/*
 * Fork 1000 children, one at a time. Each child unshares its user, IPC
 * and mount namespaces and returns immediately; the parent reaps it
 * before forking the next one.
 *
 * NOTE: unshare() and waitpid() are called inside assert(), so this must
 * be built without NDEBUG or those calls are compiled out.
 */
int main(void)
{
int pid;
int status;
int i;

for (i = 0; i < 1000; i++) {
pid = fork();
if (!pid) {
assert(!unshare(CLONE_NEWUSER|
CLONE_NEWIPC|CLONE_NEWNS));
return 0;
}

assert(waitpid(pid, &status, 0) == pid);
}
}

Before:

$ time ./unshare2

real 0m9.784s
user 0m0.428s
sys 0m0.000s

After:

$ time ./unshare2

real 0m0.368s
user 0m0.226s
sys 0m0.122s

Signed-off-by: Salman Qazi <sqazi@xxxxxxxxxx>
Reviewed-by: Eric Dumazet <edumazet@xxxxxxxxxx>
---
fs/namespace.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 1 +
ipc/mqueue.c | 2 +-
3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 678ef175d63a..e60b473c3bbc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3321,6 +3321,55 @@ void kern_unmount(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(kern_unmount);

+/*
+ * Deferred-unmount request: carries the mount from kern_unmount_async()
+ * to the work item that drops it once an RCU grace period has elapsed.
+ */
+struct async_unmount_cb {
+	struct vfsmount *mnt;
+	struct rcu_work rwork;
+};
+
+/* Workqueue half of kern_unmount_async(): mntput() may sleep. */
+static void kern_unmount_work(struct work_struct *work)
+{
+	struct async_unmount_cb *cb = container_of(to_rcu_work(work),
+			struct async_unmount_cb, rwork);
+
+	mntput(cb->mnt);
+	kfree(cb);
+}
+
+/**
+ * kern_unmount_async - release a long-term kernel mount without waiting on RCU
+ * @mnt: mount to release; NULL and ERR_PTR() values are ignored
+ *
+ * Like kern_unmount(), but the caller does not wait for the RCU grace
+ * period: the final mntput() is queued to run from a workqueue after the
+ * grace period has elapsed.  If the deferred request cannot be allocated,
+ * this falls back to the synchronous kern_unmount().
+ *
+ * Context: may sleep (GFP_KERNEL allocation, synchronous fallback).
+ */
+void kern_unmount_async(struct vfsmount *mnt)
+{
+	struct async_unmount_cb *cb;
+
+	if (IS_ERR_OR_NULL(mnt))
+		return;
+
+	cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+	if (!cb) {
+		/* No memory for the deferred request: unmount synchronously. */
+		kern_unmount(mnt);
+		return;
+	}
+
+	/* release long term mount so mount point can be released */
+	real_mount(mnt)->mnt_ns = NULL;
+	cb->mnt = mnt;
+	INIT_RCU_WORK(&cb->rwork, kern_unmount_work);
+	queue_rcu_work(system_wq, &cb->rwork);
+}
+
 bool our_mnt(struct vfsmount *mnt)
 {
 	return check_mnt(real_mount(mnt));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29d8e2cfed0e..8865997a8722 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *);
extern int unregister_filesystem(struct file_system_type *);
extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
#define kern_mount(type) kern_mount_data(type, NULL)
+extern void kern_unmount_async(struct vfsmount *mnt);
extern void kern_unmount(struct vfsmount *mnt);
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c595bed7bfcb..a8c2465ac0cb 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns)

void mq_put_mnt(struct ipc_namespace *ns)
{
- kern_unmount(ns->mq_mnt);
+ kern_unmount_async(ns->mq_mnt); /* final mntput() is normally deferred */
}

static int __init init_mqueue_fs(void)
--
2.21.0.352.gf09ad66450-goog