Re: [PATCH 03/34] teach move_mount(2) to work with OPEN_TREE_CLONE [ver #12]

From: David Howells
Date: Fri Oct 19 2018 - 07:57:08 EST


Okay, I put in a tracepoint (patch attached) and got a trace from the life of
the offending mount object. I've cropped non-useful information out of the
lines, inserted a blank line every time the usage count goes down to 2 and
dropped most of the lines generated by fsnotify.

Most of the lines are irrelevant. You can see system calls being issued and
commands being run that make no difference on balance.

What matters are the first four lines, the two mounts and the umount. You can
see it go splat on the last line when the usage count has become poisoned.

bash-3597 M=93785f8a u=1 ALC sp=clone_mnt+0x31/0x30a
bash-3597 M=93785f8a u=2 GET sp=do_dentry_open+0x24/0x2d3
bash-3597 M=93785f8a u=1 PUT sp=__se_sys_open_tree+0x195/0x1da

^--- These three lines look like the open_tree() syscall done by test-fsmount.

bash-3597 M=93785f8a u=2 GET sp=set_fs_pwd+0x37/0xdb

^--- And this the fchdir() syscall from test-fsmount. At this point u=2 would
seem correct as the subtree isn't actually mounted anywhere (1 for pwd, 1
for fd).

v--- test-fsmount then called execl() on bash, which did some stat'ing to find
the executable...

bash-3597 M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50
bash-3597 M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc

bash-3597 M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50
bash-3597 M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc

v--- and then exec'd it.

bash-3597 M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50
bash-3597 M=93785f8a u=4 GET sp=do_dentry_open+0x24/0x2d3
bash-3597 M=93785f8a u=3 PUT sp=terminate_walk+0x20/0xda
bash-3597 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
bash-3597 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
bash-3597 M=93785f8a u=2 PUT sp=__fput+0x180/0x1c1

v--- bash then did stuff:

bash-3597 M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50
bash-3597 M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc

bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
grepconf.sh-3598 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3598 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3598 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3598 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3598 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3598 M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3
grepconf.sh-3598 M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda
grepconf.sh-3598 M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3598 M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3598 M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1
grepconf.sh-3598 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3598 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3598 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
grep-3599 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
grepconf.sh-3598 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
bash-3600 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
tty-3601 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
bash-3600 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
tput-3602 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
bash-3600 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
bash-3603 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
dircolors-3604 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
bash-3603 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
grep-3605 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
grepconf.sh-3606 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3606 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3606 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3606 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3606 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3606 M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3
grepconf.sh-3606 M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda
grepconf.sh-3606 M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3606 M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3606 M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1
grepconf.sh-3606 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3606 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3606 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
grep-3607 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
grepconf.sh-3606 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
grepconf.sh-3608 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3608 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3608 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3608 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3608 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3608 M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3
grepconf.sh-3608 M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda
grepconf.sh-3608 M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3608 M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3608 M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1
grepconf.sh-3608 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
grepconf.sh-3608 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
grepconf.sh-3608 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
grep-3609 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
grepconf.sh-3608 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

bash-3597 M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50
bash-3597 M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc

I ran "mount --move . /mnt":

bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
mount-3610 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
mount-3610 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
mount-3610 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
mount-3610 M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3
mount-3610 M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda
mount-3610 M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50
mount-3610 M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc
mount-3610 M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1
mount-3610 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
mount-3610 M=93785f8a u=3 PUT sp=do_mount+0x715/0x929
mount-3610 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

which worked. Herein lieth the problem. The usage count should be 3 now (1
for pwd, 1 for fd, 1 for mount) - but how does VFS know that this mount object
isn't mounted yet?

I ran "mount --move /mnt /mnt":

bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
mount-3611 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
mount-3611 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
mount-3611 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
mount-3611 M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50
mount-3611 M=93785f8a u=4 PUT sp=do_mount+0x715/0x929
mount-3611 M=93785f8a u=3 PUT sp=do_mount+0x1cf/0x929
mount-3611 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

which failed with ELOOP.

I ran "cd":

bash-3597 M=93785f8a u=1 PUT sp=set_fs_pwd+0xb8/0xdb

I ran "umount /mnt":

umount-3612 M=93785f8a u=2 GET sp=legitimize_path.isra.7+0x16/0x50
umount-3612 M=93785f8a u=1 PUT sp=vfs_statx+0x95/0xcc
umount-3612 M=93785f8a u=2 GET sp=legitimize_path.isra.7+0x16/0x50
umount-3612 M=93785f8a u=1 PUT sp=vfs_statx+0x95/0xcc
umount-3612 M=93785f8a u=2 GET sp=legitimize_path.isra.7+0x16/0x50
umount-3612 M=93785f8a u=1 PUT sp=user_statfs+0x61/0x98
umount-3612 M=93785f8a u=2 GET sp=legitimize_mnt+0x12/0x108
umount-3612 M=93785f8a u=1 PUT sp=pin_kill+0x11c/0x325
umount-3612 M=93785f8a u=0 PUT sp=ksys_umount+0x3e8/0x40e
umount-3612 M=93785f8a u=0 FRE sp=cleanup_mnt+0x4d/0x5e

And finally, I exited the shell, which then tried to release the fd inherited
from open_tree():

bash-3597 M=93785f8a u=1802201963 NFY sp=__fput+0xac/0x1c1

Note that the subtree attached to the fd has not at this point been directly
mounted by move_mount(), but implicitly mounted by fchdir() into it and then
using mount(MS_MOVE) of "." to "/mnt".

Note also that if I run the commands all as one go rather than pasting them
individually, a crash occurs in pin_kill() instead.

So, I'm not sure how to proceed from here. There's no easy way to remove the
FMODE_NEED_UNMOUNT flag left by open_tree().

David
---
commit e7c8e6590aa0dd3bf2e10450b8992d496c44be8b
Author: David Howells <dhowells@xxxxxxxxxx>
Date: Fri Oct 19 10:38:35 2018 +0100

mnt_count trace

diff --git a/fs/mount.h b/fs/mount.h
index f39bc9da4d73..124a6dd73936 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -20,7 +20,7 @@ struct mnt_namespace {
} __randomize_layout;

struct mnt_pcp {
- int mnt_count;
+ int mnt_countxxx;
int mnt_writers;
};

@@ -46,6 +46,7 @@ struct mount {
int mnt_count;
int mnt_writers;
#endif
+ atomic_t mnt_count;
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
struct list_head mnt_instance; /* mount instance on sb->s_mounts */
diff --git a/fs/namei.c b/fs/namei.c
index fb913148d4d1..da1489f6925c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -460,32 +460,6 @@ int inode_permission(struct inode *inode, int mask)
}
EXPORT_SYMBOL(inode_permission);

-/**
- * path_get - get a reference to a path
- * @path: path to get the reference to
- *
- * Given a path increment the reference count to the dentry and the vfsmount.
- */
-void path_get(const struct path *path)
-{
- mntget(path->mnt);
- dget(path->dentry);
-}
-EXPORT_SYMBOL(path_get);
-
-/**
- * path_put - put a reference to a path
- * @path: path to put the reference to
- *
- * Given a path decrement the reference count to the dentry and the vfsmount.
- */
-void path_put(const struct path *path)
-{
- dput(path->dentry);
- mntput(path->mnt);
-}
-EXPORT_SYMBOL(path_put);
-
#define EMBEDDED_LEVELS 2
struct nameidata {
struct path path;
diff --git a/fs/namespace.c b/fs/namespace.c
index 6370bfabec99..389e806e1a65 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,8 @@
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
#include <linux/fs_context.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/mnt.h>

#include "pnode.h"
#include "internal.h"
@@ -109,8 +111,10 @@ static int mnt_alloc_id(struct mount *mnt)
return 0;
}

-static void mnt_free_id(struct mount *mnt)
+static noinline void mnt_free_id(struct mount *mnt)
{
+ trace_mnt_count(mnt, mnt->mnt_id, atomic_read(&mnt->mnt_count), 99,
+ __builtin_return_address(0));
ida_free(&mnt_id_ida, mnt->mnt_id);
}

@@ -141,6 +145,9 @@ void mnt_release_group_id(struct mount *mnt)
*/
static inline void mnt_add_count(struct mount *mnt, int n)
{
+ int u;
+
+#if 0
#ifdef CONFIG_SMP
this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
@@ -148,6 +155,9 @@ static inline void mnt_add_count(struct mount *mnt, int n)
mnt->mnt_count += n;
preempt_enable();
#endif
+#endif
+ u = atomic_add_return(n, &mnt->mnt_count);
+ trace_mnt_count(mnt, mnt->mnt_id, u, n, __builtin_return_address(0));
}

/*
@@ -155,6 +165,7 @@ static inline void mnt_add_count(struct mount *mnt, int n)
*/
unsigned int mnt_get_count(struct mount *mnt)
{
+#if 0
#ifdef CONFIG_SMP
unsigned int count = 0;
int cpu;
@@ -167,6 +178,8 @@ unsigned int mnt_get_count(struct mount *mnt)
#else
return mnt->mnt_count;
#endif
+#endif
+ return atomic_read(&mnt->mnt_count);
}

static void drop_mountpoint(struct fs_pin *p)
@@ -198,11 +211,15 @@ static struct mount *alloc_vfsmnt(const char *name)
if (!mnt->mnt_pcp)
goto out_free_devname;

+#if 0
this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
+#endif
#else
mnt->mnt_count = 1;
mnt->mnt_writers = 0;
#endif
+ atomic_set(&mnt->mnt_count, 1);
+ trace_mnt_count(mnt, mnt->mnt_id, 1, 0, __builtin_return_address(0));

INIT_HLIST_NODE(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
@@ -1128,7 +1145,7 @@ static void mntput_no_expire(struct mount *mnt)
cleanup_mnt(mnt);
}

-void mntput(struct vfsmount *mnt)
+inline void mntput(struct vfsmount *mnt)
{
if (mnt) {
struct mount *m = real_mount(mnt);
@@ -1140,7 +1157,7 @@ void mntput(struct vfsmount *mnt)
}
EXPORT_SYMBOL(mntput);

-struct vfsmount *mntget(struct vfsmount *mnt)
+inline struct vfsmount *mntget(struct vfsmount *mnt)
{
if (mnt)
mnt_add_count(real_mount(mnt), 1);
@@ -3970,3 +3987,29 @@ const struct proc_ns_operations mntns_operations = {
.install = mntns_install,
.owner = mntns_owner,
};
+
+/**
+ * path_get - get a reference to a path
+ * @path: path to get the reference to
+ *
+ * Given a path increment the reference count to the dentry and the vfsmount.
+ */
+void path_get(const struct path *path)
+{
+ mntget(path->mnt);
+ dget(path->dentry);
+}
+EXPORT_SYMBOL(path_get);
+
+/**
+ * path_put - put a reference to a path
+ * @path: path to put the reference to
+ *
+ * Given a path decrement the reference count to the dentry and the vfsmount.
+ */
+void path_put(const struct path *path)
+{
+ dput(path->dentry);
+ mntput(path->mnt);
+}
+EXPORT_SYMBOL(path_put);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ababdbfab537..aaef44d6204c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -23,6 +23,7 @@
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/srcu.h>
+#include <trace/events/mnt.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
@@ -324,10 +325,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
/* global tests shouldn't care about events on child only the specific event */
__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);

- if (data_is == FSNOTIFY_EVENT_PATH)
+ if (data_is == FSNOTIFY_EVENT_PATH) {
mnt = real_mount(((const struct path *)data)->mnt);
- else
+ trace_mnt_count(mnt, mnt->mnt_id, atomic_read(&mnt->mnt_count), 88,
+ __builtin_return_address(0));
+ } else {
mnt = NULL;
+ }

/*
* Optimization: srcu_read_lock() has a memory barrier which can
diff --git a/include/trace/events/mnt.h b/include/trace/events/mnt.h
new file mode 100644
index 000000000000..da1a981f1a61
--- /dev/null
+++ b/include/trace/events/mnt.h
@@ -0,0 +1,57 @@
+/* Mount tracepoints
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mnt
+
+#if !defined(_TRACE_MNT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MNT_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(mnt_count,
+ TP_PROTO(const void *mnt, int mnt_id, int mnt_count,
+ int delta, const void *where),
+
+ TP_ARGS(mnt, mnt_id, mnt_count, delta, where),
+
+ TP_STRUCT__entry(
+ __field(int, mnt_id )
+ __field(int, mnt_count )
+ __field(int, delta )
+ __field(const void *, mnt )
+ __field(const void *, where )
+ ),
+
+ TP_fast_assign(
+ __entry->mnt_id = mnt_id;
+ __entry->mnt_count = mnt_count;
+ __entry->delta = delta;
+ __entry->mnt = mnt;
+ __entry->where = where;
+ ),
+
+ TP_printk("M=%p m=%08x u=%d %s sp=%pSR",
+ __entry->mnt,
+ __entry->mnt_id,
+ __entry->mnt_count,
+ __print_symbolic(__entry->delta,
+ { 0, "ALC" },
+ { 1, "GET" },
+ { -1, "PUT" },
+ { 88, "NFY" },
+ { 99, "FRE" }),
+ __entry->where)
+ );
+
+#endif /* _TRACE_MNT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>