[PATCH 3/5] vfs: Add a mount-notification facility

From: David Howells
Date: Mon Jul 23 2018 - 11:26:11 EST


[*] Note that this needs some cleaning up and not all the events work yet.

Add a mount notification facility whereby notifications about changes in
mount topology and configuration can be received. Note that this only
covers vfsmount topology changes and not superblock events. A separate
facility will be added for that.

Firstly, an event queue needs to be created:

fd = open("/dev/watch_queue", O_RDWR);

then a watch can be set up to report notifications via that queue:

struct watch_notification_filter filter;
memset(&filter, 0, sizeof(filter));
filter.subtype_filter[0] = ~0ULL;
filter.info_id = 0x02000000;
mount_notify(AT_FDCWD, "/", 0, fd, 0x02);

Note that the queue can be shared between multiple notifications of various
types.

Mount notifications propagate up the tree towards the root, so a watch will
catch all of the events happening in the subtree rooted at the watch.

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
---

arch/x86/entry/syscalls/syscall_32.tbl | 1
arch/x86/entry/syscalls/syscall_64.tbl | 1
fs/Kconfig | 9 ++
fs/Makefile | 1
fs/fs_context.c | 1
fs/mount.h | 26 +++++
fs/mount_notify.c | 178 ++++++++++++++++++++++++++++++++
fs/namespace.c | 18 +++
include/linux/dcache.h | 1
include/linux/syscalls.h | 2
include/uapi/linux/watch_queue.h | 24 ++++
kernel/sys_ni.c | 3 +
12 files changed, 265 insertions(+)
create mode 100644 fs/mount_notify.c

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 806760188a31..449bbcc19a6d 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -405,3 +405,4 @@
391 i386 fsmount sys_fsmount __ia32_sys_fsmount
392 i386 fspick sys_fspick __ia32_sys_fspick
393 i386 fsinfo sys_fsinfo __ia32_sys_fsinfo
+394 i386 mount_notify sys_mount_notify __ia32_sys_mount_notify
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 0823eed2b02e..f25fa7ff5fb9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -350,6 +350,7 @@
339 common fsmount __x64_sys_fsmount
340 common fspick __x64_sys_fspick
341 common fsinfo __x64_sys_fsinfo
+342 common mount_notify __x64_sys_mount_notify

#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/Kconfig b/fs/Kconfig
index ac474a61be37..cbcca62d32e9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -107,6 +107,15 @@ source "fs/crypto/Kconfig"

source "fs/notify/Kconfig"

+config MOUNT_NOTIFICATIONS
+ bool "Mount topology change notifications"
+ select WATCH_QUEUE
+ help
+ This option provides support for getting change notifications on the
+ mount tree topology. This makes use of the /dev/watch_queue misc
+ device to handle the notification buffer and provides the
+ mount_notify() system call to enable/disable watchpoints.
+
source "fs/quota/Kconfig"

source "fs/autofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index d3b33798998e..49b60030d905 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -129,3 +129,4 @@ obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
+obj-$(CONFIG_MOUNT_NOTIFICATIONS) += mount_notify.o
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 071723cf11c8..4fa99a438471 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -321,6 +321,7 @@ struct fs_context *vfs_new_fs_context(struct file_system_type *fs_type,
case FS_CONTEXT_FOR_SUBMOUNT:
fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
fc->net_ns = get_net(current->nsproxy->net_ns);
+ fc->sb_flags |= SB_SUBMOUNT;
break;
case FS_CONTEXT_FOR_RECONFIGURE:
/* We don't pin any namespaces as the superblock's
diff --git a/fs/mount.h b/fs/mount.h
index f39bc9da4d73..7f72f824b958 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,6 +4,7 @@
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>
+#include <linux/watch_queue.h>

struct mnt_namespace {
atomic_t count;
@@ -67,9 +68,13 @@ struct mount {
int mnt_id; /* mount identifier */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
+ int mnt_nr_watchers; /* The number of subtree watches tracking this */
struct hlist_head mnt_pins;
struct fs_pin mnt_umount;
struct dentry *mnt_ex_mountpoint;
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+ struct watch_list *mnt_watchers; /* Watches on dentries within this mount */
+#endif
} __randomize_layout;

#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
@@ -146,3 +151,24 @@ static inline bool is_local_mountpoint(struct dentry *dentry)

return __is_local_mountpoint(dentry);
}
+
+extern void post_mount_notification(struct mount *changed,
+ struct mount_notification *notify);
+
+/* Post a @subtype notification for @changed; @aux is the other mount, if any. */
+static inline void notify_mount(struct mount *changed, struct mount *aux,
+				enum mount_notification_subtype subtype,
+				u32 info_flags)
+{
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+	struct mount_notification record = {
+		.watch.type	= WATCH_TYPE_MOUNT_NOTIFY,
+		.watch.subtype	= subtype,
+		.watch.info	= info_flags | sizeof(record),
+		.triggered_on	= changed->mnt_id,
+		.changed_mount	= (aux != NULL) ? aux->mnt_id : 0,
+	};
+
+	post_mount_notification(changed, &record);
+#endif
+}
diff --git a/fs/mount_notify.c b/fs/mount_notify.c
new file mode 100644
index 000000000000..b4905c363136
--- /dev/null
+++ b/fs/mount_notify.c
@@ -0,0 +1,178 @@
+/* Provide mount topology/attribute change notifications.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/syscalls.h>
+#include "mount.h"
+
+/*
+ * Post @notify to watches going rootwards along the tree from @changed's root.
+ * Must be called with the mount_lock held.
+ * NOTE(review): some callers notify after unlock_mount_hash() - confirm locking.
+ */
+void post_mount_notification(struct mount *changed,
+ struct mount_notification *notify)
+{
+ struct path cursor;
+ struct mount *mnt;
+ unsigned seq;
+
+ seq = 0; /* first attempt; set to 1 below before a retry (see read_seqbegin_or_lock()) */
+ rcu_read_lock();
+restart:
+ cursor.mnt = &changed->mnt;
+ cursor.dentry = changed->mnt.mnt_root;
+ mnt = real_mount(cursor.mnt);
+ notify->watch.info &= ~WATCH_INFO_IN_SUBTREE; /* reset on every (re)start of the walk */
+
+ read_seqbegin_or_lock(&rename_lock, &seq);
+ for (;;) {
+ if (mnt->mnt_watchers &&
+ !hlist_empty(&mnt->mnt_watchers->watchers)) {
+ if (cursor.dentry->d_flags & DCACHE_MOUNT_WATCH)
+ post_watch_notification(mnt->mnt_watchers,
+ &notify->watch,
+ (unsigned long)cursor.dentry);
+ } else { /* no watches on this mount: jump straight to its root */
+ cursor.dentry = mnt->mnt.mnt_root;
+ }
+ notify->watch.info |= WATCH_INFO_IN_SUBTREE; /* every later post is flagged in-subtree */
+
+ if (cursor.dentry == cursor.mnt->mnt_root ||
+ IS_ROOT(cursor.dentry)) {
+ struct mount *parent = READ_ONCE(mnt->mnt_parent);
+
+ /* Escaped?  An IS_ROOT dentry that is not this mount's root: stop. */
+ if (cursor.dentry != cursor.mnt->mnt_root)
+ break;
+
+ /* Global root?  If mnt has a distinct parent, cross to the mountpoint there. */
+ if (mnt != parent) {
+ cursor.dentry = READ_ONCE(mnt->mnt_mountpoint);
+ mnt = parent;
+ cursor.mnt = &mnt->mnt;
+ continue;
+ }
+ break;
+ }
+
+ cursor.dentry = cursor.dentry->d_parent; /* step up within the current mount */
+ }
+
+ if (need_seqretry(&rename_lock, seq)) {
+ seq = 1;
+ goto restart; /* NOTE(review): the re-walk appears to re-post to watches already notified */
+ }
+
+ done_seqretry(&rename_lock, seq);
+ rcu_read_unlock();
+}
+
+/* Release hook for mount watches: drop the dentry and mount recorded in @watch. */
+static void release_mount_watch(struct watch_list *wlist, struct watch *watch)
+{
+	struct vfsmount *pinned_mnt = watch->private;
+
+	dput((struct dentry *)(unsigned long)watch->id);
+	mntput(pinned_mnt);
+}
+
+/**
+ * sys_mount_notify - Watch for mount topology/attribute changes
+ * @dfd: Base directory to pathwalk from or fd referring to mount.
+ * @filename: Path to mount to place the watch upon
+ * @at_flags: Pathwalk control flags (NOTE(review): given to user_path_at() as-is)
+ * @watch_fd: The watch queue to send notifications to.
+ * @watch_id: The watch ID to be placed in the notification (-1 to remove watch)
+ *
+ * Return: 0 on success; -EINVAL for a bad @watch_id, -ENOMEM, -EBADSLT if there
+ * is no watch list to remove from, or the error of a failing lookup/queue helper.
+ */
+SYSCALL_DEFINE5(mount_notify, int, dfd, const char __user *, filename,
+		unsigned int, at_flags, int, watch_fd, int, watch_id)
+{
+	struct watch_queue *wqueue;
+	struct watch_list *wlist = NULL;
+	struct watch *watch;
+	struct mount *m;
+	struct path path;
+	int ret;
+
+	if (watch_id < -1 || watch_id > 0xff)
+		return -EINVAL;
+
+	ret = user_path_at(dfd, filename, at_flags, &path);
+	if (ret)
+		return ret;
+	m = real_mount(path.mnt);
+
+	wqueue = get_watch_queue(watch_fd);
+	if (IS_ERR(wqueue)) {
+		ret = PTR_ERR(wqueue);	/* ret was 0 here; don't report success */
+		goto err_path;
+	}
+
+	if (watch_id >= 0) {
+		ret = -ENOMEM;		/* for either allocation failing below */
+		if (!m->mnt_watchers) {
+			wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
+			if (!wlist)
+				goto err_wqueue;
+			INIT_HLIST_HEAD(&wlist->watchers);
+			spin_lock_init(&wlist->lock);
+			wlist->release_watch = release_mount_watch;
+		}
+
+		watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+		if (!watch)
+			goto err_wlist;
+
+		init_watch(watch);
+		watch->id = (unsigned long)path.dentry;
+		watch->queue = wqueue;
+		watch->private = path.mnt;
+		watch->info_id = (u32)watch_id << 24;
+
+		down_write(&m->mnt.mnt_sb->s_umount);
+		if (!m->mnt_watchers) {	/* recheck under the lock; keep wlist if we lost the race */
+			m->mnt_watchers = wlist;
+			wlist = NULL;
+		}
+
+		watch->watch_list = m->mnt_watchers;
+		ret = add_watch_to_object(watch);
+		if (ret == 0) {
+			spin_lock(&path.dentry->d_lock);
+			path.dentry->d_flags |= DCACHE_MOUNT_WATCH;
+			spin_unlock(&path.dentry->d_lock);
+			path_get(&path);	/* extra ref kept for the watch itself */
+		}
+		up_write(&m->mnt.mnt_sb->s_umount);
+		if (ret < 0)
+			kfree(watch);
+	} else if (m->mnt_watchers) {
+		down_write(&m->mnt.mnt_sb->s_umount);
+		ret = remove_watch_from_object(m->mnt_watchers, wqueue,
+					       (unsigned long)path.dentry, false);
+		up_write(&m->mnt.mnt_sb->s_umount);
+	} else {
+		ret = -EBADSLT;
+	}
+
+err_wlist:
+	kfree(wlist);		/* NULL if it was installed on the mount or never allocated */
+err_wqueue:
+	put_watch_queue(wqueue);
+err_path:
+	path_put(&path);
+	return ret;
+}
diff --git a/fs/namespace.c b/fs/namespace.c
index 7e7b1145d15d..d4d16111659d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -589,6 +589,9 @@ static int mnt_make_readonly(struct mount *mnt)
smp_wmb();
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
unlock_mount_hash();
+ if (ret == 0)
+ notify_mount(mnt, NULL, notify_mount_readonly,
+ WATCH_INFO_FLAG_0);
return ret;
}

@@ -597,6 +600,7 @@ static int __mnt_unmake_readonly(struct mount *mnt)
lock_mount_hash();
mnt->mnt.mnt_flags &= ~MNT_READONLY;
unlock_mount_hash();
+ notify_mount(mnt, NULL, notify_mount_readonly, 0);
return 0;
}

@@ -900,6 +904,7 @@ static void umount_mnt(struct mount *mnt)
{
/* old mountpoint will be dropped when we can do that */
mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
+ notify_mount(mnt->mnt_parent, mnt, notify_mount_unmount, 0);
unhash_mnt(mnt);
}

@@ -1451,6 +1456,11 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
p = list_first_entry(&tmp_list, struct mount, mnt_list);
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
+
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+ if (p->mnt_watchers)
+ remove_watch_list(p->mnt_watchers);
+#endif
ns = p->mnt_ns;
if (ns) {
ns->mounts--;
@@ -2004,11 +2014,17 @@ static int attach_recursive_mnt(struct mount *source_mnt,
lock_mount_hash();
}
if (parent_path) {
+ notify_mount(source_mnt->mnt_parent, source_mnt,
+ notify_mount_move_from, 0);
detach_mnt(source_mnt, parent_path);
+ notify_mount(dest_mnt, source_mnt, notify_mount_move_to, 0);
attach_mnt(source_mnt, dest_mnt, dest_mp);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
+ notify_mount(dest_mnt, source_mnt, notify_mount_new_mount,
+ source_mnt->mnt.mnt_sb->s_flags & SB_SUBMOUNT ?
+ WATCH_INFO_FLAG_0 : 0);
commit_tree(source_mnt);
}

@@ -2361,6 +2377,7 @@ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
mnt->mnt.mnt_flags = mnt_flags;
touch_mnt_namespace(mnt->mnt_ns);
unlock_mount_hash();
+ notify_mount(mnt, NULL, notify_mount_setattr, 0);
}

/*
@@ -2767,6 +2784,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
if (!xchg(&mnt->mnt_expiry_mark, 1) ||
propagate_mount_busy(mnt, 1))
continue;
+ notify_mount(mnt, NULL, notify_mount_expiry, 0);
list_move(&mnt->mnt_expire, &graveyard);
}
while (!list_empty(&graveyard)) {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 66c6e17e61e5..b0eb68ed5b9b 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -217,6 +217,7 @@ struct dentry_operations {

#define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */
#define DCACHE_DENTRY_CURSOR 0x20000000
+#define DCACHE_MOUNT_WATCH 0x40000000 /* There's a mount watch here */

extern seqlock_t rename_lock;

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 84b653874ab8..7db37c58289a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -913,6 +913,8 @@ asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags)
asmlinkage long sys_fsinfo(int dfd, const char __user *path,
struct fsinfo_params __user *params,
void __user *buffer, size_t buf_size);
+asmlinkage long sys_mount_notify(int dfd, const char __user *path,
+ unsigned int at_flags, int watch_fd, int watch_id);

/*
* Architecture-specific system calls
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index 3e0ab5fe388d..9d8e165e0065 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -103,4 +103,28 @@ struct key_notification {
__u32 aux; /* Per-type auxiliary data */
};

+/*
+ * Type of mount topology change notification, as carried in watch.subtype.
+ */
+enum mount_notification_subtype {
+ notify_mount_new_mount = 0, /* New mount added (WATCH_INFO_FLAG_0 set for a submount) */
+ notify_mount_unmount = 1, /* Mount removed manually (posted on the parent mount) */
+ notify_mount_expiry = 2, /* Automount expired */
+ notify_mount_readonly = 3, /* Mount R/O state changed (WATCH_INFO_FLAG_0 set if now R/O) */
+ notify_mount_setattr = 4, /* Mount attributes changed */
+ notify_mount_move_from = 5, /* Mount moved from here (posted on the old parent mount) */
+ notify_mount_move_to = 6, /* Mount moved to here (compare op_id; NOTE(review): no op_id field visible - confirm) */
+};
+
+/*
+ * Mount topology/configuration change notification record.
+ * - watch.type = WATCH_TYPE_MOUNT_NOTIFY
+ * - watch.subtype = enum mount_notification_subtype
+ */
+struct mount_notification {
+ struct watch_notification watch; /* WATCH_TYPE_MOUNT_NOTIFY; info = event flags | record size */
+ __u32 triggered_on; /* mnt_id of the mount the event was posted on */
+ __u32 changed_mount; /* mnt_id of the other mount involved, or 0 if none */
+};
+
#endif /* _UAPI_LINUX_WATCH_QUEUE_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index df556175be50..f608777be045 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -80,6 +80,9 @@ COND_SYSCALL(ioprio_get);
/* fs/locks.c */
COND_SYSCALL(flock);

+/* fs/mount_notify.c */
+COND_SYSCALL(mount_notify);
+
/* fs/namei.c */

/* fs/namespace.c */