Re: [PATCH 13/19] vfs: Add a mount-notification facility [ver #16]

From: Jann Horn
Date: Wed Feb 19 2020 - 17:40:41 EST


On Tue, Feb 18, 2020 at 6:07 PM David Howells <dhowells@xxxxxxxxxx> wrote:
> Add a mount notification facility whereby notifications about changes in
> mount topology and configuration can be received. Note that this only
> covers vfsmount topology changes and not superblock events. A separate
> facility will be added for that.
[...]
> @@ -70,9 +71,13 @@ struct mount {
> 	int mnt_id;			/* mount identifier */
> 	int mnt_group_id;		/* peer group identifier */
> 	int mnt_expiry_mark;		/* true if marked for expiry */
> +	int mnt_nr_watchers;		/* The number of subtree watches tracking this */

You never reference this variable anywhere else in the patch, and
unlike ->mnt_watchers below, it isn't gated by #ifdef
CONFIG_MOUNT_NOTIFICATIONS.
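
If it is meant to be used by a later patch in the series, it should
presumably at least be gated the same way as ->mnt_watchers, along
these lines (a sketch only, assuming the same config symbol):

#ifdef CONFIG_MOUNT_NOTIFICATIONS
	int mnt_nr_watchers;	/* The number of subtree watches tracking this */
#endif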

> 	struct hlist_head mnt_pins;
> 	struct hlist_head mnt_stuck_children;
> 	atomic_t mnt_change_counter;	/* Number of changes applied */
> +#ifdef CONFIG_MOUNT_NOTIFICATIONS
> +	struct watch_list *mnt_watchers; /* Watches on dentries within this mount */

Please document the lifetime semantics of this pointer. Something
like: "This pointer can't change once it has been set to a non-NULL
value".

> +#endif
> } __randomize_layout;
>
> #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
> @@ -155,18 +160,8 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
> 	return ns->seq == 0;
> }
>
> -/*
> - * Type of mount topology change notification.
> - */
> -enum mount_notification_subtype {
> -	NOTIFY_MOUNT_NEW_MOUNT	= 0, /* New mount added */
> -	NOTIFY_MOUNT_UNMOUNT	= 1, /* Mount removed manually */
> -	NOTIFY_MOUNT_EXPIRY	= 2, /* Automount expired */
> -	NOTIFY_MOUNT_READONLY	= 3, /* Mount R/O state changed */
> -	NOTIFY_MOUNT_SETATTR	= 4, /* Mount attributes changed */
> -	NOTIFY_MOUNT_MOVE_FROM	= 5, /* Mount moved from here */
> -	NOTIFY_MOUNT_MOVE_TO	= 6, /* Mount moved to here (compare op_id) */
> -};

Is there a reason why you first introduce these in "vfs: Add mount
change counter" and then move them elsewhere in this patch?

[...]
> @@ -174,4 +169,18 @@ static inline void notify_mount(struct mount *changed,
> 				u32 info_flags)
> {
> 	atomic_inc(&changed->mnt_change_counter);
> +
> +#ifdef CONFIG_MOUNT_NOTIFICATIONS
> +	{
> +		struct mount_notification n = {
> +			.watch.type	= WATCH_TYPE_MOUNT_NOTIFY,
> +			.watch.subtype	= subtype,
> +			.watch.info	= info_flags | watch_sizeof(n),
> +			.triggered_on	= changed->mnt_id,
> +			.changed_mount	= aux ? aux->mnt_id : 0,
> +		};
> +
> +		post_mount_notification(changed, &n);
> +	}
> +#endif
[...]
> +/*
> + * Post mount notifications to all watches going rootwards along the tree.
> + *
> + * Must be called with the mount_lock held.

Please put such constraints into lockdep assertions instead of
comments; that way, violations can actually be detected.
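
Something like this at the top of post_mount_notification() would do
it (a sketch; it assumes mount_lock is the seqlock_t defined in
fs/namespace.c, so lockdep can track its embedded spinlock):

	lockdep_assert_held(&mount_lock.lock);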

> + */
> +void post_mount_notification(struct mount *changed,
> +			     struct mount_notification *notify)
> +{
> +	const struct cred *cred = current_cred();
> +	struct path cursor;
> +	struct mount *mnt;
> +	unsigned seq;
> +
> +	seq = 0;
> +	rcu_read_lock();
> +restart:
> +	cursor.mnt = &changed->mnt;
> +	cursor.dentry = changed->mnt.mnt_root;
> +	mnt = real_mount(cursor.mnt);
> +	notify->watch.info &= ~NOTIFY_MOUNT_IN_SUBTREE;
> +
> +	read_seqbegin_or_lock(&rename_lock, &seq);
> +	for (;;) {
> +		if (mnt->mnt_watchers &&

This unlocked test should use READ_ONCE() to document that the value
being read can change concurrently.
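
A sketch of what that could look like, reusing the loaded pointer for
the post_watch_notification() call as well:

	struct watch_list *watchers = READ_ONCE(mnt->mnt_watchers);

	if (watchers && !hlist_empty(&watchers->watchers)) {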

> +		    !hlist_empty(&mnt->mnt_watchers->watchers)) {
> +			if (cursor.dentry->d_flags & DCACHE_MOUNT_WATCH)
> +				post_watch_notification(mnt->mnt_watchers,
> +							&notify->watch, cred,
> +							(unsigned long)cursor.dentry);
> +		} else {
> +			cursor.dentry = mnt->mnt.mnt_root;
> +		}
> +		notify->watch.info |= NOTIFY_MOUNT_IN_SUBTREE;
> +
> +		if (cursor.dentry == cursor.mnt->mnt_root ||
> +		    IS_ROOT(cursor.dentry)) {
> +			struct mount *parent = READ_ONCE(mnt->mnt_parent);
> +
> +			/* Escaped? */
> +			if (cursor.dentry != cursor.mnt->mnt_root)
> +				break;
> +
> +			/* Global root? */
> +			if (mnt == parent)
> +				break;
> +
> +			cursor.dentry = READ_ONCE(mnt->mnt_mountpoint);
> +			mnt = parent;
> +			cursor.mnt = &mnt->mnt;
> +		} else {
> +			cursor.dentry = cursor.dentry->d_parent;
> +		}
> +	}
> +
> +	if (need_seqretry(&rename_lock, seq)) {
> +		seq = 1;
> +		goto restart;
> +	}
> +
> +	done_seqretry(&rename_lock, seq);
> +	rcu_read_unlock();
> +}
> +
> +static void release_mount_watch(struct watch *watch)
> +{
> +	struct dentry *dentry = (struct dentry *)(unsigned long)watch->id;
> +
> +	dput(dentry);
> +}
> +
> +/**
> + * sys_watch_mount - Watch for mount topology/attribute changes
> + * @dfd: Base directory to pathwalk from or fd referring to mount.
> + * @filename: Path to mount to place the watch upon
> + * @at_flags: Pathwalk control flags
> + * @watch_fd: The watch queue to send notifications to.
> + * @watch_id: The watch ID to be placed in the notification (-1 to remove watch)
> + */
> +SYSCALL_DEFINE5(watch_mount,
> +		int, dfd,
> +		const char __user *, filename,
> +		unsigned int, at_flags,
> +		int, watch_fd,
> +		int, watch_id)
> +{
> +	struct watch_queue *wqueue;
> +	struct watch_list *wlist = NULL;
> +	struct watch *watch = NULL;
> +	struct mount *m;
> +	struct path path;
> +	unsigned int lookup_flags =
> +		LOOKUP_DIRECTORY | LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
> +	int ret;
> +
> +	if (watch_id < -1 || watch_id > 0xff)
> +		return -EINVAL;
> +	if ((at_flags & ~(AT_NO_AUTOMOUNT | AT_EMPTY_PATH)) != 0)
> +		return -EINVAL;
> +	if (at_flags & AT_NO_AUTOMOUNT)
> +		lookup_flags &= ~LOOKUP_AUTOMOUNT;
> +	if (at_flags & AT_EMPTY_PATH)
> +		lookup_flags |= LOOKUP_EMPTY;
> +
> +	ret = user_path_at(dfd, filename, lookup_flags, &path);
> +	if (ret)
> +		return ret;
> +
> +	ret = inode_permission(path.dentry->d_inode, MAY_EXEC);
> +	if (ret)
> +		goto err_path;
> +
> +	wqueue = get_watch_queue(watch_fd);
> +	if (IS_ERR(wqueue))
> +		goto err_path;
> +
> +	m = real_mount(path.mnt);
> +
> +	if (watch_id >= 0) {
> +		ret = -ENOMEM;
> +		if (!m->mnt_watchers) {

As above, this unlocked test should use READ_ONCE().
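
E.g. (sketch):

	if (!READ_ONCE(m->mnt_watchers)) {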

> +			wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
> +			if (!wlist)
> +				goto err_wqueue;
> +			init_watch_list(wlist, release_mount_watch);
> +		}
> +
> +		watch = kzalloc(sizeof(*watch), GFP_KERNEL);
> +		if (!watch)
> +			goto err_wlist;
> +
> +		init_watch(watch, wqueue);
> +		watch->id = (unsigned long)path.dentry;
> +		watch->info_id = (u32)watch_id << 24;
> +
> +		ret = security_watch_mount(watch, &path);
> +		if (ret < 0)
> +			goto err_watch;
> +
> +		down_write(&m->mnt.mnt_sb->s_umount);
> +		if (!m->mnt_watchers) {
> +			m->mnt_watchers = wlist;
> +			wlist = NULL;
> +		}
> +
> +		ret = add_watch_to_object(watch, m->mnt_watchers);

If another thread concurrently runs close(watch_fd) at this point,
pipe_release -> put_pipe_info -> free_pipe_info -> watch_queue_clear
will run, correct? And then watch_queue_clear() will find the watch
that we've just created and call its ->release_watch() handler, which
causes dput() on path.dentry? At that point, we no longer hold any
reference to the dentry...

> +		if (ret == 0) {
> +			spin_lock(&path.dentry->d_lock);
> +			path.dentry->d_flags |= DCACHE_MOUNT_WATCH;
> +			spin_unlock(&path.dentry->d_lock);
> +			dget(path.dentry);

... but then here we call dget() on it?


In general, the following pattern indicates a bug unless a surrounding
lock provides the necessary protection:

	ret = operation_that_hands_off_the_reference_on_success(ptr);
	if (ret == SUCCESS) {
		increment_refcount(ptr);
	}

and should be replaced with the following pattern:

	increment_refcount(ptr);
	ret = operation_that_hands_off_the_reference_on_success(ptr);
	if (ret == FAILURE) {
		decrement_refcount(ptr);
	}

> +			watch = NULL;
> +		}
> +		up_write(&m->mnt.mnt_sb->s_umount);
> +	} else {
> +		ret = -EBADSLT;
> +		if (m->mnt_watchers) {
> +			down_write(&m->mnt.mnt_sb->s_umount);
> +			ret = remove_watch_from_object(m->mnt_watchers, wqueue,
> +						       (unsigned long)path.dentry,
> +						       false);

What exactly is the implication of only using the dentry as the key
here (and not the mount)? Does this mean that if you install watches
on two bind mounts of the same underlying filesystem, the notification
mechanism gets confused?
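
For example (a hypothetical scenario; /a and /b are assumed to be bind
mounts of the same tree, so path.dentry resolves to the same object
for both, and wfd is a watch queue fd):

	syscall(__NR_watch_mount, AT_FDCWD, "/a/sub", 0, wfd, 1);
	syscall(__NR_watch_mount, AT_FDCWD, "/b/sub", 0, wfd, 2);

Both watches are then keyed by the same dentry pointer, and the
DCACHE_MOUNT_WATCH flag lives on that shared dentry rather than on
either mount.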

> +			up_write(&m->mnt.mnt_sb->s_umount);
> +		}
> +	}
> +
> +err_watch:
> +	kfree(watch);
> +err_wlist:
> +	kfree(wlist);
> +err_wqueue:
> +	put_watch_queue(wqueue);
> +err_path:
> +	path_put(&path);
> +	return ret;
> +}
[...]