Re: [PATCH 1/7] General notification queue with user mmap()'able ring buffer

From: Greg KH
Date: Tue May 28 2019 - 12:31:22 EST


On Tue, May 28, 2019 at 05:01:55PM +0100, David Howells wrote:
> Implement a misc device that implements a general notification queue as a
> ring buffer that can be mmap()'d from userspace.

"general" but just for filesystems, right? :(

> Each entry has a 1-slot header that describes it:
>
> struct watch_notification {
> __u32 type:24;
> __u32 subtype:8;
> __u32 info;
> };

This doesn't match the structure definition in the documentation, so
something is out of sync.

> The type indicates the source (eg. mount tree changes, superblock events,
> keyring changes, block layer events) and the subtype indicates the event
> type (eg. mount, unmount; EIO, EDQUOT; link, unlink). The info field
> indicates a number of things, including the entry length, an ID assigned to
> a watchpoint contributing to this buffer, type-specific flags and meta
> flags, such as an overrun indicator.
>
> Supplementary data, such as the key ID that generated an event, are
> attached in additional slots.

I'm all for a "generic" event system for the kernel (heck, Solaris has
had one for decades), but it keeps getting shot down every time it comes
up. What is different about this one?

> --- a/drivers/misc/Kconfig
> +++ b/drivers/misc/Kconfig
> @@ -4,6 +4,19 @@
>
> menu "Misc devices"
>
> +config WATCH_QUEUE
> + bool "Mappable notification queue"
> + default n

Nit, not needed.

> + depends on MMU
> + help
> + This is a general notification queue for the kernel to pass events to
> + userspace through a mmap()'able ring buffer. It can be used in
> + conjunction with watches for mount topology change notifications,
> + superblock change notifications and key/keyring change notifications.
> +
> + Note that in theory this should work fine with NOMMU, but I'm not
> + sure how to make that work.
> +
> config SENSORS_LIS3LV02D
> tristate
> depends on INPUT
> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
> index b9affcdaa3d6..bf16acd9f8cc 100644
> --- a/drivers/misc/Makefile
> +++ b/drivers/misc/Makefile
> @@ -3,6 +3,7 @@
> # Makefile for misc devices that really don't fit anywhere else.
> #
>
> +obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
> obj-$(CONFIG_IBM_ASM) += ibmasm/
> obj-$(CONFIG_IBMVMC) += ibmvmc.o
> obj-$(CONFIG_AD525X_DPOT) += ad525x_dpot.o
> diff --git a/drivers/misc/watch_queue.c b/drivers/misc/watch_queue.c
> new file mode 100644
> index 000000000000..39a09ea15d97
> --- /dev/null
> +++ b/drivers/misc/watch_queue.c
> @@ -0,0 +1,877 @@
> +/* User-mappable watch queue
> + *
> + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.

You didn't touch the code this year?

> + * Written by David Howells (dhowells@xxxxxxxxxx)
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public Licence
> + * as published by the Free Software Foundation; either version
> + * 2 of the Licence, or (at your option) any later version.

Please drop the boiler plate text and use a SPDX tag, checkpatch should
have caught this. I don't want to have to go and change it again.

> + *
> + * See Documentation/watch_queue.rst
> + */
> +
> +#define pr_fmt(fmt) "watchq: " fmt
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/printk.h>
> +#include <linux/miscdevice.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include <linux/pagemap.h>
> +#include <linux/poll.h>
> +#include <linux/uaccess.h>
> +#include <linux/vmalloc.h>
> +#include <linux/file.h>
> +#include <linux/security.h>
> +#include <linux/cred.h>
> +#include <linux/watch_queue.h>
> +
> +#define DEBUG_WITH_WRITE /* Allow use of write() to record notifications */

debugging code left in?

> +
> +MODULE_DESCRIPTION("Watch queue");
> +MODULE_AUTHOR("Red Hat, Inc.");
> +MODULE_LICENSE("GPL");
> +
> +struct watch_type_filter {
> + enum watch_notification_type type;
> + __u32 subtype_filter[1]; /* Bitmask of subtypes to filter on */
> + __u32 info_filter; /* Filter on watch_notification::info */
> + __u32 info_mask; /* Mask of relevant bits in info_filter */
> +};
> +
> +struct watch_filter {
> + union {
> + struct rcu_head rcu;
> + unsigned long type_filter[2]; /* Bitmask of accepted types */
> + };
> + u32 nr_filters; /* Number of filters */
> + struct watch_type_filter filters[];
> +};
> +
> +struct watch_queue {
> + struct rcu_head rcu;
> + struct address_space mapping;
> + const struct cred *cred; /* Creds of the owner of the queue */
> + struct watch_filter __rcu *filter;
> + wait_queue_head_t waiters;
> + struct hlist_head watches; /* Contributory watches */
> + refcount_t usage;

Usage of what, this structure? Or something else?

> + spinlock_t lock;
> + bool defunct; /* T when queues closed */
> + u8 nr_pages; /* Size of pages[] */
> + u8 flag_next; /* Flag to apply to next item */
> +#ifdef DEBUG_WITH_WRITE
> + u8 debug;
> +#endif
> + u32 size;
> + struct watch_queue_buffer *buffer; /* Pointer to first record */
> +
> + /* The mappable pages. The zeroth page holds the ring pointers. */
> + struct page **pages;
> +};


> +EXPORT_SYMBOL(__post_watch_notification);

_GPL for new apis? (I have to ask...)

> +static long watch_queue_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> + struct watch_queue *wqueue = file->private_data;
> + struct inode *inode = file_inode(file);
> + long ret;
> +
> + switch (cmd) {
> + case IOC_WATCH_QUEUE_SET_SIZE:
> + if (wqueue->buffer)
> + return -EBUSY;
> + inode_lock(inode);
> + ret = watch_queue_set_size(wqueue, arg);
> + inode_unlock(inode);
> + return ret;
> +
> + case IOC_WATCH_QUEUE_SET_FILTER:
> + inode_lock(inode);
> + ret = watch_queue_set_filter(
> + inode, wqueue,
> + (struct watch_notification_filter __user *)arg);
> + inode_unlock(inode);
> + return ret;
> +
> + default:
> + return -EOPNOTSUPP;

-ENOTTY is the correct "not a valid ioctl" error value, right?

> + }
> +}

> +/**
> + * put_watch_queue - Dispose of a ref on a watchqueue.
> + * @wqueue: The watch queue to unref.
> + */
> +void put_watch_queue(struct watch_queue *wqueue)
> +{
> + if (refcount_dec_and_test(&wqueue->usage))
> + kfree_rcu(wqueue, rcu);

Why not just use a kref?

> +}
> +EXPORT_SYMBOL(put_watch_queue);


> +int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
> +{
> + struct watch_queue *wqueue = rcu_access_pointer(watch->queue);
> + struct watch *w;
> +
> + hlist_for_each_entry(w, &wlist->watchers, list_node) {
> + if (watch->id == w->id)
> + return -EBUSY;
> + }
> +
> + rcu_assign_pointer(watch->watch_list, wlist);
> +
> + spin_lock_bh(&wqueue->lock);
> + refcount_inc(&wqueue->usage);
> + hlist_add_head(&watch->queue_node, &wqueue->watches);
> + spin_unlock_bh(&wqueue->lock);
> +
> + hlist_add_head(&watch->list_node, &wlist->watchers);
> + return 0;
> +}
> +EXPORT_SYMBOL(add_watch_to_object);

Naming nit, shouldn't the "prefix" all be the same for these new
functions?

watch_queue_add_object()? watch_queue_put()? And so on?

> +static int __init watch_queue_init(void)
> +{
> + int ret;
> +
> + ret = misc_register(&watch_queue_dev);
> + if (ret < 0)
> + pr_err("Failed to register %d\n", ret);
> + return ret;
> +}
> +fs_initcall(watch_queue_init);
> +
> +static void __exit watch_queue_exit(void)
> +{
> + misc_deregister(&watch_queue_dev);
> +}
> +module_exit(watch_queue_exit);

module_misc_device()?


> --- /dev/null
> +++ b/include/linux/watch_queue.h
> @@ -0,0 +1,86 @@
> +/* User-mappable watch queue
> + *
> + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
> + * Written by David Howells (dhowells@xxxxxxxxxx)
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public Licence
> + * as published by the Free Software Foundation; either version
> + * 2 of the Licence, or (at your option) any later version.

Again, SPDX headers please.

> --- /dev/null
> +++ b/include/uapi/linux/watch_queue.h
> @@ -0,0 +1,82 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */

Yeah!!!

No copyright? :(

> +#ifndef _UAPI_LINUX_WATCH_QUEUE_H
> +#define _UAPI_LINUX_WATCH_QUEUE_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#define IOC_WATCH_QUEUE_SET_SIZE _IO('s', 0x01) /* Set the size in pages */
> +#define IOC_WATCH_QUEUE_SET_FILTER _IO('s', 0x02) /* Set the filter */
> +
> +enum watch_notification_type {
> + WATCH_TYPE_META = 0, /* Special record */
> + WATCH_TYPE_MOUNT_NOTIFY = 1, /* Mount notification record */
> + WATCH_TYPE_SB_NOTIFY = 2, /* Superblock notification */
> + WATCH_TYPE_KEY_NOTIFY = 3, /* Key/keyring change notification */
> + WATCH_TYPE_BLOCK_NOTIFY = 4, /* Block layer notifications */
> +#define WATCH_TYPE___NR 5
> +};
> +
> +enum watch_meta_notification_subtype {
> + WATCH_META_SKIP_NOTIFICATION = 0, /* Just skip this record */
> + WATCH_META_REMOVAL_NOTIFICATION = 1, /* Watched object was removed */
> +};
> +
> +/*
> + * Notification record
> + */
> +struct watch_notification {
> + __u32 type:24; /* enum watch_notification_type */
> + __u32 subtype:8; /* Type-specific subtype (filterable) */
> + __u32 info;
> +#define WATCH_INFO_OVERRUN 0x00000001 /* Event(s) lost due to overrun */
> +#define WATCH_INFO_ENOMEM 0x00000002 /* Event(s) lost due to ENOMEM */
> +#define WATCH_INFO_RECURSIVE 0x00000004 /* Change was recursive */
> +#define WATCH_INFO_LENGTH 0x000001f8 /* Length of record / sizeof(watch_notification) */
> +#define WATCH_INFO_IN_SUBTREE 0x00000200 /* Change was not at watched root */
> +#define WATCH_INFO_TYPE_FLAGS 0x00ff0000 /* Type-specific flags */
> +#define WATCH_INFO_FLAG_0 0x00010000
> +#define WATCH_INFO_FLAG_1 0x00020000
> +#define WATCH_INFO_FLAG_2 0x00040000
> +#define WATCH_INFO_FLAG_3 0x00080000
> +#define WATCH_INFO_FLAG_4 0x00100000
> +#define WATCH_INFO_FLAG_5 0x00200000
> +#define WATCH_INFO_FLAG_6 0x00400000
> +#define WATCH_INFO_FLAG_7 0x00800000
> +#define WATCH_INFO_ID 0xff000000 /* ID of watchpoint */
> +};
> +
> +#define WATCH_LENGTH_SHIFT 3
> +
> +struct watch_queue_buffer {
> + union {
> + /* The first few entries are special, containing the
> + * ring management variables.
> + */
> + struct {
> + struct watch_notification watch; /* WATCH_TYPE_SKIP */
> + volatile __u32 head; /* Ring head index */
> + volatile __u32 tail; /* Ring tail index */

A uapi structure that has volatile in it? Are you _SURE_ this is
correct?

That feels wrong to me... This is not a backing-hardware register, it's
"just memory" and slapping volatile on it shouldn't be the correct
solution for telling the compiler to not to optimize away reads/flushes,
right? You need a proper memory access type primitive for that to work
correctly everywhere I thought.

We only have 2 users of volatile in include/uapi, one for WMI structures
that are backed by firmware (seems correct), and one for DRM which I
have no idea how it works as it claims to be a lock. Why is this new
addition the correct way to do this that no other ring-buffer that was
mmapped has needed to?

thanks,

greg k-h