On Tue, 16 Dec 2008 12:27:37 -0800
Jeremy Fitzhardinge <jeremy@xxxxxxxx> wrote:
[ Reviewers: This is in drivers/xen to keep it close to the code it is and will be using. Would people
prefer to see it in fs/xenfs? -J ]
Well it would be nice to keep filesystems under ./fs, but `grep -rl
register_filesystem .' says we screwed that pooch.
From: Alex Zeffertt <alex.zeffertt@xxxxxxxxxxxxx>
The xenfs filesystem exports various interfaces to usermode. Initially
this exports a file to allow usermode to interact with xenbus/xenstore.
Traditionally this appeared in /proc/xen. Rather than extending procfs,
this patch adds a backward-compat mountpoint on /proc/xen, and provides
a xenfs filesystem which can be mounted there.
...
--- /dev/null
+++ b/drivers/xen/xenfs/super.c
@@ -0,0 +1,65 @@
+/*
+ * xenfs.c - a filesystem for passing info between the a domain and
+ * the hypervisor.
+ *
+ * 2008-10-07 Alex Zeffertt Replaced /proc/xen/xenbus with xenfs filesystem
+ * and /proc/xen compatibility mount point.
+ * Turned xenfs into a loadable module.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#include "xenfs.h"
+
+#include <asm/xen/hypervisor.h>
+
+#define XENFS_MAGIC 0xabba1974
Move to include/linux/magic.h, please.
+MODULE_DESCRIPTION("Xen filesystem");
+MODULE_LICENSE("GPL");
MODULE_AUTHOR()?
./MAINTAINERS?
+static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ static struct tree_descr xenfs_files[] = {
+ [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR},
+ {""},
+ };
+
+ return simple_fill_super(sb, XENFS_MAGIC, xenfs_files);
+}
+
...
+struct xenbus_transaction_holder {
+ struct list_head list;
+ struct xenbus_transaction handle;
+};
+
+struct read_buffer {
+ struct list_head list;
+ unsigned int cons;
+ unsigned int len;
+ char msg[];
+};
+struct xenbus_file_priv {
+ /* In-progress transaction. */
+ struct list_head transactions;
+
+ /* Active watches. */
+ struct list_head watches;
+
+ /* Partial request. */
+ unsigned int len;
+ union {
+ struct xsd_sockmsg msg;
+ char buffer[PAGE_SIZE];
+ } u;
+
+ /* Response queue. */
+ struct list_head read_buffers;
+ wait_queue_head_t read_waitq;
+
+ struct mutex reply_mutex;
+};
It would be useful to document the locking for the list_head's (at least).
+
+static ssize_t xenbus_file_read(struct file *filp,
+ char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct xenbus_file_priv *u = filp->private_data;
+ struct read_buffer *rb;
+ int i, ret;
+
+ mutex_lock(&u->reply_mutex);
+ while (list_empty(&u->read_buffers)) {
+ mutex_unlock(&u->reply_mutex);
+ ret = wait_event_interruptible(u->read_waitq,
+ !list_empty(&u->read_buffers));
+ if (ret)
+ return ret;
+ mutex_lock(&u->reply_mutex);
+ }
+
+ rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+ for (i = 0; i < len;) {
+ ret = put_user(rb->msg[rb->cons], ubuf + i);
So mmap_sem nests inside ->reply_mutex. Has this all been carefully
tested with lockdep?
+ if (ret) {
+ mutex_unlock(&u->reply_mutex);
+ return ret;
+ }
+ i++;
+ rb->cons++;
+ if (rb->cons == rb->len) {
+ list_del(&rb->list);
+ kfree(rb);
+ if (list_empty(&u->read_buffers))
+ break;
+ rb = list_entry(u->read_buffers.next,
+ struct read_buffer, list);
+ }
+ }
This code will misbehave if it ever receives a read_buffer with len==0.
+ mutex_unlock(&u->reply_mutex);
+
+ return i;
+}
+
+static int queue_reply(struct list_head *list, char *data, unsigned int len)
+{
+ struct read_buffer *rb;
+
+ if (len == 0)
+ return 0;
That should fix it.
+ rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
+ if (rb == NULL)
+ return -ENOMEM;
+
+ rb->cons = 0;
+ rb->len = len;
+
+ memcpy(rb->msg, data, len);
+
+ list_add_tail(&rb->list, list);
So the caller of this function must hold ->reply_mutex.
You thought that was secret but I found it out!
Missing.+ return 0;
+}
+
+/* Free all the read_buffer s on a list.
+ * Caller must have sole reference to list.
+ */
+static void queue_cleanup(struct list_head *list)
+{
+ struct read_buffer *rb;
+
+ while (!list_empty(list)) {
+ rb = list_entry(list->next, struct read_buffer, list);
+ list_del(list->next);
+ kfree(rb);
+ }
+}
+
+struct watch_adapter {
+ struct list_head list;
locking is?
+ struct xenbus_watch watch;
+ struct xenbus_file_priv *dev_data;
+ char *token;
+};
+
...
+static LIST_HEAD(watch_list);
+
+static ssize_t xenbus_file_write(struct file *filp,
+ const char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct xenbus_file_priv *u = filp->private_data;
+ struct xenbus_transaction_holder *trans = NULL;
+ uint32_t msg_type;
+ void *reply;
+ char *path, *token;
+ int err, rc = len;
+ int ret;
+ LIST_HEAD(staging_q);
+
+ if ((len + u->len) > sizeof(u->u.buffer)) {
+ rc = -EINVAL;
+ goto out;
+ }
Now what's this doing?
One would expect a large write to get chunked up into smaller writes by
the kernel.
This code is poorly documented.
+ if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ u->len += len;
+ if ((u->len < sizeof(u->u.msg)) ||
+ (u->len < (sizeof(u->u.msg) + u->u.msg.len)))
+ return rc;
+
+ msg_type = u->u.msg.type;
+
+ switch (msg_type) {
+ case XS_TRANSACTION_START:
+ case XS_TRANSACTION_END:
+ case XS_DIRECTORY:
+ case XS_READ:
+ case XS_GET_PERMS:
+ case XS_RELEASE:
+ case XS_GET_DOMAIN_PATH:
+ case XS_WRITE:
+ case XS_MKDIR:
+ case XS_RM:
+ case XS_SET_PERMS:
+ if (msg_type == XS_TRANSACTION_START) {
+ trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+ if (!trans) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+
+ reply = xenbus_dev_request_and_reply(&u->u.msg);
+ if (IS_ERR(reply)) {
+ kfree(trans);
+ rc = PTR_ERR(reply);
+ goto out;
+ }
+
+ if (msg_type == XS_TRANSACTION_START) {
+ trans->handle.id = simple_strtoul(reply, NULL, 0);
+ list_add(&trans->list, &u->transactions);
+ } else if (msg_type == XS_TRANSACTION_END) {
+ list_for_each_entry(trans, &u->transactions, list)
+ if (trans->handle.id == u->u.msg.tx_id)
+ break;
+ BUG_ON(&trans->list == &u->transactions);
+ list_del(&trans->list);
+ kfree(trans);
+ }
+ mutex_lock(&u->reply_mutex);
+ ret = queue_reply(&staging_q,
+ (char *)&u->u.msg, sizeof(u->u.msg));
+ if (!ret)
+ ret = queue_reply(&staging_q,
+ (char *)reply, u->u.msg.len);
+ if (!ret) {
+ list_splice_tail(&staging_q, &u->read_buffers);
+ wake_up(&u->read_waitq);
+ } else {
+ queue_cleanup(&staging_q);
+ rc = ret;
+ }
+ mutex_unlock(&u->reply_mutex);
+ kfree(reply);
+ break;
+
+ case XS_WATCH:
+ case XS_UNWATCH: {
+ struct watch_adapter *watch, *tmp_watch;
+ static const char XS_RESP[] = "OK";
+ struct xsd_sockmsg hdr;
+
+ path = u->u.buffer + sizeof(u->u.msg);
+ token = memchr(path, 0, u->u.msg.len);
+ if (token == NULL) {
+ rc = -EILSEQ;
+ goto out;
+ }
+ token++;
+
+ if (msg_type == XS_WATCH) {
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (watch == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ watch->watch.node =
+ kmalloc(strlen(path)+1, GFP_KERNEL);
+ if (watch->watch.node == NULL) {
+ kfree(watch);
+ rc = -ENOMEM;
+ goto out;
+ }
+ strcpy((char *)watch->watch.node, path);
+ watch->watch.callback = watch_fired;
+ watch->token =
+ kmalloc(strlen(token)+1, GFP_KERNEL);
+ if (watch->token == NULL) {
+ kfree(watch->watch.node);
+ kfree(watch);
+ rc = -ENOMEM;
+ goto out;
+ }
+ strcpy(watch->token, token);
+ watch->dev_data = u;
+
+ err = register_xenbus_watch(&watch->watch);
+ if (err) {
+ free_watch_adapter(watch);
+ rc = err;
+ goto out;
+ }
+
+ list_add(&watch->list, &u->watches);
+ } else {
+ list_for_each_entry_safe(watch, tmp_watch,
+ &u->watches, list) {
+ if (!strcmp(watch->token, token) &&
+ !strcmp(watch->watch.node, path)) {
+ unregister_xenbus_watch(&watch->watch);
+ list_del(&watch->list);
+ free_watch_adapter(watch);
+ break;
+ }
+ }
+ }
+
+ hdr.type = msg_type;
+ hdr.len = sizeof(XS_RESP);
+ mutex_lock(&u->reply_mutex);
+ ret = queue_reply(&staging_q, (char *)&hdr, sizeof(hdr));
+ if (!ret)
+ ret = queue_reply(&staging_q, (char *)XS_RESP, hdr.len);
+ if (!ret) {
+ list_splice_tail(&staging_q, &u->read_buffers);
+ wake_up(&u->read_waitq);
+ } else {
+ queue_cleanup(&staging_q);
+ rc = ret;
+ }
+ mutex_unlock(&u->reply_mutex);
+ break;
+ }
+
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ out:
+ u->len = 0;
+ return rc;
+}
This function implements a new kernel ABI. How are reviewers to review
the design of this proposed ABI?
...
+static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
+{
+ struct xenbus_file_priv *u = file->private_data;
+
+ poll_wait(file, &u->read_waitq, wait);
+ if (!list_empty(&u->read_buffers))
+ return POLLIN | POLLRDNORM;
I wonder if we needed the lock or some open-coded barrier to safely run
list_empty().