For virtualization, we've developed virtio_ring for efficient communication.
This would also work well for userspace-kernel communication, particularly
for things like the tun device. By using the same ABI, we can join guests
to the host kernel trivially.
These patches are fairly alpha; I've seen some network stalls I have to
track down and there are some fixmes.
Comments welcome!
Rusty.
diff -r 99132ad16999 Documentation/test_vring.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Documentation/test_vring.c Sat Apr 05 21:31:40 2008 +1100
@@ -0,0 +1,47 @@
+#include <unistd.h>
+#include <linux/virtio_ring.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <err.h>
+#include <poll.h>
+
+#ifndef __NR_vringfd
+#define __NR_vringfd 327
+#endif
+
+int main()
+{
+ int fd, r;
+ struct vring vr;
+ uint16_t used = 0;
+ struct pollfd pfd;
+ void *buf = calloc(vring_size(256, getpagesize()), 0);
+ vring_init(&vr, 256, buf, getpagesize());
+
+ fd = syscall(__NR_vringfd, buf, 256, &used);
+ if (fd < 0)
+ err(1, "vringfd gave %i", fd);
+
+ pfd.fd = fd;
+ pfd.events = POLLIN;
+ r = poll(&pfd, 1, 0);
+
+ if (r != 0)
+ err(1, "poll gave %i", r);
+
+ vr.used->idx++;
+ r = poll(&pfd, 1, 0);
+
+ if (r != 1)
+ err(1, "poll after buf used gave %i", r);
+
+ used++;
+ r = poll(&pfd, 1, 0);
+
+ if (r != 0)
+ err(1, "poll after used incremented gave %i", r);
+ close(fd);
+ return 0;
+}
diff -r 99132ad16999 arch/x86/kernel/syscall_table_32.S
--- a/arch/x86/kernel/syscall_table_32.S Sat Apr 05 21:20:32 2008 +1100
+++ b/arch/x86/kernel/syscall_table_32.S Sat Apr 05 21:31:40 2008 +1100
@@ -326,3 +326,4 @@ ENTRY(sys_call_table)
.long sys_fallocate
.long sys_timerfd_settime /* 325 */
.long sys_timerfd_gettime
+ .long sys_vringfd
diff -r 99132ad16999 fs/Kconfig
--- a/fs/Kconfig Sat Apr 05 21:20:32 2008 +1100
+++ b/fs/Kconfig Sat Apr 05 21:31:40 2008 +1100
@@ -2135,4 +2135,14 @@ source "fs/nls/Kconfig"
source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
+config VRINGFD
+ bool "vring fd support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ vring is a ringbuffer implementation for efficient I/O. It is
+ currently used by virtualization hosts (lguest, kvm) for efficient
+ networking using the tun driver.
+
+ If unsure, say N.
+
+/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */
+int vring_get_buffer(struct vring_info *vr,
+ struct iovec *in_iov,
+ unsigned int *num_in, unsigned long *in_len,
+ struct iovec *out_iov,
+ unsigned int *num_out, unsigned long *out_len)
+{
+void vring_used_buffer_atomic(struct vring_info *vr, int id, u32 len)
+{
+ struct vring_used_elem *used;
+
+ BUG_ON(id <= 0 || id > vr->ring.num);
+ BUG_ON(!vr->used);
+
+ used = &vr->used->ring[vr->used->idx & vr->mask];
+ used->id = id - 1;
+ used->len = len;
+ /* Make sure buffer is written before we update index. */
+ wmb();
+ vr->used->idx++;
+}
+EXPORT_SYMBOL_GPL(vring_used_buffer_atomic);
+
+void vring_wake(struct vring_info *vr)
+{
+ wake_up(&vr->poll_wait);
+}
+EXPORT_SYMBOL_GPL(vring_wake);
+
+struct vring_info *vring_attach(int fd, const struct vring_ops *ops,
+ void *data, bool atomic_use)
+{
+ struct file *filp;
+ struct vring_info *vr;
+
+ /* Must be a valid fd, and must be one of ours. */
+ filp = fget(fd);
+ if (!filp) {
+ vr = ERR_PTR(-EBADF);
+ goto out;
+ }
+
+ if (filp->f_op != &vring_fops) {
+ vr = ERR_PTR(-EBADF);
+ goto fput;
+ }
+
+ /* Mutex simply protects against parallel vring_attach. */
+ mutex_lock(&vring_lock);
+ vr = filp->private_data;
+ if (vr->ops) {
+ vr = ERR_PTR(-EBUSY);
+ goto unlock;
+ }
+
+ /* If they want to use atomically, we have to map the page. */
+ if (atomic_use) {
+ if (get_user_pages(current, current->mm,
+ (unsigned long)vr->ring.used, 1, 1, 1,
+ &vr->used_page, NULL) != 1) {
+ vr = ERR_PTR(-EFAULT);
+ goto unlock;
+ }