[PATCH] vfio-pci: Avoid recursive read-lock usage
From: Alex Williamson
Date: Wed Aug 05 2020 - 14:01:21 EST
A down_read on memory_lock is held when performing read/write accesses
to MMIO BAR space, including across the copy_to/from_user() callouts
which may fault. If the user buffer for these copies resides in an
mmap of device MMIO space, the mmap fault handler will acquire a
recursive read-lock on memory_lock. Avoid this by reducing the lock
granularity. Sequential accesses requiring multiple ioread/iowrite
cycles are expected to be rare, therefore typical accesses should not
see additional overhead.
VGA MMIO accesses are expected to be non-fatal regardless of the PCI
memory enable bit to allow legacy probing, this behavior remains with
a comment added. ioeventfds are now included in memory access testing,
with writes dropped while memory space is disabled.
Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory")
Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
---
drivers/vfio/pci/vfio_pci_private.h | 2 +
drivers/vfio/pci/vfio_pci_rdwr.c | 120 ++++++++++++++++++++++++++++-------
2 files changed, 98 insertions(+), 24 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 86a02aff8735..61ca8ab165dc 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -33,12 +33,14 @@
struct vfio_pci_ioeventfd {
struct list_head next;
+ struct vfio_pci_device *vdev;
struct virqfd *virqfd;
void __iomem *addr;
uint64_t data;
loff_t pos;
int bar;
int count;
+ bool test_mem;
};
struct vfio_pci_irq_ctx {
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 916b184df3a5..9e353c484ace 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -37,17 +37,70 @@
#define vfio_ioread8 ioread8
#define vfio_iowrite8 iowrite8
+#define VFIO_IOWRITE(size) \
+static int vfio_pci_iowrite##size(struct vfio_pci_device *vdev, \
+ bool test_mem, u##size val, void __iomem *io) \
+{ \
+ if (test_mem) { \
+ down_read(&vdev->memory_lock); \
+ if (!__vfio_pci_memory_enabled(vdev)) { \
+ up_read(&vdev->memory_lock); \
+ return -EIO; \
+ } \
+ } \
+ \
+ vfio_iowrite##size(val, io); \
+ \
+ if (test_mem) \
+ up_read(&vdev->memory_lock); \
+ \
+ return 0; \
+}
+
+VFIO_IOWRITE(8)
+VFIO_IOWRITE(16)
+VFIO_IOWRITE(32)
+#ifdef iowrite64
+VFIO_IOWRITE(64)
+#endif
+
+#define VFIO_IOREAD(size) \
+static int vfio_pci_ioread##size(struct vfio_pci_device *vdev, \
+ bool test_mem, u##size *val, void __iomem *io) \
+{ \
+ if (test_mem) { \
+ down_read(&vdev->memory_lock); \
+ if (!__vfio_pci_memory_enabled(vdev)) { \
+ up_read(&vdev->memory_lock); \
+ return -EIO; \
+ } \
+ } \
+ \
+ *val = vfio_ioread##size(io); \
+ \
+ if (test_mem) \
+ up_read(&vdev->memory_lock); \
+ \
+ return 0; \
+}
+
+VFIO_IOREAD(8)
+VFIO_IOREAD(16)
+VFIO_IOREAD(32)
+
/*
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
* range which is inaccessible. The excluded range drops writes and fills
* reads with -1. This is intended for handling MSI-X vector tables and
* leftover space for ROM BARs.
*/
-static ssize_t do_io_rw(void __iomem *io, char __user *buf,
+static ssize_t do_io_rw(struct vfio_pci_device *vdev, bool test_mem,
+ void __iomem *io, char __user *buf,
loff_t off, size_t count, size_t x_start,
size_t x_end, bool iswrite)
{
ssize_t done = 0;
+ int ret;
while (count) {
size_t fillable, filled;
@@ -66,9 +119,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf,
if (copy_from_user(&val, buf, 4))
return -EFAULT;
- vfio_iowrite32(val, io + off);
+ ret = vfio_pci_iowrite32(vdev, test_mem,
+ val, io + off);
+ if (ret)
+ return ret;
} else {
- val = vfio_ioread32(io + off);
+ ret = vfio_pci_ioread32(vdev, test_mem,
+ &val, io + off);
+ if (ret)
+ return ret;
if (copy_to_user(buf, &val, 4))
return -EFAULT;
@@ -82,9 +141,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf,
if (copy_from_user(&val, buf, 2))
return -EFAULT;
- vfio_iowrite16(val, io + off);
+ ret = vfio_pci_iowrite16(vdev, test_mem,
+ val, io + off);
+ if (ret)
+ return ret;
} else {
- val = vfio_ioread16(io + off);
+ ret = vfio_pci_ioread16(vdev, test_mem,
+ &val, io + off);
+ if (ret)
+ return ret;
if (copy_to_user(buf, &val, 2))
return -EFAULT;
@@ -98,9 +163,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf,
if (copy_from_user(&val, buf, 1))
return -EFAULT;
- vfio_iowrite8(val, io + off);
+ ret = vfio_pci_iowrite8(vdev, test_mem,
+ val, io + off);
+ if (ret)
+ return ret;
} else {
- val = vfio_ioread8(io + off);
+ ret = vfio_pci_ioread8(vdev, test_mem,
+ &val, io + off);
+ if (ret)
+ return ret;
if (copy_to_user(buf, &val, 1))
return -EFAULT;
@@ -178,14 +249,6 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
count = min(count, (size_t)(end - pos));
- if (res->flags & IORESOURCE_MEM) {
- down_read(&vdev->memory_lock);
- if (!__vfio_pci_memory_enabled(vdev)) {
- up_read(&vdev->memory_lock);
- return -EIO;
- }
- }
-
if (bar == PCI_ROM_RESOURCE) {
/*
* The ROM can fill less space than the BAR, so we start the
@@ -213,7 +276,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
x_end = vdev->msix_offset + vdev->msix_size;
}
- done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite);
+ done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
+ count, x_start, x_end, iswrite);
if (done >= 0)
*ppos += done;
@@ -221,9 +285,6 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
if (bar == PCI_ROM_RESOURCE)
pci_unmap_rom(pdev, io);
out:
- if (res->flags & IORESOURCE_MEM)
- up_read(&vdev->memory_lock);
-
return done;
}
@@ -278,7 +339,12 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
return ret;
}
- done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite);
+ /*
+ * VGA MMIO is a legacy, non-BAR resource that hopefully allows
+ * probing, so we don't currently worry about access in relation
+ * to the memory enable bit in the command register.
+ */
+ done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite);
vga_put(vdev->pdev, rsrc);
@@ -296,17 +362,21 @@ static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
switch (ioeventfd->count) {
case 1:
- vfio_iowrite8(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite8(ioeventfd->vdev, ioeventfd->test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
case 2:
- vfio_iowrite16(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite16(ioeventfd->vdev, ioeventfd->test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
case 4:
- vfio_iowrite32(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite32(ioeventfd->vdev, ioeventfd->test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
#ifdef iowrite64
case 8:
- vfio_iowrite64(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite64(ioeventfd->vdev, ioeventfd->test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
#endif
}
@@ -378,11 +448,13 @@ long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
goto out_unlock;
}
+ ioeventfd->vdev = vdev;
ioeventfd->addr = vdev->barmap[bar] + pos;
ioeventfd->data = data;
ioeventfd->pos = pos;
ioeventfd->bar = bar;
ioeventfd->count = count;
+ ioeventfd->test_mem = vdev->pdev->resource[bar].flags & IORESOURCE_MEM;
ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler,
NULL, NULL, &ioeventfd->virqfd, fd);