[PATCH 7/9] vfio/pci: Support mmap() of a VFIO DMABUF

From: Matt Evans

Date: Thu Apr 16 2026 - 09:21:36 EST


A VFIO DMABUF can export a subset of a BAR to userspace by fd; add
support for mmap() of this fd. This provides another route for a
process to map BARs, except one where the process can only map a specific
subset of a BAR represented by the exported DMABUF.

mmap() support enables userspace driver designs that safely delegate
access to BAR sub-ranges to other client processes by sharing a DMABUF
fd, without having to share the (omnipotent) VFIO device fd with them.

Since the main VFIO BAR mmap() is now DMABUF-aware, this path reuses
the existing vm_ops. But, since the lifecycle of an exported DMABUF
is still decoupled from that of the device fd it came from, the device
fd might now be closed concurrent with a VMA fault.

Extra synchronisation is added to deal with the possibility of a fault
racing with the DMABUF cleanup path. (Note that this differs from a
DMABUF implicitly created on the mmap() path, which holds ownership of
the device fd and so prevents close-during-fault scenarios in order to
maintain the same user-facing behaviour on close.) It does this by
temporarily taking a VFIO device registration to ensure vdev remains
valid, then vdev->memory_lock can be taken.

Signed-off-by: Matt Evans <mattev@xxxxxxxx>
---
drivers/vfio/pci/vfio_pci_core.c | 79 ++++++++++++++++++++++++++----
drivers/vfio/pci/vfio_pci_dmabuf.c | 28 +++++++++++
drivers/vfio/pci/vfio_pci_priv.h | 2 +
3 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 464b63585bef..cad126cf8737 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -12,6 +12,8 @@

#include <linux/aperture.h>
#include <linux/device.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
@@ -1703,20 +1705,76 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
vm_fault_t ret = VM_FAULT_SIGBUS;

/*
- * We can rely on the existence of both a DMABUF (priv) and
- * the VFIO device it was exported from (vdev). This fault's
- * VMA was established using vfio_pci_core_mmap_prep_dmabuf()
- * which transfers ownership of the VFIO device fd to the
- * DMABUF, and so the VFIO device is held open because the
- * VMA's vm_file (DMABUF) is open.
+ * The only thing this can rely on is that the DMABUF relating
+ * to the VMA's vm_file exists (priv).
*
- * Since vfio_pci_dma_buf_cleanup() cannot have happened,
- * vdev must be valid; we can take memory_lock.
+ * A DMABUF for a VFIO device fd mmap() holds a reference to
+ * the original VFIO device fd, but an explicitly-exported
+ * DMABUF does not. The original fd might have closed,
+ * meaning this fault can race with
+ * vfio_pci_dma_buf_cleanup(), meaning priv->vdev might be
+ * NULL, and the VFIO device registration might have been
+ * dropped.
+ *
+ * With the goal of taking vdev->memory_lock in a world where
+ * vdev might not still exist:
+ *
+ * 1. Take the resv lock on the DMABUF:
+ * - If racing cleanup got in first, vdev == NULL and buffer
+ * is revoked; stop/exit if so.
+ * - If we got in first, vdev is non-NULL, accessible, and
+ * cleanup _has not yet put the VFIO device registration_,
+ * so the device refcount must be >0.
+ *
+ * 2. Take vfio_device registration (refcount guaranteed >0
+ * hereafter).
+ *
+ * 3. Unlock the DMABUF's resv lock:
+ * - A racing cleanup can now complete.
+ * - But, the device refcount >0, meaning the vfio_device
+ * (and vfio_pci_core device vdev) have not yet been
+ * freed. vdev is accessible, even if the DMABUF has been
+ * revoked or cleanup has happened, because
+ * vfio_unregister_group_dev() can't complete.
+ *
+ * 4. Take the vdev->memory_lock
+ * - Either the DMABUF is usable, or has been cleaned up.
+ * Whichever, it can no longer change under us.
+ * - Test the DMABUF revocation status again: if it was
+ * revoked between 1 and 4 return a SIGBUS. Otherwise,
+ * return a PFN.
+ * - It's not necessary to also take the resv lock, because
+ * the status/vdev can't change while memory_lock is held.
+ *
+ * 5. Unlock, done.
*/
+
+ dma_resv_lock(priv->dmabuf->resv, NULL);
vdev = READ_ONCE(priv->vdev);

+ if (READ_ONCE(priv->revoked) || !vdev) {
+ pr_debug_ratelimited("%s VA 0x%lx, pgoff 0x%lx: DMABUF revoked/cleaned up\n",
+ __func__, vmf->address, vma->vm_pgoff);
+ dma_resv_unlock(priv->dmabuf->resv);
+ return VM_FAULT_SIGBUS;
+ }
+ /* vdev is usable */
+
+ if (!vfio_device_try_get_registration(&vdev->vdev)) {
+ /*
+ * If vdev != NULL (above), the registration should
+ * already be >0 and so this try_get should never
+ * fail.
+ */
+ dev_warn(&vdev->pdev->dev, "%s: Unexpected registration failure\n",
+ __func__);
+ dma_resv_unlock(priv->dmabuf->resv);
+ return VM_FAULT_SIGBUS;
+ }
+ dma_resv_unlock(priv->dmabuf->resv);
+
scoped_guard(rwsem_read, &vdev->memory_lock) {
- if (!priv->revoked) {
+ if (!READ_ONCE(priv->revoked)) {
int pres = vfio_pci_dma_buf_find_pfn(priv, vma,
vmf->address,
order, &pfn);
@@ -1734,6 +1792,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
vma->vm_pgoff, (unsigned int)ret);
}

+ vfio_device_put_registration(&vdev->vdev);
return ret;
}

@@ -1742,7 +1801,7 @@ static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf)
return vfio_pci_mmap_huge_fault(vmf, 0);
}

-static const struct vm_operations_struct vfio_pci_mmap_ops = {
+const struct vm_operations_struct vfio_pci_mmap_ops = {
.fault = vfio_pci_mmap_page_fault,
#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
.huge_fault = vfio_pci_mmap_huge_fault,
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index 04c7733fe712..cc477f46a7d5 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -27,6 +27,33 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,

return 0;
}
+
+static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
+{
+	struct vfio_pci_dma_buf *priv = dmabuf->priv;
+	u64 req_len, req_start;
+
+	if (priv->revoked)
+		return -ENODEV;
+	if ((vma->vm_flags & VM_SHARED) == 0)
+		return -EINVAL;
+
+	req_len = vma->vm_end - vma->vm_start;
+	req_start = (u64)vma->vm_pgoff << PAGE_SHIFT;
+	if (req_start + req_len < req_start || req_start + req_len > priv->size)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+	/* See comments in vfio_pci_core_mmap() re VM_ALLOW_ANY_UNCACHED. */
+	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+			  VM_DONTEXPAND | VM_DONTDUMP);
+	vma->vm_private_data = priv;
+	vma->vm_ops = &vfio_pci_mmap_ops;
+
+	return 0;
+}
#endif /* CONFIG_VFIO_PCI_DMABUF */

static void vfio_pci_dma_buf_done(struct kref *kref)
@@ -94,6 +121,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
#ifdef CONFIG_VFIO_PCI_DMABUF
.attach = vfio_pci_dma_buf_attach,
+ .mmap = vfio_pci_dma_buf_mmap,
#endif
.map_dma_buf = vfio_pci_dma_buf_map,
.unmap_dma_buf = vfio_pci_dma_buf_unmap,
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index a8edbee6ce56..f837d6c8bddc 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -37,6 +37,8 @@ struct vfio_pci_dma_buf {
u8 revoked : 1;
};

+extern const struct vm_operations_struct vfio_pci_mmap_ops;
+
bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);

--
2.47.3