Re: [PATCH v12 10/22] vfio iommu type1: Add support for mediated devices

From: Alex Williamson
Date: Mon Nov 14 2016 - 18:25:55 EST


On Mon, 14 Nov 2016 21:12:24 +0530
Kirti Wankhede <kwankhede@xxxxxxxxxx> wrote:

> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
>
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
>
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - Domain for external user is tracked separately in vfio_iommu structure.
> It is allocated when group for first mdev device is attached.
> - Pages pinned for external domain are tracked in each vfio_dma structure
> for that iova range.
> - Page tracking rb-tree in vfio_dma keeps <iova, pfn, ref_count>. Key of
> rb-tree is iova, but it actually aims to track pfns.
> - On external pin request for an iova, page is pinned only once, if iova
> is already pinned and tracked, ref_count is incremented.

This is referring only to the external (ie. pfn_list) tracking,
correct? In general, a page can be pinned up to twice per iova
referencing it, once for an iommu mapped domain and again in the
pfn_list, right?

> - External unin request unpins pages only when ref_count is 0.
^^^^ unpin

> - Pinned pages list is used to verify unpinning request and to unpin
> remaining pages while detaching the group for that device.
> - Page accounting is updated to account in its address space where the
> pages are pinned/unpinned, i.e dma->task
> - Accouting for mdev device is only done if there is no iommu capable
> domain in the container. When there is a direct device assigned to the
> container and that domain is iommu capable, all pages are already pinned
> during DMA_MAP.
> - Page accouting is updated on hot plug and unplug mdev device and pass
> through device.
>
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug vGPU device while GPU pass through device
> exist
> - Linux VM hot plug and unplug GPU pass through device while vGPU device
> exist
>
> Signed-off-by: Kirti Wankhede <kwankhede@xxxxxxxxxx>
> Signed-off-by: Neo Jia <cjia@xxxxxxxxxx>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
> drivers/vfio/vfio_iommu_type1.c | 587 +++++++++++++++++++++++++++++++++++-----
> 1 file changed, 526 insertions(+), 61 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 50aca95cf61e..2697d874dd35 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -37,6 +37,7 @@
> #include <linux/vfio.h>
> #include <linux/workqueue.h>
> #include <linux/pid_namespace.h>
> +#include <linux/mdev.h>
>
> #define DRIVER_VERSION "0.2"
> #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@xxxxxxxxxx>"
> @@ -56,6 +57,7 @@ MODULE_PARM_DESC(disable_hugepages,
>
> struct vfio_iommu {
> struct list_head domain_list;
> + struct vfio_domain *external_domain; /* domain for external user */
> struct mutex lock;
> struct rb_root dma_list;
> bool v2;
> @@ -76,7 +78,9 @@ struct vfio_dma {
> unsigned long vaddr; /* Process virtual addr */
> size_t size; /* Map size (bytes) */
> int prot; /* IOMMU_READ/WRITE */
> + bool iommu_mapped;
> struct task_struct *task;
> + struct rb_root pfn_list; /* Ex-user pinned pfn list */
> };
>
> struct vfio_group {
> @@ -85,6 +89,21 @@ struct vfio_group {
> };
>
> /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> + struct rb_node node;
> + dma_addr_t iova; /* Device address */
> + unsigned long pfn; /* Host pfn */
> + atomic_t ref_count;
> +};
> +
> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
> + (!list_empty(&iommu->domain_list))
> +
> +static int put_pfn(unsigned long pfn, int prot);
> +
> +/*
> * This code handles mapping and unmapping of user data buffers
> * into DMA'ble space using the IOMMU
> */
> @@ -132,6 +151,97 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> rb_erase(&old->node, &iommu->dma_list);
> }
>
> +/*
> + * Helper Functions for host iova-pfn list
> + */
> +static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
> +{
> + struct vfio_pfn *vpfn;
> + struct rb_node *node = dma->pfn_list.rb_node;
> +
> + while (node) {
> + vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> + if (iova < vpfn->iova)
> + node = node->rb_left;
> + else if (iova > vpfn->iova)
> + node = node->rb_right;
> + else
> + return vpfn;
> + }
> + return NULL;
> +}
> +
> +static void vfio_link_pfn(struct vfio_dma *dma,
> + struct vfio_pfn *new)
> +{
> + struct rb_node **link, *parent = NULL;
> + struct vfio_pfn *vpfn;
> +
> + link = &dma->pfn_list.rb_node;
> + while (*link) {
> + parent = *link;
> + vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> + if (new->iova < vpfn->iova)
> + link = &(*link)->rb_left;
> + else
> + link = &(*link)->rb_right;
> + }
> +
> + rb_link_node(&new->node, parent, link);
> + rb_insert_color(&new->node, &dma->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
> +{
> + rb_erase(&old->node, &dma->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
> + unsigned long pfn)
> +{
> + struct vfio_pfn *vpfn;
> +
> + vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> + if (!vpfn)
> + return -ENOMEM;
> +
> + vpfn->iova = iova;
> + vpfn->pfn = pfn;
> + atomic_set(&vpfn->ref_count, 1);
> + vfio_link_pfn(dma, vpfn);
> + return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
> + struct vfio_pfn *vpfn)
> +{
> + vfio_unlink_pfn(dma, vpfn);
> + kfree(vpfn);
> +}
> +
> +static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
> + unsigned long iova)
> +{
> + struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
> +
> + if (vpfn)
> + atomic_inc(&vpfn->ref_count);
> + return vpfn;
> +}
> +
> +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> +{
> + int ret = 0;
> +
> + if (atomic_dec_and_test(&vpfn->ref_count)) {
> + ret = put_pfn(vpfn->pfn, dma->prot);
> + vfio_remove_from_pfn_list(dma, vpfn);
> + }
> + return ret;
> +}
> +
> struct vwork {
> struct mm_struct *mm;
> long npage;
> @@ -270,7 +380,6 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> }
>
> up_read(&mm->mmap_sem);
> -
> return ret;
> }
>
> @@ -280,28 +389,35 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> * first page and all consecutive pages with the same locking.
> */
> static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> - long npage, int prot, unsigned long *pfn_base)
> + long npage, unsigned long *pfn_base)
> {
> unsigned long limit;
> bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
> CAP_IPC_LOCK);
> struct mm_struct *mm;
> - long ret, i;
> + long ret, i, lock_acct = 0;
> bool rsvd;
> + struct vfio_pfn *vpfn;
> + dma_addr_t iova;
>
> mm = get_task_mm(dma->task);
> if (!mm)
> return -ENODEV;
>
> - ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
> + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
> if (ret)
> goto pin_pg_remote_exit;
>
> + iova = vaddr - dma->vaddr + dma->iova;
> + vpfn = vfio_find_vpfn(dma, iova);
> + if (!vpfn)
> + lock_acct = 1;
> +
> rsvd = is_invalid_reserved_pfn(*pfn_base);
> limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>
> - if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> - put_pfn(*pfn_base, prot);
> + if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
> + put_pfn(*pfn_base, dma->prot);
> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> limit << PAGE_SHIFT);
> ret = -ENOMEM;
> @@ -310,7 +426,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>
> if (unlikely(disable_hugepages)) {
> if (!rsvd)
> - vfio_lock_acct(dma->task, 1);
> + vfio_lock_acct(dma->task, lock_acct);
> ret = 1;
> goto pin_pg_remote_exit;
> }
> @@ -319,18 +435,24 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
> unsigned long pfn = 0;
>
> - ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
> + ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn);
> if (ret)
> break;
>
> if (pfn != *pfn_base + i ||
> rsvd != is_invalid_reserved_pfn(pfn)) {
> - put_pfn(pfn, prot);
> + put_pfn(pfn, dma->prot);
> break;
> }
>
> - if (!rsvd && !lock_cap && mm->locked_vm + i + 1 > limit) {
> - put_pfn(pfn, prot);
> + iova = vaddr - dma->vaddr + dma->iova;


nit, this could be incremented in the for loop just like vaddr; we
don't need to recompute it from scratch each iteration (iova += PAGE_SIZE).
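
Roughly (untested, just to illustrate), since iova is already computed
from vaddr before the loop:

	for (i = 1, vaddr += PAGE_SIZE, iova += PAGE_SIZE; i < npage;
	     i++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
		...
	}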


> + vpfn = vfio_find_vpfn(dma, iova);
> + if (!vpfn)
> + lock_acct++;
> +
> + if (!rsvd && !lock_cap &&
> + mm->locked_vm + lock_acct + 1 > limit) {


lock_acct was incremented just above, so why is this lock_acct + 1? I
think we're off by one here (ie. remove the +1).
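
ie. (assuming lock_acct already counts the current page when it's not
in the pfn_list) just:

	if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {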

> + put_pfn(pfn, dma->prot);
> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> __func__, limit << PAGE_SHIFT);
> break;
> @@ -338,7 +460,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> }
>
> if (!rsvd)
> - vfio_lock_acct(dma->task, i);
> + vfio_lock_acct(dma->task, lock_acct);
> ret = i;
>
> pin_pg_remote_exit:
> @@ -346,14 +468,78 @@ pin_pg_remote_exit:
> return ret;
> }
>
> -static long vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
> - long npage, int prot, bool do_accounting)
> +static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
> + unsigned long pfn, long npage,
> + bool do_accounting)
> {
> - unsigned long unlocked = 0;
> + long unlocked = 0, locked = 0;
> long i;
>
> - for (i = 0; i < npage; i++)
> - unlocked += put_pfn(pfn++, prot);
> + for (i = 0; i < npage; i++) {
> + struct vfio_pfn *vpfn;
> +
> + unlocked += put_pfn(pfn++, dma->prot);
> +
> + vpfn = vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT));
> + if (vpfn)
> + locked++;

This isn't taking into account reserved pages (ex. device mmaps). In
the pinning path above we skip accounting of reserved pages, and put_pfn
also only increments for non-reserved pages, but vfio_find_vpfn()
doesn't care, so it's possible that (locked - unlocked) could result in
a positive value. Maybe something like:

	if (put_pfn(...)) {
		unlocked++;
		if (vfio_find_vpfn(...))
			locked++;
	}

> + }
> +
> + if (do_accounting)
> + vfio_lock_acct(dma->task, locked - unlocked);
> +
> + return unlocked;
> +}
> +
> +static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> + unsigned long *pfn_base, bool do_accounting)
> +{
> + unsigned long limit;
> + bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
> + CAP_IPC_LOCK);
> + struct mm_struct *mm;
> + int ret;
> + bool rsvd;
> +
> + mm = get_task_mm(dma->task);
> + if (!mm)
> + return -ENODEV;
> +
> + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
> + if (ret)
> + goto pin_page_exit;
> +
> + rsvd = is_invalid_reserved_pfn(*pfn_base);
> + limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> + if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> + put_pfn(*pfn_base, dma->prot);
> + pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
> + __func__, dma->task->comm, task_pid_nr(dma->task),
> + limit << PAGE_SHIFT);
> + ret = -ENOMEM;
> + goto pin_page_exit;
> + }
> +
> + if (!rsvd && do_accounting)
> + vfio_lock_acct(dma->task, 1);
> + ret = 1;
> +
> +pin_page_exit:
> + mmput(mm);
> + return ret;
> +}
> +
> +static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> + bool do_accounting)
> +{
> + int unlocked;
> + struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
> +
> + if (!vpfn)
> + return 0;
> +
> + unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
>
> if (do_accounting)
> vfio_lock_acct(dma->task, -unlocked);
> @@ -361,14 +547,148 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
> return unlocked;
> }
>
> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +static int vfio_iommu_type1_pin_pages(void *iommu_data,
> + unsigned long *user_pfn,
> + int npage, int prot,
> + unsigned long *phys_pfn)
> +{
> + struct vfio_iommu *iommu = iommu_data;
> + int i, j, ret;
> + unsigned long remote_vaddr;
> + unsigned long *pfn = phys_pfn;

nit, why do we have this variable rather than using phys_pfn directly?
Maybe before we had unwind support we incremented this rather than
indexing it?
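
For instance (untested), we could just index the array directly:

	phys_pfn[i] = vpfn->pfn;
	...
	ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
				     do_accounting);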

> + struct vfio_dma *dma;
> + bool do_accounting;
> +
> + if (!iommu || !user_pfn || !phys_pfn)
> + return -EINVAL;
> +
> + /* Supported for v2 version only */
> + if (!iommu->v2)
> + return -EACCES;
> +
> + mutex_lock(&iommu->lock);
> +
> + if (!iommu->external_domain) {
> + ret = -EINVAL;
> + goto pin_done;
> + }
> +
> + /*
> + * If iommu capable domain exist in the container then all pages are
> + * already pinned and accounted. Accouting should be done if there is no
> + * iommu capable domain in the container.
> + */
> + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> +
> + for (i = 0; i < npage; i++) {
> + dma_addr_t iova;
> + struct vfio_pfn *vpfn;
> +
> + iova = user_pfn[i] << PAGE_SHIFT;
> + dma = vfio_find_dma(iommu, iova, 0);
> + if (!dma) {
> + ret = -EINVAL;
> + goto pin_unwind;
> + }
> +
> + if ((dma->prot & prot) != prot) {
> + pr_info("%s: dma->prot: 0x%x prot: 0x%x\n",
> + __func__, dma->prot, prot);
> + ret = -EPERM;
> + goto pin_unwind;
> + }
> +
> + vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> + if (vpfn) {
> + pfn[i] = vpfn->pfn;
> + continue;
> + }
> +
> + remote_vaddr = dma->vaddr + iova - dma->iova;
> + ret = vfio_pin_page_external(dma, remote_vaddr, &pfn[i],
> + do_accounting);
> + if (ret <= 0) {
> + WARN_ON(!ret);
> + goto pin_unwind;
> + }
> +
> + ret = vfio_add_to_pfn_list(dma, iova, pfn[i]);
> + if (ret) {
> + vfio_unpin_page_external(dma, iova, do_accounting);
> + goto pin_unwind;
> + }
> + }
> +
> + ret = i;
> + goto pin_done;
> +
> +pin_unwind:
> + pfn[i] = 0;
> + for (j = 0; j < i; j++) {
> + dma_addr_t iova;
> +
> + iova = user_pfn[j] << PAGE_SHIFT;
> + dma = vfio_find_dma(iommu, iova, 0);
> + vfio_unpin_page_external(dma, iova, do_accounting);
> + pfn[j] = 0;
> + }
> +pin_done:
> + mutex_unlock(&iommu->lock);
> + return ret;
> +}
> +
> +static int vfio_iommu_type1_unpin_pages(void *iommu_data,
> + unsigned long *user_pfn,
> + int npage)
> +{
> + struct vfio_iommu *iommu = iommu_data;
> + bool do_accounting;
> + int unlocked = 0, i;
> +
> + if (!iommu || !user_pfn)
> + return -EINVAL;
> +
> + /* Supported for v2 version only */
> + if (!iommu->v2)
> + return -EACCES;
> +
> + mutex_lock(&iommu->lock);
> +
> + if (!iommu->external_domain) {
> + mutex_unlock(&iommu->lock);
> + return -EINVAL;
> + }
> +
> + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> + for (i = 0; i < npage; i++) {
> + struct vfio_dma *dma;
> + dma_addr_t iova;
> +
> + iova = user_pfn[i] << PAGE_SHIFT;
> + dma = vfio_find_dma(iommu, iova, 0);
> + if (!dma)
> + goto unpin_exit;
> + unlocked += vfio_unpin_page_external(dma, iova, do_accounting);
> + }
> +
> +unpin_exit:
> + mutex_unlock(&iommu->lock);
> + return unlocked;

This is not returning what it's supposed to. unlocked is going to be
less than or equal to the number of pages unpinned. We don't need to
track unlocked; I think we're just tracking where we are in the unpin
array, ie. whether the unpin was partial or complete. I think we want:

return i > npage ? npage : i;

Or maybe we can make it more obvious if there's an error on the first
unpin entry:

return i > npage ? npage : (i > 0 ? i : -EINVAL);

> +}
> +
> +static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> + bool do_accounting)
> {
> dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
> struct vfio_domain *domain, *d;
> long unlocked = 0;
>
> if (!dma->size)
> - return;
> + return 0;
> +
> + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> + return 0;
> +
> /*
> * We use the IOMMU to track the physical addresses, otherwise we'd
> * need a much more complicated tracking system. Unfortunately that
> @@ -410,20 +730,26 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> if (WARN_ON(!unmapped))
> break;
>
> - unlocked += vfio_unpin_pages_remote(dma, phys >> PAGE_SHIFT,
> + unlocked += vfio_unpin_pages_remote(dma, iova,
> + phys >> PAGE_SHIFT,
> unmapped >> PAGE_SHIFT,
> - dma->prot, false);
> + false);
> iova += unmapped;
>
> cond_resched();
> }
>
> - vfio_lock_acct(dma->task, -unlocked);
> + dma->iommu_mapped = false;
> + if (do_accounting) {
> + vfio_lock_acct(dma->task, -unlocked);
> + return 0;
> + }
> + return unlocked;
> }
>
> static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> {
> - vfio_unmap_unpin(iommu, dma);
> + vfio_unmap_unpin(iommu, dma, true);
> vfio_unlink_dma(iommu, dma);
> put_task_struct(dma->task);
> kfree(dma);
> @@ -606,8 +932,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> while (size) {
> /* Pin a contiguous chunk of memory */
> npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
> - size >> PAGE_SHIFT, dma->prot,
> - &pfn);
> + size >> PAGE_SHIFT, &pfn);
> if (npage <= 0) {
> WARN_ON(!npage);
> ret = (int)npage;
> @@ -618,8 +943,8 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
> dma->prot);
> if (ret) {
> - vfio_unpin_pages_remote(dma, pfn, npage,
> - dma->prot, true);
> + vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
> + npage, true);
> break;
> }
>
> @@ -627,6 +952,8 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> dma->size += npage << PAGE_SHIFT;
> }
>
> + dma->iommu_mapped = true;
> +
> if (ret)
> vfio_remove_dma(iommu, dma);
>
> @@ -682,11 +1009,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> dma->prot = prot;
> get_task_struct(current);
> dma->task = current;
> + dma->pfn_list = RB_ROOT;
>
> /* Insert zero-sized and grow as we map chunks of it */
> vfio_link_dma(iommu, dma);
>
> - ret = vfio_pin_map_dma(iommu, dma, size);
> + /* Don't pin and map if container doesn't contain IOMMU capable domain*/
> + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> + dma->size = size;
> + else
> + ret = vfio_pin_map_dma(iommu, dma, size);
> do_map_err:
> mutex_unlock(&iommu->lock);
> return ret;
> @@ -715,10 +1047,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
> n = rb_first(&iommu->dma_list);
>
> - /* If there's not a domain, there better not be any mappings */
> - if (WARN_ON(n && !d))
> - return -EINVAL;
> -
> for (; n; n = rb_next(n)) {
> struct vfio_dma *dma;
> dma_addr_t iova;
> @@ -727,21 +1055,49 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> iova = dma->iova;
>
> while (iova < dma->iova + dma->size) {
> - phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
> + phys_addr_t phys;
> size_t size;
>
> - if (WARN_ON(!phys)) {
> - iova += PAGE_SIZE;
> - continue;
> + if (dma->iommu_mapped) {
> + phys_addr_t p;
> + dma_addr_t i;
> +
> + phys = iommu_iova_to_phys(d->domain, iova);
> +
> + if (WARN_ON(!phys)) {
> + iova += PAGE_SIZE;
> + continue;
> + }
> +
> + size = PAGE_SIZE;
> + p = phys + size;
> + i = iova + size;
> + while (i < dma->iova + dma->size &&
> + p == iommu_iova_to_phys(d->domain, i)) {
> + size += PAGE_SIZE;
> + p += PAGE_SIZE;
> + i += PAGE_SIZE;
> + }
> + } else {
> + unsigned long pfn;
> + unsigned long vaddr = dma->vaddr +
> + (iova - dma->iova);
> + size_t n = dma->iova + dma->size - iova;
> + long npage;
> +
> + npage = vfio_pin_pages_remote(dma, vaddr,
> + n >> PAGE_SHIFT,
> + &pfn);
> + if (npage <= 0) {
> + WARN_ON(!npage);
> + ret = (int)npage;
> + return ret;
> + }
> +
> + phys = pfn << PAGE_SHIFT;
> + size = npage << PAGE_SHIFT;
> }
>
> - size = PAGE_SIZE;
> -
> - while (iova + size < dma->iova + dma->size &&
> - phys + size == iommu_iova_to_phys(d->domain,
> - iova + size))
> - size += PAGE_SIZE;
> -
> ret = iommu_map(domain->domain, iova, phys,
> size, dma->prot | domain->prot);
> if (ret)
> @@ -749,8 +1105,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>
> iova += size;
> }
> + dma->iommu_mapped = true;
> }
> -
> return 0;
> }
>
> @@ -806,7 +1162,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> struct vfio_iommu *iommu = iommu_data;
> struct vfio_group *group;
> struct vfio_domain *domain, *d;
> - struct bus_type *bus = NULL;
> + struct bus_type *bus = NULL, *mdev_bus;
> int ret;
>
> mutex_lock(&iommu->lock);
> @@ -818,6 +1174,13 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> }
> }
>
> + if (iommu->external_domain) {
> + if (find_iommu_group(iommu->external_domain, iommu_group)) {
> + mutex_unlock(&iommu->lock);
> + return -EINVAL;
> + }
> + }
> +
> group = kzalloc(sizeof(*group), GFP_KERNEL);
> domain = kzalloc(sizeof(*domain), GFP_KERNEL);
> if (!group || !domain) {
> @@ -832,6 +1195,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> if (ret)
> goto out_free;
>
> + mdev_bus = symbol_get(mdev_bus_type);
> +
> + if (mdev_bus) {
> + if ((bus == mdev_bus) && !iommu_present(bus)) {
> + symbol_put(mdev_bus_type);
> + if (!iommu->external_domain) {
> + INIT_LIST_HEAD(&domain->group_list);
> + iommu->external_domain = domain;
> + } else
> + kfree(domain);
> +
> + list_add(&group->next,
> + &iommu->external_domain->group_list);
> + mutex_unlock(&iommu->lock);
> + return 0;
> + }
> + symbol_put(mdev_bus_type);
> + }
> +
> domain->domain = iommu_domain_alloc(bus);
> if (!domain->domain) {
> ret = -EIO;
> @@ -922,6 +1304,46 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
> vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
> }
>
> +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> +{
> + struct rb_node *n, *p;
> +
> + n = rb_first(&iommu->dma_list);
> + for (; n; n = rb_next(n)) {
> + struct vfio_dma *dma;
> + long locked = 0, unlocked = 0;
> +
> + dma = rb_entry(n, struct vfio_dma, node);
> + unlocked += vfio_unmap_unpin(iommu, dma, false);
> + p = rb_first(&dma->pfn_list);
> + for (; p; p = rb_next(p))
> + locked++;

We don't know that these weren't reserved pages. If the vendor driver
was pinning peer-to-peer ranges, vfio_unmap_unpin() might have returned
0, yet we're assuming each pfn_list entry is locked RAM, so our
accounting can go upside down.
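
One option (a rough sketch, untested) would be to skip reserved pfns
when walking the pfn_list, ex:

		p = rb_first(&dma->pfn_list);
		for (; p; p = rb_next(p)) {
			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
							 node);

			if (!is_invalid_reserved_pfn(vpfn->pfn))
				locked++;
		}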

> + vfio_lock_acct(dma->task, locked - unlocked);
> + }
> +}
> +
> +static void vfio_external_unpin_all(struct vfio_iommu *iommu,
> + bool do_accounting)
> +{
> + struct rb_node *n, *p;
> +
> + n = rb_first(&iommu->dma_list);
> + for (; n; n = rb_next(n)) {
> + struct vfio_dma *dma;
> +
> + dma = rb_entry(n, struct vfio_dma, node);
> + while ((p = rb_first(&dma->pfn_list))) {
> + int unlocked;
> + struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> + node);
> +
> + unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
> + if (do_accounting)
> + vfio_lock_acct(dma->task, -unlocked);

nit, we could batch these further, only updating accounting once per
vfio_dma, or once entirely.
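
ie. something like this (untested), accumulating unlocked per vfio_dma:

		long unlocked = 0;

		while ((p = rb_first(&dma->pfn_list))) {
			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
							 node);

			unlocked += vfio_iova_put_vfio_pfn(dma, vpfn);
		}
		if (do_accounting)
			vfio_lock_acct(dma->task, -unlocked);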

> + }
> + }
> +}
> +
> static void vfio_iommu_type1_detach_group(void *iommu_data,
> struct iommu_group *iommu_group)
> {
> @@ -931,6 +1353,26 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>
> mutex_lock(&iommu->lock);
>
> + if (iommu->external_domain) {
> + domain = iommu->external_domain;
> + group = find_iommu_group(domain, iommu_group);
> + if (group) {
> + list_del(&group->next);
> + kfree(group);
> +
> + if (list_empty(&domain->group_list)) {
> + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> + vfio_external_unpin_all(iommu, true);
> + vfio_iommu_unmap_unpin_all(iommu);
> + } else
> + vfio_external_unpin_all(iommu, false);
> + kfree(domain);
> + iommu->external_domain = NULL;
> + }
> + goto detach_group_done;
> + }
> + }
> +
> list_for_each_entry(domain, &iommu->domain_list, next) {
> group = find_iommu_group(domain, iommu_group);
> if (!group)
> @@ -940,21 +1382,27 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
> list_del(&group->next);
> kfree(group);
> /*
> - * Group ownership provides privilege, if the group
> - * list is empty, the domain goes away. If it's the
> - * last domain, then all the mappings go away too.
> + * Group ownership provides privilege, if the group list is
> + * empty, the domain goes away. If it's the last domain with
> + * iommu and external domain doesn't exist, then all the
> + * mappings go away too. If it's the last domain with iommu and
> + * external domain exist, update accounting
> */
> if (list_empty(&domain->group_list)) {
> - if (list_is_singular(&iommu->domain_list))
> - vfio_iommu_unmap_unpin_all(iommu);
> + if (list_is_singular(&iommu->domain_list)) {
> + if (!iommu->external_domain)
> + vfio_iommu_unmap_unpin_all(iommu);
> + else
> + vfio_iommu_unmap_unpin_reaccount(iommu);
> + }
> iommu_domain_free(domain->domain);
> list_del(&domain->next);
> kfree(domain);
> }
> - goto done;
> + break;
> }
>
> -done:
> +detach_group_done:
> mutex_unlock(&iommu->lock);
> }
>
> @@ -986,27 +1434,42 @@ static void *vfio_iommu_type1_open(unsigned long arg)
> return iommu;
> }
>
> +static void vfio_release_domain(struct vfio_domain *domain, bool external)
> +{
> + struct vfio_group *group, *group_tmp;
> +
> + list_for_each_entry_safe(group, group_tmp,
> + &domain->group_list, next) {
> + if (!external)
> + iommu_detach_group(domain->domain, group->iommu_group);
> + list_del(&group->next);
> + kfree(group);
> + }
> +
> + if (!external)
> + iommu_domain_free(domain->domain);
> +}
> +
> static void vfio_iommu_type1_release(void *iommu_data)
> {
> struct vfio_iommu *iommu = iommu_data;
> struct vfio_domain *domain, *domain_tmp;
> - struct vfio_group *group, *group_tmp;
> +
> + if (iommu->external_domain) {
> + vfio_release_domain(iommu->external_domain, true);
> + vfio_external_unpin_all(iommu, false);
> + kfree(iommu->external_domain);
> + iommu->external_domain = NULL;
> + }
>
> vfio_iommu_unmap_unpin_all(iommu);
>
> list_for_each_entry_safe(domain, domain_tmp,
> &iommu->domain_list, next) {
> - list_for_each_entry_safe(group, group_tmp,
> - &domain->group_list, next) {
> - iommu_detach_group(domain->domain, group->iommu_group);
> - list_del(&group->next);
> - kfree(group);
> - }
> - iommu_domain_free(domain->domain);
> + vfio_release_domain(domain, false);
> list_del(&domain->next);
> kfree(domain);
> }
> -
> kfree(iommu);
> }
>
> @@ -1110,6 +1573,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
> .ioctl = vfio_iommu_type1_ioctl,
> .attach_group = vfio_iommu_type1_attach_group,
> .detach_group = vfio_iommu_type1_detach_group,
> + .pin_pages = vfio_iommu_type1_pin_pages,
> + .unpin_pages = vfio_iommu_type1_unpin_pages,
> };
>
> static int __init vfio_iommu_type1_init(void)