[PATCH 7/9] vfio/iommufd: use iova_to_phys_length for efficient unmap

From: Guanghui Feng

Date: Sun May 31 2026 - 05:42:01 EST

Use iommu_iova_to_phys_length() to obtain the PTE page size together with
the physical address, so that the iommufd and VFIO type1 IOVA walks can
advance by the actual mapping granularity instead of in PAGE_SIZE steps.

Signed-off-by: Guanghui Feng <guanghuifeng@xxxxxxxxxxxxxxxxx>
Acked-by: Shiqiang Zhang <shiyu.zsq@xxxxxxxxxxxxxxxxx>
Acked-by: Simon Guo <wei.guo.simon@xxxxxxxxxxxxxxxxx>
---
drivers/iommu/iommufd/pages.c | 71 ++++++++++++++++++++++++++------
drivers/iommu/iommufd/selftest.c | 2 +-
drivers/vfio/vfio_iommu_type1.c | 24 +++++++++--
3 files changed, 80 insertions(+), 17 deletions(-)

diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 9bdb2945afe1..d67e564035b4 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -417,17 +417,42 @@ static void batch_from_domain(struct pfn_batch *batch,
if (start_index == iopt_area_index(area))
page_offset = area->page_offset;
while (start_index <= last_index) {
+ size_t pgsize;
+ unsigned long npages;
+ unsigned long i;
+
/*
- * This is pretty slow, it would be nice to get the page size
- * back from the driver, or have the driver directly fill the
- * batch.
+ * Use iova_to_phys_length to get both the physical address
+ * and the PTE page size in a single page table walk, allowing
+ * us to skip ahead by the contiguous region size instead of
+ * walking the page tables for every PAGE_SIZE step.
*/
- phys = iommu_iova_to_phys(domain, iova) - page_offset;
- if (!batch_add_pfn(batch, PHYS_PFN(phys)))
- return;
- iova += PAGE_SIZE - page_offset;
+ phys = iommu_iova_to_phys_length(domain, iova, &pgsize) -
+ page_offset;
+ if (!pgsize || pgsize < PAGE_SIZE)
+ pgsize = PAGE_SIZE;
+
+ /*
+ * Calculate contiguous pages within this PTE from our
+ * position. phys points to the page-aligned start (backed
+ * up by page_offset), so pages available = bytes from phys
+ * to PTE end divided by PAGE_SIZE.
+ */
+ npages = (pgsize - (iova & (pgsize - 1)) + page_offset) /
+ PAGE_SIZE;
+ npages = min_t(unsigned long, npages,
+ last_index - start_index + 1);
+ if (!npages)
+ npages = 1;
+
+ for (i = 0; i < npages; i++) {
+ if (!batch_add_pfn(batch, PHYS_PFN(phys) + i))
+ return;
+ }
+
+ iova += npages * PAGE_SIZE - page_offset;
page_offset = 0;
- start_index++;
+ start_index += npages;
}
}

@@ -445,11 +470,33 @@ static struct page **raw_pages_from_domain(struct iommu_domain *domain,
if (start_index == iopt_area_index(area))
page_offset = area->page_offset;
while (start_index <= last_index) {
- phys = iommu_iova_to_phys(domain, iova) - page_offset;
- *(out_pages++) = pfn_to_page(PHYS_PFN(phys));
- iova += PAGE_SIZE - page_offset;
+ size_t pgsize;
+ unsigned long npages;
+ unsigned long i;
+
+ /*
+ * Resolve the PTE page size together with the physical
+ * address so we can fill multiple struct page pointers per
+ * page table walk when the IOMMU uses large pages.
+ */
+ phys = iommu_iova_to_phys_length(domain, iova, &pgsize) -
+ page_offset;
+ if (!pgsize || pgsize < PAGE_SIZE)
+ pgsize = PAGE_SIZE;
+
+ npages = (pgsize - (iova & (pgsize - 1)) + page_offset) /
+ PAGE_SIZE;
+ npages = min_t(unsigned long, npages,
+ last_index - start_index + 1);
+ if (!npages)
+ npages = 1;
+
+ for (i = 0; i < npages; i++)
+ *(out_pages++) = pfn_to_page(PHYS_PFN(phys) + i);
+
+ iova += npages * PAGE_SIZE - page_offset;
page_offset = 0;
- start_index++;
+ start_index += npages;
}
return out_pages;
}
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index af07c642a526..4b9c3ffc9523 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -1214,7 +1214,7 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
pfn = page_to_pfn(pages[0]);
put_page(pages[0]);

- io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova);
+ io_phys = iommu_iova_to_phys(&mock->domain, iova);
if (io_phys !=
pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
rc = -EINVAL;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index c8151ba54de3..393f9e8f1511 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1177,25 +1177,41 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,

iommu_iotlb_gather_init(&iotlb_gather);
while (pos < dma->size) {
- size_t unmapped, len;
+ size_t unmapped, len, pgsize;
phys_addr_t phys, next;
dma_addr_t iova = dma->iova + pos;

- phys = iommu_iova_to_phys(domain->domain, iova);
+ /* Single page table walk returns both phys and PTE size */
+ phys = iommu_iova_to_phys_length(domain->domain, iova,
+ &pgsize);
if (WARN_ON(!phys)) {
pos += PAGE_SIZE;
continue;
}
+ if (!pgsize || pgsize < PAGE_SIZE)
+ pgsize = PAGE_SIZE;

/*
* To optimize for fewer iommu_unmap() calls, each of which
* may require hardware cache flushing, try to find the
* largest contiguous physical memory chunk to unmap.
+ *
+ * Calculate remaining contiguous bytes within this PTE from
+ * our position, then try to join following physically
+ * contiguous PTEs.
*/
- for (len = PAGE_SIZE; pos + len < dma->size; len += PAGE_SIZE) {
- next = iommu_iova_to_phys(domain->domain, iova + len);
+ len = pgsize - (iova & (pgsize - 1));
+ for (; pos + len < dma->size; ) {
+ size_t next_pgsize;
+
+ next = iommu_iova_to_phys_length(domain->domain,
+ iova + len,
+ &next_pgsize);
if (next != phys + len)
break;
+ if (!next_pgsize || next_pgsize < PAGE_SIZE)
+ next_pgsize = PAGE_SIZE;
+ len += next_pgsize;
}

/*
--
2.43.7