[RFC PATCH] Optimize VFIO and IOMMU mapping traversal

From: Guanghui Feng

Date: Fri May 29 2026 - 03:14:42 EST


In VFIO, vfio_unmap_unpin() has to unmap a range from the IOMMU and
unpin the backing pages. VFIO does not record the physical address
that corresponds to each IOVA; instead it looks up the IOVA-to-physical
translation with iommu_iova_to_phys().

When conditions such as address alignment permit, the IOMMU layer
prefers to map IOVA-to-physical ranges with large pages. During
vfio_unmap_unpin(), the range can therefore be traversed at the
granularity of the actual IOMMU mappings rather than PAGE_SIZE,
which reduces the number of iommu_iova_to_phys() queries and
significantly improves the efficiency of the traversal.

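To support this, add iommu_iova_to_pgsize(), backed by a new
iova_to_pgsize domain op that is implemented in the generic page
table code and wired into the AMD and Intel drivers. It returns the
page size used to map a given IOVA.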

Signed-off-by: Guanghui Feng <guanghuifeng@xxxxxxxxxxxxxxxxx>
Signed-off-by: Shiqiang Zhang <shiyu.zsq@xxxxxxxxxxxxxxxxx>
Signed-off-by: Simon Guo <wei.guo.simon@xxxxxxxxxxxxxxxxx>
---
drivers/iommu/amd/iommu.c | 2 ++
drivers/iommu/generic_pt/iommu_pt.h | 53 +++++++++++++++++++++++++++++
drivers/iommu/intel/iommu.c | 2 ++
drivers/iommu/iommu.c | 25 ++++++++++++++
drivers/vfio/vfio_iommu_type1.c | 17 +++++++--
include/linux/generic_pt/iommu.h | 4 +++
include/linux/iommu.h | 3 ++
7 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 57dc8fabc7d9..36ffeb96c454 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2662,6 +2662,7 @@ static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = {

static const struct iommu_domain_ops amdv1_ops = {
IOMMU_PT_DOMAIN_OPS(amdv1),
+ IOMMU_PT_PGSIZE_OPS(amdv1),
.iotlb_sync_map = amd_iommu_iotlb_sync_map,
.flush_iotlb_all = amd_iommu_flush_iotlb_all,
.iotlb_sync = amd_iommu_iotlb_sync,
@@ -2740,6 +2741,7 @@ static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev,

static const struct iommu_domain_ops amdv2_ops = {
IOMMU_PT_DOMAIN_OPS(x86_64),
+ IOMMU_PT_PGSIZE_OPS(x86_64),
.iotlb_sync_map = amd_iommu_iotlb_sync_map,
.flush_iotlb_all = amd_iommu_flush_iotlb_all,
.iotlb_sync = amd_iommu_iotlb_sync,
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index dc91fb4e2f61..de861d8b6ce2 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -199,6 +199,59 @@ phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain,
}
EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");

+static __always_inline int __do_iova_to_pgsize(struct pt_range *range,
+ void *arg, unsigned int level,
+ struct pt_table_p *table,
+ pt_level_fn_t descend_fn)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ size_t *pgsize = arg; /* caller's output, written only for an OA entry */
+
+ switch (pt_load_single_entry(&pts)) {
+ case PT_ENTRY_EMPTY: /* no entry here: fail the walk with -ENOENT */
+ return -ENOENT;
+ case PT_ENTRY_TABLE: /* keep walking through descend_fn */
+ return pt_descend(&pts, arg, descend_fn);
+ case PT_ENTRY_OA: /* report the entry's size as 2^lg2sz */
+ *pgsize = BIT(pt_entry_oa_lg2sz(&pts));
+ return 0;
+ }
+ return -ENOENT;
+}
+PT_MAKE_LEVELS(__iova_to_pgsize, __do_iova_to_pgsize);
+
+/**
+ * iova_to_pgsize() - Return the page size of the mapping at the given IOVA
+ * @domain: Domain embedded in the struct pt_iommu table to query
+ * @iova: IO virtual address to query
+ *
+ * Walk the page table for @iova and report the size of the entry that maps
+ * it, rather than a size inferred from address alignment.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Return: Page size in bytes; 0 if make_range() fails or nothing maps @iova.
+ */
+size_t DOMAIN_NS(iova_to_pgsize)(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_range range;
+ size_t pgsize; /* only read after pt_walk_range() returns 0 */
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+ if (ret)
+ return 0;
+
+ ret = pt_walk_range(&range, __iova_to_pgsize, &pgsize);
+ if (ret)
+ return 0;
+ return pgsize;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_pgsize), "GENERIC_PT_IOMMU");
+
struct pt_iommu_dirty_args {
struct iommu_dirty_bitmap *dirty;
unsigned int flags;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 4d0e65bc131d..f992162cfa67 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3890,6 +3890,7 @@ static struct iommu_domain identity_domain = {

const struct iommu_domain_ops intel_fs_paging_domain_ops = {
IOMMU_PT_DOMAIN_OPS(x86_64),
+ IOMMU_PT_PGSIZE_OPS(x86_64),
.attach_dev = intel_iommu_attach_device,
.set_dev_pasid = intel_iommu_set_dev_pasid,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
@@ -3901,6 +3902,7 @@ const struct iommu_domain_ops intel_fs_paging_domain_ops = {

const struct iommu_domain_ops intel_ss_paging_domain_ops = {
IOMMU_PT_DOMAIN_OPS(vtdss),
+ IOMMU_PT_PGSIZE_OPS(vtdss),
.attach_dev = intel_iommu_attach_device,
.set_dev_pasid = intel_iommu_set_dev_pasid,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d1a9e713d3a0..e27f26bc1851 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2557,6 +2557,31 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
}
EXPORT_SYMBOL_GPL(iommu_iova_to_phys);

+/**
+ * iommu_iova_to_pgsize() - Get the page size of the mapping at a given IOVA
+ * @domain: IOMMU domain to query
+ * @iova: IO virtual address to query
+ *
+ * Ask the domain's driver, through its iova_to_pgsize op, for the size of the
+ * page table entry that maps @iova. This reflects the real mapping
+ * granularity, not a value inferred from alignment.
+ *
+ * Return: the page size in bytes, or 0 if nothing is mapped at @iova, the
+ * domain is identity or blocked, or the driver lacks an iova_to_pgsize op.
+ */
+size_t iommu_iova_to_pgsize(struct iommu_domain *domain, dma_addr_t iova)
+{
+ if (domain->type == IOMMU_DOMAIN_IDENTITY ||
+ domain->type == IOMMU_DOMAIN_BLOCKED)
+ return 0; /* no page size is reported for these domain types */
+
+ if (!domain->ops->iova_to_pgsize)
+ return 0; /* the op is optional; 0 doubles as "unsupported" */
+
+ return domain->ops->iova_to_pgsize(domain, iova);
+}
+EXPORT_SYMBOL_GPL(iommu_iova_to_pgsize);
+
static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, size_t *count)
{
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index c8151ba54de3..bf918a93a159 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1177,7 +1177,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,

iommu_iotlb_gather_init(&iotlb_gather);
while (pos < dma->size) {
- size_t unmapped, len;
+ size_t unmapped, len, pgsize;
phys_addr_t phys, next;
dma_addr_t iova = dma->iova + pos;

@@ -1191,11 +1191,24 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
* To optimize for fewer iommu_unmap() calls, each of which
* may require hardware cache flushing, try to find the
* largest contiguous physical memory chunk to unmap.
+ *
+ * Query the actual IOMMU PTE mapping granularity at this IOVA
+ * to determine the guaranteed contiguous range. Use only the
+ * remaining portion within the current PTE from our position,
+ * in case we start from the middle of a large page mapping.
*/
- for (len = PAGE_SIZE; pos + len < dma->size; len += PAGE_SIZE) {
+ pgsize = iommu_iova_to_pgsize(domain->domain, iova);
+ if (!pgsize)
+ pgsize = PAGE_SIZE;
+ len = pgsize - (iova & (pgsize - 1));
+ for (; pos + len < dma->size; len += pgsize) {
next = iommu_iova_to_phys(domain->domain, iova + len);
if (next != phys + len)
break;
+ pgsize = iommu_iova_to_pgsize(domain->domain,
+ iova + len);
+ if (!pgsize)
+ pgsize = PAGE_SIZE;
}

/*
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index dd0edd02a48a..2f30ae73a9eb 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -251,6 +251,8 @@ struct pt_iommu_cfg {
#define IOMMU_PROTOTYPES(fmt) \
phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
dma_addr_t iova); \
+ size_t pt_iommu_##fmt##_iova_to_pgsize(struct iommu_domain *domain, \
+ dma_addr_t iova); \
int pt_iommu_##fmt##_read_and_clear_dirty( \
struct iommu_domain *domain, unsigned long iova, size_t size, \
unsigned long flags, struct iommu_dirty_bitmap *dirty); \
@@ -272,6 +274,8 @@ struct pt_iommu_cfg {
*/
#define IOMMU_PT_DOMAIN_OPS(fmt) \
.iova_to_phys = &pt_iommu_##fmt##_iova_to_phys
+#define IOMMU_PT_PGSIZE_OPS(fmt) \
+ .iova_to_pgsize = &pt_iommu_##fmt##_iova_to_pgsize
#define IOMMU_PT_DIRTY_OPS(fmt) \
.read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e587d4ac4d33..d04dc7dcfb1e 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -776,6 +776,8 @@ struct iommu_domain_ops {

phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
dma_addr_t iova);
+ size_t (*iova_to_pgsize)(struct iommu_domain *domain,
+ dma_addr_t iova);

bool (*enforce_cache_coherency)(struct iommu_domain *domain);
int (*set_pgtable_quirks)(struct iommu_domain *domain,
@@ -930,6 +932,7 @@ extern ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
struct scatterlist *sg, unsigned int nents,
int prot, gfp_t gfp);
extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova);
+extern size_t iommu_iova_to_pgsize(struct iommu_domain *domain, dma_addr_t iova);
extern void iommu_set_fault_handler(struct iommu_domain *domain,
iommu_fault_handler_t handler, void *token);

--
2.43.7