[PATCH 3/5] vfio/type1: Use vfio_batch for vaddr_get_pfns()

From: Alex Williamson
Date: Wed Feb 05 2025 - 18:19:20 EST


Passing the vfio_batch to vaddr_get_pfns() allows for a clearer
distinction between page-backed pfns and pfnmaps. In the case of
page-backed pfns, vfio_batch.size is set to a positive value matching
the number of pages filled in vfio_batch.pages. For a pfnmap,
vfio_batch.size remains zero as vfio_batch.pages is not used. In both
cases the return value continues to indicate the number of pfns and the
provided pfn arg is set to the initial pfn value.
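
As an illustration only (not part of the patch), a caller of the
updated helper can tell the two cases apart roughly as follows; error
handling is trimmed and the surrounding locals are assumed:

	ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot, &pfn, batch);
	if (ret > 0 && batch->size) {
		/* page backed: ret pages in batch->pages[], offset 0 */
	} else if (ret > 0) {
		/* pfnmap: no struct pages, ret contiguous pfns from pfn */
	}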

This allows us to shortcut the pfnmap case, which is detected by the
zero vfio_batch.size. Pfnmaps do not contribute to locked memory
accounting, so we can update the counters and continue directly. This
also enables a future where vaddr_get_pfns() can return a value greater
than one for consecutive pfnmaps.
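
Paraphrasing the hunk below, the shortcut in vfio_pin_pages_remote()
amounts to:

	/* Handle pfnmap: batch->size stayed zero, so skip the batch and
	 * the locked memory accounting, advance counters and continue.
	 */
	if (!batch->size) {
		if (pfn != *pfn_base + pinned || !rsvd)
			goto out;

		pinned += ret;
		npage -= ret;
		vaddr += (PAGE_SIZE * ret);
		iova += (PAGE_SIZE * ret);
		continue;
	}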

NB. Now that we're no longer guessing whether the initial pfn is
page-backed or a pfnmap, we no longer need to special-case the
put_pfn() and batch size reset; it's safe for vfio_batch_unpin() to
handle this case.
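
For reference (paraphrased, not code from this patch), vfio_batch_unpin()
is essentially a loop bounded by batch->size, so a pfnmap, which leaves
batch->size at zero, already falls through it harmlessly:

	while (batch->size) {
		put_pfn(page_to_pfn(batch->pages[batch->offset]), dma->prot);
		batch->offset++;
		batch->size--;
	}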

Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
---
drivers/vfio/vfio_iommu_type1.c | 62 ++++++++++++++++++---------------
1 file changed, 34 insertions(+), 28 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2e95f5f4d881..939920454da7 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -555,12 +555,16 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,

 /*
  * Returns the positive number of pfns successfully obtained or a negative
- * error code.
+ * error code. The initial pfn is stored in the pfn arg. For page-backed
+ * pfns, the provided batch is also updated to indicate the filled pages and
+ * initial offset. For VM_PFNMAP pfns, only the returned number of pfns and
+ * returned initial pfn are provided; subsequent pfns are contiguous.
  */
 static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
 			  long npages, int prot, unsigned long *pfn,
-			  struct page **pages)
+			  struct vfio_batch *batch)
 {
+	long pin_pages = min_t(long, npages, batch->capacity);
 	struct vm_area_struct *vma;
 	unsigned int flags = 0;
 	int ret;
@@ -569,10 +573,12 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
 		flags |= FOLL_WRITE;

 	mmap_read_lock(mm);
-	ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
-				    pages, NULL);
+	ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM,
+				    batch->pages, NULL);
 	if (ret > 0) {
-		*pfn = page_to_pfn(pages[0]);
+		*pfn = page_to_pfn(batch->pages[0]);
+		batch->size = ret;
+		batch->offset = 0;
 		goto done;
 	} else if (!ret) {
 		ret = -EFAULT;
@@ -628,32 +634,41 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 		*pfn_base = 0;
 	}

+	if (unlikely(disable_hugepages))
+		npage = 1;
+
 	while (npage) {
 		if (!batch->size) {
 			/* Empty batch, so refill it. */
-			long req_pages = min_t(long, npage, batch->capacity);
-
-			ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
-					     &pfn, batch->pages);
+			ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot,
+					     &pfn, batch);
 			if (ret < 0)
 				goto unpin_out;

-			batch->size = ret;
-			batch->offset = 0;
-
 			if (!*pfn_base) {
 				*pfn_base = pfn;
 				rsvd = is_invalid_reserved_pfn(*pfn_base);
 			}
+
+			/* Handle pfnmap */
+			if (!batch->size) {
+				if (pfn != *pfn_base + pinned || !rsvd)
+					goto out;
+
+				pinned += ret;
+				npage -= ret;
+				vaddr += (PAGE_SIZE * ret);
+				iova += (PAGE_SIZE * ret);
+				continue;
+			}
 		}

 		/*
-		 * pfn is preset for the first iteration of this inner loop and
-		 * updated at the end to handle a VM_PFNMAP pfn. In that case,
-		 * batch->pages isn't valid (there's no struct page), so allow
-		 * batch->pages to be touched only when there's more than one
-		 * pfn to check, which guarantees the pfns are from a
-		 * !VM_PFNMAP vma.
+		 * pfn is preset for the first iteration of this inner loop due to the
+		 * fact that vaddr_get_pfns() needs to provide the initial pfn for pfnmaps.
+		 * Therefore to reduce redundancy, the next pfn is fetched at the end of
+		 * the loop. A PageReserved() page could still qualify as page backed and
+		 * rsvd here, and therefore continues to use the batch.
 		 */
 		while (true) {
 			if (pfn != *pfn_base + pinned ||
@@ -688,21 +703,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,

 			pfn = page_to_pfn(batch->pages[batch->offset]);
 		}
-
-		if (unlikely(disable_hugepages))
-			break;
 	}

 out:
 	ret = vfio_lock_acct(dma, lock_acct, false);

 unpin_out:
-	if (batch->size == 1 && !batch->offset) {
-		/* May be a VM_PFNMAP pfn, which the batch can't remember. */
-		put_pfn(pfn, dma->prot);
-		batch->size = 0;
-	}
-
 	if (ret < 0) {
 		if (pinned && !rsvd) {
 			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
@@ -750,7 +756,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,

 	vfio_batch_init_single(&batch);

-	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, batch.pages);
+	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch);
 	if (ret != 1)
 		goto out;

--
2.47.1