[PATCH v12 25/84] KVM: Provide refcounted page as output field in struct kvm_follow_pfn

From: Sean Christopherson
Date: Fri Jul 26 2024 - 20:01:54 EST


Add kvm_follow_pfn.refcounted_page as an output for the "to pfn" APIs to
"return" the struct page that is associated with the returned pfn (if KVM
acquired a reference to the page). This will eventually allow removing
KVM's hacky kvm_pfn_to_refcounted_page() code, which is error prone and
can't detect pfns that are valid, but aren't (currently) refcounted.

Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
---
 virt/kvm/kvm_main.c | 100 +++++++++++++++++++++-----------------------
 virt/kvm/kvm_mm.h   |   9 ++++
 2 files changed, 56 insertions(+), 53 deletions(-)
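
For context, callers are expected to consume the new output roughly as
in the sketch below. This is illustrative only, not code from this
series: the wrapper name is made up, but the kvm_follow_pfn fields
(hva, flags, map_writable, refcounted_page) and hva_to_pfn() match the
definitions in kvm_mm.h.

static kvm_pfn_t example_hva_to_pfn(unsigned long hva, bool *writable,
				    struct page **refcounted_page)
{
	struct kvm_follow_pfn kfp = {
		.hva = hva,
		.flags = FOLL_WRITE,
		.map_writable = writable,
		.refcounted_page = refcounted_page,
	};

	/*
	 * Pre-NULL the output; error paths may return without ever
	 * writing it.
	 */
	*refcounted_page = NULL;

	return hva_to_pfn(&kfp);
}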

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8e83d3f043f1..31570c5627e3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2756,6 +2756,46 @@ static inline int check_user_page_hwpoison(unsigned long addr)
 	return rc == -EHWPOISON;
 }
 
+static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
+				 pte_t *pte, bool writable)
+{
+	kvm_pfn_t pfn;
+
+	WARN_ON_ONCE(!!page == !!pte);
+
+	if (kfp->map_writable)
+		*kfp->map_writable = writable;
+
+	/*
+	 * FIXME: Remove this once KVM no longer blindly calls put_page() on
+	 * every pfn that points at a struct page.
+	 *
+	 * Get a reference for follow_pte() pfns if they happen to point at a
+	 * struct page, as KVM will ultimately call kvm_release_pfn_clean() on
+	 * the returned pfn, i.e. KVM expects to have a reference.
+	 *
+	 * Certain IO or PFNMAP mappings can be backed with valid struct pages,
+	 * but be allocated without refcounting, e.g. tail pages of
+	 * non-compound higher order allocations. Grabbing and putting a
+	 * reference to such pages would cause KVM to prematurely free a page
+	 * it doesn't own (KVM gets and puts the one and only reference).
+	 * Don't allow those pages until the FIXME is resolved.
+	 */
+	if (pte) {
+		pfn = pte_pfn(*pte);
+		page = kvm_pfn_to_refcounted_page(pfn);
+		if (page && !get_page_unless_zero(page))
+			return KVM_PFN_ERR_FAULT;
+	} else {
+		pfn = page_to_pfn(page);
+	}
+
+	if (kfp->refcounted_page)
+		*kfp->refcounted_page = page;
+
+	return pfn;
+}
+
 /*
  * The fast path to get the writable pfn which will be stored in @pfn,
  * true indicates success, otherwise false is returned. It's also the
@@ -2774,9 +2814,7 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
 		return false;
 
 	if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) {
-		*pfn = page_to_pfn(page);
-		if (kfp->map_writable)
-			*kfp->map_writable = true;
+		*pfn = kvm_resolve_pfn(kfp, page, NULL, true);
 		return true;
 	}
 
@@ -2808,23 +2846,15 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
 	if (npages != 1)
 		return npages;
 
-	if (!kfp->map_writable)
-		goto out;
-
-	if (kfp->flags & FOLL_WRITE) {
-		*kfp->map_writable = true;
-		goto out;
-	}
-
 	/* map read fault as writable if possible */
-	if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) {
-		*kfp->map_writable = true;
+	if (!(flags & FOLL_WRITE) && kfp->map_writable &&
+	    get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) {
 		put_page(page);
 		page = wpage;
+		flags |= FOLL_WRITE;
 	}
 
-out:
-	*pfn = page_to_pfn(page);
+	*pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE);
 	return npages;
 }
 
@@ -2839,20 +2869,9 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
 	return true;
 }
 
-static int kvm_try_get_pfn(kvm_pfn_t pfn)
-{
-	struct page *page = kvm_pfn_to_refcounted_page(pfn);
-
-	if (!page)
-		return 1;
-
-	return get_page_unless_zero(page);
-}
-
 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 			       struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn)
 {
-	kvm_pfn_t pfn;
 	pte_t *ptep;
 	pte_t pte;
 	spinlock_t *ptl;
@@ -2882,38 +2901,13 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 	pte = ptep_get(ptep);
 
 	if (write_fault && !pte_write(pte)) {
-		pfn = KVM_PFN_ERR_RO_FAULT;
+		*p_pfn = KVM_PFN_ERR_RO_FAULT;
 		goto out;
 	}
 
-	if (kfp->map_writable)
-		*kfp->map_writable = pte_write(pte);
-	pfn = pte_pfn(pte);
-
-	/*
-	 * Get a reference here because callers of *hva_to_pfn* and
-	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
-	 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
-	 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
-	 * simply do nothing for reserved pfns.
-	 *
-	 * Whoever called remap_pfn_range is also going to call e.g.
-	 * unmap_mapping_range before the underlying pages are freed,
-	 * causing a call to our MMU notifier.
-	 *
-	 * Certain IO or PFNMAP mappings can be backed with valid
-	 * struct pages, but be allocated without refcounting e.g.,
-	 * tail pages of non-compound higher order allocations, which
-	 * would then underflow the refcount when the caller does the
-	 * required put_page. Don't allow those pages here.
-	 */
-	if (!kvm_try_get_pfn(pfn))
-		r = -EFAULT;
-
+	*p_pfn = kvm_resolve_pfn(kfp, NULL, &pte, pte_write(pte));
 out:
 	pte_unmap_unlock(ptep, ptl);
-	*p_pfn = pfn;
-
 	return r;
 }

diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index d5a215958f06..d3ac1ba8ba66 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -35,6 +35,15 @@ struct kvm_follow_pfn {
 	 * Set to true if a writable mapping was obtained.
 	 */
 	bool *map_writable;
+
+	/*
+	 * Optional output. Set to a valid "struct page" if the returned pfn
+	 * is for a refcounted or pinned struct page, NULL if the returned pfn
+	 * has no struct page or if the struct page is not being refcounted
+	 * (e.g. tail pages of non-compound higher order allocations from
+	 * IO/PFNMAP mappings).
+	 */
+	struct page **refcounted_page;
 };
 
 kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp);
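
And the eventual payoff on the release side, again as a hypothetical
sketch (is_error_noslot_pfn() and kvm_release_page_clean() are existing
helpers; the surrounding code, including the wrapper from the earlier
sketch, is illustrative):

	struct page *refcounted_page = NULL;
	bool writable;
	kvm_pfn_t pfn;

	pfn = example_hva_to_pfn(hva, &writable, &refcounted_page);
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	/* ... consume the mapping ... */

	/*
	 * Drop the reference KVM acquired via the explicit output,
	 * instead of reverse-engineering it with
	 * kvm_pfn_to_refcounted_page().
	 */
	if (refcounted_page)
		kvm_release_page_clean(refcounted_page);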
--
2.46.0.rc1.232.g9752f9e123-goog