[PATCH v6 1/4] KVM: mmu: introduce new gfn_to_pfn_noref functions

From: David Stevens
Date: Thu Mar 30 2023 - 04:58:26 EST


From: David Stevens <stevensd@xxxxxxxxxxxx>

Introduce new gfn_to_pfn_noref functions that parallel existing
gfn_to_pfn functions. These functions can be used when the caller does
not need to maintain a reference to the returned pfn (i.e. when usage is
guarded by an mmu_notifier). The noref functions take an out parameter
that is used to return the struct page if the hva was resolved via gup.
The caller is responsible for dropping its reference to any such returned page.
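
A minimal sketch of the intended calling pattern follows. The fault-handler
shape, the example_map_gfn() name, and the mmu_invalidate_seq /
mmu_invalidate_retry() / KVM_MMU_LOCK() usage are illustrative of existing
KVM infrastructure, not part of this patch; only gfn_to_pfn_noref() and
kvm_release_pfn_noref_clean() are introduced here.

	/*
	 * Illustrative caller: the pfn is only used while the access is
	 * guarded by the MMU notifier, so no long-term reference is taken.
	 * Only the gup reference reported via @page must be dropped, which
	 * kvm_release_pfn_noref_clean() handles.
	 */
	static int example_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
	{
		struct kvm *kvm = vcpu->kvm;
		unsigned long mmu_seq = kvm->mmu_invalidate_seq;
		struct page *page;
		kvm_pfn_t pfn;
		int r = 0;

		/* Pair with the invalidation side before checking mmu_seq. */
		smp_rmb();

		pfn = gfn_to_pfn_noref(kvm, gfn, &page);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;

		KVM_MMU_LOCK(kvm);
		if (mmu_invalidate_retry(kvm, mmu_seq)) {
			/* An invalidation raced with us; have the caller retry. */
			r = -EAGAIN;
			goto out_unlock;
		}

		/* ... install @pfn into the stage-2/shadow page tables ... */

	out_unlock:
		KVM_MMU_UNLOCK(kvm);
		/* Drops only the gup reference, if one was taken. */
		kvm_release_pfn_noref_clean(pfn, page);
		return r;
	}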

Signed-off-by: David Stevens <stevensd@xxxxxxxxxxxx>
---
include/linux/kvm_host.h | 18 ++++
virt/kvm/kvm_main.c | 209 ++++++++++++++++++++++++++++-----------
virt/kvm/kvm_mm.h | 6 +-
virt/kvm/pfncache.c | 12 ++-
4 files changed, 188 insertions(+), 57 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 90edc16d37e5..146f220cc25b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1162,8 +1162,22 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool interruptible, bool *async,
bool write_fault, bool *writable, hva_t *hva);

+kvm_pfn_t gfn_to_pfn_noref(struct kvm *kvm, gfn_t gfn, struct page **page);
+kvm_pfn_t gfn_to_pfn_noref_prot(struct kvm *kvm, gfn_t gfn,
+ bool write_fault, bool *writable,
+ struct page **page);
+kvm_pfn_t gfn_to_pfn_noref_memslot(const struct kvm_memory_slot *slot,
+ gfn_t gfn, struct page **page);
+kvm_pfn_t gfn_to_pfn_noref_memslot_atomic(const struct kvm_memory_slot *slot,
+ gfn_t gfn, struct page **page);
+kvm_pfn_t __gfn_to_pfn_noref_memslot(const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool atomic, bool interruptible,
+ bool *async, bool write_fault, bool *writable,
+ hva_t *hva, struct page **page);
+
void kvm_release_pfn_clean(kvm_pfn_t pfn);
void kvm_release_pfn_dirty(kvm_pfn_t pfn);
+void kvm_release_pfn_noref_clean(kvm_pfn_t pfn, struct page *page);
void kvm_set_pfn_dirty(kvm_pfn_t pfn);
void kvm_set_pfn_accessed(kvm_pfn_t pfn);

@@ -1242,6 +1256,10 @@ struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+kvm_pfn_t kvm_vcpu_gfn_to_pfn_noref_atomic(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct page **page);
+kvm_pfn_t kvm_vcpu_gfn_to_pfn_noref(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct page **page);
int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f40b72eb0e7b..007dd984eeea 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2484,9 +2484,9 @@ static inline int check_user_page_hwpoison(unsigned long addr)
* only part that runs if we can in atomic context.
*/
static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
- bool *writable, kvm_pfn_t *pfn)
+ bool *writable, kvm_pfn_t *pfn,
+ struct page **page)
{
- struct page *page[1];

/*
* Fast pin a writable pfn only if it is a write fault request
@@ -2497,7 +2497,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
return false;

if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
- *pfn = page_to_pfn(page[0]);
+ *pfn = page_to_pfn(*page);

if (writable)
*writable = true;
@@ -2512,10 +2512,10 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
* 1 indicates success, -errno is returned if error is detected.
*/
static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
- bool interruptible, bool *writable, kvm_pfn_t *pfn)
+ bool interruptible, bool *writable, kvm_pfn_t *pfn,
+ struct page **page)
{
unsigned int flags = FOLL_HWPOISON;
- struct page *page;
int npages;

might_sleep();
@@ -2530,7 +2530,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
if (interruptible)
flags |= FOLL_INTERRUPTIBLE;

- npages = get_user_pages_unlocked(addr, 1, &page, flags);
+ npages = get_user_pages_unlocked(addr, 1, page, flags);
if (npages != 1)
return npages;

@@ -2540,11 +2540,11 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,

if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
*writable = true;
- put_page(page);
- page = wpage;
+ put_page(*page);
+ *page = wpage;
}
}
- *pfn = page_to_pfn(page);
+ *pfn = page_to_pfn(*page);
return npages;
}

@@ -2559,16 +2559,6 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
return true;
}

-static int kvm_try_get_pfn(kvm_pfn_t pfn)
-{
- struct page *page = kvm_pfn_to_refcounted_page(pfn);
-
- if (!page)
- return 1;
-
- return get_page_unless_zero(page);
-}
-
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
unsigned long addr, bool write_fault,
bool *writable, kvm_pfn_t *p_pfn)
@@ -2607,26 +2597,6 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
*writable = pte_write(*ptep);
pfn = pte_pfn(*ptep);

- /*
- * Get a reference here because callers of *hva_to_pfn* and
- * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
- * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
- * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
- * simply do nothing for reserved pfns.
- *
- * Whoever called remap_pfn_range is also going to call e.g.
- * unmap_mapping_range before the underlying pages are freed,
- * causing a call to our MMU notifier.
- *
- * Certain IO or PFNMAP mappings can be backed with valid
- * struct pages, but be allocated without refcounting e.g.,
- * tail pages of non-compound higher order allocations, which
- * would then underflow the refcount when the caller does the
- * required put_page. Don't allow those pages here.
- */
- if (!kvm_try_get_pfn(pfn))
- r = -EFAULT;
-
out:
pte_unmap_unlock(ptep, ptl);
*p_pfn = pfn;
@@ -2643,6 +2613,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
* host page is not in the memory
* @write_fault: whether we should get a writable host page
* @writable: whether it allows to map a writable host page for !@write_fault
+ * @page: outparam for the refcounted page associated with the pfn, if any
*
* The function will map a writable host page for these two cases:
* 1): @write_fault = true
@@ -2650,23 +2621,25 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
* whether the mapping is writable.
*/
kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
- bool *async, bool write_fault, bool *writable)
+ bool *async, bool write_fault, bool *writable,
+ struct page **page)
{
struct vm_area_struct *vma;
kvm_pfn_t pfn;
int npages, r;
+ *page = NULL;

/* we can do it either atomically or asynchronously, not both */
BUG_ON(atomic && async);

- if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
+ if (hva_to_pfn_fast(addr, write_fault, writable, &pfn, page))
return pfn;

if (atomic)
return KVM_PFN_ERR_FAULT;

npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
- writable, &pfn);
+ writable, &pfn, page);
if (npages == 1)
return pfn;
if (npages == -EINTR)
@@ -2700,9 +2673,37 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
return pfn;
}

-kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
- bool atomic, bool interruptible, bool *async,
- bool write_fault, bool *writable, hva_t *hva)
+/*
+ * Helper function for managing the refcount of a pfn returned by hva_to_pfn.
+ * @pfn: pfn returned by hva_to_pfn
+ * @page: page outparam from hva_to_pfn
+ *
+ * When access to the pfn resolved by hva_to_pfn isn't protected by our MMU
+ * notifier and the pfn was resolved by hva_to_pfn_remapped rather than gup,
+ * its refcount needs to be bumped.
+ *
+ * Certain IO or PFNMAP mappings can be backed with valid struct pages, but be
+ * allocated without refcounting e.g., tail pages of non-compound higher order
+ * allocations, which would then underflow the refcount when the caller does the
+ * required put_page. Don't allow those pages here.
+ */
+kvm_pfn_t kvm_try_get_refcounted_page_ref(kvm_pfn_t pfn, struct page *page)
+{
+ /* If @page is valid, KVM already has a reference to the pfn/page. */
+ if (page || is_error_pfn(pfn))
+ return pfn;
+
+ page = kvm_pfn_to_refcounted_page(pfn);
+ if (!page || get_page_unless_zero(page))
+ return pfn;
+
+ return KVM_PFN_ERR_FAULT;
+}
+
+kvm_pfn_t __gfn_to_pfn_noref_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
+ bool atomic, bool interruptible, bool *async,
+ bool write_fault, bool *writable, hva_t *hva,
+ struct page **page)
{
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

@@ -2728,47 +2729,134 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
}

return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
- writable);
+ writable, page);
+}
+EXPORT_SYMBOL_GPL(__gfn_to_pfn_noref_memslot);
+
+kvm_pfn_t gfn_to_pfn_noref_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+ bool *writable, struct page **page)
+{
+ return __gfn_to_pfn_noref_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
+ NULL, write_fault, writable, NULL, page);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_noref_prot);
+
+kvm_pfn_t gfn_to_pfn_noref_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
+ struct page **page)
+{
+ return __gfn_to_pfn_noref_memslot(slot, gfn, false, false, NULL, true,
+ NULL, NULL, page);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_noref_memslot);
+
+kvm_pfn_t gfn_to_pfn_noref_memslot_atomic(const struct kvm_memory_slot *slot,
+ gfn_t gfn, struct page **page)
+{
+ return __gfn_to_pfn_noref_memslot(slot, gfn, true, false, NULL, true, NULL,
+ NULL, page);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_noref_memslot_atomic);
+
+kvm_pfn_t kvm_vcpu_gfn_to_pfn_noref_atomic(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct page **page)
+{
+ return gfn_to_pfn_noref_memslot_atomic(
+ kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, page);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_noref_atomic);
+
+kvm_pfn_t gfn_to_pfn_noref(struct kvm *kvm, gfn_t gfn, struct page **page)
+{
+ return gfn_to_pfn_noref_memslot(gfn_to_memslot(kvm, gfn), gfn, page);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_noref);
+
+kvm_pfn_t kvm_vcpu_gfn_to_pfn_noref(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct page **page)
+{
+ return gfn_to_pfn_noref_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+ gfn, page);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_noref);
+
+kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
+ bool atomic, bool interruptible, bool *async,
+ bool write_fault, bool *writable, hva_t *hva)
+{
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = __gfn_to_pfn_noref_memslot(slot, gfn, atomic, interruptible, async,
+ write_fault, writable, hva, &page);
+
+ return kvm_try_get_refcounted_page_ref(pfn, page);
}
EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);

kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable)
{
- return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
- NULL, write_fault, writable, NULL);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = gfn_to_pfn_noref_prot(kvm, gfn, write_fault, writable, &page);
+
+ return kvm_try_get_refcounted_page_ref(pfn, page);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);

kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
{
- return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
- NULL, NULL);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = gfn_to_pfn_noref_memslot(slot, gfn, &page);
+
+ return kvm_try_get_refcounted_page_ref(pfn, page);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);

kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
{
- return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
- NULL, NULL);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = gfn_to_pfn_noref_memslot_atomic(slot, gfn, &page);
+
+ return kvm_try_get_refcounted_page_ref(pfn, page);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);

kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
{
- return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = kvm_vcpu_gfn_to_pfn_noref_atomic(vcpu, gfn, &page);
+
+ return kvm_try_get_refcounted_page_ref(pfn, page);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);

kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
- return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = gfn_to_pfn_noref(kvm, gfn, &page);
+
+ return kvm_try_get_refcounted_page_ref(pfn, page);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);

kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
- return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = kvm_vcpu_gfn_to_pfn_noref(vcpu, gfn, &page);
+
+ return kvm_try_get_refcounted_page_ref(pfn, page);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);

@@ -2925,6 +3013,17 @@ void kvm_release_pfn_clean(kvm_pfn_t pfn)
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

+void kvm_release_pfn_noref_clean(kvm_pfn_t pfn, struct page *page)
+{
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ kvm_set_pfn_accessed(pfn);
+ if (page)
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_noref_clean);
+
void kvm_release_page_dirty(struct page *page)
{
WARN_ON(is_error_page(page));
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 180f1a09e6ba..a4072cc5a189 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -3,6 +3,8 @@
#ifndef __KVM_MM_H__
#define __KVM_MM_H__ 1

+#include <linux/mm_types.h>
+
/*
* Architectures can choose whether to use an rwlock or spinlock
* for the mmu_lock. These macros, for use in common code
@@ -21,7 +23,9 @@
#endif /* KVM_HAVE_MMU_RWLOCK */

kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
- bool *async, bool write_fault, bool *writable);
+ bool *async, bool write_fault, bool *writable,
+ struct page **page);
+kvm_pfn_t kvm_try_get_refcounted_page_ref(kvm_pfn_t pfn, struct page *page);

#ifdef CONFIG_HAVE_KVM_PFNCACHE
void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 2d6aba677830..e25d3af969f4 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -144,6 +144,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
void *new_khva = NULL;
unsigned long mmu_seq;
+ struct page *page;

lockdep_assert_held(&gpc->refresh_lock);

@@ -183,10 +184,19 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
}

/* We always request a writeable mapping */
- new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL);
+ new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL, &page);
if (is_error_noslot_pfn(new_pfn))
goto out_error;

+ /*
+ * Filter out pages that support refcounting but which aren't
+ * currently being refcounted. Some KVM MMUs support such pages, and while
+ * the pfncache could as well, KVM internals more generally do not. Reject
+ * them here for consistency.
+ */
+ if (kvm_try_get_refcounted_page_ref(new_pfn, page) != new_pfn)
+ goto out_error;
+
/*
* Obtain a new kernel mapping if KVM itself will access the
* pfn. Note, kmap() and memremap() can both sleep, so this
--
2.40.0.348.gf938b09366-goog