[PATCH 09/10] KVM: Rename/refactor kvm_is_reserved_pfn() to kvm_pfn_to_refcounted_page()

From: Sean Christopherson
Date: Thu Apr 28 2022 - 21:04:52 EST


Rename and refactor kvm_is_reserved_pfn() to kvm_pfn_to_refcounted_page()
to better reflect what KVM is actually checking, and to eliminate extra
pfn_to_page() lookups. The kvm_release_pfn_*() an kvm_try_get_pfn()
helpers in particular benefit from "refouncted" nomenclature, as it's not
all that obvious why KVM needs to get/put refcounts for some PG_reserved
pages (ZERO_PAGE and ZONE_DEVICE).

Add a comment to call out that the list of exceptions to PG_reserved is
all but guaranteed to be incomplete. The list has mostly been compiled
by people throwing noodles at KVM and finding out they stick a little too
well, e.g. the ZERO_PAGE's refcount overflowed and ZONE_DEVICE pages
didn't get freed.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
---
arch/x86/kvm/mmu/mmu.c | 15 +++++----
arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
include/linux/kvm_host.h | 2 +-
virt/kvm/kvm_main.c | 66 ++++++++++++++++++++++++++++++--------
4 files changed, 63 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 5cf1436adecd..7da6741d6ea7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -587,6 +587,7 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
kvm_pfn_t pfn;
u64 old_spte = *sptep;
int level = sptep_to_sp(sptep)->role.level;
+ struct page *page;

if (!spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, 0ull);
@@ -601,11 +602,13 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
pfn = spte_to_pfn(old_spte);

/*
- * KVM does not hold the refcount of the page used by
- * kvm mmu, before reclaiming the page, we should
- * unmap it from mmu first.
+ * KVM doesn't hold a reference to any pages mapped into the guest, and
+ * instead uses the mmu_notifier to ensure that KVM unmaps any pages
+ * before they are reclaimed. Sanity check that, if the pfn is backed
+ * by a refcounted page, the refcount is elevated.
*/
- WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+ page = kvm_pfn_to_refcounted_page(pfn);
+ WARN_ON(page && !page_count(page));

if (is_accessed_spte(old_spte))
kvm_set_pfn_accessed(pfn);
@@ -2877,7 +2880,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (unlikely(fault->max_level == PG_LEVEL_4K))
return;

- if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
+ if (is_error_noslot_pfn(fault->pfn) || !kvm_pfn_to_refcounted_page(fault->pfn))
return;

if (kvm_slot_dirty_track_enabled(slot))
@@ -5947,7 +5950,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
* the guest, and the guest page table is using 4K page size
* mapping if the indirect sp has level = 1.
*/
- if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+ if (sp->role.direct && kvm_pfn_to_refcounted_page(pfn) &&
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
pfn, PG_LEVEL_NUM)) {
pte_list_remove(kvm, rmap_head, sptep);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 566548a3efa7..de2cc963dbec 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1737,7 +1737,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
continue;

pfn = spte_to_pfn(iter.old_spte);
- if (kvm_is_reserved_pfn(pfn) ||
+ if (!kvm_pfn_to_refcounted_page(pfn) ||
iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
pfn, PG_LEVEL_NUM))
continue;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4ccc309a43f2..9d5818b782f9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1547,7 +1547,7 @@ void kvm_arch_sync_events(struct kvm *kvm);

int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);

-bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
+struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn);
bool kvm_is_zone_device_page(struct page *page);

struct kvm_irq_ack_notifier {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cbc6d58081d4..656c47037eea 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -178,19 +178,36 @@ bool kvm_is_zone_device_page(struct page *page)
return is_zone_device_page(page);
}

-bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
+/*
+ * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
+ * page, NULL otherwise. Note, the list of refcounted PG_reserved page types
+ * is likely incomplete, it has been compiled purely through people wanting to
+ * back guest with a certain type of memory and encountering issues.
+ */
+struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
{
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ page = pfn_to_page(pfn);
+ if (!PageReserved(page))
+ return page;
+
+ /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
+ if (is_zero_pfn(pfn))
+ return page;
+
/*
* ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
* perspective they are "normal" pages, albeit with slightly different
* usage rules.
*/
- if (pfn_valid(pfn))
- return PageReserved(pfn_to_page(pfn)) &&
- !is_zero_pfn(pfn) &&
- !kvm_is_zone_device_page(pfn_to_page(pfn));
+ if (kvm_is_zone_device_page(page))
+ return page;

- return true;
+ return NULL;
}

/*
@@ -2479,9 +2496,12 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)

static int kvm_try_get_pfn(kvm_pfn_t pfn)
{
- if (kvm_is_reserved_pfn(pfn))
+ struct page *page = kvm_pfn_to_refcounted_page(pfn);
+
+ if (!page)
return 1;
- return get_page_unless_zero(pfn_to_page(pfn));
+
+ return get_page_unless_zero(page);
}

static int hva_to_pfn_remapped(struct vm_area_struct *vma,
@@ -2706,6 +2726,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
*/
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
+ struct page *page;
kvm_pfn_t pfn;

pfn = gfn_to_pfn(kvm, gfn);
@@ -2713,10 +2734,11 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
if (is_error_noslot_pfn(pfn))
return KVM_ERR_PTR_BAD_PAGE;

- if (kvm_is_reserved_pfn(pfn))
+ page = kvm_pfn_to_refcounted_page(pfn);
+ if (!page)
return KVM_ERR_PTR_BAD_PAGE;

- return pfn_to_page(pfn);
+ return page;
}
EXPORT_SYMBOL_GPL(gfn_to_page);

@@ -2819,8 +2841,16 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(kvm_pfn_t pfn)
{
- if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
- kvm_release_page_clean(pfn_to_page(pfn));
+ struct page *page;
+
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ page = kvm_pfn_to_refcounted_page(pfn);
+ if (!page)
+ return;
+
+ kvm_release_page_clean(page);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

@@ -2835,8 +2865,16 @@ EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(kvm_pfn_t pfn)
{
- if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
- kvm_release_page_dirty(pfn_to_page(pfn));
+ struct page *page;
+
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ page = kvm_pfn_to_refcounted_page(pfn);
+ if (!page)
+ return;
+
+ kvm_release_page_dirty(page);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

--
2.36.0.464.gb9c8b46e94-goog