[PATCH v4 03/18] KVM: x86/mmu: Track count of pages in KVM MMU page caches globally

From: Vipin Sharma
Date: Mon Mar 06 2023 - 17:41:59 EST


Create a global counter for the total number of pages available
in MMU page caches across all VMs. Add mmu_shadow_page_cache
pages to this counter.

This accounting will be used in future commits to shrink MMU caches via
the KVM MMU shrinker.
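
For context, the accounting pattern is sketched below. This is an
illustrative, standalone snippet and not part of the patch; the example_*
names are hypothetical. It shows a percpu_counter tracking the delta in
cache->nobjs on top-up and free, and how a later shrinker could read it:

#include <linux/kvm_host.h>
#include <linux/percpu_counter.h>
#include <linux/shrinker.h>

/*
 * Hypothetical counter; initialized with percpu_counter_init(&c, 0,
 * GFP_KERNEL) at module init and torn down with percpu_counter_destroy().
 */
static struct percpu_counter example_unused_pages;

/* Account whatever the top-up added to the cache. */
static int example_topup(struct kvm_mmu_memory_cache *cache, int min)
{
	int orig_nobjs = cache->nobjs;
	int r = kvm_mmu_topup_memory_cache(cache, min);

	if (cache->nobjs != orig_nobjs)
		percpu_counter_add(&example_unused_pages,
				   cache->nobjs - orig_nobjs);
	return r;
}

/* Un-account the remaining pages before the cache is freed. */
static void example_free(struct kvm_mmu_memory_cache *cache)
{
	if (cache->nobjs)
		percpu_counter_sub(&example_unused_pages, cache->nobjs);
	kvm_mmu_free_memory_cache(cache);
}

/*
 * A later shrinker (not in this patch) could report reclaimable pages
 * from its count_objects() callback.
 */
static unsigned long example_count_objects(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	return percpu_counter_read_positive(&example_unused_pages);
}

A percpu_counter keeps the hot-path updates from many vCPU threads cheap,
while the shrinker side can tolerate the slightly stale value returned by
percpu_counter_read_positive().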

Signed-off-by: Vipin Sharma <vipinsh@xxxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 5 ++
arch/x86/kvm/mmu/mmu.c | 90 ++++++++++++++++++++++++++++-----
arch/x86/kvm/mmu/mmu_internal.h | 2 +
arch/x86/kvm/mmu/paging_tmpl.h | 25 +++++----
arch/x86/kvm/mmu/tdp_mmu.c | 3 +-
5 files changed, 100 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ebbe692acf3f..4322c7020d5d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -791,6 +791,11 @@ struct kvm_vcpu_arch {
struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;

+ /*
+ * Protects the allocation and release of pages from mmu_shadow_page_cache.
+ */
+ struct mutex mmu_shadow_page_cache_lock;
+
/*
* QEMU userspace and the guest each have their own FPU state.
* In vcpu_run, we switch between the user and guest FPU contexts.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3a452989f5cd..13f41b7ac280 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -167,6 +167,11 @@ struct kvm_shadow_walk_iterator {
static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;

+/*
+ * Global count of unused pages in MMU page caches across all VMs.
+ */
+static struct percpu_counter kvm_total_unused_cached_pages;
+
static void mmu_spte_set(u64 *sptep, u64 spte);

struct kvm_mmu_role_regs {
@@ -667,6 +672,34 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
}
}

+/*
+ * The caller should hold the mutex protecting the cache, if one exists.
+ */
+static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
+ int min)
+{
+ int orig_nobjs, r;
+
+ orig_nobjs = cache->nobjs;
+ r = kvm_mmu_topup_memory_cache(cache, min);
+ if (orig_nobjs != cache->nobjs)
+ percpu_counter_add(&kvm_total_unused_cached_pages,
+ (cache->nobjs - orig_nobjs));
+
+ return r;
+}
+
+/*
+ * The caller should hold the mutex protecting the kvm_mmu_memory_cache, if
+ * one exists.
+ */
+static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache)
+{
+ if (cache->nobjs)
+ percpu_counter_sub(&kvm_total_unused_cached_pages, cache->nobjs);
+ kvm_mmu_free_memory_cache(cache);
+}
+
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
int r;
@@ -676,10 +709,11 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
if (r)
return r;
- r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
- PT64_ROOT_MAX_LEVEL);
+
+ r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL);
if (r)
return r;
+
if (maybe_indirect) {
r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
PT64_ROOT_MAX_LEVEL);
@@ -693,7 +727,9 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
- kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+ mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
+ mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+ mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
@@ -2148,6 +2184,7 @@ struct shadow_page_caches {
struct kvm_mmu_memory_cache *page_header_cache;
struct kvm_mmu_memory_cache *shadow_page_cache;
struct kvm_mmu_memory_cache *shadowed_info_cache;
+ bool count_shadow_page_allocation;
};

static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
@@ -2159,7 +2196,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
struct kvm_mmu_page *sp;

sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
- sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
+ sp->spt = mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
+ caches->count_shadow_page_allocation);
if (!role.direct)
sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);

@@ -2216,6 +2254,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
.page_header_cache = &vcpu->arch.mmu_page_header_cache,
.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
+ .count_shadow_page_allocation = true,
};

return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
@@ -4314,29 +4353,32 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r != RET_PF_INVALID)
return r;

+ mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
r = mmu_topup_memory_caches(vcpu, false);
if (r)
- return r;
+ goto out_page_cache_unlock;

r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
if (r != RET_PF_CONTINUE)
- return r;
+ goto out_page_cache_unlock;

r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);

if (is_page_fault_stale(vcpu, fault))
- goto out_unlock;
+ goto out_mmu_unlock;

r = make_mmu_pages_available(vcpu);
if (r)
- goto out_unlock;
+ goto out_mmu_unlock;

r = direct_map(vcpu, fault);

-out_unlock:
+out_mmu_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(fault->pfn);
+out_page_cache_unlock:
+ mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
return r;
}

@@ -4396,25 +4438,28 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
if (r != RET_PF_INVALID)
return r;

+ mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
r = mmu_topup_memory_caches(vcpu, false);
if (r)
- return r;
+ goto out_page_cache_unlock;

r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
if (r != RET_PF_CONTINUE)
- return r;
+ goto out_page_cache_unlock;

r = RET_PF_RETRY;
read_lock(&vcpu->kvm->mmu_lock);

if (is_page_fault_stale(vcpu, fault))
- goto out_unlock;
+ goto out_mmu_unlock;

r = kvm_tdp_mmu_map(vcpu, fault);

-out_unlock:
+out_mmu_unlock:
read_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(fault->pfn);
+out_page_cache_unlock:
+ mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
return r;
}
#endif
@@ -5394,6 +5439,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;

+ mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
if (r)
goto out;
@@ -5420,6 +5466,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
*/
static_call(kvm_x86_flush_tlb_current)(vcpu);
out:
+ mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
return r;
}

@@ -5924,6 +5971,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;

vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+ mutex_init(&vcpu->arch.mmu_shadow_page_cache_lock);

vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@ -6769,12 +6817,17 @@ int kvm_mmu_vendor_module_init(void)
if (!mmu_page_header_cache)
goto out;

+ if (percpu_counter_init(&kvm_total_unused_cached_pages, 0, GFP_KERNEL))
+ goto out;
+
ret = register_shrinker(&mmu_shrinker, "x86-mmu");
if (ret)
- goto out;
+ goto out_shrinker;

return 0;

+out_shrinker:
+ percpu_counter_destroy(&kvm_total_unused_cached_pages);
out:
mmu_destroy_caches();
return ret;
@@ -6792,6 +6845,7 @@ void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
unregister_shrinker(&mmu_shrinker);
+ percpu_counter_destroy(&kvm_total_unused_cached_pages);
}

/*
@@ -6994,3 +7048,11 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
if (kvm->arch.nx_huge_page_recovery_thread)
kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
}
+
+void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
+ bool count_allocation)
+{
+ if (count_allocation && shadow_page_cache->nobjs)
+ percpu_counter_dec(&kvm_total_unused_cached_pages);
+ return kvm_mmu_memory_cache_alloc(shadow_page_cache);
+}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index cc58631e2336..798cfbf0a36b 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -338,5 +338,7 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);

void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *cache,
+ bool count_allocation);

#endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 57f0b75c80f9..1dea9be6849d 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -821,9 +821,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
return RET_PF_EMULATE;
}

+ mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
r = mmu_topup_memory_caches(vcpu, true);
if (r)
- return r;
+ goto out_page_cache_unlock;

vcpu->arch.write_fault_to_shadow_pgtable = false;

@@ -837,7 +838,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault

r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
if (r != RET_PF_CONTINUE)
- return r;
+ goto out_page_cache_unlock;

/*
* Do not change pte_access if the pfn is a mmio page, otherwise
@@ -862,16 +863,18 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
write_lock(&vcpu->kvm->mmu_lock);

if (is_page_fault_stale(vcpu, fault))
- goto out_unlock;
+ goto out_mmu_unlock;

r = make_mmu_pages_available(vcpu);
if (r)
- goto out_unlock;
+ goto out_mmu_unlock;
r = FNAME(fetch)(vcpu, fault, &walker);

-out_unlock:
+out_mmu_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(fault->pfn);
+out_page_cache_unlock:
+ mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
return r;
}

@@ -897,17 +900,18 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)

vcpu_clear_mmio_info(vcpu, gva);

+ if (!VALID_PAGE(root_hpa)) {
+ WARN_ON(1);
+ return;
+ }
+
+ mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
/*
* No need to check return value here, rmap_can_add() can
* help us to skip pte prefetch later.
*/
mmu_topup_memory_caches(vcpu, true);

- if (!VALID_PAGE(root_hpa)) {
- WARN_ON(1);
- return;
- }
-
write_lock(&vcpu->kvm->mmu_lock);
for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
level = iterator.level;
@@ -943,6 +947,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
break;
}
write_unlock(&vcpu->kvm->mmu_lock);
+ mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
}

/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7c25dbf32ecc..fa6eb1e9101e 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -265,7 +265,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;

sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
- sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+ sp->spt = mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
+ true);

return sp;
}
--
2.40.0.rc0.216.gc4246ad0f0-goog