Re: [Patch v4 16/18] KVM: x86/mmu: Allocate numa aware page tables during page fault

From: David Matlack
Date: Wed Mar 29 2023 - 15:04:15 EST


On Tue, Mar 28, 2023 at 05:21:29PM -0700, David Matlack wrote:
> On Mon, Mar 06, 2023 at 02:41:25PM -0800, Vipin Sharma wrote:
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 64de083cd6b9..77d3aa368e5e 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -787,7 +787,7 @@ struct kvm_vcpu_arch {
> > struct kvm_mmu *walk_mmu;
> >
> > struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
> > - struct kvm_mmu_memory_cache mmu_shadow_page_cache;
> > + struct kvm_mmu_memory_cache mmu_shadow_page_cache[MAX_NUMNODES];
>
> I think we need an abstraction for a NUMA-aware mmu cache, since there
> is more than one such cache by the end of this series.
>
> e.g. a wrapper struct (struct kvm_mmu_numa_memory_cache), or making
> NUMA-awareness an optional feature within kvm_mmu_memory_cache, plus
> common helper functions for operations like initializing, topping up,
> and freeing.
>
> I have some ideas I want to try but I ran out of time today.

Something like this (compile-tested only, applies on top of this series):

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 041302d6132c..b44f867d0ed2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -787,7 +787,7 @@ struct kvm_vcpu_arch {
struct kvm_mmu *walk_mmu;

struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
- struct kvm_mmu_memory_cache mmu_shadow_page_cache[MAX_NUMNODES];
+ struct kvm_mmu_numa_memory_cache mmu_shadow_page_cache;
struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;

@@ -1453,7 +1453,7 @@ struct kvm_arch {
*
* Protected by kvm->slots_lock.
*/
- struct kvm_mmu_memory_cache split_shadow_page_cache[MAX_NUMNODES];
+ struct kvm_mmu_numa_memory_cache split_shadow_page_cache;
struct kvm_mmu_memory_cache split_page_header_cache;

/*
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 5463ce6e52fa..fb7b3932f08d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -702,7 +702,7 @@ static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache)

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
- int r, nid = KVM_MMU_DEFAULT_CACHE_INDEX;
+ int r;

/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
@@ -710,16 +710,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
if (r)
return r;

- if (kvm_numa_aware_page_table_enabled(vcpu->kvm)) {
- for_each_online_node(nid) {
- r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache[nid],
- KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE);
- }
- } else {
- r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache[nid],
- KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE);
- }
-
+ r = kvm_mmu_topup_numa_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+ KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE);
if (r)
return r;

@@ -735,12 +727,9 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- int nid;
-
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
- for_each_node(nid)
- mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache[nid]);
+ kvm_mmu_free_numa_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
@@ -2262,7 +2251,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
{
struct shadow_page_caches caches = {
.page_header_cache = &vcpu->arch.mmu_page_header_cache,
- .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache[nid],
+ .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache.nodes[nid],
.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
};

@@ -5977,7 +5966,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)

int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
- int ret, nid;
+ int ret;

INIT_KVM_MMU_MEMORY_CACHE(&vcpu->arch.mmu_pte_list_desc_cache);
vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
@@ -5985,11 +5974,9 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
INIT_KVM_MMU_MEMORY_CACHE(&vcpu->arch.mmu_page_header_cache);
vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;

- for_each_node(nid) {
- INIT_KVM_MMU_MEMORY_CACHE(&vcpu->arch.mmu_shadow_page_cache[nid]);
- if (kvm_numa_aware_page_table_enabled(vcpu->kvm))
- vcpu->arch.mmu_shadow_page_cache[nid].node = nid;
- }
+ kvm_mmu_init_numa_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+ if (kvm_numa_aware_page_table_enabled(vcpu->kvm))
+ kvm_mmu_enable_numa_memory_cache(&vcpu->arch.mmu_shadow_page_cache);

mutex_init(&vcpu->arch.mmu_shadow_page_cache_lock);

@@ -6140,7 +6127,7 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
int kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
- int r, nid;
+ int r;

INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
@@ -6159,9 +6146,7 @@ int kvm_mmu_init_vm(struct kvm *kvm)
INIT_KVM_MMU_MEMORY_CACHE(&kvm->arch.split_page_header_cache);
kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;

- for_each_node(nid)
- INIT_KVM_MMU_MEMORY_CACHE(&kvm->arch.split_shadow_page_cache[nid]);
-
+ kvm_mmu_init_numa_memory_cache(&kvm->arch.split_shadow_page_cache);

INIT_KVM_MMU_MEMORY_CACHE(&kvm->arch.split_desc_cache);
kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
@@ -6171,13 +6156,10 @@ int kvm_mmu_init_vm(struct kvm *kvm)

static void mmu_free_vm_memory_caches(struct kvm *kvm)
{
- int nid;
-
kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
mutex_lock(&kvm->slots_lock);
- for_each_node(nid)
- mmu_free_sp_memory_cache(&kvm->arch.split_shadow_page_cache[nid]);
+ kvm_mmu_free_numa_memory_cache(&kvm->arch.split_shadow_page_cache);
mutex_unlock(&kvm->slots_lock);
}

@@ -6299,7 +6281,7 @@ static bool need_topup_split_caches_or_resched(struct kvm *kvm, int nid)
*/
return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
need_topup(&kvm->arch.split_page_header_cache, 1) ||
- need_topup(&kvm->arch.split_shadow_page_cache[nid], 1);
+ need_topup(&kvm->arch.split_shadow_page_cache.nodes[nid], 1);
}

static int topup_split_caches(struct kvm *kvm, int nid)
@@ -6332,7 +6314,7 @@ static int topup_split_caches(struct kvm *kvm, int nid)
if (r)
return r;

- return mmu_topup_sp_memory_cache(&kvm->arch.split_shadow_page_cache[nid], 1);
+ return mmu_topup_sp_memory_cache(&kvm->arch.split_shadow_page_cache.nodes[nid], 1);
}

static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep,
@@ -6357,7 +6339,7 @@ static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *hu

/* Direct SPs do not require a shadowed_info_cache. */
caches.page_header_cache = &kvm->arch.split_page_header_cache;
- caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache[nid];
+ caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache.nodes[nid];

/* Safe to pass NULL for vCPU since requesting a direct SP. */
return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
@@ -6760,7 +6742,7 @@ static unsigned long mmu_shrink_scan(struct shrinker *shrink,
list_move_tail(&kvm->vm_list, &vm_list);

kvm_for_each_vcpu(i, vcpu, kvm) {
- freed += mmu_memory_cache_try_empty(vcpu->arch.mmu_shadow_page_cache,
+ freed += mmu_memory_cache_try_empty(vcpu->arch.mmu_shadow_page_cache.nodes,
MAX_NUMNODES,
&vcpu->arch.mmu_shadow_page_cache_lock);
freed += mmu_memory_cache_try_empty(&vcpu->arch.mmu_shadowed_info_cache,
@@ -6769,7 +6751,7 @@ static unsigned long mmu_shrink_scan(struct shrinker *shrink,
if (freed >= sc->nr_to_scan)
goto out;
}
- freed += mmu_memory_cache_try_empty(kvm->arch.split_shadow_page_cache,
+ freed += mmu_memory_cache_try_empty(kvm->arch.split_shadow_page_cache.nodes,
MAX_NUMNODES, &kvm->slots_lock);
if (freed >= sc->nr_to_scan)
goto out;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 63113a66f560..721d5a415807 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -265,7 +265,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu, int nid)
struct kvm_mmu_page *sp;

sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
- sp->spt = mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache[nid]);
+ sp->spt = mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache.nodes[nid]);

return sp;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d8ea39b248cd..940099629626 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6176,7 +6176,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
- int r, nid;
+ int r;

if (cap->flags)
return -EINVAL;
@@ -6399,9 +6399,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.numa_aware_page_table = true;

mutex_lock(&kvm->slots_lock);
- for_each_node(nid) {
- kvm->arch.split_shadow_page_cache[nid].node = nid;
- }
+ kvm_mmu_enable_numa_memory_cache(&kvm->arch.split_shadow_page_cache);
mutex_unlock(&kvm->slots_lock);
r = 0;
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 31586a65e346..d5d966e4a8bf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1365,6 +1365,11 @@ int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
void kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc);
void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
+
+void kvm_mmu_init_numa_memory_cache(struct kvm_mmu_numa_memory_cache *cache);
+void kvm_mmu_enable_numa_memory_cache(struct kvm_mmu_numa_memory_cache *cache);
+int kvm_mmu_topup_numa_memory_cache(struct kvm_mmu_numa_memory_cache *cache, int min);
+void kvm_mmu_free_numa_memory_cache(struct kvm_mmu_numa_memory_cache *cache);
#endif

void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 13032da2ddfc..7a58ea37bc15 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -101,6 +101,10 @@ struct kvm_mmu_memory_cache {
int node;
};

+struct kvm_mmu_numa_memory_cache {
+ struct kvm_mmu_memory_cache nodes[MAX_NUMNODES];
+};
+
#define KVM_MMU_MEMORY_CACHE_INIT() { \
.gfp_zero = __GFP_ZERO, \
.node = NUMA_NO_NODE, \
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 25a549705c8e..2607b546c3c9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -476,6 +476,43 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
BUG_ON(!p);
return p;
}
+
+void kvm_mmu_init_numa_memory_cache(struct kvm_mmu_numa_memory_cache *cache)
+{
+ int node;
+
+ for_each_node(node)
+ INIT_KVM_MMU_MEMORY_CACHE(&cache->nodes[node]);
+}
+
+void kvm_mmu_enable_numa_memory_cache(struct kvm_mmu_numa_memory_cache *cache)
+{
+ int node;
+
+ for_each_node(node)
+ cache->nodes[node].node = node;
+}
+
+int kvm_mmu_topup_numa_memory_cache(struct kvm_mmu_numa_memory_cache *cache, int min)
+{
+ int r, node;
+
+ for_each_online_node(node) {
+ r = kvm_mmu_topup_memory_cache(&cache->nodes[node], min);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+void kvm_mmu_free_numa_memory_cache(struct kvm_mmu_numa_memory_cache *cache)
+{
+ int node;
+
+ for_each_node(node)
+ kvm_mmu_free_memory_cache(&cache->nodes[node]);
+}
#endif

static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
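
For anyone skimming the thread, here is a rough standalone userspace sketch
(not kernel code, purely illustrative) of the shape of the wrapper-struct
approach: the NUMA-aware cache is just an array of per-node caches plus
init/enable/topup helpers that loop over the nodes, and callers index the
per-node sub-cache with the node id of the fault (as tdp_mmu_alloc_sp() does
above). All names and the toy "allocation" below are invented for
illustration only; the real helpers are the kvm_mmu_*_numa_memory_cache()
functions in the patch above.

/*
 * Toy userspace model of the wrapper-struct approach; names are invented
 * for illustration and do not exist in KVM.
 */
#include <stdio.h>

#define MAX_NODES 4		/* stand-in for MAX_NUMNODES */
#define NO_NODE   (-1)		/* stand-in for NUMA_NO_NODE */

struct cache {
	int node;		/* preferred NUMA node, or NO_NODE */
	int nobjs;		/* number of pre-allocated objects */
};

struct numa_cache {
	struct cache nodes[MAX_NODES];
};

static void numa_cache_init(struct numa_cache *c)
{
	for (int n = 0; n < MAX_NODES; n++) {
		c->nodes[n].node = NO_NODE;
		c->nodes[n].nobjs = 0;
	}
}

/* Pin each sub-cache to its node, like kvm_mmu_enable_numa_memory_cache(). */
static void numa_cache_enable(struct numa_cache *c)
{
	for (int n = 0; n < MAX_NODES; n++)
		c->nodes[n].node = n;
}

/* Top up every node's sub-cache, like kvm_mmu_topup_numa_memory_cache(). */
static int numa_cache_topup(struct numa_cache *c, int min)
{
	for (int n = 0; n < MAX_NODES; n++) {
		if (c->nodes[n].nobjs < min)
			c->nodes[n].nobjs = min;	/* pretend allocation succeeds */
	}
	return 0;
}

int main(void)
{
	struct numa_cache c;

	numa_cache_init(&c);
	numa_cache_enable(&c);
	numa_cache_topup(&c, 5);

	/* A fault handler would pull from the sub-cache of the faulting node. */
	printf("node 2: node=%d nobjs=%d\n", c.nodes[2].node, c.nodes[2].nobjs);
	return 0;
}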