[PATCH v3 2/4] x86/kvm: introduce a per cpu vcpu mask

From: Juergen Gross
Date: Tue Nov 16 2021 - 09:11:20 EST


In order to support a high number of vcpus per guest, don't use an
on-stack vcpu bitmask. As the currently used bitmask isn't accessed in
functions subject to recursion, it is fairly easy to replace it with a
percpu bitmask.

Allocate this bitmask dynamically in order to support a boot time
specified maximum number of vcpus in the future.
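
As a sketch of where this is heading (a hypothetical example: the
variable kvm_max_vcpus and its boot parameter are not part of this
series, today the size is still derived from the compile time constant
KVM_MAX_VCPUS), only the size calculation would need to change:

  /* hypothetical follow-up, value taken from a boot parameter: */
  extern unsigned int kvm_max_vcpus;
  #define KVM_VCPU_MASK_SZ \
          (sizeof(*kvm_pcpu_vcpu_mask) * BITS_TO_LONGS(kvm_max_vcpus))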

Guard such a bitmask with a local_lock while it is being used (on
non-PREEMPT_RT kernels this boils down to disabling preemption) in
order to avoid double usage of the same per cpu mask in case of a cpu
switch.
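
Condensed, each user of the mask then follows this pattern (all
identifiers as introduced by this patch; "mask" merely stands for the
local variable at the respective call site):

  unsigned long *mask;

  local_lock(&kvm_pcpu_mask_lock);

  mask = this_cpu_ptr(kvm_pcpu_vcpu_mask);
  bitmap_zero(mask, KVM_MAX_VCPUS);
  /* ... fill and consume the mask, no sleeping in between ... */

  local_unlock(&kvm_pcpu_mask_lock);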

Note that this doesn't apply to the vcpu bitmasks used in hyperv.c,
as there the maximum number of vcpus is architecturally limited to
4096 and such a bitmask can remain on the stack.
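
For scale: 4096 bits are just 4096 / 8 = 512 bytes, which is still fine
on a kernel stack, while a mask sized for e.g. a (hypothetical) limit
of 65536 vcpus would already take 8 kB.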

Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
---
V2:
- use local_lock() instead of preempt_disable() (Paolo Bonzini)
V3:
- drop hyperv.c related changes (Eduardo Habkost)
---
 arch/x86/include/asm/kvm_host.h |  7 +++++++
 arch/x86/kvm/ioapic.c           |  8 +++++++-
 arch/x86/kvm/irq_comm.c         |  9 +++++++--
 arch/x86/kvm/x86.c              | 18 +++++++++++++++++-
 4 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bcef56f1039a..886930ec8264 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
 #include <linux/cpumask.h>
 #include <linux/irq_work.h>
 #include <linux/irq.h>
+#include <linux/local_lock.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -1612,6 +1613,12 @@ extern bool kvm_has_bus_lock_exit;
 /* maximum vcpu-id */
 unsigned int kvm_max_vcpu_ids(void);
 
+/* per cpu vcpu bitmask, protected by kvm_pcpu_mask_lock */
+DECLARE_PER_CPU(local_lock_t, kvm_pcpu_mask_lock);
+extern unsigned long __percpu *kvm_pcpu_vcpu_mask;
+#define KVM_VCPU_MASK_SZ \
+        (sizeof(*kvm_pcpu_vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS))
+
 extern u64 kvm_mce_cap_supported;

/*
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 64ba9b1c8b3d..c81963a27594 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -320,7 +320,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
         bool mask_before, mask_after;
         union kvm_ioapic_redirect_entry *e;
         int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode;
-        DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+        unsigned long *vcpu_bitmap;
 
         switch (ioapic->ioregsel) {
         case IOAPIC_REG_VERSION:
@@ -384,6 +384,10 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
                         irq.shorthand = APIC_DEST_NOSHORT;
                         irq.dest_id = e->fields.dest_id;
                         irq.msi_redir_hint = false;
+
+                        local_lock(&kvm_pcpu_mask_lock);
+
+                        vcpu_bitmap = this_cpu_ptr(kvm_pcpu_vcpu_mask);
                         bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
                         kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
                                                  vcpu_bitmap);
@@ -403,6 +407,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
                 }
                 kvm_make_scan_ioapic_request_mask(ioapic->kvm,
                                                   vcpu_bitmap);
+
+                local_unlock(&kvm_pcpu_mask_lock);
         } else {
                 kvm_make_scan_ioapic_request(ioapic->kvm);
         }
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index d5b72a08e566..c331204de007 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -47,7 +47,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 {
         int i, r = -1;
         struct kvm_vcpu *vcpu, *lowest = NULL;
-        unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+        unsigned long *dest_vcpu_bitmap;
         unsigned int dest_vcpus = 0;
 
         if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
@@ -59,7 +59,10 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
                 irq->delivery_mode = APIC_DM_FIXED;
         }
 
-        memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+        local_lock(&kvm_pcpu_mask_lock);
+        dest_vcpu_bitmap = this_cpu_ptr(kvm_pcpu_vcpu_mask);
+
+        memset(dest_vcpu_bitmap, 0, KVM_VCPU_MASK_SZ);
 
         kvm_for_each_vcpu(i, vcpu, kvm) {
                 if (!kvm_apic_present(vcpu))
@@ -93,6 +96,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
                 lowest = kvm_get_vcpu(kvm, idx);
         }
 
+        local_unlock(&kvm_pcpu_mask_lock);
+
         if (lowest)
                 r = kvm_apic_set_irq(lowest, irq, dest_map);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 61bab2bdeefb..a388acdc5eb0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -215,6 +215,10 @@ unsigned int kvm_max_vcpu_ids(void)
 }
 EXPORT_SYMBOL_GPL(kvm_max_vcpu_ids);
 
+DEFINE_PER_CPU(local_lock_t, kvm_pcpu_mask_lock) =
+        INIT_LOCAL_LOCK(kvm_pcpu_mask_lock);
+unsigned long __percpu *kvm_pcpu_vcpu_mask;
+
 /*
  * Restoring the host value for MSRs that are only consumed when running in
  * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -11247,9 +11251,16 @@ int kvm_arch_hardware_setup(void *opaque)
         if (boot_cpu_has(X86_FEATURE_XSAVES))
                 rdmsrl(MSR_IA32_XSS, host_xss);
 
+        kvm_pcpu_vcpu_mask = __alloc_percpu(KVM_VCPU_MASK_SZ,
+                                            sizeof(unsigned long));
+        if (!kvm_pcpu_vcpu_mask) {
+                r = -ENOMEM;
+                goto err;
+        }
+
         r = ops->hardware_setup();
         if (r != 0)
-                return r;
+                goto err;
 
         memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
         kvm_ops_static_call_update();
@@ -11277,11 +11288,16 @@ int kvm_arch_hardware_setup(void *opaque)

         kvm_init_msr_list();
         return 0;
+
+ err:
+        free_percpu(kvm_pcpu_vcpu_mask);
+        return r;
 }
 
 void kvm_arch_hardware_unsetup(void)
 {
         static_call(kvm_x86_hardware_unsetup)();
+        free_percpu(kvm_pcpu_vcpu_mask);
 }
 
 int kvm_arch_check_processor_compat(void *opaque)
--
2.26.2