Re: [RFC 02/33] KVM: x86: Introduce KVM_CAP_APIC_ID_GROUPS

From: Alexander Graf
Date: Wed Nov 08 2023 - 07:12:04 EST



On 08.11.23 12:17, Nicolas Saenz Julienne wrote:
From: Anel Orazgaliyeva <anelkz@xxxxxxxxx>

Introduce KVM_CAP_APIC_ID_GROUPS, this capability segments the VM's APIC
ids into two. The lower bits, the physical APIC id, represent the part
that's exposed to the guest. The higher bits, which are private to KVM,
groups APICs together. APICs in different groups are isolated from each
other, and IPIs can only be directed at APICs that share the same group
as its source. Furthermore, groups are only relevant to IPIs, anything
incoming from outside the local APIC complex: from the IOAPIC, MSIs, or
PV-IPIs is targeted at the default APIC group, group 0.

When routing IPIs with physical destinations, KVM will OR the source's
vCPU APIC group with the ICR's destination ID and use that to resolve
the target lAPIC. The APIC physical map is also made group aware in
order to speed up this process. For the sake of simplicity, the logical
map is not built while KVM_CAP_APIC_ID_GROUPS is in use and we defer IPI
routing to the slower per-vCPU scan method.

This capability serves as a building block to implement virtualisation
based security features like Hyper-V's Virtual Secure Mode (VSM). VSM
introduces a para-virtualised switch that allows for guest CPUs to jump
into a different execution context, this switches into a different CPU
state, lAPIC state, and memory protections. We model this in KVM by
using distinct kvm_vcpus for each context. Moreover, execution contexts
are hierarchical and its APICs are meant to remain functional even when
the context isn't 'scheduled in'. For example, we have to keep track of
timers' expirations, and interrupt execution of lesser priority contexts
when relevant. Hence the need to alias physical APIC ids, while keeping
the ability to target specific execution contexts.

Signed-off-by: Anel Orazgaliyeva <anelkz@xxxxxxxxx>
Co-developed-by: Nicolas Saenz Julienne <nsaenz@xxxxxxxxxx>
Signed-off-by: Nicolas Saenz Julienne <nsaenz@xxxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 3 ++
arch/x86/include/uapi/asm/kvm.h | 5 +++
arch/x86/kvm/lapic.c | 59 ++++++++++++++++++++++++++++-----
arch/x86/kvm/lapic.h | 33 ++++++++++++++++++
arch/x86/kvm/x86.c | 15 +++++++++
include/uapi/linux/kvm.h | 2 ++
6 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dff10051e9b6..a2f224f95404 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1298,6 +1298,9 @@ struct kvm_arch {
struct rw_semaphore apicv_update_lock;
unsigned long apicv_inhibit_reasons;
+ u32 apic_id_group_mask;
+ u8 apic_id_group_shift;
+
gpa_t wall_clock;
bool mwait_in_guest;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a448d0964fc0..f73d137784d7 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -565,4 +565,9 @@ struct kvm_pmu_event_filter {
#define KVM_X86_DEFAULT_VM 0
#define KVM_X86_SW_PROTECTED_VM 1
+/* for KVM_SET_APIC_ID_GROUPS */
+struct kvm_apic_id_groups {
+ __u8 n_bits; /* nr of bits used to represent group in the APIC ID */
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3e977dbbf993..f55d216cb2a0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -141,7 +141,7 @@ static inline int apic_enabled(struct kvm_lapic *apic)
static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
{
- return apic->vcpu->vcpu_id;
+ return kvm_apic_id(apic->vcpu);
}
static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
@@ -219,8 +219,8 @@ static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
bool *xapic_id_mismatch)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 x2apic_id = kvm_x2apic_id(apic);
- u32 xapic_id = kvm_xapic_id(apic);
+ u32 x2apic_id = kvm_apic_id_and_group(vcpu);
+ u32 xapic_id = kvm_apic_id_and_group(vcpu);
u32 physical_id;
/*
@@ -299,6 +299,13 @@ static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
u16 mask;
u32 ldr;
+ /*
+ * Using maps for logical destinations when KVM_CAP_APIC_ID_GRUPS is in
+ * use isn't supported.
+ */
+ if (kvm_apic_group(vcpu))
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+
if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
return;
@@ -370,6 +377,25 @@ enum {
DIRTY
};
+int kvm_vm_ioctl_set_apic_id_groups(struct kvm *kvm,
+ struct kvm_apic_id_groups *groups)
+{
+ u8 n_bits = groups->n_bits;
+
+ if (n_bits > 32)
+ return -EINVAL;
+
+ kvm->arch.apic_id_group_mask = n_bits ? GENMASK(31, 32 - n_bits): 0;
+ /*
+ * Bitshifts >= than the width of the type are UD, so set the
+ * apic group shift to 0 when n_bits == 0. The group mask above will
+ * clear the APIC ID, so group querying functions will return the
+ * correct value.
+ */
+ kvm->arch.apic_id_group_shift = n_bits ? 32 - n_bits : 0;
+ return 0;
+}
+
void kvm_recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
@@ -414,7 +440,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
kvm_for_each_vcpu(i, vcpu, kvm)
if (kvm_apic_present(vcpu))
- max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
+ max_id = max(max_id, kvm_apic_id_and_group(vcpu));
new = kvzalloc(sizeof(struct kvm_apic_map) +
sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
@@ -525,7 +551,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
u32 ldr = kvm_apic_calc_x2apic_ldr(id);
- WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+ WARN_ON_ONCE(id != kvm_apic_id(apic->vcpu));
kvm_lapic_set_reg(apic, APIC_ID, id);
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
@@ -1067,6 +1093,17 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
struct kvm_lapic *target = vcpu->arch.apic;
u32 mda = kvm_apic_mda(vcpu, dest, source, target);
+ /*
+ * Make sure vCPUs belong to the same APIC group, it's not possible
+ * to send interrupts across groups.
+ *
+ * Non-IPIs and PV-IPIs can only be injected into the default APIC
+ * group (group 0).
+ */
+ if ((source && !kvm_match_apic_group(source->vcpu, vcpu)) ||
+ kvm_apic_group(vcpu))
+ return false;
+
ASSERT(target);
switch (shorthand) {
case APIC_DEST_NOSHORT:
@@ -1518,6 +1555,10 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
else
irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high);
+ if (irq.dest_mode == APIC_DEST_PHYSICAL)
+ kvm_apic_id_set_group(apic->vcpu->kvm,
+ kvm_apic_group(apic->vcpu), &irq.dest_id);
+
trace_kvm_apic_ipi(icr_low, irq.dest_id);
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
@@ -2541,7 +2582,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
/* update jump label if enable bit changes */
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
if (value & MSR_IA32_APICBASE_ENABLE) {
- kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ kvm_apic_set_xapic_id(apic, kvm_apic_id(vcpu));
static_branch_slow_dec_deferred(&apic_hw_disabled);
/* Check if there are APF page ready requests pending */
kvm_make_request(KVM_REQ_APF_READY, vcpu);
@@ -2553,9 +2594,9 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if ((old_value ^ value) & X2APIC_ENABLE) {
if (value & X2APIC_ENABLE)
- kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+ kvm_apic_set_x2apic_id(apic, kvm_apic_id(vcpu));
else if (value & MSR_IA32_APICBASE_ENABLE)
- kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ kvm_apic_set_xapic_id(apic, kvm_apic_id(vcpu));
}
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
@@ -2685,7 +2726,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
/* The xAPIC ID is set at RESET even if the APIC was already enabled. */
if (!init_event)
- kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ kvm_apic_set_xapic_id(apic, kvm_apic_id(vcpu));
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < apic->nr_lvt_entries; i++)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e1021517cf04..542bd208e52b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -97,6 +97,8 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+int kvm_vm_ioctl_set_apic_id_groups(struct kvm *kvm,
+ struct kvm_apic_id_groups *groups);
void kvm_recalculate_apic_map(struct kvm *kvm);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
@@ -277,4 +279,35 @@ static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
}
+static inline u32 kvm_apic_id(struct kvm_vcpu *vcpu)
+{
+ return vcpu->vcpu_id & ~vcpu->kvm->arch.apic_id_group_mask;
+}
+
+static inline u32 kvm_apic_id_and_group(struct kvm_vcpu *vcpu)
+{
+ return vcpu->vcpu_id;
+}
+
+static inline u32 kvm_apic_group(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ return (vcpu->vcpu_id & kvm->arch.apic_id_group_mask) >>
+ kvm->arch.apic_id_group_shift;
+}
+
+static inline void kvm_apic_id_set_group(struct kvm *kvm, u32 group,
+ u32 *apic_id)
+{
+ *apic_id |= ((group << kvm->arch.apic_id_group_shift) &
+ kvm->arch.apic_id_group_mask);
+}
+
+static inline bool kvm_match_apic_group(struct kvm_vcpu *src,
+ struct kvm_vcpu *dst)
+{
+ return kvm_apic_group(src) == kvm_apic_group(dst);
+}
+
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e3eb608b6692..4cd3f00475c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4526,6 +4526,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_MEMORY_FAULT_INFO:
+ case KVM_CAP_APIC_ID_GROUPS:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -7112,6 +7113,20 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
break;
}
+ case KVM_SET_APIC_ID_GROUPS: {


Instead of the separate ioctl, could this just be part of the ENABLE_CAP arguments? See kvm_vm_ioctl_enable_cap() for reference.

Also, for v1 please document the CAP in Documentation/virt/kvm/api.rst.

And finally, isn't there some guest exposure of the number of bits an APIC ID can have?


Alex




Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879