[PATCHv4 3/5] kvm: host side for eoi optimization

From: Michael S. Tsirkin
Date: Wed May 16 2012 - 07:46:22 EST


Implementation of PV EOI using shared memory.
This reduces the number of exits an interrupt
causes as much as by half.

The idea is simple: there's a bit, per APIC, in guest memory,
that tells the guest that it does not need EOI.
We set it before injecting an interrupt and clear
before injecting a nested one. Guest tests it using
a test and clear operation - this is necessary
so that host can detect interrupt nesting -
and if set, it can skip the EOI MSR.

There's a new MSR to set the address of said register
in guest memory. Otherwise not much changed:
- Guest EOI is not required
- Register is tested & ISR is automatically cleared on exit

For testing results see description of previous patch
'kvm_para: guest side for eoi avoidance'.

Signed-off-by: Michael S. Tsirkin <mst@xxxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 12 ++++
arch/x86/kvm/cpuid.c | 1 +
arch/x86/kvm/irq.c | 2 +-
arch/x86/kvm/lapic.c | 137 +++++++++++++++++++++++++++++++++++++--
arch/x86/kvm/lapic.h | 2 +
arch/x86/kvm/trace.h | 34 ++++++++++
arch/x86/kvm/x86.c | 5 ++
7 files changed, 187 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 32297f2..3ded418 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -174,6 +174,13 @@ enum {

/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
+/*
+ * The following bit is set with PV-EOI, unset on EOI.
+ * We detect PV-EOI changes by guest by comparing
+ * this bit with PV-EOI in guest memory.
+ * See the implementation in apic_update_pv_eoi.
+ */
+#define KVM_APIC_PV_EOI_PENDING 1

/*
* We don't want allocation failures within the mmu code, so we preallocate
@@ -485,6 +492,11 @@ struct kvm_vcpu_arch {
u64 length;
u64 status;
} osvw;
+
+ struct {
+ u64 msr_val;
+ struct gfn_to_hva_cache data;
+ } pv_eoi;
};

struct kvm_lpage_info {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index eba8345..a9f155a 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -408,6 +408,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
(1 << KVM_FEATURE_NOP_IO_DELAY) |
(1 << KVM_FEATURE_CLOCKSOURCE2) |
(1 << KVM_FEATURE_ASYNC_PF) |
+ (1 << KVM_FEATURE_PV_EOI) |
(1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);

if (sched_info_on())
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 7e06ba1..07bdfab 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -48,7 +48,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
if (!irqchip_in_kernel(v->kvm))
return v->arch.interrupt.pending;

- if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
+ if (kvm_apic_has_interrupt(v) < 0) { /* LAPIC */
if (kvm_apic_accept_pic_intr(v)) {
s = pic_irqchip(v->kvm); /* PIC */
return s->output;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93c1574..b4f7013 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -270,6 +270,54 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
irq->level, irq->trig_mode);
}

+static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
+ sizeof(val));
+}
+
+static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
+{
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
+}
+
+static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+
+static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
+{
+ u8 val;
+ if (pv_eoi_get_user(vcpu, &val) < 0)
+ apic_debug("Can't read EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+ return val & 0x1;
+}
+
+static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
+ apic_debug("Can't set EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+ return;
+ }
+ __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
+static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
+ apic_debug("Can't clear EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+ return;
+ }
+ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
{
int result;
@@ -482,16 +530,26 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}

-static void apic_set_eoi(struct kvm_lapic *apic)
+static int apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
+
+ trace_kvm_eoi(apic, vector);
+
/*
* Not every write EOI will has corresponding ISR,
* one example is when Kernel check timer on setup_IO_APIC
*/
if (vector == -1)
- return;
+ return vector;

+ /*
+ * It's legal for guest to ignore the PV EOI optimization
+ * and signal EOI by APIC write. If this happens, clear
+ * PV EOI on guest's behalf.
+ */
+ if (pv_eoi_enabled(apic->vcpu))
+ pv_eoi_clr_pending(apic->vcpu);
apic_clear_vector(vector, apic->regs + APIC_ISR);
apic_update_ppr(apic);

@@ -505,6 +563,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
}
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ return vector;
}

static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1085,6 +1144,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
atomic_set(&apic->lapic_timer.pending, 0);
if (kvm_vcpu_is_bsp(vcpu))
vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);

vcpu->arch.apic_arb_prio = 0;
@@ -1211,9 +1271,10 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)

apic_update_ppr(apic);
highest_irr = apic_find_highest_irr(apic);
- if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+ if (highest_irr == -1)
return -1;
+ if (((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+ return -2;
return highest_irr;
}

@@ -1245,9 +1306,20 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;

- if (vector == -1)
+ /* Detect interrupt nesting and disable EOI optimization */
+ if (pv_eoi_enabled(vcpu) && vector == -2)
+ pv_eoi_clr_pending(vcpu);
+
+ if (vector < 0)
return -1;

+ if (pv_eoi_enabled(vcpu)) {
+ if (kvm_ioapic_handles_vector(vcpu->kvm, vector))
+ pv_eoi_clr_pending(vcpu);
+ else
+ pv_eoi_set_pending(vcpu);
+ }
+
apic_set_vector(vector, apic->regs + APIC_ISR);
apic_update_ppr(apic);
apic_clear_irr(vector, apic);
@@ -1262,6 +1334,18 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
MSR_IA32_APICBASE_BASE;
kvm_apic_set_version(vcpu);

+ if (pv_eoi_enabled(apic->vcpu)) {
+ /*
+ * Update shadow bit in apic_attention bitmask
+ * from guest memory.
+ */
+ if (pv_eoi_get_pending(apic->vcpu))
+ __set_bit(KVM_APIC_PV_EOI_PENDING,
+ &vcpu->arch.apic_attention);
+ else
+ __clear_bit(KVM_APIC_PV_EOI_PENDING,
+ &vcpu->arch.apic_attention);
+ }
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
update_divide_count(apic);
@@ -1283,11 +1367,42 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}

+/*
+ * apic_update_pv_eoi - called on vmexit
+ *
+ * Detect whether guest triggered PV EOI since the
+ * last entry. If yes, set EOI on guests's behalf.
+ */
+static void apic_update_pv_eoi(struct kvm_lapic *apic)
+{
+ /*
+ * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
+ * and KVM_PV_EOI_ENABLED in guest memory as follows:
+ *
+ * KVM_APIC_PV_EOI_PENDING is unset:
+ * -> host disabled PV EOI.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
+ * -> host enabled PV EOI, guest did not execute EOI yet.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
+ * -> host enabled PV EOI, guest executed EOI.
+ */
+ int vector;
+ BUG_ON(!pv_eoi_enabled(apic->vcpu));
+ if (pv_eoi_get_pending(apic->vcpu))
+ return;
+ __clear_bit(KVM_APIC_PV_EOI_PENDING, &apic->vcpu->arch.apic_attention);
+ vector = apic_set_eoi(apic);
+ trace_kvm_pv_eoi(apic, vector);
+}
+
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
{
u32 data;
void *vapic;

+ if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
+ apic_update_pv_eoi(vcpu->arch.apic);
+
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;

@@ -1394,3 +1509,15 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)

return 0;
}
+
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
+{
+ u64 addr = data & ~KVM_MSR_ENABLED;
+ if (pv_eoi_enabled(vcpu))
+ pv_eoi_clr_pending(vcpu);
+ vcpu->arch.pv_eoi.msr_val = data;
+ if (!pv_eoi_enabled(vcpu))
+ return 0;
+ return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ addr);
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6f4ce25..149ef01 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -60,4 +60,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
{
return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
}
+
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
#endif
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 911d264..851914e 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq,
__entry->coalesced ? " (coalesced)" : "")
);

+TRACE_EVENT(kvm_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+TRACE_EVENT(kvm_pv_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
/*
* Tracepoint for nested VMRUN
*/
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 185a2b8..4d77af0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -795,6 +795,7 @@ static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -1653,6 +1654,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);

break;
+ case MSR_KVM_PV_EOI_EN:
+ if (kvm_lapic_enable_pv_eoi(vcpu, data))
+ return 1;
+ break;

case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
--
MST

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/