Re: [PATCH RFC 2/2] KVM: x86: Support using the VMX preemption timer for APIC Timer periodic/oneshot mode

From: Radim Krčmář
Date: Tue Oct 11 2016 - 11:06:48 EST


2016-10-11 20:17+0800, Wanpeng Li:
> From: Wanpeng Li <wanpeng.li@xxxxxxxxxxx>
>
> Most Windows guests still use the APIC Timer in periodic/oneshot mode
> rather than tsc-deadline mode, and periodic/oneshot mode is still
> emulated by a high-overhead hrtimer on the host. This patch converts
> the expected expiration time of the periodic/oneshot timer into a
> guest tsc deadline in order to leverage the VMX preemption timer
> logic already used for APIC Timer tsc-deadline mode.
>
> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Cc: Radim Krčmář <rkrcmar@xxxxxxxxxx>
> Cc: Yunhong Jiang <yunhong.jiang@xxxxxxxxx>
> Signed-off-by: Wanpeng Li <wanpeng.li@xxxxxxxxxxx>
> ---
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> @@ -1101,13 +1101,20 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
> apic->lapic_timer.period == 0)
> return 0;
>
> + if (kvm_lapic_hv_timer_in_use(apic->vcpu)) {
> + u64 tscl = rdtsc();
> +
> + tmcct = apic->lapic_timer.tscdeadline -
> + kvm_read_l1_tsc(apic->vcpu, tscl);

Yes, this won't work. The easiest way to return a less bogus TMCCT
would be to remember the timeout when setting up the timer and replace
hrtimer_get_remaining() with it -- our delivery method shouldn't change
the expiration time.
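
Something like the following, as a rough and untested sketch of the idea
(hv_expiration is a made-up field that would have to be added to struct
kvm_timer and recorded when the hv timer is armed):

        if (kvm_lapic_hv_timer_in_use(apic->vcpu))
                /* use the expiration remembered when the hv timer was set up */
                remaining = ktime_sub(apic->lapic_timer.hv_expiration, ktime_get());
        else
                remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);

        if (ktime_to_ns(remaining) < 0)
                remaining = ktime_set(0, 0);

        ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
        tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count));

That way the scaling by APIC_BUS_CYCLE_NS and divide_count stays the same
on both paths and TMCCT keeps counting in APIC bus cycles.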

> + } else {
> + remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
> + if (ktime_to_ns(remaining) < 0)
> + remaining = ktime_set(0, 0);
> +
> + ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
> + tmcct = div64_u64(ns,
> + (APIC_BUS_CYCLE_NS * apic->divide_count));
> + }
>
> return tmcct;
> }
> @@ -1400,52 +1407,65 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
> }
> EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
>
> -static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
> +static void cancel_hv_timer(struct kvm_lapic *apic)
> {
> kvm_x86_ops->cancel_hv_timer(apic->vcpu);
> apic->lapic_timer.hv_timer_in_use = false;
> }
>
> +static bool start_hv_timer(struct kvm_lapic *apic)
> {
> + u64 tscdeadline;
>
> + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
> + u64 tscl = rdtsc();
>
> + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
> + * APIC_BUS_CYCLE_NS * apic->divide_count;
> + apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
> + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);

start_hv_timer() is called from kvm_lapic_switch_to_hv_timer(), which
can happen mid-period. Because the deadline is recomputed on every
switch, the worst case is that the timer never fires if we keep
converting back and forth. You have to compute the equivalent deadline
only once and carry it around.
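
To make that concrete -- again only an untested sketch with an invented
helper name -- the conversion could live in one place that runs only when
the timer is armed (start_apic_timer() and the periodic re-arm in
kvm_lapic_expired_hv_timer()), e.g.:

        static void set_periodic_tscdeadline(struct kvm_lapic *apic)
        {
                u64 tscl = rdtsc();

                apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
                        * APIC_BUS_CYCLE_NS * apic->divide_count;
                /* compute the equivalent deadline exactly once */
                apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
                        nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
        }

start_hv_timer() would then only read apic->lapic_timer.tscdeadline and
never move the deadline, so switching between the hrtimer and the VMX
preemption timer can happen mid-period without losing the expiration.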

> + }
> +
> + tscdeadline = apic->lapic_timer.tscdeadline;
>
> if (atomic_read(&apic->lapic_timer.pending) ||
> kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
> if (apic->lapic_timer.hv_timer_in_use)
> + cancel_hv_timer(apic);
> } else {
> apic->lapic_timer.hv_timer_in_use = true;
> hrtimer_cancel(&apic->lapic_timer.timer);
>
> /* In case the sw timer triggered in the window */
> if (atomic_read(&apic->lapic_timer.pending))
> + cancel_hv_timer(apic);
> }
> trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
> apic->lapic_timer.hv_timer_in_use);
> return apic->lapic_timer.hv_timer_in_use;
> }
>
> +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_lapic *apic = vcpu->arch.apic;
> +
> + WARN_ON(!apic->lapic_timer.hv_timer_in_use);
> + WARN_ON(swait_active(&vcpu->wq));
> + cancel_hv_timer(apic);
> + apic_timer_expired(apic);
> +
> + if (apic_lvtt_period(apic))
> + start_hv_timer(apic);
> +}
> +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
> +
> void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
> {
> struct kvm_lapic *apic = vcpu->arch.apic;
>
> WARN_ON(apic->lapic_timer.hv_timer_in_use);
>
> - if (apic_lvtt_tscdeadline(apic))
> - start_hv_tscdeadline(apic);
> + start_hv_timer(apic);
> }
> EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
>