Re: [PATCH v3 1/2] KVM: LAPIC: Tune lapic_timer_advance_ns smoothly

From: Wanpeng Li
Date: Mon Sep 16 2019 - 00:03:05 EST


On Thu, 12 Sep 2019 at 20:37, Paolo Bonzini <pbonzini@xxxxxxxxxx> wrote:
>
> On 12/09/19 02:34, Wanpeng Li wrote:
> >>> - timer_advance_ns -= min((u32)ns,
> >>> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> >>> + timer_advance_ns -= ns;
>
> Looking more closely, this assignment...
>
> >>> } else {
> >>> /* too late */
> >>> ns = advance_expire_delta * 1000000ULL;
> >>> do_div(ns, vcpu->arch.virtual_tsc_khz);
> >>> - timer_advance_ns += min((u32)ns,
> >>> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> >>> + timer_advance_ns += ns;
>
> ... and this one are dead code now. However...
>
> >>> }
> >>>
> >>> + timer_advance_ns = (apic->lapic_timer.timer_advance_ns *
> >>> + (LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) /
> >>> + LAPIC_TIMER_ADVANCE_ADJUST_STEP;
>
> ... you should instead remove this new assignment and just make the
> assignments above just
>
> timer_advance -= ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP;
>
> and
>
> timer_advance -= ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP;
>
> In fact this whole last assignment is buggy, since advance_expire_delta
> is in TSC units rather than nanoseconds.
>
> >>> if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> >>> apic->lapic_timer.timer_advance_adjust_done = true;
> >>> if (unlikely(timer_advance_ns > 5000)) {
> >> This looks great. But instead of patch 2, why not remove
> >> timer_advance_adjust_done altogether?
> > It can fluctuate w/o stop.
>
> Possibly because of the wrong calculation of timer_advance_ns?

Hi Paolo,

Something like below? It still fluctuate when running complex guest os
like linux. Removing timer_advance_adjust_done will hinder introduce
patch v3 2/2 since there is no adjust done flag in each round
evaluation. I have two questions here, best-effort tune always as
below or periodically revaluate to get conservative value and get
best-effort value in each round evaluation as patch v3 2/2, which one
do you prefer? The former one can wast time to wait sometimes and the
later one can not get the best latency. In addition, can the adaptive
tune algorithm be using in all the scenarios contain
RT/over-subscribe?

---8<---
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 685d17c..895735b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -69,6 +69,7 @@
#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+#define LAPIC_TIMER_ADVANCE_FILTER 5000

static inline int apic_test_vector(int vec, void *bitmap)
{
@@ -1479,29 +1480,28 @@ static inline void
adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
s64 advance_expire_delta)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
- u64 ns;
+ u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns;
+
+ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_FILTER ||
+ abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) {
+ /* filter out random fluctuations */
+ return;
+ }

/* too early */
if (advance_expire_delta < 0) {
ns = -advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns -= min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
} else {
/* too late */
ns = advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns += min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
}

- if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- apic->lapic_timer.timer_advance_adjust_done = true;
- if (unlikely(timer_advance_ns > 5000)) {
+ if (unlikely(timer_advance_ns > 5000))
timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
- apic->lapic_timer.timer_advance_adjust_done = false;
- }
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}

@@ -1521,7 +1521,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);

- if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+ if (lapic_timer_advance_ns == -1)
adjust_lapic_timer_advance(vcpu,
apic->lapic_timer.advance_expire_delta);
}

@@ -2298,10 +2298,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int
timer_advance_ns)
apic->lapic_timer.timer.function = apic_timer_fn;
if (timer_advance_ns == -1) {
apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
- apic->lapic_timer.timer_advance_adjust_done = false;
} else {
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
- apic->lapic_timer.timer_advance_adjust_done = true;
}


diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 50053d2..2aad7e2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -35,7 +35,6 @@ struct kvm_timer {
s64 advance_expire_delta;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
- bool timer_advance_adjust_done;
};

struct kvm_lapic {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 93b0bd4..4f65ef1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -141,7 +141,7 @@
* advancement entirely. Any other value is used as-is and disables adaptive
* tuning, i.e. allows priveleged userspace to set an exact advancement time.
*/
-static int __read_mostly lapic_timer_advance_ns = -1;
+int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);

static bool __read_mostly vector_hashing = true;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6594020..2c6ba86 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -301,6 +301,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,

extern bool enable_vmware_backdoor;

+extern int lapic_timer_advance_ns;
extern int pi_inject_timer;

extern struct static_key kvm_no_apic_vcpu;