[PATCH 1/2] x86: kvmclock: abolish PVCLOCK_COUNTS_FROM_ZERO

From: Radim KrÄmÃÅ
Date: Fri Sep 18 2015 - 11:58:05 EST


Newer KVM won't be exposing PVCLOCK_COUNTS_FROM_ZERO anymore.
The purpose of that flags was to start counting system time from 0 when
the KVM clock has been initialized.
We can achieve the same by selecting one read as the initial point.

A simple subtraction will work unless the KVM clock count overflows
earlier (has smaller width) than scheduler's cycle count. We should be
safe till x86_128.

Because PVCLOCK_COUNTS_FROM_ZERO was enabled only on new hypervisors,
setting sched clock as stable based on PVCLOCK_TSC_STABLE_BIT might
regress on older ones.

I presume we don't need to change kvm_clock_read instead of introducing
kvm_sched_clock_read. A problem could arise in case sched_clock is
expected to return the same value as get_cycles, but we should have
merged those clocks in that case.

Signed-off-by: Radim KrÄmÃÅ <rkrcmar@xxxxxxxxxx>
---
arch/x86/kernel/kvmclock.c | 46 +++++++++++++++++++++++++++++++++++-----------
1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2c7aafa70702..ef5b3d2cecce 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -32,6 +32,7 @@
static int kvmclock = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+static cycle_t kvm_sched_clock_offset;

static int parse_no_kvmclock(char *arg)
{
@@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
return kvm_clock_read();
}

+static cycle_t kvm_sched_clock_read(void)
+{
+ return kvm_clock_read() - kvm_sched_clock_offset;
+}
+
+static inline void kvm_sched_clock_init(bool stable)
+{
+ if (!stable) {
+ pv_time_ops.sched_clock = kvm_clock_read;
+ return;
+ }
+
+ kvm_sched_clock_offset = kvm_clock_read();
+ pv_time_ops.sched_clock = kvm_sched_clock_read;
+ set_sched_clock_stable();
+
+ printk("kvm-clock: using sched offset of %llu cycles\n",
+ kvm_sched_clock_offset);
+
+ BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+ sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
/*
* If we don't do that, there is the possibility that the guest
* will calibrate under heavy load - thus, getting a lower lpj -
@@ -248,7 +272,17 @@ void __init kvmclock_init(void)
memblock_free(mem, size);
return;
}
- pv_time_ops.sched_clock = kvm_clock_read;
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+ cpu = get_cpu();
+ vcpu_time = &hv_clock[cpu].pvti;
+ flags = pvclock_read_flags(vcpu_time);
+
+ kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+ put_cpu();
+
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
@@ -265,16 +299,6 @@ void __init kvmclock_init(void)
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
-
- if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
- pvclock_set_flags(~0);
-
- cpu = get_cpu();
- vcpu_time = &hv_clock[cpu].pvti;
- flags = pvclock_read_flags(vcpu_time);
- if (flags & PVCLOCK_COUNTS_FROM_ZERO)
- set_sched_clock_stable();
- put_cpu();
}

int __init kvm_setup_vsyscall_timeinfo(void)
--
2.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/