[PATCH v1 1/1] kvm/x86: make kvm determine tsc frequency earlier

From: Pavel Tatashin
Date: Fri Dec 08 2017 - 11:25:42 EST


For debugging and performance reasons it is desirable to initialize tsc
earlier, to cover more parts of the boot process.

All hypervisors and platforms but one allow calibrating tsc right after
this call:

setup_arch()
init_hypervisor_platform();

Here is where every hypervisor/platform configures its function to
initialize tsc:

- VMware:
.init.init_platform = vmware_platform_setup,

vmware_platform_setup()
x86_platform.calibrate_tsc = vmware_get_tsc_khz;

- Hyper-V:
.init.init_platform = ms_hyperv_init_platform,

ms_hyperv_init_platform()
x86_platform.calibrate_tsc = hv_get_tsc_khz;

- Xen
.init.init_platform = xen_hvm_guest_init,

xen_hvm_guest_init()
xen_hvm_init_time_ops()
x86_platform.calibrate_tsc = xen_tsc_khz;

There are also some cpus that modify these vectors, but they also do it
before init_hypervisor_platform() call:

1. x86_intel_mid_early_setup() is called before start_kernel(),
sets: x86_platform.calibrate_tsc = intel_mid_calibrate_tsc;

2. x86_init.oem.arch_setup() is also called before
init_hypervisor_platform(). It may set the tsc calibration function
(depending on the arch_setup() function):
x86_platform.calibrate_tsc = intel_mid_calibrate_tsc;
x86_platform.calibrate_tsc = tangier_calibrate_tsc;

The only exception is kvm, that waits until memblock is initialized and
calls kvmclock_init() surrounded by ifdefs from the common code.

This is why tsc_early_delay_calibrate() is called after kvmclock_init().

The reason for this exception is that kvmclock needs a memblock
allocator: the size of the memory that is needed for its clock is rather
large: NR_CPUS * 256 bytes, which currently can be up to 2M.

There is, however, a way to solve this problem by doing the following:
use a page from the __initdata section, which will be discarded after
boot, to temporarily determine the tsc frequency of the boot cpu, and
later, once memblock is available, replace that page with dynamically
allocated memory.

In addition to initializing tsc earlier, another benefit of this patch
is that it also cleans up common code by removing a kvm-specific call
from the arch_setup() common function.

Signed-off-by: Pavel Tatashin <pasha.tatashin@xxxxxxxxxx>
Reviewed-by: Steven Sistare <steven.sistare@xxxxxxxxxx>
Reviewed-by: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx>
---
arch/x86/include/asm/kvm_para.h | 2 +-
arch/x86/kernel/kvm.c | 2 ++
arch/x86/kernel/kvmclock.c | 67 ++++++++++++++++++++++++++++-------------
arch/x86/kernel/setup.c | 6 +---
4 files changed, 50 insertions(+), 27 deletions(-)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b407dda2bd7..6448dd793c82 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -7,7 +7,7 @@
#include <uapi/asm/kvm_para.h>

extern void kvmclock_init(void);
-extern int kvm_register_clock(char *txt);
+extern void kvmclock_init_early(void);

#ifdef CONFIG_KVM_GUEST
bool kvm_check_and_clear_guest_paused(void);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b40ffbf156c1..209a660b1603 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -505,6 +505,7 @@ static void __init kvm_guest_init(void)
if (!kvm_para_available())
return;

+ kvmclock_init();
paravirt_ops_setup();
register_reboot_notifier(&kvm_pv_reboot_nb);
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
@@ -582,6 +583,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.name = "KVM",
.detect = kvm_detect,
.type = X86_HYPER_KVM,
+ .init.init_platform = kvmclock_init_early,
.init.guest_late_init = kvm_guest_init,
.init.x2apic_available = kvm_para_available,
};
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 8b26c9e01cc4..af05a5580ccb 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -48,6 +48,8 @@ early_param("no-kvmclock", parse_no_kvmclock);
static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock *wall_clock;

+static u8 hv_clock_tmp_page[PAGE_SIZE] __aligned(PAGE_SIZE) __initdata = {0};
+
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -112,8 +114,8 @@ static inline void kvm_sched_clock_init(bool stable)
kvm_sched_clock_offset = kvm_clock_read();
pv_time_ops.sched_clock = kvm_sched_clock_read;

- printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
- kvm_sched_clock_offset);
+ pr_info("kvm-clock: using sched offset of %llu cycles\n",
+ kvm_sched_clock_offset);

BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
@@ -181,7 +183,7 @@ struct clocksource kvm_clock = {
};
EXPORT_SYMBOL_GPL(kvm_clock);

-int kvm_register_clock(char *txt)
+static int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high, ret;
@@ -194,8 +196,10 @@ int kvm_register_clock(char *txt)
low = (int)slow_virt_to_phys(src) | 1;
high = ((u64)slow_virt_to_phys(src) >> 32);
ret = native_write_msr_safe(msr_kvm_system_time, low, high);
- printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
- cpu, high, low, txt);
+ if (txt) {
+ pr_info("kvm-clock: cpu %d, msr %x:%x, %s\n",
+ cpu, high, low, txt);
+ }

return ret;
}
@@ -272,6 +276,38 @@ static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size)
memblock_free(addr, size);
}

+/*
+ * Initialize enough functionality to get tsc frequency. This function is
+ * called before memblock allocator, so we are using a page from initdata
+ * section, which will be unmapped after boot.
+ */
+void __init kvmclock_init_early(void)
+{
+ if (!kvm_para_available() || !kvmclock)
+ return;
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+ msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+ msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+ } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ return;
+ }
+
+ hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_tmp_page;
+
+ if (kvm_register_clock(NULL)) {
+ hv_clock = NULL;
+ return;
+ }
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+ x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+ x86_platform.calibrate_cpu = kvm_get_tsc_khz;
+ kvm_get_preset_lpj();
+}
+
void __init kvmclock_init(void)
{
struct pvclock_vcpu_time_info *vcpu_time;
@@ -281,14 +317,12 @@ void __init kvmclock_init(void)

size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);

- if (!kvm_para_available())
+ if (!hv_clock)
return;

- if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
- msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
- msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
- } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
- return;
+ /* Remove and reset early hv clock configs */
+ hv_clock = NULL;
+ native_write_msr(msr_kvm_system_time, 0, 0);

wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock));
mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE);
@@ -316,12 +350,9 @@ void __init kvmclock_init(void)
return;
}

- printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+ pr_info("kvm-clock: Using msrs %x and %x",
msr_kvm_system_time, msr_kvm_wall_clock);

- if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
- pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
-
cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
flags = pvclock_read_flags(vcpu_time);
@@ -329,8 +360,6 @@ void __init kvmclock_init(void)
kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
put_cpu();

- x86_platform.calibrate_tsc = kvm_get_tsc_khz;
- x86_platform.calibrate_cpu = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
@@ -343,7 +372,6 @@ void __init kvmclock_init(void)
#ifdef CONFIG_KEXEC_CORE
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
- kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
}
@@ -354,13 +382,10 @@ int __init kvm_setup_vsyscall_timeinfo(void)
int cpu;
u8 flags;
struct pvclock_vcpu_time_info *vcpu_time;
- unsigned int size;

if (!hv_clock)
return 0;

- size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
-
cpu = get_cpu();

vcpu_time = &hv_clock[cpu].pvti;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8af2e8d0c0a1..c602c624ab63 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1018,6 +1018,7 @@ void __init setup_arch(char **cmdline_p)
* needs to be done after dmi_scan_machine(), for the boot CPU.
*/
init_hypervisor_platform();
+ tsc_early_delay_calibrate();

x86_init.resources.probe_roms();

@@ -1204,11 +1205,6 @@ void __init setup_arch(char **cmdline_p)

memblock_find_dma_reserve();

-#ifdef CONFIG_KVM_GUEST
- kvmclock_init();
-#endif
-
- tsc_early_delay_calibrate();
if (!early_xdbc_setup_hardware())
early_xdbc_register_console();

--
2.15.1