[PATCH v2 2/3] x86/xen/time: setup vcpu 0 time info page
From: Joao Martins
Date: Fri Sep 22 2017 - 12:27:00 EST
In order to support pvclock vdso on xen we need to setup the time
info page for vcpu 0 and register the page with Xen using the
VCPUOP_register_vcpu_time_memory_area hypercall. This hypercall
will also forcefully update the pvti which will set some of the
necessary flags for vdso. Afterwards we check if it supports the
PVCLOCK_TSC_STABLE_BIT flag which is mandatory for having
vdso/vsyscall support. And if so, it will set the cpu 0 pvti that
will be later on used when mapping the vdso image.
The xen headers are also updated to include the new hypercall for
registering the secondary vcpu_time_info struct.
Signed-off-by: Joao Martins <joao.m.martins@xxxxxxxxxx>
---
Changes since v1:
* Check flags ahead to see if the primary clock can use
PVCLOCK_TSC_STABLE_BIT even if secondary registration fails.
(Comments from Boris)
* Remove addr, addr variables;
* Change first pr_debug to pr_warn;
* Change last pr_debug to pr_notice;
* Add routine to solely register secondary time info.
* Move xen_clock to outside xen_setup_vsyscall_time_info to allow
restore path to simply re-register secondary time info. Let us
handle the restore path more gracefully without re-allocating a
page.
* Removed cpu argument from xen_setup_vsyscall_time_info()
* Adjustment failed registration error messages/loglevel to be the same
* Also teardown secondary time info on suspend
Changes since RFC:
(Comments from Boris and David)
* Remove Kconfig option
* Use get_zeroed_page/free/page
* Remove the hypercall availability check
* Unregister pvti with arg.addr.v = NULL if stable bit isn't supported.
(New)
* Set secondary copy on restore such that it works on migration.
* Drop global xen_clock variable and stash it locally on
xen_setup_vsyscall_time_info.
* WARN_ON(ret) if we fail to unregister the pvti.
---
arch/x86/xen/suspend.c | 4 ++
arch/x86/xen/time.c | 100 +++++++++++++++++++++++++++++++++++++++++++
arch/x86/xen/xen-ops.h | 2 +
include/xen/interface/vcpu.h | 28 ++++++++++++
4 files changed, 134 insertions(+)
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index d6b1680693a9..800ed36ecfba 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -16,6 +16,8 @@
void xen_arch_pre_suspend(void)
{
+ xen_save_time_memory_area();
+
if (xen_pv_domain())
xen_pv_pre_suspend();
}
@@ -26,6 +28,8 @@ void xen_arch_post_suspend(int cancelled)
xen_pv_post_suspend(cancelled);
else
xen_hvm_post_suspend(cancelled);
+
+ xen_restore_time_memory_area();
}
static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 1ecb05db3632..2924b97691c6 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -370,6 +370,105 @@ static const struct pv_time_ops xen_time_ops __initconst = {
.steal_clock = xen_steal_clock,
};
+static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
+
+void xen_save_time_memory_area(void)
+{
+ struct vcpu_register_time_memory_area t;
+ int ret;
+
+ if (!xen_clock)
+ return;
+
+ t.addr.v = NULL;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+ if (ret != 0)
+ pr_notice("Cannot save secondary vcpu_time_info (err %d)",
+ ret);
+ else
+ clear_page(xen_clock);
+}
+
+void xen_restore_time_memory_area(void)
+{
+ struct vcpu_register_time_memory_area t;
+ int ret;
+
+ if (!xen_clock)
+ return;
+
+ t.addr.v = &xen_clock->pvti;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+
+ /*
+ * We don't disable VCLOCK_PVCLOCK entirely if it fails to register the
+ * secondary time info with Xen or if we migrated to a host without the
+ * necessary flags. On both of these cases what happens is either
+ * process seeing a zeroed out pvti or seeing no PVCLOCK_TSC_STABLE_BIT
+ * bit set. Userspace checks the latter and if 0, it discards the data
+ * in pvti and fallbacks to a system call for a reliable timestamp.
+ */
+ if (ret != 0)
+ pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
+ ret);
+}
+
+static void xen_setup_vsyscall_time_info(void)
+{
+ struct vcpu_register_time_memory_area t;
+ struct pvclock_vsyscall_time_info *ti;
+ struct pvclock_vcpu_time_info *pvti;
+ int ret;
+
+ pvti = &__this_cpu_read(xen_vcpu)->time;
+
+ /*
+ * We check ahead on the primary time info if this
+ * bit is supported hence speeding up Xen clocksource.
+ */
+ if (!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))
+ return;
+
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+ ti = (struct pvclock_vsyscall_time_info *) get_zeroed_page(GFP_KERNEL);
+ if (!ti)
+ return;
+
+ t.addr.v = &ti->pvti;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+ if (ret) {
+ pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret);
+ free_page((unsigned long) ti);
+ return;
+ }
+
+ /*
+ * If the check above succedded this one should too since it's the
+ * same data on both primary and secondary time infos just different
+ * memory regions. But we still check it in case hypervisor is buggy.
+ */
+ pvti = &ti->pvti;
+ if (!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)) {
+ t.addr.v = NULL;
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
+ 0, &t);
+ if (!ret)
+ free_page((unsigned long) ti);
+
+ pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret);
+ return;
+ }
+
+ xen_clock = ti;
+ pvclock_set_pvti_cpu0_va(xen_clock);
+
+ xen_clocksource.archdata.vclock_mode = VCLOCK_PVCLOCK;
+}
+
static void __init xen_time_init(void)
{
int cpu = smp_processor_id();
@@ -396,6 +495,7 @@ static void __init xen_time_init(void)
setup_force_cpu_cap(X86_FEATURE_TSC);
xen_setup_runstate_info(cpu);
+ xen_setup_vsyscall_time_info();
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c8a6d224f7ed..f96dbedb33d4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -69,6 +69,8 @@ void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
u64 xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
+void xen_save_time_memory_area(void);
+void xen_restore_time_memory_area(void);
void __init xen_init_time_ops(void);
void __init xen_hvm_init_time_ops(void);
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
index 98188c87f5c1..8da788c5bd4f 100644
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -178,4 +178,32 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
/* Send an NMI to the specified VCPU. @extra_arg == NULL. */
#define VCPUOP_send_nmi 11
+
+/*
+ * Register a memory location to get a secondary copy of the vcpu time
+ * parameters. The master copy still exists as part of the vcpu shared
+ * memory area, and this secondary copy is updated whenever the master copy
+ * is updated (and using the same versioning scheme for synchronisation).
+ *
+ * The intent is that this copy may be mapped (RO) into userspace so
+ * that usermode can compute system time using the time info and the
+ * tsc. Usermode will see an array of vcpu_time_info structures, one
+ * for each vcpu, and choose the right one by an existing mechanism
+ * which allows it to get the current vcpu number (such as via a
+ * segment limit). It can then apply the normal algorithm to compute
+ * system time from the tsc.
+ *
+ * @extra_arg == pointer to vcpu_register_time_info_memory_area structure.
+ */
+#define VCPUOP_register_vcpu_time_memory_area 13
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_time_info_t);
+struct vcpu_register_time_memory_area {
+ union {
+ GUEST_HANDLE(vcpu_time_info_t) h;
+ struct pvclock_vcpu_time_info *v;
+ uint64_t p;
+ } addr;
+};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_time_memory_area_t);
+
#endif /* __XEN_PUBLIC_VCPU_H__ */
--
2.11.0