[RFC PATCH v3 10/10] kernel/time: Add /dev/vmclock_host miscdev

From: David Woodhouse

Date: Wed May 20 2026 - 10:07:29 EST


From: David Woodhouse <dwmw@xxxxxxxxxxxx>

Expose the host's NTP-disciplined clock as a vmclock_abi page via
/dev/vmclock_host. A VMM can mmap or poll() this device to obtain
precision time parameters for relaying to guests.

The page is updated via the pvclock_gtod notifier chain when the NTP
frequency or skew rate changes. The period computation is redone on
frequency changes; time tuple updates are cheap. The phase offset
(time_offset) is computed under tk_core.lock in
timekeeping_set_reference(), matching the locking used by
do_adjtimex/hardpps.

Fields populated:
- counter_id: X86_TSC (or ARM_VCNT)
- time_type: TAI (if tai_offset known) or UTC
- counter_value: TSC at reference point
- time_sec/time_frac_sec: time at reference point
- counter_period_frac_sec: NTP-disciplined TSC period
- tai_offset_sec: current UTC-TAI offset (if known)
- clock_status: SYNCHRONIZED / FREERUNNING / UNKNOWN
- leap_indicator: from NTP time_state

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
Assisted-by: Kiro:claude-opus-4.6-1m
---
include/linux/timekeeper_internal.h | 2 +
kernel/time/Kconfig | 7 +
kernel/time/Makefile | 1 +
kernel/time/ntp.c | 19 +
kernel/time/ntp_internal.h | 5 +
kernel/time/timekeeping.c | 2 +
kernel/time/vmclock_host.c | 391 ++++++++++++++++++
.../selftests/timers/vmclock_host_test.c | 171 ++++++++
8 files changed, 598 insertions(+)
create mode 100644 kernel/time/vmclock_host.c
create mode 100644 tools/testing/selftests/timers/vmclock_host_test.c

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 9de6b5b94dc0..c3d6f17e0623 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -185,6 +185,8 @@ struct timekeeper {
u32 ntp_err_mult;
u32 skip_second_overflow;
s64 skew_delta;
+ int ntp_status;
+ int ntp_time_state;
s32 tai_offset;
};

diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 02aac7c5aa76..f0cddfec5751 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -221,4 +221,11 @@ config POSIX_AUX_CLOCKS
and other clock domains, which are not correlated to the TAI/NTP
notion of time.

+config VMCLOCK_HOST
+ tristate "VMClock host time provider (/dev/vmclock_host)"
+ depends on X86_TSC || ARM64
+ help
+ Expose the host NTP-disciplined clock as a vmclock page via
+ /dev/vmclock_host for VMMs to relay precision time to guests.
+
endmenu
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index eaf290c972f9..549070254e3a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -33,3 +33,4 @@ obj-$(CONFIG_TIME_NS) += namespace.o
obj-$(CONFIG_TIME_NS_VDSO) += namespace_vdso.o
obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o
obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o
+obj-$(CONFIG_VMCLOCK_HOST) += vmclock_host.o
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 3dc098695665..2866d4208117 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -365,6 +365,13 @@ s64 ntp_get_skew_delta(unsigned int tkid)
return tk_ntp_data[tkid].skew_delta;
}

+/*
+ * Return the time_offset of timekeeper @tkid multiplied by
+ * NTP_INTERVAL_FREQ and scaled down by NTP_SCALE_SHIFT.
+ */
+s64 ntp_get_time_offset_ns(unsigned int tkid)
+{
+	const struct ntp_data *ntpdata = &tk_ntp_data[tkid];
+
+	return shift_right(ntpdata->time_offset * NTP_INTERVAL_FREQ,
+			   NTP_SCALE_SHIFT);
+}
+EXPORT_SYMBOL_GPL(ntp_get_time_offset_ns);
+
s64 ntp_drain_time_offset(unsigned int tkid, s64 amount)
{
struct ntp_data *ntpdata = &tk_ntp_data[tkid];
@@ -669,6 +676,18 @@ static inline bool ntp_synced(void)
return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC);
}

+/* Accessor for the core timekeeper's NTP time_status word. */
+int ntp_get_status(void)
+{
+	const struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
+
+	return ntpdata->time_status;
+}
+EXPORT_SYMBOL_GPL(ntp_get_status);
+
+/* Accessor for the core timekeeper's NTP time_state value. */
+int ntp_get_time_state(void)
+{
+	const struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
+
+	return ntpdata->time_state;
+}
+EXPORT_SYMBOL_GPL(ntp_get_time_state);
+
/*
* If we have an externally synchronized Linux clock, then update RTC clock
* accordingly every ~11 minutes. Generally RTCs can only store second
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 14ca8bc08120..ba1d14bbcf0e 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -2,13 +2,18 @@
#ifndef _LINUX_NTP_INTERNAL_H
#define _LINUX_NTP_INTERNAL_H

+struct audit_ntp_data;
+
extern void ntp_init(void);
+extern int ntp_get_status(void);
+extern int ntp_get_time_state(void);
extern void ntp_clear(unsigned int tkid);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
extern u64 ntp_tick_length(unsigned int tkid);
extern s64 ntp_get_skew_delta(unsigned int tkid);
extern s64 ntp_drain_time_offset(unsigned int tkid, s64 amount);
extern void ntp_set_time_offset(unsigned int tkid, s64 offset_ns);
+extern s64 ntp_get_time_offset_ns(unsigned int tkid);
extern void ntp_set_tick_length(unsigned int tkid, u64 tick_length);
extern ktime_t ntp_get_next_leap(unsigned int tkid);
extern int second_overflow(unsigned int tkid, time64_t secs);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5c4b377505bc..b93fab0890df 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2380,6 +2380,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
return;
tk->ntp_tick = ntp_tl;
tk->skew_delta = skew;
+ tk->ntp_status = ntp_get_status();
+ tk->ntp_time_state = ntp_get_time_state();
mult = div64_u64((tk->ntp_tick + skew) >> tk->ntp_error_shift,
tk->cycle_interval);
}
diff --git a/kernel/time/vmclock_host.c b/kernel/time/vmclock_host.c
new file mode 100644
index 000000000000..d43f2b043fb9
--- /dev/null
+++ b/kernel/time/vmclock_host.c
@@ -0,0 +1,391 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * /dev/vmclock_host - Expose host NTP-disciplined time as a vmclock page.
+ *
+ * This provides a vmclock_abi structure populated from the host's
+ * CLOCK_REALTIME (TAI), allowing a VMM to efficiently relay precision
+ * time to guests without per-tick overhead.
+ *
+ * The page is updated only when the NTP frequency (ntp_tick) or the
+ * skew rate (skew_delta) changes, when the clock is set, or when the
+ * clocksource changes — not on every timekeeping tick.
+ * Userspace can poll() for changes.
+ *
+ * Copyright © 2026 Amazon.com, Inc. or its affiliates.
+ */
+
+#include <linux/clocksource_ids.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/timex.h>
+#include "ntp_internal.h"
+#include <linux/timekeeping.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/wait.h>
+
+#include <uapi/linux/vmclock-abi.h>
+
+
+static struct vmclock_abi *vmclock_page;
+static DECLARE_WAIT_QUEUE_HEAD(vmclock_wait);
+static u64 cached_ntp_tick;
+static s64 cached_skew_delta;
+static u64 cached_period_frac;
+static u8 cached_period_shift;
+
+/*
+ * Compute counter_period_frac_sec from ntp_tick and cycle_interval.
+ *
+ * ntp_tick is ns_per_tick << 32.
+ * cycle_interval is counter cycles per tick.
+ *
+ * vmclock wants: period = frac_sec / 2^(64 + shift) in seconds, so
+ *
+ *   period   = ntp_tick / (cycle_interval * 10^9 * 2^32) seconds/cycle
+ *   frac_sec = ntp_tick * 2^(32 + shift) / (cycle_interval * 10^9)
+ *
+ * The quotient is built up to 64 significant bits by long division:
+ * one initial div64_u64_rem() followed by repeated divisions of the
+ * shifted remainder, at most 32 bits at a time.
+ *
+ * @tk:           timekeeper supplying ntp_tick and cycle_interval
+ * @period_frac:  receives the 64-bit period mantissa
+ * @period_shift: receives the matching shift
+ *
+ * If ntp_tick or cycle_interval is zero, or cycle_interval * 10^9 does
+ * not fit in a u64, both outputs are set to zero.
+ */
+static void vmclock_compute_period(struct timekeeper *tk,
+				   u64 *period_frac, u8 *period_shift)
+{
+	u64 ntp_tick = tk->ntp_tick;
+	u64 cycle_interval = tk->cycle_interval;
+	u64 divisor, rem, result;
+	int headroom, bits_so_far;
+
+	/*
+	 * __builtin_clzll(0) is undefined, a zero divisor would fault, and
+	 * a wrapped divisor would yield a silently wrong period.
+	 */
+	if (!ntp_tick || !cycle_interval ||
+	    cycle_interval > ~0ULL / 1000000000ULL) {
+		*period_frac = 0;
+		*period_shift = 0;
+		return;
+	}
+
+	divisor = cycle_interval * 1000000000ULL;
+	headroom = __builtin_clzll(ntp_tick);
+
+	/* First division: (ntp_tick << headroom) / divisor */
+	result = div64_u64_rem(ntp_tick << headroom, divisor, &rem);
+	bits_so_far = 64 - __builtin_clzll(result ?: 1);
+
+	/* Fill remaining bits, up to 32 at a time, from the remainder */
+	while (bits_so_far < 64 && rem) {
+		int chunk = min(32, 64 - bits_so_far);
+		int rem_headroom = __builtin_clzll(rem);
+		u64 extra;
+
+		/* Never shift significant bits out of the remainder */
+		if (rem_headroom < chunk)
+			chunk = rem_headroom;
+
+		/*
+		 * No room left to shift: stop refining rather than loop
+		 * forever without making progress.
+		 */
+		if (!chunk)
+			break;
+
+		/* rem < divisor, so extra < 2^chunk and the OR is safe */
+		extra = div64_u64_rem(rem << chunk, divisor, &rem);
+		result = (result << chunk) | extra;
+		bits_so_far += chunk;
+		headroom += chunk;
+	}
+
+	/* Pad with zeros if the remainder ran out (or could not be used) */
+	if (bits_so_far < 64) {
+		result <<= (64 - bits_so_far);
+		headroom += (64 - bits_so_far);
+	}
+
+	/*
+	 * result = ntp_tick * 2^headroom / divisor
+	 *        = period_seconds * 2^(32 + headroom)
+	 *
+	 * vmclock: frac_sec / 2^(64 + shift) = period_seconds
+	 * So: shift = 32 + headroom - 64 = headroom - 32
+	 */
+	*period_frac = result;
+	*period_shift = (u8)(headroom - 32);
+}
+
+
+/*
+ * Translate the timekeeper's clocksource id into a vmclock counter id.
+ * Anything other than the x86 TSC (on x86 builds) or the ARM arch
+ * counter (on arm64 builds) maps to VMCLOCK_COUNTER_INVALID.
+ */
+static u8 vmclock_counter_id(struct timekeeper *tk)
+{
+	switch (tk->cs_id) {
+	case CSID_X86_TSC:
+		if (IS_ENABLED(CONFIG_X86))
+			return VMCLOCK_COUNTER_X86_TSC;
+		break;
+	case CSID_ARM_ARCH_COUNTER:
+		if (IS_ENABLED(CONFIG_ARM64))
+			return VMCLOCK_COUNTER_ARM_VCNT;
+		break;
+	default:
+		break;
+	}
+
+	return VMCLOCK_COUNTER_INVALID;
+}
+
+/*
+ * Open a page update: bump seq_count, then order that store before the
+ * data stores that follow.
+ */
+static void vmclock_seq_begin(struct vmclock_abi *clk)
+{
+	WRITE_ONCE(clk->seq_count, cpu_to_le32(
+			le32_to_cpu(READ_ONCE(clk->seq_count)) + 1));
+	smp_wmb();
+}
+
+/*
+ * Close a page update: order the data stores before the final
+ * seq_count bump.
+ */
+static void vmclock_seq_end(struct vmclock_abi *clk)
+{
+	smp_wmb();
+	WRITE_ONCE(clk->seq_count, cpu_to_le32(
+			le32_to_cpu(READ_ONCE(clk->seq_count)) + 1));
+}
+
+/*
+ * Called from pvclock_gtod_notify on every timekeeping update.
+ * Only does real work when ntp_tick or skew_delta changes, when the
+ * clock was set (@was_set), or while the page status is still UNKNOWN.
+ *
+ * @nb:      notifier block (unused)
+ * @was_set: non-zero forces a refresh of the page
+ * @data:    the struct timekeeper being published
+ *
+ * Always returns NOTIFY_DONE.
+ */
+static int vmclock_host_notify(struct notifier_block *nb,
+			       unsigned long was_set, void *data)
+{
+	struct timekeeper *tk = data;
+	struct vmclock_abi *clk = vmclock_page;
+	bool period_changed = false;
+	u8 counter_id;
+	s64 ns, sec;
+	u64 hi, rem, counter_value, time_frac;
+	__le64 le_time_sec, le_time_frac, le_counter_value;
+	__le64 le_period_frac;
+	u8 period_shift, clock_status;
+
+	if (!clk)
+		return NOTIFY_DONE;
+
+	/* Early exit if nothing relevant changed */
+	if (clk->clock_status != VMCLOCK_STATUS_UNKNOWN &&
+	    tk->ntp_tick == cached_ntp_tick &&
+	    tk->skew_delta == cached_skew_delta && !was_set)
+		return NOTIFY_DONE;
+
+	counter_id = vmclock_counter_id(tk);
+	if (counter_id == VMCLOCK_COUNTER_INVALID) {
+		/*
+		 * Status UNKNOWN defeats the early exit above, so this path
+		 * runs on every tick while the clocksource is unusable.
+		 * Invalidate once; do not keep churning seq_count.
+		 */
+		if (clk->counter_id == VMCLOCK_COUNTER_INVALID &&
+		    clk->clock_status == VMCLOCK_STATUS_UNKNOWN)
+			return NOTIFY_DONE;
+
+		/* Invalidate the page if clocksource isn't usable */
+		vmclock_seq_begin(clk);
+		clk->counter_id = VMCLOCK_COUNTER_INVALID;
+		clk->clock_status = VMCLOCK_STATUS_UNKNOWN;
+		vmclock_seq_end(clk);
+
+		/*
+		 * The period depends on cycle_interval, which belongs to
+		 * the clocksource; force a recompute once a usable one
+		 * comes back.
+		 */
+		cached_ntp_tick = 0;
+
+		/* Pollers need to learn about the invalidation too */
+		wake_up_interruptible(&vmclock_wait);
+		return NOTIFY_DONE;
+	}
+
+	/* Recompute period only when frequency changes */
+	if (tk->ntp_tick != cached_ntp_tick) {
+		vmclock_compute_period(tk, &cached_period_frac,
+				       &cached_period_shift);
+		cached_ntp_tick = tk->ntp_tick;
+		period_changed = true;
+	}
+	cached_skew_delta = tk->skew_delta;
+
+	/* Compute time tuple: C = A + ntp_error + time_offset */
+	ns = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+	sec = tk->xtime_sec + tk->tai_offset;
+	ns += tk->ntp_error >> (tk->tkr_mono.shift + tk->ntp_error_shift);
+	ns += ntp_get_time_offset_ns(tk->id);
+
+	/* Normalise ns into [0, NSEC_PER_SEC) */
+	while (ns < 0) {
+		ns += NSEC_PER_SEC;
+		sec--;
+	}
+	while (ns >= NSEC_PER_SEC) {
+		ns -= NSEC_PER_SEC;
+		sec++;
+	}
+
+	counter_value = tk->tkr_mono.cycle_last;
+
+	/* ns / 10^9 as a 0.64 fixed-point fraction, 32 bits per division */
+	hi = div64_u64_rem((u64)ns << 32, 1000000000ULL, &rem);
+	time_frac = (hi << 32) | div64_u64(rem << 32, 1000000000ULL);
+
+	clock_status = !(ntp_get_status() & STA_UNSYNC) ?
+		VMCLOCK_STATUS_SYNCHRONIZED : VMCLOCK_STATUS_FREERUNNING;
+
+	/* Prepare le values up front to keep the write section short */
+	le_counter_value = cpu_to_le64(counter_value);
+	le_time_sec = cpu_to_le64(sec);
+	le_time_frac = cpu_to_le64(time_frac);
+	le_period_frac = cpu_to_le64(cached_period_frac);
+	period_shift = cached_period_shift;
+
+	/* Update page under seqcount */
+	vmclock_seq_begin(clk);
+
+	clk->counter_id = counter_id;
+	clk->counter_value = le_counter_value;
+	clk->time_sec = le_time_sec;
+	clk->time_frac_sec = le_time_frac;
+	if (period_changed) {
+		clk->counter_period_frac_sec = le_period_frac;
+		clk->counter_period_shift = period_shift;
+	}
+	clk->clock_status = clock_status;
+
+	/* Set leap second indicator from NTP time_state */
+	switch (ntp_get_time_state()) {
+	case TIME_INS:
+		clk->leap_indicator = VMCLOCK_LEAP_PRE_POS;
+		break;
+	case TIME_DEL:
+		clk->leap_indicator = VMCLOCK_LEAP_PRE_NEG;
+		break;
+	case TIME_OOP:
+		clk->leap_indicator = VMCLOCK_LEAP_POS;
+		break;
+	case TIME_WAIT:
+		clk->leap_indicator = (ntp_get_status() & STA_DEL) ?
+			VMCLOCK_LEAP_POST_NEG : VMCLOCK_LEAP_POST_POS;
+		break;
+	default:
+		clk->leap_indicator = VMCLOCK_LEAP_NONE;
+		break;
+	}
+
+	/* Export as TAI if tai_offset is known, otherwise UTC */
+	if (tk->tai_offset) {
+		clk->time_type = VMCLOCK_TIME_TAI;
+		clk->tai_offset_sec = cpu_to_le16((s16)tk->tai_offset);
+		clk->flags = cpu_to_le64(VMCLOCK_FLAG_TAI_OFFSET_VALID |
+					 VMCLOCK_FLAG_TIME_MONOTONIC |
+					 VMCLOCK_FLAG_NOTIFICATION_PRESENT);
+	} else {
+		clk->time_type = VMCLOCK_TIME_UTC;
+		clk->tai_offset_sec = 0;
+		clk->flags = cpu_to_le64(VMCLOCK_FLAG_TIME_MONOTONIC |
+					 VMCLOCK_FLAG_NOTIFICATION_PRESENT);
+	}
+
+	vmclock_seq_end(clk);
+
+	wake_up_interruptible(&vmclock_wait);
+	return NOTIFY_DONE;
+}
+
+/* File operations */
+
+/*
+ * Per-open state. last_seq holds the seq_count value observed by this
+ * file's most recent read(); poll() reports readable while the page's
+ * seq_count differs from it.
+ */
+struct vmclock_host_file {
+ u32 last_seq;
+};
+
+/*
+ * open(): allocate zero-initialised per-file state and attach it to
+ * the file. Returns 0, or -ENOMEM if the allocation fails.
+ */
+static int vmclock_host_open(struct inode *inode, struct file *fp)
+{
+	struct vmclock_host_file *state = kzalloc(sizeof(*state), GFP_KERNEL);
+
+	if (state == NULL)
+		return -ENOMEM;
+
+	fp->private_data = state;
+	return 0;
+}
+
+/* release(): free the per-file state allocated in open(). */
+static int vmclock_host_release(struct inode *inode, struct file *fp)
+{
+	struct vmclock_host_file *state = fp->private_data;
+
+	kfree(state);
+	return 0;
+}
+
+/*
+ * mmap(): map the single vmclock page read-only into userspace.
+ *
+ * Returns -EROFS unless the mapping is readable and not writable,
+ * -EINVAL unless it covers exactly one page at offset 0, otherwise the
+ * result of remap_pfn_range().
+ */
+static int vmclock_host_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	if ((vma->vm_flags & (VM_READ | VM_WRITE)) != VM_READ)
+		return -EROFS;
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
+		return -EINVAL;
+
+	/*
+	 * The page is shared by every user of the device. Drop
+	 * VM_MAYWRITE so a mapping created through a writable fd cannot
+	 * later be made writable with mprotect().
+	 */
+	vm_flags_clear(vma, VM_MAYWRITE);
+
+	return remap_pfn_range(vma, vma->vm_start,
+			       virt_to_phys(vmclock_page) >> PAGE_SHIFT,
+			       PAGE_SIZE, vma->vm_page_prot);
+}
+
+/*
+ * read(): copy a consistent snapshot of the vmclock page to userspace.
+ *
+ * At most PAGE_SIZE - *ppos bytes are copied; a read at or beyond
+ * PAGE_SIZE returns 0. The copy is retried until seq_count is even and
+ * unchanged across it. On success the observed seq_count is remembered
+ * in the per-file state (used by poll()) and *ppos is advanced.
+ *
+ * Returns the number of bytes copied, or -EFAULT if the user buffer is
+ * not writable.
+ */
+static ssize_t vmclock_host_read(struct file *fp, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	struct vmclock_host_file *fst = fp->private_data;
+	u32 seq;
+
+	if (*ppos >= PAGE_SIZE)
+		return 0;
+	if (count > PAGE_SIZE - *ppos)
+		count = PAGE_SIZE - *ppos;
+
+	/*
+	 * Deliberately not a do/while: "continue" in a do/while jumps to
+	 * the loop condition, which would end the loop on an odd but
+	 * unchanged seq_count without having copied anything.
+	 */
+	for (;;) {
+		seq = le32_to_cpu(READ_ONCE(vmclock_page->seq_count));
+		if (seq & 1) {
+			/* Update in progress; wait for the writer */
+			cpu_relax();
+			continue;
+		}
+		smp_rmb();
+		if (copy_to_user(buf, (char *)vmclock_page + *ppos, count))
+			return -EFAULT;
+		smp_rmb();
+		if (le32_to_cpu(READ_ONCE(vmclock_page->seq_count)) == seq)
+			break;
+	}
+
+	fst->last_seq = seq;
+	*ppos += count;
+	return count;
+}
+
+/*
+ * poll(): report the file readable when the page's seq_count differs
+ * from the value recorded by this file's last read().
+ */
+static __poll_t vmclock_host_poll(struct file *fp, poll_table *wait)
+{
+	struct vmclock_host_file *fst = fp->private_data;
+	__poll_t mask = 0;
+	u32 cur;
+
+	poll_wait(fp, &vmclock_wait, wait);
+
+	cur = le32_to_cpu(READ_ONCE(vmclock_page->seq_count));
+	if (cur != fst->last_seq)
+		mask = EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
+/* File operations for the device: open/release, mmap, read and poll. */
+static const struct file_operations vmclock_host_fops = {
+ .owner = THIS_MODULE,
+ .open = vmclock_host_open,
+ .release = vmclock_host_release,
+ .mmap = vmclock_host_mmap,
+ .read = vmclock_host_read,
+ .poll = vmclock_host_poll,
+};
+
+/* Misc device named "vmclock_host" with a dynamically assigned minor. */
+static struct miscdevice vmclock_host_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "vmclock_host",
+ .fops = &vmclock_host_fops,
+};
+
+/* Notifier block routing pvclock_gtod notifications to vmclock_host_notify(). */
+static struct notifier_block vmclock_host_nb = {
+ .notifier_call = vmclock_host_notify,
+};
+
+/*
+ * Module init: allocate and pre-fill the vmclock page, hook into the
+ * pvclock_gtod notifier chain, then publish the misc device.
+ *
+ * The notifier is registered before the device node so that the page
+ * can be populated before userspace is able to open or map it.
+ *
+ * Returns 0 on success or a negative errno; on failure everything set
+ * up so far is torn down again.
+ */
+static int __init vmclock_host_init(void)
+{
+	int ret;
+
+	vmclock_page = (struct vmclock_abi *)get_zeroed_page(GFP_KERNEL);
+	if (!vmclock_page)
+		return -ENOMEM;
+
+	/* Set constant fields */
+	vmclock_page->magic = cpu_to_le32(VMCLOCK_MAGIC);
+	vmclock_page->size = cpu_to_le32(PAGE_SIZE);
+	vmclock_page->version = cpu_to_le16(1);
+
+	/*
+	 * Do not rely on the zero fill to mean "no usable counter": mark
+	 * the page invalid explicitly until the notifier has filled it.
+	 */
+	vmclock_page->counter_id = VMCLOCK_COUNTER_INVALID;
+	vmclock_page->clock_status = VMCLOCK_STATUS_UNKNOWN;
+
+	ret = pvclock_gtod_register_notifier(&vmclock_host_nb);
+	if (ret)
+		goto err_free_page;
+
+	ret = misc_register(&vmclock_host_miscdev);
+	if (ret)
+		goto err_unregister;
+
+	pr_info("vmclock_host: registered /dev/vmclock_host\n");
+	return 0;
+
+err_unregister:
+	pvclock_gtod_unregister_notifier(&vmclock_host_nb);
+err_free_page:
+	free_page((unsigned long)vmclock_page);
+	vmclock_page = NULL;
+	return ret;
+}
+
+/*
+ * Module exit: undo vmclock_host_init(). The notifier is removed first
+ * so vmclock_host_notify() is no longer invoked, then the device is
+ * deregistered, and only then is the page freed.
+ */
+static void __exit vmclock_host_exit(void)
+{
+ pvclock_gtod_unregister_notifier(&vmclock_host_nb);
+ misc_deregister(&vmclock_host_miscdev);
+ free_page((unsigned long)vmclock_page);
+ vmclock_page = NULL;
+}
+
+module_init(vmclock_host_init);
+module_exit(vmclock_host_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David Woodhouse <dwmw@xxxxxxxxxxxx>");
+MODULE_DESCRIPTION("VMClock host time provider");
diff --git a/tools/testing/selftests/timers/vmclock_host_test.c b/tools/testing/selftests/timers/vmclock_host_test.c
new file mode 100644
index 000000000000..c83cc7e6d404
--- /dev/null
+++ b/tools/testing/selftests/timers/vmclock_host_test.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test /dev/vmclock_host by comparing its time against CLOCK_TAI.
+ *
+ * Maps the vmclock page, reads time from it using the ABI formula,
+ * and compares with clock_gettime(CLOCK_TAI) using ABA timestamps
+ * to bound the uncertainty.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <linux/vmclock-abi.h>
+
+/*
+ * read_counter() - read the raw hardware counter for this architecture.
+ * Only x86-64 and AArch64 are supported; anything else fails the build.
+ */
+#ifdef __x86_64__
+/* x86-64: RDTSC returns the counter split across EAX (low) and EDX (high). */
+static inline uint64_t read_counter(void)
+{
+ unsigned int lo, hi;
+ asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
+ return ((uint64_t)hi << 32) | lo;
+}
+#elif defined(__aarch64__)
+/* AArch64: read the cntvct_el0 system register. */
+static inline uint64_t read_counter(void)
+{
+ uint64_t val;
+ asm volatile("mrs %0, cntvct_el0" : "=r"(val));
+ return val;
+}
+#else
+#error "Unsupported architecture"
+#endif
+
+/*
+ * Evaluate the vmclock formula at @counter_now:
+ *
+ *   T = time_sec + time_frac_sec / 2^64 +
+ *       ((counter_now - counter_value) * counter_period_frac_sec)
+ *           >> (64 + counter_period_shift)
+ *
+ * Fields are used exactly as stored in the page (no byte swapping).
+ *
+ * Returns nanoseconds since epoch.
+ */
+static int64_t vmclock_read_ns(const volatile struct vmclock_abi *clk,
+			       uint64_t counter_now)
+{
+	const uint64_t elapsed = counter_now - clk->counter_value;
+	const uint64_t period = clk->counter_period_frac_sec;
+	const uint8_t shift = clk->counter_period_shift;
+	__uint128_t acc;
+	uint64_t carry_sec, frac, nsec;
+
+	/* elapsed * period is seconds in 0.(64+shift) fixed point */
+	acc = ((__uint128_t)elapsed * period) >> shift;
+
+	/* Now 64.64 fixed point seconds; fold in the reference fraction */
+	acc += clk->time_frac_sec;
+
+	/* High half: whole seconds; low half: fraction of a second */
+	carry_sec = (uint64_t)(acc >> 64);
+	frac = (uint64_t)(acc & 0xFFFFFFFFFFFFFFFFULL);
+	nsec = (uint64_t)(((__uint128_t)frac * 1000000000ULL) >> 64);
+
+	return (int64_t)(clk->time_sec + carry_sec) * 1000000000LL + nsec;
+}
+
+/*
+ * Return CLOCK_TAI in nanoseconds.
+ *
+ * Exits the test with status 1 if clock_gettime() fails, rather than
+ * computing a result from an uninitialised timespec.
+ */
+static int64_t clock_tai_ns(void)
+{
+	struct timespec ts;
+
+	if (clock_gettime(CLOCK_TAI, &ts) != 0) {
+		perror("clock_gettime(CLOCK_TAI)");
+		exit(1);
+	}
+
+	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+/*
+ * Architecture-specific spin-wait hint. "pause" exists only on x86;
+ * AArch64 uses "yield".
+ */
+static inline void spin_hint(void)
+{
+#if defined(__x86_64__)
+	__asm__ volatile("pause" ::: "memory");
+#elif defined(__aarch64__)
+	__asm__ volatile("yield" ::: "memory");
+#endif
+}
+
+/*
+ * Map /dev/vmclock_host and take ten samples, each bracketing a counter
+ * read between two CLOCK_TAI reads. A sample passes when the vmclock
+ * time lies within that window, with 2000ns of slack on either side.
+ *
+ * Exit status: 0 on pass, 1 on failure, 4 when the device is missing
+ * or reports an invalid counter (skip).
+ */
+int main(void)
+{
+	int fd, ret = 0;
+	volatile struct vmclock_abi *clk;
+	int i, failures = 0;
+
+	fd = open("/dev/vmclock_host", O_RDONLY);
+	if (fd < 0) {
+		if (errno == ENOENT) {
+			printf("SKIP: /dev/vmclock_host not available\n");
+			return 4;
+		}
+		perror("open /dev/vmclock_host");
+		return 1;
+	}
+
+	clk = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
+	if (clk == MAP_FAILED) {
+		perror("mmap");
+		close(fd);
+		return 1;
+	}
+
+	if (clk->magic != VMCLOCK_MAGIC) {
+		fprintf(stderr, "Bad magic: 0x%x\n", clk->magic);
+		ret = 1;
+		goto out;
+	}
+
+	if (clk->counter_id == VMCLOCK_COUNTER_INVALID) {
+		printf("SKIP: counter_id is INVALID (clocksource not TSC?)\n");
+		ret = 4;
+		goto out;
+	}
+
+	printf("vmclock_host: version=%u counter_id=%u time_type=%u status=%u\n",
+	       clk->version, clk->counter_id, clk->time_type, clk->clock_status);
+	printf(" tai_offset=%d\n", (int16_t)clk->tai_offset_sec);
+	printf(" counter_period_frac_sec=0x%" PRIx64 " shift=%u\n",
+	       (uint64_t)clk->counter_period_frac_sec, clk->counter_period_shift);
+
+	/* ABA comparison: read CLOCK_TAI, vmclock, CLOCK_TAI */
+	printf("\nABA comparison (vmclock vs CLOCK_TAI):\n");
+	for (i = 0; i < 10; i++) {
+		uint32_t seq;
+		int64_t tai_before, tai_after, vmclock_ns;
+		int64_t delta, window;
+
+		/* Read with seqcount retry */
+		for (;;) {
+			uint64_t ctr;
+
+			seq = clk->seq_count;
+			if (seq & 1) {
+				/* Writer active; wait for an even count */
+				spin_hint();
+				continue;
+			}
+			/* Order the seq_count load before the data loads */
+			__atomic_thread_fence(__ATOMIC_ACQUIRE);
+
+			tai_before = clock_tai_ns();
+			ctr = read_counter();
+			tai_after = clock_tai_ns();
+
+			/*
+			 * The page fields must be read inside the seqcount
+			 * section, i.e. before the recheck below, or a
+			 * concurrent update could go unnoticed.
+			 */
+			vmclock_ns = vmclock_read_ns(clk, ctr);
+
+			/* Order the data loads before the recheck */
+			__atomic_thread_fence(__ATOMIC_ACQUIRE);
+			if (clk->seq_count == seq)
+				break;
+		}
+
+		window = tai_after - tai_before;
+		/* vmclock should be between tai_before and tai_after */
+		delta = vmclock_ns - tai_before;
+
+		printf(" [%d] vmclock-tai_before=%+" PRId64 "ns window=%"
+		       PRId64 "ns", i, delta, window);
+
+		if (delta < -2000 || delta > window + 2000) {
+			printf(" FAIL (out of range)\n");
+			failures++;
+		} else {
+			printf(" OK\n");
+		}
+
+		usleep(100000); /* 100ms between samples */
+	}
+
+	if (failures) {
+		printf("\nFAIL: %d/%d samples out of range\n", failures, 10);
+		ret = 1;
+	} else {
+		printf("\nPASS: all samples within ABA window\n");
+	}
+
+out:
+	munmap((void *)clk, 4096);
+	close(fd);
+	return ret;
+}
--
2.54.0