[PATCH v4.15.7 1/1] x86/vdso: handle clock_gettime(CLOCK_MONOTONIC_RAW, &ts) in VDSO

From: Jason Vas Dias
Date: Tue Mar 06 2018 - 18:57:02 EST


Handling clock_gettime( CLOCK_MONOTONIC_RAW, &timespec)
by calling vdso_fallback_gettime(), ie. syscall, is too slow -
latencies of 300-700ns are common on Haswell (06:3C) CPUs .

This patch against the 4.15.7 stable branch makes the VDSO handle
clock_gettime(CLOCK_GETTIME_RAW, &ts)
by issuing rdtscp in userspace, IFF the clock source is the TSC, and converting
it to nanoseconds using the vsyscall_gtod_data 'mult' and 'shift' fields :

volatile u32 tsc_lo, tsc_hi, tsc_cpu;
asm volatile( "rdtscp" : (=a) tsc_lo, (=d) tsc_hi, (=c) tsc_cpu );
u64 tsc = (((u64)tsc_hi)<<32) | ((u64)tsc_lo);
tsc *= gtod->mult;
tsc >>=gtod->shift;
/* tsc is now number of nanoseconds */
ts->tv_sec = __iter_div_u64_rem( tsc, NSEC_PER_SEC, &ts->tv_nsec);

Use of the "open coded asm" style here actually forces the compiler to
always choose the 32-bit version of rdtscp, which sets only %eax,
%edx, and %ecx and does not clear the high bits of %rax, %rdx, and
%rdx , because the
variables are declared 32-bit - so the same 32-bit version is used whether
the code is compiled with -m32 or -m64 ( tested using gcc 5.4.0, gcc 6.4.1 ) .

The full story and test programs are in Bug #198961 :
https://bugzilla.kernel.org/show_bug.cgi?id=198961
.

The patched VDSO now handles clock_gettime(CLOCK_MONOTONIC_RAW, &ts)
on the same machine with a latency (minimum time that can be measured)
of
around 100ns (compared with 300-700ns before patch).

I also think it makes sense to expose pointers to the live, updated
gtod->mult and gtod->shift values somehow to userspace . Then
a userspace TSC reader could re-use previous values to avoid
the long-division in most cases and obtain latencies of 10-20ns .

Hence there is now a new method in the VDSO:
__ vdso_linux_tsc_calibration()
which returns a pointer to a 'struct linux_tsc_calibration'
declared in a new header
arch/x86/include/uapi/asm/vdso_tsc_calibration.h

If the clock source is NOT the TSC, this function returns NULL .
The pointer is only valid when the system clock source is the TSC .
User-space TSC readers can detect when TSC is modified with Events,
and now can detect when clock source changes from / to TSC with
this function .

The patch :

---
diff --git a/arch/x86/entry/vdso/vclock_gettime.c \
b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d..e840600 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -21,6 +21,7 @@
#include <linux/math64.h>
#include <linux/time.h>
#include <linux/kernel.h>
+#include <uapi/asm/vdso_tsc_calibration.h>

#define gtod (&VVAR(vsyscall_gtod_data))

@@ -246,6 +247,29 @@ notrace static int __always_inline do_monotonic\
(struct timespec *ts)
return mode;
}

+notrace static int __always_inline do_monotonic_raw( struct timespec *ts)
+{
+ volatile u32 tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs
generated for 64-bit as for 32-bit builds
+ u64 ns;
+ register u64 tsc=0;
+ if (gtod->vclock_mode == VCLOCK_TSC)
+ {
+ asm volatile
+ ( "rdtscp"
+ : "=a" (tsc_lo)
+ , "=d" (tsc_hi)
+ , "=c" (tsc_cpu)
+ ); // : eax, edx, ecx used - NOT rax, rdx, rcx
+ tsc = ((((u64)tsc_hi) & 0xffffffffUL) << 32) |
(((u64)tsc_lo) & 0xffffffffUL);
+ tsc *= gtod->mult;
+ tsc >>= gtod->shift;
+ ts->tv_sec = __iter_div_u64_rem(tsc, NSEC_PER_SEC,
&ns);
+ ts->tv_nsec = ns;
+ return VCLOCK_TSC;
+ }
+ return VCLOCK_NONE;
+}
+
notrace static void do_realtime_coarse(struct timespec *ts)
{
unsigned long seq;
@@ -277,6 +301,10 @@ notrace int __vdso_clock_gettime(clockid_t clock,
struct timespec *ts)
if (do_monotonic(ts) == VCLOCK_NONE)
goto fallback;
break;
+ case CLOCK_MONOTONIC_RAW:
+ if (do_monotonic_raw(ts) == VCLOCK_NONE)
+ goto fallback;
+ break;
case CLOCK_REALTIME_COARSE:
do_realtime_coarse(ts);
break;
@@ -326,3 +354,18 @@ notrace time_t __vdso_time(time_t *t)
}
time_t time(time_t *t)
__attribute__((weak, alias("__vdso_time")));
+
+extern const struct linux_tsc_calibration *
+ __vdso_linux_tsc_calibration(void);
+
+notrace const struct linux_tsc_calibration *
+ __vdso_linux_tsc_calibration(void)
+{
+ if( gtod->vclock_mode == VCLOCK_TSC )
+ return ((const struct linux_tsc_calibration*) &gtod->mult);
+ return 0UL;
+}
+
+const struct linux_tsc_calibration * linux_tsc_calibration(void)
+ __attribute((weak, alias("__vdso_linux_tsc_calibration")));
+
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index d3a2dce..41a2ca5 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -24,7 +24,9 @@ VERSION {
getcpu;
__vdso_getcpu;
time;
- __vdso_time;
+ __vdso_time;
+ linux_tsc_calibration;
+ __vdso_linux_tsc_calibration;
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 422764a..d53bd73 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,7 +25,8 @@ VERSION
global:
__vdso_clock_gettime;
__vdso_gettimeofday;
- __vdso_time;
+ __vdso_time;
+ __vdso_linux_tsc_calibration;
};

LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S
b/arch/x86/entry/vdso/vdsox32.lds.S
index 05cd1c5..fb13b16 100644
--- a/arch/x86/entry/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -20,7 +20,8 @@ VERSION {
__vdso_clock_gettime;
__vdso_gettimeofday;
__vdso_getcpu;
- __vdso_time;
+ __vdso_time;
+ __vdso_linux_tsc_calibration;
local: *;
};
}
diff --git a/arch/x86/include/uapi/asm/vdso_tsc_calibration.h \
b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
new file mode 100644
index 0000000..6febb84
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_VDSO_TSC_CALIBRATION_H
+#define _ASM_X86_VDSO_TSC_CALIBRATION_H
+/*
+ * Programs that want to use rdtsc / rdtscp instructions
+ * from user-space can make use of the Linux kernel TSC calibration
+ * by calling :
+ * __vdso_linux_tsc_calibration(void);
+ * ( one has to resolve this symbol as in
+ * tools/testing/selftests/vDSO/parse_vdso.c
+ * )
+ * which returns a pointer into the read-only
+ * vvar_page (the vsyscall_gtod_data structure),
+ * with the following layout :
+ */
+
+struct linux_tsc_calibration
+{
+ unsigned int mult; /* amount to multiply 64-bit TSC value by */
+
+ unsigned int shift; /* the right shift to apply to
(mult*TSC) yielding \
nanoseconds */
+};
+
+/* To use:
+ *
+ * static const struct lnx_tsc_calibration_s*
+ * (*linux_tsc_cal)(void) = vdso_sym("LINUX_2.6",
"__vdso_linux_tsc_calibration");
+ * if( linux_tsc_cal == 0UL )
+ * { fprintf(stderr,"the patch providing __vdso_linux_tsc_calibration is not \
applied to the kernel.\n");
+ * return ERROR;
+ * }
+ * static const struct lnx_tsc_calibration_s *clock_source =
(*linux_tsc_cal)();
+ * if( clock_source == ((void*)0UL))
+ * fprintf(stderr,"TSC is not the system clocksource.\n");
+ * volatile unsigned int tsc_lo, tsc_hi, tsc_cpu;
+ * asm volatile
+ * ( "rdtscp" : (=a) tsc_hi, (=d) tsc_lo, (=c) tsc_cpu );
+ * unsigned long tsc = (((unsigned long)tsc_hi) << 32) | tsc_lo;
+ * unsigned long nanoseconds =
+ * (( clock_source -> mult ) * tsc ) >> (clock_source -> shift);
+ *
+ * nanoseconds is now TSC value converted to nanoseconds,
+ * according to Linux' clocksource calibration values.
+ *
+ */
+
+#endif
---

Attachment: vdso_vclock_gettime_userspace_tsc_4.15.7.patch
Description: Binary data