Re: [PATCH] lib/vdso: Use __arch_use_vsyscall() to indicate fallback

From: Thomas Gleixner
Date: Mon Oct 21 2019 - 05:32:41 EST


On Sun, 20 Oct 2019, Andy Lutomirski wrote:
> On Sat, Oct 19, 2019 at 3:01 AM Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote:
> > __arch_use_vsyscall() is a pointless exercise TBH. The VDSO data should be
> > updated unconditionally so all the trivial interfaces like time() and
> > getres() just work independently of the functions which depend on the
> > underlying clocksource.
> >
> > These functions have a fallback operation already:
> >
> > Let __arch_get_hw_counter() return U64_MAX and the syscall fallback is
> > invoked.
> >
>
> My thought was that __arch_get_hw_counter() could return last-1 to
> indicate failure, which would allow the two checks to be folded into
> one check. Or we could continue to use U64_MAX and rely on the fact
> that (s64)U64_MAX < 0, not worry about the cycle counter overflowing,
> and let cycles < last catch it.

This is not an overflow catch. It solely deals with the fact that on x86
you can observe (cycles < last) on multi-socket systems under rare
circumstances. No other architecture has that issue AFAIK.

The wraparound of clocksources narrower than 64 bits is handled by:

delta = (cycles - last) & mask;

which operates on unsigned values for obvious reasons.
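
For illustration, a minimal userspace sketch of that masked delta. The
32bit mask and the counter values are made up; the point is that the
unsigned subtraction plus the mask yields the correct elapsed count even
though the counter wrapped between the two reads:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t mask   = 0xffffffffULL; /* hypothetical 32bit clocksource */
	uint64_t last   = 0xfffffff0ULL; /* read shortly before the wrap */
	uint64_t cycles = 0x00000010ULL; /* read shortly after the wrap */

	/*
	 * The unsigned subtraction wraps modulo 2^64 and the mask
	 * reduces the result to the clocksource width, so delta is
	 * 0x20 although cycles < last numerically.
	 */
	uint64_t delta = (cycles - last) & mask;

	printf("delta = 0x%llx\n", (unsigned long long)delta);
	return 0;
}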

> (And we should change it to return s64 at some point regardless -- all
> the math is signed, so the underlying types should be too IMO.)

See above. delta is guaranteed to be >= 0, and the mult/shift math is
not signed either. All the base values stored in the VDSO data are
unsigned as well.
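
The conversion itself is plain unsigned fixed-point math. A minimal
userspace sketch with a made-up mult/shift pair (real values come from
the clocksource registration; the 1GHz numbers below are purely
illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/*
	 * Made-up pair for a hypothetical 1GHz counter: one cycle is
	 * one nanosecond, so (delta * mult) >> shift must hand back
	 * delta unchanged.
	 */
	uint32_t mult  = 1U << 24;
	uint32_t shift = 24;
	uint64_t delta = 1000;	/* cycles since cycle_last */

	uint64_t ns = (delta * (uint64_t)mult) >> shift;

	printf("ns = %llu\n", (unsigned long long)ns);
	return 0;
}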

The weird typecast there:

if ((s64)cycles < 0)

could as well be

if (cycles == U64_MAX)

but the version with the cast generates better code.
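
For the record, a minimal userspace sketch of the equivalence, with
UINT64_MAX standing in for the kernel's U64_MAX:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t cycles = UINT64_MAX;	/* the fallback sentinel */

	/*
	 * All ones reinterpreted as signed is -1 on the usual two's
	 * complement targets, so the sign test fires for the sentinel.
	 * It inspects only the MSB (which a legitimate cycle count is
	 * assumed never to set), so it compiles to a sign flag test
	 * rather than a compare against a 64bit immediate.
	 */
	assert((int64_t)cycles < 0);
	assert(cycles == UINT64_MAX);

	return 0;
}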

I tried to fold the two operations (see patch below), and on all
machines I tested (various generations of Intel and AMD) the result is
slower than what we have now by a couple of cycles, which is a lot for
these functions (between 3 and 5%). I'm sure I tried that before and
ended up with the existing code as the fastest variant.

Why? That's subject to speculation :)

Thanks,

tglx

8<----------
 arch/x86/include/asm/vdso/gettimeofday.h |   39 ++++++-------------------------
 lib/vdso/gettimeofday.c                  |   23 +++---------------
 2 files changed, 13 insertions(+), 49 deletions(-)

--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -235,10 +235,14 @@ static u64 vread_hvclock(void)
 }
 #endif
 
-static inline u64 __arch_get_hw_counter(s32 clock_mode)
+static inline u64 __arch_get_hw_counter(s32 clock_mode, u64 last, u64 mask)
 {
+	/*
+	 * Mask operation is not required as all VDSO clocksources are
+	 * 64bit wide.
+	 */
 	if (clock_mode == VCLOCK_TSC)
-		return (u64)rdtsc_ordered();
+		return (u64)rdtsc_ordered() - last;
 	/*
 	 * For any memory-mapped vclock type, we need to make sure that gcc
 	 * doesn't cleverly hoist a load before the mode check. Otherwise we
@@ -248,13 +252,13 @@ static inline u64 __arch_get_hw_counter(
 #ifdef CONFIG_PARAVIRT_CLOCK
 	if (clock_mode == VCLOCK_PVCLOCK) {
 		barrier();
-		return vread_pvclock();
+		return vread_pvclock() - last;
 	}
 #endif
 #ifdef CONFIG_HYPERV_TIMER
 	if (clock_mode == VCLOCK_HVCLOCK) {
 		barrier();
-		return vread_hvclock();
+		return vread_hvclock() - last;
 	}
 #endif
 	return U64_MAX;
@@ -265,33 +269,6 @@ static __always_inline const struct vdso
 	return __vdso_data;
 }
 
-/*
- * x86 specific delta calculation.
- *
- * The regular implementation assumes that clocksource reads are globally
- * monotonic. The TSC can be slightly off across sockets which can cause
- * the regular delta calculation (@cycles - @last) to return a huge time
- * jump.
- *
- * Therefore it needs to be verified that @cycles are greater than
- * @last. If not then use @last, which is the base time of the current
- * conversion period.
- *
- * This variant also removes the masking of the subtraction because the
- * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX
- * which would result in a pointless operation. The compiler cannot
- * optimize it away as the mask comes from the vdso data and is not compile
- * time constant.
- */
-static __always_inline
-u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
-{
-	if (cycles > last)
-		return (cycles - last) * mult;
-	return 0;
-}
-#define vdso_calc_delta vdso_calc_delta
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_VDSO_GETTIMEOFDAY_H */
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -26,34 +26,21 @@
 #include <asm/vdso/gettimeofday.h>
 #endif /* ENABLE_COMPAT_VDSO */
 
-#ifndef vdso_calc_delta
-/*
- * Default implementation which works for all sane clocksources. That
- * obviously excludes x86/TSC.
- */
-static __always_inline
-u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
-{
-	return ((cycles - last) & mask) * mult;
-}
-#endif
-
 static int do_hres(const struct vdso_data *vd, clockid_t clk,
 		   struct __kernel_timespec *ts)
 {
 	const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
-	u64 cycles, last, sec, ns;
+	u64 delta, sec, ns;
 	u32 seq;
 
 	do {
 		seq = vdso_read_begin(vd);
-		cycles = __arch_get_hw_counter(vd->clock_mode);
-		ns = vdso_ts->nsec;
-		last = vd->cycle_last;
-		if (unlikely((s64)cycles < 0))
+		delta = __arch_get_hw_counter(vd->clock_mode, vd->cycle_last,
+					      vd->mask);
+		if (unlikely((s64)delta < 0))
 			return -1;
 
-		ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
+		ns = vdso_ts->nsec + delta * vd->mult;
 		ns >>= vd->shift;
 		sec = vdso_ts->sec;
 	} while (unlikely(vdso_read_retry(vd, seq)));
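
For completeness: the do_hres() loop above is what a plain
clock_gettime() call executes in userspace via the vDSO when the
clocksource is VDSO capable; a minimal caller:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* Resolves through the vDSO fast path; only the fallback
	 * case ends up doing a real syscall. */
	if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}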