Re: [PATCH 2/2 v10] printk: Add monotonic, boottime, and realtime timestamps

From: Thomas Gleixner
Date: Thu Aug 31 2017 - 12:18:56 EST


On Mon, 28 Aug 2017, Prarit Bhargava wrote:
> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> index d111039e0245..9463606951b1 100644
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -509,6 +509,19 @@ u64 notrace ktime_get_boot_fast_ns(void)
> EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);

Why is this still in the middle of the printk hodgepodge?

> /**
> + * __ktime_get_real_fast_ns_unsafe: - Return an unsafe realtime value
> + * On 32-bit systems may lead to unlikely situations where the result is wrong
> + * because the real time offset is read without the protection of a sequence
> + * lock.
> + */
> +u64 __ktime_get_real_fast_ns_unsafe(void)
> +{
> + struct timekeeper *tk = &tk_core.timekeeper;
> +
> + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_real));
> +}

And no, this function is just horrible, both in name and
implementation. This can be done without that unprotected and racy access
to the realtime offset. Patch below.

Thanks,

tglx

8<-------------------
Subject: timekeeping: Provide NMI safe access to clock realtime
From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Date: Thu, 31 Aug 2017 17:12:48 +0200

The configurable printk timestamping wants access to clock realtime. Right
now there is no ktime_get_real_fast_ns() accessor because reading the
monotonic base and the realtime offset cannot be done atomically. Contrary
to boot time this offset can change during runtime and cause half updated
readouts.

struct tk_read_base was fully packed when the fast timekeeper access was
implemented. commit ceea5e3771ed ("time: Fix clock->read(clock) race around
clocksource changes") removed the 'read' function pointer from the
structure, but of course left the comment stale.

So now the structure can fit a new 64bit member w/o violating the cache
line constraints.

Add real_base to tk_read_base and update it in the fast timekeeper update
sequence.

Implement an accessor which follows the same scheme as the accessor to
clock monotonic, but uses the new real_base to access clock real time.

The runtime overhead for updating real_base is minimal as it just adds two
cache hot values and stores into the same an already dirtied cache line.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
include/linux/timekeeper_internal.h | 6 +++++-
include/linux/timekeeping.h | 1 +
kernel/time/timekeeping.c | 35 +++++++++++++++++++++++++++++++++++
3 files changed, 41 insertions(+), 1 deletion(-)

--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -13,19 +13,22 @@
/**
* struct tk_read_base - base structure for timekeeping readout
* @clock: Current clocksource used for timekeeping.
- * @read: Read function of @clock
* @mask: Bitmask for two's complement subtraction of non 64bit clocks
* @cycle_last: @clock cycle value at last update
* @mult: (NTP adjusted) multiplier for scaled math conversion
* @shift: Shift value for scaled math conversion
* @xtime_nsec: Shifted (fractional) nano seconds offset for readout
* @base: ktime_t (nanoseconds) base time for readout
+ * @base_real: Nanoseconds base value for clock REALTIME readout
*
* This struct has size 56 byte on 64 bit. Together with a seqcount it
* occupies a single 64byte cache line.
*
* The struct is separate from struct timekeeper as it is also used
* for a fast NMI safe accessors.
+ *
+ * @base_real is for the fast NMI safe accessor to allow reading clock
+ * realtime from any context.
*/
struct tk_read_base {
struct clocksource *clock;
@@ -35,6 +38,7 @@ struct tk_read_base {
u32 shift;
u64 xtime_nsec;
ktime_t base;
+ u64 base_real;
};

/**
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -239,6 +239,7 @@ static inline u64 ktime_get_raw_ns(void)
extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
extern u64 ktime_get_boot_fast_ns(void);
+extern u64 ktime_get_real_fast_ns(void);

/*
* Timespec interfaces utilizing the ktime based ones
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -496,6 +496,39 @@ u64 notrace ktime_get_boot_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);

+
+/*
+ * See comment for __ktime_get_fast_ns() vs. timestamp ordering
+ */
+static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf)
+{
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ u64 now;
+
+ do {
+ seq = raw_read_seqcount_latch(&tkf->seq);
+ tkr = tkf->base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base_real);
+
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tk_clock_read(tkr),
+ tkr->cycle_last,
+ tkr->mask));
+ } while (read_seqcount_retry(&tkf->seq, seq));
+
+ return now;
+}
+
+/**
+ * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
+ */
+u64 ktime_get_real_fast_ns(void)
+{
+ return __ktime_get_real_fast_ns(&tk_fast_mono);
+}
+
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
@@ -514,6 +547,7 @@ static void halt_fast_timekeeper(struct
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
cycles_at_suspend = tk_clock_read(tkr);
tkr_dummy.clock = &dummy_clock;
+ tkr_dummy.base_real = tkr->base + tk->offs_real;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);

tkr = &tk->tkr_raw;
@@ -663,6 +697,7 @@ static void timekeeping_update(struct ti
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);

+ tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);