[PATCH 1/3 v2] time: Fix clock->read(clock) race around clocksource changes

From: John Stultz
Date: Wed May 31 2017 - 23:08:24 EST


In some testing on arm64 platforms, I was seeing null ptr
crashes in the kselftest/timers clocksource-switch test.

This was happening in a read function like:
u64 clocksource_mmio_readl_down(struct clocksource *c)
{
return ~(u64)readl_relaxed(to_mmio_clksrc(c)->reg) & c->mask;
}

Where the callers enter the seqlock, and then call something
like:
cycle_now = tkr->read(tkr->clock);

The problem seeming to be that since the ->read() and ->clock
pointer references are happening separately, its possible the
clocksource change happens in between and we end up calling the
old ->read() function with the new clocksource, (or vice-versa)
which causes the to_mmio_clksrc() in the read function to run
off into space.

This patch tries to address the issue by providing a helper
function that atomically reads the clock value and then calls
the clock->read(clock) function so that we always call the read
funciton with the appropriate clocksource and don't accidentally
mix them.

The one exception where this helper isn't necessary is for the
fast-timekepers which use their own locking and update logic
to the tkr structures.

Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Miroslav Lichvar <mlichvar@xxxxxxxxxx>
Cc: Richard Cochran <richardcochran@xxxxxxxxx>
Cc: Prarit Bhargava <prarit@xxxxxxxxxx>
Cc: Stephen Boyd <stephen.boyd@xxxxxxxxxx>
Cc: Daniel Mentz <danielmentz@xxxxxxxxxx>
Cc: stable <stable@xxxxxxxxxxxxxxx>
Acked-by: Ingo Molnar <mingo@xxxxxxxxxx>
Signed-off-by: John Stultz <john.stultz@xxxxxxxxxx>
---
v2: Addressed Ingo's feedback on wording
---
kernel/time/timekeeping.c | 40 +++++++++++++++++++++++++++++-----------
1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9652bc5..797c73e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -118,6 +118,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->offs_boot = ktime_add(tk->offs_boot, delta);
}

+/*
+ * tk_clock_read - atomic clocksource read() helper
+ *
+ * This helper is necessary to use in the read paths because, while the
+ * seqlock ensures we don't return a bad value while structures are updated,
+ * it doesn't protect from potential crashes. There is the possibility that
+ * the tkr's clocksource may change between the read reference, and the
+ * clock reference passed to the read function. This can cause crashes if
+ * the wrong clocksource is passed to the wrong read function.
+ * This isn't necessary to use when holding the timekeeper_lock or doing
+ * a read of the fast-timekeeper tkrs (which is protected by its own locking
+ * and update logic).
+ */
+static inline u64 tk_clock_read(struct tk_read_base *tkr)
+{
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ return clock->read(clock);
+}
+
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */

@@ -175,7 +195,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
*/
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tkr->read(tkr->clock);
+ now = tk_clock_read(tkr);
last = tkr->cycle_last;
mask = tkr->mask;
max = tkr->clock->max_cycles;
@@ -209,7 +229,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
u64 cycle_now, delta;

/* read clocksource */
- cycle_now = tkr->read(tkr->clock);
+ cycle_now = tk_clock_read(tkr);

/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
@@ -240,7 +260,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
tk->tkr_mono.clock = clock;
tk->tkr_mono.read = clock->read;
tk->tkr_mono.mask = clock->mask;
- tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+ tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);

tk->tkr_raw.clock = clock;
tk->tkr_raw.read = clock->read;
@@ -477,7 +497,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
struct tk_read_base *tkr = &tk->tkr_mono;

memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- cycles_at_suspend = tkr->read(tkr->clock);
+ cycles_at_suspend = tk_clock_read(tkr);
tkr_dummy.read = dummy_clock_read;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);

@@ -649,11 +669,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- struct clocksource *clock = tk->tkr_mono.clock;
u64 cycle_now, delta;
u64 nsec;

- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
@@ -929,8 +948,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)

do {
seq = read_seqcount_begin(&tk_core.seq);
-
- now = tk->tkr_mono.read(tk->tkr_mono.clock);
+ now = tk_clock_read(&tk->tkr_mono);
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
@@ -1108,7 +1126,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
* Check whether the system counter value provided by the
* device driver is on the current timekeeping interval.
*/
- now = tk->tkr_mono.read(tk->tkr_mono.clock);
+ now = tk_clock_read(&tk->tkr_mono);
interval_start = tk->tkr_mono.cycle_last;
if (!cycle_between(interval_start, cycles, now)) {
clock_was_set_seq = tk->clock_was_set_seq;
@@ -1629,7 +1647,7 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
cycle_now > tk->tkr_mono.cycle_last) {
u64 nsec, cyc_delta;
@@ -2030,7 +2048,7 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif

--
2.7.4