[RFC PATCH] Fix: clocksource watchdog marks TSC unstable on guest VM
From: Mathieu Desnoyers
Date: Mon Sep 07 2015 - 18:22:07 EST
I have been getting the following warnings across a range of guest
kernels in my development virtual machines. The host is a
3.13 Ubuntu kernel. The latest guest on which I reproduced
this is a 4.2 kernel (akpm's tree).
[ 126.902240] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
[ 126.902240] clocksource: 'hpet' wd_now: f948c267 wd_last: f5edb97c mask: ffffffff
[ 126.967563] clocksource: 'tsc' cs_now: 48f04ee060 cs_last: 48a8c73ed0 mask: ffffffffffffffff
Relevant info from the guest kernel dmesg:
[ 2.197070] HPET: 3 timers in total, 0 timers will be used for per-cpu timer
[ 2.198021] hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0
[ 2.199557] hpet0: 3 comparators, 64-bit 100.000000 MHz counter
[ 3.393185] tsc: Refined TSC clocksource calibration: 2400.037 MHz
[ 3.396552] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x22985a87e67, max_idle_ns: 440795315743 ns
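Back-of-the-envelope math from the logged values (using the hpet and
TSC frequencies above) shows how the threshold gets crossed:

hpet: 0xf948c267 - 0xf5edb97c = 0x35b08eb cycles / 100 MHz ~= 563.0 ms
tsc: 0x48f04ee060 - 0x48a8c73ed0 = 0x4787a190 cycles / 2400.037 MHz ~= 500.0 ms

The hpet interval is ~63 ms longer than the TSC interval, just over
the 62.5 ms WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4), which is
consistent with the guest being preempted for ~63 ms between the two
reads.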
It appears that the clocksource watchdog does:
local_irq_disable();
csnow = cs->read(cs);
wdnow = watchdog->read(watchdog);
local_irq_enable();
If the host kernel preempts the guest between reading the
TSC and the watchdog, the resulting delta may go well over
the watchdog threshold, which is currently 62.5ms.
Correct this issue by reading "wdnow" before and after reading
the TSC, and retrying if the delta between the two watchdog reads
exceeds WATCHDOG_THRESHOLD.
Introduce WATCHDOG_RETRY to bound the number of retries (in the
unlikely event of a bogus clocksource for wdnow). If the retry
limit is reached, bail out.
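In short, the new read sequence looks like this (a minimal sketch of
the idea, not the patch itself; wd_delta_ns() is a hypothetical
helper standing in for the clocksource_delta()/clocksource_cyc2ns()
pair the actual patch uses):

int try;

for (try = 0; try < WATCHDOG_RETRY; try++) {
	local_irq_disable();
	wdnow[0] = watchdog->read(watchdog);
	csnow = cs->read(cs);
	wdnow[1] = watchdog->read(watchdog);
	local_irq_enable();

	/* Watchdog reads close together: csnow can be trusted. */
	if (wd_delta_ns(wdnow[0], wdnow[1]) <= WATCHDOG_THRESHOLD)
		break;
	/* Reads too far apart: likely preempted by the host, retry. */
}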
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
CC: Daniel Lezcano <daniel.lezcano@xxxxxxxxxx>
CC: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
kernel/time/clocksource.c | 190 +++++++++++++++++++++++++++-------------------
1 file changed, 111 insertions(+), 79 deletions(-)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 174c594..caca552 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -126,6 +126,11 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating);
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
+/*
+ * When two back-to-back reads of the watchdog clock are too far
+ * apart, limit the number of retries allowed before we bail out.
+ */
+#define WATCHDOG_RETRY 20
static void clocksource_watchdog_work(struct work_struct *work)
{
@@ -166,97 +171,124 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static void clocksource_watchdog(unsigned long data)
+static void check_cs_wd(struct clocksource *cs)
{
- struct clocksource *cs;
- cycle_t csnow, wdnow, cslast, wdlast, delta;
- int64_t wd_nsec, cs_nsec;
- int next_cpu, reset_pending;
-
- spin_lock(&watchdog_lock);
- if (!watchdog_running)
- goto out;
-
- reset_pending = atomic_read(&watchdog_reset_pending);
+ cycle_t csnow, wdnow[2], cslast, wdlast, delta;
+ int64_t wd_nsec[2], cs_nsec;
+ int retry_count = WATCHDOG_RETRY;
+
+restart:
+ /* Clocksource already marked unstable? */
+ if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+ if (finished_booting)
+ schedule_work(&watchdog_work);
+ return;
+ }
- list_for_each_entry(cs, &watchdog_list, wd_list) {
+ local_irq_disable();
+ wdnow[0] = watchdog->read(watchdog);
+ csnow = cs->read(cs);
+ wdnow[1] = watchdog->read(watchdog);
+ local_irq_enable();
+
+ /* Clocksource initialized ? */
+ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
+ atomic_read(&watchdog_reset_pending)) {
+ cs->flags |= CLOCK_SOURCE_WATCHDOG;
+ cs->wd_last = wdnow[1];
+ cs->cs_last = csnow;
+ return;
+ }
- /* Clocksource already marked unstable? */
- if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
- if (finished_booting)
- schedule_work(&watchdog_work);
- continue;
- }
+ delta = clocksource_delta(wdnow[0], cs->wd_last, watchdog->mask);
+ wd_nsec[0] = clocksource_cyc2ns(delta, watchdog->mult,
+ watchdog->shift);
+ delta = clocksource_delta(wdnow[1], cs->wd_last, watchdog->mask);
+ wd_nsec[1] = clocksource_cyc2ns(delta, watchdog->mult,
+ watchdog->shift);
- local_irq_disable();
- csnow = cs->read(cs);
- wdnow = watchdog->read(watchdog);
- local_irq_enable();
-
- /* Clocksource initialized ? */
- if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
- atomic_read(&watchdog_reset_pending)) {
- cs->flags |= CLOCK_SOURCE_WATCHDOG;
- cs->wd_last = wdnow;
- cs->cs_last = csnow;
- continue;
+ if ((abs(wd_nsec[1] - wd_nsec[0]) > WATCHDOG_THRESHOLD)) {
+ /*
+ * Got bogus reads from the watchdog clocksource.
+ * This can be caused by SMI, MCE, NMI, and preemption of guest
+ * kernel by host.
+ */
+ pr_warn("Bogus reads from watchdog clocksource: wdnow[0]: %llx wdnow[1]: %llx\n",
+ wdnow[0], wdnow[1]);
+ pr_warn(" delta: %llu nsec",
+ (unsigned long long) abs(wd_nsec[1] - wd_nsec[0]));
+ if (retry_count-- > 0) {
+ goto restart;
+ } else {
+ pr_warn("Bogus clock source used for watchdog. Giving up.\n");
+ return;
}
+ }
+ delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
+ cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ wdlast = cs->wd_last; /* save these in case we print them */
+ cslast = cs->cs_last;
+ cs->cs_last = csnow;
+ cs->wd_last = wdnow[1];
+
+ if (atomic_read(&watchdog_reset_pending))
+ return;
- delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
- wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
- watchdog->shift);
+ /* Check the deviation from the watchdog clocksource. */
+ if ((abs(cs_nsec - wd_nsec[1]) > WATCHDOG_THRESHOLD)) {
+ pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
+ cs->name);
+ pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+ watchdog->name, wdnow[1], wdlast, watchdog->mask);
+ pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+ cs->name, csnow, cslast, cs->mask);
+ __clocksource_unstable(cs);
+ return;
+ }
- delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
- cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
- wdlast = cs->wd_last; /* save these in case we print them */
- cslast = cs->cs_last;
- cs->cs_last = csnow;
- cs->wd_last = wdnow;
+ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+ (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
+ (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+ /* Mark it valid for high-res. */
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
- if (atomic_read(&watchdog_reset_pending))
- continue;
+ /*
+ * clocksource_done_booting() will sort it if
+ * finished_booting is not set yet.
+ */
+ if (!finished_booting)
+ return;
- /* Check the deviation from the watchdog clocksource. */
- if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
- pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
- cs->name);
- pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
- watchdog->name, wdnow, wdlast, watchdog->mask);
- pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
- cs->name, csnow, cslast, cs->mask);
- __clocksource_unstable(cs);
- continue;
+ /*
+ * If this is not the current clocksource let
+ * the watchdog thread reselect it. Due to the
+ * change to high res this clocksource might
+ * be preferred now. If it is the current
+ * clocksource let the tick code know about
+ * that change.
+ */
+ if (cs != curr_clocksource) {
+ cs->flags |= CLOCK_SOURCE_RESELECT;
+ schedule_work(&watchdog_work);
+ } else {
+ tick_clock_notify();
}
+ }
+}
- if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
- (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
- (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
- /* Mark it valid for high-res. */
- cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+static void clocksource_watchdog(unsigned long data)
+{
+ struct clocksource *cs;
+ int next_cpu, reset_pending;
- /*
- * clocksource_done_booting() will sort it if
- * finished_booting is not set yet.
- */
- if (!finished_booting)
- continue;
-
- /*
- * If this is not the current clocksource let
- * the watchdog thread reselect it. Due to the
- * change to high res this clocksource might
- * be preferred now. If it is the current
- * clocksource let the tick code know about
- * that change.
- */
- if (cs != curr_clocksource) {
- cs->flags |= CLOCK_SOURCE_RESELECT;
- schedule_work(&watchdog_work);
- } else {
- tick_clock_notify();
- }
- }
- }
+ spin_lock(&watchdog_lock);
+ if (!watchdog_running)
+ goto out;
+
+ reset_pending = atomic_read(&watchdog_reset_pending);
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ check_cs_wd(cs);
/*
* We only clear the watchdog_reset_pending, when we did a
--
1.9.1