[PATCH 5/5] powerpc/64s: SMP hardlockup watchdog

From: Nicholas Piggin
Date: Mon May 22 2017 - 09:04:59 EST


Implement a new SMP-based watchdog rather than using the perf-based
hardlockup detector. The new watchdog uses all SMP threads to watch
each other for lockups, by pinging a shared cpumask.
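
The scheme reduces to something like the following userspace model
(illustrative only: single-threaded, no locking, and the names only
loosely follow the patch):

	#include <stdio.h>
	#include <stdint.h>

	#define NCPUS 4

	static uint64_t enabled = (1ULL << NCPUS) - 1;
	static uint64_t pending;	/* like wd_smp_cpus_pending */
	static uint64_t last_reset;	/* like wd_smp_last_reset_tb */
	static const uint64_t timeout = 10;	/* panic threshold, in ticks */

	/* One heartbeat: clear our bit; if the mask empties, note the
	 * time and refill it; if it has not emptied for the timeout
	 * period, report whoever is still set. */
	static void heartbeat(int cpu, uint64_t now)
	{
		pending &= ~(1ULL << cpu);
		if (!pending) {
			last_reset = now;
			pending = enabled;
		} else if (now - last_reset >= timeout) {
			printf("tick %llu: CPU%d sees stuck mask 0x%llx\n",
			       (unsigned long long)now, cpu,
			       (unsigned long long)pending);
			last_reset = now; /* the patch also removes stuck CPUs */
		}
	}

	int main(void)
	{
		pending = enabled;
		for (uint64_t t = 1; t <= 30; t++)
			for (int cpu = 0; cpu < NCPUS; cpu++)
				if (cpu != 3 || t < 5) /* CPU3 locks up at tick 5 */
					heartbeat(cpu, t);
		return 0;
	}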

This has nothing really to do with NMIs at the moment, but it hooks
into a couple of kernel options and APIs that have NMI in the name.

In the interests of size and simplicity, I have avoided grabbing the
pseudo-NMI from underneath the soft-IRQ masking code. One issue there
is that I want to avoid re-using the process stack when Linux irqs are
disabled.

This will become most useful with NMI IPIs that can be used to crash
stuck CPUs. It probably needs some small build fixes on other
architectures that do their own watchdogs, like sparc, but it works
on powerpc.

Signed-off-by: Nicholas Piggin <npiggin@xxxxxxxxx>
---
arch/powerpc/Kconfig | 3 +-
arch/powerpc/include/asm/nmi.h | 8 +
arch/powerpc/include/asm/smp.h | 2 +
arch/powerpc/kernel/Makefile | 1 +
arch/powerpc/kernel/kvm.c | 7 +
arch/powerpc/kernel/setup_64.c | 19 ---
arch/powerpc/kernel/smp.c | 20 ++-
arch/powerpc/kernel/watchdog.c | 332 +++++++++++++++++++++++++++++++++++++++++
lib/Kconfig.debug | 2 +-
9 files changed, 372 insertions(+), 22 deletions(-)
create mode 100644 arch/powerpc/kernel/watchdog.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 481010797553..a9ce1a8b03ac 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -198,7 +198,8 @@ config PPC
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MOD_ARCH_SPECIFIC
- select HAVE_NMI if PERF_EVENTS
+ select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S)
+ select HAVE_NMI_WATCHDOG if (PPC64 && PPC_BOOK3S && SMP)
select HAVE_OPROFILE
select HAVE_OPTPROBES if PPC64
select HAVE_PERF_EVENTS
diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h
index ff1ccb375e60..780d8d5178dc 100644
--- a/arch/powerpc/include/asm/nmi.h
+++ b/arch/powerpc/include/asm/nmi.h
@@ -1,4 +1,12 @@
#ifndef _ASM_NMI_H
#define _ASM_NMI_H

+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+void arch_touch_nmi_watchdog(void);
+#else
+static inline void arch_touch_nmi_watchdog(void)
+{
+}
+#endif
+
#endif /* _ASM_NMI_H */
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index ebddb2111d87..8ea98504f900 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -55,6 +55,8 @@ struct smp_ops_t {
int (*cpu_bootable)(unsigned int nr);
};

+extern void smp_flush_nmi_ipi(u64 delay_us);
+extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us);
extern void smp_send_debugger_break(void);
extern void start_secondary_resume(void);
extern void smp_generic_give_timebase(void);
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index e132902e1f14..459c17bc5cf3 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \
signal_64.o ptrace32.o \
paca.o nvram_64.o firmware.o
obj-$(CONFIG_VDSO32) += vdso32/
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 9ad37f827a97..1086ea37c832 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -25,6 +25,7 @@
#include <linux/kvm_para.h>
#include <linux/slab.h>
#include <linux/of.h>
+#include <linux/nmi.h> /* hardlockup_detector_disable() */

#include <asm/reg.h>
#include <asm/sections.h>
@@ -718,6 +719,12 @@ static __init void kvm_free_tmp(void)

static int __init kvm_guest_init(void)
{
+ /*
+ * The hardlockup detector is likely to get false positives in
+ * KVM guests, so disable it by default.
+ */
+ hardlockup_detector_disable();
+
if (!kvm_para_available())
goto free_tmp;

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index ab650905f75a..9c15b3b8ba4d 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -726,22 +726,3 @@ unsigned long memory_block_size_bytes(void)
struct ppc_pci_io ppc_pci_io;
EXPORT_SYMBOL(ppc_pci_io);
#endif
-
-#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
-u64 hw_nmi_get_sample_period(int watchdog_thresh)
-{
- return ppc_proc_freq * watchdog_thresh;
-}
-
-/*
- * The hardlockup detector breaks PMU event based branches and is likely
- * to get false positives in KVM guests, so disable it by default.
- */
-static int __init disable_hardlockup_detector(void)
-{
- hardlockup_detector_disable();
-
- return 0;
-}
-early_initcall(disable_hardlockup_detector);
-#endif
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index df2a41647d8e..5b602a284084 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -433,13 +433,31 @@ static void do_smp_send_nmi_ipi(int cpu)
}
}

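+/*
+ * Wait for any in-flight NMI IPI to complete. delay_us > 0 bounds the
+ * wait in microseconds; == 0 waits indefinitely.
+ */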
+void smp_flush_nmi_ipi(u64 delay_us)
+{
+ unsigned long flags;
+
+ nmi_ipi_lock_start(&flags);
+ while (nmi_ipi_busy_count) {
+ nmi_ipi_unlock_end(&flags);
+ udelay(1);
+ if (delay_us) {
+ delay_us--;
+ if (!delay_us)
+ return;
+ }
+ nmi_ipi_lock_start(&flags);
+ }
+ nmi_ipi_unlock_end(&flags);
+}
+
/*
* - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS.
* - fn is the target callback function.
* - delay_us > 0 is the delay before giving up waiting for targets to
* enter the handler, == 0 specifies indefinite delay.
*/
-static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
+int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
{
unsigned long flags;
int me = raw_smp_processor_id();
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
new file mode 100644
index 000000000000..bd6755d6bd5a
--- /dev/null
+++ b/arch/powerpc/kernel/watchdog.c
@@ -0,0 +1,332 @@
+/*
+ * Watchdog support on powerpc systems.
+ *
+ * Copyright 2017, IBM Corporation.
+ *
+ * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
+ */
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/kdebug.h>
+#include <linux/sched/debug.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+
+#include <asm/paca.h>
+
+/*
+ * The watchdog has a simple timer that runs on each CPU, once per timer
+ * period. This is the heartbeat.
+ *
+ * Then there are checks for any CPU whose heartbeat has not triggered
+ * for the panic timeout period. Currently the watchdog only supports an
+ * SMP check, so the heartbeat only turns on when there are 2 or more CPUs.
+ *
+ * This is not an NMI watchdog, but Linux uses that name for a generic
+ * watchdog in some cases, so NMI gets used in some places.
+ */
+
+static cpumask_t wd_cpus_enabled __read_mostly;
+
+static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */
+
+static u64 wd_timer_period __read_mostly; /* msec between checks */
+
+static DEFINE_PER_CPU(struct timer_list, wd_timer);
+
+/*
+ * These are for the SMP checker. All CPUs clear their bit in the
+ * pending mask in their heartbeat, every timer period. Once the mask
+ * becomes empty, the time is noted and the mask is refilled. If the
+ * time since the last refill exceeds the panic timeout, the CPUs
+ * still set in the mask are stuck, and we can panic with that list.
+ *
+ * This will work best with NMI IPIs for crash code so the stuck CPUs
+ * can be pulled out to get their backtraces.
+ */
+static unsigned long __wd_smp_lock = 0;
+static int wd_smp_enabled __read_mostly = 0;
+static cpumask_t wd_smp_cpus_pending;
+static cpumask_t wd_smp_cpus_stuck;
+static u64 wd_smp_last_reset_tb;
+
+static inline void wd_smp_lock(unsigned long *flags)
+{
+ /*
+ * Avoid locking layers if possible.
+ * This may be called from low level interrupt handlers at some
+ * point in future.
+ */
+ local_irq_save(*flags);
+ while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock)))
+ cpu_relax();
+}
+
+static inline void wd_smp_unlock(unsigned long *flags)
+{
+ clear_bit_unlock(0, &__wd_smp_lock);
+ local_irq_restore(*flags);
+}
+
+static void wd_lockup_ipi(struct pt_regs *regs)
+{
+ pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", smp_processor_id());
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+}
+
+static void watchdog_smp_panic(int cpu, u64 tb)
+{
+ unsigned long flags;
+
+ wd_smp_lock(&flags);
+ if (tb - wd_smp_last_reset_tb < wd_panic_timeout_tb) {
+ wd_smp_unlock(&flags);
+ return;
+ }
+
+ pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n",
+ cpu, cpumask_pr_args(&wd_smp_cpus_pending));
+
+ if (hardlockup_panic) {
+ panic("Hard LOCKUP");
+ } else {
+ int c;
+
+ for_each_cpu(c, &wd_smp_cpus_pending) {
+ if (c == cpu)
+ continue;
+ smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+ }
+ smp_flush_nmi_ipi(1000000);
+ printk_safe_flush();
+ /*
+ * printk_safe_flush() seems to require another print
+ * before anything actually goes out to console.
+ */
+ }
+
+ pr_emerg("Watchdog removing stuck CPUS:%*pbl\n",
+ cpumask_pr_args(&wd_smp_cpus_pending));
+
+ /* Take the stuck CPUs out of the watch group */
+ cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, &wd_smp_cpus_pending);
+ cpumask_andnot(&wd_smp_cpus_pending,
+ &wd_cpus_enabled,
+ &wd_smp_cpus_stuck);
+ wd_smp_last_reset_tb = tb;
+
+ wd_smp_unlock(&flags);
+}
+
+static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
+{
+ if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
+ if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
+ unsigned long flags;
+
+ pr_emerg("Watchdog CPU:%d became unstuck\n", cpu);
+ dump_stack();
+
+ wd_smp_lock(&flags);
+ cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
+ wd_smp_unlock(&flags);
+ }
+ return;
+ }
+
+ cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
+ if (cpumask_empty(&wd_smp_cpus_pending)) {
+ unsigned long flags;
+
+ wd_smp_lock(&flags);
+ if (cpumask_empty(&wd_smp_cpus_pending)) {
+ wd_smp_last_reset_tb = tb;
+ cpumask_andnot(&wd_smp_cpus_pending,
+ &wd_cpus_enabled,
+ &wd_smp_cpus_stuck);
+ }
+ wd_smp_unlock(&flags);
+ }
+}
+
+static void watchdog_timer_interrupt(int cpu)
+{
+ u64 tb;
+
+ if (wd_smp_enabled) {
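+ /* Pairs with the smp_wmb() before wd_smp_enabled is set */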
+ smp_rmb();
+
+ tb = get_tb();
+
+ wd_smp_clear_cpu_pending(cpu, tb);
+
+ if (tb - wd_smp_last_reset_tb >= wd_panic_timeout_tb)
+ watchdog_smp_panic(cpu, tb);
+ }
+}
+
+static void wd_timer_reset(unsigned int cpu, struct timer_list *t)
+{
+ t->expires = jiffies + msecs_to_jiffies(wd_timer_period);
+ if (wd_timer_period > 1000)
+ t->expires = round_jiffies(t->expires);
+ add_timer_on(t, cpu);
+}
+
+static void wd_timer_fn(unsigned long data)
+{
+ struct timer_list *t = this_cpu_ptr(&wd_timer);
+ int cpu = smp_processor_id();
+
+ watchdog_timer_interrupt(cpu);
+
+ wd_timer_reset(cpu, t);
+}
+
+void arch_touch_nmi_watchdog(void)
+{
+ int cpu = smp_processor_id();
+
+ watchdog_timer_interrupt(cpu);
+}
+EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+
+static void start_watchdog_timer_on(unsigned int cpu)
+{
+ struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);
+
+ setup_pinned_timer(t, wd_timer_fn, 0);
+ wd_timer_reset(cpu, t);
+}
+
+static void stop_watchdog_timer_on(unsigned int cpu)
+{
+ struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);
+
+ del_timer_sync(t);
+}
+
+static int start_wd_on_cpu(unsigned int cpu)
+{
+ pr_info("Watchdog cpu:%d\n", cpu);
+
+ if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
+ return 0;
+
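+ /*
+ * The heartbeat timers only run once there are at least two CPUs
+ * to watch each other, so the first CPU to come up starts nothing.
+ * When the second CPU enables, kick its timer and the first CPU's.
+ */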
+ if (cpumask_weight(&wd_cpus_enabled) > 0) {
+ start_watchdog_timer_on(cpu);
+
+ if (cpumask_weight(&wd_cpus_enabled) == 1)
+ start_watchdog_timer_on(cpumask_first(&wd_cpus_enabled));
+ }
+
+ cpumask_set_cpu(cpu, &wd_cpus_enabled);
+
+ if (cpumask_weight(&wd_cpus_enabled) == 2) {
+ cpumask_copy(&wd_smp_cpus_pending, &wd_cpus_enabled);
+ wd_smp_last_reset_tb = get_tb();
+ smp_wmb();
+ wd_smp_enabled = 1;
+
+ pr_info("Watchdog starting cross-CPU SMP watchdog\n");
+ }
+
+ return 0;
+}
+
+static int stop_wd_on_cpu(unsigned int cpu)
+{
+ if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /* If this is the only enabled CPU, its timer never started */
+ if (cpumask_weight(&wd_cpus_enabled) > 1)
+ stop_watchdog_timer_on(cpu);
+
+ cpumask_clear_cpu(cpu, &wd_cpus_enabled);
+
+ if (wd_smp_enabled) {
+ smp_wmb();
+ wd_smp_clear_cpu_pending(cpu, get_tb());
+
+ if (cpumask_weight(&wd_cpus_enabled) == 1) {
+ stop_watchdog_timer_on(cpumask_first(&wd_cpus_enabled));
+
+ pr_info("Watchdog stopping cross-CPU SMP watchdog\n");
+ wd_smp_last_reset_tb = get_tb();
+ cpumask_copy(&wd_smp_cpus_pending, &wd_cpus_enabled);
+ smp_wmb();
+ wd_smp_enabled = 0;
+ }
+ }
+
+ return 0;
+}
+
+static void watchdog_calc_timeouts(void)
+{
+ wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;
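+
+ /* Heartbeat roughly three times per panic timeout period */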
+ wd_timer_period = watchdog_thresh * 1000 / 3;
+}
+
+void watchdog_nmi_reconfigure(void)
+{
+ int cpu;
+
+ watchdog_calc_timeouts();
+
+ for_each_cpu(cpu, &wd_cpus_enabled) {
+ stop_wd_on_cpu(cpu);
+ }
+
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ return;
+
+ if (watchdog_suspended)
+ return;
+
+ for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) {
+ start_wd_on_cpu(cpu);
+ }
+}
+
+static int __init powerpc_watchdog_init(void)
+{
+ int err;
+
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ return 0;
+
+ watchdog_calc_timeouts();
+
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online",
+ start_wd_on_cpu, stop_wd_on_cpu);
+ if (err < 0)
+ pr_warning("Watchdog could not be initialized\n");
+
+ return 0;
+}
+arch_initcall(powerpc_watchdog_init);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c69e12ce823a..081e7df0eb9a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -818,7 +818,7 @@ config HARDLOCKUP_DETECTOR_PERF
config HARDLOCKUP_DETECTOR
bool "Detect Hard Lockups"
depends on LOCKUP_DETECTOR
- depends on !HAVE_NMI_WATCHDOG || (PERF_EVENTS && HAVE_PERF_EVENTS_NMI)
+ depends on PPC64 || !HAVE_NMI_WATCHDOG || (PERF_EVENTS && HAVE_PERF_EVENTS_NMI)
select HARDLOCKUP_DETECTOR_PERF if !HAVE_NMI_WATCHDOG

config BOOTPARAM_HARDLOCKUP_PANIC
--
2.11.0