[PATCH tip/master] RCU-based detection of stalled CPUs for ClassicRCU

From: Paul E. McKenney
Date: Thu Oct 02 2008 - 19:06:55 EST


Hello!

This patch adds stalled-CPU detection to Classic RCU. This capability
is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which
is disabled by default. This is a debugging feature to detect infinite loops
in kernel code, not something that non-kernel-hackers would be expected
to care about. This feature can detect looping CPUs in !PREEMPT builds
and looping CPUs with preemption disabled in PREEMPT builds. This is
essentially a port of this functionality from the treercu patch, replacing
the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4).

The changes from the patch in tip/core/rcu include making the config
variable name match that in treercu, changing from seconds to jiffies to
avoid spurious warnings, and printing a boot message when this feature
is enabled.

Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
---

include/linux/rcuclassic.h | 12 ++-
kernel/rcuclassic.c | 166 +++++++++++++++++++++++----------------------
lib/Kconfig.debug | 2 +-
3 files changed, 96 insertions(+), 84 deletions(-)

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 29bf528..2d72d20 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -40,15 +40,21 @@
#include <linux/cpumask.h>
#include <linux/seqlock.h>

+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK (3 * HZ) /* in jiffies, for rcp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* in jiffies, for rcp->jiffies_stall */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */

/* Global control variables for rcupdate callback mechanism. */
struct rcu_ctrlblk {
long cur; /* Current batch number. */
long completed; /* Number of the last completed batch */
long pending; /* Number of the last pending batch */
-#ifdef CONFIG_DEBUG_RCU_STALL
- unsigned long gp_check; /* Time grace period should end, in seconds. */
-#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+ unsigned long gp_start; /* Time at which GP started in jiffies. */
+ unsigned long jiffies_stall;
+ /* Time at which to check for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */

int signaled;

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index ed15128..eae2fb6 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
}
}

+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+ rcp->gp_start = jiffies; /* Reference point for the "t=" value in stall reports. */
+ rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; /* First stall check. */
+}
+
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ int cpu;
+ long delta;
+ unsigned long flags;
+
+ /* Only let one CPU complain about others per time interval; */
+ /* recheck under rcp->lock, and give up if the grace period has ended. */
+ spin_lock_irqsave(&rcp->lock, flags);
+ delta = jiffies - rcp->jiffies_stall;
+ if (delta < 2 || rcp->cur == rcp->completed) {
+ spin_unlock_irqrestore(&rcp->lock, flags);
+ return;
+ }
+ rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+ spin_unlock_irqrestore(&rcp->lock, flags);
+
+ /* OK, time to rat on our buddy: list every CPU still set in rcp->cpumask. */
+
+ printk(KERN_ERR "RCU detected CPU stalls:");
+ for_each_possible_cpu(cpu) {
+ if (cpu_isset(cpu, rcp->cpumask))
+ printk(" %d", cpu);
+ }
+ printk(" (detected by %d, t=%ld jiffies)\n",
+ smp_processor_id(), (long)(jiffies - rcp->gp_start));
+}
+
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ unsigned long flags;
+ /* Report current jiffies and jiffies since gp_start, then dump our stack. */
+ printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+ smp_processor_id(), jiffies,
+ jiffies - rcp->gp_start);
+ dump_stack();
+ spin_lock_irqsave(&rcp->lock, flags);
+ if ((long)(jiffies - rcp->jiffies_stall) >= 0) /* deadline reached or passed */
+ rcp->jiffies_stall =
+ jiffies + RCU_SECONDS_TILL_STALL_RECHECK; /* schedule the next report */
+ spin_unlock_irqrestore(&rcp->lock, flags);
+ set_need_resched(); /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ long delta;
+
+ delta = jiffies - rcp->jiffies_stall; /* jiffies past the stall-check deadline */
+ if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(rcp);
+
+ } else if (rcp->cur != rcp->completed && delta >= 2) {
+
+ /* They had two jiffies to dump stack, so complain. */
+ print_other_cpu_stall(rcp);
+ }
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+/* Stall detection configured out: empty stubs, same signatures as above. */
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
@@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
* period (if necessary).
*/

-#ifdef CONFIG_DEBUG_RCU_STALL
-
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
-{
- rcp->gp_check = get_seconds() + 3;
-}
-
-static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- int cpu;
- long delta;
- unsigned long flags;
-
- /* Only let one CPU complain about others per time interval. */
-
- spin_lock_irqsave(&rcp->lock, flags);
- delta = get_seconds() - rcp->gp_check;
- if (delta < 2L || cpus_empty(rcp->cpumask)) {
- spin_unlock(&rcp->lock);
- return;
- }
- rcp->gp_check = get_seconds() + 30;
- spin_unlock_irqrestore(&rcp->lock, flags);
-
- /* OK, time to rat on our buddy... */
-
- printk(KERN_ERR "RCU detected CPU stalls:");
- for_each_cpu_mask(cpu, rcp->cpumask)
- printk(" %d", cpu);
- printk(" (detected by %d, t=%lu/%lu)\n",
- smp_processor_id(), get_seconds(), rcp->gp_check);
-}
-
-static void print_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- unsigned long flags;
-
- printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
- smp_processor_id(), get_seconds(), rcp->gp_check);
- dump_stack();
- spin_lock_irqsave(&rcp->lock, flags);
- if ((long)(get_seconds() - rcp->gp_check) >= 0L)
- rcp->gp_check = get_seconds() + 30;
- spin_unlock_irqrestore(&rcp->lock, flags);
-}
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
- long delta;
-
- delta = get_seconds() - rcp->gp_check;
- if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
-
- /* We haven't checked in, so go dump stack. */
-
- print_cpu_stall(rcp);
-
- } else {
- if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
- /* They had two seconds to dump stack, so complain. */
- print_other_cpu_stall(rcp);
- }
- }
-}
-
-#else /* #ifdef CONFIG_DEBUG_RCU_STALL */
-
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
-{
-}
-
-static inline void
-check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
-
/*
* Register a new batch of callbacks, and start it up if there is currently no
* active batch and the batch to be registered has not already occurred.
@@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
if (rcp->cur != rcp->pending &&
rcp->completed == rcp->cur) {
rcp->cur++;
- record_gp_check_time(rcp);
+ record_gp_stall_check_time(rcp);

/*
* Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* Check for CPU stalls, if enabled. */
- check_cpu_stall(rcp, rdp);
+ check_cpu_stall(rcp);

if (rdp->nxtlist) {
long completed_snap = ACCESS_ONCE(rcp->completed);
@@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
*/
void __init __rcu_init(void)
{
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+ printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
/* Register notifier for non-boot CPUs */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4e921a8..e0e0582 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -616,7 +616,7 @@ config RCU_TORTURE_TEST_RUNNABLE
Say N here if you want the RCU torture tests to start only
after being manually enabled via /proc.

-config RCU_CPU_STALL
+config RCU_CPU_STALL_DETECTOR
bool "Check for stalled CPUs delaying RCU grace periods"
depends on CLASSIC_RCU
default n
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/