Re: [PATCH 1/4] sched/idle: Fix missing need_resched() check after rcu_idle_enter()

From: Peter Zijlstra
Date: Tue Jan 05 2021 - 04:56:01 EST


On Mon, Jan 04, 2021 at 04:20:55PM +0100, Frederic Weisbecker wrote:
> Entering RCU idle mode may cause a deferred wake up of an RCU NOCB_GP
> kthread (rcuog) to be serviced.
>
> Usually a wake up happening while running the idle task is spotted in
> one of the need_resched() checks carefully placed within the idle loop
> that can break to the scheduler.

Urgh, this is horrific and fragile :/ You having had to audit and fix a
number of rcu_idle_enter() callers should've made you realize that
making rcu_idle_enter() return something would've been saner.

Also, I might hope that when RCU does do that wakeup, it will not have
put RCU in idle mode? So it is a natural 'fail' state for
rcu_idle_enter(), *sigh* it continues to put RCU to sleep, so that needs
fixing too.

I'm thinking that rcu_user_enter() will have the exact same problem? Did
you audit that?

Something like the below, combined with a fixup for all callers (which
the compiler will help us find thanks to __must_check).

---

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index de0826411311..612f66c16078 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -95,10 +95,10 @@ static inline void rcu_sysrq_end(void) { }
#endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */

#ifdef CONFIG_NO_HZ_FULL
-void rcu_user_enter(void);
+bool __must_check rcu_user_enter(void);
void rcu_user_exit(void);
#else
-static inline void rcu_user_enter(void) { }
+static inline bool __must_check rcu_user_enter(void) { return true; }
static inline void rcu_user_exit(void) { }
#endif /* CONFIG_NO_HZ_FULL */

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index df578b73960f..9ba0c5d9e99e 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -43,7 +43,7 @@ bool rcu_gp_might_be_stalled(void);
unsigned long get_state_synchronize_rcu(void);
void cond_synchronize_rcu(unsigned long oldstate);

-void rcu_idle_enter(void);
+bool __must_check rcu_idle_enter(void);
void rcu_idle_exit(void);
void rcu_irq_enter(void);
void rcu_irq_exit(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 40e5e3dd253e..13e19e5db0b8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -625,7 +625,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
* the possibility of usermode upcalls having messed up our count
* of interrupt nesting level during the prior busy period.
*/
-static noinstr void rcu_eqs_enter(bool user)
+static noinstr bool rcu_eqs_enter(bool user)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

@@ -636,7 +636,7 @@ static noinstr void rcu_eqs_enter(bool user)
if (rdp->dynticks_nesting != 1) {
// RCU will still be watching, so just do accounting and leave.
rdp->dynticks_nesting--;
- return;
+ return true;
}

lockdep_assert_irqs_disabled();
@@ -644,7 +644,14 @@ static noinstr void rcu_eqs_enter(bool user)
trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rdp = this_cpu_ptr(&rcu_data);
- do_nocb_deferred_wakeup(rdp);
+ if (do_nocb_deferred_wakeup(rdp)) {
+ /*
+ * We did the wakeup, don't enter EQS, we'll need to abort idle
+ * and schedule.
+ */
+ return false;
+ }
+
rcu_prepare_for_idle();
rcu_preempt_deferred_qs(current);

@@ -657,6 +664,8 @@ static noinstr void rcu_eqs_enter(bool user)
rcu_dynticks_eqs_enter();
// ... but is no longer watching here.
rcu_dynticks_task_enter();
+
+ return true;
}

/**
@@ -670,10 +679,10 @@ static noinstr void rcu_eqs_enter(bool user)
* If you add or remove a call to rcu_idle_enter(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
-void rcu_idle_enter(void)
+bool rcu_idle_enter(void)
{
lockdep_assert_irqs_disabled();
- rcu_eqs_enter(false);
+ return rcu_eqs_enter(false);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);

@@ -689,10 +698,10 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
* If you add or remove a call to rcu_user_enter(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
-noinstr void rcu_user_enter(void)
+noinstr bool rcu_user_enter(void)
{
lockdep_assert_irqs_disabled();
- rcu_eqs_enter(true);
+ return rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7708ed161f4a..9226f4021a36 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -433,7 +433,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
static void __init rcu_spawn_nocb_kthreads(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7e291ce0a1d6..8ca41b3fe4f9 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1631,7 +1631,7 @@ bool rcu_is_nocb_cpu(int cpu)
* Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
* and this function releases it.
*/
-static void wake_nocb_gp(struct rcu_data *rdp, bool force,
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
unsigned long flags)
__releases(rdp->nocb_lock)
{
@@ -1654,8 +1654,11 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force,
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
}
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- if (needwake)
+ if (needwake) {
wake_up_process(rdp_gp->nocb_gp_kthread);
+ return true;
+ }
+ return false;
}

/*
@@ -2155,17 +2158,19 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
{
unsigned long flags;
+ bool ret;
int ndw;

rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_nocb_need_deferred_wakeup(rdp)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
+ return false;
}
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+ return ret;
}

/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
@@ -2181,10 +2186,12 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
* This means we do an inexact common-case check. Note that if
* we miss, ->nocb_timer will eventually clean things up.
*/
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
if (rcu_nocb_need_deferred_wakeup(rdp))
- do_nocb_deferred_wakeup_common(rdp);
+ return do_nocb_deferred_wakeup_common(rdp);
+
+ return false;
}

void __init rcu_init_nohz(void)
@@ -2518,8 +2525,9 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
return false;
}

-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
+ return false
}

static void rcu_spawn_cpu_nocb_kthread(int cpu)