[PATCH v3 3/4] rcu/nocb: Protect NOCB state via local_lock() under PREEMPT_RT

From: Valentin Schneider
Date: Wed Aug 11 2021 - 16:14:30 EST


Warning
=======

Running v5.13-rt1 on my arm64 Juno board triggers:

[ 0.156302] =============================
[ 0.160416] WARNING: suspicious RCU usage
[ 0.164529] 5.13.0-rt1 #20 Not tainted
[ 0.168300] -----------------------------
[ 0.172409] kernel/rcu/tree_plugin.h:69 Unsafe read of RCU_NOCB offloaded state!
[ 0.179920]
[ 0.179920] other info that might help us debug this:
[ 0.179920]
[ 0.188037]
[ 0.188037] rcu_scheduler_active = 1, debug_locks = 1
[ 0.194677] 3 locks held by rcuc/0/11:
[ 0.198448] #0: ffff00097ef10cf8 ((softirq_ctrl.lock).lock){+.+.}-{2:2}, at: __local_bh_disable_ip (./include/linux/rcupdate.h:662 kernel/softirq.c:171)
[ 0.208709] #1: ffff80001205e5f0 (rcu_read_lock){....}-{1:2}, at: rt_spin_lock (kernel/locking/spinlock_rt.c:43 (discriminator 4))
[ 0.217134] #2: ffff80001205e5f0 (rcu_read_lock){....}-{1:2}, at: __local_bh_disable_ip (kernel/softirq.c:169)
[ 0.226428]
[ 0.226428] stack backtrace:
[ 0.230889] CPU: 0 PID: 11 Comm: rcuc/0 Not tainted 5.13.0-rt1 #20
[ 0.237100] Hardware name: ARM Juno development board (r0) (DT)
[ 0.243041] Call trace:
[ 0.245497] dump_backtrace (arch/arm64/kernel/stacktrace.c:163)
[ 0.249185] show_stack (arch/arm64/kernel/stacktrace.c:219)
[ 0.252522] dump_stack (lib/dump_stack.c:122)
[ 0.255947] lockdep_rcu_suspicious (kernel/locking/lockdep.c:6439)
[ 0.260328] rcu_rdp_is_offloaded (kernel/rcu/tree_plugin.h:69 kernel/rcu/tree_plugin.h:58)
[ 0.264537] rcu_core (kernel/rcu/tree.c:2332 kernel/rcu/tree.c:2398 kernel/rcu/tree.c:2777)
[ 0.267786] rcu_cpu_kthread (./include/linux/bottom_half.h:32 kernel/rcu/tree.c:2876)
[ 0.271644] smpboot_thread_fn (kernel/smpboot.c:165 (discriminator 3))
[ 0.275767] kthread (kernel/kthread.c:321)
[ 0.279013] ret_from_fork (arch/arm64/kernel/entry.S:1005)

In this case, this is the RCU core kthread accessing the local CPU's
rdp. Before that, rcu_cpu_kthread() invokes local_bh_disable().

Under !CONFIG_PREEMPT_RT (and rcutree.use_softirq=0), this ends up
incrementing the preempt_count, which satisfies the "local non-preemptible
read" of rcu_rdp_is_offloaded().

Under CONFIG_PREEMPT_RT however, this becomes

local_lock(&softirq_ctrl.lock)

which, under the same config, is migrate_disable() + rt_spin_lock(). As
pointed out by Frederic, this is not sufficient to safely access an rdp's
offload state, as the RCU core kthread can be preempted by a kworker
executing rcu_nocb_rdp_offload() [1].

Introduce a local_lock to serialize an rdp's offload state while the rdp's
associated core kthread is executing rcu_core().

rcu_core() preemptability considerations
========================================

As pointed out by Paul [2], keeping rcu_check_quiescent_state() preemptible
(which is the case under CONFIG_PREEMPT_RT) requires some consideration.

note_gp_changes() itself runs with irqs off, and enters
__note_gp_changes() with rnp->lock held (raw_spinlock), thus is safe vs
preemption.

rdp->core_needs_qs *could* change after being read by the RCU core
kthread if it then gets preempted. Consider, with
CONFIG_RCU_STRICT_GRACE_PERIOD:

rcuc/x task_foo

rcu_check_quiescent_state()
`\
rdp->core_needs_qs == true
<PREEMPT>
rcu_read_unlock()
`\
rcu_preempt_deferred_qs_irqrestore()
`\
rcu_report_qs_rdp()
`\
rdp->core_needs_qs := false;

This would let rcuc/x's rcu_check_quiescent_state() proceed further down to
rcu_report_qs_rdp(), but if task_foo's earlier rcu_report_qs_rdp()
invocation would have cleared the rdp grpmask from the rnp mask, so
rcuc/x's invocation would simply bail.

Since rcu_report_qs_rdp() can be safely invoked, even if rdp->core_needs_qs
changed, it appears safe to keep rcu_check_quiescent_state() preemptible.

[1]: http://lore.kernel.org/r/20210727230814.GC283787@lothringen
[2]: http://lore.kernel.org/r/20210729010445.GO4397@paulmck-ThinkPad-P17-Gen-1
Signed-off-by: Valentin Schneider <valentin.schneider@xxxxxxx>
---
kernel/rcu/tree.c | 4 ++
kernel/rcu/tree.h | 4 ++
kernel/rcu/tree_plugin.h | 82 +++++++++++++++++++++++++++++++++++-----
3 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51f24ecd94b2..caadfec994f3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -87,6 +87,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
#ifdef CONFIG_RCU_NOCB_CPU
.cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
+ .nocb_local_lock = INIT_LOCAL_LOCK(nocb_local_lock),
#endif
};
static struct rcu_state rcu_state = {
@@ -2853,10 +2854,12 @@ static void rcu_cpu_kthread(unsigned int cpu)
{
unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
int spincnt;

trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
for (spincnt = 0; spincnt < 10; spincnt++) {
+ rcu_nocb_local_lock(rdp);
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
@@ -2866,6 +2869,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
if (work)
rcu_core();
local_bh_enable();
+ rcu_nocb_local_unlock(rdp);
if (*workp == 0) {
trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
*statusp = RCU_KTHREAD_WAITING;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 305cf6aeb408..aa6831255fec 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -210,6 +210,8 @@ struct rcu_data {
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */

+ local_lock_t nocb_local_lock;
+
/* The following fields are used by call_rcu, hence own cacheline. */
raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */
@@ -445,6 +447,8 @@ static void rcu_nocb_unlock(struct rcu_data *rdp);
static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
unsigned long flags);
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
+static void rcu_nocb_local_lock(struct rcu_data *rdp);
+static void rcu_nocb_local_unlock(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
#define rcu_nocb_lock_irqsave(rdp, flags) \
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0ff5e4fb933e..11c4ff00afde 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -21,6 +21,17 @@ static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
return lockdep_is_held(&rdp->nocb_lock);
}

+static inline int rcu_lockdep_is_held_nocb_local(struct rcu_data *rdp)
+{
+ return lockdep_is_held(
+#ifdef CONFIG_PREEMPT_RT
+ &rdp->nocb_local_lock.lock
+#else
+ &rdp->nocb_local_lock
+#endif
+ );
+}
+
static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
{
/* Race on early boot between thread creation and assignment */
@@ -38,7 +49,10 @@ static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
{
return 0;
}
-
+static inline int rcu_lockdep_is_held_nocb_local(struct rcu_data *rdp)
+{
+ return 0;
+}
static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
{
return false;
@@ -46,23 +60,44 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)

#endif /* #ifdef CONFIG_RCU_NOCB_CPU */

+/*
+ * Is a local read of the rdp's offloaded state safe and stable?
+ * See rcu_nocb_local_lock() & family.
+ */
+static inline bool rcu_local_offload_access_safe(struct rcu_data *rdp)
+{
+ if (!preemptible())
+ return true;
+
+ if (!migratable()) {
+ if (!IS_ENABLED(CONFIG_RCU_NOCB))
+ return true;
+
+ return rcu_lockdep_is_held_nocb_local(rdp);
+ }
+
+ return false;
+}
+
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
{
/*
- * In order to read the offloaded state of an rdp is a safe
- * and stable way and prevent from its value to be changed
- * under us, we must either hold the barrier mutex, the cpu
- * hotplug lock (read or write) or the nocb lock. Local
- * non-preemptible reads are also safe. NOCB kthreads and
- * timers have their own means of synchronization against the
- * offloaded state updaters.
+ * In order to read the offloaded state of an rdp is a safe and stable
+ * way and prevent from its value to be changed under us, we must either...
*/
RCU_LOCKDEP_WARN(
+ // ...hold the barrier mutex...
!(lockdep_is_held(&rcu_state.barrier_mutex) ||
+ // ... the cpu hotplug lock (read or write)...
(IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
+ // ... or the NOCB lock.
rcu_lockdep_is_held_nocb(rdp) ||
+ // Local reads still require the local state to remain stable
+ // (preemption disabled / local lock held)
(rdp == this_cpu_ptr(&rcu_data) &&
- !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
+ rcu_local_offload_access_safe(rdp)) ||
+ // NOCB kthreads and timers have their own means of synchronization
+ // against the offloaded state updaters.
rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
);
@@ -1629,6 +1664,22 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
}
}

+/*
+ * The invocation of rcu_core() within the RCU core kthreads remains preemptible
+ * under PREEMPT_RT, thus the offload state of a CPU could change while
+ * said kthreads are preempted. Prevent this from happening by protecting the
+ * offload state with a local_lock().
+ */
+static void rcu_nocb_local_lock(struct rcu_data *rdp)
+{
+ local_lock(&rcu_data.nocb_local_lock);
+}
+
+static void rcu_nocb_local_unlock(struct rcu_data *rdp)
+{
+ local_unlock(&rcu_data.nocb_local_lock);
+}
+
/* Lockdep check that ->cblist may be safely accessed. */
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
{
@@ -2396,6 +2447,7 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
if (rdp->nocb_cb_sleep)
rdp->nocb_cb_sleep = false;
rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_local_unlock(rdp);

/*
* Ignore former value of nocb_cb_sleep and force wake up as it could
@@ -2427,6 +2479,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)

pr_info("De-offloading %d\n", rdp->cpu);

+ rcu_nocb_local_lock(rdp);
rcu_nocb_lock_irqsave(rdp, flags);
/*
* Flush once and for all now. This suffices because we are
@@ -2509,6 +2562,7 @@ static long rcu_nocb_rdp_offload(void *arg)
* Can't use rcu_nocb_lock_irqsave() while we are in
* SEGCBLIST_SOFTIRQ_ONLY mode.
*/
+ rcu_nocb_local_lock(rdp);
raw_spin_lock_irqsave(&rdp->nocb_lock, flags);

/*
@@ -2868,6 +2922,16 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
local_irq_restore(flags);
}

+/* No ->nocb_local_lock to acquire. */
+static void rcu_nocb_local_lock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_local_lock to release. */
+static void rcu_nocb_local_unlock(struct rcu_data *rdp)
+{
+}
+
/* Lockdep check that ->cblist may be safely accessed. */
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
{
--
2.25.1