[PATCH tip/core/rcu 5/5] rcu: Maintain special bits at bottom of ->dynticks counter

From: Paul E. McKenney
Date: Mon Nov 14 2016 - 13:30:35 EST


Currently, IPIs are used to force other CPUs to invalidate their TLBs
in response to a kernel virtual-memory mapping change. This works, but
degrades both battery lifetime (for idle CPUs) and real-time response
(for nohz_full CPUs), and in addition results in unnecessary IPIs due to
the fact that CPUs executing in usermode are unaffected by stale kernel
mappings. It would be better to cause a CPU executing in usermode to
wait until it is entering kernel mode to do the flush, first to avoid
interrupting usermode tasks and second to handle multiple flush requests
with a single flush in the case of a long-running user task.

This commit therefore reserves a bit at the bottom of the ->dynticks
counter, which is checked upon exit from extended quiescent states.
If it is set, a new rcu_eqs_special_exit() macro is invoked and the bit
is then cleared. If this macro is not supplied, it defaults to an empty
single-pass do-while loop.
If this bottom bit is set on -entry- to an extended quiescent state,
then a WARN_ON_ONCE() triggers.

This bottom bit may be set using a new rcu_eqs_special_set() function,
which returns true if the bit was set, or false if the CPU turned
out to not be in an extended quiescent state. Please note that this
function refuses to set the bit for a non-nohz_full CPU when that CPU
is executing in usermode because usermode execution is tracked by RCU
as a dyntick-idle extended quiescent state only for nohz_full CPUs.

Reported-by: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
---
include/linux/rcutiny.h | 5 +++
kernel/rcu/tree.c | 81 +++++++++++++++++++++++++++++++++++++------------
kernel/rcu/tree.h | 1 +
3 files changed, 67 insertions(+), 20 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 4f9b2fa2173d..7232d199a81c 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
return 0;
}

+static inline bool rcu_eqs_special_set(int cpu)
+{
+ return false; /* Never flag non-existent other CPUs! */
+}
+
static inline unsigned long get_state_synchronize_rcu(void)
{
return 0;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c2b2f5b591b7..2c399db6df6e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -269,9 +269,19 @@ void rcu_bh_qs(void)

static DEFINE_PER_CPU(int, rcu_sched_qs_mask);

+/*
+ * Steal a bit from the bottom of ->dynticks for idle entry/exit
+ * control. Initially this is for TLB flushing.
+ */
+#define RCU_DYNTICK_CTRL_MASK 0x1
+#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
+#ifndef rcu_eqs_special_exit
+#define rcu_eqs_special_exit() do { } while (0)
+#endif
+
static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
- .dynticks = ATOMIC_INIT(1),
+ .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
.dynticks_idle = ATOMIC_INIT(1),
@@ -285,17 +295,20 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
static void rcu_dynticks_eqs_enter(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int seq;

/*
- * CPUs seeing atomic_inc() must see prior RCU read-side critical
- * sections, and we also must force ordering with the next idle
- * sojourn.
+ * CPUs seeing atomic_add_return() must see prior RCU read-side
+ * critical sections, and we also must force ordering with the
+ * next idle sojourn.
*/
- smp_mb__before_atomic(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic(); /* See above. */
+ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
+ /* Better be in an extended quiescent state! */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (seq & RCU_DYNTICK_CTRL_CTR));
+ /* Better not have special action (TLB flush) pending! */
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- atomic_read(&rdtp->dynticks) & 0x1);
+ (seq & RCU_DYNTICK_CTRL_MASK));
}

/*
@@ -305,17 +318,22 @@ static void rcu_dynticks_eqs_enter(void)
static void rcu_dynticks_eqs_exit(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int seq;

/*
- * CPUs seeing atomic_inc() must see prior idle sojourns,
+ * CPUs seeing atomic_add_return() must see prior idle sojourns,
* and we also must force ordering with the next RCU read-side
* critical section.
*/
- smp_mb__before_atomic(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic(); /* See above. */
+ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !(atomic_read(&rdtp->dynticks) & 0x1));
+ !(seq & RCU_DYNTICK_CTRL_CTR));
+ if (seq & RCU_DYNTICK_CTRL_MASK) {
+ rcu_eqs_special_exit();
+ /* Prefer duplicate flushes to losing a flush. */
+ smp_mb__before_atomic(); /* NMI safety. */
+ atomic_and(~RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks);
+ }
}

/*
@@ -332,9 +350,9 @@ static void rcu_dynticks_eqs_online(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

- if (atomic_read(&rdtp->dynticks) & 0x1)
+ if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR)
return;
- atomic_add(0x1, &rdtp->dynticks);
+ atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
}

/*
@@ -346,7 +364,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

- return !(atomic_read(&rdtp->dynticks) & 0x1);
+ return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR);
}

/*
@@ -357,7 +375,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
{
int snap = atomic_add_return(0, &rdtp->dynticks);

- return snap;
+ return snap & ~RCU_DYNTICK_CTRL_MASK;
}

/*
@@ -366,7 +384,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & 0x1);
+ return !(snap & RCU_DYNTICK_CTRL_CTR);
}

/*
@@ -386,10 +404,33 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
static void rcu_dynticks_momentary_idle(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- int special = atomic_add_return(2, &rdtp->dynticks);
+ int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
+ &rdtp->dynticks);

/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(special & 0x1));
+ WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
+}
+
+/*
+ * Set the special (bottom) bit of the specified CPU so that it
+ * will take special action (such as flushing its TLB) on the
+ * next exit from an extended quiescent state. Returns true if
+ * the bit was successfully set, or false if the CPU was not in
+ * an extended quiescent state.
+ */
+bool rcu_eqs_special_set(int cpu)
+{
+ int old;
+ int new;
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ do {
+ old = atomic_read(&rdtp->dynticks);
+ if (old & RCU_DYNTICK_CTRL_CTR)
+ return false;
+ new = old | RCU_DYNTICK_CTRL_MASK;
+ } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old);
+ return true;
}

DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 3b953dcf6afc..7dcdd59d894c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -596,6 +596,7 @@ extern struct rcu_state rcu_preempt_state;
#endif /* #ifdef CONFIG_PREEMPT_RCU */

int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
+bool rcu_eqs_special_set(int cpu);

#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
--
2.5.2