Re: [PATCH] rcu/tree: Correctly handle single cpu check in rcu_blocking_is_gp

From: Joel Fernandes
Date: Mon Sep 28 2020 - 19:19:24 EST


On Tue, Sep 22, 2020 at 01:15:57AM +0530, Neeraj Upadhyay wrote:
> Currently, for non-preemptible kernels (CONFIG_PREEMPTION=n),
> rcu_blocking_is_gp() checks (with preemption disabled) whether
> only one CPU is online, using num_online_cpus(). If only a
> single CPU is online, synchronize_rcu() is optimized to return
> without doing all the work of waiting for a grace period.
> However, there are a few issues with the num_online_cpus()
> check used:
>
> 1. num_online_cpus() does an atomic_read(&__num_online_cpus). As
> the hotplug locks are not held, this does not ensure that a new
> incoming CPU's update of the count is visible. This can result
> in a read-side section on the new incoming CPU observing
> updates which should not be visible beyond the grace period
> corresponding to synchronize_rcu().
>
> For example, the litmus test below, where process P0 corresponds
> to synchronize_rcu() and P1 corresponds to the new online CPU,
> has positive witnesses, confirming that a read-side section can
> extend both before and after the grace period, thereby breaking
> the guarantees provided by synchronize_rcu().
>
> {
> 	int x = 0;
> 	atomic_t numonline = ATOMIC_INIT(1);
> }
>
> P0(int *x, atomic_t *numonline)
> {
> 	int r0;
> 	WRITE_ONCE(*x, 1);
> 	r0 = atomic_read(numonline);
> 	if (r0 == 1) {
> 		smp_mb();
> 	} else {
> 		synchronize_rcu();
> 	}
> 	WRITE_ONCE(*x, 2);
> }
>
> P1(int *x, atomic_t *numonline)
> {
> 	int r0; int r1;
>
> 	atomic_inc(numonline);
> 	smp_mb();
> 	rcu_read_lock();
> 	r0 = READ_ONCE(*x);
> 	smp_rmb();
> 	r1 = READ_ONCE(*x);
> 	rcu_read_unlock();
> }
>
> locations [x;numonline;]
>
> exists (1:r0=0 /\ 1:r1=2)

<self rambling>
Right so basically:
P1(int *x, atomic_t *numonline)
{
int r0; int r1;

atomic_inc(numonline);
smp_mb();
rcu_read_lock();
r0 = READ_ONCE(*x);

Happened before:
P0(int *x, atomic_t *numonline)
{
int r0;
WRITE_ONCE(*x, 1);
r0 = atomic_read(numonline);
if (r0 == 1) {
smp_mb();
...
}

However, P0 did not see P1's update of numonline because the atomic_read()
does not provide any ordering.
</self rambling>
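
Just to double-check my own understanding, here is the same litmus test
with P0's plain atomic_read() replaced by a full-barrier RMW, mirroring
the atomic_add_return(0, ...) / atomic_inc() pairing in this patch.
This is my own sketch, not something taken from the patch; I also added
the "C ..." header line (name is made up) so herd7 can run it:

C numonline-full-barrier

{
	int x = 0;
	atomic_t numonline = ATOMIC_INIT(1);
}

P0(int *x, atomic_t *numonline)
{
	int r0;

	WRITE_ONCE(*x, 1);
	r0 = atomic_add_return(0, numonline); /* full barrier both sides */
	if (r0 == 1) {
		smp_mb();
	} else {
		synchronize_rcu();
	}
	WRITE_ONCE(*x, 2);
}

P1(int *x, atomic_t *numonline)
{
	int r0; int r1;

	atomic_inc(numonline);
	smp_mb(); /* the smp_mb__after_atomic() in rcu_cpu_starting() */
	rcu_read_lock();
	r0 = READ_ONCE(*x);
	smp_rmb();
	r1 = READ_ONCE(*x);
	rcu_read_unlock();
}

locations [x;numonline;]

exists (1:r0=0 /\ 1:r1=2)

With the full-barrier RMW on P0, the r0 == 1 case reduces to the usual
store-buffering pattern against P1's inc + smp_mb(), and the r0 != 1
case does a real grace period, so I would expect the witness above to
be forbidden now.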

Btw, it's cool how Paul/you removed the need for memory barriers in
the single-CPU case altogether, by making the update to the
n_online_cpus counter in rcu_state happen from the CPU doing the
onlining and offlining.
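
I have not pasted the -rcu version here, but IIUC the fast path can
then get away with something like the sketch below (rough sketch from
memory, assuming the counter ends up as a plain int in rcu_state that
is only ever written by the CPU being onlined/offlined -- not a quote
of the actual commit):

static int rcu_blocking_is_gp(void)
{
	int ret;

	if (IS_ENABLED(CONFIG_PREEMPTION))
		return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
	might_sleep();  /* Check for RCU read-side critical section. */
	preempt_disable();
	/*
	 * If the counter is 1, there is only one CPU, and that CPU sees
	 * all prior accesses made by any CPU that was online at the time
	 * of its last update of the counter, because the counter is only
	 * ever updated from the incoming/outgoing CPU itself in
	 * rcu_cpu_starting()/rcu_report_dead().
	 */
	ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
	preempt_enable();
	return ret;
}

That avoids any need for barriers on this fast path.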

Reviewed-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
(For the version in -rcu :)).

thanks,

- Joel


> 2. The second problem is that the same early exit from
> synchronize_rcu() does not provide full ordering (i.e. a full
> memory barrier) w.r.t. memory accesses after the
> synchronize_rcu() call.
>
> 3. The third, more important issue is related to the outgoing CPU.
> Checking only __num_online_cpus with preemption disabled isn't
> sufficient for RCU because, on completion of the CPUHP_TEARDOWN_CPU
> stop machine (which clears the outgoing CPU from __num_online_cpus),
> the CPU switches to the idle task. So, checking only
> __num_online_cpus does not account for RCU read-side sections in
> scheduler code (before switching to the idle task) and any potential
> read-side sections in the idle task, before the final RCU-quiesce
> entry into cpuhp_report_idle_dead() -> rcu_report_dead().
>
> To handle these issues, add a new rcu_state member, n_online_cpus,
> to keep track of the current number of online CPUs. The atomic
> updates to this counter from rcu_report_dead() and
> rcu_cpu_starting(), together with the added read/write memory
> ordering semantics, ensure that the synchronize_rcu() fast path
> waits for all read-side sections during which incoming/outgoing
> CPUs are considered online for RCU, i.e. after rcu_cpu_starting()
> and before rcu_report_dead().
>
> Signed-off-by: Neeraj Upadhyay <neeraju@xxxxxxxxxxxxxx>
> ---
>
> Below is the reproducer for the issue described in point 3; this
> snippet is based on a klitmus-generated test, modified to sample
> reads from the idle thread:
>
> static void code0(int* x) {
> 	WRITE_ONCE(*x, 1);
> 	idle_ctr = 0;
> 	smp_mb();
> 	mdelay(10);
> 	WRITE_ONCE(*x, 1);
> 	idle_ctr = 1;
> 	synchronize_rcu();
> 	WRITE_ONCE(*x, 2);
> 	idle_ctr = 2;
> }
>
> static int thread0(void *_p) {
> 	int _j, _i;
> 	ctx_t *_a = (ctx_t *)_p;
>
> 	smp_mb();
> 	for (_j = 0 ; _j < stride ; _j++) {
> 		for (_i = _j ; _i < size ; _i += stride) {
> 			while (idle_wait1) {
> 				cpu_relax();
> 				cond_resched();
> 			}
> 			code0(&_a->x[_i]);
> 			smp_mb();
> 			get_online_cpus();
> 			idle_wait1 = true;
> 			put_online_cpus();
> 		}
> 	}
> 	atomic_inc(&done);
> 	smp_mb();
> 	wake_up(wq);
> 	smp_mb();
> 	do_exit(0);
> }
>
>
> static void code1(int* x, int* out_1_r1, int* out_1_r0) {
> 	int r0; int r1;
>
> 	r0 = READ_ONCE(idle_ctr_snap1);
> 	r1 = READ_ONCE(idle_ctr_snap2);
>
> 	*out_1_r1 = (int)r1;
> 	*out_1_r0 = (int)r0;
> }
>
> static int thread1(void *_p) {
> 	ctx_t *_a = (ctx_t *)_p;
> 	int _j, _i;
>
> 	smp_mb();
> 	for (_j = 0 ; _j < stride ; _j++) {
> 		for (_i = _j ; _i < size ; _i += stride) {
> 			while (idle_wait2) {
> 				cpu_relax();
> 				cond_resched();
> 			}
> 			get_online_cpus();
> 			code1(&_a->x[_i], &_a->out_1_r1[_i], &_a->out_1_r0[_i]);
> 			smp_mb();
> 			idle_wait2 = true;
> 			put_online_cpus();
> 		}
> 	}
> 	atomic_inc(&done);
> 	smp_mb();
> 	wake_up(wq);
> 	smp_mb();
> 	do_exit(0);
> }
>
> Idle thread snippet:
>
> if (cpu_is_offline(cpu)) {
> 	smp_mb();
> 	idle_wait1 = false;
> 	mdelay(8);
> 	smp_mb();
> 	rcu_read_lock();
> 	idle_ctr_snap1 = idle_ctr;
> 	mdelay(40);
> 	smp_rmb();
> 	idle_ctr_snap2 = idle_ctr;
> 	rcu_read_unlock();
> 	smp_mb();
> 	idle_wait2 = false;
> 	tick_nohz_idle_stop_tick();
> 	cpuhp_report_idle_dead();
> 	arch_cpu_idle_dead();
>
> kernel/rcu/tree.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> kernel/rcu/tree.h | 1 +
> 2 files changed, 66 insertions(+)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 2424e2a..33493f0 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3609,9 +3609,59 @@ static int rcu_blocking_is_gp(void)
> if (IS_ENABLED(CONFIG_PREEMPTION))
> return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
> might_sleep(); /* Check for RCU read-side critical section. */
> + /*
> + * a = p
> + * a = NULL
> + * synchronize_rcu()
> + * rcu_blocking_is_gp()
> + * num_online_cpus()
> + * atomic_read(&__num_online_cpus)
> + * kfree(p);
> + *
> + * - VS -
> + *
> + * __cpu_up()
> + * set_cpu_online(cpu)
> + * atomic_inc(&__num_online_cpus)
> + * rcu_read_lock()
> + * rcu_dereference(a) (a == p)
> + * rcu_read_unlock()
> + *
> + * rcu_blocking_is_gp() must observe atomic_inc(&__num_online_cpus),
> + * in order to ensure that an RCU read-side critical section on the
> + * new online cpu either starts after synchronize_rcu()'s GP starts
> + * or completes before synchronize_rcu() returns.
> + *
> + * However, atomic_read(&__num_online_cpus) does not ensure that.
> + *
> + * Essentially, the following condition exists:
> + *
> + * {
> + * int x = 0;
> + * atomic_t numonline = ATOMIC_INIT(1);
> + * }
> + *
> + * P0(int *x, atomic_t *numonline) P1(int *x, atomic_t *numonline)
> + * { {
> + * int r0; int r0; int r1;
> + * WRITE_ONCE(*x, 1); atomic_inc(numonline);
> + * r0 = atomic_read(numonline); rcu_read_lock();
> + * if (r0 == 2) { r0 = READ_ONCE(*x);
> + * synchronize_rcu(); smp_rmb();
> + * } r1 = READ_ONCE(*x);
> + * WRITE_ONCE(*x, 2); rcu_read_unlock();
> + * } }
> + *
> + * exists (1:r0=0 /\ 1:r1=2)
> + *
> + * atomic_add_return(0, &rcu_state.n_online_cpus) and corresponding
> + * atomic_inc(&rcu_state.n_online_cpus) in rcu_cpu_starting() corrects
> + * this ordering issue.
> + */
> preempt_disable();
> ret = num_online_cpus() <= 1;
> preempt_enable();
> + ret = ret && (atomic_add_return(0, &rcu_state.n_online_cpus) <= 1);
> return ret;
> }
>
> @@ -3655,6 +3705,11 @@ void synchronize_rcu(void)
> lock_is_held(&rcu_sched_lock_map),
> "Illegal synchronize_rcu() in RCU read-side critical section");
> if (rcu_blocking_is_gp())
> + /*
> + * atomic_add_return() in rcu_blocking_is_gp() provides
> + * full memory-barrier ordering with any RCU section after
> + * the synchronize_rcu() call.
> + */
> return;
> if (rcu_gp_is_expedited())
> synchronize_rcu_expedited();
> @@ -4086,6 +4141,10 @@ void rcu_cpu_starting(unsigned int cpu)
> mask = rdp->grpmask;
> raw_spin_lock_irqsave_rcu_node(rnp, flags);
> WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
> + /* Order with access of n_online_cpus in rcu_blocking_is_gp */
> + atomic_inc(&rcu_state.n_online_cpus);
> + /* Order with rcu-side usages after this */
> + smp_mb__after_atomic();
> newcpu = !(rnp->expmaskinitnext & mask);
> rnp->expmaskinitnext |= mask;
> /* Allow lockless access for expedited grace periods. */
> @@ -4138,6 +4197,12 @@ void rcu_report_dead(unsigned int cpu)
> raw_spin_lock_irqsave_rcu_node(rnp, flags);
> }
> WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
> + /*
> + * Order with access of n_online_cpus in rcu_blocking_is_gp().
> + * Release semantics ensure that RCU read-side sections before it
> + * are observed by rcu_blocking_is_gp().
> + */
> + atomic_dec_return_release(&rcu_state.n_online_cpus);
> raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
> raw_spin_unlock(&rcu_state.ofl_lock);
>
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index e4f66b8..4d9a9c0 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -298,6 +298,7 @@ struct rcu_state {
> /* Hierarchy levels (+1 to */
> /* shut bogus gcc warning) */
> int ncpus; /* # CPUs seen so far. */
> + atomic_t n_online_cpus; /* # CPUs online for RCU. */