Re: [PATCH tip/core/rcu 02/18] rcu: Move rcu_report_exp_rnp() to allow consolidation

From: Paul E. McKenney
Date: Wed Oct 07 2015 - 11:18:38 EST


On Wed, Oct 07, 2015 at 01:50:46PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 07, 2015 at 01:01:20PM +0200, Peter Zijlstra wrote:
>
> > That again doesn't explain which UNLOCKs with non-matching lock values
> > it pairs with and what particular ordering is important here.
>
> So after staring at that stuff for a while I came up with the following.
> Does this make sense, or am I completely misunderstanding things?
>
> Not been near a compiler.

Actually, this would be quite good. "Premature abstraction is the
root of all evil" and all that, but this abstraction is anything but
premature. My thought would be to apply it against commit cd58087c9cee
("Merge branches 'doc.2015.10.06a', 'percpu-rwsem.2015.10.06a' and
'torture.2015.10.06a' into HEAD") in -rcu, given the merge conflicts
that would otherwise arise.
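
For example, with these wrappers a typical irqsave call site would end
up looking something like the following (an illustrative sketch only,
mirroring the hunks below rather than adding anything new):

	unsigned long flags;

	raw_spin_lock_irqsave_rcu_node(rnp, flags);	/* ->lock, then smp_mb__after_unlock_lock(). */
	/* ... update the rcu_node structure's fields while holding ->lock ... */
	raw_spin_unlock_irqrestore(&rnp->lock, flags);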

Thanx, Paul

> ---
> kernel/rcu/tree.c | 99 ++++++++++++++++++++++++++++++++++---------------------
> 1 file changed, 61 insertions(+), 38 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 775d36cc0050..46e1e23ff762 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -1460,6 +1460,48 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
> }
>
> /*
> + * Wrappers for the rcu_node::lock acquire.
> + *
> + * Because the rcu_nodes form a tree, the tree traversal locking will observe
> + * different lock values; this in turn means that an UNLOCK of one level
> + * followed by a LOCK of another level does not imply a full memory barrier,
> + * and most importantly transitivity is lost.
> + *
> + * In order to restore full ordering between tree levels, augment the regular
> + * lock acquire functions with smp_mb__after_unlock_lock().
> + */
> +static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
> +{
> + raw_spin_lock(&rnp->lock);
> + smp_mb__after_unlock_lock();
> +}
> +
> +static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
> +{
> + raw_spin_lock_irq(&rnp->lock);
> + smp_mb__after_unlock_lock();
> +}
> +
> +static inline void
> +_raw_spin_lock_irqsave_rcu_node(struct rcu_node *rnp, unsigned long *flags)
> +{
> + raw_spin_lock_irqsave(&rnp->lock, *flags);
> + smp_mb__after_unlock_lock();
> +}
> +
> +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
> + _raw_spin_lock_irqsave_rcu_node((rnp), &(flags))
> +
> +static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
> +{
> + bool locked = raw_spin_trylock(&rnp->lock);
> + if (locked)
> + smp_mb__after_unlock_lock();
> + return locked;
> +}
> +
> +
> +/*
> * Start some future grace period, as needed to handle newly arrived
> * callbacks. The required future grace periods are recorded in each
> * rcu_node structure's ->need_future_gp field. Returns true if there
> @@ -1512,10 +1554,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
> * hold it, acquire the root rcu_node structure's lock in order to
> * start one (if needed).
> */
> - if (rnp != rnp_root) {
> - raw_spin_lock(&rnp_root->lock);
> - smp_mb__after_unlock_lock();
> - }
> + if (rnp != rnp_root)
> + raw_spin_lock_rcu_node(rnp_root);
>
> /*
> * Get a new grace-period number. If there really is no grace
> @@ -1764,11 +1804,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
> if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
> rdp->completed == READ_ONCE(rnp->completed) &&
> !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
> - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
> + !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
> local_irq_restore(flags);
> return;
> }
> - smp_mb__after_unlock_lock();
> needwake = __note_gp_changes(rsp, rnp, rdp);
> raw_spin_unlock_irqrestore(&rnp->lock, flags);
> if (needwake)
> @@ -1792,8 +1831,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
> struct rcu_node *rnp = rcu_get_root(rsp);
>
> WRITE_ONCE(rsp->gp_activity, jiffies);
> - raw_spin_lock_irq(&rnp->lock);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_irq_rcu_node(rnp);
> if (!READ_ONCE(rsp->gp_flags)) {
> /* Spurious wakeup, tell caller to go back to sleep. */
> raw_spin_unlock_irq(&rnp->lock);
> @@ -1825,8 +1863,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
> */
> rcu_for_each_leaf_node(rsp, rnp) {
> rcu_gp_slow(rsp, gp_preinit_delay);
> - raw_spin_lock_irq(&rnp->lock);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_irq_rcu_node(rnp);
> if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
> !rnp->wait_blkd_tasks) {
> /* Nothing to do on this leaf rcu_node structure. */
> @@ -1882,8 +1919,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
> */
> rcu_for_each_node_breadth_first(rsp, rnp) {
> rcu_gp_slow(rsp, gp_init_delay);
> - raw_spin_lock_irq(&rnp->lock);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_irq_rcu_node(rnp);
> rdp = this_cpu_ptr(rsp->rda);
> rcu_preempt_check_blocked_tasks(rnp);
> rnp->qsmask = rnp->qsmaskinit;
> @@ -1953,8 +1989,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
> }
> /* Clear flag to prevent immediate re-entry. */
> if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
> - raw_spin_lock_irq(&rnp->lock);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_irq_rcu_node(rnp);
> WRITE_ONCE(rsp->gp_flags,
> READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
> raw_spin_unlock_irq(&rnp->lock);
> @@ -1974,8 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
> struct rcu_node *rnp = rcu_get_root(rsp);
>
> WRITE_ONCE(rsp->gp_activity, jiffies);
> - raw_spin_lock_irq(&rnp->lock);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_irq_rcu_node(rnp);
> gp_duration = jiffies - rsp->gp_start;
> if (gp_duration > rsp->gp_max)
> rsp->gp_max = gp_duration;
> @@ -2000,8 +2034,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
> * grace period is recorded in any of the rcu_node structures.
> */
> rcu_for_each_node_breadth_first(rsp, rnp) {
> - raw_spin_lock_irq(&rnp->lock);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_irq_rcu_node(rnp);
> WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
> WARN_ON_ONCE(rnp->qsmask);
> WRITE_ONCE(rnp->completed, rsp->gpnum);
> @@ -2016,8 +2049,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
> rcu_gp_slow(rsp, gp_cleanup_delay);
> }
> rnp = rcu_get_root(rsp);
> - raw_spin_lock_irq(&rnp->lock);
> - smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
> + raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
> rcu_nocb_gp_set(rnp, nocb);
>
> /* Declare grace period done. */
> @@ -2264,8 +2296,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
> raw_spin_unlock_irqrestore(&rnp->lock, flags);
> rnp_c = rnp;
> rnp = rnp->parent;
> - raw_spin_lock_irqsave(&rnp->lock, flags);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_irqsave_rcu_node(rnp, flags);
> oldmask = rnp_c->qsmask;
> }
>
> @@ -2312,8 +2343,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
> gps = rnp->gpnum;
> mask = rnp->grpmask;
> raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
> rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
> }
>
> @@ -2335,8 +2365,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
> struct rcu_node *rnp;
>
> rnp = rdp->mynode;
> - raw_spin_lock_irqsave(&rnp->lock, flags);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_irqsave_rcu_node(rnp, flags);
> if ((rdp->passed_quiesce == 0 &&
> rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
> rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
> @@ -2562,8 +2591,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
> rnp = rnp->parent;
> if (!rnp)
> break;
> - raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> - smp_mb__after_unlock_lock(); /* GP memory ordering. */
> + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
> rnp->qsmaskinit &= ~mask;
> rnp->qsmask &= ~mask;
> if (rnp->qsmaskinit) {
> @@ -2591,8 +2619,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
>
> /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
> mask = rdp->grpmask;
> - raw_spin_lock_irqsave(&rnp->lock, flags);
> - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
> + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
> rnp->qsmaskinitnext &= ~mask;
> raw_spin_unlock_irqrestore(&rnp->lock, flags);
> }
> @@ -2789,8 +2816,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
> rcu_for_each_leaf_node(rsp, rnp) {
> cond_resched_rcu_qs();
> mask = 0;
> - raw_spin_lock_irqsave(&rnp->lock, flags);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_irqsave_rcu_node(rnp, flags);
> if (rnp->qsmask == 0) {
> if (rcu_state_p == &rcu_sched_state ||
> rsp != rcu_state_p ||
> @@ -2861,8 +2887,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
> /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
>
> /* Reached the root of the rcu_node tree, acquire lock. */
> - raw_spin_lock_irqsave(&rnp_old->lock, flags);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
> raw_spin_unlock(&rnp_old->fqslock);
> if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
> rsp->n_force_qs_lh++;
> @@ -2985,8 +3010,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
> if (!rcu_gp_in_progress(rsp)) {
> struct rcu_node *rnp_root = rcu_get_root(rsp);
>
> - raw_spin_lock(&rnp_root->lock);
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_rcu_node(rnp_root);
> needwake = rcu_start_gp(rsp);
> raw_spin_unlock(&rnp_root->lock);
> if (needwake)
> @@ -3925,8 +3949,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
> */
> rnp = rdp->mynode;
> mask = rdp->grpmask;
> - raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> - smp_mb__after_unlock_lock();
> + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
> rnp->qsmaskinitnext |= mask;
> rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
> rdp->completed = rnp->completed;
>
