[PATCH 2/2] srcu: implement Peter's checking algorithm

From: Lai Jiangshan
Date: Mon Feb 27 2012 - 01:22:47 EST


This patch implement the algorithm as Peter's:
https://lkml.org/lkml/2012/2/1/119

o Make the checking lock-free and we can perform parallel checking,
Although almost parallel checking makes no sense, but we need it
when 1) the original checking task is preempted for long, 2)
sychronize_srcu_expedited(), 3) avoid lock(see next)

o Since it is lock-free, we save a mutex in state machine for
call_srcu().

o Remove the SRCU_REF_MASK and remove the coupling with the flipping.
(so we can remove the preempt_disable() in future, but use
__this_cpu_inc() instead.)

o reduce a smp_mb(), simplify the comments and make the smp_mb() pairs
more intuitive.

Inspired-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Lai Jiangshan <laijs@xxxxxxxxxxxxxx>
---
include/linux/srcu.h | 7 +--
kernel/srcu.c | 137 ++++++++++++++++++++-----------------------------
2 files changed, 57 insertions(+), 87 deletions(-)

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 5b49d41..15354db 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -32,18 +32,13 @@

struct srcu_struct_array {
unsigned long c[2];
+ unsigned long seq[2];
};

-/* Bit definitions for field ->c above and ->snap below. */
-#define SRCU_USAGE_BITS 1
-#define SRCU_REF_MASK (ULONG_MAX >> SRCU_USAGE_BITS)
-#define SRCU_USAGE_COUNT (SRCU_REF_MASK + 1)
-
struct srcu_struct {
unsigned completed;
struct srcu_struct_array __percpu *per_cpu_ref;
struct mutex mutex;
- unsigned long snap[NR_CPUS];
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 47ee35d..376b583 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -73,10 +73,25 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
+ * Returns approximate total sequence of readers on the specified rank
+ * of per-CPU counters.
+ */
+static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+ unsigned long t;
+
+ for_each_possible_cpu(cpu) {
+ t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+ sum += t;
+ }
+ return sum;
+}
+
+/*
* Returns approximate number of readers active on the specified rank
- * of per-CPU counters. Also snapshots each counter's value in the
- * corresponding element of sp->snap[] for later use validating
- * the sum.
+ * of per-CPU counters.
*/
static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
{
@@ -87,26 +102,36 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
for_each_possible_cpu(cpu) {
t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
sum += t;
- sp->snap[cpu] = t;
}
- return sum & SRCU_REF_MASK;
+ return sum;
}

-/*
- * To be called from the update side after an index flip. Returns true
- * if the modulo sum of the counters is stably zero, false if there is
- * some possibility of non-zero.
- */
static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
{
int cpu;
+ unsigned long seq;
+
+ seq = srcu_readers_seq_idx(sp, idx);
+
+ /*
+ * smp_mb() A pairs with smp_mb() B for critical section.
+ * It ensures that the SRCU read-side critical section whose
+ * read-lock is not seen by the following srcu_readers_active_idx()
+ * will see any updates that before the current task performed before.
+ * (So we don't need to care these readers this time)
+ *
+ * Also, if we see the increment of the seq, we must see the
+ * increment of the active counter in the following
+ * srcu_readers_active_idx().
+ */
+ smp_mb(); /* A */

/*
* Note that srcu_readers_active_idx() can incorrectly return
* zero even though there is a pre-existing reader throughout.
* To see this, suppose that task A is in a very long SRCU
* read-side critical section that started on CPU 0, and that
- * no other reader exists, so that the modulo sum of the counters
+ * no other reader exists, so that the sum of the counters
* is equal to one. Then suppose that task B starts executing
* srcu_readers_active_idx(), summing up to CPU 1, and then that
* task C starts reading on CPU 0, so that its increment is not
@@ -122,53 +147,26 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
return false;

/*
- * Since the caller recently flipped ->completed, we can see at
- * most one increment of each CPU's counter from this point
- * forward. The reason for this is that the reader CPU must have
- * fetched the index before srcu_readers_active_idx checked
- * that CPU's counter, but not yet incremented its counter.
- * Its eventual counter increment will follow the read in
- * srcu_readers_active_idx(), and that increment is immediately
- * followed by smp_mb() B. Because smp_mb() D is between
- * the ->completed flip and srcu_readers_active_idx()'s read,
- * that CPU's subsequent load of ->completed must see the new
- * value, and therefore increment the counter in the other rank.
- */
- smp_mb(); /* A */
-
- /*
- * Now, we check the ->snap array that srcu_readers_active_idx()
- * filled in from the per-CPU counter values. Since
- * __srcu_read_lock() increments the upper bits of the per-CPU
- * counter, an increment/decrement pair will change the value
- * of the counter. Since there is only one possible increment,
- * the only way to wrap the counter is to have a huge number of
- * counter decrements, which requires a huge number of tasks and
- * huge SRCU read-side critical-section nesting levels, even on
- * 32-bit systems.
- *
- * All of the ways of confusing the readings require that the scan
- * in srcu_readers_active_idx() see the read-side task's decrement,
- * but not its increment. However, between that decrement and
- * increment are smb_mb() B and C. Either or both of these pair
- * with smp_mb() A above to ensure that the scan below will see
- * the read-side tasks's increment, thus noting a difference in
- * the counter values between the two passes.
+ * Validation step, smp_mb() D pairs with smp_mb() C. If the above
+ * srcu_readers_active_idx() see a decrement of the active counter
+ * in srcu_read_unlock(), it should see one of these for corresponding
+ * srcu_read_lock():
+ * See the increment of the active counter,
+ * Failed to see the increment of the active counter.
+ * The second one can cause srcu_readers_active_idx() incorrectly
+ * return zero, but it means the above srcu_readers_seq_idx() does not
+ * see the increment of the seq(ref: comments of smp_mb() A),
+ * and the following srcu_readers_seq_idx() sees the increment of
+ * the seq. The seq is changed.
*
- * Therefore, if srcu_readers_active_idx() returned zero, and
- * none of the counters changed, we know that the zero was the
- * correct sum.
- *
- * Of course, it is possible that a task might be delayed
- * for a very long time in __srcu_read_lock() after fetching
- * the index but before incrementing its counter. This
- * possibility will be dealt with in __synchronize_srcu().
+ * This smp_mb() D pairs with smp_mb() C for critical section.
+ * then any of the current task's subsequent code will happen after
+ * that SRCU read-side critical section whose read-unlock is seen in
+ * srcu_readers_active_idx().
*/
- for_each_possible_cpu(cpu)
- if (sp->snap[cpu] !=
- ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]))
- return false; /* False zero reading! */
- return true;
+ smp_mb(); /* D */
+
+ return srcu_readers_seq_idx(sp, idx) == seq;
}

/**
@@ -216,9 +214,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
preempt_disable();
idx = rcu_dereference_index_check(sp->completed,
rcu_read_lock_sched_held()) & 0x1;
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) +=
- SRCU_USAGE_COUNT + 1;
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
smp_mb(); /* B */ /* Avoid leaking the critical section. */
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
preempt_enable();
return idx;
}
@@ -258,17 +256,6 @@ static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
int trycount = 0;

/*
- * If a reader fetches the index before the ->completed increment,
- * but increments its counter after srcu_readers_active_idx_check()
- * sums it, then smp_mb() D will pair with __srcu_read_lock()'s
- * smp_mb() B to ensure that the SRCU read-side critical section
- * will see any updates that the current task performed before its
- * call to synchronize_srcu(), or to synchronize_srcu_expedited(),
- * as the case may be.
- */
- smp_mb(); /* D */
-
- /*
* SRCU read-side critical sections are normally short, so wait
* a small amount of time before possibly blocking.
*/
@@ -281,18 +268,6 @@ static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
schedule_timeout_interruptible(1);
}
}
-
- /*
- * The following smp_mb() E pairs with srcu_read_unlock()'s
- * smp_mb C to ensure that if srcu_readers_active_idx_check()
- * sees srcu_read_unlock()'s counter decrement, then any
- * of the current task's subsequent code will happen after
- * that SRCU read-side critical section.
- *
- * It also ensures the order between the above waiting and
- * the next flipping.
- */
- smp_mb(); /* E */
}

static void srcu_flip(struct srcu_struct *sp)
--
1.7.4.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/