Re: [RFC][PATCH] fs: optimize inotify/fsnotify code for unwatched files

From: Paul E. McKenney
Date: Mon Jun 22 2015 - 20:26:34 EST


On Mon, Jun 22, 2015 at 11:50:50AM -0700, Dave Hansen wrote:
> On 06/22/2015 08:11 AM, Paul E. McKenney wrote:
> > But if Dave is willing to test it, I would be happy to send along
> > a fast-readers patch, easy to do.
>
> I'm always willing to test, but the cost of the srcu_read_lock() barrier
> shows up even on my 2-year-old "Intel(R) Core(TM) i5-3320M CPU @
> 2.60GHz" laptop. The numbers I shared in this thread are on a newer CPU
> than that, so I'm fairly confident this will show up on just about any
> (big core) Intel CPU newer than SandyBridge.
>
> The tests I've been running are:
>
> https://github.com/antonblanchard/will-it-scale
>
> with two new 1-byte read/write tests copied in to "tests/":
>
> https://www.sr71.net/~dave/intel/read1byte.c
> https://www.sr71.net/~dave/intel/write1byte.c
>
> The one-byte thing is silly but it does help isolate the kernel's
> overhead from what the app is doing.

OK, here is an experimental patch that provides a fast-readers variant
of RCU, forward-ported from v3.3. Because we didn't have call_srcu()
and srcu_barrier() back then, it is not a drop-in replacement for SRCU,
so you need to adapt the calling code to the new API, which means putting
an "fr" in front of the "srcu" in each API member.
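
For example, a converted call site would look roughly like the sketch below
(illustrative only, untested, and not part of the patch; gp, newp, old, and
my_frsrcu are made-up names):

	/* Declare an FRSRCU domain with the build-time initializer. */
	DEFINE_STATIC_FRSRCU(my_frsrcu);

	/* Read side: same shape as SRCU, but without the memory barriers. */
	idx = frsrcu_read_lock(&my_frsrcu);
	p = frsrcu_dereference(gp, &my_frsrcu);
	/* ... use p ... */
	frsrcu_read_unlock(&my_frsrcu, idx);

	/* Update side: unpublish the old version, then wait out readers. */
	old = gp;	/* update-side lock held, so plain access is OK */
	rcu_assign_pointer(gp, newp);
	synchronize_frsrcu(&my_frsrcu);
	kfree(old);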

Understood on the overhead of the memory-barrier instruction showing
up consistently. My point was instead that getting rid of this
memory-barrier instruction does not come for free, as it greatly
increases the latency of synchronize_frsrcu(). In a real workload,
it is entirely possible that the savings from eliminating the memory
barrier are overwhelmed by the increased grace-period latency.
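
(Purely as an illustration of that point: the grace-period cost can often be
amortized by unpublishing a whole batch of objects and then paying for a
single synchronize_frsrcu(), along these lines, again with made-up names:)

	/* Unpublish the entire batch under the update-side lock... */
	spin_lock(&gp_lock);
	for (i = 0; i < n; i++)
		list_del_rcu(&objs[i]->node);
	spin_unlock(&gp_lock);

	/* ...then one (slow) grace period covers all of them... */
	synchronize_frsrcu(&my_frsrcu);

	/* ...after which every element may be freed. */
	for (i = 0; i < n; i++)
		kfree(objs[i]);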

Anyway, the patch is below. Very lightly tested.

Thanx, Paul

------------------------------------------------------------------------

commit d28d4e08ee2f87e6746ac77b07d5c94a65bcbd95
Author: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Date: Mon Jun 22 14:51:18 2015 -0700

rcu: Add a fast-readers variant of SRCU

Dave Hansen's testing of single-byte I/O to tmpfs showed significant
system-level overhead from the memory barriers in srcu_read_lock()
and srcu_read_unlock().

This experimental not-for-merging commit therefore re-introduces the old
variant of SRCU that avoided read-side memory barriers. This of course
also introduces that variant's slow grace periods. There is no analog
to call_rcu(), nor is there an analog to rcu_barrier(), so this is not a
drop-in replacement for SRCU. Unlike SRCU, this implementation cannot
be invoked from idle or offline CPUs. This variant therefore gets its
own API: frsrcu_read_lock(), frsrcu_read_unlock(), and so on.

Reported-by: Dave Hansen <dave@xxxxxxxx>
Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>

diff --git a/include/linux/frsrcu.h b/include/linux/frsrcu.h
new file mode 100644
index 000000000000..5ad60d2d5bca
--- /dev/null
+++ b/include/linux/frsrcu.h
@@ -0,0 +1,250 @@
+/*
+ * Fast-Reader Sleepable Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2006, 2015
+ *
+ * Author: Paul McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
+ */
+
+#ifndef _LINUX_FRSRCU_H
+#define _LINUX_FRSRCU_H
+
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+
+struct frsrcu_struct_array {
+ int c[2];
+};
+
+struct frsrcu_struct {
+ int completed;
+ struct frsrcu_struct_array __percpu *per_cpu_ref;
+ struct mutex mutex;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+int __init_frsrcu_struct(struct frsrcu_struct *frsp, const char *name,
+ struct lock_class_key *key);
+
+#define init_frsrcu_struct(frsp) \
+({ \
+ static struct lock_class_key __frsrcu_key; \
+ \
+ __init_frsrcu_struct((frsp), #frsp, &__frsrcu_key); \
+})
+
+#define __FRSRCU_DEP_MAP_INIT(frsrcu_name) .dep_map = { .name = #frsrcu_name },
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+int init_frsrcu_struct(struct frsrcu_struct *frsp);
+
+#define __FRSRCU_DEP_MAP_INIT(frsrcu_name)
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#define __FRSRCU_STRUCT_INIT(name) \
+ { \
+ .completed = -300, \
+ .per_cpu_ref = &name##_frsrcu_array, \
+ .mutex = __MUTEX_INITIALIZER(name.mutex), \
+ __FRSRCU_DEP_MAP_INIT(name) \
+ }
+
+/*
+ * Define and init a frsrcu struct at build time.
+ * Don't call init_frsrcu_struct() nor cleanup_frsrcu_struct() on it.
+ */
+#define __DEFINE_FRSRCU(name, is_static) \
+ static DEFINE_PER_CPU(struct frsrcu_struct_array, name##_frsrcu_array);\
+ is_static struct frsrcu_struct name = __FRSRCU_STRUCT_INIT(name)
+#define DEFINE_FRSRCU(name) __DEFINE_FRSRCU(name, /* not static */)
+#define DEFINE_STATIC_FRSRCU(name) __DEFINE_FRSRCU(name, static)
+
+void cleanup_frsrcu_struct(struct frsrcu_struct *frsp);
+int __frsrcu_read_lock(struct frsrcu_struct *frsp) __acquires(frsp);
+void __frsrcu_read_unlock(struct frsrcu_struct *frsp, int idx) __releases(frsp);
+void synchronize_frsrcu(struct frsrcu_struct *frsp);
+void synchronize_frsrcu_expedited(struct frsrcu_struct *frsp);
+long frsrcu_batches_completed(struct frsrcu_struct *frsp);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+/**
+ * frsrcu_read_lock_held - might we be in FRSRCU read-side critical section?
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an FRSRCU
+ * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
+ * this assumes we are in an FRSRCU read-side critical section unless it can
+ * prove otherwise.
+ *
+ * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
+ *
+ * Note that if the CPU is in the idle loop from an RCU point of view
+ * (ie: that we are in the section between rcu_idle_enter() and
+ * rcu_idle_exit()) then frsrcu_read_lock_held() returns false even if
+ * the CPU did an frsrcu_read_lock(). The reason for this is that RCU
+ * ignores CPUs that are in such a section, considering these as in
+ * extended quiescent state, so such a CPU is effectively never in an
+ * RCU read-side critical section regardless of what RCU primitives it
+ * invokes. This state of affairs is required --- we need to keep an
+ * RCU-free window in idle where the CPU may possibly enter into low
+ * power mode. This way we can notice an extended quiescent state to
+ * other CPUs that started a grace period. Otherwise we would delay any
+ * grace period as long as we run in the idle task.
+ *
+ * Similarly, we avoid claiming an FRSRCU read lock held if the current
+ * CPU is offline.
+ */
+static inline int frsrcu_read_lock_held(struct frsrcu_struct *frsp)
+{
+ if (!debug_lockdep_rcu_enabled())
+ return 1;
+
+ if (!rcu_is_watching())
+ return 0;
+ if (!rcu_lockdep_current_cpu_online())
+ return 0;
+ return lock_is_held(&frsp->dep_map);
+}
+
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline int frsrcu_read_lock_held(struct frsrcu_struct *frsp)
+{
+ return 1;
+}
+
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * frsrcu_dereference_check - fetch FRSRCU-protected pointer for later deref
+ * @p: the pointer to fetch and protect for later dereferencing
+ * @frsp: pointer to the frsrcu_struct, which is used to check that we
+ * really are in an FRSRCU read-side critical section.
+ * @c: condition to check for update-side use
+ *
+ * If PROVE_RCU is enabled, invoking this outside of an RCU read-side
+ * critical section will result in an RCU-lockdep splat, unless @c evaluates
+ * to 1. The @c argument will normally be a logical expression containing
+ * lockdep_is_held() calls.
+ */
+#define frsrcu_dereference_check(p, frsp, c) \
+ __rcu_dereference_check((p), frsrcu_read_lock_held(frsp) || (c), __rcu)
+
+/**
+ * frsrcu_dereference - fetch FRSRCU-protected pointer for later dereferencing
+ * @p: the pointer to fetch and protect for later dereferencing
+ * @frsp: pointer to the frsrcu_struct, which is used to check that we
+ * really are in an FRSRCU read-side critical section.
+ *
+ * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU
+ * is enabled, invoking this outside of an RCU read-side critical
+ * section will result in an RCU-lockdep splat.
+ */
+#define frsrcu_dereference(p, frsp) frsrcu_dereference_check((p), (frsp), 0)
+
+/**
+ * frsrcu_read_lock - register a new reader for an FRSRCU-protected structure
+ * @frsp: frsrcu_struct in which to register the new reader.
+ *
+ * Enter an FRSRCU read-side critical section. Note that FRSRCU read-side
+ * critical sections may be nested. However, it is illegal to
+ * call anything that waits on an FRSRCU grace period for the same
+ * frsrcu_struct, whether directly or indirectly. Please note that
+ * one way to indirectly wait on an FRSRCU grace period is to acquire
+ * a mutex that is held elsewhere while calling synchronize_frsrcu() or
+ * synchronize_frsrcu_expedited().
+ *
+ * Note that frsrcu_read_lock() and the matching frsrcu_read_unlock() must
+ * occur in the same context, for example, it is illegal to invoke
+ * frsrcu_read_unlock() in an irq handler if the matching frsrcu_read_lock()
+ * was invoked in process context.
+ */
+static inline int frsrcu_read_lock(struct frsrcu_struct *frsp) __acquires(frsp)
+{
+ int retval = __frsrcu_read_lock(frsp);
+
+ rcu_lock_acquire(&(frsp)->dep_map);
+ rcu_lockdep_assert(rcu_is_watching(),
+ "frsrcu_read_lock() used illegally while idle");
+ return retval;
+}
+
+/**
+ * frsrcu_read_unlock - unregister old reader from FRSRCU-protected structure
+ * @frsp: frsrcu_struct in which to unregister the old reader.
+ * @idx: return value from corresponding frsrcu_read_lock().
+ *
+ * Exit an FRSRCU read-side critical section.
+ */
+static inline void frsrcu_read_unlock(struct frsrcu_struct *frsp, int idx)
+ __releases(frsp)
+{
+ rcu_lockdep_assert(rcu_is_watching(),
+ "frsrcu_read_unlock() used illegally while idle");
+ rcu_lock_release(&(frsp)->dep_map);
+ __frsrcu_read_unlock(frsp, idx);
+}
+
+/**
+ * frsrcu_read_lock_raw - register new reader for FRSRCU-protected structure
+ * @frsp: frsrcu_struct in which to register the new reader.
+ *
+ * Enter an FRSRCU read-side critical section. Similar to frsrcu_read_lock(),
+ * but avoids the RCU-lockdep checking. This means that it is legal to
+ * use frsrcu_read_lock_raw() in one context, for example, in an exception
+ * handler, and then have the matching frsrcu_read_unlock_raw() in another
+ * context, for example in the task that took the exception.
+ *
+ * However, the entire FRSRCU read-side critical section must reside within
+ * a single task. For example, beware of using frsrcu_read_lock_raw() in a
+ * device interrupt handler and frsrcu_read_unlock() in the interrupted task:
+ * This will not work if interrupts are threaded.
+ */
+static inline int frsrcu_read_lock_raw(struct frsrcu_struct *frsp)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = __frsrcu_read_lock(frsp);
+ local_irq_restore(flags);
+ return ret;
+}
+
+/**
+ * frsrcu_read_unlock_raw - unregister reader from FRSRCU-protected structure
+ * @frsp: frsrcu_struct in which to unregister the old reader.
+ * @idx: return value from corresponding frsrcu_read_lock_raw().
+ *
+ * Exit an FRSRCU read-side critical section without lockdep-RCU checking.
+ * See frsrcu_read_lock_raw() for more details.
+ */
+static inline void frsrcu_read_unlock_raw(struct frsrcu_struct *frsp, int idx)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __frsrcu_read_unlock(frsp, idx);
+ local_irq_restore(flags);
+}
+
+#endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 03a899aabd17..7ff71dbabf86 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -69,6 +69,7 @@ void rcu_unexpedite_gp(void);
#endif /* #else #ifdef CONFIG_TINY_RCU */

enum rcutorture_type {
+ FRSRCU_FLAVOR,
RCU_FLAVOR,
RCU_BH_FLAVOR,
RCU_SCHED_FLAVOR,
diff --git a/init/Kconfig b/init/Kconfig
index 4c08197044f1..381d75cafe7d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -517,6 +517,15 @@ config SRCU
permits arbitrary sleeping or blocking within RCU read-side critical
sections.

+config FRSRCU
+ bool
+ help
+ This option selects the fast-readers sleepable version of
+ RCU. This version permits arbitrary sleeping or blocking within
+ RCU read-side critical sections, but also avoids memory barriers
+ within read-side markers. Nothing comes for free, though.
+ Grace periods are quite a bit more expensive than those of SRCU.
+
config TASKS_RCU
bool
default n
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 50a808424b06..d2d7272a69bb 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,5 @@
obj-y += update.o
+obj-$(CONFIG_FRSRCU) += frsrcu.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
diff --git a/kernel/rcu/frsrcu.c b/kernel/rcu/frsrcu.c
new file mode 100644
index 000000000000..3bea0a020379
--- /dev/null
+++ b/kernel/rcu/frsrcu.c
@@ -0,0 +1,320 @@
+/*
+ * Fast-Reader Sleepable Read-Copy Update mechanism for mutual exclusion.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2006, 2015
+ *
+ * Author: Paul McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
+ */
+
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/delay.h>
+#include <linux/frsrcu.h>
+
+/**
+ * init_frsrcu_struct - initialize a sleep-RCU structure
+ * @frsp: structure to initialize.
+ *
+ * Must invoke this on a given frsrcu_struct before passing that
+ * frsrcu_struct to any other function. Each frsrcu_struct represents a
+ * separate domain of FRSRCU protection.
+ */
+int init_frsrcu_struct(struct frsrcu_struct *frsp)
+{
+ frsp->completed = 0;
+ mutex_init(&frsp->mutex);
+ frsp->per_cpu_ref = alloc_percpu(struct frsrcu_struct_array);
+ return frsp->per_cpu_ref ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(init_frsrcu_struct);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+int __init_frsrcu_struct(struct frsrcu_struct *frsp, const char *name,
+ struct lock_class_key *key)
+{
+ /* Don't re-initialize a lock while it is held. */
+ debug_check_no_locks_freed((void *)frsp, sizeof(*frsp));
+ lockdep_init_map(&frsp->dep_map, name, key, 0);
+ return init_frsrcu_struct(frsp);
+}
+EXPORT_SYMBOL_GPL(__init_frsrcu_struct);
+
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/*
+ * frsrcu_readers_active_idx -- returns approximate number of readers
+ * active on the specified rank of per-CPU counters.
+ */
+
+static int frsrcu_readers_active_idx(struct frsrcu_struct *frsp, int idx)
+{
+ int cpu;
+ int sum;
+
+ sum = 0;
+ for_each_possible_cpu(cpu)
+ sum += READ_ONCE(per_cpu_ptr(frsp->per_cpu_ref, cpu)->c[idx]);
+ return sum;
+}
+
+/**
+ * frsrcu_readers_active - returns approximate number of readers.
+ * @frsp: which frsrcu_struct counts active readers (holding frsrcu_read_lock).
+ *
+ * Note that this is not an atomic primitive, and can therefore suffer
+ * severe errors when invoked on an active frsrcu_struct. That said, it
+ * can be useful as an error check at cleanup time.
+ */
+static int frsrcu_readers_active(struct frsrcu_struct *frsp)
+{
+ return frsrcu_readers_active_idx(frsp, 0) +
+ frsrcu_readers_active_idx(frsp, 1);
+}
+
+/**
+ * cleanup_frsrcu_struct - deconstruct a sleep-RCU structure
+ * @frsp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given frsrcu_struct that
+ * was initialized via init_frsrcu_struct(), else you leak memory.
+ */
+void cleanup_frsrcu_struct(struct frsrcu_struct *frsp)
+{
+ int sum;
+
+ sum = frsrcu_readers_active(frsp);
+ WARN_ON(sum); /* Leakage unless caller handles error. */
+ if (sum != 0)
+ return;
+ free_percpu(frsp->per_cpu_ref);
+ frsp->per_cpu_ref = NULL;
+}
+EXPORT_SYMBOL_GPL(cleanup_frsrcu_struct);
+
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * frsrcu_struct. Must be called from process context. Returns an index
+ * that must be passed to the matching frsrcu_read_unlock().
+ */
+int __frsrcu_read_lock(struct frsrcu_struct *frsp)
+{
+ int idx;
+
+ preempt_disable();
+ idx = frsp->completed & 0x1;
+ barrier(); /* ensure compiler looks -once- at frsp->completed. */
+ __this_cpu_inc(frsp->per_cpu_ref->c[idx]);
+ preempt_enable();
+ return idx;
+}
+EXPORT_SYMBOL_GPL(__frsrcu_read_lock);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the frsrcu_struct. Note that this may well be a different CPU
+ * than that which was incremented by the corresponding frsrcu_read_lock().
+ * Must be called from process context.
+ */
+void __frsrcu_read_unlock(struct frsrcu_struct *frsp, int idx)
+{
+ preempt_disable();
+ __this_cpu_dec(frsp->per_cpu_ref->c[idx]);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(__frsrcu_read_unlock);
+
+/*
+ * We use an adaptive strategy for synchronize_frsrcu() and especially for
+ * synchronize_frsrcu_expedited(). We spin for a fixed time period (defined
+ * below) to allow FRSRCU readers to exit their read-side critical sections.
+ * If there are still some readers after 10 microseconds, we repeatedly
+ * block for one-jiffy time periods. This approach has done well in
+ * testing, so there is no need for a config parameter.
+ */
+#define SYNCHRONIZE_FRSRCU_READER_DELAY 10
+
+/*
+ * Helper function for synchronize_frsrcu() and synchronize_frsrcu_expedited().
+ */
+static void
+__synchronize_frsrcu(struct frsrcu_struct *frsp, void (*sync_func)(void))
+{
+ int idx;
+
+ rcu_lockdep_assert(!lock_is_held(&frsp->dep_map) &&
+ !lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_frsrcu() in same-type FRSRCU (or RCU) read-side critical section");
+
+ idx = frsp->completed;
+ mutex_lock(&frsp->mutex);
+
+ /*
+ * Check to see if someone else did the work for us while we were
+ * waiting to acquire the lock. We need -two- advances of
+ * the counter, not just one. If there was but one, we might have
+ * shown up -after- our helper's first synchronize_sched(), thus
+ * having failed to prevent CPU-reordering races with concurrent
+ * frsrcu_read_unlock()s on other CPUs (see comment below). So we
+ * either (1) wait for two or (2) supply the second ourselves.
+ */
+
+ if ((frsp->completed - idx) >= 2) {
+ mutex_unlock(&frsp->mutex);
+ return;
+ }
+
+ sync_func(); /* Force memory barrier on all CPUs. */
+
+ /*
+ * The preceding synchronize_sched() ensures that any CPU that
+ * sees the new value of frsp->completed will also see any preceding
+ * changes to data structures made by this CPU. This prevents
+ * some other CPU from reordering the accesses in its FRSRCU
+ * read-side critical section to precede the corresponding
+ * frsrcu_read_lock() -- ensuring that such references will in
+ * fact be protected.
+ *
+ * So it is now safe to do the flip.
+ */
+
+ idx = frsp->completed & 0x1;
+ frsp->completed++;
+
+ sync_func(); /* Force memory barrier on all CPUs. */
+
+ /*
+ * At this point, because of the preceding synchronize_sched(),
+ * all frsrcu_read_lock() calls using the old counters have completed.
+ * Their corresponding critical sections might well be still
+ * executing, but the frsrcu_read_lock() primitives themselves
+ * will have finished executing. We initially give readers
+ * an arbitrarily chosen 10 microseconds to get out of their
+ * FRSRCU read-side critical sections, then loop waiting 1/HZ
+ * seconds per iteration. The 10-microsecond value has done
+ * very well in testing.
+ */
+
+ if (frsrcu_readers_active_idx(frsp, idx))
+ udelay(SYNCHRONIZE_FRSRCU_READER_DELAY);
+ while (frsrcu_readers_active_idx(frsp, idx))
+ schedule_timeout_interruptible(1);
+
+ sync_func(); /* Force memory barrier on all CPUs. */
+
+ /*
+ * The preceding synchronize_sched() forces all frsrcu_read_unlock()
+ * primitives that were executing concurrently with the preceding
+ * for_each_possible_cpu() loop to have completed by this point.
+ * More importantly, it also forces the corresponding FRSRCU
+ * read-side critical sections to have also completed, and the
+ * corresponding references to FRSRCU-protected data items to
+ * be dropped.
+ *
+ * Note:
+ *
+ * Despite what you might think at first glance, the
+ * preceding synchronize_sched() -must- be within the
+ * critical section ended by the following mutex_unlock().
+ * Otherwise, a task taking the early exit can race
+ * with a frsrcu_read_unlock(), which might have executed
+ * just before the preceding frsrcu_readers_active() check,
+ * and whose CPU might have reordered the frsrcu_read_unlock()
+ * with the preceding critical section. In this case, there
+ * is nothing preventing the synchronize_sched() task that is
+ * taking the early exit from freeing a data structure that
+ * is still being referenced (out of order) by the task
+ * doing the frsrcu_read_unlock().
+ *
+ * Alternatively, the comparison with "2" on the early
+ * exit could be changed to "3", but this increases
+ * synchronize_frsrcu() latency for bulk loads. So the
+ * current code is preferred.
+ */
+
+ mutex_unlock(&frsp->mutex);
+}
+
+/**
+ * synchronize_frsrcu - wait for pre-existing FRSRCU read-side critical sections
+ * @frsp: frsrcu_struct with which to synchronize.
+ *
+ * Flip the completed counter, and wait for the old count to drain to zero.
+ * As with classic RCU, the updater must use some separate means of
+ * synchronizing concurrent updates. Can block; must be called from
+ * process context.
+ *
+ * Note that it is illegal to call synchronize_frsrcu() from the
+ * corresponding FRSRCU read-side critical section; doing so will result
+ * in deadlock. However, it is perfectly legal to call synchronize_frsrcu()
+ * on one frsrcu_struct from some other frsrcu_struct's read-side critical
+ * section.
+ */
+void synchronize_frsrcu(struct frsrcu_struct *frsp)
+{
+ __synchronize_frsrcu(frsp, synchronize_sched);
+}
+EXPORT_SYMBOL_GPL(synchronize_frsrcu);
+
+/**
+ * synchronize_frsrcu_expedited - Brute-force FRSRCU grace period
+ * @frsp: frsrcu_struct with which to synchronize.
+ *
+ * Wait for an FRSRCU grace period to elapse, but use a "big hammer"
+ * approach to force the grace period to end quickly. This consumes
+ * significant time on all CPUs and is unfriendly to real-time workloads,
+ * and is thus not recommended for any sort of common-case code. In fact,
+ * if you are using synchronize_frsrcu_expedited() in a loop, please
+ * restructure your code to batch your updates, and then use a single
+ * synchronize_frsrcu() instead.
+ *
+ * Note that it is illegal to call this function while holding any lock
+ * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
+ * to call this function from a CPU-hotplug notifier. Failing to observe
+ * these restrictions will result in deadlock. It is also illegal to call
+ * synchronize_frsrcu_expedited() from the corresponding FRSRCU read-side
+ * critical section; doing so will result in deadlock. However, it is
+ * perfectly legal to call synchronize_frsrcu_expedited() on one frsrcu_struct
+ * from some other frsrcu_struct's read-side critical section, as long as
+ * the resulting graph of frsrcu_structs is acyclic.
+ */
+void synchronize_frsrcu_expedited(struct frsrcu_struct *frsp)
+{
+ __synchronize_frsrcu(frsp, synchronize_sched_expedited);
+}
+EXPORT_SYMBOL_GPL(synchronize_frsrcu_expedited);
+
+/**
+ * frsrcu_batches_completed - return batches completed.
+ * @frsp: frsrcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
+ */
+
+long frsrcu_batches_completed(struct frsrcu_struct *frsp)
+{
+ return frsp->completed;
+}
+EXPORT_SYMBOL_GPL(frsrcu_batches_completed);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 59e32684c23b..6dfd9b9bd895 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -44,6 +44,7 @@
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/stat.h>
+#include <linux/frsrcu.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/trace_clock.h>
@@ -264,6 +265,102 @@ struct rcu_torture_ops {
static struct rcu_torture_ops *cur_ops;

/*
+ * Definitions for frsrcu torture testing.
+ */
+
+DEFINE_STATIC_FRSRCU(frsrcu_ctl);
+static struct frsrcu_struct frsrcu_ctld;
+static struct frsrcu_struct *frsrcu_ctlp = &frsrcu_ctl;
+
+static int frsrcu_torture_read_lock(void) __acquires(frsrcu_ctlp)
+{
+ return frsrcu_read_lock(frsrcu_ctlp);
+}
+
+static void frsrcu_torture_read_unlock(int idx) __releases(frsrcu_ctlp)
+{
+ frsrcu_read_unlock(frsrcu_ctlp, idx);
+}
+
+static unsigned long frsrcu_torture_completed(void)
+{
+ return frsrcu_batches_completed(frsrcu_ctlp);
+}
+
+static void frsrcu_torture_synchronize(void)
+{
+ synchronize_frsrcu(frsrcu_ctlp);
+}
+
+static void frsrcu_torture_stats(void)
+{
+ int cpu;
+ int idx = frsrcu_ctlp->completed & 0x1;
+
+ pr_alert("%s%s per-CPU(idx=%d):",
+ torture_type, TORTURE_FLAG, idx);
+ for_each_possible_cpu(cpu) {
+ long c0, c1;
+
+ c0 = (long)per_cpu_ptr(frsrcu_ctlp->per_cpu_ref, cpu)->c[!idx];
+ c1 = (long)per_cpu_ptr(frsrcu_ctlp->per_cpu_ref, cpu)->c[idx];
+ pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
+ }
+ pr_cont("\n");
+}
+
+static void frsrcu_torture_synchronize_expedited(void)
+{
+ synchronize_frsrcu_expedited(frsrcu_ctlp);
+}
+
+static void rcu_sync_torture_init(void);
+static void srcu_read_delay(struct torture_random_state *rrsp);
+
+static struct rcu_torture_ops frsrcu_ops = {
+ .ttype = FRSRCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = frsrcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = frsrcu_torture_read_unlock,
+ .started = NULL,
+ .completed = frsrcu_torture_completed,
+ .sync = frsrcu_torture_synchronize,
+ .exp_sync = frsrcu_torture_synchronize_expedited,
+ .stats = frsrcu_torture_stats,
+ .name = "frsrcu"
+};
+
+static void frsrcu_torture_init(void)
+{
+ rcu_sync_torture_init();
+ WARN_ON(init_frsrcu_struct(&frsrcu_ctld));
+ frsrcu_ctlp = &frsrcu_ctld;
+}
+
+static void frsrcu_torture_cleanup(void)
+{
+ cleanup_frsrcu_struct(&frsrcu_ctld);
+ frsrcu_ctlp = &frsrcu_ctl; /* In case of a later rcutorture run. */
+}
+
+/* As above, but dynamically allocated. */
+static struct rcu_torture_ops frsrcud_ops = {
+ .ttype = FRSRCU_FLAVOR,
+ .init = frsrcu_torture_init,
+ .cleanup = frsrcu_torture_cleanup,
+ .readlock = frsrcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = frsrcu_torture_read_unlock,
+ .started = NULL,
+ .completed = frsrcu_torture_completed,
+ .sync = frsrcu_torture_synchronize,
+ .exp_sync = frsrcu_torture_synchronize_expedited,
+ .stats = frsrcu_torture_stats,
+ .name = "frsrcud"
+};
+
+/*
* Definitions for rcu torture testing.
*/

@@ -810,7 +907,7 @@ rcu_torture_cbflood(void *arg)
int err = 1;
int i;
int j;
- struct rcu_head *rhp;
+ struct rcu_head *rhp = NULL;

if (cbflood_n_per_burst > 0 &&
cbflood_inter_holdoff > 0 &&
@@ -823,9 +920,7 @@ rcu_torture_cbflood(void *arg)
}
if (err) {
VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
- while (!torture_must_stop())
- schedule_timeout_interruptible(HZ);
- return 0;
+ goto wait_for_stop;
}
VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
do {
@@ -844,6 +939,7 @@ rcu_torture_cbflood(void *arg)
stutter_wait("rcu_torture_cbflood");
} while (!torture_must_stop());
vfree(rhp);
+wait_for_stop:
torture_kthread_stopping("rcu_torture_cbflood");
return 0;
}
@@ -1709,6 +1805,7 @@ rcu_torture_init(void)
int cpu;
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
+ &frsrcu_ops, &frsrcud_ops,
&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
&sched_ops, RCUTORTURE_TASKS_OPS
};
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b908048f8d6a..fb39380eb1d5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1233,6 +1233,7 @@ config RCU_TORTURE_TEST
depends on DEBUG_KERNEL
select TORTURE_TEST
select SRCU
+ select FRSRCU
select TASKS_RCU
default n
help

--