Re: [mm PATCH 4/6] RCU: preemptible RCU

From: Dipankar Sarma
Date: Mon Jan 15 2007 - 14:30:06 EST


From: Paul McKenney <paulmck@xxxxxxxxxx>

This patch implements a new version of RCU which allows its read-side
critical sections to be preempted. It uses a set of per-CPU counter
pairs to keep track of read-side critical sections, flipping between
the counters of each pair once all tasks have exited the read-side
critical sections begun under the old counter. The details of this
implementation can be found in this paper -

http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf

This patch was developed as part of the -rt kernel development effort
and is meant to provide better latencies: RCU read-side critical
sections no longer disable preemption. As a consequence of keeping
track of RCU readers, the read side incurs a slight overhead
(possible optimizations are discussed in the paper). This
implementation co-exists with the "classic" RCU implementation and
can be selected at compile time.
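
For readers unfamiliar with the API, here is a minimal usage sketch of
the kind of code this patch affects. It is illustrative only and not
part of the patch; struct foo, read_a() and update_a() are invented
names, and updaters are assumed to be serialized elsewhere. With
CONFIG_PREEMPT_RCU the region between rcu_read_lock() and
rcu_read_unlock() may be preempted; with CONFIG_CLASSIC_RCU it may not.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct foo {
	int a;
	struct rcu_head rcu;
};
static struct foo *gbl_foo;

/* Reader: with CONFIG_PREEMPT_RCU this critical section is preemptible. */
int read_a(void)
{
	struct foo *p;
	int ret = -1;

	rcu_read_lock();
	p = rcu_dereference(gbl_foo);
	if (p)
		ret = p->a;
	rcu_read_unlock();
	return ret;
}

static void foo_reclaim(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));
}

/* Updater: publish a new version, defer freeing the old one via call_rcu(). */
int update_a(int new_a)
{
	struct foo *new = kmalloc(sizeof(*new), GFP_KERNEL);
	struct foo *old;

	if (!new)
		return -ENOMEM;
	new->a = new_a;
	old = gbl_foo;
	rcu_assign_pointer(gbl_foo, new);
	if (old)
		call_rcu(&old->rcu, foo_reclaim);
	return 0;
}

Classic RCU provides the read-side guarantee by disabling preemption;
the preemptible implementation below instead has __rcu_read_lock()
increment the current element of a per-CPU counter pair, which the
grace-period state machine later waits on.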

Signed-off-by: Paul McKenney <paulmck@xxxxxxxxxx>
Signed-off-by: Dipankar Sarma <dipankar@xxxxxxxxxx>



diff -puN include/linux/init_task.h~rcu-preempt include/linux/init_task.h
--- linux-2.6.20-rc3-mm1-rcu/include/linux/init_task.h~rcu-preempt 2007-01-15 15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/include/linux/init_task.h 2007-01-15 15:36:51.000000000 +0530
@@ -90,6 +90,14 @@ extern struct nsproxy init_nsproxy;

extern struct group_info init_groups;

+#ifdef CONFIG_PREEMPT_RCU
+#define INIT_PREEMPT_RCU \
+ .rcu_read_lock_nesting = 0, \
+ .rcu_flipctr_idx = 0,
+#else
+#define INIT_PREEMPT_RCU
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -111,6 +119,7 @@ extern struct group_info init_groups;
.run_list = LIST_HEAD_INIT(tsk.run_list), \
.ioprio = 0, \
.time_slice = HZ, \
+ INIT_PREEMPT_RCU \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
.ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \
diff -puN include/linux/rcuclassic.h~rcu-preempt include/linux/rcuclassic.h
--- linux-2.6.20-rc3-mm1-rcu/include/linux/rcuclassic.h~rcu-preempt 2007-01-15 15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/include/linux/rcuclassic.h 2007-01-15 15:36:51.000000000 +0530
@@ -142,7 +142,6 @@ extern int rcu_needs_cpu(int cpu);
extern void __rcu_init(void);
extern void rcu_check_callbacks(int cpu, int user);
extern void rcu_restart_cpu(int cpu);
-extern long rcu_batches_completed(void);

#endif /* __KERNEL__ */
#endif /* __LINUX_RCUCLASSIC_H */
diff -puN include/linux/rcupdate.h~rcu-preempt include/linux/rcupdate.h
--- linux-2.6.20-rc3-mm1-rcu/include/linux/rcupdate.h~rcu-preempt 2007-01-15 15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/include/linux/rcupdate.h 2007-01-15 15:36:51.000000000 +0530
@@ -41,7 +41,11 @@
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#ifdef CONFIG_CLASSIC_RCU
#include <linux/rcuclassic.h>
+#else
+#include <linux/rcupreempt.h>
+#endif

/**
* struct rcu_head - callback structure for use with RCU
@@ -216,10 +220,13 @@ extern void FASTCALL(call_rcu_bh(struct
/* Exported common interfaces */
extern void synchronize_rcu(void);
extern void rcu_barrier(void);
+extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);

/* Internal to kernel */
extern void rcu_init(void);
extern void rcu_check_callbacks(int cpu, int user);
+extern int rcu_needs_cpu(int cpu);

#endif /* __KERNEL__ */
#endif /* __LINUX_RCUPDATE_H */
diff -puN /dev/null include/linux/rcupreempt.h
--- /dev/null 2006-03-26 18:34:52.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/include/linux/rcupreempt.h 2007-01-15 15:36:51.000000000 +0530
@@ -0,0 +1,65 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (RT implementation)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2006
+ *
+ * Author: Paul McKenney <paulmck@xxxxxxxxxx>
+ *
+ * Based on the original work by Paul McKenney <paul.mckenney@xxxxxxxxxx>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+
+#ifndef __LINUX_RCUPREEMPT_H
+#define __LINUX_RCUPREEMPT_H
+
+#ifdef __KERNEL__
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+#define rcu_qsctr_inc(cpu)
+#define rcu_bh_qsctr_inc(cpu)
+#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
+
+extern void __rcu_read_lock(void);
+extern void __rcu_read_unlock(void);
+extern int rcu_pending(int cpu);
+
+#define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); }
+#define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); }
+
+#define __rcu_read_lock_nesting() (current->rcu_read_lock_nesting)
+
+extern void __synchronize_sched(void);
+
+extern void __rcu_init(void);
+extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
+
+#endif /* __KERNEL__ */
+#endif /* __LINUX_RCUPREEMPT_H */
diff -puN include/linux/sched.h~rcu-preempt include/linux/sched.h
--- linux-2.6.20-rc3-mm1-rcu/include/linux/sched.h~rcu-preempt 2007-01-15 15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/include/linux/sched.h 2007-01-15 15:36:51.000000000 +0530
@@ -848,6 +848,11 @@ struct task_struct {
cpumask_t cpus_allowed;
unsigned int time_slice, first_time_slice;

+#ifdef CONFIG_PREEMPT_RCU
+ int rcu_read_lock_nesting;
+ int rcu_flipctr_idx;
+#endif
+
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
#endif
diff -puN kernel/fork.c~rcu-preempt kernel/fork.c
--- linux-2.6.20-rc3-mm1-rcu/kernel/fork.c~rcu-preempt 2007-01-15 15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/kernel/fork.c 2007-01-15 15:36:51.000000000 +0530
@@ -941,6 +941,16 @@ static inline void rt_mutex_init_task(st
#endif
}

+#ifdef CONFIG_PREEMPT_RCU
+static inline void rcu_task_init(struct task_struct *p)
+{
+ p->rcu_read_lock_nesting = 0;
+ p->rcu_flipctr_idx = 0;
+}
+#else
+static inline void rcu_task_init(struct task_struct *p) {}
+#endif
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1026,6 +1036,7 @@ static struct task_struct *copy_process(

INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
+ rcu_task_init(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);

diff -puN kernel/Kconfig.preempt~rcu-preempt kernel/Kconfig.preempt
--- linux-2.6.20-rc3-mm1-rcu/kernel/Kconfig.preempt~rcu-preempt 2007-01-15 15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/kernel/Kconfig.preempt 2007-01-15 15:36:51.000000000 +0530
@@ -63,3 +63,29 @@ config PREEMPT_BKL
Say Y here if you are building a kernel for a desktop system.
Say N if you are unsure.

+choice
+ prompt "RCU implementation type:"
+ default CLASSIC_RCU
+
+config CLASSIC_RCU
+ bool "Classic RCU"
+ help
+ This option selects the classic RCU implementation that is
+ designed for best read-side performance on non-realtime
+ systems.
+
+ Say Y if you are unsure.
+
+config PREEMPT_RCU
+ bool "Preemptible RCU"
+ help
+ This option reduces the latency of the kernel by making certain
+ RCU sections preemptible. Normally RCU code is non-preemptible; if
+ this option is selected, read-only RCU sections become
+ preemptible. This helps latency, but may expose bugs due to
+ now-naive assumptions about each RCU read-side critical section
+ remaining on a given CPU through its execution.
+
+ Say N if you are unsure.
+
+endchoice
diff -puN kernel/Makefile~rcu-preempt kernel/Makefile
--- linux-2.6.20-rc3-mm1-rcu/kernel/Makefile~rcu-preempt 2007-01-15 15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/kernel/Makefile 2007-01-15 15:36:51.000000000 +0530
@@ -6,9 +6,9 @@ obj-y = sched.o fork.o exec_domain.o
exit.o itimer.o time.o softirq.o resource.o \
sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
signal.o sys.o kmod.o workqueue.o pid.o \
- rcupdate.o rcuclassic.o extable.o params.o posix-timers.o \
+ extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
+ hrtimer.o rwsem.o latency.o nsproxy.o rcupdate.o srcu.o

obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
@@ -46,6 +46,8 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
obj-$(CONFIG_DEBUG_SYNCHRO_TEST) += synchro-test.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_UTS_NS) += utsname.o
diff -puN /dev/null kernel/rcupreempt.c
--- /dev/null 2006-03-26 18:34:52.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/kernel/rcupreempt.c 2007-01-15 15:36:51.000000000 +0530
@@ -0,0 +1,595 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, realtime implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2006
+ *
+ * Authors: Paul E. McKenney <paulmck@xxxxxxxxxx>
+ * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
+ * for pushing me away from locks and towards counters, and
+ * to Suparna Bhattacharya for pushing me completely away
+ * from atomic instructions on the read side.
+ *
+ * Papers: http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU/ *.txt
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/byteorder/swabb.h>
+#include <linux/cpumask.h>
+
+/*
+ * PREEMPT_RCU data structures.
+ */
+
+struct rcu_data {
+ spinlock_t lock;
+ long completed; /* Number of last completed batch. */
+ struct rcu_head *nextlist;
+ struct rcu_head **nexttail;
+ struct rcu_head *waitlist;
+ struct rcu_head **waittail;
+ struct rcu_head *donelist;
+ struct rcu_head **donetail;
+};
+struct rcu_ctrlblk {
+ spinlock_t fliplock;
+ long completed; /* Number of last completed batch. */
+};
+static struct rcu_data rcu_data;
+static struct rcu_ctrlblk rcu_ctrlblk = {
+ .fliplock = SPIN_LOCK_UNLOCKED,
+ .completed = 0,
+};
+static DEFINE_PER_CPU(int [2], rcu_flipctr) = { 0, 0 };
+
+/*
+ * States for rcu_try_flip() and friends.
+ */
+enum rcu_try_flip_state {
+ RCU_TRY_FLIP_IDLE, /* "I" */
+ RCU_TRY_FLIP_GP, /* "G" */
+ RCU_TRY_FLIP_WAITACK, /* "A" */
+ RCU_TRY_FLIP_WAITZERO, /* "Z" */
+ RCU_TRY_FLIP_WAITMB /* "M" */
+};
+static enum rcu_try_flip_state rcu_try_flip_state = RCU_TRY_FLIP_IDLE;
+
+/*
+ * Enum and per-CPU flag to determine when each CPU has seen
+ * the most recent counter flip.
+ */
+enum rcu_flip_flag_value {
+ RCU_FLIP_SEEN, /* Steady/initial state, last flip seen. */
+ /* Only GP detector can update. */
+ RCU_FLIPPED /* Flip just completed, need confirmation. */
+ /* Only corresponding CPU can update. */
+};
+static DEFINE_PER_CPU(enum rcu_flip_flag_value, rcu_flip_flag) = RCU_FLIP_SEEN;
+
+/*
+ * Enum and per-CPU flag to determine when each CPU has executed the
+ * needed memory barrier to fence in memory references from its last RCU
+ * read-side critical section in the just-completed grace period.
+ */
+enum rcu_mb_flag_value {
+ RCU_MB_DONE, /* Steady/initial state, no mb()s required. */
+ /* Only GP detector can update. */
+ RCU_MB_NEEDED /* Flip just completed, need an mb(). */
+ /* Only corresponding CPU can update. */
+};
+static DEFINE_PER_CPU(enum rcu_mb_flag_value, rcu_mb_flag) = RCU_MB_DONE;
+
+/*
+ * Return the number of RCU batches processed thus far. Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
+{
+ return rcu_ctrlblk.completed;
+}
+
+long rcu_batches_completed_bh(void)
+{
+ return rcu_ctrlblk.completed;
+}
+
+void __rcu_read_lock(void)
+{
+ int idx;
+ int nesting;
+
+ nesting = current->rcu_read_lock_nesting;
+ if (nesting != 0) {
+
+ /* An earlier rcu_read_lock() covers us, just count this one. */
+ current->rcu_read_lock_nesting = nesting + 1;
+
+ } else {
+ unsigned long oldirq;
+
+ /*
+ * Disable local interrupts to prevent the grace-period
+ * detection state machine from seeing us half-done.
+ */
+ local_irq_save(oldirq);
+
+ /*
+ * Outermost nesting of rcu_read_lock(), so atomically
+ * increment the current counter for the current CPU.
+ */
+ idx = rcu_ctrlblk.completed & 0x1;
+ smp_read_barrier_depends();
+ barrier();
+ __get_cpu_var(rcu_flipctr)[idx]++;
+ barrier();
+
+ /*
+ * Now that the per-CPU counter has been incremented, we
+ * are protected. We can therefore safely increment
+ * the nesting counter, relieving further NMIs of the
+ * need to do so.
+ */
+ current->rcu_read_lock_nesting = nesting + 1;
+ barrier();
+
+ /*
+ * Now that we have prevented any NMIs from storing
+ * to the ->rcu_flipctr_idx, we can safely use it to
+ * remember which counter to decrement in the matching
+ * rcu_read_unlock().
+ */
+ current->rcu_flipctr_idx = idx;
+ local_irq_restore(oldirq);
+ }
+}
+
+void __rcu_read_unlock(void)
+{
+ int idx;
+ int nesting;
+
+ nesting = current->rcu_read_lock_nesting;
+ if (nesting > 1) {
+ /*
+ * We are still protected by an enclosing rcu_read_lock(),
+ * so simply decrement the counter.
+ */
+ current->rcu_read_lock_nesting = nesting - 1;
+
+ } else {
+ unsigned long oldirq;
+
+ /*
+ * Disable local interrupts to prevent the grace-period
+ * detection state machine from seeing us half-done.
+ */
+ local_irq_save(oldirq);
+
+ /*
+ * Outermost nesting of rcu_read_unlock(), so we must
+ * decrement the current counter for the current CPU.
+ * This must be done carefully, because NMIs can
+ * occur at any point in this code, and any rcu_read_lock()
+ * and rcu_read_unlock() pairs in the NMI handlers
+ * must interact non-destructively with this code.
+ * Lots of barrier() calls, and -very- careful ordering.
+ *
+ * Changes to this code, including this one, must be
+ * inspected, validated, and tested extremely carefully!!!
+ */
+
+ /*
+ * First, pick up the index. Enforce ordering for
+ * both compilers and for DEC Alpha.
+ */
+ idx = current->rcu_flipctr_idx;
+ smp_read_barrier_depends();
+ barrier();
+
+ /*
+ * It is now safe to decrement the task's nesting count.
+ * NMIs that occur after this statement will route
+ * their rcu_read_lock() calls through this "else" clause
+ * of this "if" statement, and thus will start incrementing
+ * the per-CPU counter on their own. Enforce ordering for
+ * compilers.
+ */
+ current->rcu_read_lock_nesting = nesting - 1;
+ barrier();
+
+ /*
+ * Decrement the per-CPU counter. NMI handlers
+ * might increment it as well, but they had better
+ * properly nest their rcu_read_lock()/rcu_read_unlock()
+ * pairs so that the value is restored before the handler
+ * returns to us.
+ */
+ __get_cpu_var(rcu_flipctr)[idx]--;
+ local_irq_restore(oldirq);
+ }
+}
+
+static void __rcu_advance_callbacks(void)
+{
+ if ((rcu_data.completed >> 1) != (rcu_ctrlblk.completed >> 1)) {
+ if (rcu_data.waitlist != NULL) {
+ *rcu_data.donetail = rcu_data.waitlist;
+ rcu_data.donetail = rcu_data.waittail;
+ }
+ if (rcu_data.nextlist != NULL) {
+ rcu_data.waitlist = rcu_data.nextlist;
+ rcu_data.waittail = rcu_data.nexttail;
+ rcu_data.nextlist = NULL;
+ rcu_data.nexttail = &rcu_data.nextlist;
+ } else {
+ rcu_data.waitlist = NULL;
+ rcu_data.waittail = &rcu_data.waitlist;
+ }
+ rcu_data.completed = rcu_ctrlblk.completed;
+ } else if (rcu_data.completed != rcu_ctrlblk.completed)
+ rcu_data.completed = rcu_ctrlblk.completed;
+}
+
+/*
+ * Get here when RCU is idle. Decide whether we need to
+ * move out of idle state, and return zero if so.
+ * "Straightforward" approach for the moment, might later
+ * use callback-list lengths, grace-period duration, or
+ * some such to determine when to exit idle state.
+ * Might also need a pre-idle test that does not acquire
+ * the lock, but let's get the simple case working first...
+ */
+static int rcu_try_flip_idle(int flipctr)
+{
+ if (!rcu_pending(smp_processor_id()))
+ return 1;
+ return 0;
+}
+
+/*
+ * Flip processing up to and including the flip, as well as
+ * telling CPUs to acknowledge the flip.
+ */
+static int rcu_try_flip_in_gp(int flipctr)
+{
+ int cpu;
+
+ /*
+ * Do the flip.
+ */
+ rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
+
+ /*
+ * Need a memory barrier so that other CPUs see the new
+ * counter value before they see the subsequent change of all
+ * the rcu_flip_flag instances to RCU_FLIPPED.
+ */
+ smp_mb();
+
+ /* Now ask each CPU for acknowledgement of the flip. */
+
+ for_each_possible_cpu(cpu)
+ per_cpu(rcu_flip_flag, cpu) = RCU_FLIPPED;
+
+ return 0;
+}
+
+/*
+ * Wait for CPUs to acknowledge the flip.
+ */
+static int rcu_try_flip_waitack(int flipctr)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ if (per_cpu(rcu_flip_flag, cpu) != RCU_FLIP_SEEN)
+ return 1;
+
+ /*
+ * Make sure our checks above don't bleed into subsequent
+ * waiting for the sum of the counters to reach zero.
+ */
+ smp_mb();
+ return 0;
+}
+
+/*
+ * Wait for collective ``last'' counter to reach zero,
+ * then tell all CPUs to do an end-of-grace-period memory barrier.
+ */
+static int rcu_try_flip_waitzero(int flipctr)
+{
+ int cpu;
+ int lastidx = !(flipctr & 0x1);
+ int sum = 0;
+
+ /* Check to see if the sum of the "last" counters is zero. */
+
+ for_each_possible_cpu(cpu)
+ sum += per_cpu(rcu_flipctr, cpu)[lastidx];
+ if (sum != 0)
+ return 1;
+
+ /* Make sure we don't call for memory barriers before we see zero. */
+ smp_mb();
+
+ /* Call for a memory barrier from each CPU. */
+ for_each_possible_cpu(cpu)
+ per_cpu(rcu_mb_flag, cpu) = RCU_MB_NEEDED;
+
+ return 0;
+}
+
+/*
+ * Wait for all CPUs to do their end-of-grace-period memory barrier.
+ * Return 0 once all CPUs have done so.
+ */
+static int rcu_try_flip_waitmb(int flipctr)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ if (per_cpu(rcu_mb_flag, cpu) != RCU_MB_DONE)
+ return 1;
+
+ smp_mb(); /* Ensure that the above checks precede any following flip. */
+ return 0;
+}
+
+/*
+ * Attempt a single flip of the counters. Remember, a single flip does
+ * -not- constitute a grace period. Instead, the interval between
+ * at least three consecutive flips is a grace period.
+ *
+ * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
+ * on a large SMP, they might want to use a hierarchical organization of
+ * the per-CPU-counter pairs.
+ */
+static void rcu_try_flip(void)
+{
+ long flipctr;
+ unsigned long oldirq;
+
+ if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, oldirq)))
+ return;
+
+ /*
+ * Take the next transition(s) through the RCU grace-period
+ * flip-counter state machine.
+ */
+ flipctr = rcu_ctrlblk.completed;
+ switch (rcu_try_flip_state) {
+ case RCU_TRY_FLIP_IDLE:
+ if (rcu_try_flip_idle(flipctr))
+ break;
+ rcu_try_flip_state = RCU_TRY_FLIP_GP;
+ case RCU_TRY_FLIP_GP:
+ if (rcu_try_flip_in_gp(flipctr))
+ break;
+ rcu_try_flip_state = RCU_TRY_FLIP_WAITACK;
+ case RCU_TRY_FLIP_WAITACK:
+ if (rcu_try_flip_waitack(flipctr))
+ break;
+ rcu_try_flip_state = RCU_TRY_FLIP_WAITZERO;
+ case RCU_TRY_FLIP_WAITZERO:
+ if (rcu_try_flip_waitzero(flipctr))
+ break;
+ rcu_try_flip_state = RCU_TRY_FLIP_WAITMB;
+ case RCU_TRY_FLIP_WAITMB:
+ if (rcu_try_flip_waitmb(flipctr))
+ break;
+ rcu_try_flip_state = RCU_TRY_FLIP_IDLE;
+ }
+ spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
+}
+
+/*
+ * Check to see if this CPU needs to report that it has seen the most
+ * recent counter flip, thereby declaring that all subsequent
+ * rcu_read_lock() invocations will respect this flip.
+ */
+static void rcu_check_flipseen(int cpu)
+{
+ if (per_cpu(rcu_flip_flag, cpu) == RCU_FLIPPED) {
+ smp_mb(); /* Subsequent counter accesses must see new value */
+ per_cpu(rcu_flip_flag, cpu) = RCU_FLIP_SEEN;
+ smp_mb(); /* would probably be implied by the interrupt, but... */
+ }
+}
+
+/*
+ * Check to see if this CPU needs to do a memory barrier in order to
+ * ensure that any prior RCU read-side critical sections have committed
+ * their counter manipulations and critical-section memory references
+ * before declaring the grace period to be completed.
+ */
+static void rcu_check_mb(int cpu)
+{
+ if (per_cpu(rcu_mb_flag, cpu) == RCU_MB_NEEDED) {
+ smp_mb();
+ per_cpu(rcu_mb_flag, cpu) = RCU_MB_DONE;
+ }
+}
+
+/*
+ * This function is periodically called from hardware-irq context on
+ * each CPU.
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+ unsigned long oldirq;
+
+ rcu_check_flipseen(cpu);
+ rcu_check_mb(cpu);
+ if (rcu_ctrlblk.completed == rcu_data.completed) {
+ rcu_try_flip();
+ if (rcu_ctrlblk.completed == rcu_data.completed) {
+ return;
+ }
+ }
+ spin_lock_irqsave(&rcu_data.lock, oldirq);
+ __rcu_advance_callbacks();
+ if (rcu_data.donelist == NULL)
+ spin_unlock_irqrestore(&rcu_data.lock, oldirq);
+ else {
+ spin_unlock_irqrestore(&rcu_data.lock, oldirq);
+ raise_softirq(RCU_SOFTIRQ);
+ }
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so. This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+ return rcu_pending(cpu);
+}
+
+/*
+ * Needed by dynticks, to make sure all RCU processing has finished
+ * when we go idle:
+ */
+void rcu_advance_callbacks(int cpu, int user)
+{
+ unsigned long oldirq;
+
+ if (rcu_ctrlblk.completed == rcu_data.completed) {
+ rcu_try_flip();
+ if (rcu_ctrlblk.completed == rcu_data.completed) {
+ return;
+ }
+ }
+ spin_lock_irqsave(&rcu_data.lock, oldirq);
+ __rcu_advance_callbacks();
+ spin_unlock_irqrestore(&rcu_data.lock, oldirq);
+}
+
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+ unsigned long flags;
+ struct rcu_head *next, *list;
+
+ spin_lock_irqsave(&rcu_data.lock, flags);
+ list = rcu_data.donelist;
+ if (list == NULL) {
+ spin_unlock_irqrestore(&rcu_data.lock, flags);
+ return;
+ }
+ rcu_data.donelist = NULL;
+ rcu_data.donetail = &rcu_data.donelist;
+ spin_unlock_irqrestore(&rcu_data.lock, flags);
+ while (list) {
+ next = list->next;
+ list->func(list);
+ list = next;
+ }
+}
+
+void fastcall call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu))
+{
+ unsigned long flags;
+
+ head->func = func;
+ head->next = NULL;
+ spin_lock_irqsave(&rcu_data.lock, flags);
+ __rcu_advance_callbacks();
+ *rcu_data.nexttail = head;
+ rcu_data.nexttail = &head->next;
+ spin_unlock_irqrestore(&rcu_data.lock, flags);
+}
+
+/*
+ * Wait until all currently running preempt_disable() code segments
+ * (including hardware-irq-disable segments) complete. Note that
+ * in -rt this does -not- necessarily result in all currently executing
+ * interrupt -handlers- having completed.
+ */
+void __synchronize_sched(void)
+{
+ cpumask_t oldmask;
+ int cpu;
+
+ if (sched_getaffinity(0, &oldmask) < 0) {
+ oldmask = cpu_possible_map;
+ }
+ for_each_online_cpu(cpu) {
+ sched_setaffinity(0, cpumask_of_cpu(cpu));
+ schedule();
+ }
+ sched_setaffinity(0, oldmask);
+}
+
+int rcu_pending(int cpu)
+{
+ return (rcu_data.donelist != NULL ||
+ rcu_data.waitlist != NULL ||
+ rcu_data.nextlist != NULL);
+}
+
+void __init __rcu_init(void)
+{
+ spin_lock_init(&rcu_data.lock);
+ rcu_data.completed = 0;
+ rcu_data.nextlist = NULL;
+ rcu_data.nexttail = &rcu_data.nextlist;
+ rcu_data.waitlist = NULL;
+ rcu_data.waittail = &rcu_data.waitlist;
+ rcu_data.donelist = NULL;
+ rcu_data.donetail = &rcu_data.donelist;
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+}
+
+/*
+ * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ */
+void synchronize_kernel(void)
+{
+ synchronize_rcu();
+}
+
+
+EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+EXPORT_SYMBOL_GPL(__synchronize_sched);
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+

_