[PATCH RFC 13/16] prcu: Comment source code
From: lianglihao
Date: Tue Jan 23 2018 - 03:09:38 EST
From: Lihao Liang <lianglihao@xxxxxxxxxx>
Signed-off-by: Lihao Liang <lianglihao@xxxxxxxxxx>
---
include/linux/prcu.h | 73 ++++++++++++++++-----
kernel/rcu/prcu.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 225 insertions(+), 26 deletions(-)
diff --git a/include/linux/prcu.h b/include/linux/prcu.h
index bb20fa40..9f740985 100644
--- a/include/linux/prcu.h
+++ b/include/linux/prcu.h
@@ -1,3 +1,11 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (PRCU version).
+ * PRCU public definitions.
+ *
+ * Authors: Heng Zhang <heng.z@xxxxxxxxxx>
+ * Lihao Liang <lianglihao@xxxxxxxxxx>
+ */
+
#ifndef __LINUX_PRCU_H
#define __LINUX_PRCU_H
@@ -8,12 +16,26 @@
#include <linux/completion.h>
#ifdef CONFIG_PRCU
+
+/*
+ * Simple list structure of callback versions.
+ *
+ * Note: Ideally, we would like to add the version field
+ * to the rcu_head struct. But if we do so, other users of
+ * rcu_head in the Linux kernel will complain hard and loudly.
+ */
struct prcu_version_head {
unsigned long long version;
struct prcu_version_head *next;
};
-/* Simple unsegmented callback list for PRCU. */
+/*
+ * Simple unsegmented callback list for PRCU.
+ *
+ * Note: Since we can't add a new version field to rcu_head,
+ * we have to make our own callback list for PRCU instead of
+ * using the existing rcu_cblist. Sigh!
+ */
struct prcu_cblist {
struct rcu_head *head;
struct rcu_head **tail;
@@ -27,31 +49,47 @@ struct prcu_cblist {
.version_head = NULL, .version_tail = &n.version_head, \
}
+/*
+ * PRCU's per-CPU state.
+ */
struct prcu_local_struct {
- unsigned int locked;
- unsigned int online;
- unsigned long long version;
- unsigned long long cb_version;
- struct rcu_head barrier_head;
- struct prcu_cblist cblist;
+ unsigned int locked; /* Nesting level of PRCU read-side */
+ /* critical sections */
+ unsigned int online; /* Indicates whether a context-switch */
+ /* has occurred on this CPU */
+ unsigned long long version; /* Local grace-period version */
+ unsigned long long cb_version; /* Local callback version */
+ struct rcu_head barrier_head; /* rcu_head used by prcu_barrier() */
+ struct prcu_cblist cblist; /* PRCU callback list with versions */
};
+/*
+ * PRCU's global state.
+ */
struct prcu_struct {
- atomic64_t global_version;
- atomic64_t cb_version;
- atomic_t active_ctr;
- atomic_t barrier_cpu_count;
- struct mutex mtx;
- struct mutex barrier_mtx;
- wait_queue_head_t wait_q;
- struct completion barrier_completion;
+ atomic64_t global_version; /* Global grace-period version */
+ atomic64_t cb_version; /* Global callback version */
+ atomic_t active_ctr; /* Outstanding PRCU tasks */
+ /* being context-switched */
+ atomic_t barrier_cpu_count; /* # CPUs waiting on prcu_barrier() */
+ struct mutex mtx; /* Serialize synchronize_prcu() */
+ struct mutex barrier_mtx; /* Serialize prcu_barrier() */
+ wait_queue_head_t wait_q; /* Wait for synchronize_prcu() */
+ struct completion barrier_completion; /* Wait for prcu_barrier() */
};
+/*
+ * PRCU APIs.
+ */
void prcu_read_lock(void);
void prcu_read_unlock(void);
void synchronize_prcu(void);
void call_prcu(struct rcu_head *head, rcu_callback_t func);
void prcu_barrier(void);
+
+/*
+ * Internal non-public functions.
+ */
void prcu_init(void);
void prcu_note_context_switch(void);
int prcu_pending(void);
@@ -60,11 +98,16 @@ void prcu_check_callbacks(void);
#else /* #ifdef CONFIG_PRCU */
+/*
+ * If CONFIG_PRCU is not defined,
+ * map its APIs to RCU's counterparts.
+ */
#define prcu_read_lock rcu_read_lock
#define prcu_read_unlock rcu_read_unlock
#define synchronize_prcu synchronize_rcu
#define call_prcu call_rcu
#define prcu_barrier rcu_barrier
+
#define prcu_init() do {} while (0)
#define prcu_note_context_switch() do {} while (0)
#define prcu_pending() 0
diff --git a/kernel/rcu/prcu.c b/kernel/rcu/prcu.c
index 49cb70e6..ef2c7730 100644
--- a/kernel/rcu/prcu.c
+++ b/kernel/rcu/prcu.c
@@ -1,3 +1,17 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (PRCU version).
+ * This PRCU implementation is based on a fast consensus protocol
+ * published in the following paper:
+ *
+ * Fast Consensus Using Bounded Staleness for Scalable Read-mostly Synchronization.
+ * Haibo Chen, Heng Zhang, Ran Liu, Binyu Zang, and Haibing Guan.
+ * IEEE Transactions on Parallel and Distributed Systems (TPDS), 2016.
+ * https://dl.acm.org/citation.cfm?id=3024114.3024143
+ *
+ * Authors: Heng Zhang <heng.z@xxxxxxxxxx>
+ * Lihao Liang <lianglihao@xxxxxxxxxx>
+ */
+
#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/prcu.h>
@@ -8,8 +22,16 @@
#include "rcu.h"
+/* Data structures. */
+
+/*
+ * Initialize PRCU's per-CPU local structure.
+ */
DEFINE_PER_CPU_SHARED_ALIGNED(struct prcu_local_struct, prcu_local);
+/*
+ * Initialize PRCU's global structure.
+ */
struct prcu_struct global_prcu = {
.global_version = ATOMIC64_INIT(0),
.cb_version = ATOMIC64_INIT(0),
@@ -20,7 +42,9 @@ struct prcu_struct global_prcu = {
};
struct prcu_struct *prcu = &global_prcu;
-/* Initialize simple callback list. */
+/*
+ * Initialize simple PRCU callback list.
+ */
static void prcu_cblist_init(struct prcu_cblist *rclp)
{
rclp->head = NULL;
@@ -31,8 +55,8 @@ static void prcu_cblist_init(struct prcu_cblist *rclp)
}
/*
- * Dequeue the oldest rcu_head structure from the specified callback list;
- * store the callback grace period version number into the version pointer.
+ * Dequeue the oldest rcu_head structure from the specified callback list.
+ * Store the callback version number into the version pointer.
*/
static struct rcu_head *prcu_cblist_dequeue(struct prcu_cblist *rclp)
{
@@ -59,6 +83,11 @@ static struct rcu_head *prcu_cblist_dequeue(struct prcu_cblist *rclp)
return rhp;
}
+/* PRCU function implementations. */
+
+/*
+ * Update local PRCU state of the current CPU.
+ */
static inline void prcu_report(struct prcu_local_struct *local)
{
unsigned long long global_version;
@@ -70,6 +99,15 @@ static inline void prcu_report(struct prcu_local_struct *local)
cmpxchg(&local->version, local_version, global_version);
}
+/*
+ * Mark the beginning of a PRCU read-side critical section.
+ *
+ * A PRCU quiescent state of a CPU is when its local ->locked and
+ * ->online variables become 0.
+ *
+ * See prcu_read_unlock() and synchronize_prcu() for more information.
+ * Also see rcu_read_lock() comment header.
+ */
void prcu_read_lock(void)
{
struct prcu_local_struct *local;
@@ -77,29 +115,50 @@ void prcu_read_lock(void)
local = get_cpu_ptr(&prcu_local);
if (!local->online) {
WRITE_ONCE(local->online, 1);
+ /*
+ * Memory barrier is needed for PRCU writers
+ * to see the updated local->online value.
+ */
smp_mb();
}
-
local->locked++;
+ /*
+ * Critical section after entry code.
+ * put_cpu_ptr() provides the needed barrier().
+ */
put_cpu_ptr(&prcu_local);
}
EXPORT_SYMBOL(prcu_read_lock);
+/*
+ * Mark the end of a PRCU read-side critical section.
+ *
+ * See prcu_read_lock() and synchronize_prcu() for more information.
+ * Also see rcu_read_unlock() comment header.
+ */
void prcu_read_unlock(void)
{
int locked;
struct prcu_local_struct *local;
- barrier();
+ barrier(); /* Critical section before exit code. */
local = get_cpu_ptr(&prcu_local);
locked = local->locked;
if (locked) {
local->locked--;
+ /*
+ * If we are executing the last PRCU task,
+ * update the CPU-local PRCU state.
+ */
if (locked == 1)
prcu_report(local);
put_cpu_ptr(&prcu_local);
} else {
put_cpu_ptr(&prcu_local);
+ /*
+ * If we are executing the last outstanding
+ * PRCU task, wake up synchronize_prcu().
+ */
if (!atomic_dec_return(&prcu->active_ctr))
wake_up(&prcu->wait_q);
}
@@ -111,10 +170,25 @@ static void prcu_handler(void *info)
struct prcu_local_struct *local;
local = this_cpu_ptr(&prcu_local);
+ /*
+ * We need to do this check locally on the current CPU
+ * because no memory barrier is used for ->locked so
+ * PRCU writers may not see its latest local value.
+ */
if (!local->locked)
WRITE_ONCE(local->version, atomic64_read(&prcu->global_version));
}
+/*
+ * Wait until a grace period has completed.
+ *
+ * A PRCU grace period can end if each CPU has passed a PRCU quiescent state
+ * -and- the global variable ->active_ctr is 0, that is all pre-existing
+ * PRCU read-side critical sections have completed.
+ *
+ * See prcu_read_lock() and prcu_read_unlock() for more information.
+ * Also see synchronize_rcu() comment header.
+ */
void synchronize_prcu(void)
{
int cpu;
@@ -122,7 +196,13 @@ void synchronize_prcu(void)
unsigned long long version;
struct prcu_local_struct *local;
+ /*
+ * Get the new global grace-period version before taking the mutex,
+ * which allows multiple synchronize_prcu() calls spreading PRCU
+ * readers to return in a timely fashion.
+ */
version = atomic64_add_return(1, &prcu->global_version);
+ /* Take mutex to serialize concurrent synchronize_prcu() calls. */
mutex_lock(&prcu->mtx);
local = get_cpu_ptr(&prcu_local);
@@ -130,8 +210,14 @@ void synchronize_prcu(void)
put_cpu_ptr(&prcu_local);
cpumask_clear(&cpus);
+ /* Send an IPI to force straggling CPUs to update their PRCU state. */
for_each_possible_cpu(cpu) {
local = per_cpu_ptr(&prcu_local, cpu);
+ /*
+ * If no PRCU tasks are currently running on this CPU
+ * or a context-switch has occurred, the CPU-local PRCU
+ * state has already been updated.
+ */
if (!READ_ONCE(local->online))
continue;
if (READ_ONCE(local->version) < version) {
@@ -140,34 +226,46 @@ void synchronize_prcu(void)
}
}
+ /* Wait for outstanding CPUs to commit. */
for_each_cpu(cpu, &cpus) {
local = per_cpu_ptr(&prcu_local, cpu);
while (READ_ONCE(local->version) < version)
cpu_relax();
}
+ /* Wait for outstanding PRCU tasks to finish. */
if (atomic_read(&prcu->active_ctr))
wait_event(prcu->wait_q, !atomic_read(&prcu->active_ctr));
-
+ /* Update the global callback version to its grace-period version. */
atomic64_set(&prcu->cb_version, version);
mutex_unlock(&prcu->mtx);
}
EXPORT_SYMBOL(synchronize_prcu);
+/*
+ * Update PRCU state when a context-switch occurs.
+ */
void prcu_note_context_switch(void)
{
struct prcu_local_struct *local;
local = get_cpu_ptr(&prcu_local);
+ /* Update local and global outstanding PRCU task number. */
if (local->locked) {
atomic_add(local->locked, &prcu->active_ctr);
local->locked = 0;
}
+ /* Indicate a context-switch has occurred on this CPU. */
local->online = 0;
+ /* Update this CPU's local PRCU state. */
prcu_report(local);
put_cpu_ptr(&prcu_local);
}
+/*
+ * Queue a PRCU callback to the current CPU for invocation
+ * after a grace period.
+ */
void call_prcu(struct rcu_head *head, rcu_callback_t func)
{
unsigned long flags;
@@ -177,8 +275,12 @@ void call_prcu(struct rcu_head *head, rcu_callback_t func)
debug_rcu_head_queue(head);
- /* Use GFP_ATOMIC with IRQs disabled */
+ /* Use GFP_ATOMIC with IRQs disabled. */
vhp = kmalloc(sizeof(struct prcu_version_head), GFP_ATOMIC);
+ /*
+ * Complain about kmalloc() failure. This could be handled
+ * in a different way, e.g. return -1 to inform the caller.
+ */
if (!vhp) {
WARN_ON(1);
return;
@@ -188,8 +290,13 @@ void call_prcu(struct rcu_head *head, rcu_callback_t func)
head->next = NULL;
vhp->next = NULL;
+ /* Disable IRQs to prevent races with prcu_process_callbacks(). */
local_irq_save(flags);
local = this_cpu_ptr(&prcu_local);
+ /*
+ * Assign the CPU-local callback version to the given callback
+ * and add it to the PRCU callback list of the current CPU.
+ */
vhp->version = local->version;
rclp = &local->cblist;
rclp->len++;
@@ -201,6 +308,13 @@ void call_prcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL(call_prcu);
+/*
+ * Check to see if there is any immediate PRCU-related work
+ * to be done by the current CPU, returning 1 if so.
+ *
+ * Currently, it only checks whether this CPU has callbacks
+ * that are ready to invoke.
+ */
int prcu_pending(void)
{
struct prcu_local_struct *local = get_cpu_ptr(&prcu_local);
@@ -211,18 +325,33 @@ int prcu_pending(void)
return cb_version < atomic64_read(&prcu->cb_version) && rclp->head;
}
+/*
+ * Perform PRCU core processing for the current CPU using softirq.
+ */
void invoke_prcu_core(void)
{
if (cpu_online(smp_processor_id()))
raise_softirq(PRCU_SOFTIRQ);
}
+/*
+ * Schedule PRCU core processing.
+ *
+ * This function must be called from hardirq context.
+ * It is normally invoked from the scheduling-clock interrupt.
+ */
void prcu_check_callbacks(void)
{
if (prcu_pending())
invoke_prcu_core();
}
+/*
+ * Process PRCU callbacks whose grace period has completed.
+ * Do this using softirq for each CPU.
+ *
+ * Also see the prcu_barrier() comment header.
+ */
static __latent_entropy void prcu_process_callbacks(struct softirq_action *unused)
{
unsigned long flags;
@@ -237,18 +366,24 @@ static __latent_entropy void prcu_process_callbacks(struct softirq_action *unuse
cb_version = atomic64_read(&prcu->cb_version);
- /* Disable interrupts to prevent races with call_prcu() */
+ /* Disable IRQs to prevent races with call_prcu(). */
local_irq_save(flags);
local = this_cpu_ptr(&prcu_local);
rclp = &local->cblist;
rhp = rclp->head;
vhp = rclp->version_head;
+ /*
+ * Process PRCU callbacks with version number smaller
+ * than the global PRCU callback version whose associated
+ * grace periods have completed.
+ */
for (; rhp && vhp && vhp->version < cb_version;
rhp = rclp->head, vhp = rclp->version_head) {
rhp = prcu_cblist_dequeue(rclp);
debug_rcu_head_unqueue(rhp);
rhp->func(rhp);
}
+ /* Record the version number of callbacks to be processed. */
local->cb_version = cb_version;
local_irq_restore(flags);
}
@@ -274,7 +409,18 @@ static void prcu_barrier_func(void *info)
call_prcu(&local->barrier_head, prcu_barrier_callback);
}
-/* Waiting for all PRCU callbacks to complete. */
+/*
+ * Wait for all PRCU callbacks to complete.
+ *
+ * NOTE: The current PRCU implementation relies on synchronize_prcu()
+ * to update its global grace-period and callback version numbers.
+ * If there is no synchronize_prcu() running and call_prcu() is called,
+ * prcu_process_callbacks() won't make progress and prcu_barrier() will
+ * -not- return.
+ *
+ * This needs to be fixed, e.g. using a grace-period expediting mechanism
+ * as found in the Linux-kernel RCU implementation.
+ */
void prcu_barrier(void)
{
int cpu;
@@ -292,9 +438,13 @@ void prcu_barrier(void)
/*
* Register a new callback on each CPU using IPI to prevent races
- * with call_prcu(). When that callback is invoked, we will know
+ * with call_prcu(). When that callback is invoked, we will know
* that all of the corresponding CPU's preceding callbacks have
- * been invoked.
+ * been invoked. Note that we must use the wait version of
+ * smp_call_function_single(). Otherwise prcu_barrier_func()
+ * might not finish incrementing prcu->barrier_cpu_count and
+ * registering prcu_barrier_callback() on -each- CPU before
+ * we exit the loop and wait for completion. Hence a bug!
*/
for_each_possible_cpu(cpu)
smp_call_function_single(cpu, prcu_barrier_func, NULL, 1);
@@ -315,6 +465,9 @@ void prcu_barrier(void)
}
EXPORT_SYMBOL(prcu_barrier);
+/*
+ * Helper function for prcu_init() to initialize PRCU's CPU-local structure.
+ */
void prcu_init_local_struct(int cpu)
{
struct prcu_local_struct *local;
@@ -327,6 +480,9 @@ void prcu_init_local_struct(int cpu)
prcu_cblist_init(&local->cblist);
}
+/*
+ * Initialize PRCU at boot time.
+ */
void __init prcu_init(void)
{
int cpu;
--
2.14.1.729.g59c0ea183