Re: [PATCH RFC] rcu: Permit limiting of force_quiescent_state() latency

From: Mike Galbraith
Date: Sun Mar 18 2012 - 11:00:40 EST


On Sat, 2012-03-17 at 17:57 +0100, Mike Galbraith wrote:

> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -1316,47 +1316,72 @@ void rcu_check_callbacks(int cpu, int us
> * have not yet encountered a quiescent state, using the function specified.
> * Also initiate boosting for any threads blocked on the root rcu_node.
> *
> + * Returns 0 if the scan could not be completed, 1 otherwise.
> * The caller must have suppressed start of new grace periods.
> */
> -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
> +static int force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
> {
> unsigned long bit;
> int cpu;
> + int cpusvisited = 0, done = 1;
> unsigned long flags;
> unsigned long mask;
> struct rcu_node *rnp;
>
> rcu_for_each_leaf_node(rsp, rnp) {
> + if (rnp->grphi < rsp->fqs_next_cpu)
> + continue;
> + if (rnp->grplo > rcu_get_max_cpu())
> + continue;

...

> if (mask != 0) {
>
> /* rcu_report_qs_rnp() releases rnp->lock. */
> rcu_report_qs_rnp(mask, rsp, rnp, flags);
> - continue;
> + if (cpusvisited < FQS_MAX_CPU_VISIT) deadlock: if we don't continue here..
> + continue;
> }
> raw_spin_unlock_irqrestore(&rnp->lock, flags); ..we fall through and _thoroughly_ unlock a lock rcu_report_qs_rnp() has already released.
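
(Toy illustration of that fall-through, not rcutree.c code: once a helper has released the caller's lock, the caller must skip its own unlock. Everything below is made up; only the control-flow shape matches the quoted hunk.)

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for rcu_report_qs_rnp(): releases the lock on the caller's behalf. */
static void report_and_unlock(void)
{
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	int i;

	for (i = 0; i < 4; i++) {
		pthread_mutex_lock(&lock);
		if (i & 1) {
			report_and_unlock();	/* lock already released here */
			continue;		/* without this we fall through and ... */
		}
		pthread_mutex_unlock(&lock);	/* ... unlock an already-unlocked mutex */
	}
	puts("ok");
	return 0;
}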


Boots/runs. Altered so it no longer bails out in the middle of a leaf node's CPU group.

I tried a synchronized jitter test on 45 cores; the patches made diddly spit
difference, as there's too much other noise. I also applied both patches to
RT and tested on 60 cores, which likewise made little if any difference,
but then jitter is already single-digit (usecs) running this test on the RT
kernel. With a NOPREEMPT kernel, jitter is low triple-digit (usecs) worst
case with or without the patches.
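
For anyone skimming rather than reading the diff: stripped of the RCU machinery, the change is just a scan cursor plus a per-pass visit budget. A minimal standalone sketch of that idea (names and numbers below are mine, not the patch's):

#include <stdio.h>

#define NR_CPUS		64	/* pretend machine size */
#define FQS_MAX_VISIT	16	/* per-pass budget, playing the role of CONFIG_ARCH_RCU_FQS_LIMIT */

static int next_cpu;		/* where the next pass resumes, like rsp->fqs_next_cpu */

/* Returns 1 once every CPU has been covered, 0 if this pass ran out of budget. */
static int scan_pass(void)
{
	int visited = 0;
	int cpu;

	for (cpu = next_cpu; cpu < NR_CPUS; cpu++) {
		/* ... per-CPU quiescent-state check would go here ... */
		next_cpu = cpu + 1;
		if (++visited >= FQS_MAX_VISIT && next_cpu < NR_CPUS)
			return 0;	/* budget spent, more CPUs left */
	}
	next_cpu = 0;			/* full sweep done; restart from CPU 0 next time */
	return 1;
}

int main(void)
{
	int passes = 1;

	while (!scan_pass())
		passes++;
	printf("covered %d CPUs in %d passes of at most %d\n", NR_CPUS, passes, FQS_MAX_VISIT);
	return 0;
}

In the real patch the budget check sits after each leaf rcu_node, so a group is never split mid-scan, and an incomplete pass pulls jiffies_force_qs back so the next force_quiescent_state() happens promptly.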

---
arch/x86/Kconfig | 4 ++++
kernel/rcutree.c | 49 ++++++++++++++++++++++++++++++++++++-------------
kernel/rcutree.h | 16 ++++++++++++++++
3 files changed, 56 insertions(+), 13 deletions(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -254,6 +254,10 @@ config ARCH_CPU_PROBE_RELEASE
def_bool y
depends on HOTPLUG_CPU && !XEN

+config ARCH_RCU_FQS_LIMIT
+ int
+ default 16
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"

--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1316,47 +1316,65 @@ void rcu_check_callbacks(int cpu, int us
* have not yet encountered a quiescent state, using the function specified.
* Also initiate boosting for any threads blocked on the root rcu_node.
*
+ * Returns 0 if the scan could not be completed, 1 otherwise.
* The caller must have suppressed start of new grace periods.
*/
-static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
+static int force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
{
unsigned long bit;
int cpu;
+ int cpusvisited = 0, done = 1;
unsigned long flags;
unsigned long mask;
struct rcu_node *rnp;

rcu_for_each_leaf_node(rsp, rnp) {
+ if (rnp->grphi < rsp->fqs_next_cpu)
+ continue;
+ if (rnp->grplo > rcu_get_max_cpu())
+ continue;
mask = 0;
raw_spin_lock_irqsave(&rnp->lock, flags);
if (!rcu_gp_in_progress(rsp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
+ return done;
}
if (rnp->qsmask == 0) {
rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
continue;
}
- cpu = rnp->grplo;
bit = 1;
- for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
- if ((rnp->qsmask & bit) != 0 &&
- f(per_cpu_ptr(rsp->rda, cpu)))
+ rsp->fqs_next_cpu = rnp->grphi + 1;
+ cpusvisited += rnp->grphi - rnp->grplo;
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, bit <<= 1) {
+ if ((rnp->qsmask & bit) == 0)
+ continue;
+ if (f(per_cpu_ptr(rsp->rda, cpu)))
mask |= bit;
}
- if (mask != 0) {

+ if (mask != 0) {
/* rcu_report_qs_rnp() releases rnp->lock. */
rcu_report_qs_rnp(mask, rsp, rnp, flags);
- continue;
+ } else
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+ if (cpusvisited >= FQS_MAX_CPU_VISIT) {
+ if (FQS_MAX_CPU_VISIT != NR_CPUS)
+ done = 0;
+ break;
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
rnp = rcu_get_root(rsp);
if (rnp->qsmask == 0) {
raw_spin_lock_irqsave(&rnp->lock, flags);
rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
}
+
+ if (done)
+ rsp->fqs_next_cpu = 0;
+
+ return done;
}

/*
@@ -1366,6 +1384,7 @@ static void force_qs_rnp(struct rcu_stat
static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
{
unsigned long flags;
+ int retval;
struct rcu_node *rnp = rcu_get_root(rsp);

if (!rcu_gp_in_progress(rsp))
@@ -1398,21 +1417,25 @@ static void force_quiescent_state(struct
raw_spin_unlock(&rnp->lock); /* irqs remain disabled */

/* Record dyntick-idle state. */
- force_qs_rnp(rsp, dyntick_save_progress_counter);
+ retval = force_qs_rnp(rsp, dyntick_save_progress_counter);
raw_spin_lock(&rnp->lock); /* irqs already disabled */
- if (rcu_gp_in_progress(rsp))
+ if (rcu_gp_in_progress(rsp) && retval) {
rsp->signaled = RCU_FORCE_QS;
+ rsp->fqs_next_cpu = 0;
+ } else if (rcu_gp_in_progress(rsp))
+ rsp->jiffies_force_qs = jiffies - 1;
break;

case RCU_FORCE_QS:

/* Check dyntick-idle state, send IPI to laggarts. */
raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
- force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
+ retval = force_qs_rnp(rsp, rcu_implicit_dynticks_qs);

/* Leave state in case more forcing is required. */
-
raw_spin_lock(&rnp->lock); /* irqs already disabled */
+ if (rcu_gp_in_progress(rsp) && !retval)
+ rsp->jiffies_force_qs = jiffies - 1;
break;
}
rsp->fqs_active = 0;
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -354,6 +354,21 @@ do { \
} while (0)

/*
+ * Large latency-sensitive configurations can limit force_quiescent_state()
+ * latencies by defining CONFIG_ARCH_RCU_FQS_LIMIT. This should be
+ * sized based on that architecture's cache-miss latency and the maximum
+ * desired force_quiescent_state() latency. For example, if the cache-miss
+ * latency was 100 nanoseconds, and the maximum force_quiescent_state()
+ * latency contribution was 5 microseconds, then that architecture should
+ * define CONFIG_ARCH_RCU_FQS_LIMIT to be 50.
+ */
+#ifdef CONFIG_ARCH_RCU_FQS_LIMIT
+#define FQS_MAX_CPU_VISIT CONFIG_ARCH_RCU_FQS_LIMIT
+#else /* #ifdef CONFIG_ARCH_RCU_FQS_LIMIT */
+#define FQS_MAX_CPU_VISIT NR_CPUS
+#endif /* #else #ifdef CONFIG_ARCH_RCU_FQS_LIMIT */
+
+/*
* RCU global state, including node hierarchy. This hierarchy is
* represented in "heap" form in a dense array. The root (first level)
* of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
@@ -391,6 +406,7 @@ struct rcu_state {
/* starting new GP. */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
+ int fqs_next_cpu; /* Next CPU for fqs to scan. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
unsigned long n_force_qs; /* Number of calls to */
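
As a footnote to the sizing comment added to rcutree.h above: the rule is just the latency budget divided by the per-CPU cost. Restated as (hypothetical) macros using the comment's own example figures, nothing measured here:

/* Example only: the 100 ns and 5 usec figures are the ones quoted in the comment above. */
#define EXAMPLE_CACHE_MISS_NS	100	/* assumed per-CPU cache-miss cost */
#define EXAMPLE_FQS_BUDGET_NS	5000	/* assumed fqs latency budget (5 usec) */
#define EXAMPLE_FQS_LIMIT	(EXAMPLE_FQS_BUDGET_NS / EXAMPLE_CACHE_MISS_NS)	/* == 50 */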

