[RFC] Make need_resched() return true when rcu_urgent_qs requested
From: David Woodhouse
Date: Fri Jul 06 2018 - 10:53:41 EST
In 4.15 without CONFIG_PREEMPT we observed expand_fdtable() taking
about 10 seconds for synchronize_sched() to complete, when most of the
other threads were running KVM guests.
In vcpu_run() there's a loop with the fairly common construct:
if (need_resched()) {
... local unlocks ...
cond_resched();
... local locks ...
}
But because need_resched() wasn't true (until half the RCU warning time
was completed and rcu_implicit_dynticks_qs() calls resched_cpu()), that
never happens and cond_resched() is never called. In cond_resched()
there is an unconditional call to rcu_all_qs() which would DTRT.
Now, there's a simple way to fix it for the specific case of KVM — we
can find a place we can just call rcu_all_qs(), something like this:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00520711..a304693 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7214,6 +7214,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
smp_mb__after_srcu_read_unlock();
+ /* Force quiescent state (if requested) before entering guest mode */
+ rcu_all_qs();
+
/*
* This handles the case where a posted interrupt was
* notified with kvm_vcpu_kick.
But I wonder if we should attempt to fix the general case by making
need_resched() return true when an RCU quiescent state is needed. To do
that without having an out-of-line function call in kernel/rcu/tree.c
would look something like the patch below. Paul, did you say you had
other ideas about how to export/inline it?
Alternatively — or perhaps additionally — shouldn't CPUs which are
currently in guest mode be counted as quiescent anyway? Or is that
something we'll only ever want to do in full NOHZ mode?
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index b3dbf95..2f8a3bd 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -126,6 +126,7 @@ static inline bool rcu_is_watching(void) { return true; }
/* Avoid RCU read-side critical sections leaking across. */
static inline void rcu_all_qs(void) { barrier(); }
+static inline bool rcu_urgent_qs_requested(void) { return false; }
/* RCUtree hotplug events */
#define rcutree_prepare_cpu NULL
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 37d6fd3..d20b987 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,6 +30,36 @@
#ifndef __LINUX_RCUTREE_H
#define __LINUX_RCUTREE_H
+/*
+ * Dynticks per-CPU state.
+ */
+struct rcu_dynticks {
+ long long dynticks_nesting; /* Track irq/process nesting level. */
+ /* Process level is worth LLONG_MAX/2. */
+ int dynticks_nmi_nesting; /* Track NMI nesting level. */
+ atomic_t dynticks; /* Even value for idle, else odd. */
+ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
+ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
+ bool rcu_urgent_qs; /* GP old need light quiescent state. */
+#ifdef CONFIG_RCU_FAST_NO_HZ
+ bool all_lazy; /* Are all CPU's CBs lazy? */
+ unsigned long nonlazy_posted;
+ /* # times non-lazy CBs posted to CPU. */
+ unsigned long nonlazy_posted_snap;
+ /* idle-period nonlazy_posted snapshot. */
+ unsigned long last_accelerate;
+ /* Last jiffy CBs were accelerated. */
+ unsigned long last_advance_all;
+ /* Last jiffy CBs were all advanced. */
+ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
+#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+};
+DECLARE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
+static __always_inline bool rcu_urgent_qs_requested(void)
+{
+ return unlikely(raw_cpu_read(rcu_dynticks.rcu_urgent_qs));
+}
+
void rcu_note_context_switch(bool preempt);
int rcu_needs_cpu(u64 basem, u64 *nextevt);
void rcu_cpu_stall_reset(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e4d4e60..89f5814 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1616,7 +1616,8 @@ static inline int spin_needbreak(spinlock_t *lock)
static __always_inline bool need_resched(void)
{
- return unlikely(tif_need_resched());
+ return unlikely(tif_need_resched()) ||
+ rcu_urgent_qs_requested();
}
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f9c0ca2..cf1c66c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -264,10 +264,11 @@ void rcu_bh_qs(void)
#define rcu_eqs_special_exit() do { } while (0)
#endif
-static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
};
+EXPORT_SYMBOL(rcu_dynticks); /* for need_resched() */
/*
* There's a few places, currently just in the tracing infrastructure,
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 46a5d19..462b25b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -34,31 +34,6 @@
#include "rcu_segcblist.h"
-/*
- * Dynticks per-CPU state.
- */
-struct rcu_dynticks {
- long long dynticks_nesting; /* Track irq/process nesting level. */
- /* Process level is worth LLONG_MAX/2. */
- int dynticks_nmi_nesting; /* Track NMI nesting level. */
- atomic_t dynticks; /* Even value for idle, else odd. */
- bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
- unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
- bool rcu_urgent_qs; /* GP old need light quiescent state. */
-#ifdef CONFIG_RCU_FAST_NO_HZ
- bool all_lazy; /* Are all CPU's CBs lazy? */
- unsigned long nonlazy_posted;
- /* # times non-lazy CBs posted to CPU. */
- unsigned long nonlazy_posted_snap;
- /* idle-period nonlazy_posted snapshot. */
- unsigned long last_accelerate;
- /* Last jiffy CBs were accelerated. */
- unsigned long last_advance_all;
- /* Last jiffy CBs were all advanced. */
- int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-};
-
/* RCU's kthread states for tracing. */
#define RCU_KTHREAD_STOPPED 0
#define RCU_KTHREAD_RUNNING 1
--
dwmw2