[PATCH 31/32] nohz: Exit RCU idle mode when we schedule before resuming userspace

From: Frederic Weisbecker
Date: Wed Mar 21 2012 - 10:01:04 EST


When a CPU running tickless resumes userspace, it enters RCU idle
mode. But if we are preempted on kernel exit through an explicit
call to schedule(), after we entered RCU idle mode but before we
actually resumed userspace, we need to re-enable RCU, both in case
schedule() makes use of RCU read-side critical sections and for
the next task to be scheduled.

NOTE: If we are preempted while running adaptive tickless, we will
receive an IPI that exits RCU idle mode for us. So this patch is
useful only when that IPI arrives too late.

Signed-off-by: Frederic Weisbecker <fweisbec@xxxxxxxxx>
Cc: Alessio Igor Bogani <abogani@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Avi Kivity <avi@xxxxxxxxxx>
Cc: Chris Metcalf <cmetcalf@xxxxxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxx>
Cc: Daniel Lezcano <daniel.lezcano@xxxxxxxxxx>
Cc: Geoff Levand <geoff@xxxxxxxxxxxxx>
Cc: Gilad Ben Yossef <gilad@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Max Krasnyansky <maxk@xxxxxxxxxxxx>
Cc: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Stephen Hemminger <shemminger@xxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Sven-Thorsten Dietrich <thebigcorporation@xxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Zen Lin <zen@xxxxxxxxxxxxxx>
---
arch/x86/kernel/entry_64.S | 8 ++++----
include/linux/tick.h | 3 ++-
kernel/sched/core.c | 14 ++++++++++++++
kernel/time/tick-sched.c | 9 ++++++---
4 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 54f269c..c86d963 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -522,7 +522,7 @@ sysret_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ call schedule_user
popq_cfi %rdi
jmp sysret_check

@@ -630,7 +630,7 @@ int_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ call schedule_user
popq_cfi %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -898,7 +898,7 @@ retint_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ call schedule_user
popq_cfi %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -1398,7 +1398,7 @@ paranoid_userspace:
paranoid_schedule:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
+ call schedule_user
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
jmp paranoid_userspace
diff --git a/include/linux/tick.h b/include/linux/tick.h
index e2a49ad..93add37 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -162,7 +162,7 @@ extern void tick_nohz_exit_exception(struct pt_regs *regs);
extern void tick_nohz_check_adaptive(void);
extern void tick_nohz_pre_schedule(void);
extern void tick_nohz_post_schedule(void);
-extern void tick_nohz_cpu_exit_qs(void);
+extern void tick_nohz_cpu_exit_qs(bool irq);
extern bool tick_nohz_account_tick(void);
extern void tick_nohz_flush_current_times(bool restart_tick);
#else /* !CPUSETS_NO_HZ */
@@ -173,6 +173,7 @@ static inline void tick_nohz_exit_exception(struct pt_regs *regs) { }
static inline void tick_nohz_check_adaptive(void) { }
static inline void tick_nohz_pre_schedule(void) { }
static inline void tick_nohz_post_schedule(void) { }
+static inline void tick_nohz_cpu_exit_qs(bool irq) { }
static inline bool tick_nohz_account_tick(void) { return false; }
#endif /* CPUSETS_NO_HZ */

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5debfd7..cd4cb58 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3358,6 +3358,20 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
}
#endif

+asmlinkage void __sched schedule_user(void)
+{
+ /*
+ * We may arrive here on the kernel exit path, just before
+ * resuming userspace. If we are running tickless, RCU may
+ * already be in idle mode: exit it first, both for the next
+ * task and in case schedule() makes use of RCU itself.
+ */
+ preempt_disable();
+ tick_nohz_cpu_exit_qs(false);
+ preempt_enable_no_resched();
+ schedule();
+}
+
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6c66977..8b6a21b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -962,10 +962,13 @@ void tick_nohz_enter_kernel(void)
local_irq_restore(flags);
}

-void tick_nohz_cpu_exit_qs(void)
+void tick_nohz_cpu_exit_qs(bool irq)
{
if (__get_cpu_var(nohz_task_ext_qs)) {
- rcu_user_exit_irq();
+ if (irq)
+ rcu_user_exit_irq();
+ else
+ rcu_user_exit();
__get_cpu_var(nohz_task_ext_qs) = 0;
}
}
@@ -1005,7 +1008,7 @@ static void tick_nohz_restart_adaptive(void)
tick_nohz_flush_current_times(true);
tick_nohz_restart_sched_tick();
clear_thread_flag(TIF_NOHZ);
- tick_nohz_cpu_exit_qs();
+ tick_nohz_cpu_exit_qs(true);
}

void tick_nohz_check_adaptive(void)
--
1.7.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/