[PATCH 19/32] nohz/cpuset: Account user and system times in adaptive nohz mode

From: Frederic Weisbecker
Date: Wed Mar 21 2012 - 10:03:35 EST


If we are not running the tick, we no longer account the user/system
cputime regularly on every jiffy.

To solve this, save a snapshot of jiffies when we stop the tick and
keep track of the context it was saved in: user or system. Using that
snapshot, we account the elapsed cputime whenever we cross the kernel
entry/exit boundaries and when we restart the tick.

Signed-off-by: Frederic Weisbecker <fweisbec@xxxxxxxxx>
Cc: Alessio Igor Bogani <abogani@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Avi Kivity <avi@xxxxxxxxxx>
Cc: Chris Metcalf <cmetcalf@xxxxxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxx>
Cc: Daniel Lezcano <daniel.lezcano@xxxxxxxxxx>
Cc: Geoff Levand <geoff@xxxxxxxxxxxxx>
Cc: Gilad Ben Yossef <gilad@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Max Krasnyansky <maxk@xxxxxxxxxxxx>
Cc: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Stephen Hemminger <shemminger@xxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Sven-Thorsten Dietrich <thebigcorporation@xxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Zen Lin <zen@xxxxxxxxxxxxxx>
---
include/linux/tick.h | 12 ++++
kernel/sched/core.c | 1 +
kernel/time/tick-sched.c | 131 +++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 142 insertions(+), 2 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 03b6edd..598b492 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -153,11 +153,23 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
# endif /* !NO_HZ */

#ifdef CONFIG_CPUSETS_NO_HZ
+extern void tick_nohz_enter_kernel(void);
+extern void tick_nohz_exit_kernel(void);
+extern void tick_nohz_enter_exception(struct pt_regs *regs);
+extern void tick_nohz_exit_exception(struct pt_regs *regs);
extern void tick_nohz_check_adaptive(void);
+extern void tick_nohz_pre_schedule(void);
extern void tick_nohz_post_schedule(void);
+extern bool tick_nohz_account_tick(void);
#else /* !CPUSETS_NO_HZ */
+static inline void tick_nohz_enter_kernel(void) { }
+static inline void tick_nohz_exit_kernel(void) { }
+static inline void tick_nohz_enter_exception(struct pt_regs *regs) { }
+static inline void tick_nohz_exit_exception(struct pt_regs *regs) { }
static inline void tick_nohz_check_adaptive(void) { }
+static inline void tick_nohz_pre_schedule(void) { }
static inline void tick_nohz_post_schedule(void) { }
+static inline bool tick_nohz_account_tick(void) { return false; }
#endif /* CPUSETS_NO_HZ */

#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eca842e..5debfd7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1923,6 +1923,7 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
+ tick_nohz_pre_schedule();
sched_info_switch(prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 9359e6c..ff78126 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -526,7 +526,13 @@ static bool can_stop_adaptive_tick(void)

static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts)
{
+ struct pt_regs *regs = get_irq_regs();
int cpu = smp_processor_id();
+ int was_stopped;
+ int user = 0;
+
+ if (regs)
+ user = user_mode(regs);

if (!cpuset_adaptive_nohz() || is_idle_task(current))
return;
@@ -537,7 +543,36 @@ static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts)
if (!can_stop_adaptive_tick())
return;

+ /*
+ * If we stop the tick between the syscall exit hook and the actual
+ * return to userspace, we'll think we are in system space (due to
+ * user_mode() thinking so). And since we passed the syscall exit hook
+ * already we won't realize we are in userspace. So the time spent
+ * tickless would be spuriously accounted as belonging to system.
+ *
+ * To avoid this kind of problem, we only stop the tick from userspace
+ * (until we find a better solution).
+ * We can later enter the kernel and keep the tick stopped. But the place
+ * where we stop the tick must be userspace.
+ * We make an exception for kernel threads since they always execute in
+ * kernel space.
+ */
+ if (!user && current->mm)
+ return;
+
+ was_stopped = ts->tick_stopped;
tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+
+ if (!was_stopped && ts->tick_stopped) {
+ WARN_ON_ONCE(ts->saved_jiffies_whence != JIFFIES_SAVED_NONE);
+ if (user)
+ ts->saved_jiffies_whence = JIFFIES_SAVED_USER;
+ else if (!current->mm)
+ ts->saved_jiffies_whence = JIFFIES_SAVED_SYS;
+
+ ts->saved_jiffies = jiffies;
+ set_thread_flag(TIF_NOHZ);
+ }
}
#else
static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts) { }
@@ -862,6 +897,70 @@ void tick_check_idle(int cpu)
}

#ifdef CONFIG_CPUSETS_NO_HZ
+void tick_nohz_exit_kernel(void)
+{
+ unsigned long flags;
+ struct tick_sched *ts;
+ unsigned long delta_jiffies;
+
+ local_irq_save(flags);
+
+ ts = &__get_cpu_var(tick_cpu_sched);
+
+ if (!ts->tick_stopped) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ WARN_ON_ONCE(ts->saved_jiffies_whence != JIFFIES_SAVED_SYS);
+
+ delta_jiffies = jiffies - ts->saved_jiffies;
+ account_system_ticks(current, delta_jiffies);
+
+ ts->saved_jiffies = jiffies;
+ ts->saved_jiffies_whence = JIFFIES_SAVED_USER;
+
+ local_irq_restore(flags);
+}
+
+void tick_nohz_enter_kernel(void)
+{
+ unsigned long flags;
+ struct tick_sched *ts;
+ unsigned long delta_jiffies;
+
+ local_irq_save(flags);
+
+ ts = &__get_cpu_var(tick_cpu_sched);
+
+ if (!ts->tick_stopped) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ WARN_ON_ONCE(ts->saved_jiffies_whence != JIFFIES_SAVED_USER);
+
+ delta_jiffies = jiffies - ts->saved_jiffies;
+ account_user_ticks(current, delta_jiffies);
+
+ ts->saved_jiffies = jiffies;
+ ts->saved_jiffies_whence = JIFFIES_SAVED_SYS;
+
+ local_irq_restore(flags);
+}
+
+void tick_nohz_enter_exception(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ tick_nohz_enter_kernel();
+}
+
+void tick_nohz_exit_exception(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ tick_nohz_exit_kernel();
+}
+
/*
* Take the timer duty if nobody is taking care of it.
* If a CPU already does and and it's in a nohz cpuset,
@@ -880,13 +979,22 @@ static void tick_do_timer_check_handler(int cpu)
}
}

+static void tick_nohz_restart_adaptive(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ tick_nohz_account_ticks(ts);
+ tick_nohz_restart_sched_tick();
+ clear_thread_flag(TIF_NOHZ);
+}
+
void tick_nohz_check_adaptive(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

if (ts->tick_stopped && !is_idle_task(current)) {
if (!can_stop_adaptive_tick())
- tick_nohz_restart_sched_tick();
+ tick_nohz_restart_adaptive();
}
}

@@ -898,6 +1006,26 @@ void cpuset_exit_nohz_interrupt(void *unused)
tick_nohz_restart_adaptive();
}

+/*
+ * Flush cputime and clear hooks before context switch in case we
+ * haven't yet received the IPI that should take care of that.
+ */
+void tick_nohz_pre_schedule(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ /*
+ * We are holding the rq lock, and if we restarted the tick now
+ * we could deadlock by acquiring that lock twice. Instead, the
+ * restart is done at post-schedule time. For now, only do the
+ * cleanups on the prev task.
+ */
+ if (ts->tick_stopped) {
+ tick_nohz_account_ticks(ts);
+ clear_thread_flag(TIF_NOHZ);
+ }
+}
+
void tick_nohz_post_schedule(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
@@ -910,7 +1038,6 @@ void tick_nohz_post_schedule(void)
if (ts->tick_stopped)
tick_nohz_restart_sched_tick();
}
-
#else

static void tick_do_timer_check_handler(int cpu)
--
1.7.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/