Re: [PATCH 1/2] timer: Avoid waking up an idle-core by migrate running timer

From: Thomas Gleixner
Date: Wed Apr 22 2015 - 17:56:15 EST


On Wed, 22 Apr 2015, Eric Dumazet wrote:
> On Wed, 2015-04-22 at 20:56 +0200, Thomas Gleixner wrote:
> > On Wed, 22 Apr 2015, Eric Dumazet wrote:
> > > Check commit 4a8e320c929991c9480 ("net: sched: use pinned timers")
> > > for a specific example of the problems that can be raised.
> >
> > If you have a problem with the core timer code then it should be fixed
> > there and not worked around in some place which will ruin stuff for
> > power saving interested users. I'm so tired of this 'I fix it in my
> > sandbox' attitude, really. If the core code has a shortcoming we fix
> > it there right away because you are probably not the only one who runs
> > into that shortcoming. So if we don't fix it in the core we end up
> > with a metric ton of slightly different (or broken) workarounds which
> > affect the workload/system characteristics of other people.
> >
> > Just for the record. Even the changelog of this commit is blatantly
> > wrong:
> >
> > "We can see that timers get migrated into a single cpu, presumably
> > idle at the time timers are set up."
>
> Spare me the obvious typo. A 'not' is missing.

:)

> >
> > The timer migration moves timers to non idle cpus to leave the idle
> > ones alone for power saving sake.
> >
> > I can see and understand the reason why you want to avoid that, but I
> > have to ask the question whether this pinning is the correct behaviour
> > under all workloads and system characteristics. If yes, then the patch
> > is the right answer, if no, then it is simply the wrong approach.

I take your 'Awesome' below as a no then.

> > > but /proc/sys/kernel/timer_migration adds a fair overhead in many
> > > workloads.
> > >
> > > get_nohz_timer_target() has to touch 3 cache lines per cpu...
> >
> > And this is something we can fix and completely avoid if we think
> > about it. Looking at the code I have to admit that the out of line
> > call and the sysctl variable lookup is silly. But its not rocket
> > science to fix this.
>
> Awesome.

Here you go. Completely untested, at least it compiles.

Thanks,

tglx
---

Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -343,14 +343,10 @@ extern int runqueue_is_locked(int cpu);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_balance_enter_idle(int cpu);
extern void set_cpu_sd_state_idle(void);
-extern int get_nohz_timer_target(int pinned);
+extern int get_nohz_timer_target(void);
#else
static inline void nohz_balance_enter_idle(int cpu) { }
static inline void set_cpu_sd_state_idle(void) { }
-static inline int get_nohz_timer_target(int pinned)
-{
- return smp_processor_id();
-}
#endif

/*
Index: linux/include/linux/sched/sysctl.h
===================================================================
--- linux.orig/include/linux/sched/sysctl.h
+++ linux/include/linux/sched/sysctl.h
@@ -57,24 +57,12 @@ extern unsigned int sysctl_numa_balancin
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
-extern unsigned int sysctl_timer_migration;
extern unsigned int sysctl_sched_shares_window;

int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos);
#endif
-#ifdef CONFIG_SCHED_DEBUG
-static inline unsigned int get_sysctl_timer_migration(void)
-{
- return sysctl_timer_migration;
-}
-#else
-static inline unsigned int get_sysctl_timer_migration(void)
-{
- return 1;
-}
-#endif

/*
* control realtime throttling:
Index: linux/include/linux/timer.h
===================================================================
--- linux.orig/include/linux/timer.h
+++ linux/include/linux/timer.h
@@ -254,6 +254,16 @@ extern void run_local_timers(void);
struct hrtimer;
extern enum hrtimer_restart it_real_fn(struct hrtimer *);

+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#include <linux/sysctl.h>
+
+extern unsigned int sysctl_timer_migration;
+int timer_migration_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+extern struct static_key timer_migration_enabled;
+#endif
+
unsigned long __round_jiffies(unsigned long j, int cpu);
unsigned long __round_jiffies_relative(unsigned long j, int cpu);
unsigned long round_jiffies(unsigned long j);
Index: linux/kernel/sched/core.c
===================================================================
--- linux.orig/kernel/sched/core.c
+++ linux/kernel/sched/core.c
@@ -593,13 +593,12 @@ void resched_cpu(int cpu)
* selecting an idle cpu will add more delays to the timers than intended
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
*/
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
{
- int cpu = smp_processor_id();
- int i;
+ int i, cpu = smp_processor_id();
struct sched_domain *sd;

- if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+ if (!idle_cpu(cpu))
return cpu;

rcu_read_lock();
@@ -7088,8 +7087,6 @@ void __init sched_init_smp(void)
}
#endif /* CONFIG_SMP */

-const_debug unsigned int sysctl_timer_migration = 1;
-
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
Index: linux/kernel/sysctl.c
===================================================================
--- linux.orig/kernel/sysctl.c
+++ linux/kernel/sysctl.c
@@ -349,15 +349,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "timer_migration",
- .data = &sysctl_timer_migration,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
- },
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
@@ -1132,6 +1123,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ {
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = timer_migration_handler,
+ },
+#endif
{ }
};

Index: linux/kernel/time/hrtimer.c
===================================================================
--- linux.orig/kernel/time/hrtimer.c
+++ linux/kernel/time/hrtimer.c
@@ -190,6 +190,23 @@ hrtimer_check_target(struct hrtimer *tim
#endif
}

+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static inline struct hrtimer_cpu_base *get_target_base(int pinned)
+{
+ if (pinned)
+ return this_cpu_ptr(&hrtimer_bases);
+ if (static_key_true(&timer_migration_enabled))
+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+ return this_cpu_ptr(&hrtimer_bases);
+}
+#else
+
+static inline struct hrtimer_cpu_base *get_target_base(int pinned)
+{
+ return this_cpu_ptr(&hrtimer_bases);
+}
+#endif
+
/*
* Switch the timer base to the current CPU when possible.
*/
@@ -197,14 +214,13 @@ static inline struct hrtimer_clock_base
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
int pinned)
{
+ struct hrtimer_cpu_base *new_cpu_base, *this_base;
struct hrtimer_clock_base *new_base;
- struct hrtimer_cpu_base *new_cpu_base;
- int this_cpu = smp_processor_id();
- int cpu = get_nohz_timer_target(pinned);
int basenum = base->index;

+ this_base = this_cpu_ptr(&hrtimer_bases);
+ new_cpu_base = get_target_base(pinned);
again:
- new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[basenum];

if (base != new_base) {
@@ -225,17 +241,19 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);

- if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
- cpu = this_cpu;
+ if (new_cpu_base != this_base &&
+ hrtimer_check_target(timer, new_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
+ new_cpu_base = this_base;
timer->base = base;
goto again;
}
timer->base = new_base;
} else {
- if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
- cpu = this_cpu;
+ if (new_cpu_base != this_base &&
+ hrtimer_check_target(timer, new_base)) {
+ new_cpu_base = this_base;
goto again;
}
}
Index: linux/kernel/time/timer.c
===================================================================
--- linux.orig/kernel/time/timer.c
+++ linux/kernel/time/timer.c
@@ -104,6 +104,49 @@ EXPORT_SYMBOL(boot_tvec_bases);

static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;

+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+unsigned int sysctl_timer_migration = 1;
+struct static_key timer_migration_enabled = STATIC_KEY_INIT_TRUE;
+
+int timer_migration_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ static DEFINE_MUTEX(mutex);
+ bool keyon;
+ int ret;
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ goto unlock;
+
+ keyon = static_key_enabled(&timer_migration_enabled);
+ if (sysctl_timer_migration && !keyon )
+ static_key_slow_inc(&timer_migration_enabled);
+ else if (!sysctl_timer_migration && keyon)
+ static_key_slow_dec(&timer_migration_enabled);
+
+unlock:
+ mutex_unlock(&mutex);
+ return ret;
+}
+
+static inline struct tvec_base *get_target_base(int pinned)
+{
+ if (pinned)
+ return __this_cpu_read(tvec_bases);
+ if (static_key_true(&timer_migration_enabled))
+ return per_cpu(tvec_bases, get_nohz_timer_target());
+ return __this_cpu_read(tvec_bases);
+}
+#else
+static inline struct tvec_base *get_target_base(int pinned)
+{
+ return __this_cpu_read(tvec_bases);
+}
+#endif
+
/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
{
@@ -774,7 +817,7 @@ __mod_timer(struct timer_list *timer, un
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret = 0 , cpu;
+ int ret = 0;

timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
@@ -787,8 +830,7 @@ __mod_timer(struct timer_list *timer, un

debug_activate(timer, expires);

- cpu = get_nohz_timer_target(pinned);
- new_base = per_cpu(tvec_bases, cpu);
+ new_base = get_target_base(pinned);

if (base != new_base) {
/*
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/