[PATCH 2/2] timer: make deferrable cpu unbound timers really not bound to a cpu

From: Joonwoo Park
Date: Mon Apr 27 2015 - 22:39:32 EST


When a deferrable work (INIT_DEFERRABLE_WORK, etc.) is queued via
queue_delayed_work() it's probably intended to run the work item on any
CPU that isn't idle. However, we queue the work to run at a later time
by starting a deferrable timer that binds to whatever CPU the work is
queued on which is same with queue_delayed_work_on(smp_processor_id())
effectively.

As a result WORK_CPU_UNBOUND work items aren't really cpu unbound now.
In fact this is perfectly fine with UP kernel and also won't affect much a
system without dyntick with SMP kernel too as every cpus run timers
periodically. But on SMP systems with dyntick current implementation leads
deferrable timers not very scalable because the timer's base which has
queued the deferrable timer won't wake up till next non-deferrable timer
expires even though there are possible other non idle cpus are running
which are able to run expired deferrable timers.

The deferrable work is a good example of the current implementation's
victim like below.

INIT_DEFERRABLE_WORK(&dwork, fn);
CPU 0 CPU 1
queue_delayed_work(wq, &dwork, HZ);
queue_delayed_work_on(WORK_CPU_UNBOUND);
...
__mod_timer() -> queues timer to the
current cpu's timer
base.
...
tick_nohz_idle_enter() -> cpu enters idle.
A second later
cpu 0 is now in idle. cpu 1 exits idle or wasn't in idle so
now it's in active but won't
cpu 0 won't wake up till next handle cpu unbound deferrable timer
non-deferrable timer expires. as it's in cpu 0's timer base.

To make all cpu unbound deferrable timers are scalable, introduce a common
timer base which is only for cpu unbound deferrable timers to make those
are indeed cpu unbound so that can be scheduled by tick_do_timer_cpu.
This common timer fixes scalability issue of delayed work and all other cpu
unbound deferrable timer using implementations.

CC: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
CC: John Stultz <john.stultz@xxxxxxxxxx>
CC: Tejun Heo <tj@xxxxxxxxxx>
Signed-off-by: Joonwoo Park <joonwoop@xxxxxxxxxxxxxx>
---
Changes in v3:
* Make only tick_do_timer_cpu to run deferral timer wheel to reduce cache bouncing.

Changes in v4:
* Kill CONFIG_SMP ifdefry.
* Allocate and initialize tvec_base_deferrable at compile time.
* Pin pinned deferrable timer.
* s/deferral/deferrable/

include/linux/timer.h | 14 ++++++-
kernel/time/timer.c | 103 ++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 97 insertions(+), 20 deletions(-)

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 8c5a197..45847ca 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -34,6 +34,9 @@ struct timer_list {
};

extern struct tvec_base boot_tvec_bases;
+#ifdef CONFIG_SMP
+extern struct tvec_base tvec_base_deferrable;
+#endif

#ifdef CONFIG_LOCKDEP
/*
@@ -70,12 +73,21 @@ extern struct tvec_base boot_tvec_bases;

#define TIMER_FLAG_MASK 0x3LU

+#ifdef CONFIG_SMP
+#define __TIMER_BASE(_flags) \
+ ((_flags) & TIMER_DEFERRABLE ? \
+ (unsigned long)&tvec_base_deferrable + (_flags) : \
+ (unsigned long)&boot_tvec_bases + (_flags))
+#else
+#define __TIMER_BASE(_flags) ((unsigned long)&boot_tvec_bases + (_flags))
+#endif
+
#define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
.entry = { .prev = TIMER_ENTRY_STATIC }, \
.function = (_function), \
.expires = (_expires), \
.data = (_data), \
- .base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \
+ .base = (void *)(__TIMER_BASE(_flags)), \
.slack = -1, \
__TIMER_LOCKDEP_MAP_INITIALIZER( \
__FILE__ ":" __stringify(__LINE__)) \
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index e5d5733c..133e94a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
#include <asm/timex.h>
#include <asm/io.h>

+#include "tick-internal.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>

@@ -103,6 +105,9 @@ struct tvec_base boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);

static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+#ifdef CONFIG_SMP
+struct tvec_base tvec_base_deferrable;
+#endif

/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
@@ -662,10 +667,63 @@ static inline void debug_assert_init(struct timer_list *timer)
debug_timer_assert_init(timer);
}

+#ifdef CONFIG_SMP
+static inline struct tvec_base *__get_timer_base(unsigned int flags)
+{
+ if (flags & TIMER_DEFERRABLE)
+ return &tvec_base_deferrable;
+ else
+ return raw_cpu_read(tvec_bases);
+}
+
+static inline bool is_deferrable_timer_base(struct tvec_base *base)
+{
+ return base == &tvec_base_deferrable;
+}
+
+static inline void __run_timers(struct tvec_base *base);
+static inline void __run_deferrable_timers(void)
+{
+ if (smp_processor_id() == tick_do_timer_cpu &&
+ time_after_eq(jiffies, tvec_base_deferrable.timer_jiffies))
+ __run_timers(&tvec_base_deferrable);
+}
+
+static void __init init_timer_cpu(struct tvec_base *base, int cpu);
+static inline void init_deferrable_timer(void)
+{
+ init_timer_cpu(&tvec_base_deferrable, NR_CPUS);
+}
+#else
+static inline struct tvec_base *__get_timer_base(unsigned int flags)
+{
+ return raw_cpu_read(tvec_bases);
+}
+
+static inline bool is_deferrable_timer_base(struct tvec_base *base)
+{
+ return false;
+}
+
+static inline void __run_deferrable_timers(void)
+{
+}
+
+static inline void init_deferrable_timer(void)
+{
+ /*
+ * initialize cpu unbound deferrable timer base only when CONFIG_SMP.
+ * UP kernel handles the timers with cpu 0 timer base.
+ */
+}
+#endif
+
static void do_init_timer(struct timer_list *timer, unsigned int flags,
const char *name, struct lock_class_key *key)
{
- struct tvec_base *base = raw_cpu_read(tvec_bases);
+ struct tvec_base *base;
+
+ base = __get_timer_base(flags);

timer->entry.next = NULL;
timer->base = (void *)((unsigned long)base | flags);
@@ -787,24 +845,26 @@ __mod_timer(struct timer_list *timer, unsigned long expires,

debug_activate(timer, expires);

- cpu = get_nohz_timer_target(pinned);
- new_base = per_cpu(tvec_bases, cpu);
+ if (!is_deferrable_timer_base(base) || pinned == TIMER_PINNED) {
+ cpu = get_nohz_timer_target(pinned);
+ new_base = per_cpu(tvec_bases, cpu);

- if (base != new_base) {
- /*
- * We are trying to schedule the timer on the local CPU.
- * However we can't change timer's base while it is running,
- * otherwise del_timer_sync() can't detect that the timer's
- * handler yet has not finished. This also guarantees that
- * the timer is serialized wrt itself.
- */
- if (likely(base->running_timer != timer)) {
- /* See the comment in lock_timer_base() */
- timer_set_base(timer, NULL);
- spin_unlock(&base->lock);
- base = new_base;
- spin_lock(&base->lock);
- timer_set_base(timer, base);
+ if (base != new_base) {
+ /*
+ * We are trying to schedule the timer on the local CPU.
+ * However we can't change timer's base while it is
+ * running, otherwise del_timer_sync() can't detect that
+ * the timer's handler yet has not finished. This also
+ * guarantees that the timer is serialized wrt itself.
+ */
+ if (likely(base->running_timer != timer)) {
+ /* See the comment in lock_timer_base() */
+ timer_set_base(timer, NULL);
+ spin_unlock(&base->lock);
+ base = new_base;
+ spin_lock(&base->lock);
+ timer_set_base(timer, base);
+ }
}
}

@@ -1411,6 +1471,8 @@ static void run_timer_softirq(struct softirq_action *h)

hrtimer_run_pending();

+ __run_deferrable_timers();
+
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
}
@@ -1623,7 +1685,8 @@ static void __init init_timer_cpu(struct tvec_base *base, int cpu)
BUG_ON(base != tbase_get_base(base));

base->cpu = cpu;
- per_cpu(tvec_bases, cpu) = base;
+ if (cpu != NR_CPUS)
+ per_cpu(tvec_bases, cpu) = base;
spin_lock_init(&base->lock);

for (j = 0; j < TVN_SIZE; j++) {
@@ -1655,6 +1718,8 @@ static void __init init_timer_cpus(void)

init_timer_cpu(base, cpu);
}
+
+ init_deferrable_timer();
}

void __init init_timers(void)
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/