Re: [patch 20/39] tick: nohz: Rework next timer evaluation

From: Paul E. McKenney
Date: Thu Apr 16 2015 - 12:42:36 EST


On Tue, Apr 14, 2015 at 09:08:58PM -0000, Thomas Gleixner wrote:
> The evaluation of the next timer in the nohz code is based on jiffies
> while all the tick internals are nano seconds based. We have also to
> convert hrtimer nanoseconds to jiffies in the !highres case. That's
> just wrong and introduces interesting corner cases.
>
> Turn it around and convert the next timer wheel timer expiry and the
> rcu event to clock monotonic and base all calculations on
> nanoseconds. That identifies the case where no timer is pending
> clearly with an absolute expiry value of KTIME_MAX.
>
> Makes the code more readable and gets rid of the jiffies magic in the
> nohz code.
>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxxxxxxxxxx>
> Cc: Josh Triplett <josh@xxxxxxxxxxxxxxxx>
> Cc: Lai Jiangshan <laijs@xxxxxxxxxxxxxx>
> Cc: John Stultz <john.stultz@xxxxxxxxxx>
> Cc: Preeti U Murthy <preeti@xxxxxxxxxxxxxxxxxx>
> Cc: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
> Cc: Frederic Weisbecker <fweisbec@xxxxxxxxx>

Good stuff!

> ---
> include/linux/hrtimer.h | 2
> include/linux/rcupdate.h | 6 +-
> include/linux/rcutree.h | 2
> include/linux/timer.h | 7 --
> kernel/rcu/tree_plugin.h | 14 +++--

I guess that I had better take a look. ;-)

Short summary: No problems, looks like you found all of the
rcu_needs_cpu() functions. (Yes, there are more of them than I would
have believed possible!)

> kernel/time/hrtimer.c | 14 ++---
> kernel/time/tick-internal.h | 2
> kernel/time/tick-sched.c | 109 +++++++++++++++++++-------------------------

I got a build error with CONFIG_NO_HZ_FULL on this one, looks like a
stray time_delta that needs to change to delta. Builds OK with that
change, and one of the three scenarios runs fine as well (the other two
are running now). Non-CONFIG_NO_HZ_FULL kernels pass a (very short)
five-minute rcutorture test. (But note that rcutorture doesn't use
hrtimers directly, changing which is on my list.)

Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3af15c3..753c211 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -653,7 +653,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
#ifdef CONFIG_NO_HZ_FULL
/* Limit the tick delta to the maximum scheduler deferment */
if (!ts->inidle)
- delta = min(time_delta, scheduler_tick_max_deferment());
+ delta = min(delta, scheduler_tick_max_deferment());
#endif

/* Calculate the next expiry time */

------------------------------------------------------------------------

> kernel/time/tick-sched.h | 2
> kernel/time/timer.c | 71 +++++++++++++---------------
> kernel/time/timer_list.c | 4 -
> 11 files changed, 107 insertions(+), 126 deletions(-)
>
> Index: tip/include/linux/hrtimer.h
> ===================================================================
> --- tip.orig/include/linux/hrtimer.h
> +++ tip/include/linux/hrtimer.h
> @@ -386,7 +386,7 @@ static inline int hrtimer_restart(struct
> /* Query timers: */
> extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
>
> -extern ktime_t hrtimer_get_next_event(void);
> +extern u64 hrtimer_get_next_event(void);
>
> /*
> * A timer is active, when it is enqueued into the rbtree or the
> Index: tip/include/linux/rcupdate.h
> ===================================================================
> --- tip.orig/include/linux/rcupdate.h
> +++ tip/include/linux/rcupdate.h
> @@ -44,6 +44,8 @@
> #include <linux/debugobjects.h>
> #include <linux/bug.h>
> #include <linux/compiler.h>
> +#include <linux/ktime.h>
> +
> #include <asm/barrier.h>
>
> extern int rcu_expedited; /* for sysctl */
> @@ -1122,9 +1124,9 @@ static inline notrace void rcu_read_unlo
> __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
>
> #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
> -static inline int rcu_needs_cpu(unsigned long *delta_jiffies)
> +static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
> {
> - *delta_jiffies = ULONG_MAX;
> + *nextevt = KTIME_MAX;
> return 0;
> }
> #endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
> Index: tip/include/linux/rcutree.h
> ===================================================================
> --- tip.orig/include/linux/rcutree.h
> +++ tip/include/linux/rcutree.h
> @@ -32,7 +32,7 @@
>
> void rcu_note_context_switch(void);
> #ifndef CONFIG_RCU_NOCB_CPU_ALL
> -int rcu_needs_cpu(unsigned long *delta_jiffies);
> +int rcu_needs_cpu(u64 basem, u64 *nextevt);
> #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
> void rcu_cpu_stall_reset(void);
>
> Index: tip/include/linux/timer.h
> ===================================================================
> --- tip.orig/include/linux/timer.h
> +++ tip/include/linux/timer.h
> @@ -188,13 +188,6 @@ extern void set_timer_slack(struct timer
> #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1)
>
> /*
> - * Return when the next timer-wheel timeout occurs (in absolute jiffies),
> - * locks the timer base and does the comparison against the given
> - * jiffie.
> - */
> -extern unsigned long get_next_timer_interrupt(unsigned long now);
> -
> -/*
> * Timer-statistics info:
> */
> #ifdef CONFIG_TIMER_STATS
> Index: tip/kernel/rcu/tree_plugin.h
> ===================================================================
> --- tip.orig/kernel/rcu/tree_plugin.h
> +++ tip/kernel/rcu/tree_plugin.h
> @@ -1372,9 +1372,9 @@ static void rcu_prepare_kthreads(int cpu
> * any flavor of RCU.
> */
> #ifndef CONFIG_RCU_NOCB_CPU_ALL
> -int rcu_needs_cpu(unsigned long *delta_jiffies)
> +int rcu_needs_cpu(u64 basemono, u64 *nextevt)
> {
> - *delta_jiffies = ULONG_MAX;
> + *nextevt = KTIME_MAX;

I was going to ask about basemono, but I see it in the next hunk.

> return rcu_cpu_has_callbacks(NULL);
> }
> #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
> @@ -1485,16 +1485,17 @@ static bool __maybe_unused rcu_try_advan
> * The caller must have disabled interrupts.
> */
> #ifndef CONFIG_RCU_NOCB_CPU_ALL
> -int rcu_needs_cpu(unsigned long *dj)
> +int rcu_needs_cpu(u64 basemono, u64 *nextevt)
> {
> struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
> + unsigned long dj;
>
> /* Snapshot to detect later posting of non-lazy callback. */
> rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
>
> /* If no callbacks, RCU doesn't need the CPU. */
> if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
> - *dj = ULONG_MAX;
> + *nextevt = KTIME_MAX;
> return 0;
> }
>
> @@ -1508,11 +1509,12 @@ int rcu_needs_cpu(unsigned long *dj)
>
> /* Request timer delay depending on laziness, and round. */
> if (!rdtp->all_lazy) {
> - *dj = round_up(rcu_idle_gp_delay + jiffies,
> + dj = round_up(rcu_idle_gp_delay + jiffies,
> rcu_idle_gp_delay) - jiffies;
> } else {
> - *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
> + dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
> }
> + *nextevt = basemono + dj * TICK_NSEC;

The multiply would have been a problem back in the day, but should
be just fine on modern hardware. I suppose that slow hardware could
compensate by having the scheduling-clock period be an exact power of
two worth of nanoseconds.

> return 0;
> }
> #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
> Index: tip/kernel/time/hrtimer.c
> ===================================================================
> --- tip.orig/kernel/time/hrtimer.c
> +++ tip/kernel/time/hrtimer.c
> @@ -1072,26 +1072,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining)
> /**
> * hrtimer_get_next_event - get the time until next expiry event
> *
> - * Returns the delta to the next expiry event or KTIME_MAX if no timer
> - * is pending.
> + * Returns the next expiry time or KTIME_MAX if no timer is pending.
> */
> -ktime_t hrtimer_get_next_event(void)
> +u64 hrtimer_get_next_event(void)
> {
> struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
> - ktime_t mindelta = { .tv64 = KTIME_MAX };
> + u64 expires = KTIME_MAX;
> unsigned long flags;
>
> raw_spin_lock_irqsave(&cpu_base->lock, flags);
>
> if (!__hrtimer_hres_active(cpu_base))
> - mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
> - ktime_get());
> + expires = __hrtimer_get_next_event(cpu_base).tv64;
>
> raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
>
> - if (mindelta.tv64 < 0)
> - mindelta.tv64 = 0;
> - return mindelta;
> + return expires;
> }
> #endif
>
> Index: tip/kernel/time/tick-internal.h
> ===================================================================
> --- tip.orig/kernel/time/tick-internal.h
> +++ tip/kernel/time/tick-internal.h
> @@ -137,3 +137,5 @@ extern void tick_nohz_init(void);
> # else
> static inline void tick_nohz_init(void) { }
> #endif
> +
> +extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
> Index: tip/kernel/time/tick-sched.c
> ===================================================================
> --- tip.orig/kernel/time/tick-sched.c
> +++ tip/kernel/time/tick-sched.c
> @@ -582,39 +582,46 @@ static void tick_nohz_restart(struct tic
> static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
> ktime_t now, int cpu)
> {
> - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
> - ktime_t last_update, expires, ret = { .tv64 = 0 };
> - unsigned long rcu_delta_jiffies;
> struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
> - u64 time_delta;
> -
> - time_delta = timekeeping_max_deferment();
> + u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
> + unsigned long seq, basejiff;
> + ktime_t tick;
>
> /* Read jiffies and the time when jiffies were updated last */
> do {
> seq = read_seqbegin(&jiffies_lock);
> - last_update = last_jiffies_update;
> - last_jiffies = jiffies;
> + basemono = last_jiffies_update.tv64;
> + basejiff = jiffies;
> } while (read_seqretry(&jiffies_lock, seq));
> + ts->last_jiffies = basejiff;
>
> - if (rcu_needs_cpu(&rcu_delta_jiffies) ||
> + if (rcu_needs_cpu(basemono, &next_rcu) ||
> arch_needs_cpu() || irq_work_needs_cpu()) {
> - next_jiffies = last_jiffies + 1;
> - delta_jiffies = 1;
> + next_tick = basemono + TICK_NSEC;
> } else {
> - /* Get the next timer wheel timer */
> - next_jiffies = get_next_timer_interrupt(last_jiffies);
> - delta_jiffies = next_jiffies - last_jiffies;
> - if (rcu_delta_jiffies < delta_jiffies) {
> - next_jiffies = last_jiffies + rcu_delta_jiffies;
> - delta_jiffies = rcu_delta_jiffies;
> - }
> + /*
> + * Get the next pending timer. If high resolution
> + * timers are enabled this only takes the timer wheel
> + * timers into account. If high resolution timers are
> + * disabled this also looks at the next expiring
> + * hrtimer.
> + */
> + next_tmr = get_next_timer_interrupt(basejiff, basemono);
> + ts->next_timer = next_tmr;
> + /* Take the next rcu event into account */
> + next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
> }
>
> - if ((long)delta_jiffies <= 1) {
> + /*
> + * If the tick is due in the next period, keep it ticking or
> + * restart it proper.
> + */
> + delta = next_tick - basemono;
> + if (delta <= (u64)TICK_NSEC) {
> + tick.tv64 = 0;
> if (!ts->tick_stopped)
> goto out;
> - if (delta_jiffies == 0) {
> + if (delta == 0) {
> /* Tick is stopped, but required now. Enforce it */
> tick_nohz_restart(ts, now);
> goto out;
> @@ -629,54 +636,39 @@ static ktime_t tick_nohz_stop_sched_tick
> * do_timer() never invoked. Keep track of the fact that it
> * was the one which had the do_timer() duty last. If this cpu
> * is the one which had the do_timer() duty last, we limit the
> - * sleep time to the timekeeping max_deferement value which we
> - * retrieved above. Otherwise we can sleep as long as we want.
> + * sleep time to the timekeeping max_deferement value.
> + * Otherwise we can sleep as long as we want.
> */
> + delta = timekeeping_max_deferment();
> if (cpu == tick_do_timer_cpu) {
> tick_do_timer_cpu = TICK_DO_TIMER_NONE;
> ts->do_timer_last = 1;
> } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
> - time_delta = KTIME_MAX;
> + delta = KTIME_MAX;
> ts->do_timer_last = 0;
> } else if (!ts->do_timer_last) {
> - time_delta = KTIME_MAX;
> + delta = KTIME_MAX;
> }
>
> #ifdef CONFIG_NO_HZ_FULL
> + /* Limit the tick delta to the maximum scheduler deferment */
> if (!ts->inidle)
> - time_delta = min(time_delta, scheduler_tick_max_deferment());
> + delta = min(time_delta, scheduler_tick_max_deferment());

s/time_delta/delta/?

> #endif
>
> - /*
> - * calculate the expiry time for the next timer wheel
> - * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
> - * there is no timer pending or at least extremely far into
> - * the future (12 days for HZ=1000). In this case we set the
> - * expiry to the end of time.
> - */
> - if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
> - /*
> - * Calculate the time delta for the next timer event.
> - * If the time delta exceeds the maximum time delta
> - * permitted by the current clocksource then adjust
> - * the time delta accordingly to ensure the
> - * clocksource does not wrap.
> - */
> - time_delta = min_t(u64, time_delta,
> - tick_period.tv64 * delta_jiffies);
> - }
> -
> - if (time_delta < KTIME_MAX)
> - expires = ktime_add_ns(last_update, time_delta);
> + /* Calculate the next expiry time */
> + if (delta < (KTIME_MAX - basemono))
> + expires = basemono + delta;
> else
> - expires.tv64 = KTIME_MAX;
> + expires = KTIME_MAX;
> +
> + expires = min_t(u64, expires, next_tick);
> + tick.tv64 = expires;
>
> /* Skip reprogram of event if its not changed */
> - if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
> + if (ts->tick_stopped && (expires == dev->next_event.tv64))
> goto out;
>
> - ret = expires;
> -
> /*
> * nohz_stop_sched_tick can be called several times before
> * the nohz_restart_sched_tick is called. This happens when
> @@ -694,26 +686,23 @@ static ktime_t tick_nohz_stop_sched_tick
> }
>
> /*
> - * If the expiration time == KTIME_MAX, then
> - * in this case we simply stop the tick timer.
> + * If the expiration time == KTIME_MAX, then we simply stop
> + * the tick timer.
> */
> - if (unlikely(expires.tv64 == KTIME_MAX)) {
> + if (unlikely(expires == KTIME_MAX)) {
> if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
> hrtimer_cancel(&ts->sched_timer);
> goto out;
> }
>
> if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
> - hrtimer_start(&ts->sched_timer, expires,
> - HRTIMER_MODE_ABS_PINNED);
> + hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
> else
> - tick_program_event(expires, 1);
> + tick_program_event(tick, 1);
> out:
> - ts->next_jiffies = next_jiffies;
> - ts->last_jiffies = last_jiffies;
> + /* Update the estimated sleep length */
> ts->sleep_length = ktime_sub(dev->next_event, now);
> -
> - return ret;
> + return tick;
> }
>
> static void tick_nohz_full_stop_tick(struct tick_sched *ts)
> Index: tip/kernel/time/tick-sched.h
> ===================================================================
> --- tip.orig/kernel/time/tick-sched.h
> +++ tip/kernel/time/tick-sched.h
> @@ -57,7 +57,7 @@ struct tick_sched {
> ktime_t iowait_sleeptime;
> ktime_t sleep_length;
> unsigned long last_jiffies;
> - unsigned long next_jiffies;
> + u64 next_timer;
> ktime_t idle_expires;
> int do_timer_last;
> };
> Index: tip/kernel/time/timer.c
> ===================================================================
> --- tip.orig/kernel/time/timer.c
> +++ tip/kernel/time/timer.c
> @@ -49,6 +49,8 @@
> #include <asm/timex.h>
> #include <asm/io.h>
>
> +#include "tick-internal.h"
> +
> #define CREATE_TRACE_POINTS
> #include <trace/events/timer.h>
>
> @@ -1311,54 +1313,48 @@ cascade:
> * Check, if the next hrtimer event is before the next timer wheel
> * event:
> */
> -static unsigned long cmp_next_hrtimer_event(unsigned long now,
> - unsigned long expires)
> +static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
> {
> - ktime_t hr_delta = hrtimer_get_next_event();
> - struct timespec tsdelta;
> - unsigned long delta;
> -
> - if (hr_delta.tv64 == KTIME_MAX)
> - return expires;
> + u64 nextevt = hrtimer_get_next_event();
>
> /*
> - * Expired timer available, let it expire in the next tick
> + * If high resolution timers are enabled
> + * hrtimer_get_next_event() returns KTIME_MAX.
> */
> - if (hr_delta.tv64 <= 0)
> - return now + 1;
> -
> - tsdelta = ktime_to_timespec(hr_delta);
> - delta = timespec_to_jiffies(&tsdelta);
> + if (expires <= nextevt)
> + return expires;
>
> /*
> - * Limit the delta to the max value, which is checked in
> - * tick_nohz_stop_sched_tick():
> + * If the next timer is already expired, return the tick base
> + * time so the tick is fired immediately.
> */
> - if (delta > NEXT_TIMER_MAX_DELTA)
> - delta = NEXT_TIMER_MAX_DELTA;
> + if (nextevt <= basem)
> + return basem;
>
> /*
> - * Take rounding errors in to account and make sure, that it
> - * expires in the next tick. Otherwise we go into an endless
> - * ping pong due to tick_nohz_stop_sched_tick() retriggering
> - * the timer softirq
> + * Round up to the next jiffie. High resolution timers are
> + * off, so the hrtimers are expired in the tick and we need to
> + * make sure that this tick really expires the timer to avoid
> + * a ping pong of the nohz stop code.
> + *
> + * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
> */
> - if (delta < 1)
> - delta = 1;
> - now += delta;
> - if (time_before(now, expires))
> - return now;
> - return expires;
> + return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
> }
>
> /**
> - * get_next_timer_interrupt - return the jiffy of the next pending timer
> - * @now: current time (in jiffies)
> + * get_next_timer_interrupt - return the time (clock mono) of the next timer
> + * @basej: base time jiffies
> + * @basem: base time clock monotonic
> + *
> + * Returns the tick aligned clock monotonic time of the next pending
> + * timer or KTIME_MAX if no timer is pending.
> */
> -unsigned long get_next_timer_interrupt(unsigned long now)
> +u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
> {
> struct tvec_base *base = __this_cpu_read(tvec_bases);
> - unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
> + u64 expires = KTIME_MAX;
> + unsigned long nextevt;
>
> /*
> * Pretend that there is no timer pending if the cpu is offline.
> @@ -1371,14 +1367,15 @@ unsigned long get_next_timer_interrupt(u
> if (base->active_timers) {
> if (time_before_eq(base->next_timer, base->timer_jiffies))
> base->next_timer = __next_timer_interrupt(base);
> - expires = base->next_timer;
> + nextevt = base->next_timer;
> + if (time_before_eq(nextevt, basej))
> + expires = basem;
> + else
> + expires = basem + (nextevt - basej) * TICK_NSEC;
> }
> spin_unlock(&base->lock);
>
> - if (time_before_eq(expires, now))
> - return now;
> -
> - return cmp_next_hrtimer_event(now, expires);
> + return cmp_next_hrtimer_event(basem, expires);
> }
> #endif
>
> Index: tip/kernel/time/timer_list.c
> ===================================================================
> --- tip.orig/kernel/time/timer_list.c
> +++ tip/kernel/time/timer_list.c
> @@ -184,7 +184,7 @@ static void print_cpu(struct seq_file *m
> P_ns(idle_sleeptime);
> P_ns(iowait_sleeptime);
> P(last_jiffies);
> - P(next_jiffies);
> + P(next_timer);
> P_ns(idle_expires);
> SEQ_printf(m, "jiffies: %Lu\n",
> (unsigned long long)jiffies);
> @@ -282,7 +282,7 @@ static void timer_list_show_tickdevices_
>
> static inline void timer_list_header(struct seq_file *m, u64 now)
> {
> - SEQ_printf(m, "Timer List Version: v0.7\n");
> + SEQ_printf(m, "Timer List Version: v0.8\n");
> SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
> SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
> SEQ_printf(m, "\n");
>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/