Re: [REGRESSION] Needless shutting down of oneshot timer in nohz mode

From: Thomas Gleixner
Date: Sun Sep 13 2020 - 15:08:58 EST


Hi!

On Fri, Sep 11 2020 at 23:48, Steven Rostedt wrote:
> The VMware PhotonOS team is evaluating 4.19-rt compared to CentOS
> 3.10-rt (a franken kernel from Red Hat). They found a regression
> between the two kernels that turned out to be introduced by:
>
> d25408756accb ("clockevents: Stop unused clockevent devices")
>
> The issue shows up when running on a guest, where it causes a
> noticeable wakeup latency in cyclictest. The 4.19-rt kernel executes
> two extra apic instructions, causing two extra VMEXITs compared to the
> 3.10-rt kernel. I tracked down the reason why, and the same behavior
> exists in vanilla 5.9-rc as well.
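>
> For reference, a typical cyclictest invocation for this kind of
> measurement looks like the following (illustrative flags, not the
> exact command used):
>
>         # cyclictest -m -q -p 99 -t 1 -a 3 -i 200 -D 5m
>
> where -a pins the measurement thread to an isolated CPU.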
>
> When running with isolcpus and NOHZ_FULL, I see the following.
>
> tick_nohz_idle_stop_tick() {
>     hrtimer_start_range_ns() {
>         remove_hrtimer(timer)
>             /* no more timers on the base */
>             expires = KTIME_MAX;
>             tick_program_event() {
>                 clock_switch_state(ONESHOT_STOPPED);
>                 /* call to apic to shutdown timer */
>             }
>     }
>     [..]
>     hrtimer_reprogram(timer) {
>         tick_program_event() {
>             clock_switch_state(ONESHOT);
>             /* call to apic to enable timer again! */
>         }
>     }
> }
>
> Thus, we are needlessly shutting down and restarting the apic every
> time we call tick_nohz_stop_tick() if there is a timer still on the
> queue.
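>
> For reference, the stop/restart pair comes from the KTIME_MAX handling
> in tick_program_event() (clock_switch_state above is shorthand for
> clockevents_switch_state). From memory, the logic in
> kernel/time/tick-oneshot.c is roughly this (a paraphrase, not a
> verbatim copy):
>
>         int tick_program_event(ktime_t expires, int force)
>         {
>                 struct clock_event_device *dev =
>                         __this_cpu_read(tick_cpu_device.evtdev);
>
>                 if (unlikely(expires == KTIME_MAX)) {
>                         /* No timers pending: stop the device */
>                         clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
>                         dev->next_event = KTIME_MAX;
>                         return 0;
>                 }
>
>                 if (unlikely(clockevent_state_oneshot_stopped(dev))) {
>                         /* Device was stopped: switch it back on first */
>                         clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
>                 }
>
>                 return clockevents_program_event(dev, expires, force);
>         }
>
> Each of the two state switches ends up poking the local apic, which is
> what produces the two extra VMEXITs on a guest.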
>
> I'm not exactly sure how to fix this. Is there a way we can hold off
> disabling the clock here until we know that it isn't going to be
> immediately enabled again?

For the hrtimer_start_range_ns() case we can do that. Something like the
completely untested patch below.

Thanks,

tglx
---
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 95b6a708b040..9931a7f66e47 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -209,6 +209,9 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
         return base;
 }

+static void
+hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal);
+
 /*
  * We switch the timer base to a power-optimized selected CPU target,
  * if:
@@ -223,7 +226,7 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
  */
 static inline struct hrtimer_clock_base *
 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
-                    int pinned)
+                    int pinned, bool *reprogram_old_base)
 {
         struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
         struct hrtimer_clock_base *new_base;
@@ -247,6 +250,23 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
                 if (unlikely(hrtimer_callback_running(timer)))
                         return base;

+                /*
+                 * The caller has removed the first expiring timer from
+                 * @base, but avoided reprogramming the clock event device
+                 * as it immediately enqueues a timer again. If the base
+                 * stays the same and the removed timer was the only timer
+                 * on that CPU base then reprogramming in remove_hrtimer()
+                 * would shut down the clock event device just to restart
+                 * it when the timer is enqueued.
+                 *
+                 * timer->base->lock is about to be dropped. Check whether
+                 * the current base needs an update.
+                 */
+                if (*reprogram_old_base) {
+                        *reprogram_old_base = false;
+                        hrtimer_force_reprogram(base->cpu_base, 1);
+                }
+
                 /* See the comment in lock_hrtimer_base() */
                 WRITE_ONCE(timer->base, &migration_base);
                 raw_spin_unlock(&base->cpu_base->lock);
@@ -288,7 +308,12 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
         return base;
 }

-# define switch_hrtimer_base(t, b, p)  (b)
+static inline struct hrtimer_clock_base *
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+                    int pinned, bool *reprogram_old_base)
+{
+        return base;
+}

 #endif /* !CONFIG_SMP */

@@ -1090,9 +1115,20 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                                     struct hrtimer_clock_base *base)
 {
         struct hrtimer_clock_base *new_base;
+        bool reprogram_old_base;
+        int ret;
+
+        /*
+         * If this is the first expiring timer then after removing the
+         * timer the clock event needs to be reprogrammed. But if the timer
+         * stays on the same base then this might be a pointless exercise
+         * because it's immediately enqueued again. Store the state and
+         * delay reprogramming. See below.
+         */
+        reprogram_old_base = timer == base->cpu_base->next_timer;

         /* Remove an active timer from the queue: */
-        remove_hrtimer(timer, base, true);
+        remove_hrtimer(timer, base, false);

         if (mode & HRTIMER_MODE_REL)
                 tim = ktime_add_safe(tim, base->get_time());
@@ -1101,10 +1137,21 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,

         hrtimer_set_expires_range_ns(timer, tim, delta_ns);

-        /* Switch the timer base, if necessary: */
-        new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+        /*
+         * Switch the timer base, if necessary. If the timer was the first
+         * expiring timer and the timer base is switched then the old base
+         * is reprogrammed before dropping the cpu base lock. In that case
+         * reprogram_old_base is then set to false.
+         */
+        new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED,
+                                       &reprogram_old_base);

-        return enqueue_hrtimer(timer, new_base, mode);
+        ret = enqueue_hrtimer(timer, new_base, mode);
+        if (reprogram_old_base) {
+                hrtimer_force_reprogram(base->cpu_base, 1);
+                ret = 0;
+        }
+        return ret;
 }

 /**
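
With the patch applied, the NOHZ sequence above should collapse to a
single reprogram instead of the stop/restart pair, i.e. roughly (a
sketch of the intended flow, not a captured trace):

    tick_nohz_idle_stop_tick() {
        hrtimer_start_range_ns() {
            remove_hrtimer(timer)
                /* reprogramming skipped, no apic access */
            enqueue_hrtimer(timer)
            hrtimer_force_reprogram() {
                tick_program_event() {
                    /* device stays in ONESHOT, one apic write */
                }
            }
        }
    }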