Re: [PATCH RFC] tick/sched: Prevent pointless NOHZ transitions

From: Christian Loehle

Date: Tue Feb 24 2026 - 04:36:09 EST


On 2/24/26 08:32, Thomas Gleixner wrote:
> During a hackbench run with a fully loaded machine CPUs go briefly idle
> when they run out of tasks, which is expected. What's not expected are
> pointless NOHZ transitions like this:
>
> hackbench-1915 [001] d..2. 84.086755: sched_switch: prev_comm=hackbench prev_pid=1915 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120
> 1) <idle>-0 [001] dn.2. 84.086757: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=305340000000 softexpires=305340000000 mode=ABS|PINNED|HARD was_armed=1
> <idle>-0 [001] dn.2. 84.086757: hrtimer_rearm: next_event=83885523974 deferred=0
> 2) <idle>-0 [001] dN.2. 84.086761: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=82950000000 softexpires=82950000000 mode=ABS|PINNED|HARD was_armed=1
> <idle>-0 [001] dN.2. 84.086761: hrtimer_rearm: next_event=82950000000 deferred=0
> <idle>-0 [001] d..2. 84.086767: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=hackbench next_pid=2138 next_prio=120
> hackbench-2138 [001] d..2. 84.086779: sched_switch: prev_comm=hackbench prev_pid=2138 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120
>
> #1 switches to NOHZ mode targeting the next expiring timer and #2
> switches back to tick mode a whopping 4us later.
>
> This happens with both TEO and MENU governors in a VM guest. That's not
> only pointless it's also a performance issue as each rearm of the timer
> implies a VM exit.

I assume this is the (drv->state_count <= 1) case; no governor does
anything sensible in that case.
I was also curious about the performance angle recently FWIW, but didn't
hear back:
https://lore.kernel.org/all/73439919-e24d-4bd5-a7ed-d7633beb5e4f@xxxxxxx/

>
> Keep track of the idle time with a moving average and check it for being
> larger than TICK_NSEC in can_stop_idle_tick(). That cures this behaviour
> while still allowing the system to go into long idle sleeps once the
> work load stopped.
>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>
> ---
> kernel/time/tick-sched.c | 20 +++++++++++++++++---
> kernel/time/tick-sched.h | 9 +++++++++
> 2 files changed, 26 insertions(+), 3 deletions(-)

Why here and not in cpuidle?
We recently added code that skips the governor in the single-state case,
see e5c9ffc6ae1b ("cpuidle: Skip governor when only one idle state is available"),
which is where this could also live.

>
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -751,6 +751,16 @@ static void tick_nohz_update_jiffies(kti
> touch_softlockup_watchdog_sched();
> }
>
> +static void tick_nohz_update_idle_duration(struct tick_sched *ts, ktime_t now)
> +{
> + ktime_t delta = now - ts->idle_dur_entry;
> + unsigned int idx = ts->idle_dur_idx;
> +
> + ts->idle_dur_sum += delta - ts->idle_dur[idx];
> + ts->idle_dur[idx] = delta;
> + ts->idle_dur_idx = (idx + 1) & IDLE_DUR_MASK;
> +}
> +
> static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
> {
> ktime_t delta;
> @@ -760,6 +770,8 @@ static void tick_nohz_stop_idle(struct t
>
> delta = ktime_sub(now, ts->idle_entrytime);
>
> + tick_nohz_update_idle_duration(ts, now);
> +
> write_seqcount_begin(&ts->idle_sleeptime_seq);
> if (nr_iowait_cpu(smp_processor_id()) > 0)
> ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
> @@ -1224,7 +1236,7 @@ static bool can_stop_idle_tick(int cpu,
> return false;
> }
>
> - return true;
> + return ts->idle_dur_sum > TICK_NSEC * IDLE_DUR_ENTRIES;
> }
>
> /**
> @@ -1292,6 +1304,7 @@ void tick_nohz_idle_enter(void)
>
> tick_sched_flag_set(ts, TS_FLAG_INIDLE);
> tick_nohz_start_idle(ts);
> + ts->idle_dur_entry = ts->idle_entrytime;
>
> local_irq_enable();
> }
> @@ -1490,11 +1503,12 @@ void tick_nohz_idle_exit(void)
> idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
> tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
>
> - if (idle_active || tick_stopped)
> - now = ktime_get();
> + now = ktime_get();
>
> if (idle_active)
> tick_nohz_stop_idle(ts, now);
> + else
> + tick_nohz_update_idle_duration(ts, now);
>
> if (tick_stopped)
> tick_nohz_idle_update_tick(ts, now);
> --- a/kernel/time/tick-sched.h
> +++ b/kernel/time/tick-sched.h
> @@ -30,6 +30,9 @@ struct tick_device {
> /* High resolution tick mode */
> #define TS_FLAG_HIGHRES BIT(5)
>
> +#define IDLE_DUR_ENTRIES 8
> +#define IDLE_DUR_MASK (IDLE_DUR_ENTRIES - 1)
> +
> /**
> * struct tick_sched - sched tick emulation and no idle tick control/stats
> *
> @@ -95,6 +98,12 @@ struct tick_sched {
> ktime_t idle_sleeptime;
> ktime_t iowait_sleeptime;
>
> + /* Idle duration */
> + ktime_t idle_dur[IDLE_DUR_ENTRIES];
> + ktime_t idle_dur_entry;
> + ktime_t idle_dur_sum;
> + unsigned int idle_dur_idx;
> +
> /* Full dynticks handling */
> atomic_t tick_dep_mask;
>
>