Re: [patch 2/2] sched/idle: Make default_idle_call() NOHZ aware

From: Rafael J. Wysocki

Date: Fri Mar 06 2026 - 16:32:09 EST


On Fri, Mar 6, 2026 at 10:21 PM Rafael J. Wysocki <rafael@xxxxxxxxxx> wrote:
>
> On Wednesday, March 4, 2026 4:03:06 AM CET Qais Yousef wrote:
> > On 03/02/26 22:25, Rafael J. Wysocki wrote:
> > > On Mon, Mar 2, 2026 at 12:04 PM Christian Loehle
> > > <christian.loehle@xxxxxxx> wrote:
> > > >
> > > > On 3/1/26 19:30, Thomas Gleixner wrote:
> > > > > Guests fall back to default_idle_call() as there is no cpuidle driver
> > > > > available to them by default. That causes a problem in fully loaded
> > > > > scenarios where CPUs go briefly idle for a couple of microseconds:
> > > > >
> > > > > tick_nohz_idle_stop_tick() is invoked unconditionally which means unless
> > > > > there is a timer pending in the next tick, the tick is stopped and a couple
> > > > > of microseconds later when the idle condition goes away restarted. That
> > > > > requires to program the clockevent device twice which implies a VM exit for
> > > > > each reprogramming.
> > > > >
> > > > > It was suggested to remove the tick_nohz_idle_stop_tick() invocation from
> > > > > the default idle code, but that would be counterproductive. It would not allow
> > > > > the host to go into deeper idle states when the guest CPU is fully idle as
> > > > > it has to maintain the periodic tick.
> > > > >
> > > > > Cure this by implementing a trivial moving average filter which keeps track
> > > > > of the recent idle residency time and only stops the tick when the average
> > > > > is larger than a tick.
> > > > >
> > > > > Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>
> > > > > ---
> > > > > kernel/sched/idle.c | 65 +++++++++++++++++++++++++++++++++++++++++++++-------
> > > > > 1 file changed, 57 insertions(+), 8 deletions(-)
> > > > >
> > > > > --- a/kernel/sched/idle.c
> > > > > +++ b/kernel/sched/idle.c
> > > > > @@ -105,12 +105,7 @@ static inline void cond_tick_broadcast_e
> > > > > static inline void cond_tick_broadcast_exit(void) { }
> > > > > #endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */
> > > > >
> > > > > -/**
> > > > > - * default_idle_call - Default CPU idle routine.
> > > > > - *
> > > > > - * To use when the cpuidle framework cannot be used.
> > > > > - */
> > > > > -static void __cpuidle default_idle_call(void)
> > > > > +static void __cpuidle __default_idle_call(void)
> > > > > {
> > > > > instrumentation_begin();
> > > > > if (!current_clr_polling_and_test()) {
> > > > > @@ -130,6 +125,61 @@ static void __cpuidle default_idle_call(
> > > > > instrumentation_end();
> > > > > }
> > > > >
> > > > > +#ifdef CONFIG_NO_HZ_COMMON
> > > > > +
> > > > > +/* Limit to 4 entries so it fits in a cache line */
> > > > > +#define IDLE_DUR_ENTRIES 4
> > > > > +#define IDLE_DUR_MASK (IDLE_DUR_ENTRIES - 1)
> > > > > +
> > > > > +struct idle_nohz_data {
> > > > > + u64 duration[IDLE_DUR_ENTRIES];
> > > > > + u64 entry_time;
> > > > > + u64 sum;
> > > > > + unsigned int idx;
> > > > > +};
> > > > > +
> > > > > +static DEFINE_PER_CPU_ALIGNED(struct idle_nohz_data, nohz_data);
> > > > > +
> > > > > +/**
> > > > > + * default_idle_call - Default CPU idle routine.
> > > > > + *
> > > > > + * To use when the cpuidle framework cannot be used.
> > > > > + */
> > > > > +static void default_idle_call(void)
> > > > > +{
> > > > > + struct idle_nohz_data *nd = this_cpu_ptr(&nohz_data);
> > > > > + unsigned int idx = nd->idx;
> > > > > + s64 delta;
> > > > > +
> > > > > + /*
> > > > > + * If the CPU spends more than a tick on average in idle, try to stop
> > > > > + * the tick.
> > > > > + */
> > > > > + if (nd->sum > TICK_NSEC * IDLE_DUR_ENTRIES)
> > > > > + tick_nohz_idle_stop_tick();
> > > > > +
> > > > > + __default_idle_call();
> > > > > +
> > > > > + /*
> > > > > + * Build a moving average of the time spent in idle to prevent stopping
> > > > > + * the tick on a loaded system which only goes idle briefly.
> > > > > + */
> > > > > + delta = max(sched_clock() - nd->entry_time, 0);
> > > > > + nd->sum += delta - nd->duration[idx];
> > > > > + nd->duration[idx] = delta;
> > > > > + nd->idx = (idx + 1) & IDLE_DUR_MASK;
> > > > > +}
> > > > > +
> > > > > +static void default_idle_enter(void)
> > > > > +{
> > > > > + this_cpu_write(nohz_data.entry_time, sched_clock());
> > > > > +}
> > > > > +
> > > > > +#else /* CONFIG_NO_HZ_COMMON */
> > > > > +static inline void default_idle_call(void) { __default_idle_call(); }
> > > > > +static inline void default_idle_enter(void) { }
> > > > > +#endif /* !CONFIG_NO_HZ_COMMON */
> > > > > +
> > > > > static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
> > > > > struct cpuidle_device *dev,
> > > > > u64 max_latency_ns)
> > > > > @@ -186,8 +236,6 @@ static void cpuidle_idle_call(void)
> > > > > }
> > > > >
> > > > > if (cpuidle_not_available(drv, dev)) {
> > > > > - tick_nohz_idle_stop_tick();
> > > > > -
> > > > > default_idle_call();
> > > > > goto exit_idle;
> > > > > }
> > > > > @@ -276,6 +324,7 @@ static void do_idle(void)
> > > > >
> > > > > __current_set_polling();
> > > > > tick_nohz_idle_enter();
> > > > > + default_idle_enter();
> > > > >
> > > > > while (!need_resched()) {
> > > > >
> > > > >
> > > >
> > > > How does this work? We don't stop the tick until the average idle time is larger,
> > > > but if we don't stop the tick how is that possible?
> > > >
> > > > Why don't we just require one or two consecutive tick wakeups before stopping?
> > >
> > > Exactly my thought and I think one should be sufficient.
> >
> > I concur. From our experience with TEO util threshold these averages can
> > backfire. I think one tick is sufficient delay to not be obviously broken.
>
> So if I'm not mistaken, it would be something like the appended prototype
> (completely untested, but it builds for me).
>
> ---
> drivers/cpuidle/cpuidle.c | 10 ----------
> kernel/sched/idle.c | 32 ++++++++++++++++++++++++--------
> 2 files changed, 24 insertions(+), 18 deletions(-)
>
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -359,16 +359,6 @@ noinstr int cpuidle_enter_state(struct c
> int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
> bool *stop_tick)
> {
> - /*
> - * If there is only a single idle state (or none), there is nothing
> - * meaningful for the governor to choose. Skip the governor and
> - * always use state 0 with the tick running.
> - */
> - if (drv->state_count <= 1) {
> - *stop_tick = false;
> - return 0;
> - }
> -
> return cpuidle_curr_governor->select(drv, dev, stop_tick);
> }
>
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -161,6 +161,14 @@ static int call_cpuidle(struct cpuidle_d
> return cpuidle_enter(drv, dev, next_state);
> }
>
> +static void idle_call_stop_or_retain_tick(bool stop_tick)
> +{
> + if (stop_tick || tick_nohz_tick_stopped())
> + tick_nohz_idle_stop_tick();
> + else
> + tick_nohz_idle_retain_tick();
> +}
> +
> /**
> * cpuidle_idle_call - the main idle function
> *
> @@ -170,7 +178,7 @@ static int call_cpuidle(struct cpuidle_d
> * set, and it returns with polling set. If it ever stops polling, it
> * must clear the polling bit.
> */
> -static void cpuidle_idle_call(void)
> +static void cpuidle_idle_call(bool got_tick)
> {
> struct cpuidle_device *dev = cpuidle_get_device();
> struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> @@ -186,7 +194,7 @@ static void cpuidle_idle_call(void)
> }
>
> if (cpuidle_not_available(drv, dev)) {
> - tick_nohz_idle_stop_tick();
> + idle_call_stop_or_retain_tick(!got_tick);

Oh, I got this backwards (here and below).

The tick should be stopped if we've got the tick previously, but you
get the idea.

[Note to self: Don't send patches in the night.]

> default_idle_call();
> goto exit_idle;
> @@ -221,7 +229,7 @@ static void cpuidle_idle_call(void)
>
> next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
> call_cpuidle(drv, dev, next_state);
> - } else {
> + } else if (drv->state_count > 1) {
> bool stop_tick = true;
>
> /*
> @@ -229,16 +237,22 @@ static void cpuidle_idle_call(void)
> */
> next_state = cpuidle_select(drv, dev, &stop_tick);
>
> - if (stop_tick || tick_nohz_tick_stopped())
> - tick_nohz_idle_stop_tick();
> - else
> - tick_nohz_idle_retain_tick();
> + idle_call_stop_or_retain_tick(stop_tick);
>
> entered_state = call_cpuidle(drv, dev, next_state);
> /*
> * Give the governor an opportunity to reflect on the outcome
> */
> cpuidle_reflect(dev, entered_state);
> + } else {
> + /*
> + * If there is only a single idle state (or none), there is
> + * nothing meaningful for the governor to choose. Skip the
> + * governor and always use state 0.
> + */
> + idle_call_stop_or_retain_tick(!got_tick);
> +
> + call_cpuidle(drv, dev, 0);
> }
>
> exit_idle:
> @@ -259,6 +273,7 @@ exit_idle:
> static void do_idle(void)
> {
> int cpu = smp_processor_id();
> + bool got_tick = false;
>
> /*
> * Check if we need to update blocked load
> @@ -329,8 +344,9 @@ static void do_idle(void)
> tick_nohz_idle_restart_tick();
> cpu_idle_poll();
> } else {
> - cpuidle_idle_call();
> + cpuidle_idle_call(got_tick);
> }
> + got_tick = tick_nohz_idle_got_tick();
> arch_cpu_idle_exit();
> }