Re: [patch 2/2] sched/idle: Make default_idle_call() NOHZ aware

From: Rafael J. Wysocki

Date: Mon Mar 02 2026 - 16:49:41 EST


On Mon, Mar 2, 2026 at 1:19 PM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Mon, Mar 02, 2026 at 01:17:55PM +0100, Peter Zijlstra wrote:
> > On Sun, Mar 01, 2026 at 08:30:51PM +0100, Thomas Gleixner wrote:
> > > Guests fall back to default_idle_call() as there is no cpuidle driver
> > > available to them by default. That causes a problem in fully loaded
> > > scenarios where CPUs go briefly idle for a couple of microseconds:
> > >
> > > tick_nohz_idle_stop_tick() is invoked unconditionally which means unless
> > > there is timer pending in the next tick, the tick is stopped and a couple
> > > of microseconds later when the idle condition goes away restarted. That
> > > requires to program the clockevent device twice which implies a VM exit for
> > > each reprogramming.
> > >
> > > It was suggested to remove the tick_nohz_idle_stop_tick() invocation from
> > > the default idle code, but would be counterproductive. It would not allow
> > > the host to go into deeper idle states when the guest CPU is fully idle as
> > > it has to maintain the periodic tick.
> > >
> > > Cure this by implementing a trivial moving average filter which keeps track
> > > of the recent idle recidency time and only stop the tick when the average
> > > is larger than a tick.
> > >
> > > Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>
> >
> > How about so? No reason to not also pass this into the idle governors.
> > This way it becomes a common least functionality. Governor can override,
> > but it had better have a good reason.

First, I hope you have seen the responses from Christian and Frederic.

Second, in the cases when the governor decides, it is better to leave
it to decide or somebody somewhere will complain even if you are
absolutely convinced that you can do better.

> > ---
> > --- a/kernel/sched/idle.c
> > +++ b/kernel/sched/idle.c
> > @@ -6,10 +6,12 @@
> > * (NOTE: these are not related to SCHED_IDLE batch scheduled
> > * tasks which are handled in sched/fair.c )
> > */
> > +#include <linux/sched/clock.h>
> > #include <linux/cpuidle.h>
> > #include <linux/suspend.h>
> > #include <linux/livepatch.h>
> > #include "sched.h"
> > +#include "pelt.h"
> > #include "smp.h"
> >
> > /* Linker adds these: start and end of __cpuidle functions */
> > @@ -105,12 +107,7 @@ static inline void cond_tick_broadcast_e
> > static inline void cond_tick_broadcast_exit(void) { }
> > #endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */
> >
> > -/**
> > - * default_idle_call - Default CPU idle routine.
> > - *
> > - * To use when the cpuidle framework cannot be used.
> > - */
> > -static void __cpuidle default_idle_call(void)
> > +static void __cpuidle __default_idle_call(void)
> > {
> > instrumentation_begin();
> > if (!current_clr_polling_and_test()) {
> > @@ -130,6 +127,63 @@ static void __cpuidle default_idle_call(
> > instrumentation_end();
> > }
> >
> > +#ifdef CONFIG_NO_HZ_COMMON
> > +
> > +/* Limit to 4 entries so it fits in a cache line */
> > +#define IDLE_DUR_ENTRIES 4
> > +#define IDLE_DUR_MASK (IDLE_DUR_ENTRIES - 1)
> > +
> > +struct idle_nohz_data {
> > + u64 duration[IDLE_DUR_ENTRIES];
> > + u64 entry_time;
> > + u64 sum;
> > + unsigned int idx;
> > +};
> > +
> > +static DEFINE_PER_CPU_ALIGNED(struct idle_nohz_data, nohz_data);
> > +
> > +static void default_idle_enter(void)
> > +{
> > + this_cpu_write(nohz_data.entry_time, sched_clock());
> > +}
> > +
> > +static inline bool default_stop_tick(void)
> > +{
> > + struct idle_nohz_data *nd = this_cpu_ptr(&nohz_data);
> > + return nd->sum > TICK_NSEC * IDLE_DUR_ENTRIES;
> > +}
> > +
> > +static void default_reflect(void)
> > +{
> > + struct idle_nohz_data *nd = this_cpu_ptr(&nohz_data);
> > + unsigned int idx = nd->idx;
> > + s64 delta;
> > +
> > + /*
> > + * Build a moving average of the time spent in idle to prevent stopping
> > + * the tick on a loaded system which only goes idle briefly.
> > + */
> > + delta = max(sched_clock() - nd->entry_time, 0);
> > + nd->sum += delta - nd->duration[idx];
> > + nd->duration[idx] = delta;
> > + nd->idx = (idx + 1) & IDLE_DUR_MASK;
> > +}

So I'd prefer to do something even simpler as suggested by Christian.

> > +#else /* CONFIG_NO_HZ_COMMON */
> > +static inline void default_idle_enter(void) { }
> > +static inline bool default_stop_tick(void) { return false; }
> > +static inline void default_reflect(void) { }
> > +#endif /* !CONFIG_NO_HZ_COMMON */
> > +
> > +static inline void default_idle_call(void)
> > +{
> > + if (default_stop_tick())
> > + tick_nohz_idle_stop_tick();
> > +
> > + __default_idle_call();
> > +
> > + default_reflect();
> > +}
> > +
> > static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
> > struct cpuidle_device *dev,
> > u64 max_latency_ns)
> > @@ -186,8 +240,6 @@ static void cpuidle_idle_call(void)
> > }
> >
> > if (cpuidle_not_available(drv, dev)) {
> > - tick_nohz_idle_stop_tick();
> > -
> > default_idle_call();
> > goto exit_idle;
> > }
> > @@ -222,7 +274,7 @@ static void cpuidle_idle_call(void)
> > next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
> > call_cpuidle(drv, dev, next_state);
> > } else {
> > - bool stop_tick = true;
> > + bool stop_tick = default_stop_tick();
> >
> > /*
> > * Ask the cpuidle framework to choose a convenient idle state.
> > @@ -238,6 +290,7 @@ static void cpuidle_idle_call(void)
> > /*
> > * Give the governor an opportunity to reflect on the outcome
> > */
> > + default_reflect();
> > cpuidle_reflect(dev, entered_state);
> > }
> >
> > @@ -276,6 +329,7 @@ static void do_idle(void)
> >
> > __current_set_polling();
> > tick_nohz_idle_enter();
> > + default_idle_enter();
> >
> > while (!need_resched()) {
> >
>
> Damn, lost hunk:
>
>
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index 65fbb8e807b9..c7876e9e024f 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -359,16 +359,6 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
> int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
> bool *stop_tick)
> {
> - /*
> - * If there is only a single idle state (or none), there is nothing
> - * meaningful for the governor to choose. Skip the governor and
> - * always use state 0 with the tick running.
> - */
> - if (drv->state_count <= 1) {
> - *stop_tick = false;
> - return 0;
> - }
> -
> return cpuidle_curr_governor->select(drv, dev, stop_tick);
> }
>
>
> Also, I suppose menu wants this?

Well, as I said, I think it's better to leave the governors alone at
this point and maybe updated them later in a separate patch (or
patches) that can be reverted individually if need be.

> diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
> index 899ff16ff1fe..a75fe1fca65d 100644
> --- a/drivers/cpuidle/governors/menu.c
> +++ b/drivers/cpuidle/governors/menu.c
> @@ -290,7 +290,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
> * it right away and keep the tick running if state[0] is a
> * polling one.
> */
> - *stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING);
> + if (drv->states[0].flags & CPUIDLE_FLAG_POLLING)
> + *stop_tick = false;
> return 0;
> }
>