Re: [patch 2/2] sched/idle: Make default_idle_call() NOHZ aware
From: Peter Zijlstra
Date: Mon Mar 02 2026 - 07:19:58 EST
On Mon, Mar 02, 2026 at 01:17:55PM +0100, Peter Zijlstra wrote:
> On Sun, Mar 01, 2026 at 08:30:51PM +0100, Thomas Gleixner wrote:
> > Guests fall back to default_idle_call() as there is no cpuidle driver
> > available to them by default. That causes a problem in fully loaded
> > scenarios where CPUs go briefly idle for a couple of microseconds:
> >
> > tick_nohz_idle_stop_tick() is invoked unconditionally which means unless
> > there is timer pending in the next tick, the tick is stopped and a couple
> > of microseconds later when the idle condition goes away restarted. That
> > requires to program the clockevent device twice which implies a VM exit for
> > each reprogramming.
> >
> > It was suggested to remove the tick_nohz_idle_stop_tick() invocation from
> > the default idle code, but would be counterproductive. It would not allow
> > the host to go into deeper idle states when the guest CPU is fully idle as
> > it has to maintain the periodic tick.
> >
> > Cure this by implementing a trivial moving average filter which keeps track
> > of the recent idle recidency time and only stop the tick when the average
> > is larger than a tick.
> >
> > Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>
>
> How about so? No reason to not also pass this into the idle governors.
> This way it becomes a common least functionality. Governor can override,
> but it had better have a good reason.
>
> ---
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -6,10 +6,12 @@
> * (NOTE: these are not related to SCHED_IDLE batch scheduled
> * tasks which are handled in sched/fair.c )
> */
> +#include <linux/sched/clock.h>
> #include <linux/cpuidle.h>
> #include <linux/suspend.h>
> #include <linux/livepatch.h>
> #include "sched.h"
> +#include "pelt.h"
> #include "smp.h"
>
> /* Linker adds these: start and end of __cpuidle functions */
> @@ -105,12 +107,7 @@ static inline void cond_tick_broadcast_e
> static inline void cond_tick_broadcast_exit(void) { }
> #endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */
>
> -/**
> - * default_idle_call - Default CPU idle routine.
> - *
> - * To use when the cpuidle framework cannot be used.
> - */
> -static void __cpuidle default_idle_call(void)
> +static void __cpuidle __default_idle_call(void)
> {
> instrumentation_begin();
> if (!current_clr_polling_and_test()) {
> @@ -130,6 +127,63 @@ static void __cpuidle default_idle_call(
> instrumentation_end();
> }
>
> +#ifdef CONFIG_NO_HZ_COMMON
> +
> +/* Limit to 4 entries so it fits in a cache line */
> +#define IDLE_DUR_ENTRIES 4
> +#define IDLE_DUR_MASK (IDLE_DUR_ENTRIES - 1)
> +
> +struct idle_nohz_data {
> + u64 duration[IDLE_DUR_ENTRIES];
> + u64 entry_time;
> + u64 sum;
> + unsigned int idx;
> +};
> +
> +static DEFINE_PER_CPU_ALIGNED(struct idle_nohz_data, nohz_data);
> +
> +static void default_idle_enter(void)
> +{
> + this_cpu_write(nohz_data.entry_time, sched_clock());
> +}
> +
> +static inline bool default_stop_tick(void)
> +{
> + struct idle_nohz_data *nd = this_cpu_ptr(&nohz_data);
> + return nd->sum > TICK_NSEC * IDLE_DUR_ENTRIES;
> +}
> +
> +static void default_reflect(void)
> +{
> + struct idle_nohz_data *nd = this_cpu_ptr(&nohz_data);
> + unsigned int idx = nd->idx;
> + s64 delta;
> +
> + /*
> + * Build a moving average of the time spent in idle to prevent stopping
> + * the tick on a loaded system which only goes idle briefly.
> + */
> + delta = max(sched_clock() - nd->entry_time, 0);
> + nd->sum += delta - nd->duration[idx];
> + nd->duration[idx] = delta;
> + nd->idx = (idx + 1) & IDLE_DUR_MASK;
> +}
> +#else /* CONFIG_NO_HZ_COMMON */
> +static inline void default_idle_enter(void) { }
> +static inline bool default_stop_tick(void) { return false; }
> +static inline void default_reflect(void) { }
> +#endif /* !CONFIG_NO_HZ_COMMON */
> +
> +static inline void default_idle_call(void)
> +{
> + if (default_stop_tick())
> + tick_nohz_idle_stop_tick();
> +
> + __default_idle_call();
> +
> + default_reflect();
> +}
> +
> static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
> struct cpuidle_device *dev,
> u64 max_latency_ns)
> @@ -186,8 +240,6 @@ static void cpuidle_idle_call(void)
> }
>
> if (cpuidle_not_available(drv, dev)) {
> - tick_nohz_idle_stop_tick();
> -
> default_idle_call();
> goto exit_idle;
> }
> @@ -222,7 +274,7 @@ static void cpuidle_idle_call(void)
> next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
> call_cpuidle(drv, dev, next_state);
> } else {
> - bool stop_tick = true;
> + bool stop_tick = default_stop_tick();
>
> /*
> * Ask the cpuidle framework to choose a convenient idle state.
> @@ -238,6 +290,7 @@ static void cpuidle_idle_call(void)
> /*
> * Give the governor an opportunity to reflect on the outcome
> */
> + default_reflect();
> cpuidle_reflect(dev, entered_state);
> }
>
> @@ -276,6 +329,7 @@ static void do_idle(void)
>
> __current_set_polling();
> tick_nohz_idle_enter();
> + default_idle_enter();
>
> while (!need_resched()) {
>
Damn, lost hunk:
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 65fbb8e807b9..c7876e9e024f 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -359,16 +359,6 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
bool *stop_tick)
{
- /*
- * If there is only a single idle state (or none), there is nothing
- * meaningful for the governor to choose. Skip the governor and
- * always use state 0 with the tick running.
- */
- if (drv->state_count <= 1) {
- *stop_tick = false;
- return 0;
- }
-
return cpuidle_curr_governor->select(drv, dev, stop_tick);
}
Also, I suppose menu wants this?
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 899ff16ff1fe..a75fe1fca65d 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -290,7 +290,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
* it right away and keep the tick running if state[0] is a
* polling one.
*/
- *stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING);
+ if (drv->states[0].flags & CPUIDLE_FLAG_POLLING)
+ *stop_tick = false;
return 0;
}