Re: [patch 2/2] sched/idle: Make default_idle_call() NOHZ aware

From: Peter Zijlstra

Date: Mon Mar 02 2026 - 07:20:23 EST


On Sun, Mar 01, 2026 at 08:30:51PM +0100, Thomas Gleixner wrote:
> Guests fall back to default_idle_call() as there is no cpuidle driver
> available to them by default. That causes a problem in fully loaded
> scenarios where CPUs go briefly idle for a couple of microseconds:
>
> tick_nohz_idle_stop_tick() is invoked unconditionally, which means that
> unless there is a timer pending in the next tick, the tick is stopped and,
> a couple of microseconds later when the idle condition goes away, restarted.
> That requires programming the clockevent device twice, which implies a VM
> exit for each reprogramming.
>
> It was suggested to remove the tick_nohz_idle_stop_tick() invocation from
> the default idle code, but that would be counterproductive. It would not allow
> the host to go into deeper idle states when the guest CPU is fully idle as
> it has to maintain the periodic tick.
>
> Cure this by implementing a trivial moving average filter which keeps track
> of the recent idle residency time and only stops the tick when the average
> is larger than a tick.
>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>

How about this? There is no reason not to also pass this into the idle governors.
This way it becomes a common baseline functionality. A governor can override
it, but it had better have a good reason.

---
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -6,10 +6,12 @@
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
+#include <linux/sched/clock.h>
#include <linux/cpuidle.h>
#include <linux/suspend.h>
#include <linux/livepatch.h>
#include "sched.h"
+#include "pelt.h"
#include "smp.h"

/* Linker adds these: start and end of __cpuidle functions */
@@ -105,12 +107,7 @@ static inline void cond_tick_broadcast_e
static inline void cond_tick_broadcast_exit(void) { }
#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */

-/**
- * default_idle_call - Default CPU idle routine.
- *
- * To use when the cpuidle framework cannot be used.
- */
-static void __cpuidle default_idle_call(void)
+static void __cpuidle __default_idle_call(void)
{
instrumentation_begin();
if (!current_clr_polling_and_test()) {
@@ -130,6 +127,63 @@ static void __cpuidle default_idle_call(
instrumentation_end();
}

+#ifdef CONFIG_NO_HZ_COMMON
+
+/* Limit to 4 entries so it fits in a cache line */
+#define IDLE_DUR_ENTRIES 4
+#define IDLE_DUR_MASK (IDLE_DUR_ENTRIES - 1)
+
+/*
+ * Per-CPU bookkeeping for the idle duration filter which decides whether
+ * the tick should be stopped on idle entry.
+ */
+struct idle_nohz_data {
+ /* Ring of the last IDLE_DUR_ENTRIES measured idle durations */
+ u64 duration[IDLE_DUR_ENTRIES];
+ /* sched_clock() timestamp written by default_idle_enter() */
+ u64 entry_time;
+ /* Running sum over duration[], maintained by default_reflect() */
+ u64 sum;
+ /* Slot in duration[] which the next measurement overwrites */
+ unsigned int idx;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct idle_nohz_data, nohz_data);
+
+/*
+ * Record the sched_clock() timestamp of idle entry in this CPU's nohz_data
+ * so that default_reflect() can compute how long the CPU stayed idle.
+ *
+ * NOTE(review): do_idle() calls this once, ahead of its need_resched()
+ * loop, while default_reflect() runs after every idle call. Confirm that
+ * repeated measurements against the same entry_time are intended.
+ */
+static void default_idle_enter(void)
+{
+ this_cpu_write(nohz_data.entry_time, sched_clock());
+}
+
+/*
+ * Decide whether the tick should be stopped for the upcoming idle period.
+ *
+ * Returns true when the sum of the recorded idle durations exceeds
+ * IDLE_DUR_ENTRIES ticks, i.e. when their average is longer than one tick.
+ * Comparing the sum against the scaled threshold avoids a division.
+ */
+static inline bool default_stop_tick(void)
+{
+ struct idle_nohz_data *nd = this_cpu_ptr(&nohz_data);
+ return nd->sum > TICK_NSEC * IDLE_DUR_ENTRIES;
+}
+
+/*
+ * Account the idle period which just ended in this CPU's duration filter.
+ *
+ * The time elapsed since default_idle_enter() replaces the oldest slot of
+ * the ring and the running sum is adjusted by the difference, so the sum
+ * always covers the last IDLE_DUR_ENTRIES measurements.
+ */
+static void default_reflect(void)
+{
+ struct idle_nohz_data *nd = this_cpu_ptr(&nohz_data);
+ unsigned int idx = nd->idx;
+ s64 delta;
+
+ /*
+ * Build a moving average of the time spent in idle to prevent stopping
+ * the tick on a loaded system which only goes idle briefly.
+ *
+ * The subtraction is done on unsigned values, so it has to be
+ * evaluated as s64 for the clamp against zero to have any effect when
+ * the clock appears to have gone backwards. Without the clamp a bogus
+ * delta would inflate the sum and wrongly allow stopping the tick.
+ */
+ delta = max_t(s64, sched_clock() - nd->entry_time, 0);
+ nd->sum += delta - nd->duration[idx];
+ nd->duration[idx] = delta;
+ nd->idx = (idx + 1) & IDLE_DUR_MASK;
+}
+#else /* CONFIG_NO_HZ_COMMON */
+/* Stubs: no idle duration tracking, default_stop_tick() always reports false */
+static inline void default_idle_enter(void) { }
+static inline bool default_stop_tick(void) { return false; }
+static inline void default_reflect(void) { }
+#endif /* !CONFIG_NO_HZ_COMMON */
+
+/**
+ * default_idle_call - Default CPU idle routine.
+ *
+ * To use when the cpuidle framework cannot be used. The tick is stopped
+ * only when default_stop_tick() allows it, and the outcome of the idle
+ * period is fed back through default_reflect() afterwards.
+ */
+static inline void default_idle_call(void)
+{
+ if (default_stop_tick())
+ tick_nohz_idle_stop_tick();
+
+ __default_idle_call();
+
+ default_reflect();
+}
+
static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
struct cpuidle_device *dev,
u64 max_latency_ns)
@@ -186,8 +240,6 @@ static void cpuidle_idle_call(void)
}

if (cpuidle_not_available(drv, dev)) {
- tick_nohz_idle_stop_tick();
-
default_idle_call();
goto exit_idle;
}
@@ -222,7 +274,7 @@ static void cpuidle_idle_call(void)
next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
} else {
- bool stop_tick = true;
+ bool stop_tick = default_stop_tick();

/*
* Ask the cpuidle framework to choose a convenient idle state.
@@ -238,6 +290,7 @@ static void cpuidle_idle_call(void)
/*
* Give the governor an opportunity to reflect on the outcome
*/
+ default_reflect();
cpuidle_reflect(dev, entered_state);
}

@@ -276,6 +329,7 @@ static void do_idle(void)

__current_set_polling();
tick_nohz_idle_enter();
+ default_idle_enter();

while (!need_resched()) {