[PATCH RFC] tick/sched: Prevent pointless NOHZ transitions
From: Thomas Gleixner
Date: Tue Feb 24 2026 - 03:34:41 EST
During a hackbench run on a fully loaded machine, CPUs go briefly idle
when they run out of tasks, which is expected. What's not expected are
pointless NOHZ transitions like this:
hackbench-1915 [001] d..2. 84.086755: sched_switch: prev_comm=hackbench prev_pid=1915 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120
1) <idle>-0 [001] dn.2. 84.086757: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=305340000000 softexpires=305340000000 mode=ABS|PINNED|HARD was_armed=1
<idle>-0 [001] dn.2. 84.086757: hrtimer_rearm: next_event=83885523974 deferred=0
2) <idle>-0 [001] dN.2. 84.086761: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=82950000000 softexpires=82950000000 mode=ABS|PINNED|HARD was_armed=1
<idle>-0 [001] dN.2. 84.086761: hrtimer_rearm: next_event=82950000000 deferred=0
<idle>-0 [001] d..2. 84.086767: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=hackbench next_pid=2138 next_prio=120
hackbench-2138 [001] d..2. 84.086779: sched_switch: prev_comm=hackbench prev_pid=2138 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120
#1 switches to NOHZ mode targeting the next expiring timer and #2
switches back to tick mode a whopping 4us later.
This happens with both the TEO and MENU governors in a VM guest. That's
not only pointless, it's also a performance issue, as each rearm of the
timer implies a VM exit.
Keep track of the idle durations with a moving average over the last
eight samples and only allow the tick to be stopped in can_stop_idle_tick()
when that average is larger than TICK_NSEC. That cures this behaviour
while still allowing the system to go into long idle sleeps once the
workload has stopped.
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>
---
kernel/time/tick-sched.c | 20 +++++++++++++++++---
kernel/time/tick-sched.h | 9 +++++++++
2 files changed, 26 insertions(+), 3 deletions(-)
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -751,6 +751,16 @@ static void tick_nohz_update_jiffies(kti
touch_softlockup_watchdog_sched();
}
+static void tick_nohz_update_idle_duration(struct tick_sched *ts, ktime_t now)
+{
+ ktime_t delta = now - ts->idle_dur_entry; /* time since idle_dur_entry was recorded */
+ unsigned int idx = ts->idle_dur_idx; /* ring slot that gets overwritten */
+
+ ts->idle_dur_sum += delta - ts->idle_dur[idx]; /* sum: old sample out, new one in */
+ ts->idle_dur[idx] = delta;
+ ts->idle_dur_idx = (idx + 1) & IDLE_DUR_MASK; /* advance, wrapping via the mask */
+}
+
static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
ktime_t delta;
@@ -760,6 +770,8 @@ static void tick_nohz_stop_idle(struct t
delta = ktime_sub(now, ts->idle_entrytime);
+ tick_nohz_update_idle_duration(ts, now);
+
write_seqcount_begin(&ts->idle_sleeptime_seq);
if (nr_iowait_cpu(smp_processor_id()) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
@@ -1224,7 +1236,7 @@ static bool can_stop_idle_tick(int cpu,
return false;
}
- return true;
+ return ts->idle_dur_sum > TICK_NSEC * IDLE_DUR_ENTRIES;
}
/**
@@ -1292,6 +1304,7 @@ void tick_nohz_idle_enter(void)
tick_sched_flag_set(ts, TS_FLAG_INIDLE);
tick_nohz_start_idle(ts);
+ ts->idle_dur_entry = ts->idle_entrytime;
local_irq_enable();
}
@@ -1490,11 +1503,12 @@ void tick_nohz_idle_exit(void)
idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
- if (idle_active || tick_stopped)
- now = ktime_get();
+ now = ktime_get();
if (idle_active)
tick_nohz_stop_idle(ts, now);
+ else
+ tick_nohz_update_idle_duration(ts, now);
if (tick_stopped)
tick_nohz_idle_update_tick(ts, now);
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -30,6 +30,9 @@ struct tick_device {
/* High resolution tick mode */
#define TS_FLAG_HIGHRES BIT(5)
+#define IDLE_DUR_ENTRIES 8 /* must be a power of two, the index is wrapped with IDLE_DUR_MASK */
+#define IDLE_DUR_MASK (IDLE_DUR_ENTRIES - 1)
+
/**
* struct tick_sched - sched tick emulation and no idle tick control/stats
*
@@ -95,6 +98,12 @@ struct tick_sched {
ktime_t idle_sleeptime;
ktime_t iowait_sleeptime;
+ /* Idle duration */
+ ktime_t idle_dur[IDLE_DUR_ENTRIES];
+ ktime_t idle_dur_entry;
+ ktime_t idle_dur_sum;
+ unsigned int idle_dur_idx;
+
/* Full dynticks handling */
atomic_t tick_dep_mask;