[PATCH v4 15/20] sched/core: Compute steal values at regular intervals
From: Shrikanth Hegde
Date: Wed Jun 17 2026 - 13:47:16 EST
Kick off the work to compute the steal time at regular intervals.
This is gated by the steal monitor static key check to avoid any overhead
when it is disabled.
The sampling period can be changed at runtime using steal_mon/sampling_period.
The default is 1000 milliseconds, i.e. 1 second.
This work is done by the first active housekeeping CPU only. Hence it doesn't
need any complicated synchronization.
Now that sched_steal_mon_enabled(), which is a static branch, is available,
add it to hot paths such as wakeup and load balance.
This makes those checks effectively a nop when the feature is disabled.
Signed-off-by: Shrikanth Hegde <sshegde@xxxxxxxxxxxxx>
---
v3->v4:
- Add static key check in hot paths. This could be split into a separate
patch. Let me know if that's better.
include/linux/sched.h | 2 ++
kernel/sched/core.c | 28 +++++++++++++++++++++++++++-
kernel/sched/debug.c | 1 +
kernel/sched/fair.c | 3 ++-
kernel/sched/sched.h | 10 +++++++++-
5 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ce6bc8a22eb1..5b15353ed7ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2527,5 +2527,7 @@ struct steal_monitor_t {
unsigned int high_threshold;
unsigned int sampling_period_ms;
};
+
+extern struct steal_monitor_t steal_mon;
#endif
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cc48632dd42d..f1a91021e357 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5793,7 +5793,7 @@ void sched_tick(void)
unsigned long hw_pressure;
u64 resched_latency;
- if (!cpu_preferred(cpu))
+ if (sched_steal_mon_enabled() && !cpu_preferred(cpu))
sched_push_current_non_preferred_cpu(rq);
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
@@ -5834,6 +5834,9 @@ void sched_tick(void)
rq->idle_balance = idle_cpu(cpu);
sched_balance_trigger(rq);
}
+
+ if (sched_steal_mon_enabled())
+ sched_trigger_steal_computation(cpu);
}
#ifdef CONFIG_NO_HZ_FULL
@@ -11407,4 +11410,27 @@ void sched_steal_detection_work(struct work_struct *work)
now = ktime_get();
sm->prev_time = now;
}
+
+void sched_trigger_steal_computation(int cpu)
+{
+ int first_hk_cpu = cpumask_first_and(housekeeping_cpumask(HK_TYPE_KERNEL_NOISE),
+ cpu_active_mask);
+ ktime_t now;
+
+ /* Only the first active housekeeping CPU queues the work; others bail out */
+ if (likely(cpu != first_hk_cpu))
+ return;
+
+ /*
+ * Since everything is updated by the first housekeeping CPU,
+ * there is no need for complex synchronization.
+ */
+ now = ktime_get();
+
+ /* Skip until sampling_period_ms has elapsed; default is once per second */
+ if (likely(ktime_ms_delta(now, steal_mon.prev_time) < steal_mon.sampling_period_ms))
+ return;
+
+ schedule_work_on(first_hk_cpu, &steal_mon.work);
+}
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2d62858f9cc0..55b8beb42574 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -649,6 +649,7 @@ static ssize_t sched_sm_en_write(struct file *filp, const char __user *ubuf,
static_branch_enable(&__sched_sm_enable);
} else if (!sched_sm_wr_enable && orig) {
static_branch_disable(&__sched_sm_enable);
+ cancel_work_sync(&steal_mon.work);
cpumask_copy(&__cpu_preferred_mask, cpu_active_mask);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3f3c7f0ca489..b02a414ffaae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13292,7 +13292,8 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
/* Spread load among preferred CPUs */
- cpumask_and(cpus, cpus, cpu_preferred_mask);
+ if (sched_steal_mon_enabled())
+ cpumask_and(cpus, cpus, cpu_preferred_mask);
schedstat_inc(sd->lb_count[idle]);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 984da3827f19..f3814099cc0b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1060,6 +1060,7 @@ struct root_domain {
struct perf_domain __rcu *pd;
};
+static inline bool sched_steal_mon_enabled(void);
extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
@@ -1436,7 +1437,7 @@ static inline bool available_idle_cpu(int cpu)
if (!idle_rq(cpu_rq(cpu)))
return 0;
- if (!cpu_preferred(cpu))
+ if (sched_steal_mon_enabled() && !cpu_preferred(cpu))
return 0;
if (vcpu_is_preempted(cpu))
@@ -4243,8 +4244,15 @@ DECLARE_STATIC_KEY_FALSE(__sched_sm_enable);
void sched_init_steal_monitor(void);
void sched_steal_detection_work(struct work_struct *work);
void sched_push_current_non_preferred_cpu(struct rq *rq);
+void sched_trigger_steal_computation(int cpu);
+static inline bool sched_steal_mon_enabled(void)
+{
+ return static_branch_unlikely(&__sched_sm_enable); /* key is declared default-false */
+}
#else /* !CONFIG_PREFERRED_CPU */
static inline void sched_push_current_non_preferred_cpu(struct rq *rq) { }
static inline void sched_init_steal_monitor(void) { }
+static inline void sched_trigger_steal_computation(int cpu) { }
+static inline bool sched_steal_mon_enabled(void) { return false; }
#endif
#endif /* _KERNEL_SCHED_SCHED_H */
--
2.47.3