Re: [PATCH v4 15/20] sched/core: Compute steal values at regular intervals
From: Yury Norov
Date: Thu Jun 18 2026 - 00:04:31 EST
On Wed, Jun 17, 2026 at 11:11:34PM +0530, Shrikanth Hegde wrote:
> Kick off the work to compute the steal time at regular intervals.
> This is gated by the steal monitor static key check to avoid any overhead
> when it's disabled.
>
> The sampling period can be changed at runtime using steal_mon/sampling_period.
> The default is 1000 milliseconds, i.e. 1 second.
>
> This work is done by first active housekeeping CPU only. Hence it won't
> need any complicated synchronization.
>
> Now that sched_steal_mon_enabled(), which is a static branch, is available,
> add it to hot paths such as wakeup and load balance.
> This makes them effectively a no-op when the feature is disabled.
>
> Signed-off-by: Shrikanth Hegde <sshegde@xxxxxxxxxxxxx>
> ---
> v3->v4:
> - Add static key check in hotpaths. Could be split into a separate
> patch. Let me know if that's better.
>
> include/linux/sched.h | 2 ++
> kernel/sched/core.c | 28 +++++++++++++++++++++++++++-
> kernel/sched/debug.c | 1 +
> kernel/sched/fair.c | 3 ++-
> kernel/sched/sched.h | 10 +++++++++-
> 5 files changed, 41 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index ce6bc8a22eb1..5b15353ed7ef 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2527,5 +2527,7 @@ struct steal_monitor_t {
> unsigned int high_threshold;
> unsigned int sampling_period_ms;
> };
> +
> +extern struct steal_monitor_t steal_mon;
> #endif
> #endif
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index cc48632dd42d..f1a91021e357 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5793,7 +5793,7 @@ void sched_tick(void)
> unsigned long hw_pressure;
> u64 resched_latency;
>
> - if (!cpu_preferred(cpu))
> + if (sched_steal_mon_enabled() && !cpu_preferred(cpu))
> sched_push_current_non_preferred_cpu(rq);
It looks like a CPU can be non-preferred only if the steal monitor is
enabled. To implement this properly, you need to mark all active CPUs
as preferred when the steal monitor is disabled. That way you don't
need to complicate the condition.
>
> if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
> @@ -5834,6 +5834,9 @@ void sched_tick(void)
> rq->idle_balance = idle_cpu(cpu);
> sched_balance_trigger(rq);
> }
> +
> + if (sched_steal_mon_enabled())
> + sched_trigger_steal_computation(cpu);
> }
>
> #ifdef CONFIG_NO_HZ_FULL
> @@ -11407,4 +11410,27 @@ void sched_steal_detection_work(struct work_struct *work)
> now = ktime_get();
> sm->prev_time = now;
> }
> +
> +void sched_trigger_steal_computation(int cpu)
> +{
> + int first_hk_cpu = cpumask_first_and(housekeeping_cpumask(HK_TYPE_KERNEL_NOISE),
> + cpu_active_mask);
> + ktime_t now;
> +
> + /* Done by first active housekeeping CPU only */
> + if (likely(cpu != first_hk_cpu))
> + return;
> +
> + /*
> + * Since everything is updated by first housekeeping CPU,
> + * there is no need for complex synchronization.
> + */
> + now = ktime_get();
> +
> + /* Default is once per second */
> + if (likely(ktime_ms_delta(now, steal_mon.prev_time) < steal_mon.sampling_period_ms))
> + return;
> +
> + schedule_work_on(first_hk_cpu, &steal_mon.work);
I think there should be a better way to schedule work at a regular
interval...
Maybe steal_mon.work could schedule itself? That is, it's scheduled
for the first time when the steal monitor is enabled, and then it just
reschedules itself. This way you'll avoid polluting sched_tick().
> +}
> #endif
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 2d62858f9cc0..55b8beb42574 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -649,6 +649,7 @@ static ssize_t sched_sm_en_write(struct file *filp, const char __user *ubuf,
> static_branch_enable(&__sched_sm_enable);
> } else if (!sched_sm_wr_enable && orig) {
> static_branch_disable(&__sched_sm_enable);
> + cancel_work_sync(&steal_mon.work);
> cpumask_copy(&__cpu_preferred_mask, cpu_active_mask);
> }
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3f3c7f0ca489..b02a414ffaae 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -13292,7 +13292,8 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
>
> /* Spread load among preferred CPUs */
> - cpumask_and(cpus, cpus, cpu_preferred_mask);
> + if (sched_steal_mon_enabled())
> + cpumask_and(cpus, cpus, cpu_preferred_mask);
Again, if you do cpumask_copy(preferred, active) when the steal
monitor is disabled, you don't need to complicate the core logic here
and there.
>
> schedstat_inc(sd->lb_count[idle]);
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 984da3827f19..f3814099cc0b 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1060,6 +1060,7 @@ struct root_domain {
> struct perf_domain __rcu *pd;
> };
>
> +static inline bool sched_steal_mon_enabled(void);
> extern void init_defrootdomain(void);
> extern int sched_init_domains(const struct cpumask *cpu_map);
> extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
> @@ -1436,7 +1437,7 @@ static inline bool available_idle_cpu(int cpu)
> if (!idle_rq(cpu_rq(cpu)))
> return 0;
>
> - if (!cpu_preferred(cpu))
> + if (sched_steal_mon_enabled() && !cpu_preferred(cpu))
> return 0;
>
> if (vcpu_is_preempted(cpu))
> @@ -4243,8 +4244,15 @@ DECLARE_STATIC_KEY_FALSE(__sched_sm_enable);
> void sched_init_steal_monitor(void);
> void sched_steal_detection_work(struct work_struct *work);
> void sched_push_current_non_preferred_cpu(struct rq *rq);
> +void sched_trigger_steal_computation(int cpu);
> +static inline bool sched_steal_mon_enabled(void)
> +{
> + return static_branch_unlikely(&__sched_sm_enable);
> +}
> #else /* !CONFIG_PREFERRED_CPU */
> static inline void sched_push_current_non_preferred_cpu(struct rq *rq) { }
> static inline void sched_init_steal_monitor(void) { }
> +static inline void sched_trigger_steal_computation(int cpu) { }
> +static inline bool sched_steal_mon_enabled(void) { return false; }
> #endif
> #endif /* _KERNEL_SCHED_SCHED_H */
> --
> 2.47.3