Re: [PATCH v4 14/20] sched/core: Introduce a simple steal monitor

From: Yury Norov

Date: Thu Jun 18 2026 - 00:31:01 EST


On Wed, Jun 17, 2026 at 11:11:33PM +0530, Shrikanth Hegde wrote:
> Start with a simple steal monitor.
>
> It is meant to look at steal time and make the decision to
> reduce/increase the preferred CPUs.
>
> It has
> - work function to execute the steal time calculations and decision
> making periodically.
> - low and high thresholds for steal time.
> - sampling period to control the frequency of steal time calculations.
> - cache the previous decision to avoid oscillations

This monitor is just one of many possible implementations, right?
I don't think it should live in the core scheduler files; it should
be a module.

> Signed-off-by: Shrikanth Hegde <sshegde@xxxxxxxxxxxxx>
> ---
> v3->v4:
> - Drop tmp_mask
>
> include/linux/sched.h | 11 +++++++++++
> kernel/sched/core.c | 23 +++++++++++++++++++++++
> kernel/sched/sched.h | 3 +++
> 3 files changed, 37 insertions(+)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5f523782ca28..ce6bc8a22eb1 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2517,4 +2517,15 @@ extern void migrate_enable(void);
>
> DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>
> +#ifdef CONFIG_PREFERRED_CPU
> +struct steal_monitor_t {
> + struct work_struct work;
> + ktime_t prev_time;
> + u64 prev_steal;
> + int previous_decision;
> + unsigned int low_threshold;
> + unsigned int high_threshold;
> + unsigned int sampling_period_ms;
> +};
> +#endif
> #endif
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 24d4abc74241..cc48632dd42d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -9138,6 +9138,8 @@ void __init sched_init(void)
>
> preempt_dynamic_init();
>
> + sched_init_steal_monitor();
> +
> scheduler_running = 1;
> }
>
> @@ -11384,4 +11386,25 @@ void sched_push_current_non_preferred_cpu(struct rq *rq)
> stop_one_cpu_nowait(rq->cpu, sched_non_preferred_cpu_push_stop,
> push_task, this_cpu_ptr(&npc_push_task_work));
> }
> +
> +struct steal_monitor_t steal_mon;
> +
> +void sched_init_steal_monitor(void)
> +{
> + INIT_WORK(&steal_mon.work, sched_steal_detection_work);
> + steal_mon.low_threshold = 200; /* 2% steal time */
> + steal_mon.high_threshold = 500; /* 5% steal time */
> + steal_mon.sampling_period_ms = 1000; /* once per second */
> +}
> +
> +/* This is only a skeleton. Subsequent patches introduce more of it */
> +void sched_steal_detection_work(struct work_struct *work)
> +{
> + struct steal_monitor_t *sm = container_of(work, struct steal_monitor_t, work);
> + ktime_t now;
> +
> + /* Update the prev_time for next iteration*/
> + now = ktime_get();
> + sm->prev_time = now;
> +}
> #endif
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 9cb006c21090..984da3827f19 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -4240,8 +4240,11 @@ static inline bool task_has_preferred_cpus(struct task_struct *p)
> #ifdef CONFIG_PREFERRED_CPU
> DECLARE_STATIC_KEY_FALSE(__sched_sm_enable);
>
> +void sched_init_steal_monitor(void);
> +void sched_steal_detection_work(struct work_struct *work);
> void sched_push_current_non_preferred_cpu(struct rq *rq);
> #else /* !CONFIG_PREFERRED_CPU */
> static inline void sched_push_current_non_preferred_cpu(struct rq *rq) { }
> +static inline void sched_init_steal_monitor(void) { }
> #endif
> #endif /* _KERNEL_SCHED_SCHED_H */
> --
> 2.47.3