[PATCH v2 13/17] sched/core: Introduce a simple steal monitor

From: Shrikanth Hegde

Date: Tue Apr 07 2026 - 15:25:39 EST


Start with a simple steal monitor.

It is meant to look at steal time and decide whether to reduce or
increase the set of preferred CPUs.

It has
- work function to execute the steal time calculations and decision
making periodically.
- temporary cpumask, which will be used in the work function. This helps
to avoid cpumask allocation in periodic work function.
- low and high thresholds for steal time.
- sampling period to control the frequency of steal time calculations.
- cache the previous decision to avoid oscillations

Signed-off-by: Shrikanth Hegde <sshegde@xxxxxxxxxxxxx>
---
kernel/sched/core.c | 23 +++++++++++++++++++++++
kernel/sched/sched.h | 14 ++++++++++++++
2 files changed, 37 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7a9442439eb2..8c80600ddd28 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9083,6 +9083,8 @@ void __init sched_init(void)

preempt_dynamic_init();

+ sched_init_steal_monitor();
+
scheduler_running = 1;
}

@@ -11332,4 +11334,25 @@ void sched_push_current_non_preferred_cpu(struct rq *rq)
local_irq_restore(flags);
}

+/* Global steal-time monitor state; zero-initialized at boot (BSS). */
+struct steal_monitor_t steal_mon;
+
+/*
+ * One-time setup of the steal monitor, called from sched_init().
+ * Preallocates the scratch cpumask so the periodic work function
+ * never has to allocate.
+ */
+void sched_init_steal_monitor(void)
+{
+ INIT_WORK(&steal_mon.work, sched_steal_detection_work);
+ /*
+ * With CONFIG_CPUMASK_OFFSTACK=y this allocation can fail; warn so
+ * a NULL tmp_mask is not dereferenced silently by the work function.
+ */
+ if (!zalloc_cpumask_var(&steal_mon.tmp_mask, GFP_NOWAIT))
+ pr_warn("sched: steal monitor tmp_mask allocation failed\n");
+ steal_mon.low_threshold = 200; /* 2% steal time */
+ steal_mon.high_threshold = 500; /* 5% steal time */
+ steal_mon.sampling_period_ms = 1000; /* once per second */
+}
+
+/* This is only a skeleton. Subsequent patches introduce more of it */
+/* This is only a skeleton. Subsequent patches introduce more of it */
+void sched_steal_detection_work(struct work_struct *work)
+{
+ struct steal_monitor_t *sm = container_of(work, struct steal_monitor_t, work);
+
+ /* Record the sample timestamp so the next iteration can compute a delta. */
+ sm->prev_time = ktime_get();
+}
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c1d037f11c62..c0fbfb04eda3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4138,12 +4138,25 @@ DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
#include "ext.h"

#ifdef CONFIG_PARAVIRT
+/* State for the periodic steal-time monitor (kernel/sched/core.c). */
+struct steal_monitor_t {
+ struct work_struct work; /* periodic steal-time sampling work */
+ cpumask_var_t tmp_mask; /* scratch mask, preallocated at init to avoid allocating in the work fn */
+ ktime_t prev_time; /* timestamp of the previous sample */
+ u64 prev_steal; /* steal counter value at the previous sample */
+ int previous_decision; /* cached last decision, to avoid oscillations */
+ unsigned int low_threshold; /* steal-time low watermark, 0.01% units (200 == 2%) */
+ unsigned int high_threshold; /* steal-time high watermark, 0.01% units (500 == 5%) */
+ unsigned int sampling_period_ms; /* how often the work function samples */
+};
+
static inline bool task_can_run_on_preferred_cpu(struct task_struct *p)
{
return cpumask_intersects(p->cpus_ptr, cpu_preferred_mask);
}

void sched_push_current_non_preferred_cpu(struct rq *rq);
+void sched_init_steal_monitor(void);
+void sched_steal_detection_work(struct work_struct *work);
#else
static inline bool task_can_run_on_preferred_cpu(struct task_struct *p)
{
@@ -4151,6 +4164,7 @@ static inline bool task_can_run_on_preferred_cpu(struct task_struct *p)
}

static inline void sched_push_current_non_preferred_cpu(struct rq *rq) { }
+static inline void sched_init_steal_monitor(void) { }
#endif

#endif /* _KERNEL_SCHED_SCHED_H */
--
2.47.3