Re: [PATCH v2 00/17] sched/paravirt: Introduce cpu_preferred_mask and steal-driven vCPU backoff
From: Shrikanth Hegde
Date: Fri Apr 10 2026 - 05:52:10 EST
On 4/8/26 12:49 AM, Shrikanth Hegde wrote:
In a virtualized environment, there is often vCPU overcommit, i.e., the sum
of CPUs across all guests (virtual CPUs, aka vCPUs) exceeds the number of underlying
physical CPUs (pCPUs, managed by the host).
This patch allows writing a custom CPU list into the preferred CPUs mask.
It lets one echo specific CPUs based on the hardware
topology. This could be used to discover the different kinds
of patterns across hardware, and the kind of arch-specific hooks one might need
if the generic STEAL_MONITOR can't cater to all needs.
Note: providing a custom mask disables the generic steal monitor, and
echoing an empty mask re-enables it.
---
drivers/base/cpu.c | 54 ++++++++++++++++++++++++++++++++++++++++++-
include/linux/sched.h | 3 +++
kernel/sched/core.c | 4 ++++
3 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 0a6cf37f2001..133f28b15906 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -392,12 +392,64 @@ static int cpu_uevent(const struct device *dev, struct kobj_uevent_env *env)
#endif
#ifdef CONFIG_PARAVIRT
+static ssize_t preferred_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ cpumask_var_t temp_mask;
+ int retval = 0;
+ int cpu;
+
+ if (!alloc_cpumask_var(&temp_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ retval = cpulist_parse(buf, temp_mask);
+ if (retval)
+ goto free_mask;
+
+ /* ALL cpus can't be marked as paravirt */
+ if (cpumask_equal(temp_mask, cpu_online_mask)) {
+ retval = -EINVAL;
+ goto free_mask;
+ }
+ if (cpumask_weight(temp_mask) > num_online_cpus()) {
+ retval = -EINVAL;
+ goto free_mask;
+ }
+
+ /* Echoing > means all CPUs are preferred and Enables generic steal monitor */
+ if (cpumask_empty(temp_mask)) {
+ static_branch_disable(&disable_generic_steal_mon);
+ cpumask_copy((struct cpumask *)&__cpu_preferred_mask, cpu_online_mask);
+
+ } else {
+ /*
+ * Explicit Specification of Usable CPUs and Disables generic steal
+ * monitor
+ */
+ static_branch_enable(&disable_generic_steal_mon);
+ cpumask_copy((struct cpumask *)&__cpu_preferred_mask, temp_mask);
+
+ /* Enable tick on nohz_full cpu */
+ for_each_cpu_andnot(cpu, cpu_online_mask, temp_mask) {
+ if (tick_nohz_full_cpu(cpu))
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+ }
+ }
+
+ retval = count;
+
+free_mask:
+ free_cpumask_var(temp_mask);
+ return retval;
+}
+
static ssize_t preferred_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%*pbl\n", cpumask_pr_args(cpu_preferred_mask));
}
-static DEVICE_ATTR_RO(preferred);
+static DEVICE_ATTR_RW(preferred);
#endif
const struct bus_type cpu_subsys = {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6c0d5d36f21c..3760c8047ffe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2515,4 +2515,7 @@ extern void migrate_enable(void);
DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
+#ifdef CONFIG_PARAVIRT
+DECLARE_STATIC_KEY_FALSE(disable_generic_steal_mon);
+#endif
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cb9110f95ebf..680da55070f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11339,6 +11339,7 @@ void sched_push_current_non_preferred_cpu(struct rq *rq)
}
struct steal_monitor_t steal_mon;
+DEFINE_STATIC_KEY_FALSE(disable_generic_steal_mon);
void sched_init_steal_monitor(void)
{
@@ -11428,6 +11429,9 @@ void sched_trigger_steal_computation(int cpu)
if (likely(cpu != first_hk_cpu))
return;
+ if (static_branch_unlikely(&disable_generic_steal_mon))
+ return;
+
/*
* Since everything is updated by first housekeeping CPU,
* There is no need for complex syncronization.
--
2.47.3