Re: [PATCH 1/2] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
From: K Prateek Nayak
Date: Mon Apr 20 2026 - 01:49:50 EST
Hello Andrea,
On 4/18/2026 1:54 PM, Andrea Righi wrote:
>>> @@ -7774,6 +7774,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>>> static int
>>> select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>>> {
>>> + bool prefers_idle_core = sched_smt_active() && test_idle_cores(target);
>>
>> Somehow I miss a:
>>
>> if (prefers_idle_core)
>> set_idle_cores(target, false)
>>
>> The one in select_idle_sibling() -> select_idle_cpu() isn't executed
>> anymore with ASYM_CPUCAPACITY.
>>
>
> Right, we need to add this as also pointed by Vincent.
>
>>
>> Another thing is that sic() iterates over the CPUs of sd_asym_cpucapacity,
>> whereas the idle core thing lives in sd_llc/sd_llc_shared. Both sd's are
>> probably the same on your system.
>
> Hm... they're the same on my machine, but if they're different, clearing
> has_idle_cores here is not right and it might lead to false positives. We should
> clear it only when both domains span the same CPUs (or just check if
> sd_asym_cpucapacity and sd_llc are the same).
>
> However, if they're not the same, I'm not sure exactly what we should do...
> maybe ignore has_idle_cores and always do the scan for now?
With your changes, only two places actually care about test_idle_cores():
- select_idle_capacity()
- select_idle_cpu()
If we go into select_idle_capacity(), we don't do select_idle_cpu() so
the two paths are mutually exclusive.
In nohz_balancer_kick(), if we find sd_asym_cpucapacity, we simply
don't care about the sd_llc_shared->nr_busy_cpus during balancing, so
that raises the question of whether we can simply track idle_cores at
sd_asym_cpucapacity for these systems?
The following is only build-tested for now, but I'll try to spoof asym
cpucapacity on my system and check whether it holds up:
(On top of tip:sched/core at sched-core-2026-04-13 + this series)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 78f2d2c4e24f..509146c486ac 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7913,7 +7913,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
- if (sched_feat(SIS_UTIL)) {
+ if (sched_feat(SIS_UTIL) && sd->shared) {
/*
* Increment because !--nr is the condition to stop scan.
*
@@ -12856,7 +12856,8 @@ static void set_cpu_sd_state_busy(int cpu)
goto unlock;
sd->nohz_idle = 0;
- atomic_inc(&sd->shared->nr_busy_cpus);
+ if (sd->shared)
+ atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -12885,7 +12886,8 @@ static void set_cpu_sd_state_idle(int cpu)
goto unlock;
sd->nohz_idle = 1;
- atomic_dec(&sd->shared->nr_busy_cpus);
+ if (sd->shared)
+ atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 5847b83d9d55..45b919b39c7d 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -680,19 +680,38 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
int size = 1;
+ sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
+ if (sd) {
+ /*
+ * If sd_asym_cpucapacity exists,
+ * the shared object should exist too.
+ */
+ WARN_ON_ONCE(!sd->shared);
+ sds = sd->shared;
+ }
+
+ rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
+
sd = highest_flag_domain(cpu, SD_SHARE_LLC);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- /* If sd_llc exists, sd_llc_shared should exist too. */
- WARN_ON_ONCE(!sd->shared);
- sds = sd->shared;
+ /*
+ * If sd_asym_cpucapacity doesn't exist,
+ * sd_llc_shared must have a sd->shared linked.
+ */
+ if (!sds) {
+ WARN_ON_ONCE(!sd->shared);
+ sds = sd->shared;
+ }
}
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
+
+ /* TODO: Rename sd_llc_shared to fit the new role. */
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -711,9 +730,6 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
-
- sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
- rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}
/*
@@ -2650,6 +2666,15 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
}
}
+static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
+{
+ int sd_id = cpumask_first(sched_domain_span(sd));
+
+ sd->shared = *per_cpu_ptr(d->sds, sd_id);
+ atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+ atomic_inc(&sd->shared->ref);
+}
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
@@ -2712,16 +2737,33 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (!sd)
continue;
+ /*
+ * In case of ASYM_CPUCAPACITY, attach sd->shared to
+ * sd_asym_cpucapacity for wakeup stat tracking.
+ *
+ * XXX: This assumes SD_ASYM_CPUCAPACITY_FULL domain
+ * always has more than one group else it is prone to
+ * degeneration.
+ */
+ if (has_asym) {
+ while (sd && !(sd->flags & SD_ASYM_CPUCAPACITY_FULL))
+ sd = sd->parent;
+
+ init_sched_domain_shared(&d, sd);
+ }
+
/* First, find the topmost SD_SHARE_LLC domain */
+ sd = *per_cpu_ptr(d.sd, i);
while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
sd = sd->parent;
if (sd->flags & SD_SHARE_LLC) {
- int sd_id = cpumask_first(sched_domain_span(sd));
-
- sd->shared = *per_cpu_ptr(d.sds, sd_id);
- atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
- atomic_inc(&sd->shared->ref);
+ /*
+ * Initialize the sd->shared for SD_SHARE_LLC if
+ * SD_ASYM_CPUCAPACITY_FULL hasn't claimed it already.
+ */
+ if (!has_asym)
+ init_sched_domain_shared(&d, sd);
/*
* In presence of higher domains, adjust the
---
I still have one question: Can the first SD_ASYM_CPUCAPACITY_FULL be set
at an SD_NUMA domain?
We'll need to deal with overlapping domains then, but it seems like it
could be possible with weird cpusets :-(
But in that case, do we even want to search CPUs outside the NUMA node in
select_idle_capacity()? I don't think anything stops this currently, but
I might be wrong.
--
Thanks and Regards,
Prateek