Re: [PATCH v2 2/5] sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity
From: Peter Zijlstra
Date: Mon May 18 2026 - 17:01:04 EST
On Sat, May 16, 2026 at 07:58:50AM +0200, Andrea Righi wrote:
> From: K Prateek Nayak <kprateek.nayak@xxxxxxx>
>
> On asymmetric CPU capacity systems, the wakeup path uses
> select_idle_capacity(), which scans the span of sd_asym_cpucapacity
> rather than sd_llc.
>
> The has_idle_cores hint however lives on sd_llc->shared, so the
> wakeup-time read of has_idle_cores operates on an LLC-scoped blob while
> the actual scan/decision spans the asym domain; nr_busy_cpus also lives
> in the same shared sched_domain data, but it's never used in the asym
> CPU capacity scenario.
>
> Therefore, move the sched_domain_shared object to sd_asym_cpucapacity
> whenever the CPU has a SD_ASYM_CPUCAPACITY_FULL ancestor and that
> ancestor is non-overlapping (i.e., not built from SD_NUMA). In that case
> the scope of has_idle_cores matches the scope of the wakeup scan.
>
> Fall back to attaching the shared object to sd_llc in three cases:
>
> 1) plain symmetric systems (no SD_ASYM_CPUCAPACITY_FULL anywhere);
>
> 2) CPUs in an exclusive cpuset that carves out a symmetric capacity
> island: has_asym is system-wide but those CPUs have no
> SD_ASYM_CPUCAPACITY_FULL ancestor in their hierarchy and follow
> the symmetric LLC path in select_idle_sibling();
>
> 3) exotic topologies where SD_ASYM_CPUCAPACITY_FULL lands on an
> SD_NUMA-built domain. init_sched_domain_shared() keys the shared
> blob off cpumask_first(span), which on overlapping NUMA domains
> would alias unrelated spans onto the same blob. Keep the shared
> object on the LLC there; select_idle_capacity() gracefully skips
> the has_idle_cores preference when sd->shared is NULL.
>
> While at it, also rename the per-CPU sd_llc_shared to sd_balance_shared,
> as it is no longer strictly tied to the LLC.
>
> Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
> Acked-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> Co-developed-by: Andrea Righi <arighi@xxxxxxxxxx>
> Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
> Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
> ---
> Changes in v2:
> - update comment referencing the old sd_llc->shared->has_idle_cores
> (Shrikanth Hegde)
Right, so I just merged a branch that has this series with a branch that
has the cache-aware load balancing stuff on it, and the result ain't
pretty.
That cache aware thing really wants sd_llc_shared. Now, I imagine that
for now the intersection between ASYM and SCHED_CACHE is not that
interesting, but at the same time, I'm fairly sure that is something
people will end up looking at.
For now, I've stomped on things and the merge contains the changes below.
It builds, but has not been tested much beyond that.
I've pushed out the whole pile into queue/sched/core.
diff --cc kernel/sched/topology.c
index f96d50131495,e47a3f72eb72..000000000000
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@@ -663,9 -670,9 +670,10 @@@ static void destroy_sched_domains(struc
*/
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
- DEFINE_PER_CPU(int, sd_llc_id);
+ DEFINE_PER_CPU(int, sd_llc_id) = -1;
DEFINE_PER_CPU(int, sd_share_id);
+ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
@@@ -729,6 -717,9 +718,20 @@@ static void update_top_cache_domain(in
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
+
+ sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
++ /*
++ * The shared object is attached to sd_asym_cpucapacity only when the
++ * asym domain is non-overlapping (i.e., not built from SD_NUMA).
++ * On overlapping (NUMA) asym domains we fall back to letting the
++ * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
++ * here.
++ */
++ if (sd && sd->shared)
++ sds = sd->shared;
++
+ rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
++ rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
}
/*
@@@ -2663,54 -2906,61 +2916,109 @@@ static void adjust_numa_imbalance(struc
}
}
+static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
+{
+ int sd_id = cpumask_first(sched_domain_span(sd));
+
+ sd->shared = *per_cpu_ptr(d->sds, sd_id);
+ /*
+ * nr_busy_cpus is consumed only by the NOHZ kick path via
+ * sd_balance_shared; on the asym-capacity path it is initialized but
+ * never read.
+ */
+ atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+ atomic_inc(&sd->shared->ref);
+}
+
+/*
+ * For asymmetric CPU capacity, attach sched_domain_shared on the innermost
+ * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is
+ * not an overlapping NUMA-built domain (then LLC should claim shared).
+ *
+ * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island),
+ * then LLC must claim shared instead.
+ *
+ * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values
+ * are present in the domain span, so the asym domain we attach to cannot
+ * degenerate into a single-capacity group. The relevant edge cases are instead
+ * covered by the caveats above.
+ *
+ * Return true if this CPU's asym path claimed sd->shared, false otherwise.
+ */
+static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
+{
+ struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu);
+ struct sched_domain *sd_asym;
+
+ if (!sd)
+ return false;
+
+ sd_asym = sd;
+ while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
+ sd_asym = sd_asym->parent;
+
+ if (!sd_asym || (sd_asym->flags & SD_NUMA))
+ return false;
+
+ init_sched_domain_shared(d, sd_asym);
+ return true;
+}
+
+ static int __sched_domains_alloc_llc_id(void)
+ {
+ int lid, max;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
+ /*
+ * llc_id space should never grow larger than the
+ * possible number of CPUs in the system.
+ */
+ if (lid >= nr_cpu_ids)
+ return -1;
+
+ __cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
+ max = cpumask_last(sched_domains_llc_id_allocmask);
+ if (max > max_lid)
+ max_lid = max;
+
+ return lid;
+ }
+
+ static void __sched_domains_free_llc_id(int cpu)
+ {
+ int i, lid, max;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = per_cpu(sd_llc_id, cpu);
+ if (lid == -1 || lid >= nr_cpu_ids)
+ return;
+
+ per_cpu(sd_llc_id, cpu) = -1;
+
+ for_each_cpu(i, llc_mask(cpu)) {
+ /* An online CPU owns the llc_id. */
+ if (per_cpu(sd_llc_id, i) == lid)
+ return;
+ }
+
+ __cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
+
+ max = cpumask_last(sched_domains_llc_id_allocmask);
+ /* shrink max lid to save memory */
+ if (max < max_lid)
+ max_lid = max;
+ }
+
+ void sched_domains_free_llc_id(int cpu)
+ {
+ sched_domains_mutex_lock();
+ __sched_domains_free_llc_id(cpu);
+ sched_domains_mutex_unlock();
+ }
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
@@@ -2775,20 -3049,16 +3107,15 @@@ build_sched_domains(const struct cpumas
if (!sd)
continue;
+ if (has_asym)
- asym_claimed = claim_asym_sched_domain_shared(&d, i);
++ claim_asym_sched_domain_shared(&d, i);
+
/* First, find the topmost SD_SHARE_LLC domain */
while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
sd = sd->parent;
if (sd->flags & SD_SHARE_LLC) {
- /*
- * Initialize the sd->shared for SD_SHARE_LLC unless
- * the asym path above already claimed it.
- */
- if (!asym_claimed)
- init_sched_domain_shared(&d, sd);
- int sd_id = cpumask_first(sched_domain_span(sd));
-
- sd->shared = *per_cpu_ptr(d.sds, sd_id);
- atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
- atomic_inc(&sd->shared->ref);
++ init_sched_domain_shared(&d, sd);
/*
* In presence of higher domains, adjust the
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9dd4a94801c9..300320b0248a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2191,6 +2191,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(int, sd_share_id);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);