Re: [PATCH 2/5] sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity
From: Vincent Guittot
Date: Wed May 06 2026 - 05:51:34 EST
On Tue, 28 Apr 2026 at 16:44, Andrea Righi <arighi@xxxxxxxxxx> wrote:
>
> From: K Prateek Nayak <kprateek.nayak@xxxxxxx>
>
> On asymmetric CPU capacity systems, the wakeup path uses
> select_idle_capacity(), which scans the span of sd_asym_cpucapacity
> rather than sd_llc.
>
> The has_idle_cores hint however lives on sd_llc->shared, so the
> wakeup-time read of has_idle_cores operates on an LLC-scoped blob while
> the actual scan/decision spans the asym domain; nr_busy_cpus also lives
> in the same shared sched_domain data, but it's never used in the asym
> CPU capacity scenario.
>
> Therefore, move the sched_domain_shared object to sd_asym_cpucapacity
> whenever the CPU has a SD_ASYM_CPUCAPACITY_FULL ancestor and that
> ancestor is non-overlapping (i.e., not built from SD_NUMA). In that case
> the scope of has_idle_cores matches the scope of the wakeup scan.
>
> Fall back to attaching the shared object to sd_llc in three cases:
>
> 1) plain symmetric systems (no SD_ASYM_CPUCAPACITY_FULL anywhere);
>
> 2) CPUs in an exclusive cpuset that carves out a symmetric capacity
> island: has_asym is system-wide but those CPUs have no
> SD_ASYM_CPUCAPACITY_FULL ancestor in their hierarchy and follow
> the symmetric LLC path in select_idle_sibling();
>
> 3) exotic topologies where SD_ASYM_CPUCAPACITY_FULL lands on an
> SD_NUMA-built domain. init_sched_domain_shared() keys the shared
> blob off cpumask_first(span), which on overlapping NUMA domains
> would alias unrelated spans onto the same blob. Keep the shared
> object on the LLC there; select_idle_capacity() gracefully skips
> the has_idle_cores preference when sd->shared is NULL.
>
> While at it, also rename the per-CPU sd_llc_shared to sd_balance_shared,
> as it is no longer strictly tied to the LLC.
>
> Co-developed-by: Andrea Righi <arighi@xxxxxxxxxx>
> Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
> Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
> ---
> kernel/sched/fair.c | 17 +++++---
> kernel/sched/sched.h | 2 +-
> kernel/sched/topology.c | 90 +++++++++++++++++++++++++++++++++++------
> 3 files changed, 89 insertions(+), 20 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e0f75dedc8456..bbdf537f61154 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7790,7 +7790,7 @@ static inline void set_idle_cores(int cpu, int val)
> {
> struct sched_domain_shared *sds;
>
> - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
> + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
> if (sds)
> WRITE_ONCE(sds->has_idle_cores, val);
> }
> @@ -7799,7 +7799,7 @@ static inline bool test_idle_cores(int cpu)
> {
> struct sched_domain_shared *sds;
>
> - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
> + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
> if (sds)
> return READ_ONCE(sds->has_idle_cores);
>
> @@ -7808,7 +7808,7 @@ static inline bool test_idle_cores(int cpu)
>
> /*
> * Scans the local SMT mask to see if the entire core is idle, and records this
> - * information in sd_llc_shared->has_idle_cores.
> + * information in sd_balance_shared->has_idle_cores.
> *
> * Since SMT siblings share all cache levels, inspecting this limited remote
> * state should be fairly cheap.
> @@ -7925,7 +7925,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
> int i, cpu, idle_cpu = -1, nr = INT_MAX;
>
> - if (sched_feat(SIS_UTIL)) {
> + if (sched_feat(SIS_UTIL) && sd->shared) {
If shared is attached to sd_asym_cpucapacity instead of sd_llc, we
should never reach this point. Or am I missing a case?
> /*
> * Increment because !--nr is the condition to stop scan.
> *
> @@ -12826,7 +12826,11 @@ static void set_cpu_sd_state_busy(int cpu)
> struct sched_domain *sd;
> sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
>
> - if (!sd || !sd->nohz_idle)
> + /*
> + * sd->nohz_idle only pairs with nr_busy_cpus on sd->shared; if this
> + * domain has no shared object there is nothing to clear or account.
> + */
> + if (!sd || !sd->shared || !sd->nohz_idle)
> return;
> sd->nohz_idle = 0;
>
> @@ -12851,7 +12855,8 @@ static void set_cpu_sd_state_idle(int cpu)
> struct sched_domain *sd;
> sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
>
> - if (!sd || sd->nohz_idle)
> + /* See set_cpu_sd_state_busy(): nohz_idle is only used with sd->shared. */
> + if (!sd || !sd->shared || sd->nohz_idle)
> return;
> sd->nohz_idle = 1;
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 9f63b15d309d1..330f5893c4561 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2170,7 +2170,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DECLARE_PER_CPU(int, sd_llc_size);
> DECLARE_PER_CPU(int, sd_llc_id);
> DECLARE_PER_CPU(int, sd_share_id);
> -DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 5847b83d9d552..69d465cc93ab4 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -665,7 +665,7 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DEFINE_PER_CPU(int, sd_llc_size);
> DEFINE_PER_CPU(int, sd_llc_id);
> DEFINE_PER_CPU(int, sd_share_id);
> -DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> @@ -680,20 +680,38 @@ static void update_top_cache_domain(int cpu)
> int id = cpu;
> int size = 1;
>
> + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
> + /*
> + * The shared object is attached to sd_asym_cpucapacity only when the
> + * asym domain is non-overlapping (i.e., not built from SD_NUMA).
> + * On overlapping (NUMA) asym domains we fall back to letting the
> + * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
> + * here.
> + */
> + if (sd && sd->shared)
> + sds = sd->shared;
> +
> + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
> +
> sd = highest_flag_domain(cpu, SD_SHARE_LLC);
> if (sd) {
> id = cpumask_first(sched_domain_span(sd));
> size = cpumask_weight(sched_domain_span(sd));
>
> - /* If sd_llc exists, sd_llc_shared should exist too. */
> - WARN_ON_ONCE(!sd->shared);
> - sds = sd->shared;
> + /*
> + * If sd_asym_cpucapacity didn't claim the shared object,
> + * sd_llc must have one linked.
> + */
> + if (!sds) {
> + WARN_ON_ONCE(!sd->shared);
> + sds = sd->shared;
> + }
> }
>
> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> per_cpu(sd_llc_size, cpu) = size;
> per_cpu(sd_llc_id, cpu) = id;
> - rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
> + rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
>
> sd = lowest_flag_domain(cpu, SD_CLUSTER);
> if (sd)
> @@ -711,9 +729,6 @@ static void update_top_cache_domain(int cpu)
>
> sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
> rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
> -
> - sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
> - rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
> }
>
> /*
> @@ -2650,6 +2665,49 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
> }
> }
>
> +static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
> +{
> + int sd_id = cpumask_first(sched_domain_span(sd));
> +
> + sd->shared = *per_cpu_ptr(d->sds, sd_id);
> + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
> + atomic_inc(&sd->shared->ref);
> +}
> +
> +/*
> + * For asymmetric CPU capacity, attach sched_domain_shared on the innermost
> + * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is
> + * not an overlapping NUMA-built domain (then LLC should claim shared).
> + *
> + * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island),
> + * then LLC must claim shared instead.
> + *
> + * Note: SD_ASYM_CPUCAPACITY_FULL is only set when multiple distinct capacities
> + * exist in the domain span, so the asym domain we attach to cannot degenerate
> + * into a single-capacity group. The relevant edge cases are instead covered by
> + * the caveats above.
> + *
> + * Return true if this CPU's asym path claimed sd->shared, false otherwise.
> + */
> +static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
> +{
> + struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu);
> + struct sched_domain *sd_asym;
> +
> + if (!sd)
> + return false;
> +
> + sd_asym = sd;
> + while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
> + sd_asym = sd_asym->parent;
> +
> + if (!sd_asym || (sd_asym->flags & SD_NUMA))
> + return false;
> +
> + init_sched_domain_shared(d, sd_asym);
> + return true;
> +}
> +
> /*
> * Build sched domains for a given set of CPUs and attach the sched domains
> * to the individual CPUs
> @@ -2708,20 +2766,26 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> }
>
> for_each_cpu(i, cpu_map) {
> + bool asym_claimed = false;
> +
> sd = *per_cpu_ptr(d.sd, i);
> if (!sd)
> continue;
>
> + if (has_asym)
> + asym_claimed = claim_asym_sched_domain_shared(&d, i);
> +
> /* First, find the topmost SD_SHARE_LLC domain */
> while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
> sd = sd->parent;
>
> if (sd->flags & SD_SHARE_LLC) {
> - int sd_id = cpumask_first(sched_domain_span(sd));
> -
> - sd->shared = *per_cpu_ptr(d.sds, sd_id);
> - atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
> - atomic_inc(&sd->shared->ref);
> + /*
> + * Initialize the sd->shared for SD_SHARE_LLC unless
> + * the asym path above already claimed it.
> + */
> + if (!asym_claimed)
> + init_sched_domain_shared(&d, sd);
>
> /*
> * In presence of higher domains, adjust the
> --
> 2.54.0
>