Re: [PATCH 1/5] sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity
From: K Prateek Nayak
Date: Fri Apr 24 2026 - 01:14:37 EST
Hello Andrea,
On 4/23/2026 1:06 PM, Andrea Righi wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 69361c63353ad..934eb663f445e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7925,7 +7925,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
> int i, cpu, idle_cpu = -1, nr = INT_MAX;
>
> - if (sched_feat(SIS_UTIL)) {
> + if (sched_feat(SIS_UTIL) && sd->shared) {
> /*
> * Increment because !--nr is the condition to stop scan.
> *
> @@ -12840,7 +12840,8 @@ static void set_cpu_sd_state_busy(int cpu)
> goto unlock;
> sd->nohz_idle = 0;
I just realised this flag only matters for accounting to "nr_busy_cpus"
and we can bail out earlier if we don't have an sd->shared altogether.
You can probably adapt this to use guard(rcu)() while you are at it
and send these bits as a separate cleanup first saying that the
assumption of sd_llc->shared always existing will change with the
coming patches and you are introducing guard rails for the same.
>
> - atomic_inc(&sd->shared->nr_busy_cpus);
> + if (sd->shared)
> + atomic_inc(&sd->shared->nr_busy_cpus);
> unlock:
> rcu_read_unlock();
> }
> @@ -12869,7 +12870,8 @@ static void set_cpu_sd_state_idle(int cpu)
> goto unlock;
> sd->nohz_idle = 1;
>
> - atomic_dec(&sd->shared->nr_busy_cpus);
> + if (sd->shared)
> + atomic_dec(&sd->shared->nr_busy_cpus);
> unlock:
> rcu_read_unlock();
> }
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 5847b83d9d552..dc50193b198c6 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -680,19 +680,39 @@ static void update_top_cache_domain(int cpu)
> int id = cpu;
> int size = 1;
>
> + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
> + /*
> + * The shared object is attached to sd_asym_cpucapacity only when the
> + * asym domain is non-overlapping (i.e., not built from SD_NUMA).
> + * On overlapping (NUMA) asym domains we fall back to letting the
> + * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
> + * here.
> + */
> + if (sd && sd->shared)
> + sds = sd->shared;
> +
> + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
> +
> sd = highest_flag_domain(cpu, SD_SHARE_LLC);
> if (sd) {
> id = cpumask_first(sched_domain_span(sd));
> size = cpumask_weight(sched_domain_span(sd));
>
> - /* If sd_llc exists, sd_llc_shared should exist too. */
> - WARN_ON_ONCE(!sd->shared);
> - sds = sd->shared;
> + /*
> + * If sd_asym_cpucapacity didn't claim the shared object,
> + * sd_llc must have one linked.
> + */
> + if (!sds) {
> + WARN_ON_ONCE(!sd->shared);
> + sds = sd->shared;
> + }
> }
>
> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> per_cpu(sd_llc_size, cpu) = size;
> per_cpu(sd_llc_id, cpu) = id;
> +
> + /* TODO: Rename sd_llc_shared to fit the new role. */
> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
Would love for folks to chime in but IMO "sd_wakeup_shared" sounds
pretty reasonable since it is mainly the wakeup path that depends on
this except for one !ASYM load balancing trigger.
>
> sd = lowest_flag_domain(cpu, SD_CLUSTER);
> @@ -711,9 +731,6 @@ static void update_top_cache_domain(int cpu)
>
> sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
> rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
> -
> - sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
> - rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
> }
>
> /*
> @@ -2650,6 +2667,15 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
> }
> }
>
> +static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
> +{
> + int sd_id = cpumask_first(sched_domain_span(sd));
> +
> + sd->shared = *per_cpu_ptr(d->sds, sd_id);
> + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
> + atomic_inc(&sd->shared->ref);
> +}
> +
> /*
> * Build sched domains for a given set of CPUs and attach the sched domains
> * to the individual CPUs
> @@ -2708,20 +2734,53 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> }
>
> for_each_cpu(i, cpu_map) {
> + struct sched_domain *sd_asym = NULL;
> + bool asym_claimed = false;
> +
> sd = *per_cpu_ptr(d.sd, i);
> if (!sd)
> continue;
>
> + /*
> + * In case of ASYM_CPUCAPACITY, attach sd->shared to
> + * sd_asym_cpucapacity for wakeup stat tracking.
> + *
> + * Caveats:
> + *
> + * 1) has_asym is system-wide, but a given CPU may still
> + * lack an SD_ASYM_CPUCAPACITY_FULL ancestor (e.g., an
> + * exclusive cpuset carving out a symmetric capacity island).
> + * Such CPUs must fall through to the LLC seeding path below.
> + *
> + * 2) Skip the asym attach if the asym ancestor is an
> + * overlapping domain (SD_NUMA). On those topologies let the
> + * LLC path own the shared object instead.
> + *
> + * XXX: This assumes SD_ASYM_CPUCAPACITY_FULL domain
> + * always has more than one group else it is prone to
> + * degeneration.
I looked into this and we only set SD_ASYM_CPUCAPACITY if we find more
than one capacity and SD_ASYM_CPUCAPACITY_FULL implies there are atleast
two CPUs covering differnt capcities in the span.
The very first SD_ASYM_CPUCAPACITY_FULL domain should be safe from
degeneration when it is non-overlapping.
> + */
> + sd_asym = sd;
> + while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
> + sd_asym = sd_asym->parent;
> +
> + if (sd_asym && !(sd_asym->flags & SD_NUMA)) {
> + init_sched_domain_shared(&d, sd_asym);
> + asym_claimed = true;
> + }
We should probably guard this behind a "has_asym" check. Maybe even
extract into a sperate helper if the nesting gets too deep. Thoughts?
> +
> /* First, find the topmost SD_SHARE_LLC domain */
> + sd = *per_cpu_ptr(d.sd, i);
nit.
I think this reassignment is no longer required since you use a separate
"sd_asym" variable now.
> while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
> sd = sd->parent;
>
> if (sd->flags & SD_SHARE_LLC) {
> - int sd_id = cpumask_first(sched_domain_span(sd));
> -
> - sd->shared = *per_cpu_ptr(d.sds, sd_id);
> - atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
> - atomic_inc(&sd->shared->ref);
> + /*
> + * Initialize the sd->shared for SD_SHARE_LLC unless
> + * the asym path above already claimed it.
> + */
> + if (!asym_claimed)
> + init_sched_domain_shared(&d, sd);
Tbh, if "has_asym" is true, we probabaly don't even need this since the
nr_busy_cpus accounting gets us nothing.
Might save a little overhead and space on those systems but I would
love to hear if there are any concerns if we just drop the
sd_llc->shared when we detect asym capacities.
>
> /*
> * In presence of higher domains, adjust the
--
Thanks and Regards,
Prateek