Re: [PATCH v2 2/5] sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity

From: Andrea Righi

Date: Mon May 18 2026 - 17:33:16 EST

Hi Peter,

On Mon, May 18, 2026 at 10:58:59PM +0200, Peter Zijlstra wrote:
> On Sat, May 16, 2026 at 07:58:50AM +0200, Andrea Righi wrote:
...
> Right, so I just merged a branch that has this series with a branch that
> has the cache aware load balancing stuff on, and the result ain't
> pretty.
>
> That cache aware thing really wants sd_llc_shared. Now, I imagine that
> for now the intersection between ASYM and SCHED_CACHE is not that
> interesting, but at the same time, I'm fairly sure that is something
> people will end up looking at.
>
> For now, I've stomped on things and the merge holds the below. It
> builds, not tested much beyond that.
>
> I've pushed out the whole pile into queue/sched/core.

Conceptually this makes sense to me. IIUC, the cache-aware code necessarily needs
per-LLC util_avg/capacity, while the asym path needs has_idle_cores at the asym
span, so you basically restored sd_llc_shared alongside sd_balance_shared.

I'll re-run my tests with your sched/core branch and report back.

Thanks!
-Andrea

>
> diff --cc kernel/sched/topology.c
> index f96d50131495,e47a3f72eb72..000000000000
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@@ -663,9 -670,9 +670,10 @@@ static void destroy_sched_domains(struc
> */
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DEFINE_PER_CPU(int, sd_llc_size);
> - DEFINE_PER_CPU(int, sd_llc_id);
> + DEFINE_PER_CPU(int, sd_llc_id) = -1;
> DEFINE_PER_CPU(int, sd_share_id);
> + DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> @@@ -729,6 -717,9 +718,20 @@@ static void update_top_cache_domain(in
>
> sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
> rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
> +
> + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
> ++ /*
> ++ * The shared object is attached to sd_asym_cpucapacity only when the
> ++ * asym domain is non-overlapping (i.e., not built from SD_NUMA).
> ++ * On overlapping (NUMA) asym domains we fall back to letting the
> ++ * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
> ++ * here.
> ++ */
> ++ if (sd && sd->shared)
> ++ sds = sd->shared;
> ++
> + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
> ++ rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
> }
>
> /*
> @@@ -2663,54 -2906,61 +2916,109 @@@ static void adjust_numa_imbalance(struc
> }
> }
>
> +static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
> +{
> + int sd_id = cpumask_first(sched_domain_span(sd));
> +
> + sd->shared = *per_cpu_ptr(d->sds, sd_id);
> + /*
> + * nr_busy_cpus is consumed only by the NOHZ kick path via
> + * sd_balance_shared; on the asym-capacity path it is initialized but
> + * never read.
> + */
> + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
> + atomic_inc(&sd->shared->ref);
> +}
> +
> +/*
> + * For asymmetric CPU capacity, attach sched_domain_shared on the innermost
> + * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is
> + * not an overlapping NUMA-built domain (then LLC should claim shared).
> + *
> + * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island),
> + * then LLC must claim shared instead.
> + *
> + * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values
> + * are present in the domain span, so the asym domain we attach to cannot
> + * degenerate into a single-capacity group. The relevant edge cases are instead
> + * covered by the caveats above.
> + *
> + * Return true if this CPU's asym path claimed sd->shared, false otherwise.
> + */
> +static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
> +{
> + struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu);
> + struct sched_domain *sd_asym;
> +
> + if (!sd)
> + return false;
> +
> + sd_asym = sd;
> + while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
> + sd_asym = sd_asym->parent;
> +
> + if (!sd_asym || (sd_asym->flags & SD_NUMA))
> + return false;
> +
> + init_sched_domain_shared(d, sd_asym);
> + return true;
> +}
> +
> + static int __sched_domains_alloc_llc_id(void)
> + {
> + int lid, max;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> + /*
> + * llc_id space should never grow larger than the
> + * possible number of CPUs in the system.
> + */
> + if (lid >= nr_cpu_ids)
> + return -1;
> +
> + __cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
> + max = cpumask_last(sched_domains_llc_id_allocmask);
> + if (max > max_lid)
> + max_lid = max;
> +
> + return lid;
> + }
> +
> + static void __sched_domains_free_llc_id(int cpu)
> + {
> + int i, lid, max;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = per_cpu(sd_llc_id, cpu);
> + if (lid == -1 || lid >= nr_cpu_ids)
> + return;
> +
> + per_cpu(sd_llc_id, cpu) = -1;
> +
> + for_each_cpu(i, llc_mask(cpu)) {
> + /* An online CPU owns the llc_id. */
> + if (per_cpu(sd_llc_id, i) == lid)
> + return;
> + }
> +
> + __cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
> +
> + max = cpumask_last(sched_domains_llc_id_allocmask);
> + /* shrink max lid to save memory */
> + if (max < max_lid)
> + max_lid = max;
> + }
> +
> + void sched_domains_free_llc_id(int cpu)
> + {
> + sched_domains_mutex_lock();
> + __sched_domains_free_llc_id(cpu);
> + sched_domains_mutex_unlock();
> + }
> +
> /*
> * Build sched domains for a given set of CPUs and attach the sched domains
> * to the individual CPUs
> @@@ -2775,20 -3049,16 +3107,15 @@@ build_sched_domains(const struct cpumas
> if (!sd)
> continue;
>
> + if (has_asym)
> - asym_claimed = claim_asym_sched_domain_shared(&d, i);
> ++ claim_asym_sched_domain_shared(&d, i);
> +
> /* First, find the topmost SD_SHARE_LLC domain */
> while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
> sd = sd->parent;
>
> if (sd->flags & SD_SHARE_LLC) {
> - /*
> - * Initialize the sd->shared for SD_SHARE_LLC unless
> - * the asym path above already claimed it.
> - */
> - if (!asym_claimed)
> - init_sched_domain_shared(&d, sd);
> - int sd_id = cpumask_first(sched_domain_span(sd));
> -
> - sd->shared = *per_cpu_ptr(d.sds, sd_id);
> - atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
> - atomic_inc(&sd->shared->ref);
> ++ init_sched_domain_shared(&d, sd);
>
> /*
> * In presence of higher domains, adjust the
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 9dd4a94801c9..300320b0248a 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2191,6 +2191,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DECLARE_PER_CPU(int, sd_llc_size);
> DECLARE_PER_CPU(int, sd_llc_id);
> DECLARE_PER_CPU(int, sd_share_id);
> +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);