Re: [PATCH v2 2/5] sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity

From: K Prateek Nayak

Date: Tue May 19 2026 - 01:53:39 EST


Hello Peter, Andrea,

On 5/19/2026 2:28 AM, Peter Zijlstra wrote:
> @@@ -2775,20 -3049,16 +3107,15 @@@ build_sched_domains(const struct cpumas
> if (!sd)
> continue;
>
> + if (has_asym)
> - asym_claimed = claim_asym_sched_domain_shared(&d, i);
> ++ claim_asym_sched_domain_shared(&d, i);
> +
> /* First, find the topmost SD_SHARE_LLC domain */
> while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
> sd = sd->parent;
>
> if (sd->flags & SD_SHARE_LLC) {
> - /*
> - * Initialize the sd->shared for SD_SHARE_LLC unless
> - * the asym path above already claimed it.
> - */
> - if (!asym_claimed)
> - init_sched_domain_shared(&d, sd);
> - int sd_id = cpumask_first(sched_domain_span(sd));
> -
> - sd->shared = *per_cpu_ptr(d.sds, sd_id);
> - atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
> - atomic_inc(&sd->shared->ref);
> ++ init_sched_domain_shared(&d, sd);

This will run into a small problem with "nr_idle_scan" if
cpumask_first(sched_domain_span(sd)) is the same for both sd_asym and
sd_llc.

Load balancer at different domains will populate "nr_idle_scan" with
different values and they alias to same ->shared if one isn't
degenerated and I believe there is at least one way to hit the WARN_ON()
from cpu_attach_domain() if the SD_ASYM_CPUCAPACITY_FULL comes before
the last SD_SHARE_LLC domain and the latter is degenerated.

How about this:

(On top of queue:sched/core; Lightly tested on !ASYM_CPUCAPACITY system)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fe09d3268bc9..1d2c98dca211 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -67,7 +67,15 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
- int nr_idle_scan;
+ union {
+ int nr_idle_scan;
+ /*
+ * Used during allocation to claim the
+ * sched_domain_shared object at
+ * multiple levels.
+ */
+ int alloc_flags;
+ };
#ifdef CONFIG_SCHED_CACHE
unsigned long util_avg;
unsigned long capacity;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index dbfd9657f897..9ebd14652e9d 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -623,6 +623,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
} while (sg != first);
}

+static void free_sched_domain_shared(struct sched_domain_shared *sds)
+{
+ if (sds && atomic_dec_and_test(&sds->ref))
+ kfree(sds);
+}
+
static void destroy_sched_domain(struct sched_domain *sd)
{
/*
@@ -631,9 +637,7 @@ static void destroy_sched_domain(struct sched_domain *sd)
* dropping group/capacity references, freeing where none remain.
*/
free_sched_groups(sd->groups, 1);
-
- if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
- kfree(sd->shared);
+ free_sched_domain_shared(sd->shared);

#ifdef CONFIG_SCHED_CACHE
/* only the bottom sd has llc_counts array */
@@ -755,7 +759,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)

/* Pick reference to parent->shared. */
if (parent->shared) {
- WARN_ON_ONCE(tmp->shared);
+ /*
+ * It is safe to free a sd->shared that
+ * has not been published yet. If a
+ * sd->shared was published, the refcount
+ * will end up being non-zero and it will
+ * not be freed here.
+ */
+ free_sched_domain_shared(tmp->shared);
tmp->shared = parent->shared;
parent->shared = NULL;
}
@@ -2916,11 +2927,40 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
}
}

-static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
+static void
+init_sched_domain_shared(struct s_data *d, struct sched_domain *sd, int flags)
{
- int sd_id = cpumask_first(sched_domain_span(sd));
+ int cpu;
+
+ /*
+ * Multiple domains can try to claim a shared object like
+ * SD_ASYM_CPUCAPACITY and SD_SHARE_LLC which can alias to
+ * same cpumask_first(sched_domain_span(sd)) CPU and can
+ * cause "nr_idle_scan" to be populated incorrectly during
+ * load balncing.
+ *
+ * Find the first CPU in sched_domain_span(sd) with an
+ * unclaimed domain (!alloc_flags) or where the alloc_flag
+ * matches the requested flag (SD_* flag)
+ */
+ for_each_cpu(cpu, sched_domain_span(sd)) {
+ struct sched_domain_shared *sds = *per_cpu_ptr(d->sds, cpu);
+
+ /*
+ * If the domain only has single CPU, allow temporary overlap
+ * in allocation since the domains will be degenerated anyways.
+ */
+ if (!sds->alloc_flags ||
+ sd->span_weight == 1 ||
+ sds->alloc_flags == flags) {
+ sds->alloc_flags = flags;
+ sd->shared = sds;
+ break;
+ }
+ }
+
+ BUG_ON(!sd->shared);

- sd->shared = *per_cpu_ptr(d->sds, sd_id);
/*
* nr_busy_cpus is consumed only by the NOHZ kick path via
* sd_balance_shared; on the asym-capacity path it is initialized but
@@ -2960,7 +3000,7 @@ static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
if (!sd_asym || (sd_asym->flags & SD_NUMA))
return false;

- init_sched_domain_shared(d, sd_asym);
+ init_sched_domain_shared(d, sd_asym, SD_ASYM_CPUCAPACITY);
return true;
}

@@ -3115,7 +3155,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
sd = sd->parent;

if (sd->flags & SD_SHARE_LLC) {
- init_sched_domain_shared(&d, sd);
+ init_sched_domain_shared(&d, sd, SD_SHARE_LLC);

/*
* In presence of higher domains, adjust the
--
Thanks and Regards,
Prateek