[PATCH v4 4/9] sched/topology: Switch to assigning "sd->shared" from s_data

From: K Prateek Nayak

Date: Thu Mar 12 2026 - 00:47:42 EST


Use the "sched_domain_shared" object allocated in s_data for
"sd->shared" assignments. Assign "sd->shared" only for the topmost
SD_SHARE_LLC domain before degeneration, and rely on the degeneration
path to correctly pass the shared object down to what ends up as
"sd_llc".

sd_parent_degenerate() ensures that a degenerating parent has the same
sched_domain_span() as its child, which guarantees a 1:1 hand-down of
the shared object. If the topmost SD_SHARE_LLC domain degenerates, the
shared object is freed from destroy_sched_domain() once the last
reference is dropped.
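
For context, the refcounted object being handed around is, as of
current mainline (modulo changes made earlier in this series),
roughly:

	struct sched_domain_shared {
		atomic_t	ref;
		atomic_t	nr_busy_cpus;
		int		has_idle_cores;
		int		nr_idle_scan;
	};

destroy_sched_domain() drops "ref" and kfree()s the object once the
count reaches zero, so handing the pointer down does not need any
additional get/put.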

claim_allocations() NULLs out the per-CPU pointers to the objects that
have been assigned as "sd->shared"; the unassigned ones are then freed
along the __sds_free() path.

To keep all the claim_allocations() bits in one place, the function has
been extended to accept "s_data" and iterate the domain hierarchy
internally, claiming both the "sched_domain_shared" object and the
per-topology-level data for the given CPU.
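
The free side relies on kfree(NULL) being a no-op: a per-CPU slot that
was claimed (NULLed) above results in a harmless kfree(NULL). In
current mainline, the per-topology-level cleanup in __sdt_free() does,
per CPU j, roughly:

	if (sdd->sds)
		kfree(*per_cpu_ptr(sdd->sds, j));
	if (sdd->sg)
		kfree(*per_cpu_ptr(sdd->sg, j));
	if (sdd->sgc)
		kfree(*per_cpu_ptr(sdd->sgc, j));

and the "__sds_free()" path for the s_data copy is assumed to follow
the same pattern.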

Post cpu_attach_domain(), all freeing of "sd->shared" is handled via
call_rcu() on the sched_domain object, i.e. from
destroy_sched_domains_rcu().
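
For reference, the teardown path that ultimately drops the last
"sd->shared" reference is unchanged by this patch; in current mainline
it reads roughly:

	static void destroy_sched_domain(struct sched_domain *sd)
	{
		free_sched_groups(sd->groups, 1);

		if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
			kfree(sd->shared);
		kfree(sd);
	}

	static void destroy_sched_domains(struct sched_domain *sd)
	{
		if (sd)
			call_rcu(&sd->rcu, destroy_sched_domains_rcu);
	}

with destroy_sched_domains_rcu() walking sd->parent and invoking
destroy_sched_domain() on each level once the grace period has elapsed.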

Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
---
Changelog v3..v4:

o Moved claiming the per-CPU "d.sds" reference into claim_allocations()
to keep everything in one place. (Shrikanth)

o Slightly different diff as a result of moving the "imb_numa_nr"
calculation into a separate helper in Patch 2.
---
kernel/sched/topology.c | 73 +++++++++++++++++++++++++----------------
1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f0541c6511fa..ebd955faab40 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -685,6 +685,9 @@ static void update_top_cache_domain(int cpu)
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
+
+		/* If sd_llc exists, sd_llc_shared should exist too. */
+		WARN_ON_ONCE(!sd->shared);
 		sds = sd->shared;
 	}
 
@@ -733,6 +736,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 		if (sd_parent_degenerate(tmp, parent)) {
 			tmp->parent = parent->parent;
 
+			/* Pick reference to parent->shared. */
+			if (parent->shared) {
+				WARN_ON_ONCE(tmp->shared);
+				tmp->shared = parent->shared;
+				parent->shared = NULL;
+			}
+
 			if (parent->parent) {
 				parent->parent->child = tmp;
 				parent->parent->groups->flags = tmp->flags;
@@ -1586,21 +1596,28 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
  * sched_group structure so that the subsequent __free_domain_allocs()
  * will not free the data we're using.
  */
-static void claim_allocations(int cpu, struct sched_domain *sd)
+static void claim_allocations(int cpu, struct s_data *d)
 {
-	struct sd_data *sdd = sd->private;
+	struct sched_domain *sd;
+
+	if (atomic_read(&(*per_cpu_ptr(d->sds, cpu))->ref))
+		*per_cpu_ptr(d->sds, cpu) = NULL;
 
-	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
-	*per_cpu_ptr(sdd->sd, cpu) = NULL;
+	for (sd = *per_cpu_ptr(d->sd, cpu); sd; sd = sd->parent) {
+		struct sd_data *sdd = sd->private;
 
-	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
-		*per_cpu_ptr(sdd->sds, cpu) = NULL;
+		WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+		*per_cpu_ptr(sdd->sd, cpu) = NULL;
 
-	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
-		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+		if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
+			*per_cpu_ptr(sdd->sds, cpu) = NULL;
 
-	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
-		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
+		if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+			*per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+		if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+			*per_cpu_ptr(sdd->sgc, cpu) = NULL;
+	}
 }
 
 #ifdef CONFIG_NUMA
@@ -1740,16 +1757,6 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->cache_nice_tries = 1;
 	}
 
-	/*
-	 * For all levels sharing cache; connect a sched_domain_shared
-	 * instance.
-	 */
-	if (sd->flags & SD_SHARE_LLC) {
-		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-		atomic_inc(&sd->shared->ref);
-		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-	}
-
 	sd->private = sdd;
 
 	return sd;
@@ -2731,12 +2738,20 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
 			sd = sd->parent;
 
-		/*
-		 * In presence of higher domains, adjust the
-		 * NUMA imbalance stats for the hierarchy.
-		 */
-		if (IS_ENABLED(CONFIG_NUMA) && (sd->flags & SD_SHARE_LLC) && sd->parent)
-			adjust_numa_imbalance(sd);
+		if (sd->flags & SD_SHARE_LLC) {
+			int sd_id = cpumask_first(sched_domain_span(sd));
+
+			sd->shared = *per_cpu_ptr(d.sds, sd_id);
+			atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+			atomic_inc(&sd->shared->ref);
+
+			/*
+			 * In presence of higher domains, adjust the
+			 * NUMA imbalance stats for the hierarchy.
+			 */
+			if (IS_ENABLED(CONFIG_NUMA) && sd->parent)
+				adjust_numa_imbalance(sd);
+		}
 	}
 
 	/* Calculate CPU capacity for physical packages and nodes */
@@ -2744,10 +2759,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		if (!cpumask_test_cpu(i, cpu_map))
 			continue;
 
-		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
-			claim_allocations(i, sd);
+		claim_allocations(i, &d);
+
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent)
 			init_sched_groups_capacity(i, sd);
-		}
 	}
 
 	/* Attach the domains */
--
2.34.1