Re: [PATCH v2 04/23] sched/cache: Make LLC id continuous

From: K Prateek Nayak

Date: Tue Dec 23 2025 - 00:32:18 EST


Hello Tim, Chenyu,

On 12/4/2025 4:37 AM, Tim Chen wrote:
> +/*
> + * Assign continuous llc id for the CPU, and return
> + * the assigned llc id.
> + */
> +static int update_llc_id(struct sched_domain *sd,
> + int cpu)
> +{
> + int id = per_cpu(sd_llc_id, cpu), i;
> +
> + if (id >= 0)
> + return id;
> +
> + if (sd) {
> + /* Look for any assigned id and reuse it.*/
> + for_each_cpu(i, sched_domain_span(sd)) {
> + id = per_cpu(sd_llc_id, i);
> +
> + if (id >= 0) {
> + per_cpu(sd_llc_id, cpu) = id;
> + return id;
> + }
> + }
> + }

I don't really like tying this down to the sched_domain span, since
partitioning and other weirdness can cause the max_llc count to go
unnecessarily high. The tl->mask() (from sched_domain_topology_level)
should give the mask covering all online CPUs, without being affected
by cpusets.

How about something like:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b17d8e3cb55..c19b1c4e6472 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8270,6 +8270,18 @@ static void cpuset_cpu_active(void)
static void cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
+ /*
+ * This is necessary since offline CPUs are
+ * taken out of the tl->mask() and a newly
+ * onlined CPU in same LLC will not realize
+ * whether it should reuse the LLC ID owned
+ * by an offline CPU without knowing the
+ * LLC association.
+ *
+ * Safe to release the reference if this is
+ * the last CPU in the LLC going offline.
+ */
+ sched_domain_free_llc_id(cpu);
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..1378a1cfad18 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -631,6 +631,7 @@ void update_sched_domain_debugfs(void)
i++;
}

+ debugfs_create_u32("llc_id", 0444, d_cpu, (u32 *)per_cpu_ptr(&sd_llc_id, cpu));
__cpumask_clear_cpu(cpu, sd_sysctl_cpus);
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3ceaa9dc9a9e..69fad88b57d8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2142,6 +2142,7 @@ extern int group_balance_cpu(struct sched_group *sg);

extern void update_sched_domain_debugfs(void);
extern void dirty_sched_domain_sysctl(int cpu);
+void sched_domain_free_llc_id(int cpu);

extern int sched_update_scaling(void);

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..d6e134767f30 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -20,6 +20,46 @@ void sched_domains_mutex_unlock(void)
/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_llc_id_alloc_mask;
+DEFINE_PER_CPU(int, sd_llc_id) = -1;
+static int max_llcs = 0;
+
+static inline int sched_domain_alloc_llc_id(void)
+{
+ int llc_id;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ llc_id = cpumask_first_zero(sched_llc_id_alloc_mask);
+ BUG_ON((unsigned int)llc_id >= nr_cpumask_bits);
+ cpumask_set_cpu(llc_id, sched_llc_id_alloc_mask);
+ ++max_llcs;
+
+ return llc_id;
+}
+
+void sched_domain_free_llc_id(int cpu)
+{
+ int i, llc_id = per_cpu(sd_llc_id, cpu);
+ bool found = false;
+
+ lockdep_assert_cpus_held(); /* For cpu_active_mask. */
+ guard(mutex)(&sched_domains_mutex);
+
+ per_cpu(sd_llc_id, cpu) = -1;
+ for_each_cpu(i, cpu_active_mask) {
+ if (per_cpu(sd_llc_id, i) == llc_id) {
+ found = true;
+ break;
+ }
+ }
+
+ /* Allow future hotplugs to claim this ID */
+ if (!found) {
+ cpumask_clear_cpu(llc_id, sched_llc_id_alloc_mask);
+ --max_llcs;
+ }
+}

static int __init sched_debug_setup(char *str)
{
@@ -658,7 +698,6 @@ static void destroy_sched_domains(struct sched_domain *sd)
*/
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(int, sd_share_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
@@ -684,7 +723,6 @@ static void update_top_cache_domain(int cpu)

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -2567,10 +2605,35 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att

/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
- struct sched_domain_topology_level *tl;
+ struct sched_domain_topology_level *tl, *tl_llc = NULL;
+ bool done = false;

sd = NULL;
for_each_sd_topology(tl) {
+ int flags = 0;
+
+ if (tl->sd_flags)
+ flags = (*tl->sd_flags)();
+
+ if (flags & SD_SHARE_LLC) {
+ tl_llc = tl;
+
+ /*
+ * Entire cpu_map has been covered. We are
+ * traversing only to find the highest
+ * SD_SHARE_LLC level.
+ */
+ if (done)
+ continue;
+ }
+
+ /*
+ * Since SD_SHARE_LLC is SDF_SHARED_CHILD, we can
+ * safely break out if the entire cpu_map has been
+ * covered by a child domain.
+ */
+ if (done)
+ break;

sd = build_sched_domain(tl, cpu_map, attr, sd, i);

@@ -2579,7 +2642,41 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
+ done = true;
+ }
+
+ /* First time visiting this CPU. Assign the llc_id. */
+ if (per_cpu(sd_llc_id, i) == -1) {
+ int j, llc_id = -1;
+
+ /*
+ * In case there are no SD_SHARE_LLC domains,
+ * each CPU gets its own llc_id. Find the first
+ * free bit on the mask and use it.
+ */
+ if (!tl_llc) {
+ per_cpu(sd_llc_id, i) = sched_domain_alloc_llc_id();
+ continue;
+ }
+
+ /*
+ * Visit all the CPUs of the LLC irrespective of the
+ * partition constraints and find if any of them have
+ * a valid llc_id.
+ */
+ for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
+ llc_id = per_cpu(sd_llc_id, j);
+
+ /* Found a valid llc_id for CPU's LLC. */
+ if (llc_id != -1)
+ break;
+ }
+
+ /* Valid llc_id not found. Allocate a new one. */
+ if (llc_id == -1)
+ llc_id = sched_domain_alloc_llc_id();
+
+ per_cpu(sd_llc_id, i) = llc_id;
}
}

@@ -2759,6 +2856,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)

zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+ zalloc_cpumask_var(&sched_llc_id_alloc_mask, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);

arch_update_cpu_topology();
---

AFAICT, "sd_llc_id" isn't compared across different partitions, so having
CPUs that are associated with the same physical LLC but fall in different
partitions share the same "sd_llc_id" shouldn't be a problem.

Thoughts?

--
Thanks and Regards,
Prateek