Re: [PATCH v3 04/21] sched/cache: Make LLC id continuous

From: K Prateek Nayak

Date: Mon Feb 16 2026 - 02:44:50 EST

Hello Tim, Chenyu,

On 2/11/2026 3:48 AM, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@xxxxxxxxx>
>
> Introduce an index mapping between CPUs and their LLCs. This provides
> a continuous per LLC index needed for cache-aware load balancing in
> later patches.
>
> The existing per_cpu llc_id usually points to the first CPU of the
> LLC domain, which is sparse and unsuitable as an array index. Using
> llc_id directly would waste memory.
>
> With the new mapping, CPUs in the same LLC share a continuous id:
>
> per_cpu(llc_id, CPU=0...15) = 0
> per_cpu(llc_id, CPU=16...31) = 1
> per_cpu(llc_id, CPU=32...47) = 2
> ...
>
> Once a CPU has been assigned an llc_id, this ID persists even when
> the CPU is taken offline and brought back online, which can facilitate
> the management of the ID.
>
> Co-developed-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
> Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
> Co-developed-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
> Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
> Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx>
> ---
>
> Notes:
> v2->v3:
> Allocate the LLC id according to the topology level data directly, rather
> than calculating from the sched domain. This simplifies the code.
> (Peter Zijlstra, K Prateek Nayak)
>
> kernel/sched/topology.c | 47 ++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 44 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index cf643a5ddedd..ca46b5cf7f78 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -20,6 +20,7 @@ void sched_domains_mutex_unlock(void)
> /* Protected by sched_domains_mutex: */
> static cpumask_var_t sched_domains_tmpmask;
> static cpumask_var_t sched_domains_tmpmask2;
> +static int tl_max_llcs;
>
> static int __init sched_debug_setup(char *str)
> {
> @@ -658,7 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> */
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DEFINE_PER_CPU(int, sd_llc_size);
> -DEFINE_PER_CPU(int, sd_llc_id);
> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
> DEFINE_PER_CPU(int, sd_share_id);
> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
>
> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> per_cpu(sd_llc_size, cpu) = size;
> - per_cpu(sd_llc_id, cpu) = id;
> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>
> sd = lowest_flag_domain(cpu, SD_CLUSTER);
> @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>
> /* Set up domains for CPUs specified by the cpu_map: */
> for_each_cpu(i, cpu_map) {
> - struct sched_domain_topology_level *tl;
> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
> + int lid;
>
> sd = NULL;
> for_each_sd_topology(tl) {
> + int flags = 0;
> +
> + if (tl->sd_flags)
> + flags = (*tl->sd_flags)();
> +
> + if (flags & SD_SHARE_LLC)
> + tl_llc = tl;

nit. This loop breaks out when sched_domain_span(sd) covers the entire
cpu_map, so it might not have reached the topmost SD_SHARE_LLC domain
yet. Is that a cause for any concern?

>
> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>
> @@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> if (cpumask_equal(cpu_map, sched_domain_span(sd)))
> break;
> }
> +
> + lid = per_cpu(sd_llc_id, i);
> + if (lid == -1) {
> + int j;
> +
> + /*
> + * Assign the llc_id to the CPUs that do not
> + * have an LLC.
> + */
> + if (!tl_llc) {
> + per_cpu(sd_llc_id, i) = tl_max_llcs++;
> +
> + continue;
> + }
> +
> + /* try to reuse the llc_id of its siblings */
> + for_each_cpu(j, tl_llc->mask(tl_llc, i)) {

My only large concern that remains is the fact that offline CPUs are
taken out of the tl->mask(), which can lead to interesting cases where
CPUs on the same LLC can have different llc_id:

o Boot with maxcpus=1

o Run:

for i in $(seq 1 $NRCPUS); do
echo 1 > /sys/devices/system/cpu/cpu$i/online;
echo 0 > /sys/devices/system/cpu/cpu$i/online;
done

o Finally run:

echo 1 | tee /sys/devices/system/cpu/cpu*/online;

Once all CPUs are online, only the CPUs in the boot CPU's LLC will share
the same llc_id. Every other CPU will have a unique llc_id, which might
make the system behave unexpectedly.

I'm wondering if we can do something like below on top of this patch:

(Only build tested; Prepared on top of this patch in Tim's tree)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c6efa71cf500..aee1be89ab4c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8268,6 +8268,8 @@ static void cpuset_cpu_active(void)
static void cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
+ /* XXX: Is this the right spot? */
+ sched_domains_free_llc_id(cpu);
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de5b701c3950..31a8910297c7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3903,6 +3903,7 @@ static inline bool sched_cache_enabled(void)
}
#endif
extern void init_sched_mm(struct task_struct *p);
+void sched_domains_free_llc_id(int cpu);

extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ca46b5cf7f78..04c1ab489ee2 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -18,6 +18,7 @@ void sched_domains_mutex_unlock(void)
}

/* Protected by sched_domains_mutex: */
+static cpumask_var_t sched_domains_llc_id_allocmask;
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
static int tl_max_llcs;
@@ -2543,6 +2544,53 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
return true;
}

+static int __sched_domains_alloc_llc_id(void)
+{
+ int lid;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
+ if (lid >= tl_max_llcs)
+ tl_max_llcs++;
+
+ /*
+ * llc_id space should never grow larger than the
+ * possible number of CPUs in the system.
+ */
+ if (!unlikely(WARN_ON_ONCE(lid >= nr_cpumask_bits)))
+ cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
+ return lid;
+}
+
+static void __sched_domains_free_llc_id(int cpu)
+{
+ int i, lid;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = per_cpu(sd_llc_id, cpu);
+ if (lid == -1)
+ return;
+
+ per_cpu(sd_llc_id, cpu) = -1;
+
+ for_each_online_cpu(i) {
+ /* An online CPU owns the llc_id. */
+ if (per_cpu(sd_llc_id, i) == lid)
+ return;
+ }
+
+ cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
+}
+
+void sched_domains_free_llc_id(int cpu)
+{
+ sched_domains_mutex_lock();
+ __sched_domains_free_llc_id(cpu);
+ sched_domains_mutex_unlock();
+}
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
@@ -2599,7 +2647,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
* have an LLC.
*/
if (!tl_llc) {
- per_cpu(sd_llc_id, i) = tl_max_llcs++;
+ per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();

continue;
}
@@ -2620,7 +2668,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att

/* a new LLC is detected */
if (lid == -1)
- per_cpu(sd_llc_id, i) = tl_max_llcs++;
+ per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
}
}

@@ -2798,6 +2846,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
{
int err;

+ zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
---

It doesn't compact tl_max_llcs, but it should promote reuse of an llc_id
if all CPUs of an LLC go offline. I know it is a ridiculous scenario, but
it is possible nonetheless.

I'll let Peter and Valentin be the judge of additional space and
complexity needed for these bits :-)

> + if (i == j)
> + continue;
> +
> + lid = per_cpu(sd_llc_id, j);
> +
> + if (lid != -1) {
> + per_cpu(sd_llc_id, i) = lid;
> +
> + break;
> + }
> + }
> +
> + /* a new LLC is detected */
> + if (lid == -1)
> + per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + }
> }
>
> if (WARN_ON(!topology_span_sane(cpu_map)))

--
Thanks and Regards,
Prateek