Re: [PATCH v3 04/21] sched/cache: Make LLC id continuous

From: Peter Zijlstra

Date: Thu Feb 19 2026 - 10:42:47 EST


On Mon, Feb 16, 2026 at 01:14:20PM +0530, K Prateek Nayak wrote:
> > @@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > if (cpumask_equal(cpu_map, sched_domain_span(sd)))
> > break;
> > }
> > +
> > + lid = per_cpu(sd_llc_id, i);
> > + if (lid == -1) {
> > + int j;
> > +
> > + /*
> > + * Assign the llc_id to the CPUs that do not
> > + * have an LLC.
> > + */
> > + if (!tl_llc) {
> > + per_cpu(sd_llc_id, i) = tl_max_llcs++;
> > +
> > + continue;
> > + }
> > +
> > + /* try to reuse the llc_id of its siblings */
> > + for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
>
>
> My only large concern that remains is the fact that offline CPUs are
> taken out of the tl->mask(), which can lead to interesting cases where
> CPUs on the same LLC can have different llc_id:
>
> o Boot with maxcpus=1
>
> o Run:
>
> for i in $(seq 1 "$NRCPUS"); do
> echo 1 > /sys/devices/system/cpu/cpu$i/online;
> echo 0 > /sys/devices/system/cpu/cpu$i/online;
> done

Lol, cute ;-)


> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c6efa71cf500..aee1be89ab4c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8268,6 +8268,8 @@ static void cpuset_cpu_active(void)
> static void cpuset_cpu_inactive(unsigned int cpu)
> {
> if (!cpuhp_tasks_frozen) {
> + /* XXX: Is this the right spot? */
> + sched_domains_free_llc_id(cpu);
> cpuset_update_active_cpus();
> } else {
> num_cpus_frozen++;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index de5b701c3950..31a8910297c7 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3903,6 +3903,7 @@ static inline bool sched_cache_enabled(void)
> }
> #endif
> extern void init_sched_mm(struct task_struct *p);
> +void sched_domains_free_llc_id(int cpu);
>
> extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
> extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index ca46b5cf7f78..04c1ab489ee2 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -18,6 +18,7 @@ void sched_domains_mutex_unlock(void)
> }
>
> /* Protected by sched_domains_mutex: */
> +static cpumask_var_t sched_domains_llc_id_allocmask;
> static cpumask_var_t sched_domains_tmpmask;
> static cpumask_var_t sched_domains_tmpmask2;
> static int tl_max_llcs;
> @@ -2543,6 +2544,53 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
> return true;
> }
>
> +static int __sched_domains_alloc_llc_id(void)
> +{
> + int lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> + if (lid >= tl_max_llcs)
> + tl_max_llcs++;

Urgh,. should we not rather track the max lid?

Also, we allocate max_llc sized data structures, if this thing is
'variable' we must also always store a copy of the 'lid' size of the
time of allocation.

> +
> + /*
> + * llc_id space should never grow larger than the
> + * possible number of CPUs in the system.
> + */
> + if (!WARN_ON_ONCE(lid >= nr_cpumask_bits))
> + cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);

__cpumask_set_cpu()

Since you're serializing everything with that sched_domains_mutex, this
need not be an atomic op.

> + return lid;
> +}
> +
> +static void __sched_domains_free_llc_id(int cpu)
> +{
> + int i, lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = per_cpu(sd_llc_id, cpu);
> + if (lid == -1)
> + return;
> +
> + per_cpu(sd_llc_id, cpu) = -1;
> +
> + for_each_online_cpu(i) {
> + /* An online CPU owns the llc_id. */
> + if (per_cpu(sd_llc_id, i) == lid)
> + return;
> + }
> +
> + cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);

__cpumask_clear_cpu()

> +}

So this deals with Madadi's issue I suppose.

> +void sched_domains_free_llc_id(int cpu)
> +{
> + sched_domains_mutex_lock();
> + __sched_domains_free_llc_id(cpu);
> + sched_domains_mutex_unlock();
> +}
> +
> /*
> * Build sched domains for a given set of CPUs and attach the sched domains
> * to the individual CPUs
> @@ -2599,7 +2647,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> * have an LLC.
> */
> if (!tl_llc) {
> - per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
>
> continue;
> }
> @@ -2620,7 +2668,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>
> /* a new LLC is detected */
> if (lid == -1)
> - per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
> }
> }
>
> @@ -2798,6 +2846,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
> {
> int err;
>
> + zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
> zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
> zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
> zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
> ---
>
> It doesn't compact tl_max_llcs, but it should promote reuse of llc_id if
> all CPUs of a LLC go offline. I know it is a ridiculous scenario but it
> is possible nonetheless.
>
> I'll let Peter and Valentin be the judge of additional space and
> complexity needed for these bits :-)

It appears straight forward enough I suppose.