Re: [RFC][PATCH] sched: Cache aware load-balancing

From: Peter Zijlstra
Date: Wed Mar 26 2025 - 06:42:44 EST


On Wed, Mar 26, 2025 at 11:25:53AM +0100, Peter Zijlstra wrote:
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1286,8 +1286,8 @@ static void task_cache_work(struct callb
>  	struct task_struct *p = current;
>  	struct mm_struct *mm = p->mm;
>  	unsigned long m_a_occ = 0;
> -	int cpu, m_a_cpu = -1;
> -	cpumask_var_t cpus;
> +	int m_a_cpu = -1;
> +	int cpu;
>
>  	WARN_ON_ONCE(work != &p->cache_work);
>
> @@ -1296,46 +1296,46 @@ static void task_cache_work(struct callb
>  	if (p->flags & PF_EXITING)
>  		return;
>
> -	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
> -		return;
> -
>  	scoped_guard (cpus_read_lock) {
> -		cpumask_copy(cpus, cpu_online_mask);
>
> -		for_each_cpu(cpu, cpus) {
> -			/* XXX sched_cluster_active */
> -			struct sched_domain *sd = per_cpu(sd_llc, cpu);
> -			unsigned long occ, m_occ = 0, a_occ = 0;
> -			int m_cpu = -1, nr = 0, i;
> +		for_each_online_cpu(cpu) {
> +			struct sched_domain *sd;
> +			struct sched_domain_shared *sds;
> +			unsigned long occ;
> +
> +			for_each_domain(cpu, sd) {
> +				if (!(sd->flags & SD_SHARE_LLC))
> +					break;
>
> -			for_each_cpu(i, sched_domain_span(sd)) {
> +				sds = sd->shared;
>  				occ = fraction_mm_sched(cpu_rq(i),
>  							per_cpu_ptr(mm->pcpu_sched, i));
> -				a_occ += occ;
> -				if (occ > m_occ) {
> -					m_occ = occ;
> -					m_cpu = i;
> -				}
> -				nr++;
> -				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
> -					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
> -			}
> -
> -			a_occ /= nr;
> -			if (a_occ > m_a_occ) {
> -				m_a_occ = a_occ;
> -				m_a_cpu = m_cpu;
> +				sds->sum_occ += occ + 1;
>  			}
> +		}
>
> -			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
> -				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
> +		for_each_online_cpu(cpu) {
> +			struct sched_domain *sd;
> +			struct sched_domain_shared *sds;
> +
> +			for_each_domain(cpu, sd) {
> +				if (!(sd->flags & SD_SHARE_LLC))
> +					break;
> +
> +				sds = sd->shared;
> +				if (sds->agg_occ) {
> +					sds->avg_occ = (sds->agg_occ - sd->span_weight) /
> +						       sd->span_weight;
> +					sds->sum_occ = 0;
> +				}

s/agg_occ/sum_occ/g, stupid last minute renames etc.. :-)
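
With that rename applied, the second pass works out to something like the
below (untested, just spelling out the intent; the first pass accumulates
occ + 1 for every CPU in the LLC, hence the span_weight subtraction before
averaging):

	for_each_online_cpu(cpu) {
		struct sched_domain *sd;
		struct sched_domain_shared *sds;

		for_each_domain(cpu, sd) {
			if (!(sd->flags & SD_SHARE_LLC))
				break;

			sds = sd->shared;
			if (sds->sum_occ) {
				/*
				 * sum_occ is sum(occ + 1) over the span, so
				 * strip span_weight back off before dividing.
				 */
				sds->avg_occ = (sds->sum_occ - sd->span_weight) /
					       sd->span_weight;
				sds->sum_occ = 0;
			}

			if (sd == per_cpu(sd_llc, cpu)) {
				if (sds->avg_occ > m_a_occ) {
					m_a_occ = sds->avg_occ;
					m_a_cpu = cpu;
				}
			}
		}
	}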

>
> -			for_each_cpu(i, sched_domain_span(sd)) {
> -				/* XXX threshold ? */
> -				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
> +			if (sd == per_cpu(sd_llc, cpu)) {
> +				if (sds->avg_occ > m_a_occ) {
> +					m_a_occ = sds->avg_occ;
> +					m_a_cpu = cpu;
> +				}
> +			}
>  			}
> -
> -			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
>  		}
>  	}
>
> @@ -1346,8 +1346,6 @@ static void task_cache_work(struct callb
>  		m_a_cpu = -1;
>
>  	mm->mm_sched_cpu = m_a_cpu;
> -
> -	free_cpumask_var(cpus);
>  }
>
>  void init_sched_mm(struct task_struct *p)
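
(Aside: the hunk above presumes struct sched_domain_shared grows the two
occupancy fields, roughly:

	struct sched_domain_shared {
		...
		unsigned long	sum_occ;	/* sum of occ + 1 over the LLC span */
		unsigned long	avg_occ;	/* last computed average occupancy */
	};

field names per the hunk; the unsigned long type is assumed here.)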