[PATCH v4 1/2] sched/cache: Reduce the overhead of task_cache_work() by only scanning the visited CPUs

From: Luo Gengkun

Date: Thu Jun 18 2026 - 02:16:40 EST

The overhead of task_cache_work() is high, especially in multi-NUMA
systems. Currently, task_cache_work() tries to find the pref_llc by
scanning all CPUs in the system. However, most of these scans are
meaningless, such as those of CPUs that have never been visited or that
were last accessed a long time ago.

To address this problem, introduce visited_cpus to track the visited CPUs
and evict them once they have not been accessed for a duration exceeding
llc_epoch_affinity_timeout. With this patch, get_scan_cpumasks() is no
longer needed and is therefore removed.

Signed-off-by: Luo Gengkun <luogengkun2@xxxxxxxxxx>
---
include/linux/sched.h | 2 ++
kernel/sched/fair.c | 81 +++++++++++++++----------------------------
2 files changed, 29 insertions(+), 54 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b3204a15d512..9f3946c6bd5d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2385,6 +2385,7 @@ static __always_inline int task_mm_cid(struct task_struct *t)
struct sched_cache_time {
u64 runtime;
unsigned long epoch;
+ unsigned long epoch_timeout;
};

struct sched_cache_stat {
@@ -2395,6 +2396,7 @@ struct sched_cache_stat {
unsigned long next_scan;
unsigned long footprint;
int cpu;
+ struct cpumask visited_cpus;
} ____cacheline_aligned_in_smp;

#else
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d78467ec6ee1..61f71857e9b1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1585,6 +1585,7 @@ void mm_init_sched(struct mm_struct *mm,
pcpu_sched->runtime = 0;
/* a slightly stale cpu epoch is acceptible */
pcpu_sched->epoch = rq->cpu_epoch;
+ pcpu_sched->epoch_timeout = rq->cpu_epoch;
epoch = rq->cpu_epoch;
}

@@ -1594,6 +1595,7 @@ void mm_init_sched(struct mm_struct *mm,
mm->sc_stat.next_scan = jiffies;
mm->sc_stat.nr_running_avg = 0;
mm->sc_stat.footprint = 0;
+ cpumask_clear(&mm->sc_stat.visited_cpus);
/*
* The update to mm->sc_stat should not be reordered
* before initialization to mm's other fields, in case
@@ -1635,11 +1637,21 @@ static inline void __update_mm_sched(struct rq *rq,
}
}

-static unsigned long fraction_mm_sched(struct rq *rq,
- struct sched_cache_time *pcpu_sched)
+static unsigned long fraction_mm_sched(int cpu,
+ struct mm_struct *mm)
{
+ struct sched_cache_time *pcpu_sched =
+ per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu);
+ struct rq *rq = cpu_rq(cpu);
+
guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);

+ /* Skip the rq that has not been hit for a long time */
+ if ((rq->cpu_epoch - pcpu_sched->epoch_timeout) > llc_epoch_affinity_timeout) {
+ cpumask_clear_cpu(cpu, &mm->sc_stat.visited_cpus);
+ return 0;
+ }
+
__update_mm_sched(rq, pcpu_sched);

/*
@@ -1711,6 +1723,9 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
pcpu_sched->runtime += delta_exec;
rq->cpu_runtime += delta_exec;
epoch = rq->cpu_epoch;
+ pcpu_sched->epoch_timeout = epoch;
+ if (!cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus))
+ cpumask_set_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
}

/*
@@ -1761,51 +1776,6 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
}
}

-static void get_scan_cpumasks(cpumask_var_t cpus, struct task_struct *p)
-{
-#ifdef CONFIG_NUMA_BALANCING
- int cpu, curr_cpu, nid, pref_nid;
-
- if (!static_branch_likely(&sched_numa_balancing))
- goto out;
-
- cpu = READ_ONCE(p->mm->sc_stat.cpu);
- if (cpu != -1)
- nid = cpu_to_node(cpu);
- curr_cpu = task_cpu(p);
-
- /*
- * Scanning in the preferred NUMA node is ideal. However, the NUMA
- * preferred node is per-task rather than per-process. It is possible
- * for different threads of the process to have distinct preferred
- * nodes; consequently, the process-wide preferred LLC may bounce
- * between different nodes. As a workaround, maintain the scan
- * CPU mask to also cover the process's current preferred LLC and the
- * current running node to mitigate the bouncing risk.
- * TBD: numa_group should be considered during task aggregation.
- */
- pref_nid = p->numa_preferred_nid;
- /* honor the task's preferred node */
- if (pref_nid == NUMA_NO_NODE)
- goto out;
-
- cpumask_or(cpus, cpus, cpumask_of_node(pref_nid));
-
- /* honor the task's preferred LLC CPU */
- if (cpu != -1 && !cpumask_test_cpu(cpu, cpus) && nid != NUMA_NO_NODE)
- cpumask_or(cpus, cpus, cpumask_of_node(nid));
-
- /* make sure the task's current running node is included */
- if (!cpumask_test_cpu(curr_cpu, cpus))
- cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu)));
-
- return;
-
-out:
-#endif
- cpumask_copy(cpus, cpu_online_mask);
-}
-
static inline void update_avg_scale(u64 *avg, u64 sample)
{
int factor = per_cpu(sd_llc_size, raw_smp_processor_id());
@@ -1866,7 +1836,7 @@ static void task_cache_work(struct callback_head *work)
scoped_guard (cpus_read_lock) {
guard(rcu)();

- get_scan_cpumasks(cpus, p);
+ cpumask_and(cpus, cpu_online_mask, &mm->sc_stat.visited_cpus);

for_each_cpu(cpu, cpus) {
/* XXX sched_cluster_active */
@@ -1878,18 +1848,21 @@ static void task_cache_work(struct callback_head *work)
continue;

for_each_cpu(i, sched_domain_span(sd)) {
- occ = fraction_mm_sched(cpu_rq(i),
- per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+ cur = rcu_dereference_all(cpu_rq(i)->curr);
+ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+ cur->mm == mm)
+ nr_running++;
+
+ occ = fraction_mm_sched(i, mm);
+ if (occ == 0)
+ continue;
+
a_occ += occ;
if (occ > m_occ) {
m_occ = occ;
m_cpu = i;
}

- cur = rcu_dereference_all(cpu_rq(i)->curr);
- if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
- cur->mm == mm)
- nr_running++;
}

/*
--
2.34.1