[PATCH v3] sched/cache: Reduce the overhead of task_cache_work by scanning only the visited cpus
From: Luo Gengkun
Date: Thu Apr 23 2026 - 04:29:01 EST
The overhead of task_cache_work() is high, especially on multi-NUMA
systems. Currently, task_cache_work() tries to find the pref_llc by
scanning all cpus in the system. However, most of these scans are
wasted on cpus that the mm has never run on, or that it last ran on a
long time ago.

To address this, introduce a visited_cpus cpumask to track the cpus
the mm has run on, and use llc_epoch_affinity_timeout to evict cpus
that have not been hit within the timeout.
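
In rough terms, the scan loop in task_cache_work() goes from walking
every cpu to walking only the visited ones (sketch of the resulting
code; the real change is in the diff below):

	cpumask_and(cpus, cpus, &mm->sc_stat.visited_cpus);
	for_each_cpu(cpu, cpus) {
		...
		for_each_cpu_and(i, sched_domain_span(sd),
				 &mm->sc_stat.visited_cpus) {
			occ = fraction_mm_sched(i, mm);
			if (occ == 0)	/* not hit within the timeout */
				continue;
			...
		}
	}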
Signed-off-by: Luo Gengkun <luogengkun2@xxxxxxxxxx>
---
Change history

v3:
1. Remove the static key and enable this feature by default.
2. Reuse llc_epoch_affinity_timeout instead of introducing
   llc_epoch_visited_timeout.
3. Move the calculation of rq->cpu_epoch - pcpu_sched->epoch into
   fraction_mm_sched() to avoid a race between task_cache_work() and
   __update_mm_sched().
4. Reset work->next at the end of task_cache_work() to prevent concurrent
   executions by multiple threads within the same process.

v2:
1. Added a pre-check before setting/clearing visited_cpus to avoid
   cache-to-cache (C2C) overhead.
2. Optimized llc_epoch_visited_timeout by using a static key to minimize
   overhead.
---
include/linux/sched.h | 1 +
kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++--------
2 files changed, 31 insertions(+), 8 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfa4bfd099c6..f2327a13fda8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2390,6 +2390,7 @@ struct sched_cache_time {
struct sched_cache_stat {
struct sched_cache_time __percpu *pcpu_sched;
+ struct cpumask visited_cpus;
raw_spinlock_t lock;
unsigned long epoch;
u64 nr_running_avg;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4e22696a0b1..49369f656d53 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1466,6 +1466,7 @@ void mm_init_sched(struct mm_struct *mm,
raw_spin_lock_init(&mm->sc_stat.lock);
mm->sc_stat.epoch = epoch;
mm->sc_stat.cpu = -1;
+ cpumask_clear(&mm->sc_stat.visited_cpus);
/*
* The update to mm->sc_stat should not be reordered
@@ -1507,11 +1508,18 @@ static inline void __update_mm_sched(struct rq *rq,
}
}
-static unsigned long fraction_mm_sched(struct rq *rq,
- struct sched_cache_time *pcpu_sched)
+static unsigned long fraction_mm_sched(int cpu, struct mm_struct *mm)
{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu);
guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
+ /* Skip the rq that has not been hit for a long time */
+ if ((rq->cpu_epoch - pcpu_sched->epoch) > llc_epoch_affinity_timeout) {
+ cpumask_clear_cpu(cpu, &mm->sc_stat.visited_cpus);
+ return 0;
+ }
+
__update_mm_sched(rq, pcpu_sched);
/*
@@ -1582,6 +1590,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
pcpu_sched->runtime += delta_exec;
rq->cpu_runtime += delta_exec;
epoch = rq->cpu_epoch;
+ if (!cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus))
+ cpumask_set_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
}
/*
@@ -1627,7 +1637,11 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
guard(raw_spinlock)(&mm->sc_stat.lock);
- if (work->next == work) {
+ /*
+ * Pairs with smp_store_release() in task_cache_work() to ensure that
+ * task_cache_work() has finished before re-queueing the work.
+ */
+ if (smp_load_acquire(&work->next) == work) {
task_work_add(p, work, TWA_RESUME);
WRITE_ONCE(mm->sc_stat.epoch, epoch);
}
@@ -1695,6 +1709,8 @@ static inline void update_avg_scale(u64 *avg, u64 sample)
*avg += div64_s64(diff, divisor);
}
+DEFINE_FREE(reset_work, struct callback_head *, smp_store_release(&_T->next, _T))
+
static void task_cache_work(struct callback_head *work)
{
int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu;
@@ -1703,11 +1719,14 @@ static void task_cache_work(struct callback_head *work)
struct mm_struct *mm = p->mm;
unsigned long m_a_occ = 0;
cpumask_var_t cpus;
+ /*
+ * Reset work->next at the end to avoid a race between threads
+ * within the same process.
+ */
+ struct callback_head *_w __free(reset_work) = work;
WARN_ON_ONCE(work != &p->cache_work);
- work->next = work;
-
if (p->flags & PF_EXITING)
return;
@@ -1725,6 +1744,7 @@ static void task_cache_work(struct callback_head *work)
scoped_guard (cpus_read_lock) {
get_scan_cpumasks(cpus, p);
+ cpumask_and(cpus, cpus, &mm->sc_stat.visited_cpus);
for_each_cpu(cpu, cpus) {
/* XXX sched_cluster_active */
@@ -1735,9 +1755,11 @@ static void task_cache_work(struct callback_head *work)
if (!sd)
continue;
- for_each_cpu(i, sched_domain_span(sd)) {
- occ = fraction_mm_sched(cpu_rq(i),
- per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+ for_each_cpu_and(i, sched_domain_span(sd), &mm->sc_stat.visited_cpus) {
+ occ = fraction_mm_sched(i, mm);
+ if (occ == 0)
+ continue;
+
a_occ += occ;
if (occ > m_occ) {
m_occ = occ;
--
2.34.1