[PATCH v2] sched/cache: Reduce the overhead of task_cache_work by only scanning the visited cpus.

From: Luo Gengkun

Date: Tue Apr 14 2026 - 10:42:58 EST


The overhead of task_cache_work is high, especially on multi-NUMA systems.
Currently, task_cache_work tries to find the pref_llc by scanning all cpus in the
system. However, most of these scans are meaningless, such as those for
cpus that have never been visited or were accessed a long time ago.

To address this problem, this patch introduces visited_cpus to track the
visited cpus and uses llc_epoch_visited_timeout to evict cpus that have
timed out.

Signed-off-by: Luo Gengkun <luogengkun2@xxxxxxxxxx>
---
Thanks for the reviews. I've updated the patch based on your feedback.

v2 Changes:
1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
2. Optimized llc_epoch_visited_timeout by using a static key to minimize overhead.
---
include/linux/sched.h | 1 +
kernel/sched/debug.c | 50 +++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 25 +++++++++++++++++++---
kernel/sched/sched.h | 6 ++++++
4 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfa4bfd099c6..f2327a13fda8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2390,6 +2390,7 @@ struct sched_cache_time {

struct sched_cache_stat {
struct sched_cache_time __percpu *pcpu_sched;
+ struct cpumask visited_cpus;
raw_spinlock_t lock;
unsigned long epoch;
u64 nr_running_avg;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4469e1c152c8..46aa73939f9e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -247,6 +247,54 @@ static const struct file_operations sched_cache_enable_fops = {
.llseek = seq_lseek,
.release = single_release,
};
+
+static void sched_cache_timeout_set(void)
+{
+ if (llc_epoch_visited_timeout) {
+ if (!static_branch_likely(&sched_cache_timeout))
+ static_branch_enable(&sched_cache_timeout);
+ } else {
+ if (static_branch_likely(&sched_cache_timeout))
+ static_branch_disable(&sched_cache_timeout);
+ }
+}
+
+static ssize_t
+sched_cache_timeout_enable_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int val, ret;
+
+ ret = kstrtouint_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ llc_epoch_visited_timeout = val;
+
+ sched_cache_timeout_set();
+
+ return cnt;
+}
+
+static int sched_cache_timeout_enable_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", llc_epoch_visited_timeout);
+ return 0;
+}
+
+static int sched_cache_timeout_enable_open(struct inode *inode,
+ struct file *filp)
+{
+ return single_open(filp, sched_cache_timeout_enable_show, NULL);
+}
+
+static const struct file_operations sched_cache_timeout_enable_fops = {
+ .open = sched_cache_timeout_enable_open,
+ .write = sched_cache_timeout_enable_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif

#ifdef CONFIG_PREEMPT_DYNAMIC
@@ -669,6 +717,8 @@ static __init int sched_init_debug(void)
llc = debugfs_create_dir("llc_balancing", debugfs_sched);
debugfs_create_file("enabled", 0644, llc, NULL,
&sched_cache_enable_fops);
+ debugfs_create_file("epoch_visited_timeout", 0644, llc, NULL,
+ &sched_cache_timeout_enable_fops);
debugfs_create_u32("aggr_tolerance", 0644, llc,
&llc_aggr_tolerance);
debugfs_create_u32("epoch_period", 0644, llc,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4e22696a0b1..89f44ea97fee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1285,9 +1285,12 @@ static void set_next_buddy(struct sched_entity *se);
__read_mostly unsigned int llc_aggr_tolerance = 1;
__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
+__read_mostly unsigned int llc_epoch_visited_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
__read_mostly unsigned int llc_imb_pct = 20;
__read_mostly unsigned int llc_overaggr_pct = 50;

+DEFINE_STATIC_KEY_TRUE(sched_cache_timeout);
+
static int llc_id(int cpu)
{
if (cpu < 0)
@@ -1466,6 +1469,7 @@ void mm_init_sched(struct mm_struct *mm,
raw_spin_lock_init(&mm->sc_stat.lock);
mm->sc_stat.epoch = epoch;
mm->sc_stat.cpu = -1;
+ cpumask_clear(&mm->sc_stat.visited_cpus);

/*
* The update to mm->sc_stat should not be reordered
@@ -1582,6 +1586,9 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
pcpu_sched->runtime += delta_exec;
rq->cpu_runtime += delta_exec;
epoch = rq->cpu_epoch;
+ if (sched_cache_timeout_enabled() &&
+ !cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus))
+ cpumask_set_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
}

/*
@@ -1724,7 +1731,10 @@ static void task_cache_work(struct callback_head *work)
return;

scoped_guard (cpus_read_lock) {
- get_scan_cpumasks(cpus, p);
+ if (!sched_cache_timeout_enabled())
+ get_scan_cpumasks(cpus, p);
+ else
+ cpumask_and(cpus, cpu_online_mask, &mm->sc_stat.visited_cpus);

for_each_cpu(cpu, cpus) {
/* XXX sched_cluster_active */
@@ -1736,8 +1746,17 @@ static void task_cache_work(struct callback_head *work)
continue;

for_each_cpu(i, sched_domain_span(sd)) {
- occ = fraction_mm_sched(cpu_rq(i),
- per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+ struct rq *rq = cpu_rq(i);
+ struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, i);
+ /* Skip the rq that has not been hit for a long time */
+ if (sched_cache_timeout_enabled() &&
+ cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus) &&
+ (rq->cpu_epoch - pcpu_sched->epoch) >
+ llc_epoch_visited_timeout) {
+ cpumask_clear_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
+ continue;
+ }
+ occ = fraction_mm_sched(rq, pcpu_sched);
a_occ += occ;
if (occ > m_occ) {
m_occ = occ;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b757812725f7..2ba09e9567af 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4037,10 +4037,12 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
#ifdef CONFIG_SCHED_CACHE
DECLARE_STATIC_KEY_FALSE(sched_cache_present);
DECLARE_STATIC_KEY_FALSE(sched_cache_active);
+DECLARE_STATIC_KEY_TRUE(sched_cache_timeout);
extern int sysctl_sched_cache_user;
extern unsigned int llc_aggr_tolerance;
extern unsigned int llc_epoch_period;
extern unsigned int llc_epoch_affinity_timeout;
+extern unsigned int llc_epoch_visited_timeout;
extern unsigned int llc_imb_pct;
extern unsigned int llc_overaggr_pct;

@@ -4051,6 +4053,10 @@ static inline bool sched_cache_enabled(void)

extern void sched_cache_active_set_unlocked(void);

+static inline bool sched_cache_timeout_enabled(void)
+{
+ return static_branch_unlikely(&sched_cache_timeout);
+}
#endif

void sched_domains_free_llc_id(int cpu);
--
2.34.1