[RFC PATCH] sched/fair: dynamically scale the period of cache work
From: Jianyong Wu
Date: Mon Apr 13 2026 - 03:28:29 EST
When a preferred LLC is selected and remains stable, task_cache_work does
not need to run frequently. Because the work scans all CPUs in the system
to compute occupancy, running it at a high frequency hurts performance.
We therefore reduce the scan rate in that case.
On the other hand, if the preferred node becomes suboptimal, we should
increase the scan frequency to quickly find a better placement. The scan
period is therefore dynamically adjusted.
Signed-off-by: Jianyong Wu <wujianyong@xxxxxxxx>
---
Hi ChenYu, Tim, Gengkun,
I have another approach to address this issue, based on the observation
that the scan work can be canceled if the preferred node is stable. This
patch merely demonstrates the idea, and still needs more testing to
verify its functionality. I'm sending it out early to gather feedback and
opinions.
Thanks
Jianyong
---
include/linux/sched.h | 4 +++
kernel/sched/debug.c | 6 ++++
kernel/sched/fair.c | 69 ++++++++++++++++++++++++++++++++++++-------
kernel/sched/sched.h | 3 ++
4 files changed, 72 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e24b2b86aba4..87ce70ba6552 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2393,7 +2393,11 @@ struct sched_cache_stat {
struct sched_cache_time __percpu *pcpu_time;
raw_spinlock_t lock;
unsigned long epoch;
+ unsigned long last_reset_tick;
+ unsigned long next_scan;
+ unsigned long scan_period;
u64 nr_running_avg;
+ int need_scan;
int cpu;
} ____cacheline_aligned_in_smp;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4469e1c152c8..56ebc379127a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -679,6 +679,12 @@ static __init int sched_init_debug(void)
&llc_overaggr_pct);
debugfs_create_u32("imb_pct", 0644, llc,
&llc_imb_pct);
+ debugfs_create_u32("scan_period_max", 0644, llc,
+ &llc_scan_period_max);
+ debugfs_create_u32("scan_period_min", 0644, llc,
+ &llc_scan_period_min);
+ debugfs_create_u32("scan_period_threshold", 0644, llc,
+ &llc_scan_period_threshold);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f446d755f3c5..974fe4b992ca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1287,6 +1287,9 @@ __read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
__read_mostly unsigned int llc_imb_pct = 20;
__read_mostly unsigned int llc_overaggr_pct = 50;
+__read_mostly unsigned int llc_scan_period_min = 1;
+__read_mostly unsigned int llc_scan_period_max = 64 * HZ;
+__read_mostly unsigned int llc_scan_period_threshold = HZ;
bool sched_cache_inuse(void)
{
@@ -1486,6 +1489,7 @@ void mm_init_sched(struct mm_struct *mm,
raw_spin_lock_init(&mm->sc_stat.lock);
mm->sc_stat.epoch = epoch;
mm->sc_stat.cpu = -1;
+ mm->sc_stat.scan_period = llc_scan_period_min;
/*
* The update to mm->sc_stat should not be reordered
@@ -1611,15 +1615,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
epoch = rq->cpu_epoch;
}
- /*
- * If this process hasn't hit task_cache_work() for a while, or it
- * has only 1 thread, invalidate its preferred state.
- */
+ /* If it has only 1 thread, invalidate its preferred state */
if (time_after(epoch,
- READ_ONCE(mm->sc_stat.epoch) + llc_epoch_affinity_timeout) ||
- get_nr_threads(p) <= 1 ||
+ READ_ONCE(mm->sc_stat.epoch) + llc_epoch_affinity_timeout) ||
+ get_nr_threads(p) <= 1 ||
exceed_llc_nr(mm, cpu_of(rq), p) ||
exceed_llc_capacity(mm, cpu_of(rq), p)) {
+ mm->sc_stat.scan_period = llc_scan_period_min;
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
}
@@ -1652,6 +1654,10 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
if (time_after_eq(mm->sc_stat.epoch, epoch))
return;
+ if (llc_scan_period_min < llc_scan_period_max && time_before(jiffies, mm->sc_stat.next_scan) &&
+ !mm->sc_stat.need_scan)
+ return;
+
guard(raw_spinlock)(&mm->sc_stat.lock);
if (work->next == work) {
@@ -1728,7 +1734,7 @@ static void task_cache_work(struct callback_head *work)
struct task_struct *p = current, *cur;
unsigned long curr_m_a_occ = 0;
struct mm_struct *mm = p->mm;
- unsigned long m_a_occ = 0;
+ unsigned long m_a_occ = 0, need_scan = 0, now;
cpumask_var_t cpus;
u64 t0, scan_cost;
@@ -1753,6 +1759,12 @@ static void task_cache_work(struct callback_head *work)
t0 = sched_clock_cpu(curr_cpu);
+ now = jiffies;
+ if (time_before(now, READ_ONCE(mm->sc_stat.next_scan)))
+ return;
+
+ WRITE_ONCE(mm->sc_stat.next_scan, (now + mm->sc_stat.scan_period));
+
scoped_guard (cpus_read_lock) {
get_scan_cpumasks(cpus, p);
@@ -1811,7 +1823,8 @@ static void task_cache_work(struct callback_head *work)
scan_cost = sched_clock_cpu(curr_cpu) - t0;
trace_sched_llc_scan(p, scan_cost);
- if (m_a_occ > (2 * curr_m_a_occ)) {
+ need_scan = READ_ONCE(mm->sc_stat.need_scan);
+ if (m_a_occ > (2 * curr_m_a_occ) || need_scan) {
/*
* Avoid switching sc_stat.cpu too fast.
* The reason to choose 2X is because:
@@ -1822,9 +1835,35 @@ static void task_cache_work(struct callback_head *work)
* 3. 2X is chosen based on test results, as it delivers
* the optimal performance gain so far.
*/
- mm->sc_stat.cpu = m_a_cpu;
+ if (m_a_occ > (2 * curr_m_a_occ))
+ mm->sc_stat.cpu = m_a_cpu;
+
+ if (!mm->sc_stat.last_reset_tick)
+ mm->sc_stat.last_reset_tick = now;
+
+ /* Change scan_period when preferred LLC changed */
+ if (((mm->sc_stat.cpu != -1) && (m_a_cpu != -1)
+ && (llc_id(mm->sc_stat.cpu) != llc_id(m_a_cpu)))
+ || need_scan) {
+ if (!need_scan)
+ need_scan = 1;
+
+ WRITE_ONCE(mm->sc_stat.scan_period,
+ max(mm->sc_stat.scan_period >> 1, llc_scan_period_min));
+ WRITE_ONCE(mm->sc_stat.last_reset_tick, now);
+ }
+ }
+
+ if ((now - READ_ONCE(mm->sc_stat.last_reset_tick) > llc_scan_period_threshold)
+ && !need_scan) {
+ WRITE_ONCE(mm->sc_stat.scan_period, min(mm->sc_stat.scan_period << 1,
+ llc_scan_period_max));
+ WRITE_ONCE(mm->sc_stat.last_reset_tick, now);
}
+ if (READ_ONCE(mm->sc_stat.need_scan))
+ WRITE_ONCE(mm->sc_stat.need_scan, 0);
+
update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running);
free_cpumask_var(cpus);
}
@@ -10260,6 +10299,7 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
struct mm_struct *mm;
bool to_pref;
int cpu;
+ enum llc_mig ret;
mm = p->mm;
if (!mm)
@@ -10287,8 +10327,17 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
else
return mig_unrestricted;
- return can_migrate_llc(src_cpu, dst_cpu,
+ ret = can_migrate_llc(src_cpu, dst_cpu,
task_util(p), to_pref);
+
+ /*
+ * If the preferred node cannot accommodate the process,
+ * accelerate task_cache_work to find a better node.
+ */
+ if (to_pref && ret == mig_forbid)
+ mm->sc_stat.need_scan = 1;
+
+ return ret;
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b757812725f7..08462175f73f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4043,6 +4043,9 @@ extern unsigned int llc_epoch_period;
extern unsigned int llc_epoch_affinity_timeout;
extern unsigned int llc_imb_pct;
extern unsigned int llc_overaggr_pct;
+extern unsigned int llc_scan_period_min;
+extern unsigned int llc_scan_period_max;
+extern unsigned int llc_scan_period_threshold;
static inline bool sched_cache_enabled(void)
{
--
2.34.1