[RFC PATCH 7/8] sched: Let sched cache take precedence over NUMA balancing

From: Jianyong Wu

Date: Wed Jun 24 2026 - 23:09:17 EST

Cache-aware scheduling optimizes thread aggregation without tracking
memory locality, leaving expensive remote memory accesses possible.

Two key conflicts exist between NUMA balancing and cache-aware logic.
First, NUMA balancing assigns a per-task preferred node, whereas cache
scheduling operates at the thread-group granularity. Second, the node
selected by NUMA balancing can clash with cache-aware placement,
breaking the scheduler's LLC-preferred node logic. Threads within one
group may end up with disjoint preferred NUMA nodes, completely
defeating cache aggregation.

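Resolve this by prioritizing cache-aware scheduling: cache logic
controls task placement and migration, while NUMA balancing only
manages page migration. To that end, task_numa_fault() returns early
while the override is active, and get_pref_llc() no longer discards
the preferred LLC when it sits on a node other than
p->numa_preferred_nid.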

This retains the strengths of both subsystems: cache-aware scheduling
optimizes thread packing and CPU load balance, and NUMA balancing
improves memory locality.

Add a debugfs tunable, enabled by default and effective only while
cache-aware scheduling is enabled. Clear it to disable this mode and
restore the original behavior:
echo 0 > /sys/kernel/debug/sched/llc_balancing/override_numa_balance

Signed-off-by: Jianyong Wu <wujianyong@xxxxxxxx>
---
kernel/sched/debug.c | 2 ++
kernel/sched/fair.c | 16 +++++++++++++++-
kernel/sched/sched.h | 1 +
3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 40584b27ea0c..1882e901bab5 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -682,6 +682,8 @@ static __init int sched_init_debug(void)
&llc_overaggr_pct);
debugfs_create_u32("imb_pct", 0644, llc,
&llc_imb_pct);
+ debugfs_create_bool("override_numa_balance", 0644, llc,
+ &llc_override_numa_balance);
#endif

debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c72837d95cac..171df11d0234 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1409,6 +1409,12 @@ __read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
__read_mostly unsigned int llc_imb_pct = 20;
__read_mostly unsigned int llc_overaggr_pct = 50;
+bool llc_override_numa_balance = true;
+
+static inline bool sched_cache_override_numa(void)
+{
+ return sched_cache_enabled() && llc_override_numa_balance;
+}

static int llc_id(int cpu)
{
@@ -1672,7 +1678,8 @@ static int get_pref_llc(struct task_struct *p, struct mm_struct *mm)
* than sched_setnuma() at least -- and thus the
* conflict only exists for a short period of time.
*/
- if (static_branch_likely(&sched_numa_balancing) &&
+ if (!sched_cache_override_numa() &&
+ static_branch_likely(&sched_numa_balancing) &&
p->numa_preferred_nid >= 0 &&
cpu_to_node(mm_sched_cpu) != p->numa_preferred_nid)
mm_sched_llc = -1;
@@ -3947,6 +3954,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!static_branch_likely(&sched_numa_balancing))
return;

+ /*
+ * When sched cache overrides NUMA balancing, we only want page
+ * migration, not task migration, so bail out here.
+ */
+ if (sched_cache_override_numa())
+ return;
+
/* for example, ksmd faulting in a user's mm */
if (!p->mm)
return;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c7c2dea65edd..44d1278b16d4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4100,6 +4100,7 @@ extern unsigned int llc_epoch_period;
extern unsigned int llc_epoch_affinity_timeout;
extern unsigned int llc_imb_pct;
extern unsigned int llc_overaggr_pct;
+extern bool llc_override_numa_balance;

static inline bool sched_cache_enabled(void)
{
--
2.34.1