[Patch v4 20/22] sched/cache: Add user control to adjust the aggressiveness of cache-aware scheduling

From: Tim Chen

Date: Wed Apr 01 2026 - 17:56:11 EST


From: Chen Yu <yu.c.chen@xxxxxxxxx>

Introduce a set of debugfs knobs to control how aggressively
cache-aware scheduling performs task aggregation.

(1) aggr_tolerance
With sched_cache enabled, the scheduler uses a process's RSS as a
proxy for its LLC footprint to determine if aggregating tasks on the
preferred LLC could cause cache contention. If RSS exceeds the LLC
size, aggregation is skipped. Some workloads with large RSS but small
actual memory footprints may still benefit from aggregation. Since
the kernel cannot efficiently track per-task cache usage (resctrl is
user-space only), userspace can provide a more accurate hint.

Introduce /sys/kernel/debug/sched/llc_balancing/aggr_tolerance to
let users control how strictly RSS limits aggregation. Meaningful
values range from 0 to 100:
- 0: Cache-aware scheduling is disabled.
- 1: Strict; tasks with RSS larger than LLC size are skipped.
- >=100: Aggressive; tasks are aggregated regardless of RSS.
For example, with a 32MB L3 cache:

- aggr_tolerance=1 -> tasks with RSS > 32MB are skipped.
- aggr_tolerance=99 -> tasks with RSS > 784GB are skipped
(784GB = (1 + (99 - 1) * 256) * 32MB).
Similarly, /sys/kernel/debug/sched/llc_balancing/aggr_tolerance also
controls how strictly the number of active threads is considered when
doing cache aware load balance. The number of SMTs is also considered.
High SMT counts reduce the aggregation capacity, preventing excessive
task aggregation on SMT-heavy systems like Power10/Power11.

Yangyu suggested introducing separate aggregation controls for the
number of active threads and memory RSS checks. Since there are plans
to add per-process/task group controls, fine-grained tunables are
deferred to that implementation.

(2) epoch_period, epoch_affinity_timeout,
imb_pct, and overaggr_pct are also turned into tunables.

Suggested-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
Suggested-by: Madadi Vineeth Reddy <vineethr@xxxxxxxxxxxxx>
Suggested-by: Shrikanth Hegde <sshegde@xxxxxxxxxxxxx>
Suggested-by: Tingyin Duan <tingyin.duan@xxxxxxxxx>
Suggested-by: Jianyong Wu <jianyong.wu@xxxxxxxxxxx>
Suggested-by: Yangyu Chen <cyy@xxxxxxxxxxxx>
Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx>
Co-developed-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---

Notes:
v3->v4:
Create the debugfs knobs under debug/sched/llc_balancing directory.
(Peter Zijlstra)

kernel/sched/debug.c | 10 ++++++++
kernel/sched/fair.c | 60 ++++++++++++++++++++++++++++++++++++++------
kernel/sched/sched.h | 5 ++++
3 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 3019412d8009..4469e1c152c8 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -669,6 +669,16 @@ static __init int sched_init_debug(void)
llc = debugfs_create_dir("llc_balancing", debugfs_sched);
debugfs_create_file("enabled", 0644, llc, NULL,
&sched_cache_enable_fops);
+ debugfs_create_u32("aggr_tolerance", 0644, llc,
+ &llc_aggr_tolerance);
+ debugfs_create_u32("epoch_period", 0644, llc,
+ &llc_epoch_period);
+ debugfs_create_u32("epoch_affinity_timeout", 0644, llc,
+ &llc_epoch_affinity_timeout);
+ debugfs_create_u32("overaggr_pct", 0644, llc,
+ &llc_overaggr_pct);
+ debugfs_create_u32("imb_pct", 0644, llc,
+ &llc_imb_pct);
#endif

debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a2d1b8b2a188..e4e22696a0b1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1282,6 +1282,11 @@ static void set_next_buddy(struct sched_entity *se);
*/
#define EPOCH_PERIOD (HZ / 100) /* 10 ms */
#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */
+__read_mostly unsigned int llc_aggr_tolerance = 1;
+__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
+__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
+__read_mostly unsigned int llc_imb_pct = 20;
+__read_mostly unsigned int llc_overaggr_pct = 50;

static int llc_id(int cpu)
{
@@ -1316,10 +1321,22 @@ static inline bool valid_llc_buf(struct sched_domain *sd,
return true;
}

+static inline int get_sched_cache_scale(int mul)
+{
+ if (!llc_aggr_tolerance)
+ return 0;
+
+ if (llc_aggr_tolerance >= 100)
+ return INT_MAX;
+
+ return (1 + (llc_aggr_tolerance - 1) * mul);
+}
+
static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
{
struct cacheinfo *ci;
u64 rss, llc;
+ int scale;

/*
* get_cpu_cacheinfo_level() can not be used
@@ -1344,13 +1361,42 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
rss = get_mm_counter(mm, MM_ANONPAGES) +
get_mm_counter(mm, MM_SHMEMPAGES);

- return (llc < (rss * PAGE_SIZE));
+ /*
+ * Scale the LLC size by 256*llc_aggr_tolerance
+ * and compare it to the task's RSS size.
+ *
+ * Suppose the L3 size is 32MB. If the
+ * llc_aggr_tolerance is 1:
+ * When the RSS is larger than 32MB, the process
+ * is regarded as exceeding the LLC capacity. If
+ * the llc_aggr_tolerance is 99:
+ * When the RSS is larger than 784GB, the process
+ * is regarded as exceeding the LLC capacity:
+ * 784GB = (1 + (99 - 1) * 256) * 32MB
+ * If the llc_aggr_tolerance is 100:
+ * ignore the RSS.
+ */
+ scale = get_sched_cache_scale(256);
+ if (scale == INT_MAX)
+ return false;
+
+ return ((llc * (u64)scale) < (rss * PAGE_SIZE));
}

static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
{
+ int scale;
+
+ /*
+ * Scale the number of 'cores' in a LLC by llc_aggr_tolerance
+ * and compare it to the task's active threads.
+ */
+ scale = get_sched_cache_scale(1);
+ if (scale == INT_MAX)
+ return false;
+
return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
- per_cpu(sd_llc_size, cpu));
+ (scale * per_cpu(sd_llc_size, cpu)));
}

static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
@@ -1448,9 +1494,9 @@ static inline void __update_mm_sched(struct rq *rq,
long delta = now - rq->cpu_epoch_next;

if (delta > 0) {
- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
+ n = (delta + llc_epoch_period - 1) / max(llc_epoch_period, 1U);
rq->cpu_epoch += n;
- rq->cpu_epoch_next += n * EPOCH_PERIOD;
+ rq->cpu_epoch_next += n * llc_epoch_period;
__shr_u64(&rq->cpu_runtime, n);
}

@@ -1543,7 +1589,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
* has only 1 thread, invalidate its preferred state.
*/
if (time_after(epoch,
- READ_ONCE(mm->sc_stat.epoch) + EPOCH_LLC_AFFINITY_TIMEOUT) ||
+ READ_ONCE(mm->sc_stat.epoch) + llc_epoch_affinity_timeout) ||
get_nr_threads(p) <= 1 ||
exceed_llc_nr(mm, cpu_of(rq)) ||
exceed_llc_capacity(mm, cpu_of(rq))) {
@@ -10018,7 +10064,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
*/
static bool fits_llc_capacity(unsigned long util, unsigned long max)
{
- u32 aggr_pct = 50;
+ u32 aggr_pct = llc_overaggr_pct;

/*
* For single core systems, raise the aggregation
@@ -10038,7 +10084,7 @@ static bool fits_llc_capacity(unsigned long util, unsigned long max)
*/
/* Allows dst util to be bigger than src util by up to bias percent */
#define util_greater(util1, util2) \
- ((util1) * 100 > (util2) * 120)
+ ((util1) * 100 > (util2) * (100 + llc_imb_pct))

static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
unsigned long *cap)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5561bdcc8bf5..b757812725f7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4038,6 +4038,11 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
DECLARE_STATIC_KEY_FALSE(sched_cache_present);
DECLARE_STATIC_KEY_FALSE(sched_cache_active);
extern int sysctl_sched_cache_user;
+extern unsigned int llc_aggr_tolerance;
+extern unsigned int llc_epoch_period;
+extern unsigned int llc_epoch_affinity_timeout;
+extern unsigned int llc_imb_pct;
+extern unsigned int llc_overaggr_pct;

static inline bool sched_cache_enabled(void)
{
--
2.32.0