[Patch v4 22/22] -- DO NOT APPLY!!! -- sched/cache/debug: Add ftrace to track the load balance statistics
From: Tim Chen
Date: Wed Apr 01 2026 - 17:51:01 EST
From: Chen Yu <yu.c.chen@xxxxxxxxx>
Debug patch only.
To help investigate any potential performance regressions caused by
cache-aware scheduling in the future, introduce these ftrace events.
Users can leverage these trace events (via bpftrace, etc.)
to monitor the cache-aware load balancing activity - specifically,
whether tasks are moved to their preferred LLC, moved out of their
preferred LLC, whether cache-aware load balancing is skipped
because the memory footprint limit is exceeded or there are too
many active tasks, and the reason why LLC-preferred migration is
allowed or rejected.
Together with existing scheduler events, the newly introduced
events above can be used to narrow down the performance regression.
For example, the regression could be caused by excessive task
migrations among CPUs, which can be tracked either by
trace_sched_attach_task() or by checking the return value of
select_task_rq_fair(). Alternatively, it could be caused by
over-aggregation within a single LLC, which can be identified
via context switch events.
The scanning time to find the hottest LLC is simply recorded,
which can be used to evaluate whether the statistics calculation
for cache-aware scheduling is costly.
Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx>
Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---
Notes:
v3->v4:
Add more trace events.
include/trace/events/sched.h | 140 +++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 64 +++++++++++++---
2 files changed, 192 insertions(+), 12 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..8d1d5fa32ad2 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -10,6 +10,146 @@
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
+#ifdef CONFIG_SCHED_CACHE
+TRACE_EVENT(sched_llc_mig,
+ TP_PROTO(unsigned long dst_util, unsigned long dst_cap,
+ unsigned long src_util, unsigned long src_cap,
+ int to_pref, int mig_hint),
+
+ TP_ARGS(dst_util, dst_cap, src_util, src_cap, to_pref, mig_hint),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, dst_util)
+ __field(unsigned long, dst_cap)
+ __field(unsigned long, src_util)
+ __field(unsigned long, src_cap)
+ __field(int, to_pref)
+ __field(int, mig_hint)
+ ),
+
+ TP_fast_assign(
+ __entry->dst_util = dst_util;
+ __entry->dst_cap = dst_cap;
+ __entry->src_util = src_util;
+ __entry->src_cap = src_cap;
+ __entry->to_pref = to_pref;
+ __entry->mig_hint = mig_hint;
+ ),
+
+ TP_printk("dst_util=%lu dst_cap=%lu src_util=%lu src_cap=%lu to_pref=%d mig_hint=%d",
+ __entry->dst_util, __entry->dst_cap, __entry->src_util,
+ __entry->src_cap, __entry->to_pref, __entry->mig_hint)
+);
+
+TRACE_EVENT(sched_llc_scan,
+
+ TP_PROTO(struct task_struct *t, unsigned long cost),
+
+ TP_ARGS(t, cost),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, cost)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->cost = cost;
+ ),
+
+ TP_printk("comm=%s pid=%d scan_cost=%lu",
+ __entry->comm, __entry->pid,
+ __entry->cost)
+);
+
+TRACE_EVENT(sched_exceed_llc_cap,
+
+ TP_PROTO(struct task_struct *t, int exceeded, int scale,
+ unsigned long llc, unsigned long rss),
+
+ TP_ARGS(t, exceeded, scale, llc, rss),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, exceeded)
+ __field(int, scale)
+ __field(unsigned long, llc)
+ __field(unsigned long, rss)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->exceeded = exceeded;
+ __entry->scale = scale;
+ __entry->llc = llc;
+ __entry->rss = rss;
+ ),
+
+ TP_printk("comm=%s pid=%d exceed_cap=%d scale=%d llc=%lu rss=%lu",
+ __entry->comm, __entry->pid,
+ __entry->exceeded, __entry->scale,
+ __entry->llc, __entry->rss)
+);
+
+TRACE_EVENT(sched_exceed_llc_nr,
+
+ TP_PROTO(struct task_struct *t, int exceeded),
+
+ TP_ARGS(t, exceeded),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, exceeded)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->exceeded = exceeded;
+ ),
+
+ TP_printk("comm=%s pid=%d exceed_nr=%d",
+ __entry->comm, __entry->pid,
+ __entry->exceeded)
+);
+
+TRACE_EVENT(sched_attach_task,
+
+ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc,
+ int attach_cpu, int attach_llc),
+
+ TP_ARGS(t, pref_cpu, pref_llc, attach_cpu, attach_llc),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, pref_cpu)
+ __field(int, pref_llc)
+ __field(int, attach_cpu)
+ __field(int, attach_llc)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->pref_cpu = pref_cpu;
+ __entry->pref_llc = pref_llc;
+ __entry->attach_cpu = attach_cpu;
+ __entry->attach_llc = attach_llc;
+ ),
+
+ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d",
+ __entry->comm, __entry->pid,
+ __entry->pref_cpu, __entry->pref_llc,
+ __entry->attach_cpu, __entry->attach_llc)
+);
+#endif
+
/*
* Tracepoint for calling kthread_stop, performed to end a kthread:
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2b12918b00fd..f446d755f3c5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1337,9 +1337,11 @@ static inline int get_sched_cache_scale(int mul)
return (1 + (llc_aggr_tolerance - 1) * mul);
}
-static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu,
+ struct task_struct *p)
{
struct cacheinfo *ci;
+ bool exceeded;
u64 rss, llc;
int scale;
@@ -1385,11 +1387,17 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
if (scale == INT_MAX)
return false;
- return ((llc * (u64)scale) < (rss * PAGE_SIZE));
+ exceeded = ((llc * (u64)scale) < (rss * PAGE_SIZE));
+
+ trace_sched_exceed_llc_cap(p, exceeded, scale, llc, rss);
+
+ return exceeded;
}
-static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
+static bool exceed_llc_nr(struct mm_struct *mm, int cpu,
+ struct task_struct *p)
{
+ bool exceeded;
int scale;
/*
@@ -1400,8 +1408,12 @@ static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
if (scale == INT_MAX)
return false;
- return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
+ exceeded = !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
(scale * per_cpu(sd_llc_size, cpu)));
+
+ trace_sched_exceed_llc_nr(p, exceeded);
+
+ return exceeded;
}
static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
@@ -1606,8 +1618,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
if (time_after(epoch,
READ_ONCE(mm->sc_stat.epoch) + llc_epoch_affinity_timeout) ||
get_nr_threads(p) <= 1 ||
- exceed_llc_nr(mm, cpu_of(rq)) ||
- exceed_llc_capacity(mm, cpu_of(rq))) {
+ exceed_llc_nr(mm, cpu_of(rq), p) ||
+ exceed_llc_capacity(mm, cpu_of(rq), p)) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
}
@@ -1718,6 +1730,7 @@ static void task_cache_work(struct callback_head *work)
struct mm_struct *mm = p->mm;
unsigned long m_a_occ = 0;
cpumask_var_t cpus;
+ u64 t0, scan_cost;
WARN_ON_ONCE(work != &p->cache_work);
@@ -1728,7 +1741,7 @@ static void task_cache_work(struct callback_head *work)
curr_cpu = task_cpu(p);
if (get_nr_threads(p) <= 1 ||
- exceed_llc_capacity(mm, curr_cpu)) {
+ exceed_llc_capacity(mm, curr_cpu, p)) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
@@ -1738,6 +1751,8 @@ static void task_cache_work(struct callback_head *work)
if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
return;
+ t0 = sched_clock_cpu(curr_cpu);
+
scoped_guard (cpus_read_lock) {
get_scan_cpumasks(cpus, p);
@@ -1793,6 +1808,9 @@ static void task_cache_work(struct callback_head *work)
}
}
+ scan_cost = sched_clock_cpu(curr_cpu) - t0;
+ trace_sched_llc_scan(p, scan_cost);
+
if (m_a_occ > (2 * curr_m_a_occ)) {
/*
* Avoid switching sc_stat.cpu too fast.
@@ -10192,8 +10210,11 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
dst_util = dst_util + tsk_util;
if (!fits_llc_capacity(dst_util, dst_cap) &&
- !fits_llc_capacity(src_util, src_cap))
+ !fits_llc_capacity(src_util, src_cap)) {
+ trace_sched_llc_mig(dst_util, dst_cap, src_util, src_cap,
+ to_pref, mig_unrestricted);
return mig_unrestricted;
+ }
if (to_pref) {
/*
@@ -10203,8 +10224,11 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
* increase the imbalance too much.
*/
if (!fits_llc_capacity(dst_util, dst_cap) &&
- util_greater(dst_util, src_util))
+ util_greater(dst_util, src_util)) {
+ trace_sched_llc_mig(dst_util, dst_cap, src_util, src_cap,
+ to_pref, mig_forbid);
return mig_forbid;
+ }
} else {
/*
* Don't migrate if we will leave preferred LLC
@@ -10214,9 +10238,15 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
* back to preferred LLC.
*/
if (fits_llc_capacity(src_util, src_cap) ||
- !util_greater(src_util, dst_util))
+ !util_greater(src_util, dst_util)) {
+ trace_sched_llc_mig(dst_util, dst_cap, src_util, src_cap,
+ to_pref, mig_forbid);
return mig_forbid;
+ }
}
+
+ trace_sched_llc_mig(dst_util, dst_cap, src_util, src_cap,
+ to_pref, mig_llc);
return mig_llc;
}
@@ -10243,8 +10273,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
* Skip cache aware load balance for single/too many threads
* or large memory RSS.
*/
- if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) ||
- exceed_llc_capacity(mm, dst_cpu)) {
+ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu, p) ||
+ exceed_llc_capacity(mm, dst_cpu, p)) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
return mig_unrestricted;
@@ -10722,6 +10752,16 @@ static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
+#ifdef CONFIG_SCHED_CACHE
+ if (p->mm) {
+ int pref_cpu = p->mm->sc_stat.cpu;
+
+ trace_sched_attach_task(p,
+ pref_cpu,
+ pref_cpu != -1 ? llc_id(pref_cpu) : -1,
+ cpu_of(rq), llc_id(cpu_of(rq)));
+ }
+#endif
WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
wakeup_preempt(rq, p, 0);
--
2.32.0