[PATCH v3 21/21] -- DO NOT APPLY!!! -- sched/cache/debug: Add ftrace to track the load balance statistics
From: Tim Chen
Date: Tue Feb 10 2026 - 17:19:12 EST
From: Chen Yu <yu.c.chen@xxxxxxxxx>
Debug patch only.
Users can leverage these trace events (via bpftrace, etc.)
to monitor the cache-aware load balancing activity - specifically,
whether tasks are moved to their preferred LLC, moved out of their
preferred LLC, or whether cache-aware load balancing is skipped
because the process's memory footprint exceeds the LLC capacity
limit or it has too many active tasks.
Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx>
Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---
Notes:
v2->v3:
Add more trace events for when the process exceeds the LLC
capacity limit or the limit on the number of active threads
(moved from schedstat to trace events for better bpf tracking)
include/trace/events/sched.h | 79 ++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 40 ++++++++++++++----
2 files changed, 110 insertions(+), 9 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..b73327653e4b 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -10,6 +10,85 @@
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
+#ifdef CONFIG_SCHED_CACHE
+TRACE_EVENT(sched_exceed_llc_cap,
+
+ TP_PROTO(struct task_struct *t, int exceeded),
+
+ TP_ARGS(t, exceeded),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, exceeded )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->exceeded = exceeded;
+ ),
+
+ TP_printk("comm=%s pid=%d exceed_cap=%d",
+ __entry->comm, __entry->pid,
+ __entry->exceeded)
+);
+
+TRACE_EVENT(sched_exceed_llc_nr,
+
+ TP_PROTO(struct task_struct *t, int exceeded),
+
+ TP_ARGS(t, exceeded),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, exceeded )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->exceeded = exceeded;
+ ),
+
+ TP_printk("comm=%s pid=%d exceed_nr=%d",
+ __entry->comm, __entry->pid,
+ __entry->exceeded)
+);
+
+TRACE_EVENT(sched_attach_task,
+
+ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc,
+ int attach_cpu, int attach_llc),
+
+ TP_ARGS(t, pref_cpu, pref_llc, attach_cpu, attach_llc),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, pref_cpu )
+ __field( int, pref_llc )
+ __field( int, attach_cpu )
+ __field( int, attach_llc )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->pref_cpu = pref_cpu;
+ __entry->pref_llc = pref_llc;
+ __entry->attach_cpu = attach_cpu;
+ __entry->attach_llc = attach_llc;
+ ),
+
+ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d",
+ __entry->comm, __entry->pid,
+ __entry->pref_cpu, __entry->pref_llc,
+ __entry->attach_cpu, __entry->attach_llc)
+);
+#endif
+
/*
* Tracepoint for calling kthread_stop, performed to end a kthread:
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25cee3dd767c..977091fd0e49 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1245,9 +1245,11 @@ static inline int get_sched_cache_scale(int mul)
return (1 + (llc_aggr_tolerance - 1) * mul);
}
-static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu,
+ struct task_struct *p)
{
struct cacheinfo *ci;
+ bool exceeded;
u64 rss, llc;
int scale;
@@ -1293,12 +1295,18 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
if (scale == INT_MAX)
return false;
- return ((llc * scale) <= (rss * PAGE_SIZE));
+ exceeded = ((llc * scale) <= (rss * PAGE_SIZE));
+
+ trace_sched_exceed_llc_cap(p, exceeded);
+
+ return exceeded;
}
-static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
+static bool exceed_llc_nr(struct mm_struct *mm, int cpu,
+ struct task_struct *p)
{
int smt_nr = 1, scale;
+ bool exceeded;
#ifdef CONFIG_SCHED_SMT
if (sched_smt_active())
@@ -1313,8 +1321,12 @@ static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
if (scale == INT_MAX)
return false;
- return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
+ exceeded = !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
(scale * per_cpu(sd_llc_size, cpu)));
+
+ trace_sched_exceed_llc_nr(p, exceeded);
+
+ return exceeded;
}
static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
@@ -1522,8 +1534,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
if (time_after(epoch,
READ_ONCE(mm->sc_stat.epoch) + llc_epoch_affinity_timeout) ||
get_nr_threads(p) <= 1 ||
- exceed_llc_nr(mm, cpu_of(rq)) ||
- exceed_llc_capacity(mm, cpu_of(rq))) {
+ exceed_llc_nr(mm, cpu_of(rq), p) ||
+ exceed_llc_capacity(mm, cpu_of(rq), p)) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
}
@@ -1600,7 +1612,7 @@ static void task_cache_work(struct callback_head *work)
curr_cpu = task_cpu(p);
if (get_nr_threads(p) <= 1 ||
- exceed_llc_capacity(mm, curr_cpu)) {
+ exceed_llc_capacity(mm, curr_cpu, p)) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
@@ -10159,8 +10171,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
* Skip cache aware load balance for single/too many threads
* or large memory RSS.
*/
- if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) ||
- exceed_llc_capacity(mm, dst_cpu)) {
+ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu, p) ||
+ exceed_llc_capacity(mm, dst_cpu, p)) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
return mig_unrestricted;
@@ -10602,6 +10614,16 @@ static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
+#ifdef CONFIG_SCHED_CACHE
+ if (p->mm) {
+ int pref_cpu = p->mm->sc_stat.cpu;
+
+ trace_sched_attach_task(p,
+ pref_cpu,
+ pref_cpu != -1 ? llc_id(pref_cpu) : -1,
+ cpu_of(rq), llc_id(cpu_of(rq)));
+ }
+#endif
WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
wakeup_preempt(rq, p, 0);
--
2.32.0