[PATCH v2 06/23] sched/cache: Track LLC-preferred tasks per runqueue

From: Tim Chen

Date: Wed Dec 03 2025 - 18:01:27 EST


For each runqueue, track the number of tasks with an LLC preference
and how many of them are running on their preferred LLC. This mirrors
nr_numa_running and nr_preferred_running for NUMA balancing, and will
be used by cache-aware load balancing in later patches.

Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---

Notes:
v1->v2:
- Invoke task_of() once and reuse its result afterwards. (Peter Zijlstra)
- Remove the hacky reset_llc_stats() and introduce the sched_llc_active flag
  to properly pair enqueue/dequeue statistics updates. (Peter Zijlstra, K Prateek Nayak)

include/linux/sched.h | 2 ++
init/init_task.c | 1 +
kernel/sched/core.c | 5 ++++
kernel/sched/fair.c | 60 ++++++++++++++++++++++++++++++++++++++++---
kernel/sched/sched.h | 6 +++++
5 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1ad46220cd04..466ba8b7398c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1408,6 +1408,8 @@ struct task_struct {

#ifdef CONFIG_SCHED_CACHE
struct callback_head cache_work;
+ /* p is currently counted in a runqueue's LLC-preference stats */
+ bool sched_llc_active;
int preferred_llc;
#endif

diff --git a/init/init_task.c b/init/init_task.c
index 44bae72b5b7d..ee78837b0aa2 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -192,6 +192,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.numa_faults = NULL,
#endif
#ifdef CONFIG_SCHED_CACHE
+ .sched_llc_active = false,
.preferred_llc = -1,
#endif
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8bdf03a4b7f..48626c81ba8e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -531,6 +531,11 @@ void __trace_set_current_state(int state_value)
}
EXPORT_SYMBOL(__trace_set_current_state);

+int task_llc(const struct task_struct *p)
+{
+ return per_cpu(sd_llc_id, task_cpu(p)); /* sd_llc_id of the CPU given by task_cpu(p) */
+}
+
/*
* Serialization rules:
*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 10cec83f65d5..d46a70a9d9fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1223,6 +1223,43 @@ static int llc_id(int cpu)
return llc;
}

+static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
+{
+ int pref_llc;
+
+ if (!sched_cache_enabled())
+ return;
+ /* A negative preferred_llc keeps p out of the rq's LLC statistics. */
+ pref_llc = p->preferred_llc;
+ if (pref_llc < 0)
+ return;
+ /* Count p, and flag it so account_llc_dequeue() knows to undo this. */
+ rq->nr_llc_running++;
+ rq->nr_pref_llc_running += (pref_llc == task_llc(p));
+ p->sched_llc_active = true;
+}
+
+static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
+{
+ int pref_llc;
+
+ /*
+ * Same idea as uc_se->active in uclamp_rq_inc_id() and
+ * uclamp_rq_dec_id(): only undo accounting that was really done
+ * by account_llc_enqueue(), so the rq statistics stay balanced.
+ */
+ if (unlikely(!p->sched_llc_active))
+ return;
+ /* Negative preferred_llc: return with sched_llc_active left unchanged. */
+ pref_llc = p->preferred_llc;
+ if (pref_llc < 0)
+ return;
+ /* Reverse account_llc_enqueue() and mark p as no longer counted. */
+ rq->nr_llc_running--;
+ rq->nr_pref_llc_running -= (pref_llc == task_llc(p));
+ p->sched_llc_active = false;
+}
+
void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
{
unsigned long epoch;
@@ -1294,6 +1331,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch
return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
}

+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p);
+
static inline
void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
{
@@ -1346,8 +1385,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
#endif
}

- if (p->preferred_llc != mm_sched_llc)
+ /* A task not on the rq is accounted later, in account_entity_enqueue(). */
+ if (task_running_on_cpu(rq->cpu, p) &&
+ p->preferred_llc != mm_sched_llc) {
+ account_llc_dequeue(rq, p);
p->preferred_llc = mm_sched_llc;
+ account_llc_enqueue(rq, p);
+ }
}

static void task_tick_cache(struct rq *rq, struct task_struct *p)
@@ -1475,6 +1519,10 @@ void init_sched_mm(struct task_struct *p) { }

static void task_tick_cache(struct rq *rq, struct task_struct *p) { }

+static void account_llc_enqueue(struct rq *rq, struct task_struct *p) { /* no-op stub */ }
+
+static void account_llc_dequeue(struct rq *rq, struct task_struct *p) { /* no-op stub */ }
+
#endif

/*
@@ -3965,9 +4013,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
struct rq *rq = rq_of(cfs_rq);

- account_numa_enqueue(rq, task_of(se));
+ account_numa_enqueue(rq, p);
+ account_llc_enqueue(rq, p);
list_add(&se->group_node, &rq->cfs_tasks);
}
cfs_rq->nr_queued++;
@@ -3978,7 +4028,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (entity_is_task(se)) {
- account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ struct task_struct *p = task_of(se);
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_dequeue(rq, p);
+ account_llc_dequeue(rq, p);
list_del_init(&se->group_node);
}
cfs_rq->nr_queued--;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 728737641847..ee8b70647835 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1126,6 +1126,10 @@ struct rq {
unsigned int nr_preferred_running;
unsigned int numa_migrate_on;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ unsigned int nr_pref_llc_running;
+ unsigned int nr_llc_running;
+#endif
#ifdef CONFIG_NO_HZ_COMMON
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
@@ -1980,6 +1984,8 @@ init_numa_balancing(u64 clone_flags, struct task_struct *p)

#endif /* !CONFIG_NUMA_BALANCING */

+int task_llc(const struct task_struct *p);
+
static inline void
queue_balance_callback(struct rq *rq,
struct balance_callback *head,
--
2.32.0