[Patch v4 12/22] sched/cache: Prioritize tasks preferring destination LLC during balancing
From: Tim Chen
Date: Wed Apr 01 2026 - 17:47:42 EST
During LLC load balancing, first check for tasks that prefer the
destination LLC and balance them to it before others.
Mark source sched groups containing tasks that prefer non-local LLCs
with the group_llc_balance flag. This ensures the load balancer later
pulls or pushes these tasks toward their preferred LLCs.
The priority of group_llc_balance is lower than that of group_overloaded
and higher than that of all other group types. This is because
group_llc_balance may exacerbate load imbalance, and if the LLC balancing
attempt fails, the nr_balance_failed mechanism will trigger other group
types to rebalance the load.
The load balancer selects the busiest sched_group and migrates tasks
to less busy groups to distribute load across CPUs.
With cache-aware scheduling enabled, the busiest sched_group is
the one with the most tasks preferring the destination LLC. If
the group has the group_llc_balance flag set, cache-aware load
balancing is triggered.
Introduce the helper function update_llc_busiest() to identify the
sched_group with the most tasks preferring the destination LLC.
Suggested-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
Suggested-by: Madadi Vineeth Reddy <vineethr@xxxxxxxxxxxxx>
Co-developed-by: Chen Yu <yu.c.chen@xxxxxxxxx>
Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx>
Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---
Notes:
v3->v4:
Add comments to explain LLC load balance priority.
(Madadi Vineeth Reddy)
kernel/sched/fair.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 78 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ed84086244c..c032eeebe191 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9747,6 +9747,16 @@ enum group_type {
* from balancing the load across the system.
*/
group_imbalanced,
+ /*
+ * There are tasks running on a non-preferred LLC, and it is possible to
+ * move them to their preferred LLC without creating too much imbalance.
+ * The priority of group_llc_balance is lower than that of
+ * group_overloaded and higher than that of all other group types.
+ * This is because group_llc_balance may exacerbate load imbalance.
+ * If the LLC balancing attempt fails, the nr_balance_failed
+ * mechanism will trigger other group types to rebalance the load.
+ */
+ group_llc_balance,
/*
* The CPU is overloaded and can't provide expected CPU cycles to all
* tasks.
@@ -10676,6 +10686,7 @@ struct sg_lb_stats {
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned int group_smt_balance; /* Task on busy SMT be moved */
+ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -10934,6 +10945,9 @@ group_type group_classify(unsigned int imbalance_pct,
if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
+ if (sgs->group_llc_balance)
+ return group_llc_balance;
+
if (sg_imbalanced(group))
return group_imbalanced;
@@ -11128,11 +11142,63 @@ static void record_sg_llc_stats(struct lb_env *env,
if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
}
+
+/*
+ * Do LLC balance on a sched group that contains an LLC and has tasks
+ * preferring to run on the idle dst_cpu's LLC.
+ */
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (!sched_cache_enabled())
+ return false;
+
+ if (env->sd->flags & SD_SHARE_LLC)
+ return false;
+
+ /*
+ * Skip cache aware tagging if nr_balance_failed is sufficiently high.
+ * Threshold of cache_nice_tries is set to 1 higher than nr_balance_failed
+ * to avoid excessive task migration at the same time.
+ */
+ if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1)
+ return false;
+
+ if (sgs->nr_pref_dst_llc &&
+ can_migrate_llc(cpumask_first(sched_group_span(group)),
+ env->dst_cpu, 0, true) == mig_llc)
+ return true;
+
+ return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *sgs)
+{
+ /*
+ * There are more tasks that want to run on dst_cpu's LLC.
+ */
+ return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc;
+}
#else
static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
struct sched_group *group)
{
}
+
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *sgs)
+{
+ return false;
+}
#endif
/**
@@ -11237,6 +11303,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/* Check for loaded SMT group to be balanced to dst CPU */
if (smt_balance(env, sgs, group))
sgs->group_smt_balance = 1;
+
+ /* Check if tasks in this group can be moved to their preferred LLC */
+ if (llc_balance(env, sgs, group))
+ sgs->group_llc_balance = 1;
}
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
@@ -11300,6 +11370,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
/* Select the overloaded group with highest avg_load. */
return sgs->avg_load > busiest->avg_load;
+ case group_llc_balance:
+ /* Select the group with most tasks preferring dst LLC */
+ return update_llc_busiest(env, busiest, sgs);
+
case group_imbalanced:
/*
* Select the 1st imbalanced group as we don't have any way to
@@ -11562,6 +11636,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
return false;
break;
+ case group_llc_balance:
case group_imbalanced:
case group_asym_packing:
case group_smt_balance:
@@ -11694,6 +11769,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
return NULL;
break;
+ case group_llc_balance:
case group_imbalanced:
case group_asym_packing:
case group_smt_balance:
@@ -12192,7 +12268,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
* group's child domain.
*/
if (sds.prefer_sibling && local->group_type == group_has_spare &&
- sibling_imbalance(env, &sds, busiest, local) > 1)
+ (busiest->group_type == group_llc_balance ||
+ sibling_imbalance(env, &sds, busiest, local) > 1))
goto force_balance;
if (busiest->group_type != group_overloaded) {
--
2.32.0