[RFC PATCH 4/8] sched/fair: Pick optimal src rq/group using affinity promotion metric
From: Jianyong Wu
Date: Wed Jun 24 2026 - 23:10:18 EST
The current source group/runqueue selection logic only considers LLC
preference and ignores potential NUMA affinity improvements.
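Use the NUMA affinity gain calculation introduced in the previous
patch to pick the optimal source scheduling group and runqueue during
load balancing: update_sg_lb_stats() and sched_balance_find_src_rq()
now rank candidates by cal_affinity_score() instead of the raw
llc_counts[dst_llc] value, and llc_balance() calls can_migrate_node()
instead of can_migrate_llc(). Since both helpers now have callers,
drop their __maybe_unused annotations.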
Signed-off-by: Jianyong Wu <wujianyong@xxxxxxxx>
---
kernel/sched/fair.c | 37 +++++++++++++++++++++++++++++--------
1 file changed, 29 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da6e2b5e6306..9141e6c8eba8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10613,7 +10613,7 @@ static bool is_domain_overload(struct sched_domain *sd)
* The node here is a generic conception for a set of cpu.
* It usually indicates one of sched domain for LLC level and above.
*/
-static enum llc_mig __maybe_unused can_migrate_node(int src_cpu, int dst_cpu,
+static enum llc_mig can_migrate_node(int src_cpu, int dst_cpu,
struct task_struct *p, bool to_pref)
{
struct sched_domain *domain;
@@ -11852,8 +11852,8 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
return false;
if (sgs->nr_pref_dst_llc &&
- can_migrate_llc(cpumask_first(sched_group_span(group)),
- env->dst_cpu, 0, true) == mig_llc)
+ can_migrate_node(cpumask_first(sched_group_span(group)),
+ env->dst_cpu, NULL, true) == mig_llc)
return true;
return false;
@@ -11922,7 +11922,7 @@ static int get_affi_llcs(int src_llc, int dst_llc, int *affi_llcs, int *dist)
* Rt_i is the number of tasks on the rq with LLCi as their preferred LLC,
* obtainable from rq->sd->pf.
*/
-static int __maybe_unused cal_affinity_score(struct rq *rq, int src_cpu, int dst_llc,
+static int cal_affinity_score(struct rq *rq, int src_cpu, int dst_llc,
int *affi_llcs, int *dist, int *last_llc, int *num)
{
struct sched_domain *sd_tmp = rcu_dereference(rq->sd);
@@ -11980,6 +11980,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
{
int i, nr_running, local_group, sd_flags = env->sd->flags;
bool balancing_at_rd = !env->sd->parent;
+#ifdef CONFIG_SCHED_CACHE
+ int last_llc = -1, llc_num;
+ int *cache_llc = kmalloc_array(max_lid + 1, sizeof(int), GFP_NOWAIT);
+ int *dist = kmalloc_array(max_lid + 1, sizeof(int), GFP_NOWAIT);
+#endif
memset(sgs, 0, sizeof(*sgs));
@@ -12009,7 +12014,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (llc_id(i) != dst_llc) {
sd_tmp = rcu_dereference_all(rq->sd);
if (sd_tmp && (unsigned int)dst_llc < sd_tmp->llc_max)
- sgs->nr_pref_dst_llc += sd_tmp->llc_counts[dst_llc];
+ sgs->nr_pref_dst_llc += cal_affinity_score(rq, i,
+ dst_llc, cache_llc, dist, &last_llc, &llc_num);
}
}
#endif
@@ -12050,6 +12056,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
}
}
+#ifdef CONFIG_SCHED_CACHE
+ kfree(dist);
+ kfree(cache_llc);
+#endif
sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
@@ -13107,9 +13117,15 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
unsigned int __maybe_unused busiest_pref_llc = 0;
struct sched_domain __maybe_unused *sd_tmp;
- unsigned int busiest_nr = 0;
int __maybe_unused dst_llc;
- int i;
+ int __maybe_unused *cache_llc, __maybe_unused *dist;
+ int __maybe_unused last_llc = -1, __maybe_unused llc_num, i;
+ unsigned int busiest_nr = 0;
+
+#ifdef CONFIG_SCHED_CACHE
+ cache_llc = kmalloc_array(max_lid + 1, sizeof(int), GFP_NOWAIT);
+ dist = kmalloc_array(max_lid + 1, sizeof(int), GFP_NOWAIT);
+#endif
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
unsigned long capacity, load, util;
@@ -13243,7 +13259,8 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
if (sd_tmp && (unsigned)dst_llc < sd_tmp->llc_max) {
unsigned int this_pref_llc =
- sd_tmp->llc_counts[dst_llc];
+ cal_affinity_score(rq, i, dst_llc,
+ cache_llc, dist, &last_llc, &llc_num);
if (busiest_pref_llc < this_pref_llc) {
busiest_pref_llc = this_pref_llc;
@@ -13256,6 +13273,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
}
}
+#ifdef CONFIG_SCHED_CACHE
+ kfree(cache_llc);
+ kfree(dist);
+#endif
return busiest;
}
--
2.34.1