[PATCH] sched/fair: Skip SCHED_IDLE rq for SCHED_IDLE task
From: Christian Loehle
Date: Mon Feb 02 2026 - 10:35:54 EST
CPUs whose rq only has SCHED_IDLE tasks running are preferred over
true idle CPUs in many cases. This is because they are guaranteed
not to be in an idle state and might even be in a higher P-state.
This reasoning is based on the assumption that the task (e.g. wakee)
gets to run there immediately and isn't sharing the rq.
This, however, isn't true if the task itself has SCHED_IDLE policy; in
that case we are better off continuing to look for a true idle CPU.
On an Intel Xeon 2-socket system with 64 logical cores in total, this
yields the following results for kernel compilation using SCHED_IDLE:
+---------+----------------------+----------------------+--------+
| workers | mainline (seconds) | patch (seconds) | delta% |
+=========+======================+======================+========+
| 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
| 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
| 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
| 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
| 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
| 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
| 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
| 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
+---------+----------------------+----------------------+--------+
Signed-off-by: Christian Loehle <christian.loehle@xxxxxxx>
---
kernel/sched/fair.c | 28 +++++++++++++++-------------
1 file changed, 15 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3eaeceda71b0..b29fa04958f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6832,9 +6832,10 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-static int sched_idle_cpu(int cpu)
+static int choose_idle_cpu(int cpu, struct task_struct *p)
{
- return sched_idle_rq(cpu_rq(cpu));
+ return available_idle_cpu(cpu) ||
+ (sched_idle_rq(cpu_rq(cpu)) && !task_has_idle_policy(p));
}
static void
@@ -7400,7 +7401,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
if (!sched_core_cookie_match(rq, p))
continue;
- if (sched_idle_cpu(i))
+ if (sched_idle_rq(rq) && !task_has_idle_policy(p))
return i;
if (available_idle_cpu(i)) {
@@ -7491,8 +7492,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
- if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
- sched_cpu_cookie_match(cpu_rq(cpu), p))
+ if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
@@ -7565,7 +7565,9 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
- if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
+ if (sched_idle_rq(cpu_rq(cpu)) &&
+ !task_has_idle_policy(p) &&
+ cpumask_test_cpu(cpu, cpus)) {
*idle_cpu = cpu;
break;
}
@@ -7600,7 +7602,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
*/
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ if (choose_idle_cpu(cpu, p))
return cpu;
}
@@ -7722,7 +7724,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
- if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ if (!choose_idle_cpu(cpu, p))
continue;
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
@@ -7793,7 +7795,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
lockdep_assert_irqs_disabled();
- if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ if (choose_idle_cpu(target, p) &&
asym_fits_cpu(task_util, util_min, util_max, target))
return target;
@@ -7801,7 +7803,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ choose_idle_cpu(prev, p) &&
asym_fits_cpu(task_util, util_min, util_max, prev)) {
if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -7833,7 +7835,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+ choose_idle_cpu(recent_used_cpu, p) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
@@ -12261,7 +12263,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
- int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -12299,7 +12301,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu);
- busy = !idle && !sched_idle_cpu(cpu);
+ busy = !idle && !sched_idle_rq(rq);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
--
2.34.1