[PATCH 2/2] sched: Remove sched_class::balance()
From: Peter Zijlstra
Date: Wed Jun 24 2026 - 08:30:50 EST
Ever since commit 50653216e4ff ("sched: Add support to pick functions to take
rf"), we have the unfortunate situation that both sched_class::balance() and
sched_class::pick_task() have overlapping functionality in that they drop
rq->lock and balance tasks.
Additionally, prev_balance() is only called for a single RQ in the core-sched
case, resulting in 'missed' balance opportunities in this case.
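The only classes whose balance callback actually balances anything are dl and
rt (balance_stop() merely reports whether a stop task is runnable and
balance_idle() just WARNs). prev_balance() runs the callbacks from prev->class
down, whereas pick_next_task() runs the callbacks from stop_class down.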
Therefore, the only case where the two differ is when prev->class ==
rt_sched_class and pick_next_task() stops at stop/dl. But in those cases the rt
pull would have been pointless: it would move a high-priority task to a
runqueue that will not be able to run it.
A subsequent pick that does reach rt must have a prev of stop/dl priority
(0 and -1, respectively); this ensures need_pull_rt_task() is true, so the
pull is done then.
Therefore, move balance_{rt,dl}() into pick_task_{rt,dl}() and remove
sched_class::balance().
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
kernel/sched/core.c | 23 -----------------------
kernel/sched/deadline.c | 10 ++++++----
kernel/sched/fair.c | 38 ++++++++++----------------------------
kernel/sched/idle.c | 7 -------
kernel/sched/rt.c | 14 ++++++--------
kernel/sched/sched.h | 5 -----
kernel/sched/stop_task.c | 7 -------
7 files changed, 22 insertions(+), 82 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6092,25 +6092,6 @@ static inline void schedule_debug(struct
schedstat_inc(this_rq()->sched_count);
}
-static void prev_balance(struct rq *rq, struct rq_flags *rf)
-{
- const struct sched_class *start_class = rq->donor->sched_class;
- const struct sched_class *class;
-
- /*
- * We must do the balancing pass before put_prev_task(), such
- * that when we release the rq->lock the task is in the same
- * state as before we took rq->lock.
- *
- * We can terminate the balance pass as soon as we know there is
- * a runnable task of @class priority or higher.
- */
- for_active_class_range(class, start_class, &idle_sched_class) {
- if (class->balance && class->balance(rq, rf))
- break;
- }
-}
-
/*
* Pick up the highest-prio task:
*/
@@ -6148,8 +6129,6 @@ __pick_next_task(struct rq *rq, struct r
}
restart:
- prev_balance(rq, rf);
-
for_each_active_class(class) {
p = class->pick_task(rq, rf);
if (unlikely(p == RETRY_TASK))
@@ -6257,8 +6236,6 @@ pick_next_task(struct rq *rq, struct rq_
goto out_set_next;
}
- prev_balance(rq, rf);
-
smt_mask = cpu_smt_mask(cpu);
need_sync = !!rq->core->core_cookie;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2704,7 +2704,7 @@ static void check_preempt_equal_dl(struc
resched_curr(rq);
}
-static int balance_dl(struct rq *rq, struct rq_flags *rf)
+static void balance_dl(struct rq *rq, struct rq_flags *rf)
{
/*
* Note, rq->donor may change during rq lock drops,
@@ -2723,8 +2723,6 @@ static int balance_dl(struct rq *rq, str
pull_dl_task(rq);
rq_repin_lock(rq, rf);
}
-
- return sched_stop_runnable(rq) || sched_dl_runnable(rq);
}
/*
@@ -2817,6 +2815,11 @@ static struct task_struct *__pick_task_d
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
+ rq_modified_begin(rq, &dl_sched_class);
+ balance_dl(rq, rf);
+ if (rq_modified_above(rq, &dl_sched_class))
+ return RETRY_TASK;
+
again:
if (!sched_dl_runnable(rq))
return NULL;
@@ -3652,7 +3655,6 @@ DEFINE_SCHED_CLASS(dl) = {
.put_prev_task = put_prev_task_dl,
.set_next_task = set_next_task_dl,
- .balance = balance_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5732,7 +5732,7 @@ static inline unsigned long cfs_rq_load_
return cfs_rq->avg.load_avg;
}
-static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
+static void sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
__must_hold(__rq_lockp(this_rq));
static inline unsigned long task_util(struct task_struct *p)
@@ -9916,7 +9916,6 @@ struct task_struct *pick_task_fair(struc
struct cfs_rq *cfs_rq;
struct task_struct *p;
bool throttled;
- int new_tasks;
again:
cfs_rq = &rq->cfs;
@@ -9942,11 +9941,14 @@ struct task_struct *pick_task_fair(struc
return p;
idle:
- new_tasks = sched_balance_newidle(rq, rf);
- if (new_tasks < 0)
+ rq_modified_begin(rq, &fair_sched_class);
+ sched_balance_newidle(rq, rf);
+ if (rq_modified_above(rq, &fair_sched_class))
return RETRY_TASK;
- if (new_tasks > 0)
+
+ if (cfs_rq->nr_queued)
goto again;
+
return NULL;
}
@@ -14334,13 +14336,8 @@ static inline void nohz_newidle_balance(
/*
* sched_balance_newidle is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
- *
- * Returns:
- * < 0 - we released the lock and there are !fair tasks present
- * 0 - failed, no new tasks
- * > 0 - success, new (fair) tasks present
*/
-static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
+static void sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
__must_hold(__rq_lockp(this_rq))
{
unsigned long next_balance = jiffies + HZ;
@@ -14357,7 +14354,7 @@ static int sched_balance_newidle(struct
* Return 0; the task will be enqueued when switching to idle.
*/
if (this_rq->ttwu_pending)
- return 0;
+ return;
/*
* We must set idle_stamp _before_ calling sched_balance_rq()
@@ -14370,7 +14367,7 @@ static int sched_balance_newidle(struct
* Do not pull tasks towards !active CPUs...
*/
if (!cpu_active(this_cpu))
- return 0;
+ return;
/*
* This is OK, because current is on_cpu, which avoids it being picked
@@ -14399,7 +14396,6 @@ static int sched_balance_newidle(struct
t0 = sched_clock_cpu(this_cpu);
__sched_balance_update_blocked_averages(this_rq);
- rq_modified_begin(this_rq, &fair_sched_class);
raw_spin_rq_unlock(this_rq);
for_each_domain(this_cpu, sd) {
@@ -14457,18 +14453,6 @@ static int sched_balance_newidle(struct
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
- /*
- * While browsing the domains, we released the rq lock, a task could
- * have been enqueued in the meantime. Since we're not going idle,
- * pretend we pulled a task.
- */
- if (this_rq->cfs.h_nr_queued && !pulled_task)
- pulled_task = 1;
-
- /* If a higher prio class was modified, restart the pick */
- if (rq_modified_above(this_rq, &fair_sched_class))
- pulled_task = -1;
-
out:
/* Move the next balance forward */
if (time_after(this_rq->next_balance, next_balance))
@@ -14480,8 +14464,6 @@ static int sched_balance_newidle(struct
nohz_newidle_balance(this_rq);
rq_repin_lock(this_rq, rf);
-
- return pulled_task;
}
/*
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -464,12 +464,6 @@ select_task_rq_idle(struct task_struct *
return task_cpu(p); /* IDLE tasks as never migrated */
}
-static int
-balance_idle(struct rq *rq, struct rq_flags *rf)
-{
- return WARN_ON_ONCE(1);
-}
-
/*
* Idle tasks are unconditionally rescheduled:
*/
@@ -581,7 +575,6 @@ DEFINE_SCHED_CLASS(idle) = {
.put_prev_task = put_prev_task_idle,
.set_next_task = set_next_task_idle,
- .balance = balance_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1596,7 +1596,7 @@ static void check_preempt_equal_prio(str
resched_curr(rq);
}
-static int balance_rt(struct rq *rq, struct rq_flags *rf)
+static void balance_rt(struct rq *rq, struct rq_flags *rf)
{
/*
* Note, rq->donor may change during rq lock drops,
@@ -1615,8 +1615,6 @@ static int balance_rt(struct rq *rq, str
pull_rt_task(rq);
rq_repin_lock(rq, rf);
}
-
- return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
/*
@@ -1714,14 +1712,15 @@ static struct task_struct *_pick_next_ta
static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf)
{
- struct task_struct *p;
+ rq_modified_begin(rq, &rt_sched_class);
+ balance_rt(rq, rf);
+ if (rq_modified_above(rq, &rt_sched_class))
+ return RETRY_TASK;
if (!sched_rt_runnable(rq))
return NULL;
- p = _pick_next_task_rt(rq);
-
- return p;
+ return _pick_next_task_rt(rq);
}
static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next)
@@ -2609,7 +2608,6 @@ DEFINE_SCHED_CLASS(rt) = {
.put_prev_task = put_prev_task_rt,
.set_next_task = set_next_task_rt,
- .balance = balance_rt,
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2638,11 +2638,6 @@ struct sched_class {
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
/*
- * schedule/pick_next_task/prev_balance: rq->lock
- */
- int (*balance)(struct rq *rq, struct rq_flags *rf);
-
- /*
* schedule/pick_next_task: rq->lock
*/
struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -15,12 +15,6 @@ select_task_rq_stop(struct task_struct *
return task_cpu(p); /* stop tasks as never migrate */
}
-static int
-balance_stop(struct rq *rq, struct rq_flags *rf)
-{
- return sched_stop_runnable(rq);
-}
-
static void
wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
{
@@ -107,7 +101,6 @@ DEFINE_SCHED_CLASS(stop) = {
.put_prev_task = put_prev_task_stop,
.set_next_task = set_next_task_stop,
- .balance = balance_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,