Re: [PATCH v5 0/2] sched/fair: Optimize some active balance logic

From: Aiqun(Maria) Yu

Date: Mon Jun 22 2026 - 08:15:57 EST

On 6/18/2026 6:56 PM, Peter Zijlstra wrote:
>
>
> And since I've been staring at this code far too long, I accidentally
> did the below cleanup on top.
>
>
> ---
> Subject: sched/fair: Reflow sched_balance_rq()
> From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Date: Thu Jun 18 10:51:49 CEST 2026
>
> Reflow to reduce indenting.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> ---
> kernel/sched/fair.c | 136 ++++++++++++++++++++++++---------------------------
> kernel/sched/sched.h | 19 ++++++-
> 2 files changed, 82 insertions(+), 73 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -13437,82 +13437,78 @@ static int sched_balance_rq(int this_cpu
> }
> }
>
> - if (!ld_moved) {
> - schedstat_inc(sd->lb_failed[idle]);
> + if (ld_moved) {
> + sd->nr_balance_failed = 0;
> + goto out_unbalanced;
> + }
> +
> + schedstat_inc(sd->lb_failed[idle]);
> + /*
> + * Increment the failure counter only on periodic balance.
> + * We do not want newidle balance, which can be very
> + * frequent, pollute the failure counter causing
> + * excessive cache_hot migrations and active balances.
> + *
> + * Similarly for migration_misfit which is not related to
> + * load/util migration, don't pollute nr_balance_failed.
> + *
> + * The same for cache aware scheduling's allowance for
> + * load imbalance. If regular load balance does not
> + * migrate task due to LLC locality, it is a expected
> + * behavior and don't pollute nr_balance_failed.
> + * See can_migrate_task().
> + */
> + if (idle != CPU_NEWLY_IDLE &&
> + env.migration_type != migrate_misfit &&
> + !(env.flags & LBF_LLC_PINNED))
> + sd->nr_balance_failed++;
> +
> + if (!need_active_balance(&env))
> + goto out_unbalanced;
> +
> + scoped_guard (raw_spin_rq_lock_irqsave, busiest) {
> /*
> - * Increment the failure counter only on periodic balance.
> - * We do not want newidle balance, which can be very
> - * frequent, pollute the failure counter causing
> - * excessive cache_hot migrations and active balances.
> - *
> - * Similarly for migration_misfit which is not related to
> - * load/util migration, don't pollute nr_balance_failed.
> - *
> - * The same for cache aware scheduling's allowance for
> - * load imbalance. If regular load balance does not
> - * migrate task due to LLC locality, it is a expected
> - * behavior and don't pollute nr_balance_failed.
> - * See can_migrate_task().
> + * Don't kick the active_load_balance_cpu_stop,
> + * if the curr task on busiest CPU can't be
> + * moved to this_cpu:
> */
> - if (idle != CPU_NEWLY_IDLE &&
> - env.migration_type != migrate_misfit &&
> - !(env.flags & LBF_LLC_PINNED))
> - sd->nr_balance_failed++;
> -
> - if (need_active_balance(&env)) {
> - unsigned long flags;
> -
> - raw_spin_rq_lock_irqsave(busiest, flags);
> -
> - /*
> - * Don't kick the active_load_balance_cpu_stop,
> - * if the curr task on busiest CPU can't be
> - * moved to this_cpu:
> - */
> - if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
> - raw_spin_rq_unlock_irqrestore(busiest, flags);
> - goto out_one_pinned;
> - }
> -
> - /* Record that we found at least one task that could run on this_cpu */
> - env.flags &= ~LBF_ALL_PINNED;
> -
> - /*
> - * ->active_balance synchronizes accesses to
> - * ->active_balance_work. Once set, it's cleared
> - * only after active load balance is finished.
> - */
> - if (busiest->active_balance)
> - goto no_active_balance;
> -
> - /*
> - * @busiest dropped its rq_lock in the middle of
> - * scheduling out its ->curr task (->on_rq := 0), no
> - * need to forcefully punt it away with active balance.
> - */
> - if (!busiest->curr->on_rq)
> - goto no_active_balance;
> -
> - busiest->active_balance = 1;
> - busiest->push_cpu = this_cpu;
> - active_balance = 1;
> -no_active_balance:
> - preempt_disable();
> - raw_spin_rq_unlock_irqrestore(busiest, flags);
> - if (active_balance) {
> - stop_one_cpu_nowait(cpu_of(busiest),
> - active_load_balance_cpu_stop, busiest,
> - &busiest->active_balance_work);
> - }
> - preempt_enable();
> - }
> - } else {
> - sd->nr_balance_failed = 0;
> + if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr))
> + goto out_one_pinned;
> +
> + /* Record that we found at least one task that could run on this_cpu */
> + env.flags &= ~LBF_ALL_PINNED;
> +
> + /*
> + * ->active_balance synchronizes accesses to
> + * ->active_balance_work. Once set, it's cleared
> + * only after active load balance is finished.
> + */
> + if (busiest->active_balance)
> + goto out_unbalanced;
> +
> + /*
> + * @busiest dropped its rq_lock in the middle of
> + * scheduling out its ->curr task (->on_rq := 0), no
> + * need to forcefully punt it away with active balance.
> + */
> + if (!busiest->curr->on_rq)

This looks better than the previous patch.

While at it, should we check busiest->cfs.h_nr_queued instead of
!busiest->curr->on_rq?

It is possible that busiest->curr is an RT/DL task and is still on_rq
while busiest->cfs.h_nr_queued == 0 and list_empty(&busiest->cfs_tasks)
may both be true, meaning there is no CFS task available for active
balancing.

This series has been moving quickly, and I only got back from the Dragon
Boat Festival recently. I should have raised this earlier, but I am
commenting here since this is now the clean top patch.

> + goto out_unbalanced;
> +
> + busiest->active_balance = 1;
> + busiest->push_cpu = this_cpu;
> + active_balance = 1;
> + preempt_disable();
> }
> + if (active_balance) {
> + stop_one_cpu_nowait(cpu_of(busiest),
> + active_load_balance_cpu_stop, busiest,
> + &busiest->active_balance_work);
> + }
> + preempt_enable();
>
> +out_unbalanced:
> /* We were unbalanced, so reset the balancing interval */
> sd->balance_interval = sd->min_interval;
> -
> goto out;
>
> out_balanced:
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2018,7 +2018,8 @@ DEFINE_LOCK_GUARD_1(rq_lock, struct rq,
> rq_unlock(_T->lock, &_T->rf),
> struct rq_flags rf)
>
> -DECLARE_LOCK_GUARD_1_ATTRS(rq_lock, __acquires(__rq_lockp(_T)), __releases(__rq_lockp(*(struct rq **)_T)));
> +DECLARE_LOCK_GUARD_1_ATTRS(rq_lock, __acquires(__rq_lockp(_T)),
> + __releases(__rq_lockp(*(struct rq **)_T)));
> #define class_rq_lock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rq_lock, _T)
>
> DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq,
> @@ -2026,7 +2027,8 @@ DEFINE_LOCK_GUARD_1(rq_lock_irq, struct
> rq_unlock_irq(_T->lock, &_T->rf),
> struct rq_flags rf)
>
> -DECLARE_LOCK_GUARD_1_ATTRS(rq_lock_irq, __acquires(__rq_lockp(_T)), __releases(__rq_lockp(*(struct rq **)_T)));
> +DECLARE_LOCK_GUARD_1_ATTRS(rq_lock_irq, __acquires(__rq_lockp(_T)),
> + __releases(__rq_lockp(*(struct rq **)_T)));
> #define class_rq_lock_irq_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rq_lock_irq, _T)
>
> DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq,
> @@ -2034,9 +2036,20 @@ DEFINE_LOCK_GUARD_1(rq_lock_irqsave, str
> rq_unlock_irqrestore(_T->lock, &_T->rf),
> struct rq_flags rf)
>
> -DECLARE_LOCK_GUARD_1_ATTRS(rq_lock_irqsave, __acquires(__rq_lockp(_T)), __releases(__rq_lockp(*(struct rq **)_T)));
> +DECLARE_LOCK_GUARD_1_ATTRS(rq_lock_irqsave, __acquires(__rq_lockp(_T)),
> + __releases(__rq_lockp(*(struct rq **)_T)));
> #define class_rq_lock_irqsave_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rq_lock_irqsave, _T)
>
> +DEFINE_LOCK_GUARD_1(raw_spin_rq_lock_irqsave, struct rq,
> + raw_spin_rq_lock_irqsave(_T->lock, _T->flags),
> + raw_spin_rq_unlock_irqrestore(_T->lock, _T->flags),
> + unsigned long flags)
> +
> +DECLARE_LOCK_GUARD_1_ATTRS(raw_spin_rq_lock_irqsave, __acquires(__rq_lockp(_T)),
> + __releases(__rq_lockp(*(struct rq **)_T)));
> +#define class_raw_spin_rq_lock_irqsave_constructor(_T) \
> + WITH_LOCK_GUARD_1_ATTRS(raw_spin_rq_lock_irqsave, _T)
> +
> #define this_rq_lock_irq(...) __acquire_ret(_this_rq_lock_irq(__VA_ARGS__), __rq_lockp(__ret))
> static inline struct rq *_this_rq_lock_irq(struct rq_flags *rf) __acquires_ret
> {

--
Thx and BRs,
Aiqun(Maria) Yu