[PATCH v6] sched_ext: idle: Refresh idle masks during idle-to-idle transitions

From: Andrea Righi
Date: Fri Jan 10 2025 - 03:46:42 EST


Since put_prev_task() and set_next_task() were consolidated, see
commit 436f3eed5c69 ("sched: Combine the last put_prev_task() and the
first set_next_task()"), we now skip the transition between these two
functions when the previous and the next tasks are the same.

As a result, the scx idle state of a CPU is updated only when
transitioning to or from the idle thread. While this is generally
correct, it can lead to uneven and inefficient core utilization in
certain scenarios [1].

A typical scenario involves proactive wake-ups: scx_bpf_pick_idle_cpu()
selects and marks an idle CPU as busy, followed by a wake-up via
scx_bpf_kick_cpu(), without dispatching any tasks. In this case, the CPU
continues running the idle thread, returns to idle, but remains marked
as busy, preventing it from being selected again as an idle CPU (until a
task eventually runs on it and releases the CPU).

For example, running a workload that uses 20% of each CPU, combined with
an scx scheduler using proactive wake-ups, results in the following core
utilization:

CPU 0: 25.7%
CPU 1: 29.3%
CPU 2: 26.5%
CPU 3: 25.5%
CPU 4: 0.0%
CPU 5: 25.5%
CPU 6: 0.0%
CPU 7: 10.5%

To address this, refresh the idle state also in pick_task_idle(), during
idle-to-idle transitions, but only trigger ops.update_idle() on actual
state changes to prevent unnecessary updates to the scx scheduler and
maintain balanced state transitions.

With this change in place, the core utilization in the previous example
becomes the following:

CPU 0: 18.8%
CPU 1: 19.4%
CPU 2: 18.0%
CPU 3: 18.7%
CPU 4: 19.3%
CPU 5: 18.9%
CPU 6: 18.7%
CPU 7: 19.3%

[1] https://github.com/sched-ext/scx/pull/1139

Fixes: 7c65ae81ea86 ("sched_ext: Don't call put_prev_task_scx() before picking the next task")
Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
---
kernel/sched/ext.c | 49 +++++++++++++++++++++++++++++++++++++++++----
kernel/sched/ext.h | 8 ++++----
kernel/sched/idle.c | 5 +++--
3 files changed, 52 insertions(+), 10 deletions(-)

ChangeLog v5 -> v6:
- access rq->curr in the proper rcu way
- rephrase SCHED_IDLE -> idle thread
- consolidate most of the logic into kernel/sched/ext.c

ChangeLog v4 -> v5:
- prevent unbalanced ops.update_idle() invocations

ChangeLog v3 -> v4:
- handle the core-sched case that may ignore the result of
pick_task(), triggering spurious ops.update_idle() events

ChangeLog v2 -> v3:
- add a comment to clarify why we need to update the scx idle state in
pick_task()

ChangeLog v1 -> v2:
- move the logic from put_prev_set_next_task() to scx_update_idle()

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 96b6d6aea26e..ab276cefc50d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3651,16 +3651,57 @@ static void reset_idle_masks(void)
cpumask_copy(idle_masks.smt, cpu_online_mask);
}

-void __scx_update_idle(struct rq *rq, bool idle)
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
int cpu = cpu_of(rq);

- if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
- if (!static_branch_unlikely(&scx_builtin_idle_enabled))
+ /*
+ * The scx idle state is updated only when the current task
+ * transitions from/to the idle thread, see put_prev_task_idle()
+ * and set_next_task_idle().
+ *
+ * However, the CPU can also exit/enter the idle state while
+ * running the idle thread, for example waking up the CPU via
+ * scx_bpf_kick_cpu() without dispatching a task on it.
+ *
+ * In these cases, we still need to refresh the idle masks, but we
+ * don't need to trigger ops.update_idle(), since there is
+ * basically no idle state transition.
+ */
+ if (do_notify) {
+ if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+ } else {
+ bool is_prev_idle;
+
+ /* Refresh idle masks during idle-to-idle transitions */
+ rcu_read_lock();
+ is_prev_idle = is_idle_task(rcu_dereference(rq->curr));
+ rcu_read_unlock();
+
+ if (!is_prev_idle)
return;
}

+ if (!static_branch_likely(&scx_builtin_idle_enabled))
+ return;
+
assign_cpu(cpu, idle_masks.cpu, idle);

#ifdef CONFIG_SCHED_SMT
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index b1675bb59fc4..4d022d17ac7d 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */

#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
-void __scx_update_idle(struct rq *rq, bool idle);
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);

-static inline void scx_update_idle(struct rq *rq, bool idle)
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
if (scx_enabled())
- __scx_update_idle(rq, idle);
+ __scx_update_idle(rq, idle, do_notify);
}
#else
-static inline void scx_update_idle(struct rq *rq, bool idle) {}
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
#endif

#ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 621696269584..2c85c86b455f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
dl_server_update_idle_time(rq, prev);
- scx_update_idle(rq, false);
+ scx_update_idle(rq, false, true);
}

static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
- scx_update_idle(rq, true);
+ scx_update_idle(rq, true, true);
schedstat_inc(rq->sched_goidle);
next->se.exec_start = rq_clock_task(rq);
}

struct task_struct *pick_task_idle(struct rq *rq)
{
+ scx_update_idle(rq, true, false);
return rq->idle;
}

--
2.47.1