[PATCH 5/6] sched_ext: Allow SCX_DSQ_LOCAL_ON for direct dispatches

From: Tejun Heo
Date: Wed Jul 10 2024 - 21:16:44 EST


In ops.dispatch(), SCX_DSQ_LOCAL_ON can be used to dispatch the task to the
local DSQ of any CPU. However, during direct dispatch from ops.select_cpu()
and ops.enqueue(), this isn't allowed. This is because dispatching to the
local DSQ of a remote CPU requires locking both the task's current and new
rq's and such double locking can't be done directly from ops.enqueue().

While waking up a task, as ops.select_cpu() can pick any CPU and both
ops.select_cpu() and ops.enqueue() can use SCX_DSQ_LOCAL as the dispatch
target to dispatch to the DSQ of the picked CPU, the BPF scheduler can still
do whatever it wants to do. However, while a task is being enqueued for a
different reason, e.g. after its slice expiration, only ops.enqueue() is
called and there's no way for the BPF scheduler to directly dispatch to the
local DSQ of a remote CPU. This gap in API forces schedulers into
work-arounds which are not straightforward or optimal such as skipping
direct dispatches in such cases.

Implement deferred enqueueing to allow directly dispatching to the local DSQ
of a remote CPU from ops.select_cpu() and ops.enqueue(). Such tasks are
temporarily queued on rq->scx.ddsp_deferred_locals. When the rq lock can be
safely released, the tasks are taken off the list and queued on the target
local DSQs using dispatch_to_local_dsq().

v2: - Add missing return after queue_balance_callback() in
schedule_deferred(). (David).

- dispatch_to_local_dsq() now assumes that @rq is locked but unpinned
and thus no longer takes @rf. Updated accordingly.

- UP build warning fix.

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
Tested-by: Andrea Righi <righi.andrea@xxxxxxxxx>
Cc: David Vernet <void@xxxxxxxxxxxxx>
Cc: Dan Schatzberg <schatzberg.dan@xxxxxxxxx>
Cc: Changwoo Min <changwoo@xxxxxxxxxx>
---
kernel/sched/ext.c | 168 ++++++++++++++++++++++++++++++++++++++-----
kernel/sched/sched.h | 3 +
2 files changed, 153 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 57d6ea65f857..03da2cecb547 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -888,6 +888,7 @@ static struct kobject *scx_root_kobj;
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>

+static void process_ddsp_deferred_locals(struct rq *rq);
static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
s64 exit_code,
@@ -1362,6 +1363,67 @@ static int ops_sanitize_err(const char *ops_name, s32 err)
return -EPROTO;
}

+static void run_deferred(struct rq *rq)
+{
+ process_ddsp_deferred_locals(rq);
+}
+
+#ifdef CONFIG_SMP
+static void deferred_bal_cb_workfn(struct rq *rq)
+{
+ run_deferred(rq);
+}
+#endif
+
+static void deferred_irq_workfn(struct irq_work *irq_work)
+{
+ struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);
+
+ raw_spin_rq_lock(rq);
+ run_deferred(rq);
+ raw_spin_rq_unlock(rq);
+}
+
+/**
+ * schedule_deferred - Schedule execution of deferred actions on an rq
+ * @rq: target rq
+ *
+ * Schedule execution of deferred actions on @rq. Must be called with @rq
+ * locked. Deferred actions are executed with @rq locked but unpinned, and thus
+ * can unlock @rq to e.g. migrate tasks to other rqs.
+ */
+static void schedule_deferred(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SMP
+ /*
+ * If in the middle of waking up a task, task_woken_scx() will be called
+ * afterwards which will then run the deferred actions, no need to
+ * schedule anything.
+ */
+ if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
+ return;
+
+ /*
+ * If in balance, the balance callbacks will be called before rq lock is
+ * released. Schedule one.
+ */
+ if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
+ queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+ deferred_bal_cb_workfn);
+ return;
+ }
+#endif
+ /*
+ * No scheduler hooks available. Queue an irq work. They are executed on
+ * IRQ re-enable which may take a bit longer than the scheduler hooks.
+ * The above WAKEUP and BALANCE paths should cover most of the cases and
+ * the time to IRQ re-enable shouldn't be long.
+ */
+ irq_work_queue(&rq->scx.deferred_irq_work);
+}
+
/**
* touch_core_sched - Update timestamp used for core-sched task ordering
* @rq: rq to read clock from, must be locked
@@ -1577,7 +1639,13 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
bool is_local = dsq == &rq->scx.local_dsq;

if (!dsq) {
- WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
+ /*
+ * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
+ * Unlinking is all that's needed to cancel.
+ */
+ if (unlikely(!list_empty(&p->scx.dsq_list.node)))
+ list_del_init(&p->scx.dsq_list.node);
+
/*
* When dispatching directly from the BPF scheduler to a local
* DSQ, the task isn't associated with any DSQ but
@@ -1586,6 +1654,7 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
*/
if (p->scx.holding_cpu >= 0)
p->scx.holding_cpu = -1;
+
return;
}

@@ -1673,17 +1742,6 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
return;
}

- /*
- * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because
- * dispatching to the local DSQ of a different CPU requires unlocking
- * the current rq which isn't allowed in the enqueue path. Use
- * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL.
- */
- if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) {
- scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch");
- return;
- }
-
WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
WARN_ON_ONCE(p->scx.ddsp_enq_flags);

@@ -1693,13 +1751,58 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,

static void direct_dispatch(struct task_struct *p, u64 enq_flags)
{
+ struct rq *rq = task_rq(p);
struct scx_dispatch_q *dsq;
+ u64 dsq_id = p->scx.ddsp_dsq_id;
+
+ touch_core_sched_dispatch(rq, p);
+
+ p->scx.ddsp_enq_flags |= enq_flags;
+
+ /*
+ * We are in the enqueue path with @rq locked and pinned, and thus can't
+ * double lock a remote rq and enqueue to its local DSQ. For
+ * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
+ * the enqueue so that it's executed when @rq can be unlocked.
+ */
+ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+ unsigned long opss;

- touch_core_sched_dispatch(task_rq(p), p);
+ if (cpu == cpu_of(rq)) {
+ dsq_id = SCX_DSQ_LOCAL;
+ goto dispatch;
+ }
+
+ opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
+ case SCX_OPSS_NONE:
+ break;
+ case SCX_OPSS_QUEUEING:
+ /*
+ * As @p was never passed to the BPF side, _release is
+ * not strictly necessary. Still do it for consistency.
+ */
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+ break;
+ default:
+ WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
+ p->comm, p->pid, opss);
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+ break;
+ }

- enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
- dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p);
- dispatch_enqueue(dsq, p, enq_flags);
+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
+ list_add_tail(&p->scx.dsq_list.node,
+ &rq->scx.ddsp_deferred_locals);
+ schedule_deferred(rq);
+ return;
+ }
+
+dispatch:
+ dsq = find_dsq_for_dispatch(rq, dsq_id, p);
+ dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
}

static bool scx_rq_online(struct rq *rq)
@@ -2601,6 +2704,29 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
}
}

+static void process_ddsp_deferred_locals(struct rq *rq)
+{
+ struct task_struct *p, *tmp;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Now that @rq can be unlocked, execute the deferred enqueueing of
+ * tasks directly dispatched to the local DSQs of other CPUs. See
+ * direct_dispatch().
+ */
+ list_for_each_entry_safe(p, tmp, &rq->scx.ddsp_deferred_locals,
+ scx.dsq_list.node) {
+ s32 ret;
+
+ list_del_init(&p->scx.dsq_list.node);
+
+ ret = dispatch_to_local_dsq(rq, p->scx.ddsp_dsq_id, p,
+ p->scx.ddsp_enq_flags);
+ WARN_ON_ONCE(ret == DTL_NOT_LOCAL);
+ }
+}
+
static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
@@ -3022,6 +3148,11 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
}
}

+static void task_woken_scx(struct rq *rq, struct task_struct *p)
+{
+ run_deferred(rq);
+}
+
static void set_cpus_allowed_scx(struct task_struct *p,
struct affinity_context *ac)
{
@@ -3538,8 +3669,6 @@ bool scx_can_stop_tick(struct rq *rq)
*
* - task_fork/dead: We need fork/dead notifications for all tasks regardless of
* their current sched_class. Call them directly from sched core instead.
- *
- * - task_woken: Unnecessary.
*/
DEFINE_SCHED_CLASS(ext) = {
.enqueue_task = enqueue_task_scx,
@@ -3559,6 +3688,7 @@ DEFINE_SCHED_CLASS(ext) = {
#ifdef CONFIG_SMP
.balance = balance_scx,
.select_task_rq = select_task_rq_scx,
+ .task_woken = task_woken_scx,
.set_cpus_allowed = set_cpus_allowed_scx,

.rq_online = rq_online_scx,
@@ -5263,11 +5393,13 @@ void __init init_sched_ext_class(void)

init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
INIT_LIST_HEAD(&rq->scx.runnable_list);
+ INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);

BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+ init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);

if (cpu_online(cpu))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8a0e8052f6b0..be7be54484c0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -746,6 +746,7 @@ enum scx_rq_flags {
struct scx_rq {
struct scx_dispatch_q local_dsq;
struct list_head runnable_list; /* runnable tasks on this rq */
+ struct list_head ddsp_deferred_locals; /* deferred ddsps from enq */
unsigned long ops_qseq;
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
@@ -757,6 +758,8 @@ struct scx_rq {
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
unsigned long pnt_seq;
+ struct balance_callback deferred_bal_cb;
+ struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work;
};
#endif /* CONFIG_SCHED_CLASS_EXT */
--
2.45.2