Re: [PATCH 5/6] sched_ext: Allow SCX_DSQ_LOCAL_ON for direct dispatches

From: David Vernet
Date: Wed Jul 10 2024 - 14:51:49 EST


On Tue, Jul 09, 2024 at 11:21:11AM -1000, Tejun Heo wrote:
> In ops.dispatch(), SCX_DSQ_LOCAL_ON can be used to dispatch the task to the
> local DSQ of any CPU. However, during direct dispatch from ops.select_cpu()
> and ops.enqueue(), this isn't allowed. This is because dispatching to the
> local DSQ of a remote CPU requires locking both the task's current and new
> rq's and such double locking can't be done directly from ops.enqueue().
>
> While waking up a task, as ops.select_cpu() can pick any CPU and both
> ops.select_cpu() and ops.enqueue() can use SCX_DSQ_LOCAL as the dispatch
> target to dispatch to the DSQ of the picked CPU, the BPF scheduler can still
> do whatever it wants to do. However, while a task is being enqueued for a
> different reason, e.g. after its slice expiration, only ops.enqueue() is
> called and there's no way for the BPF scheduler to directly dispatch to the
> local DSQ of a remote CPU. This gap in API forces schedulers into
> work-arounds which are not straightforward or optimal such as skipping
> direct dispatches in such cases.
>
> Implement deferred enqueueing to allow directly dispatching to the local DSQ
> of a remote CPU from ops.select_cpu() and ops.enqueue(). Such tasks are
> temporarily queued on rq->scx.ddsp_deferred_locals. When the rq lock can be
> safely released, the tasks are taken off the list and queued on the target
> local DSQs using dispatch_to_local_dsq().
>
> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
> Cc: David Vernet <void@xxxxxxxxxxxxx>
> Cc: Dan Schatzberg <schatzberg.dan@xxxxxxxxx>
> Cc: Changwoo Min <changwoo@xxxxxxxxxx>
> Cc: Andrea Righi <righi.andrea@xxxxxxxxx>

Hi Tejun,

Overall this looks great. It's really not very complicated at all which is
great to see, but will be a big usability improvement for schedulers so it's a
clear win. Just left a couple comments below.

> ---
> kernel/sched/ext.c | 164 ++++++++++++++++++++++++++++++++++++++-----
> kernel/sched/sched.h | 3 +
> 2 files changed, 149 insertions(+), 18 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index a88c51ce63a5..f4edc37bf89b 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -889,6 +889,7 @@ static struct kobject *scx_root_kobj;
> #define CREATE_TRACE_POINTS
> #include <trace/events/sched_ext.h>
>
> +static void process_ddsp_deferred_locals(struct rq *rq);
> static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
> static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
> s64 exit_code,
> @@ -1363,6 +1364,63 @@ static int ops_sanitize_err(const char *ops_name, s32 err)
> return -EPROTO;
> }
>
> +static void run_deferred(struct rq *rq)
> +{
> + process_ddsp_deferred_locals(rq);
> +}
> +
> +static void deferred_bal_cb_workfn(struct rq *rq)
> +{
> + run_deferred(rq);
> +}
> +
> +static void deferred_irq_workfn(struct irq_work *irq_work)
> +{
> + struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);
> +
> + raw_spin_rq_lock(rq);
> + run_deferred(rq);
> + raw_spin_rq_unlock(rq);
> +}
> +
> +/**
> + * schedule_deferred - Schedule execution of deferred actions on an rq
> + * @rq: target rq
> + *
> + * Schedule execution of deferred actions on @rq. Must be called with @rq
> + * locked. Deferred actions are executed with @rq locked but unpinned, and thus
> + * can unlock @rq to e.g. migrate tasks to other rqs.
> + */
> +static void schedule_deferred(struct rq *rq)
> +{
> + lockdep_assert_rq_held(rq);
> +
> +#ifdef CONFIG_SMP
> + /*
> + * If in the middle of waking up a task, task_woken_scx() will be called
> + * afterwards which will then run the deferred actions, no need to
> + * schedule anything.
> + */
> + if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
> + return;
> +
> + /*
> + * If in balance, the balance callbacks will be called before rq lock is
> + * released. Schedule one.
> + */
> + if (rq->scx.flags & SCX_RQ_IN_BALANCE)
> + queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
> + deferred_bal_cb_workfn);

Should we be returning above if we're able to use a balance cb?

Also, should we maybe add a WARN_ON_ONCE(rq->balance_callback ==
&balance_push_callback)? I could see that being unnecessary given that we
should never be hitting this path when the CPU is deactivated anyways, but the
push callback thing is a kindn extraneous implementation detail so might be
worth guarding against it just in case.

> +#endif
> + /*
> + * No scheduler hooks available. Queue an irq work. They are executed on
> + * IRQ re-enable which may take a bit longer than the scheduler hooks.
> + * The above WAKEUP and BALANCE paths should cover most of the cases and
> + * the time to IRQ re-enable shouldn't be long.
> + */
> + irq_work_queue(&rq->scx.deferred_irq_work);
> +}
> +
> /**
> * touch_core_sched - Update timestamp used for core-sched task ordering
> * @rq: rq to read clock from, must be locked
> @@ -1578,7 +1636,13 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
> bool is_local = dsq == &rq->scx.local_dsq;
>
> if (!dsq) {
> - WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
> + /*
> + * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
> + * Unlinking is all that's needed to cancel.
> + */
> + if (unlikely(!list_empty(&p->scx.dsq_list.node)))
> + list_del_init(&p->scx.dsq_list.node);
> +
> /*
> * When dispatching directly from the BPF scheduler to a local
> * DSQ, the task isn't associated with any DSQ but
> @@ -1587,6 +1651,7 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
> */
> if (p->scx.holding_cpu >= 0)
> p->scx.holding_cpu = -1;
> +
> return;
> }
>
> @@ -1674,17 +1739,6 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
> return;
> }
>
> - /*
> - * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because
> - * dispatching to the local DSQ of a different CPU requires unlocking
> - * the current rq which isn't allowed in the enqueue path. Use
> - * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL.
> - */
> - if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) {
> - scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch");
> - return;
> - }
> -
> WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
> WARN_ON_ONCE(p->scx.ddsp_enq_flags);
>
> @@ -1694,13 +1748,58 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
>
> static void direct_dispatch(struct task_struct *p, u64 enq_flags)
> {
> + struct rq *rq = task_rq(p);
> struct scx_dispatch_q *dsq;
> + u64 dsq_id = p->scx.ddsp_dsq_id;
> +
> + touch_core_sched_dispatch(rq, p);
> +
> + p->scx.ddsp_enq_flags |= enq_flags;
> +
> + /*
> + * We are in the enqueue path with @rq locked and pinned, and thus can't
> + * double lock a remote rq and enqueue to its local DSQ. For
> + * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
> + * the enqueue so that it's executed when @rq can be unlocked.
> + */
> + if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
> + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
> + unsigned long opss;
> +
> + if (cpu == cpu_of(rq)) {
> + dsq_id = SCX_DSQ_LOCAL;
> + goto dispatch;
> + }
> +
> + opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
> +
> + switch (opss & SCX_OPSS_STATE_MASK) {
> + case SCX_OPSS_NONE:
> + break;
> + case SCX_OPSS_QUEUEING:
> + /*
> + * As @p was never passed to the BPF side, _release is
> + * not strictly necessary. Still do it for consistency.
> + */
> + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
> + break;
> + default:
> + WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
> + p->comm, p->pid, opss);
> + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
> + break;
> + }
>
> - touch_core_sched_dispatch(task_rq(p), p);
> + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
> + list_add_tail(&p->scx.dsq_list.node,
> + &rq->scx.ddsp_deferred_locals);
> + schedule_deferred(rq);
> + return;
> + }
>
> - enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
> - dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p);
> - dispatch_enqueue(dsq, p, enq_flags);
> +dispatch:
> + dsq = find_dsq_for_dispatch(rq, dsq_id, p);
> + dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
> }
>
> static bool scx_rq_online(struct rq *rq)
> @@ -2631,6 +2730,29 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
> }
> }
>
> +static void process_ddsp_deferred_locals(struct rq *rq)
> +{
> + struct task_struct *p, *tmp;
> +
> + lockdep_assert_rq_held(rq);
> +
> + /*
> + * Now that @rq can be unlocked, execute the deferred enqueueing of
> + * tasks directly dispatched to the local DSQs of other CPUs. See
> + * direct_dispatch().
> + */
> + list_for_each_entry_safe(p, tmp, &rq->scx.ddsp_deferred_locals,
> + scx.dsq_list.node) {
> + s32 ret;
> +
> + list_del_init(&p->scx.dsq_list.node);
> +
> + ret = dispatch_to_local_dsq(rq, NULL, p->scx.ddsp_dsq_id, p,
> + p->scx.ddsp_enq_flags);
> + WARN_ON_ONCE(ret == DTL_NOT_LOCAL);
> + }

As mentioned in the other thread, it might be simplest to just pin and unpin
around this loop here to keep the logic simpler in the callee.

> +}
> +
> static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
> {
> #ifndef CONFIG_SMP
> @@ -3052,6 +3174,11 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
> }
> }
>
> +static void task_woken_scx(struct rq *rq, struct task_struct *p)
> +{
> + run_deferred(rq);
> +}
> +
> static void set_cpus_allowed_scx(struct task_struct *p,
> struct affinity_context *ac)
> {
> @@ -3568,8 +3695,6 @@ bool scx_can_stop_tick(struct rq *rq)
> *
> * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
> * their current sched_class. Call them directly from sched core instead.
> - *
> - * - task_woken: Unnecessary.
> */
> DEFINE_SCHED_CLASS(ext) = {
> .enqueue_task = enqueue_task_scx,
> @@ -3589,6 +3714,7 @@ DEFINE_SCHED_CLASS(ext) = {
> #ifdef CONFIG_SMP
> .balance = balance_scx,
> .select_task_rq = select_task_rq_scx,
> + .task_woken = task_woken_scx,

Should we update the comment in the caller in core.c given that rq is no longer
only used for statistics tracking?

> .set_cpus_allowed = set_cpus_allowed_scx,
>
> .rq_online = rq_online_scx,

[...]

Thanks,
David

Attachment: signature.asc
Description: PGP signature