Re: [GIT PULL] sched_ext: Initial pull request for v6.11

From: Peter Zijlstra
Date: Tue Jul 23 2024 - 12:34:23 EST



Taken from a git diff of your tree against Linus' tree... I've not yet
read everything, but I figured I should share these bits.

---

> @@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq)
> return true;
>
> /*
> - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
> - * if there's more than one we need the tick for involuntary
> - * preemption.
> + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
> + * left. For CFS, if there's more than one we need the tick for
> + * involuntary preemption. For SCX, ask.
> */
> - if (rq->nr_running > 1)
> + if (!scx_switched_all() && rq->nr_running > 1)
> + return false;
> +
> + if (scx_enabled() && !scx_can_stop_tick(rq))
> return false;
>

Doesn't that boil down to something like:

	if (scx_switched_all())
		return scx_can_stop_tick(rq);

	if (rq->nr_running > 1)
		return false;

> @@ -5773,7 +5827,19 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
> struct rq_flags *rf)
> {
> #ifdef CONFIG_SMP
> + const struct sched_class *start_class = prev->sched_class;
> const struct sched_class *class;
> +
> +#ifdef CONFIG_SCHED_CLASS_EXT
> + /*
> + * SCX requires a balance() call before every pick_next_task() including
> + * when waking up from SCHED_IDLE. If @start_class is below SCX, start
> + * from SCX instead.
> + */
> + if (sched_class_above(&ext_sched_class, start_class))
> + start_class = &ext_sched_class;

	if (scx_enabled() && ...)

?
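
That is, presumably:

	if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
		start_class = &ext_sched_class;

so the balance pass isn't widened to SCX when no BPF scheduler is
loaded.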

> +#endif
> +
> /*
> * We must do the balancing pass before put_prev_task(), such
> * that when we release the rq->lock the task is in the same
> @@ -5782,7 +5848,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
> * We can terminate the balance pass as soon as we know there is
> * a runnable task of @class priority or higher.
> */
> - for_class_range(class, prev->sched_class, &idle_sched_class) {
> + for_active_class_range(class, start_class, &idle_sched_class) {
> if (class->balance(rq, prev, rf))
> break;

Don't you need to fix balance_fair() here? It has:

	if (rq->nr_running)
		return 1;

Which would return true and terminate the iteration even if there are
only scx tasks left.
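
Something like keying it off the CFS-local count ought to do (a
sketch; sched_balance_newidle() being the current name of the newidle
path):

	static int
	balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
	{
		/* Only claim the CPU when there are actual fair tasks queued. */
		if (rq->cfs.h_nr_running)
			return 1;

		return sched_balance_newidle(rq, rf) != 0;
	}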

> }
> @@ -5800,6 +5866,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> const struct sched_class *class;
> struct task_struct *p;
>
> + if (scx_enabled())
> + goto restart;
> +
> /*
> * Optimization: we know that if all tasks are in the fair class we can
> * call that function directly, but only if the @prev task wasn't of a
> @@ -5840,10 +5909,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> if (prev->dl_server)
> prev->dl_server = NULL;
>
> - for_each_class(class) {
> + for_each_active_class(class) {
> p = class->pick_next_task(rq);
> - if (p)
> + if (p) {
> + const struct sched_class *prev_class = prev->sched_class;
> +
> + if (class != prev_class && prev_class->switch_class)
> + prev_class->switch_class(rq, p);
> return p;
> + }
> }

So I really don't like this one... at the very least it would need a
comment explaining why it only needs calling here and not every time a
put_prev_task() and set_next_task() pair crosses a class boundary --
which would be the consistent thing to do.
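
(Those put/set pairs are not rare either; roughly the change pattern
used by __sched_setscheduler() and friends:

	if (running)
		put_prev_task(rq, p);

	p->sched_class = ...;	/* possibly crossing a class boundary */

	if (running)
		set_next_task(rq, p);

none of which would hit that switch_class() call.)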

Now, IIRC, you need a class call that indicates you're about to lose
the CPU so that you can kick the task to another CPU or somesuch. And
last time I got it backwards and suggested adding an argument to
pick_next_task(), but what about put_prev_task()?

Can't we universally push put_prev_task() after the pick loop? Then we get
something like:

	next = pick_task();
	if (next != prev) {
		put_prev_task(rq, prev, next->class != prev->class);
		set_next_task(rq, next);
	}
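
Spelled out against __pick_next_task(), that would be something like
(a sketch; assuming every class grows a ->pick_task()):

	for_each_active_class(class) {
		next = class->pick_task(rq);
		if (next) {
			if (next != prev) {
				put_prev_task(rq, prev,
					      next->sched_class != prev->sched_class);
				set_next_task(rq, next);
			}
			return next;
		}
	}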

I have patches for most of this for fair (in my eevdf queue), and I
think the others are doable; after all, this is more or less what we
do for SCHED_CORE already.

/me went off hacking for a bit

I've done this; find the results at: queue.git sched/prep

I've also rebased the sched_ext tree on top of that with the below delta, which
you can find at: queue.git sched/scx

This led me to discover that you're passing the task of the other class
into the bpf stuff -- I don't think that is a sane thing to do. You get
preempted; it doesn't matter from which higher class or by which
specific task -- a policy must not care about that. So I kinda bodged
it, but I really think this should be taken out.

I also found you have some terrible !SMP hack in there, which I've
broken; I've disabled it for now. This needs a proper fix, and not
something ugly like what you did.

Anyway, it seems to build, boot and pass the sched_ext selftests:

PASSED: 21
SKIPPED: 0
FAILED: 0

(albeit with a fair amount of console noise -- is that expected?)

Also, why does that thing hard depend on DEBUG_INFO_BTF? (at least
having that option enabled no longer explodes build times like it used
to)

Also, should we s/CONFIG_SCHED_CLASS_EXT/CONFIG_SCHED_BPF/? Then
something like:

	grep BPF foo-build/.config

more easily shows what's what.

I suppose I'll look more at all this in the coming days ... *sigh*


---
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index da9cac6b6cc2a..6375a648d3bf5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2015,11 +2015,11 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
}
}

-static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
+static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
{
if (!(p->scx.flags & SCX_TASK_QUEUED)) {
WARN_ON_ONCE(task_runnable(p));
- return;
+ return true;
}

ops_dequeue(p, deq_flags);
@@ -2054,6 +2054,8 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
sub_nr_running(rq, 1);

dispatch_dequeue(rq, p);
+
+ return true;
}

static void yield_task_scx(struct rq *rq)
@@ -2702,6 +2704,18 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
*/
update_other_load_avgs(rq);
}
+
+ if (!first)
+ return;
+
+ if (unlikely(!p->scx.slice)) {
+ if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
+ p->comm, p->pid);
+ scx_warned_zero_slice = true;
+ }
+ p->scx.slice = SCX_SLICE_DFL;
+ }
}

static void process_ddsp_deferred_locals(struct rq *rq)
@@ -2727,10 +2741,15 @@ static void process_ddsp_deferred_locals(struct rq *rq)
}
}

-static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
+static void switch_class_scx(struct rq *rq);
+
+static void put_prev_task_scx(struct rq *rq, struct task_struct *p, bool change_class)
{
-#ifndef CONFIG_SMP
+#if 0 // ndef CONFIG_SMP
/*
+ * XXX broken, put_prev no longer comes after balance but now comes
+ * after pick.
+ *
* UP workaround.
*
* Because SCX may transfer tasks across CPUs during dispatch, dispatch
@@ -2774,7 +2793,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
p->scx.flags &= ~SCX_TASK_BAL_KEEP;
set_task_runnable(rq, p);
dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
- return;
+ goto out;
}

if (p->scx.flags & SCX_TASK_QUEUED) {
@@ -2788,7 +2807,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
*/
if (p->scx.slice && !scx_ops_bypassing()) {
dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
- return;
+ goto out;
}

/*
@@ -2805,6 +2824,14 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
else
do_enqueue_task(rq, p, 0, -1);
}
+
+out:
+ /*
+ * XXX switch_class was called after put, if this can be called earlier
+ * the goto can be avoided.
+ */
+ if (change_class)
+ switch_class_scx(rq);
}

static struct task_struct *first_local_task(struct rq *rq)
@@ -2813,34 +2840,6 @@ static struct task_struct *first_local_task(struct rq *rq)
struct task_struct, scx.dsq_list.node);
}

-static struct task_struct *pick_next_task_scx(struct rq *rq)
-{
- struct task_struct *p;
-
-#ifndef CONFIG_SMP
- /* UP workaround - see the comment at the head of put_prev_task_scx() */
- if (unlikely(rq->curr->sched_class != &ext_sched_class))
- balance_one(rq, rq->curr, true);
-#endif
-
- p = first_local_task(rq);
- if (!p)
- return NULL;
-
- set_next_task_scx(rq, p, true);
-
- if (unlikely(!p->scx.slice)) {
- if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
- printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
- p->comm, p->pid);
- scx_warned_zero_slice = true;
- }
- p->scx.slice = SCX_SLICE_DFL;
- }
-
- return p;
-}
-
#ifdef CONFIG_SCHED_CORE
/**
* scx_prio_less - Task ordering for core-sched
@@ -2874,6 +2873,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
else
return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
}
+#endif /* CONFIG_SCHED_CORE */

/**
* pick_task_scx - Pick a candidate task for core-sched
@@ -2917,28 +2917,9 @@ static struct task_struct *pick_task_scx(struct rq *rq)

return first; /* this may be %NULL */
}
-#endif /* CONFIG_SCHED_CORE */

-static enum scx_cpu_preempt_reason
-preempt_reason_from_class(const struct sched_class *class)
+static void switch_class_scx(struct rq *rq)
{
-#ifdef CONFIG_SMP
- if (class == &stop_sched_class)
- return SCX_CPU_PREEMPT_STOP;
-#endif
- if (class == &dl_sched_class)
- return SCX_CPU_PREEMPT_DL;
- if (class == &rt_sched_class)
- return SCX_CPU_PREEMPT_RT;
- return SCX_CPU_PREEMPT_UNKNOWN;
-}
-
-static void switch_class_scx(struct rq *rq, struct task_struct *next)
-{
- const struct sched_class *next_class = next->sched_class;
-
- if (!scx_enabled())
- return;
#ifdef CONFIG_SMP
/*
* Pairs with the smp_load_acquire() issued by a CPU in
@@ -2950,15 +2931,6 @@ static void switch_class_scx(struct rq *rq, struct task_struct *next)
if (!static_branch_unlikely(&scx_ops_cpu_preempt))
return;

- /*
- * The callback is conceptually meant to convey that the CPU is no
- * longer under the control of SCX. Therefore, don't invoke the callback
- * if the next class is below SCX (in which case the BPF scheduler has
- * actively decided not to schedule any tasks on the CPU).
- */
- if (sched_class_above(&ext_sched_class, next_class))
- return;
-
/*
* At this point we know that SCX was preempted by a higher priority
* sched_class, so invoke the ->cpu_release() callback if we have not
@@ -2971,8 +2943,8 @@ static void switch_class_scx(struct rq *rq, struct task_struct *next)
if (!rq->scx.cpu_released) {
if (SCX_HAS_OP(cpu_release)) {
struct scx_cpu_release_args args = {
- .reason = preempt_reason_from_class(next_class),
- .task = next,
+ .reason = SCX_CPU_PREEMPT_UNKNOWN,
+ .task = NULL,
};

SCX_CALL_OP(SCX_KF_CPU_RELEASE,
@@ -3678,13 +3650,12 @@ DEFINE_SCHED_CLASS(ext) = {

.wakeup_preempt = wakeup_preempt_scx,

+ .pick_task = pick_task_scx,
.pick_next_task = pick_next_task_scx,

.put_prev_task = put_prev_task_scx,
.set_next_task = set_next_task_scx,

- .switch_class = switch_class_scx,
-
#ifdef CONFIG_SMP
.balance = balance_scx,
.select_task_rq = select_task_rq_scx,
@@ -3695,10 +3666,6 @@ DEFINE_SCHED_CLASS(ext) = {
.rq_offline = rq_offline_scx,
#endif

-#ifdef CONFIG_SCHED_CORE
- .pick_task = pick_task_scx,
-#endif
-
.task_tick = task_tick_scx,

.switching_to = switching_to_scx,