Re: [PATCH 03/15] sched/fair: Add lag based placement

From: Peter Zijlstra
Date: Fri Oct 13 2023 - 12:36:59 EST


On Fri, Oct 13, 2023 at 12:34:28AM +0200, Peter Zijlstra wrote:

> Right, so I do have this:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/eevdf&id=344944e06f11da25b49328825ed15fedd63036d3
>
> That allows tasks to sleep away the lag -- with all the gnarly bits that
> sleep time has. And it reliably fixes the above. However, it also
> depresses a bunch of other stuff. Never a free lunch etc.
>
> It is so far the least horrible of the things I've tried.
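
(The shape of that approach, very roughly -- this is not the actual code
from the commit above; the decay model and the 'half_life' knob are only
here to illustrate the idea of sleeping away the lag:)

	static inline s64 decay_vlag(s64 vlag, u64 slept, u64 half_life)
	{
		if (!half_life)
			return 0;
		/* the longer the sleep, the less of the stored lag survives */
		for (; slept >= half_life && vlag; slept -= half_life)
			vlag /= 2;
		return vlag;
	}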

So the below is one I conceptually like more -- except that I hate the
code, and it doesn't work as well as the one linked above.

(Mike, this isn't the same one you saw before -- it's been 'improved')
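
The gist of it: when a task blocks while it is not eligible (it still owes
service, ie. negative lag), don't dequeue it right away; mark it
sched_delayed and leave it on the runqueue. Only when pick_next_task()
actually selects it -- at which point it must have become eligible again --
is it dequeued for real, and any positive lag it gathered while sitting
there is clipped so it doesn't get rewarded for sleeping. A wakeup before
that point (ttwu_runnable) clears the flag, does the same clip and requeues
it properly.

Stripped of the locking and the pick/wakeup races, the block side is
something like this (the function name is just for the sketch; the real
thing is in the __schedule() hunk below):

	static void try_to_block_task(struct rq *rq, struct task_struct *p)
	{
		if (sched_feat(DELAY_DEQUEUE) &&
		    p->sched_class->eligible_task &&
		    !p->sched_class->eligible_task(rq, p)) {
			/* still owes service; stays on the runqueue for now */
			p->sched_delayed = 1;
			return;
		}

		__deschedule_task(rq, p);	/* the normal go-to-sleep path */
	}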

---

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29daece54a74..7f17295931de 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -895,6 +895,7 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
+ unsigned sched_delayed:1;

/* Force alignment to the next boundary: */
unsigned :0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7771a4d68280..38b2e0488a38 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3833,12 +3833,21 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)

rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ if (unlikely(p->sched_delayed)) {
+ p->sched_delayed = 0;
+ /* mustn't run a delayed task */
+ WARN_ON_ONCE(task_on_cpu(rq, p));
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+ if (p->se.vlag > 0)
+ p->se.vlag = 0;
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+ }
if (!task_on_cpu(rq, p)) {
/*
* When on_rq && !on_cpu the task is preempted, see if
* it should preempt the task that is current now.
*/
- update_rq_clock(rq);
wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
@@ -6520,6 +6529,16 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
# define SM_MASK_PREEMPT SM_PREEMPT
#endif

+static void __deschedule_task(struct rq *rq, struct task_struct *p)
+{
+ deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
+
+ if (p->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
+ }
+}
+
/*
* __schedule() is the main scheduler function.
*
@@ -6604,6 +6623,8 @@ static void __sched notrace __schedule(unsigned int sched_mode)

switch_count = &prev->nivcsw;

+ WARN_ON_ONCE(prev->sched_delayed);
+
/*
* We must load prev->state once (task_struct::state is volatile), such
* that we form a control dependency vs deactivate_task() below.
@@ -6632,17 +6653,39 @@ static void __sched notrace __schedule(unsigned int sched_mode)
*
* After this, schedule() must not care about p->state any more.
*/
- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
-
- if (prev->in_iowait) {
- atomic_inc(&rq->nr_iowait);
- delayacct_blkio_start();
- }
+ if (sched_feat(DELAY_DEQUEUE) &&
+ prev->sched_class->eligible_task &&
+ !prev->sched_class->eligible_task(rq, prev))
+ prev->sched_delayed = 1;
+ else
+ __deschedule_task(rq, prev);
}
switch_count = &prev->nvcsw;
}

- next = pick_next_task(rq, prev, &rf);
+ for (struct task_struct *tmp = prev;;) {
+
+ next = pick_next_task(rq, tmp, &rf);
+ if (unlikely(tmp != prev))
+ finish_task(tmp);
+
+ if (likely(!next->sched_delayed))
+ break;
+
+ next->sched_delayed = 0;
+
+ /* ttwu_runnable() */
+ if (WARN_ON_ONCE(!next->__state))
+ break;
+
+ prepare_task(next);
+ smp_wmb();
+ __deschedule_task(rq, next);
+ if (next->se.vlag > 0)
+ next->se.vlag = 0;
+ tmp = next;
+ }
+
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b2210e7cc057..3084e21abfe7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8410,6 +8410,16 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
return pick_next_task_fair(rq, NULL, NULL);
}

+static bool eligible_task_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ update_curr(cfs_rq);
+
+ return entity_eligible(cfs_rq, se);
+}
+
/*
* Account for a descheduled task:
*/
@@ -13006,6 +13016,7 @@ DEFINE_SCHED_CLASS(fair) = {

.wakeup_preempt = check_preempt_wakeup_fair,

+ .eligible_task = eligible_task_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a133b46efedd..0546905f1f8f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -11,6 +11,7 @@ SCHED_FEAT(PREEMPT_SHORT, true)
SCHED_FEAT(PLACE_SLEEPER, false)
SCHED_FEAT(GENTLE_SLEEPER, true)
SCHED_FEAT(EVDF, false)
+SCHED_FEAT(DELAY_DEQUEUE, true)

/*
* Prefer to schedule the task we woke last (assuming it failed
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 245df0c6d344..35d297e1d91b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2222,6 +2222,7 @@ struct sched_class {

void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);

+ bool (*eligible_task)(struct rq *rq, struct task_struct *p);
struct task_struct *(*pick_next_task)(struct rq *rq);

void (*put_prev_task)(struct rq *rq, struct task_struct *p);
@@ -2275,7 +2276,7 @@ struct sched_class {

static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- WARN_ON_ONCE(rq->curr != prev);
+// WARN_ON_ONCE(rq->curr != prev);
prev->sched_class->put_prev_task(rq, prev);
}