Re: [RFC PATCH 1/1] sched/eevdf: Skip eligibility check for current entity during wakeup preemption

From: Peter Zijlstra
Date: Wed Apr 24 2024 - 11:08:40 EST


On Mon, Mar 25, 2024 at 11:32:26AM +0530, K Prateek Nayak wrote:
> With the curr entity's eligibility check, a wakeup preemption is very
> likely when an entity with positive lag joins the runqueue, pushing the
> avg_vruntime of the runqueue backwards and making the vruntime of the
> current entity ineligible. This leads to aggressive wakeup preemption,
> which was previously guarded by wakeup_granularity_ns in legacy CFS.
> The figure below depicts one such aggressive preemption scenario with
> EEVDF in DeathStarBench [1]:
>
> deadline for Nginx
> |
> +-------+ | |
> /-- | Nginx | -|------------------> |
> | +-------+ | |
> | |
> | -----------|-------------------------------> vruntime timeline
> | \--> rq->avg_vruntime
> |
> | wakes service on the same runqueue since system is busy
> |
> | +---------+|
> \-->| Service || (service has +ve lag pushes avg_vruntime backwards)
> +---------+|
> | |
> wakeup | +--|-----+ |
> preempts \---->| N|ginx | --------------------> | {deadline for Nginx}
> +--|-----+ |
> (Nginx ineligible)
> -----------|-------------------------------> vruntime timeline
> \--> rq->avg_vruntime

This graph is really hard to interpret. If you want to illustrate that
avg_vruntime moves back, you should not keep it aligned. That's really
disorienting.

In both (upper and lower) nginx has the same vruntime, thus *that* should
be aligned. The lower one will have service placed to the left, and with
that avg_vruntime also moves left, rendering nginx ineligible.
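
To spell out the arithmetic with made-up numbers (a toy, unweighted sketch
of the eligibility test, illustrative only; the real avg_vruntime() and
entity_eligible() in kernel/sched/fair.c use a load-weighted average
tracked relative to min_vruntime, but the effect on eligibility is the
same): placing service with positive lag to the left of the average drags
the average left, and vruntime <= avg_vruntime then stops holding for
nginx.

/* Toy model, not kernel code: unweighted stand-in for entity_eligible(). */
#include <stdio.h>

struct toy_entity { const char *name; long long vruntime; };

/* eligible <=> vruntime <= average vruntime over the queue (division avoided) */
static int toy_eligible(const struct toy_entity *se, const struct toy_entity *q, int nr)
{
	long long sum = 0;

	for (int i = 0; i < nr; i++)
		sum += q[i].vruntime;

	return se->vruntime * nr <= sum;
}

int main(void)
{
	struct toy_entity q[3] = {
		{ "nginx", 100 },	/* current task */
		{ "other", 110 },
	};
	int nr = 2;

	printf("nginx eligible before wakeup: %d\n", toy_eligible(&q[0], q, nr));	/* 1 */

	/* service wakes with positive lag: placed left of the average */
	q[nr++] = (struct toy_entity){ "service", 40 };

	printf("nginx eligible after wakeup:  %d\n", toy_eligible(&q[0], q, nr));	/* 0 */
	return 0;
}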


> Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
> ---
> kernel/sched/fair.c | 24 ++++++++++++++++++++----
> kernel/sched/features.h | 1 +
> 2 files changed, 21 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6a16129f9a5c..a9b145a4eab0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -875,7 +875,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
>   *
>   * Which allows tree pruning through eligibility.
>   */
> -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool wakeup_preempt)
>  {
>  	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
>  	struct sched_entity *se = __pick_first_entity(cfs_rq);
> @@ -889,7 +889,23 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
>  	if (cfs_rq->nr_running == 1)
>  		return curr && curr->on_rq ? curr : se;
>
> -	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
> +	if (curr && !curr->on_rq)
> +		curr = NULL;
> +
> +	/*
> +	 * When an entity with positive lag wakes up, it pushes the
> +	 * avg_vruntime of the runqueue backwards. This may cause the
> +	 * current entity to be ineligible soon into its run, leading to
> +	 * wakeup preemption.
> +	 *
> +	 * To prevent such aggressive preemption of the current running
> +	 * entity during task wakeups, skip the eligibility check if the
> +	 * slice promised to the entity since its selection has not yet
> +	 * elapsed.
> +	 */
> +	if (curr &&
> +	    !(sched_feat(RUN_TO_PARITY_WAKEUP) && wakeup_preempt && curr->vlag == curr->deadline) &&
> +	    !entity_eligible(cfs_rq, curr))
>  		curr = NULL;
>
>  	/*

So I see what you want to do, but this is highly unreadable.

I'll try something like the below on top of queue/sched/eevdf, but I
should probably first look at fixing those reported crashes on that tree
:/

---
kernel/sched/fair.c | 60 ++++++++++++++++++++++++++++++++++---------------
kernel/sched/features.h | 11 +++++----
2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a9206d8532f..23977ed1cb2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -855,6 +855,39 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 	return __node_2_se(left);
 }
 
+static inline bool pick_curr(struct cfs_rq *cfs_rq,
+			     struct sched_entity *curr, struct sched_entity *wakee)
+{
+	/*
+	 * Nothing to preserve...
+	 */
+	if (!curr || !sched_feat(RESPECT_SLICE))
+		return false;
+
+	/*
+	 * Allow preemption at the 0-lag point -- even if not all of the slice
+	 * is consumed. Note: placement of positive lag can push V left and render
+	 * @curr instantly ineligible irrespective of the time on-cpu.
+	 */
+	if (sched_feat(RUN_TO_PARITY) && !entity_eligible(cfs_rq, curr))
+		return false;
+
+	/*
+	 * Don't preserve @curr when the @wakee has a shorter slice and earlier
+	 * deadline. IOW, explicitly allow preemption.
+	 */
+	if (sched_feat(PREEMPT_SHORT) && wakee &&
+	    wakee->slice < curr->slice &&
+	    (s64)(wakee->deadline - curr->deadline) < 0)
+		return false;
+
+	/*
+	 * Preserve @curr to allow it to finish its first slice.
+	 * See the HACK in set_next_entity().
+	 */
+	return curr->vlag == curr->deadline;
+}
+
 /*
  * Earliest Eligible Virtual Deadline First
  *
@@ -874,28 +907,27 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  *
  * Which allows tree pruning through eligibility.
  */
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *wakee)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
 	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *curr = cfs_rq->curr;
 	struct sched_entity *best = NULL;
 
+	if (curr && !curr->on_rq)
+		curr = NULL;
+
 	/*
 	 * We can safely skip eligibility check if there is only one entity
 	 * in this cfs_rq, saving some cycles.
 	 */
 	if (cfs_rq->nr_running == 1)
-		return curr && curr->on_rq ? curr : se;
-
-	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
-		curr = NULL;
+		return curr ?: se;
 
 	/*
-	 * Once selected, run a task until it either becomes non-eligible or
-	 * until it gets a new slice. See the HACK in set_next_entity().
+	 * Preserve @curr to let it finish its slice.
 	 */
-	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+	if (pick_curr(cfs_rq, curr, wakee))
 		return curr;
 
 	/* Pick the leftmost entity if it's eligible */
@@ -5507,7 +5539,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
 		return cfs_rq->next;
 	}
 
-	struct sched_entity *se = pick_eevdf(cfs_rq);
+	struct sched_entity *se = pick_eevdf(cfs_rq, NULL);
 	if (se->sched_delayed) {
 		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
 		SCHED_WARN_ON(se->sched_delayed);
@@ -8548,15 +8580,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 	cfs_rq = cfs_rq_of(se);
 	update_curr(cfs_rq);
 
-	if (sched_feat(PREEMPT_SHORT) && pse->slice < se->slice &&
-	    entity_eligible(cfs_rq, pse) &&
-	    (s64)(pse->deadline - se->deadline) < 0 &&
-	    se->vlag == se->deadline) {
-		/* negate RUN_TO_PARITY */
-		se->vlag = se->deadline - 1;
-	}
-
-	if (pick_eevdf(cfs_rq) == pse)
+	if (pick_eevdf(cfs_rq, pse) == pse)
 		goto preempt;
 
 	return;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 64ce99cf04ec..2285dc30294c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -10,12 +10,15 @@ SCHED_FEAT(PLACE_LAG, true)
  */
 SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
 /*
- * Inhibit (wakeup) preemption until the current task has either matched the
- * 0-lag point or until is has exhausted it's slice.
+ * Inhibit (wakeup) preemption until the current task has exhausted its slice.
  */
-SCHED_FEAT(RUN_TO_PARITY, true)
+SCHED_FEAT(RESPECT_SLICE, true)
 /*
- * Allow tasks with a shorter slice to disregard RUN_TO_PARITY
+ * Relax RESPECT_SLICE to allow preemption once current has reached 0-lag.
+ */
+SCHED_FEAT(RUN_TO_PARITY, false)
+/*
+ * Allow tasks with a shorter slice to disregard RESPECT_SLICE
+ */
 SCHED_FEAT(PREEMPT_SHORT, true)
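
Side note, not part of the patch: with the renamed features the different
behaviours should remain selectable at runtime through the usual
sched_features debugfs interface (assuming CONFIG_SCHED_DEBUG), e.g.:

  echo RUN_TO_PARITY > /sys/kernel/debug/sched/features     # allow preemption once curr is past 0-lag
  echo NO_RESPECT_SLICE > /sys/kernel/debug/sched/features  # no slice protection at all

which should make it easy to compare the DeathStarBench numbers across
these modes.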