Re: [RFC PATCH 1/1] sched/eevdf: Skip eligibility check for current entity during wakeup preemption

From: K Prateek Nayak
Date: Wed Apr 24 2024 - 23:34:39 EST


Hello Peter,

On 4/24/2024 8:37 PM, Peter Zijlstra wrote:
> On Mon, Mar 25, 2024 at 11:32:26AM +0530, K Prateek Nayak wrote:
>> With the curr entity's eligibility check, a wakeup preemption is very
>> likely when an entity with positive lag joins the runqueue pushing the
>> avg_vruntime of the runqueue backwards, making the vruntime of the
>> current entity ineligible. This leads to aggressive wakeup preemption
>> which was previously guarded by wakeup_granularity_ns in legacy CFS.
>> Below figure depicts one such aggressive preemption scenario with EEVDF
>> in DeathStarBench [1]:
>>
>> deadline for Nginx
>> |
>> +-------+ | |
>> /-- | Nginx | -|------------------> |
>> | +-------+ | |
>> | |
>> | -----------|-------------------------------> vruntime timeline
>> | \--> rq->avg_vruntime
>> |
>> | wakes service on the same runqueue since system is busy
>> |
>> | +---------+|
>> \-->| Service || (service has +ve lag pushes avg_vruntime backwards)
>> +---------+|
>> | |
>> wakeup | +--|-----+ |
>> preempts \---->| N|ginx | --------------------> | {deadline for Nginx}
>> +--|-----+ |
>> (Nginx ineligible)
>> -----------|-------------------------------> vruntime timeline
>> \--> rq->avg_vruntime
>
> This graph is really hard to interpret. If you want to illustrate
> avg_vruntime moves back, you should not align it. That's really
> disorienting.
>
> In both (upper and lower) nginx has the same vruntime thus *that* should
> be aligned. The lower will have service placed left and with that
> avg_vruntime also moves left, rendering nginx in-eligible.

Sorry about that. I'll keep this in mind for future illustrations.

>
>
>> Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
>> ---
>> kernel/sched/fair.c | 24 ++++++++++++++++++++----
>> kernel/sched/features.h | 1 +
>> 2 files changed, 21 insertions(+), 4 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 6a16129f9a5c..a9b145a4eab0 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -875,7 +875,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
>> *
>> * Which allows tree pruning through eligibility.
>> */
>> -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
>> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool wakeup_preempt)
>> {
>> struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
>> struct sched_entity *se = __pick_first_entity(cfs_rq);
>> @@ -889,7 +889,23 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
>> if (cfs_rq->nr_running == 1)
>> return curr && curr->on_rq ? curr : se;
>>
>> - if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
>> + if (curr && !curr->on_rq)
>> + curr = NULL;
>> +
>> + /*
>> + * When an entity with positive lag wakes up, it pushes the
>> + * avg_vruntime of the runqueue backwards. This may cause the
>> + * current entity to be ineligible soon into its run leading to
>> + * wakeup preemption.
>> + *
>> + * To prevent such aggressive preemption of the current running
>> + * entity during task wakeups, skip the eligibility check if the
>> + * slice promised to the entity since its selection has not yet
>> + * elapsed.
>> + */
>> + if (curr &&
>> + !(sched_feat(RUN_TO_PARITY_WAKEUP) && wakeup_preempt && curr->vlag == curr->deadline) &&
>> + !entity_eligible(cfs_rq, curr))
>> curr = NULL;
>>
>> /*
>
> So I see what you want to do, but this is highly unreadable.
>
> I'll try something like the below on top of queue/sched/eevdf, but I
> should probably first look at fixing those reported crashes on that tree
> :/

I'll give this a try since Mike's suggestion seems to have fixed the
crash I was observing :) Thank you for suggesting this alternative.
--
Thanks and Regards,
Prateek

>
> ---
> kernel/sched/fair.c | 60 ++++++++++++++++++++++++++++++++++---------------
> kernel/sched/features.h | 11 +++++----
> 2 files changed, 49 insertions(+), 22 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 8a9206d8532f..23977ed1cb2c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -855,6 +855,39 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
> return __node_2_se(left);
> }
>
> +static inline bool pick_curr(struct cfs_rq *cfs_rq,
> + struct sched_entity *curr, struct sched_entity *wakee)
> +{
> + /*
> + * Nothing to preserve...
> + */
> + if (!curr || !sched_feat(RESPECT_SLICE))
> + return false;
> +
> + /*
> + * Allow preemption at the 0-lag point -- even if not all of the slice
> + * is consumed. Note: placement of positive lag can push V left and render
> + * @curr instantly ineligible irrespective the time on-cpu.
> + */
> + if (sched_feat(RUN_TO_PARITY) && !entity_eligible(cfs_rq, curr))
> + return false;
> +
> + /*
> + * Don't preserve @curr when the @wakee has a shorter slice and earlier
> + * deadline. IOW, explicitly allow preemption.
> + */
> + if (sched_feat(PREEMPT_SHORT) && wakee &&
> + wakee->slice < curr->slice &&
> + (s64)(wakee->deadline - curr->deadline) < 0)
> + return false;
> +
> + /*
> + * Preserve @curr to allow it to finish its first slice.
> + * See the HACK in set_next_entity().
> + */
> + return curr->vlag == curr->deadline;
> +}
> +
> /*
> * Earliest Eligible Virtual Deadline First
> *
> @@ -874,28 +907,27 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
> *
> * Which allows tree pruning through eligibility.
> */
> -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *wakee)
> {
> struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
> struct sched_entity *se = __pick_first_entity(cfs_rq);
> struct sched_entity *curr = cfs_rq->curr;
> struct sched_entity *best = NULL;
>
> + if (curr && !curr->on_rq)
> + curr = NULL;
> +
> /*
> * We can safely skip eligibility check if there is only one entity
> * in this cfs_rq, saving some cycles.
> */
> if (cfs_rq->nr_running == 1)
> - return curr && curr->on_rq ? curr : se;
> -
> - if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
> - curr = NULL;
> + return curr ?: se;
>
> /*
> - * Once selected, run a task until it either becomes non-eligible or
> - * until it gets a new slice. See the HACK in set_next_entity().
> + * Preserve @curr to let it finish its slice.
> */
> - if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
> + if (pick_curr(cfs_rq, curr, wakee))
> return curr;
>
> /* Pick the leftmost entity if it's eligible */
> @@ -5507,7 +5539,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
> return cfs_rq->next;
> }
>
> - struct sched_entity *se = pick_eevdf(cfs_rq);
> + struct sched_entity *se = pick_eevdf(cfs_rq, NULL);
> if (se->sched_delayed) {
> dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> SCHED_WARN_ON(se->sched_delayed);
> @@ -8548,15 +8580,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> cfs_rq = cfs_rq_of(se);
> update_curr(cfs_rq);
>
> - if (sched_feat(PREEMPT_SHORT) && pse->slice < se->slice &&
> - entity_eligible(cfs_rq, pse) &&
> - (s64)(pse->deadline - se->deadline) < 0 &&
> - se->vlag == se->deadline) {
> - /* negate RUN_TO_PARITY */
> - se->vlag = se->deadline - 1;
> - }
> -
> - if (pick_eevdf(cfs_rq) == pse)
> + if (pick_eevdf(cfs_rq, pse) == pse)
> goto preempt;
>
> return;
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 64ce99cf04ec..2285dc30294c 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -10,12 +10,15 @@ SCHED_FEAT(PLACE_LAG, true)
> */
> SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
> /*
> - * Inhibit (wakeup) preemption until the current task has either matched the
> - * 0-lag point or until is has exhausted it's slice.
> + * Inhibit (wakeup) preemption until the current task has exhausted its slice.
> */
> -SCHED_FEAT(RUN_TO_PARITY, true)
> +SCHED_FEAT(RESPECT_SLICE, true)
> /*
> - * Allow tasks with a shorter slice to disregard RUN_TO_PARITY
> + * Relax RESPECT_SLICE to allow preemption once current has reached 0-lag.
> + */
> +SCHED_FEAT(RUN_TO_PARITY, false)
> +/*
> + * Allow tasks with a shorter slice to disregard RESPECT_SLICE
> */
> SCHED_FEAT(PREEMPT_SHORT, true)
>