Re: [PATCH v2] sched/fair: sanitize vruntime of entity being migrated

From: Vincent Guittot
Date: Tue Mar 14 2023 - 03:41:46 EST


On Mon, 13 Mar 2023 at 19:17, Dietmar Eggemann <dietmar.eggemann@xxxxxxx> wrote:
>
> On 13/03/2023 10:06, Dietmar Eggemann wrote:
> > On 10/03/2023 15:29, Vincent Guittot wrote:
> >> Le jeudi 09 mars 2023 à 16:14:38 (+0100), Vincent Guittot a écrit :
> >>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> >>>>
> >>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
> >>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
>
> [...]
>
> > Looks to me that this patch brings back the old numbers:
> >
> > model name : Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz
> >
> > perf stat --null --repeat 10 -- perf bench sched messaging -g 50 -l 5000
> >
> > tip sched/core
> >
> > a2e90611b9f4 - sched/fair: Remove capacity inversion detection
> > (2023-02-11 Vincent Guittot)
> >
> > 5.7295 +- 0.0219 seconds time elapsed ( +- 0.38% )
> >
> > 829c1651e9c4 - sched/fair: sanitize vruntime of entity being placed
> > (2023-02-11 Zhang Qiao)
> >
> > 6.0961 +- 0.0297 seconds time elapsed ( +- 0.49% )
> >
> > this patch on top 829c1651e9c4
> >
> > 5.7165 +- 0.0231 seconds time elapsed ( +- 0.40% )
> >
> > [...]
>
> Couldn't we just defer setting `se->exec_start = 0` until the end of
> place_entity() for ENQUEUE_MIGRATED instead, to avoid this extra se flag
> `migrated`?

Yes, that's a good point.

I'm going to use something a bit different from your proposal below, by
merging the `initial` and `flags` parameters into a single argument:
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity
*se, int flags)

with flags:
0 for initial placement
ENQUEUE_WAKEUP for wakeup
ENQUEUE_MIGRATED for migrated task

>
> -->8--
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0c70c558b12c..4df2b3e76b30 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -550,7 +550,6 @@ struct sched_entity {
> struct rb_node run_node;
> struct list_head group_node;
> unsigned int on_rq;
> - unsigned int migrated;
>
> u64 exec_start;
> u64 sum_exec_runtime;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a8aa8cd3c745..365ee548e9f0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1057,7 +1057,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> /*
> * We are starting a new run period:
> */
> - se->migrated = 0;
> se->exec_start = rq_clock_task(rq_of(cfs_rq));
> }
>
> @@ -4649,8 +4648,8 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
> #endif
> }
>
> -static void
> -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> +static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
> + int flags, int initial)
> {
> u64 vruntime = cfs_rq->min_vruntime;
> u64 sleep_time;
> @@ -4705,6 +4704,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> se->vruntime = vruntime;
> else
> se->vruntime = max_vruntime(se->vruntime, vruntime);
> +
> + if (flags & ENQUEUE_MIGRATED)
> + se->exec_start = 0;
> }
>
> static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
> @@ -4780,7 +4782,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> account_entity_enqueue(cfs_rq, se);
>
> if (flags & ENQUEUE_WAKEUP)
> - place_entity(cfs_rq, se, 0);
> + place_entity(cfs_rq, se, flags, 0);
>
> check_schedstat_required();
> update_stats_enqueue_fair(cfs_rq, se, flags);
> @@ -7668,9 +7670,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> /* Tell new CPU we are migrated */
> se->avg.last_update_time = 0;
>
> - /* We have migrated, no longer consider this task hot */
> - se->migrated = 1;
> -
> update_scan_period(p, new_cpu);
> }
>
> @@ -8355,9 +8354,6 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> if (sysctl_sched_migration_cost == 0)
> return 0;
>
> - if (p->se.migrated)
> - return 0;
> -
> delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>
> return delta < (s64)sysctl_sched_migration_cost;
> @@ -11999,7 +11995,7 @@ static void task_fork_fair(struct task_struct *p)
> update_curr(cfs_rq);
> se->vruntime = curr->vruntime;
> }
> - place_entity(cfs_rq, se, 1);
> + place_entity(cfs_rq, se, 0, 1);
>
> if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
> /*
> @@ -12144,7 +12140,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
> * Fix up our vruntime so that the current sleep doesn't
> * cause 'unlimited' sleep bonus.
> */
> - place_entity(cfs_rq, se, 0);
> + place_entity(cfs_rq, se, 0, 0);
> se->vruntime -= cfs_rq->min_vruntime;
> }
>
>