Re: [PATCH v2] sched/fair: sanitize vruntime of entity being migrated

From: Dietmar Eggemann
Date: Mon Mar 13 2023 - 14:17:19 EST


On 13/03/2023 10:06, Dietmar Eggemann wrote:
> On 10/03/2023 15:29, Vincent Guittot wrote:
>> Le jeudi 09 mars 2023 � 16:14:38 (+0100), Vincent Guittot a �crit :
>>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>>>>
>>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
>>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:

[...]

> Looks to me that this patch brings back the old numbers:
>
> model name : Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz
>
> perf stat --null --repeat 10 -- perf bench sched messaging -g 50 -l 5000
>
> tip sched/core
>
> a2e90611b9f4 - sched/fair: Remove capacity inversion detection
> (2023-02-11 Vincent Guittot)
>
> 5.7295 +- 0.0219 seconds time elapsed ( +- 0.38% )
>
> 829c1651e9c4 - sched/fair: sanitize vruntime of entity being placed
> (2023-02-11 Zhang Qiao)
>
> 6.0961 +- 0.0297 seconds time elapsed ( +- 0.49% )
>
> this patch on top 829c1651e9c4
>
> 5.7165 +- 0.0231 seconds time elapsed ( +- 0.40% )
>
> [...]

Couldn't we not just defer setting `se->exec_start = 0` until the end of
place_entity() for ENQUEUE_MIGRATED instead to avoid this extra se flag
`migrated`?

-->8--

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0c70c558b12c..4df2b3e76b30 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -550,7 +550,6 @@ struct sched_entity {
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
- unsigned int migrated;

u64 exec_start;
u64 sum_exec_runtime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a8aa8cd3c745..365ee548e9f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1057,7 +1057,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* We are starting a new run period:
*/
- se->migrated = 0;
se->exec_start = rq_clock_task(rq_of(cfs_rq));
}

@@ -4649,8 +4648,8 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
#endif
}

-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ int flags, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
u64 sleep_time;
@@ -4705,6 +4704,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
se->vruntime = vruntime;
else
se->vruntime = max_vruntime(se->vruntime, vruntime);
+
+ if (flags & ENQUEUE_MIGRATED)
+ se->exec_start = 0;
}

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -4780,7 +4782,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
account_entity_enqueue(cfs_rq, se);

if (flags & ENQUEUE_WAKEUP)
- place_entity(cfs_rq, se, 0);
+ place_entity(cfs_rq, se, flags, 0);

check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags);
@@ -7668,9 +7670,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
/* Tell new CPU we are migrated */
se->avg.last_update_time = 0;

- /* We have migrated, no longer consider this task hot */
- se->migrated = 1;
-
update_scan_period(p, new_cpu);
}

@@ -8355,9 +8354,6 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (sysctl_sched_migration_cost == 0)
return 0;

- if (p->se.migrated)
- return 0;
-
delta = rq_clock_task(env->src_rq) - p->se.exec_start;

return delta < (s64)sysctl_sched_migration_cost;
@@ -11999,7 +11995,7 @@ static void task_fork_fair(struct task_struct *p)
update_curr(cfs_rq);
se->vruntime = curr->vruntime;
}
- place_entity(cfs_rq, se, 1);
+ place_entity(cfs_rq, se, 0, 1);

if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
/*
@@ -12144,7 +12140,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
*/
- place_entity(cfs_rq, se, 0);
+ place_entity(cfs_rq, se, 0, 0);
se->vruntime -= cfs_rq->min_vruntime;
}