[PATCH v2] sched/fair: sanitize vruntime of entity being migrated

From: Zhang Qiao
Date: Mon Mar 06 2023 - 07:57:08 EST


Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
entity being placed") fix an overflowing bug, but ignore
a case that se->exec_start is reset after a migration.

To fix this case, reset the vruntime of a long-sleeping
task in migrate_task_rq_fair().
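
As a stand-alone illustration (not kernel code; the helper name
and values below are made up), the long-sleep detection relies on
se->exec_start still holding the time the task last ran, which
migration clears:

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000LL

	/* Mimics the patched place_entity() long-sleep test. */
	static int is_long_sleep(uint64_t now, uint64_t exec_start)
	{
		/* Zeroed by migration: sleep time is unknown here. */
		if (exec_start == 0)
			return 0;
		return (int64_t)(now - exec_start) > 60LL * NSEC_PER_SEC;
	}

	int main(void)
	{
		uint64_t now = 1000LL * NSEC_PER_SEC; /* rq clock: 1000s */
		uint64_t ran = 100LL * NSEC_PER_SEC;  /* last ran at 100s */

		/* Task stayed on its CPU: the 900s sleep is detected. */
		printf("same cpu: %d\n", is_long_sleep(now, ran)); /* 1 */

		/* Migrated task: exec_start was zeroed, so the same
		 * sleeper is missed at place_entity(), hence the extra
		 * check in migrate_task_rq_fair(). */
		printf("migrated: %d\n", is_long_sleep(now, 0));   /* 0 */
		return 0;
	}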

Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
Suggested-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Signed-off-by: Zhang Qiao <zhangqiao22@xxxxxxxxxx>
---

v1 -> v2:
- fix some typos and update comments
- reformat the patch

---
kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
1 file changed, 55 insertions(+), 21 deletions(-)
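
Why storing only a negated credit is enough at migration time, as
a stand-alone sketch (made-up values, not kernel code): enqueue
adds the destination cfs_rq->min_vruntime to the waking task's
vruntime, so the migrated long sleeper lands at min_vruntime minus
a sleeper credit, just as place_entity() would place it:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* Made-up values for illustration. */
		uint64_t dst_min_vruntime = 5000000000ULL;
		uint64_t credit = 12000000ULL; /* sched_sleeper_credit() */

		/* migrate_task_rq_fair(): store only "-credit" ... */
		uint64_t vruntime = 0 - credit; /* wraps, by design */

		/* ... enqueue adds the destination min_vruntime back. */
		vruntime += dst_min_vruntime;

		/* Prints 4988000000: just below dst_min_vruntime. */
		printf("%llu\n", (unsigned long long)vruntime);
		return 0;
	}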

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a1b1f855b96..74c9918ffe76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 #endif
 }
 
+static inline bool entity_is_long_sleep(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq;
+	u64 sleep_time;
+
+	if (se->exec_start == 0)
+		return false;
+
+	cfs_rq = cfs_rq_of(se);
+	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
+	if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+		return true;
+
+	return false;
+}
+
+static inline u64 sched_sleeper_credit(struct sched_entity *se)
+{
+	unsigned long thresh;
+
+	if (se_is_idle(se))
+		thresh = sysctl_sched_min_granularity;
+	else
+		thresh = sysctl_sched_latency;
+
+	/*
+	 * Halve their sleep time's effect, to allow
+	 * for a gentler effect of sleepers:
+	 */
+	if (sched_feat(GENTLE_FAIR_SLEEPERS))
+		thresh >>= 1;
+
+	return thresh;
+}
+
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime = cfs_rq->min_vruntime;
-	u64 sleep_time;
 
 	/*
 	 * The 'current' period is already promised to the current tasks,
@@ -4664,23 +4698,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 		vruntime += sched_vslice(cfs_rq, se);
 
 	/* sleeps up to a single latency don't count. */
-	if (!initial) {
-		unsigned long thresh;
-
-		if (se_is_idle(se))
-			thresh = sysctl_sched_min_granularity;
-		else
-			thresh = sysctl_sched_latency;
-
-		/*
-		 * Halve their sleep time's effect, to allow
-		 * for a gentler effect of sleepers:
-		 */
-		if (sched_feat(GENTLE_FAIR_SLEEPERS))
-			thresh >>= 1;
-
-		vruntime -= thresh;
-	}
+	if (!initial)
+		vruntime -= sched_sleeper_credit(se);
 
 	/*
 	 * Pull vruntime of the entity being placed to the base level of
@@ -4689,8 +4708,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	 * the base as it may be too far off and the comparison may get
 	 * inversed due to s64 overflow.
 	 */
-	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
-	if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+	if (entity_is_long_sleep(se))
 		se->vruntime = vruntime;
 	else
 		se->vruntime = max_vruntime(se->vruntime, vruntime);
@@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 	if (READ_ONCE(p->__state) == TASK_WAKING) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
+		/*
+		 * We detect a long sleep via se->exec_start and, if one is
+		 * found, sanitize the task's vruntime in place_entity().
+		 * However, after a migration this detection fails because
+		 * se->exec_start has been reset.
+		 *
+		 * To fix that case, do the same check here. A task that has
+		 * slept for a long time should have its vruntime reset to
+		 * cfs_rq->min_vruntime minus a sleeper credit. Since the
+		 * waking task's vruntime gets cfs_rq->min_vruntime added at
+		 * enqueue, it is enough to set se->vruntime to the negated
+		 * credit here.
+		 */
+		if (entity_is_long_sleep(se))
+			se->vruntime = -sched_sleeper_credit(se);
+		else
+			se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
 	}
 
 	if (!task_on_rq_migrating(p)) {
--
2.17.1