[PATCH -tip 09/32] sched/fair: Snapshot the min_vruntime of CPUs on force idle

From: Joel Fernandes (Google)
Date: Tue Nov 17 2020 - 18:22:00 EST


During force idle, we end up doing a cross-CPU comparison of vruntimes
in pick_next_task(). If we simply compare (vruntime - min_vruntime)
across CPUs, and each CPU has only one task queued, we always end up
comparing 0 with 0 and picking the same task every time. This starves
the task that was not picked. To fix this, take a snapshot of
min_vruntime when entering force idle and use that snapshot for the
comparison. The snapshot is used only for cross-CPU vruntime
comparison, and nothing else.
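
To illustrate the arithmetic (not part of the patch), here is a toy,
self-contained C program modelling two siblings with one task each.
Only the min_vruntime_fi name mirrors the patch; the struct, fields and
values are invented purely for the example. With the live min_vruntime
the keys tie at 0 vs 0 forever; with the snapshot, the running task's
key keeps growing and the comparison eventually favours the other task.

	#include <stdio.h>
	#include <stdint.h>

	struct toy_cfs_rq {
		uint64_t min_vruntime;     /* advances with the lone task's vruntime */
		uint64_t min_vruntime_fi;  /* frozen when force idle starts */
		uint64_t task_vruntime;    /* the single queued task */
	};

	int main(void)
	{
		/* CPU0's task keeps running; CPU1's task is force idled. */
		struct toy_cfs_rq cpu0 = { 100, 100, 100 };
		struct toy_cfs_rq cpu1 = { 100, 100, 100 };

		for (int tick = 0; tick < 3; tick++) {
			/* Only CPU0's task runs, so only its vruntime and min_vruntime move. */
			cpu0.task_vruntime += 10;
			cpu0.min_vruntime += 10;

			/* Old comparison: both keys stay 0, the tie always breaks the same way. */
			int64_t key0_live = cpu0.task_vruntime - cpu0.min_vruntime;
			int64_t key1_live = cpu1.task_vruntime - cpu1.min_vruntime;

			/* Snapshot comparison: CPU0's key grows, so CPU1's task gets picked. */
			int64_t key0_fi = cpu0.task_vruntime - cpu0.min_vruntime_fi;
			int64_t key1_fi = cpu1.task_vruntime - cpu1.min_vruntime_fi;

			printf("tick %d: live %lld vs %lld, snapshot %lld vs %lld\n",
			       tick, (long long)key0_live, (long long)key1_live,
			       (long long)key0_fi, (long long)key1_fi);
		}
		return 0;
	}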

This resolves several performance issues seen in the ChromeOS audio
use case.

NOTE: This patch will be improved upon in a later patch. It is kept
here as the basis for that later patch and to make rebasing easier.
Keeping it separate also makes it easier to revert the improvement
should it cause any regression.

Tested-by: Julien Desfossez <jdesfossez@xxxxxxxxxxxxxxxx>
Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
---
kernel/sched/core.c | 33 ++++++++++++++++++++-------------
kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 5 +++++
3 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 52d0e83072a4..4ee4902c2cf5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -115,19 +115,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);

- if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */
- u64 vruntime = b->se.vruntime;
-
- /*
- * Normalize the vruntime if tasks are in different cpus.
- */
- if (task_cpu(a) != task_cpu(b)) {
- vruntime -= task_cfs_rq(b)->min_vruntime;
- vruntime += task_cfs_rq(a)->min_vruntime;
- }
-
- return !((s64)(a->se.vruntime - vruntime) <= 0);
- }
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return cfs_prio_less(a, b);

return false;
}
@@ -5144,6 +5133,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *max = NULL;
const struct sched_class *class;
const struct cpumask *smt_mask;
+ bool fi_before = false;
bool need_sync;
int i, j, cpu;

@@ -5208,6 +5198,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle) {
need_sync = true;
+ fi_before = true;
rq->core->core_forceidle = false;
}
for_each_cpu(i, smt_mask) {
@@ -5219,6 +5210,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
update_rq_clock(rq_i);
}

+ /* Reset the snapshot if core is no longer in force-idle. */
+ if (!fi_before) {
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+ rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+ }
+ }
+
/*
* Try and select tasks for each sibling in descending sched_class
* order.
@@ -5355,6 +5354,14 @@ next_class:;
resched_curr(rq_i);
}

+ /* Snapshot if core is in force-idle. */
+ if (!fi_before && rq->core->core_forceidle) {
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+ rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+ }
+ }
+
done:
set_next_task(rq, next);
return next;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 42965c4fd71f..de82f88ba98c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10726,6 +10726,46 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+ bool samecpu = task_cpu(a) == task_cpu(b);
+ struct sched_entity *sea = &a->se;
+ struct sched_entity *seb = &b->se;
+ struct cfs_rq *cfs_rqa;
+ struct cfs_rq *cfs_rqb;
+ s64 delta;
+
+ if (samecpu) {
+ /* vruntime is per cfs_rq */
+ while (!is_same_group(sea, seb)) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
+ }
+
+ delta = (s64)(sea->vruntime - seb->vruntime);
+ goto out;
+ }
+
+ /* crosscpu: compare root level se's vruntime to decide priority */
+ while (sea->parent)
+ sea = sea->parent;
+ while (seb->parent)
+ seb = seb->parent;
+
+ cfs_rqa = sea->cfs_rq;
+ cfs_rqb = seb->cfs_rq;
+
+ /* normalize vruntime WRT their rq's base */
+ delta = (s64)(sea->vruntime - seb->vruntime) +
+ (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+out:
+ return delta > 0;
+}
#else
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be656ca8693d..d934cc51acf1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -517,6 +517,9 @@ struct cfs_rq {

u64 exec_clock;
u64 min_vruntime;
+#ifdef CONFIG_SCHED_CORE
+ u64 min_vruntime_fi;
+#endif
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
@@ -1130,6 +1133,8 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
return &rq->__lock;
}

+bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
+
#else /* !CONFIG_SCHED_CORE */

static inline bool sched_core_enabled(struct rq *rq)
--
2.29.2.299.gdc1121823c-goog