Re: [RFC PATCH v7 11/23] sched/fair: core wide cfs task priority comparison

From: Joel Fernandes
Date: Tue Sep 22 2020 - 21:46:29 EST


On Fri, Aug 28, 2020 at 11:29:27PM +0200, Peter Zijlstra wrote:
>
>
> This is still a horrible patch..

Hi Peter,
I wrote a new patch similar to this one and it fares much better in my tests.
It is based on Aaron's idea, but I do the sync only during force-idle, and not
during enqueue. Also I yanked the whole 'core wide min_vruntime' crap. There
is a regressing test which improves quite a bit with my patch (results below):

Aaron, Vineeth, Chris, any other thoughts? This patch is based on Google's
4.19 device kernel so will require some massaging to apply to mainline/v7
series. I will provide an updated patch later based on v7 series.

(Works only for SMT2, maybe we can generalize it more..)
--------8<-----------

From: "Joel Fernandes (Google)" <joel@xxxxxxxxxxxxxxxxx>
Subject: [PATCH] sched: Sync the min_vruntime of cores when the system enters
force-idle

This patch provides a vruntime based way to compare the priority of two cfs
tasks, be they on the same cpu or on different threads of the same core.

It is based on Aaron Lu's patch with some important differences. Namely,
the vruntime is sync'ed only when the CPU goes into force-idle. Also I removed
the notion of core-wide min_vruntime.

Also, I don't care how long a cpu in a core is force idled; I do the sync
whenever the force idle starts, essentially bringing both SMT siblings to a
common time base. After that point, selection can happen as usual.
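
To make that concrete, here is a rough userspace sketch of the root-level
bookkeeping (toy types and made-up numbers, not kernel code; the real
update_core_cfs_rq_min_vruntime()/coresched_adjust_vruntime() below also
shift the queued entities):

#include <stdio.h>

struct toy_rq {
        unsigned long long min_vruntime;
};

/* Advance whichever sibling is behind; vruntime never moves backwards. */
static void toy_sync_core(struct toy_rq *a, struct toy_rq *b)
{
        if (a->min_vruntime < b->min_vruntime)
                a->min_vruntime += b->min_vruntime - a->min_vruntime;
        else
                b->min_vruntime += a->min_vruntime - b->min_vruntime;
}

int main(void)
{
        struct toy_rq cpu0 = { .min_vruntime = 1000 };
        struct toy_rq cpu1 = { .min_vruntime = 400 };

        toy_sync_core(&cpu0, &cpu1);    /* force idle started on this core */
        printf("cpu0=%llu cpu1=%llu\n", cpu0.min_vruntime, cpu1.min_vruntime);
        return 0;
}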

When running an Android audio test without this patch and with Aubrey's
initial patch (in the v5 series), the perf sched latency output:

-----------------------------------------------------------------------------------------------------------------
  Task                 |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |
-----------------------------------------------------------------------------------------------------------------
  FinalizerDaemon:(2)  |     23.969 ms |      969 | avg:    0.504 ms | max:  162.020 ms | max at:   1294.327339 s
  HeapTaskDaemon:(3)   |   2421.287 ms |     4733 | avg:    0.131 ms | max:   96.229 ms | max at:   1302.343366 s
  adbd:(3)             |      6.101 ms |       79 | avg:    1.105 ms | max:   84.923 ms | max at:   1294.431284 s

With this patch, the max delay looks much better:

-----------------------------------------------------------------------------------------------------------------
  Task                 |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |
-----------------------------------------------------------------------------------------------------------------
  HeapTaskDaemon:(2)   |   2602.109 ms |     4025 | avg:    0.231 ms | max:   19.152 ms | max at:    522.903934 s
  surfaceflinger:7478  |     18.994 ms |     1206 | avg:    0.189 ms | max:   17.375 ms | max at:    520.523061 s
  ksoftirqd/3:30       |      0.093 ms |        5 | avg:    3.328 ms | max:   16.567 ms | max at:    522.903871 s

The comparison bits are the same as with Aaron's patch. When the two tasks
are on the same CPU, we just need to find a common cfs_rq both sched_entities
are on and then do the comparison.
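
A rough standalone sketch of that walk (toy types only; this is roughly what
the kernel's find_matching_se() style climbing boils down to):

#include <stdbool.h>
#include <stdio.h>

struct toy_se {
        unsigned long long vruntime;
        int depth;
        struct toy_se *parent;          /* NULL at the root */
};

/* Returns true when 'a' has consumed more vruntime, i.e. 'b' should win. */
static bool toy_prio_less_samecpu(struct toy_se *a, struct toy_se *b)
{
        /* First bring both entities to the same depth... */
        while (a->depth > b->depth)
                a = a->parent;
        while (b->depth > a->depth)
                b = b->parent;
        /* ...then climb until both hang off the same cfs_rq. */
        while (a->parent != b->parent) {
                a = a->parent;
                b = b->parent;
        }
        return (long long)(a->vruntime - b->vruntime) > 0;
}

int main(void)
{
        struct toy_se root_a = { .vruntime = 100, .depth = 0, .parent = NULL };
        struct toy_se root_b = { .vruntime = 200, .depth = 0, .parent = NULL };
        struct toy_se a = { .vruntime = 1000, .depth = 1, .parent = &root_a };
        struct toy_se b = { .vruntime = 900, .depth = 1, .parent = &root_b };

        /* Decided at the common level: 100 < 200, so A wins (prints 0). */
        printf("%d\n", toy_prio_less_samecpu(&a, &b));
        return 0;
}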

When the two tasks are on different threads of the same core, the root
level sched_entities to which the two tasks belong will be used to do
the comparison.

An ugly illustration for the cross CPU case:

          cpu0                cpu1
        /  |   \            /  |   \
     se1  se2   se3      se4  se5   se6
         /   \                     /   \
      se21   se22               se61   se62

Assume CPU0 and CPU1 are smt siblings and task A's se is se21 while
task B's se is se61. To compare the priority of tasks A and B, we compare
the priority of se2 and se6: whichever has the smaller vruntime wins.
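
A worked example with made-up numbers, matching the delta computation that
cfs_prio_less() does below (each root se's vruntime is compared relative to
its own rq's min_vruntime):

#include <stdio.h>

int main(void)
{
        /* Root-level values for the two siblings (invented numbers). */
        unsigned long long rqa_min = 1000, sea_vruntime = 1300; /* task A, cpu0 */
        unsigned long long rqb_min = 2000, seb_vruntime = 2100; /* task B, cpu1 */

        /* Same formula as in cfs_prio_less(): > 0 means A is further past
         * its own base, so B is the higher priority pick. */
        long long delta = (long long)(sea_vruntime - seb_vruntime) +
                          (long long)(rqb_min - rqa_min);

        printf("delta=%lld, pick %s\n", delta, delta > 0 ? "B" : "A");
        return 0;
}

Here A is 300 past its base and B only 100 past its base, so delta comes out
to 200 and B gets picked.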

To make this work, the root level se should have a common cfs_rq min
vruntime, which I call the core cfs_rq min vruntime.

When we adjust the min_vruntime of rq->core, we need to propagate
that down the tree so as to not cause starvation of existing tasks
based on previous vruntime.
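
A toy sketch of that downward propagation (standalone C with invented
structures, simplified from coresched_adjust_vruntime() below; every queued
entity and every child cfs_rq moves by the same delta, so relative order
inside the runqueue is preserved):

#include <stdio.h>

#define TOY_MAX_CHILDREN 4

struct toy_cfs_rq;

struct toy_se {
        unsigned long long vruntime;
        struct toy_cfs_rq *my_q;        /* non-NULL for group entities */
};

struct toy_cfs_rq {
        unsigned long long min_vruntime;
        struct toy_se *child[TOY_MAX_CHILDREN];
        int nr;
};

/* Shift a whole hierarchy forward by 'delta'. */
static void toy_shift_down(struct toy_cfs_rq *cfs_rq, unsigned long long delta)
{
        int i;

        cfs_rq->min_vruntime += delta;
        for (i = 0; i < cfs_rq->nr; i++) {
                cfs_rq->child[i]->vruntime += delta;
                if (cfs_rq->child[i]->my_q)
                        toy_shift_down(cfs_rq->child[i]->my_q, delta);
        }
}

int main(void)
{
        struct toy_cfs_rq leaf = { .min_vruntime = 500, .nr = 0 };
        struct toy_se se1 = { .vruntime = 520, .my_q = &leaf };
        struct toy_se se2 = { .vruntime = 610, .my_q = NULL };
        struct toy_cfs_rq root = {
                .min_vruntime = 500,
                .child = { &se1, &se2 },
                .nr = 2,
        };

        toy_shift_down(&root, 100);
        printf("root=%llu se1=%llu leaf=%llu se2=%llu\n",
               root.min_vruntime, se1.vruntime, leaf.min_vruntime, se2.vruntime);
        return 0;
}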

Inspired-by: Aaron Lu <ziqian.lzq@xxxxxxxxxx>
Signed-off-by: Joel Fernandes <joel@xxxxxxxxxxxxxxxxx>
Change-Id: Ida0083a2382a37f768030112ddf43bdf024a84b3
---
kernel/sched/core.c | 17 +++++++++++++-
kernel/sched/fair.c | 54 +++++++++++++++++++++++---------------------
kernel/sched/sched.h | 2 ++
3 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aeaf72acf063..d6f25fdd78ee 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4068,6 +4068,7 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
 static struct task_struct *
 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
+        struct cfs_rq *cfs_rq[2];
         struct task_struct *next, *max = NULL;
         const struct sched_class *class;
         const struct cpumask *smt_mask;
@@ -4247,6 +4248,7 @@ next_class:;
          * their task. This ensures there is no inter-sibling overlap between
          * non-matching user state.
          */
+        need_sync = false;
         for_each_cpu(i, smt_mask) {
                 struct rq *rq_i = cpu_rq(i);

@@ -4255,8 +4257,10 @@ next_class:;

                 WARN_ON_ONCE(!rq_i->core_pick);

-                if (is_idle_task(rq_i->core_pick) && rq_i->nr_running)
+                if (is_idle_task(rq_i->core_pick) && rq_i->nr_running) {
                         rq_i->core_forceidle = true;
+                        need_sync = true;
+                }

                 rq_i->core_pick->core_occupation = occ;

@@ -4270,6 +4274,17 @@ next_class:;
                 WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
         }

+        /* Something got force idled, sync the vruntimes. Works only for SMT2. */
+        if (need_sync) {
+                j = 0;
+                for_each_cpu(i, smt_mask) {
+                        struct rq *rq_i = cpu_rq(i);
+                        cfs_rq[j++] = &rq_i->cfs;
+                }
+
+                if (j == 2)
+                        update_core_cfs_rq_min_vruntime(cfs_rq[0], cfs_rq[1]);
+        }
 done:
         set_next_task(rq, next);
         return next;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2ca3f6173c52..5bd220751d7a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -479,13 +479,6 @@ static inline struct cfs_rq *core_cfs_rq(struct cfs_rq *cfs_rq)

 static inline u64 cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
 {
-        if (!sched_core_enabled(rq_of(cfs_rq)))
-                return cfs_rq->min_vruntime;
-
-#ifdef CONFIG_SCHED_CORE
-        if (is_root_cfs_rq(cfs_rq))
-                return core_cfs_rq(cfs_rq)->min_vruntime;
-#endif
         return cfs_rq->min_vruntime;
 }

@@ -497,40 +490,47 @@ static void coresched_adjust_vruntime(struct cfs_rq *cfs_rq, u64 delta)
         if (!cfs_rq)
                 return;

-        cfs_rq->min_vruntime -= delta;
+        /*
+         * vruntime only goes forward. This can cause sleepers to be boosted
+         * but that's Ok for now.
+         */
+        cfs_rq->min_vruntime += delta;
         rbtree_postorder_for_each_entry_safe(se, next,
                         &cfs_rq->tasks_timeline.rb_root, run_node) {
                 if (se->vruntime > delta)
-                        se->vruntime -= delta;
+                        se->vruntime += delta;
                 if (se->my_q)
                         coresched_adjust_vruntime(se->my_q, delta);
         }
 }

-static void update_core_cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
+void update_core_cfs_rq_min_vruntime(struct cfs_rq *cfs_rqa,
+                                     struct cfs_rq *cfs_rqb)
 {
-        struct cfs_rq *cfs_rq_core;
-
-        if (!sched_core_enabled(rq_of(cfs_rq)))
-                return;
+        u64 delta;

-        if (!is_root_cfs_rq(cfs_rq))
+        if (!is_root_cfs_rq(cfs_rqa) || !is_root_cfs_rq(cfs_rqb))
                 return;

-        cfs_rq_core = core_cfs_rq(cfs_rq);
-        if (cfs_rq_core != cfs_rq &&
-            cfs_rq->min_vruntime < cfs_rq_core->min_vruntime) {
-                u64 delta = cfs_rq_core->min_vruntime - cfs_rq->min_vruntime;
-                coresched_adjust_vruntime(cfs_rq_core, delta);
+        if (cfs_rqa->min_vruntime < cfs_rqb->min_vruntime) {
+                delta = cfs_rqb->min_vruntime - cfs_rqa->min_vruntime;
+                /* Bring a upto speed with b. */
+                coresched_adjust_vruntime(cfs_rqa, delta);
+        } else if (cfs_rqb->min_vruntime < cfs_rqa->min_vruntime) {
+                u64 delta = cfs_rqa->min_vruntime - cfs_rqb->min_vruntime;
+                /* Bring b upto speed with a. */
+                coresched_adjust_vruntime(cfs_rqb, delta);
         }
 }
 #endif

 bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
 {
+        struct cfs_rq *cfs_rqa = &cpu_rq(task_cpu(a))->cfs;
+        struct cfs_rq *cfs_rqb = &cpu_rq(task_cpu(b))->cfs;
+        bool samecpu = task_cpu(a) == task_cpu(b);
         struct sched_entity *sea = &a->se;
         struct sched_entity *seb = &b->se;
-        bool samecpu = task_cpu(a) == task_cpu(b);
         struct task_struct *p;
         s64 delta;

@@ -555,7 +555,13 @@ bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
                 sea = sea->parent;
         while (seb->parent)
                 seb = seb->parent;
-        delta = (s64)(sea->vruntime - seb->vruntime);
+
+        WARN_ON_ONCE(sea->vruntime < cfs_rqa->min_vruntime);
+        WARN_ON_ONCE(seb->vruntime < cfs_rqb->min_vruntime);
+
+        /* normalize vruntime WRT their rq's base */
+        delta = (s64)(sea->vruntime - seb->vruntime) +
+                (s64)(cfs_rqb->min_vruntime - cfs_rqa->min_vruntime);

 out:
         p = delta > 0 ? b : a;
@@ -624,10 +630,6 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
         /* ensure we never gain time by being placed backwards. */
         cfs_rq->min_vruntime = max_vruntime(cfs_rq_min_vruntime(cfs_rq), vruntime);

-#ifdef CONFIG_SCHED_CORE
-        update_core_cfs_rq_min_vruntime(cfs_rq);
-#endif
-
 #ifndef CONFIG_64BIT
         smp_wmb();
         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d09cfbd746e5..8559d0ee157f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2549,6 +2549,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
 #endif

 bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
+void update_core_cfs_rq_min_vruntime(struct cfs_rq *cfs_rqa,
+                                     struct cfs_rq *cfs_rqb);

 #ifdef CONFIG_SMP
 extern struct static_key_false sched_energy_present;
--
2.28.0.681.g6f77f65b4e-goog