[PATCH] sched/fair: optimize and simplify rq leaf_cfs_rq_list

From: Chengming Zhou
Date: Wed Apr 27 2022 - 12:09:16 EST


We notice the rq leaf_cfs_rq_list has two problems when do bugfix
backports and some test profiling.

1. cfs_rqs under throttled subtree could be added to the list, and
make their fully decayed ancestors on the list, even though not needed.

2. #1 also make the leaf_cfs_rq_list management complex and error prone,
this is the list of related bugfix so far:

commit 31bc6aeaab1d ("sched/fair: Optimize update_blocked_averages()")
commit fe61468b2cbc ("sched/fair: Fix enqueue_task_fair warning")
commit b34cb07dde7c ("sched/fair: Fix enqueue_task_fair() warning some more")
commit 39f23ce07b93 ("sched/fair: Fix unthrottle_cfs_rq() for leaf_cfs_rq list")
commit 0258bdfaff5b ("sched/fair: Fix unfairness caused by missing load decay")
commit a7b359fc6a37 ("sched/fair: Correctly insert cfs_rq's to list on unthrottle")
commit fdaba61ef8a2 ("sched/fair: Ensure that the CFS parent is added after unthrottling")
commit 2630cde26711 ("sched/fair: Add ancestors of unthrottled undecayed cfs_rq")

commit 31bc6aeaab1d ("sched/fair: Optimize update_blocked_averages()")
delete every cfs_rq under throttled subtree from rq->leaf_cfs_rq_list,
and delete the throttled_hierarchy() test in update_blocked_averages(),
which optimized update_blocked_averages().

But those later bugfix add cfs_rqs under throttled subtree back to
rq->leaf_cfs_rq_list again, with their fully decayed ancestors, for
the integrity of rq->leaf_cfs_rq_list.

This patch takes another method, skip all cfs_rqs under throttled
hierarchy when list_add_leaf_cfs_rq(), to completely make cfs_rqs
under throttled subtree off the leaf_cfs_rq_list.

So we don't need to consider throttled related things in
enqueue_entity(), unthrottle_cfs_rq() and enqueue_task_fair(),
which simplify the code a lot. Also optimize update_blocked_averages()
since cfs_rqs under throttled hierarchy and their ancestors
won't be on the leaf_cfs_rq_list.

Signed-off-by: Chengming Zhou <zhouchengming@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 72 ++++++++++-----------------------------------
1 file changed, 16 insertions(+), 56 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1ad18b5cc1b8..083c3d32c899 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -309,6 +309,8 @@ const struct sched_class fair_sched_class;

#ifdef CONFIG_FAIR_GROUP_SCHED

+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
@@ -331,7 +333,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);

- if (cfs_rq->on_list)
+ if (cfs_rq->on_list || throttled_hierarchy(cfs_rq))
return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;

cfs_rq->on_list = 1;
@@ -3242,8 +3244,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */

-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-
/*
* Recomputes the group entity based on the current state of its group
* runqueue.
@@ -4356,16 +4356,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;

- /*
- * When bandwidth control is enabled, cfs might have been removed
- * because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionally.
- */
- if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
+ if (cfs_rq->nr_running == 1) {
list_add_leaf_cfs_rq(cfs_rq);
-
- if (cfs_rq->nr_running == 1)
check_enqueue_throttle(cfs_rq);
+ }
}

static void __clear_buddies_last(struct sched_entity *se)
@@ -4980,11 +4974,18 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);

- /* Nothing to run but something to decay (on_list)? Complete the branch */
if (!cfs_rq->load.weight) {
- if (cfs_rq->on_list)
- goto unthrottle_throttle;
- return;
+ if (!cfs_rq->on_list)
+ return;
+ /*
+ * Nothing to run but something to decay (on_list)?
+ * Complete the branch.
+ */
+ for_each_sched_entity(se) {
+ if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
+ break;
+ }
+ goto unthrottle_throttle;
}

task_delta = cfs_rq->h_nr_running;
@@ -5022,31 +5023,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(qcfs_rq))
- list_add_leaf_cfs_rq(qcfs_rq);
}

/* At this point se is NULL and we are at root level*/
add_nr_running(rq, task_delta);

unthrottle_throttle:
- /*
- * The cfs_rq_throttled() breaks in the above iteration can result in
- * incomplete leaf list maintenance, resulting in triggering the
- * assertion below.
- */
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(qcfs_rq))
- break;
- }
-
assert_list_leaf_cfs_rq(rq);

/* Determine whether we need to wake up potentially idle CPU: */
@@ -5701,13 +5683,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
}

/* At this point se is NULL and we are at root level*/
@@ -5731,21 +5706,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_overutilized_status(rq);

enqueue_throttle:
- if (cfs_bandwidth_used()) {
- /*
- * When bandwidth control is enabled; the cfs_rq_throttled()
- * breaks in the above iteration can result in incomplete
- * leaf list maintenance, resulting in triggering the assertion
- * below.
- */
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(cfs_rq))
- break;
- }
- }
-
assert_list_leaf_cfs_rq(rq);

hrtick_update(rq);
--
2.35.2