Re: [External] Re: [PATCH] sched/fair: optimize and simplify rq leaf_cfs_rq_list

From: Chengming Zhou
Date: Fri May 13 2022 - 00:32:26 EST


On 2022/5/12 22:02, Vincent Guittot wrote:
> Hi Chengming,
>
> Your patch is on my list but I have been busy on other stuff and we
> had enough warnings and problems with this part that I want to
> carefully review that all the cases are covered. I will review it soon
>

Hi Vincent,

Thanks for your reply. There's no rush, just take your time. I agree
that we should be very careful with this part due to previous problems.

Thanks.

> Vincent
>
> On Wed, 11 May 2022 at 14:05, Chengming Zhou
> <zhouchengming@xxxxxxxxxxxxx> wrote:
>>
>> Hello, friendly ping...
>>
>>
>> On 2022/4/28 00:05, Chengming Zhou wrote:
>>> We notice the rq leaf_cfs_rq_list has two problems when do bugfix
>>> backports and some test profiling.
>>>
>>> 1. cfs_rqs under throttled subtree could be added to the list, and
>>> make their fully decayed ancestors on the list, even though not needed.
>>>
>>> 2. #1 also make the leaf_cfs_rq_list management complex and error prone,
>>> this is the list of related bugfix so far:
>>>
>>> commit 31bc6aeaab1d ("sched/fair: Optimize update_blocked_averages()")
>>> commit fe61468b2cbc ("sched/fair: Fix enqueue_task_fair warning")
>>> commit b34cb07dde7c ("sched/fair: Fix enqueue_task_fair() warning some more")
>>> commit 39f23ce07b93 ("sched/fair: Fix unthrottle_cfs_rq() for leaf_cfs_rq list")
>>> commit 0258bdfaff5b ("sched/fair: Fix unfairness caused by missing load decay")
>>> commit a7b359fc6a37 ("sched/fair: Correctly insert cfs_rq's to list on unthrottle")
>>> commit fdaba61ef8a2 ("sched/fair: Ensure that the CFS parent is added after unthrottling")
>>> commit 2630cde26711 ("sched/fair: Add ancestors of unthrottled undecayed cfs_rq")
>>>
>>> commit 31bc6aeaab1d ("sched/fair: Optimize update_blocked_averages()")
>>> delete every cfs_rq under throttled subtree from rq->leaf_cfs_rq_list,
>>> and delete the throttled_hierarchy() test in update_blocked_averages(),
>>> which optimized update_blocked_averages().
>>>
>>> But those later bugfix add cfs_rqs under throttled subtree back to
>>> rq->leaf_cfs_rq_list again, with their fully decayed ancestors, for
>>> the integrity of rq->leaf_cfs_rq_list.
>>>
>>> This patch takes another method, skip all cfs_rqs under throttled
>>> hierarchy when list_add_leaf_cfs_rq(), to completely make cfs_rqs
>>> under throttled subtree off the leaf_cfs_rq_list.
>>>
>>> So we don't need to consider throttled related things in
>>> enqueue_entity(), unthrottle_cfs_rq() and enqueue_task_fair(),
>>> which simplify the code a lot. Also optimize update_blocked_averages()
>>> since cfs_rqs under throttled hierarchy and their ancestors
>>> won't be on the leaf_cfs_rq_list.
>>>
>>> Signed-off-by: Chengming Zhou <zhouchengming@xxxxxxxxxxxxx>
>>> ---
>>> kernel/sched/fair.c | 72 ++++++++++-----------------------------------
>>> 1 file changed, 16 insertions(+), 56 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 1ad18b5cc1b8..083c3d32c899 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -309,6 +309,8 @@ const struct sched_class fair_sched_class;
>>>
>>> #ifdef CONFIG_FAIR_GROUP_SCHED
>>>
>>> +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
>>> +
>>> /* Walk up scheduling entities hierarchy */
>>> #define for_each_sched_entity(se) \
>>> for (; se; se = se->parent)
>>> @@ -331,7 +333,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
>>> struct rq *rq = rq_of(cfs_rq);
>>> int cpu = cpu_of(rq);
>>>
>>> - if (cfs_rq->on_list)
>>> + if (cfs_rq->on_list || throttled_hierarchy(cfs_rq))
>>> return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
>>>
>>> cfs_rq->on_list = 1;
>>> @@ -3242,8 +3244,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
>>> }
>>> #endif /* CONFIG_SMP */
>>>
>>> -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
>>> -
>>> /*
>>> * Recomputes the group entity based on the current state of its group
>>> * runqueue.
>>> @@ -4356,16 +4356,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>>> __enqueue_entity(cfs_rq, se);
>>> se->on_rq = 1;
>>>
>>> - /*
>>> - * When bandwidth control is enabled, cfs might have been removed
>>> - * because of a parent been throttled but cfs->nr_running > 1. Try to
>>> - * add it unconditionally.
>>> - */
>>> - if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
>>> + if (cfs_rq->nr_running == 1) {
>>> list_add_leaf_cfs_rq(cfs_rq);
>>> -
>>> - if (cfs_rq->nr_running == 1)
>>> check_enqueue_throttle(cfs_rq);
>>> + }
>>> }
>>>
>>> static void __clear_buddies_last(struct sched_entity *se)
>>> @@ -4980,11 +4974,18 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>>> /* update hierarchical throttle state */
>>> walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
>>>
>>> - /* Nothing to run but something to decay (on_list)? Complete the branch */
>>> if (!cfs_rq->load.weight) {
>>> - if (cfs_rq->on_list)
>>> - goto unthrottle_throttle;
>>> - return;
>>> + if (!cfs_rq->on_list)
>>> + return;
>>> + /*
>>> + * Nothing to run but something to decay (on_list)?
>>> + * Complete the branch.
>>> + */
>>> + for_each_sched_entity(se) {
>>> + if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
>>> + break;
>>> + }
>>> + goto unthrottle_throttle;
>>> }
>>>
>>> task_delta = cfs_rq->h_nr_running;
>>> @@ -5022,31 +5023,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>>> /* end evaluation on encountering a throttled cfs_rq */
>>> if (cfs_rq_throttled(qcfs_rq))
>>> goto unthrottle_throttle;
>>> -
>>> - /*
>>> - * One parent has been throttled and cfs_rq removed from the
>>> - * list. Add it back to not break the leaf list.
>>> - */
>>> - if (throttled_hierarchy(qcfs_rq))
>>> - list_add_leaf_cfs_rq(qcfs_rq);
>>> }
>>>
>>> /* At this point se is NULL and we are at root level*/
>>> add_nr_running(rq, task_delta);
>>>
>>> unthrottle_throttle:
>>> - /*
>>> - * The cfs_rq_throttled() breaks in the above iteration can result in
>>> - * incomplete leaf list maintenance, resulting in triggering the
>>> - * assertion below.
>>> - */
>>> - for_each_sched_entity(se) {
>>> - struct cfs_rq *qcfs_rq = cfs_rq_of(se);
>>> -
>>> - if (list_add_leaf_cfs_rq(qcfs_rq))
>>> - break;
>>> - }
>>> -
>>> assert_list_leaf_cfs_rq(rq);
>>>
>>> /* Determine whether we need to wake up potentially idle CPU: */
>>> @@ -5701,13 +5683,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>> /* end evaluation on encountering a throttled cfs_rq */
>>> if (cfs_rq_throttled(cfs_rq))
>>> goto enqueue_throttle;
>>> -
>>> - /*
>>> - * One parent has been throttled and cfs_rq removed from the
>>> - * list. Add it back to not break the leaf list.
>>> - */
>>> - if (throttled_hierarchy(cfs_rq))
>>> - list_add_leaf_cfs_rq(cfs_rq);
>>> }
>>>
>>> /* At this point se is NULL and we are at root level*/
>>> @@ -5731,21 +5706,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>> update_overutilized_status(rq);
>>>
>>> enqueue_throttle:
>>> - if (cfs_bandwidth_used()) {
>>> - /*
>>> - * When bandwidth control is enabled; the cfs_rq_throttled()
>>> - * breaks in the above iteration can result in incomplete
>>> - * leaf list maintenance, resulting in triggering the assertion
>>> - * below.
>>> - */
>>> - for_each_sched_entity(se) {
>>> - cfs_rq = cfs_rq_of(se);
>>> -
>>> - if (list_add_leaf_cfs_rq(cfs_rq))
>>> - break;
>>> - }
>>> - }
>>> -
>>> assert_list_leaf_cfs_rq(rq);
>>>
>>> hrtick_update(rq);