Re: [PATCH v2] sched/fair: Fix insertion in rq->leaf_cfs_rq_list

From: Peter Zijlstra
Date: Wed Jan 30 2019 - 09:01:13 EST


On Wed, Jan 30, 2019 at 06:22:47AM +0100, Vincent Guittot wrote:
> Sargun reported a crash:
> "I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix
> infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
> and put it on top of 4.19.13. In addition to this, I uninlined
> list_add_leaf_cfs_rq for debugging.
>
> This revealed a new bug that we didn't get to because we kept getting
> crashes from the previous issue. When we are running with cgroups that
> are rapidly changing, with CFS bandwidth control, and in addition
> using the cpusets cgroup, we see this crash. Specifically, it seems to
> occur with cgroups that are throttled and we change the allowed
> cpuset."
>
> The algorithm used to order cfs_rq in rq->leaf_cfs_rq_list assumes that
> it will walk down to root the 1st time a cfs_rq is used and we will finish
> to add either a cfs_rq without parent or a cfs_rq with a parent that is
> already on the list. But this is not always true in presence of throttling.
> Because a cfs_rq can be throttled even if it has never been used but other CPUs
> of the cgroup have already used all the bandwdith, we are not sure to go down to
> the root and add all cfs_rq in the list.
>
> Ensure that all cfs_rq will be added in the list even if they are throttled.
>
> Reported-by: Sargun Dhillon <sargun@xxxxxxxxx>
> Fixes: 9c2791f936ef ("Fix hierarchical order in rq->leaf_cfs_rq_list")
> Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>

Given my previous patch; how about something like so instead?

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -282,13 +282,15 @@ static inline struct cfs_rq *group_cfs_r
return grp->my_q;
}

-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);

if (cfs_rq->on_list)
- return;
+ return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
+
+ cfs_rq->on_list = 1;

/*
* Ensure we either appear before our parent (if already
@@ -315,7 +317,10 @@ static inline void list_add_leaf_cfs_rq(
* list.
*/
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else if (!cfs_rq->tg->parent) {
+ return true;
+ }
+
+ if (!cfs_rq->tg->parent) {
/*
* cfs rq without parent should be put
* at the tail of the list.
@@ -327,23 +332,22 @@ static inline void list_add_leaf_cfs_rq(
* tmp_alone_branch to the beginning of the list.
*/
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else {
- /*
- * The parent has not already been added so we want to
- * make sure that it will be put after us.
- * tmp_alone_branch points to the begin of the branch
- * where we will add parent.
- */
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- rq->tmp_alone_branch);
- /*
- * update tmp_alone_branch to points to the new begin
- * of the branch
- */
- rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+ return true;
}

- cfs_rq->on_list = 1;
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the begin of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to points to the new begin
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+ return false;
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4999,6 +5003,12 @@ static void __maybe_unused unthrottle_of
}

#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return false;
+}
+
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
return rq_clock_task(rq_of(cfs_rq));
@@ -5190,6 +5200,21 @@ enqueue_task_fair(struct rq *rq, struct

}

+ if (cfs_bandwidth_used()) {
+ /*
+ * When bandwidth control is enabled; the cfs_rq_throttled()
+ * breaks in the above iteration can result in incomplete
+ * leaf list maintenance, resulting in triggering the assertion
+ * below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
+ }
+ }
+
assert_list_leaf_cfs_rq(rq);

hrtick_update(rq);