[PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue

From: Peter Zijlstra
Date: Sat Jul 27 2024 - 07:03:59 EST


When dequeue_task() is delayed, it becomes possible to exit a task (or
cgroup) whose entity is still enqueued. Ensure such entities are dequeued
before they are freed.

NOTE: switched_from_fair() clears sched_delayed only after the task has
already been enqueued in its new class, so a task that should have been
dequeued instead becomes runnable there and sees a spurious wakeup. This
*should* be harmless.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 61 ++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 48 insertions(+), 13 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8318,7 +8318,20 @@ static void migrate_task_rq_fair(struct

static void task_dead_fair(struct task_struct *p)
{
- remove_entity_load_avg(&p->se);
+ struct sched_entity *se = &p->se;
+
+ if (se->sched_delayed) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+ if (se->sched_delayed)
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ remove_entity_load_avg(se);
}

/*
@@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
+ /*
+ * Since this is called after changing class, this isn't quite right.
+ * Specifically, this causes the task to get queued in the target class
+ * and experience a 'spurious' wakeup.
+ *
+ * However, since 'spurious' wakeups are harmless, this shouldn't be a
+ * problem.
+ */
+ p->se.sched_delayed = 0;
+ /*
+ * While here, also clear the vlag, it makes little sense to carry that
+ * over the excursion into the new class.
+ */
+ p->se.vlag = 0;
}

static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ SCHED_WARN_ON(p->se.sched_delayed);
+
attach_task_cfs_rq(p);

set_task_max_allowed_capacity(p);
@@ -12971,28 +13000,33 @@ void online_fair_sched_group(struct task

void unregister_fair_sched_group(struct task_group *tg)
{
- unsigned long flags;
- struct rq *rq;
int cpu;

destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

for_each_possible_cpu(cpu) {
- if (tg->se[cpu])
- remove_entity_load_avg(tg->se[cpu]);
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ struct sched_entity *se = tg->se[cpu];
+ struct rq *rq = cpu_rq(cpu);
+
+ if (se) {
+ if (se->sched_delayed) {
+ guard(rq_lock_irqsave)(rq);
+ if (se->sched_delayed)
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ remove_entity_load_avg(se);
+ }

/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
*/
- if (!tg->cfs_rq[cpu]->on_list)
- continue;
-
- rq = cpu_rq(cpu);
-
- raw_spin_rq_lock_irqsave(rq, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_rq_unlock_irqrestore(rq, flags);
+ if (cfs_rq->on_list) {
+ guard(rq_lock_irqsave)(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
}
}