[RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based throttle

From: Aaron Lu
Date: Thu Mar 13 2025 - 03:22:37 EST


From: Valentin Schneider <vschneid@xxxxxxxxxx>

On unthrottle, enqueue throttled tasks back so they can continue to run.

Note that with this task based throttling, the only place a task is
actually throttled is when it returns to user space, so as long as a task
is enqueued, it is allowed to run until it reaches that throttle point,
regardless of whether its cfs_rq is throttled.
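
The intended flow, roughly (a simplified sketch of the series' design, not
the literal kernel code; the throttle side lives in an earlier patch of
this series and its exact flags/ordering are omitted here):

	/* return to user space: per-task work throttles the task */
	throttle_cfs_rq_work():
		dequeue_task_fair(rq, p, flags);	/* flags as in the throttle patch */
		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);

	/* unthrottle (this patch): put the throttled tasks back */
	tg_unthrottle_up():
		list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
			list_del_init(&p->throttle_node);
			enqueue_task_fair(rq, p, ENQUEUE_WAKEUP);
		}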

The leaf_cfs_rq list is handled differently now: a cfs_rq is added to that
list as soon as a task is enqueued to it, whether the cfs_rq is throttled
or not, and it is removed from that list once the cfs_rq is throttled and
all of its tasks have been dequeued. I think this is easier to reason
about, so I chose to do it this way.
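
In code terms, the rule implemented in enqueue_entity()/dequeue_entity()
boils down to the following (simplified summary of the hunks below):

	/* enqueue: the first queued entity puts cfs_rq on the leaf list,
	 * throttled or not */
	if (cfs_rq->nr_queued == 1)
		list_add_leaf_cfs_rq(cfs_rq);

	/* dequeue: a throttled cfs_rq leaves the list once it is empty */
	if (cfs_rq->nr_queued == 0 && throttled_hierarchy(cfs_rq))
		list_del_leaf_cfs_rq(cfs_rq);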

[aaronlu: extracted from Valentin's original patches. I also changed the
implementation to use enqueue_task_fair() for queueing tasks back to the
unthrottled cfs_rq]
Signed-off-by: Valentin Schneider <vschneid@xxxxxxxxxx>
Signed-off-by: Aaron Lu <ziqianlu@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
1 file changed, 45 insertions(+), 87 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab403ff7d53c8..4a95fe3785e43 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
struct rq *rq = rq_of(cfs_rq);

if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
cfs_rq->throttled_clock = rq_clock(rq);
if (!cfs_rq->throttled_clock_self)
cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
}
+#endif
}
}

@@ -5525,8 +5524,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);

- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+ if (throttled_hierarchy(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
+ }

return true;
}
@@ -5832,6 +5834,11 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}

+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return !list_empty(&p->throttle_node);
+}
+
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static void throttle_cfs_rq_work(struct callback_head *work)
{
@@ -5885,32 +5892,45 @@ void init_cfs_throttle_work(struct task_struct *p)
INIT_LIST_HEAD(&p->throttle_node);
}

+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;

cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
- cfs_rq->throttled_clock_pelt;
+ if (cfs_rq->throttle_count)
+ return 0;

- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;

- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;

- cfs_rq->throttled_clock_self = 0;
+ cfs_rq->throttled_clock_self = 0;

- if (SCHED_WARN_ON((s64)delta < 0))
- delta = 0;
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;

- cfs_rq->throttled_clock_self_time += delta;
- }
+ cfs_rq->throttled_clock_self_time += delta;
}

+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ /*
+ * FIXME: p may not be allowed to run on this rq anymore
+ * due to affinity change while p is throttled.
+ */
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+ }
+
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}

@@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group *tg, void *data)

/* group is entering throttled state, stop time */
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);

SCHED_WARN_ON(cfs_rq->throttled_clock_self);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);

+ if (!cfs_rq->nr_queued) {
+ list_del_leaf_cfs_rq(cfs_rq);
+ return 0;
+ }
+
WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
/*
* rq_lock is held, current is (obviously) executing this in kernelspace.
@@ -6031,11 +6055,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
-
- se = cfs_rq->tg->se[cpu_of(rq)];
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];

cfs_rq->throttled = 0;

@@ -6063,62 +6083,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
}

- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);

/* Determine whether we need to wake up potentially idle CPU: */
@@ -6989,6 +6955,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
util_est_enqueue(&rq->cfs, p);

if (flags & ENQUEUE_DELAYED) {
+ SCHED_WARN_ON(task_is_throttled(p));
requeue_delayed_entity(se);
return;
}
@@ -7031,10 +6998,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;

- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}

@@ -7056,10 +7019,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}

if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -7089,7 +7048,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);

-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);

hrtick_update(rq);
--
2.39.5