[RFC PATCH v2 3/7] sched/fair: Handle unthrottle path for task based throttle

From: Aaron Lu
Date: Wed Apr 09 2025 - 08:12:45 EST


From: Valentin Schneider <vschneid@xxxxxxxxxx>

On unthrottle, enqueue throttled tasks back so they can continue to run.

Note that with this task based throttling, the only place a task gets
throttled is when it returns to user space, so as long as a task is
enqueued, it is allowed to run until it reaches that throttle point,
no matter whether its cfs_rq is throttled or not.
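
As an aside, the core idea can be illustrated with a minimal, standalone
C sketch (not kernel code; the list helpers and the task/cfs_rq structs
below are simplified stand-ins for the kernel list API, p->throttle_node
and cfs_rq->throttled_limbo_list): a task hitting the throttle point is
parked on its cfs_rq's limbo list, and unthrottle simply walks that list
and puts every parked task back.

#include <stdio.h>
#include <stddef.h>

/* Simplified stand-in for the kernel's doubly linked list API. */
struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	INIT_LIST_HEAD(n);
}

struct task { const char *name; struct list_head throttle_node; };
struct cfs_rq { struct list_head throttled_limbo_list; };

/* Same test as the patch: on the limbo list <=> throttled. */
static int task_is_throttled(struct task *p)
{
	return !list_empty(&p->throttle_node);
}

/* Throttle point (return to user space): park the task on the limbo list. */
static void throttle_task(struct cfs_rq *cfs_rq, struct task *p)
{
	list_add_tail(&p->throttle_node, &cfs_rq->throttled_limbo_list);
}

/* Unthrottle: re-enqueue every parked task, clearing its throttle_node. */
static void unthrottle(struct cfs_rq *cfs_rq)
{
	while (!list_empty(&cfs_rq->throttled_limbo_list)) {
		struct list_head *node = cfs_rq->throttled_limbo_list.next;
		struct task *p = (struct task *)((char *)node -
				offsetof(struct task, throttle_node));

		list_del_init(&p->throttle_node);
		printf("re-enqueue %s, throttled=%d\n", p->name, task_is_throttled(p));
	}
}

int main(void)
{
	struct cfs_rq cfs_rq;
	struct task a = { .name = "a" }, b = { .name = "b" };

	INIT_LIST_HEAD(&cfs_rq.throttled_limbo_list);
	INIT_LIST_HEAD(&a.throttle_node);
	INIT_LIST_HEAD(&b.throttle_node);

	throttle_task(&cfs_rq, &a);
	throttle_task(&cfs_rq, &b);
	unthrottle(&cfs_rq);
	return 0;
}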

The leaf_cfs_rq list is handled differently now: as long as a task is
enqueued on a cfs_rq, throttled or not, that cfs_rq is added to the
list, and when a cfs_rq is throttled and all of its tasks have been
dequeued, it is removed from the list. I think this is easier to reason
about, so I chose to do it this way.
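
To make the rule concrete, here is a tiny standalone model (plain C, not
kernel code; nr_queued/throttled/on_leaf_list are simplified stand-ins
for the cfs_rq fields and for leaf_cfs_rq list membership) of when a
cfs_rq is on the leaf list under this scheme:

#include <stdbool.h>
#include <stdio.h>

struct cfs_rq_model {
	unsigned int nr_queued;	/* stand-in for cfs_rq->nr_queued */
	bool throttled;		/* stand-in for throttled_hierarchy() */
	bool on_leaf_list;	/* stand-in for leaf_cfs_rq list membership */
};

static void model_enqueue(struct cfs_rq_model *cfs_rq)
{
	cfs_rq->nr_queued++;
	/* First task enqueued: always added, throttled or not. */
	if (cfs_rq->nr_queued == 1)
		cfs_rq->on_leaf_list = true;
}

static void model_dequeue(struct cfs_rq_model *cfs_rq)
{
	cfs_rq->nr_queued--;
	/* Removed only once it is throttled and has no tasks left. */
	if (cfs_rq->throttled && !cfs_rq->nr_queued)
		cfs_rq->on_leaf_list = false;
}

int main(void)
{
	struct cfs_rq_model cfs_rq = { 0, false, false };

	model_enqueue(&cfs_rq);		/* first task: on the list */
	cfs_rq.throttled = true;	/* quota runs out */
	model_enqueue(&cfs_rq);		/* enqueue still allowed while throttled */
	model_dequeue(&cfs_rq);
	model_dequeue(&cfs_rq);		/* throttled and empty: off the list */
	printf("on_leaf_list=%d\n", cfs_rq.on_leaf_list);
	return 0;
}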

Signed-off-by: Valentin Schneider <vschneid@xxxxxxxxxx>
Signed-off-by: Aaron Lu <ziqianlu@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 129 ++++++++++++++++----------------------------
1 file changed, 45 insertions(+), 84 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c566a5a90d065..4152088fc0546 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5357,18 +5357,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
struct rq *rq = rq_of(cfs_rq);

if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
cfs_rq->throttled_clock = rq_clock(rq);
if (!cfs_rq->throttled_clock_self)
cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
}
+#endif
}
}

@@ -5826,6 +5825,11 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}

+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return !list_empty(&p->throttle_node);
+}
+
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static void throttle_cfs_rq_work(struct callback_head *work)
{
@@ -5878,32 +5882,41 @@ void init_cfs_throttle_work(struct task_struct *p)
INIT_LIST_HEAD(&p->throttle_node);
}

+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;

cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
- cfs_rq->throttled_clock_pelt;
+ if (cfs_rq->throttle_count)
+ return 0;

- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;

- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;

- cfs_rq->throttled_clock_self = 0;
+ cfs_rq->throttled_clock_self = 0;

- if (WARN_ON_ONCE((s64)delta < 0))
- delta = 0;
+ if (WARN_ON_ONCE((s64)delta < 0))
+ delta = 0;

- cfs_rq->throttled_clock_self_time += delta;
- }
+ cfs_rq->throttled_clock_self_time += delta;
+ }
+
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}

+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}

@@ -5996,11 +6009,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];

- se = cfs_rq->tg->se[cpu_of(rq)];
+ /*
+ * It's possible we are called with !runtime_remaining, e.g. because the
+ * user changed the quota setting (see tg_set_cfs_bandwidth()), or an
+ * async unthrottle gave us positive runtime_remaining but other still
+ * running entities consumed it before we got here.
+ *
+ * Either way, we can't unthrottle this cfs_rq without runtime remaining,
+ * because any enqueue below would immediately trigger a throttle, which
+ * is not supposed to happen on the unthrottle path.
+ */
+ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+ return;

cfs_rq->throttled = 0;

@@ -6028,62 +6050,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
}

- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);

/* Determine whether we need to wake up potentially idle CPU: */
@@ -6747,6 +6715,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void task_throttle_setup_work(struct task_struct *p) {}
+static bool task_is_throttled(struct task_struct *p) { return false; }

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
@@ -6955,6 +6924,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
util_est_enqueue(&rq->cfs, p);

if (flags & ENQUEUE_DELAYED) {
+ WARN_ON_ONCE(task_is_throttled(p));
requeue_delayed_entity(se);
return;
}
@@ -6997,10 +6967,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;

- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}

@@ -7022,10 +6988,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}

if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -7055,7 +7017,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);

-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);

hrtick_update(rq);
--
2.39.5