Re: [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based throttle

From: K Prateek Nayak
Date: Thu Mar 13 2025 - 23:54:13 EST


Hello Aaron,

On 3/13/2025 12:51 PM, Aaron Lu wrote:

[..snip..]

---
kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
1 file changed, 45 insertions(+), 87 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab403ff7d53c8..4a95fe3785e43 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
struct rq *rq = rq_of(cfs_rq);

if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
cfs_rq->throttled_clock = rq_clock(rq);
if (!cfs_rq->throttled_clock_self)
cfs_rq->throttled_clock_self = rq_clock(rq);

These bits probably need revisiting. From what I understand, these
stats were maintained to know when a task was woken up on a throttled
hierarchy that was not connected to the parent, essentially tracking
the amount of time runnable tasks were waiting on the cfs_rq before an
unthrottle event allowed them to be picked.
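
Roughly, the window being measured (going by the existing code quoted
above, not something new in this patch) was:

    /* at enqueue of the first task on a throttled hierarchy */
    if (!cfs_rq->throttled_clock_self)
        cfs_rq->throttled_clock_self = rq_clock(rq);

    /* at unthrottle, in tg_unthrottle_up() */
    delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
    cfs_rq->throttled_clock_self = 0;
    cfs_rq->throttled_clock_self_time += delta;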

With per-task throttle, these semantics no longer apply since a woken
task will run and dequeue itself when exiting to userspace.
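
For reference, the flow I have in mind for the per-task model (just a
rough sketch from my reading of the series; exact flags and call sites
may differ) is:

    /* task hits the exit-to-user path while its cfs_rq is throttled */
    exit_to_user_mode_loop()
      task_work_run()
        throttle_cfs_rq_work()
          /* the task dequeues itself and parks on the limbo list */
          dequeue_task_fair(rq, p, DEQUEUE_SLEEP);
          list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);

    /* later, on unthrottle */
    tg_unthrottle_up()
      list_del_init(&p->throttle_node);
      enqueue_task_fair(rq, p, ENQUEUE_WAKEUP);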

Josh, do you have any thoughts on this?

-#endif
}
+#endif
}
}

@@ -5525,8 +5524,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);

- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+ if (throttled_hierarchy(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
+ }

return true;
}
@@ -5832,6 +5834,11 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}

+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return !list_empty(&p->throttle_node);
+}
+
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static void throttle_cfs_rq_work(struct callback_head *work)
{
@@ -5885,32 +5892,45 @@ void init_cfs_throttle_work(struct task_struct *p)
INIT_LIST_HEAD(&p->throttle_node);
}

+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;

cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
- cfs_rq->throttled_clock_pelt;
+ if (cfs_rq->throttle_count)
+ return 0;

- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;

- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;

- cfs_rq->throttled_clock_self = 0;
+ cfs_rq->throttled_clock_self = 0;

- if (SCHED_WARN_ON((s64)delta < 0))
- delta = 0;
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;

- cfs_rq->throttled_clock_self_time += delta;
- }
+ cfs_rq->throttled_clock_self_time += delta;
}

+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ /*
+ * FIXME: p may not be allowed to run on this rq anymore
+ * due to affinity change while p is throttled.
+ */

Using dequeue_task_fair() for throttle should ensure that the core now
sees task_on_rq_queued(), which should make an affinity change go
through a full dequeue cycle that removes the task from the
"throttled_limbo_list", and the subsequent enqueue should put it back
on the correct runqueue.

Is the above FIXME comment inaccurate with your changes, or did I miss
something?
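
IOW, my assumption is that an affinity change while p sits on the limbo
list ends up going through the usual move_queued_task()-style sequence
(a sketch, not code from this series):

    /* affinity-driven migration of a task parked on the limbo list */
    deactivate_task(rq, p, DEQUEUE_NOCLOCK);  /* full dequeue; expected to
                                                 drop p from the limbo list */
    set_task_cpu(p, new_cpu);
    activate_task(cpu_rq(new_cpu), p, 0);     /* re-enqueued on the new rq */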

+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+ }
+
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}

@@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group *tg, void *data)

/* group is entering throttled state, stop time */
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);

SCHED_WARN_ON(cfs_rq->throttled_clock_self);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);

+ if (!cfs_rq->nr_queued) {
+ list_del_leaf_cfs_rq(cfs_rq);
+ return 0;
+ }
+

This bit can perhaps go in Patch 2?

WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
/*
* rq_lock is held, current is (obviously) executing this in kernelspace.
@@ -6031,11 +6055,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
-
- se = cfs_rq->tg->se[cpu_of(rq)];
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];

cfs_rq->throttled = 0;


[..snip..]

--
Thanks and Regards,
Prateek