[RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
From: Aaron Lu
Date: Thu Mar 13 2025 - 03:22:16 EST
From: Valentin Schneider <vschneid@xxxxxxxxxx>
Once a cfs_rq gets throttled, add a task work to every task belonging to
it so that the actual throttle/dequeue happens when those tasks return
to userspace.
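For reference, this builds on the generic task_work machinery: a
callback_head is queued on the task with TWA_RESUME and the callback then
runs in that task's context on its next return to userspace. A minimal,
self-contained sketch of that pattern, using hypothetical foo_* names (the
patch itself embeds the callback_head as p->sched_throttle_work and uses
throttle_cfs_rq_work() as the callback, see below):

	#include <linux/sched.h>
	#include <linux/slab.h>
	#include <linux/task_work.h>

	struct foo_ctx {
		struct callback_head work;
		/* per-task state to act on at return-to-user time */
	};

	static void foo_work_fn(struct callback_head *work)
	{
		struct foo_ctx *ctx = container_of(work, struct foo_ctx, work);

		/* Runs in the task's own context, just before it returns to userspace. */
		kfree(ctx);
	}

	static int foo_defer_to_user_return(struct task_struct *p)
	{
		struct foo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

		if (!ctx)
			return -ENOMEM;

		init_task_work(&ctx->work, foo_work_fn);
		/* TWA_RESUME: run foo_work_fn() on @p's next return to userspace. */
		return task_work_add(p, &ctx->work, TWA_RESUME);
	}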
Note that since the throttle/dequeue now always happens on a per-task
basis when the task returns to userspace, check_cfs_rq_runtime() no
longer needs to return a value for pick_task_fair() to act on, so it is
changed to return void.
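To make that concrete, a simplified paraphrase of the pick_task_fair()
call site (the full hunk is further down in this patch):

	/* Before: a true return meant the cfs_rq was just throttled and
	 * dequeued, so the pick had to be restarted. */
	if (unlikely(check_cfs_rq_runtime(cfs_rq)))
		goto again;

	/* After: throttling is deferred to each task's return to userspace,
	 * the cfs_rq stays queued and the pick simply continues. */
	check_cfs_rq_runtime(cfs_rq);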
[aaronlu: extracted from Valentin's original patches.
Fixed a problem where curr is not in the timeline tree and has to be dealt
with explicitly;
Made check_cfs_rq_runtime() void.]
Signed-off-by: Valentin Schneider <vschneid@xxxxxxxxxx>
Signed-off-by: Aaron Lu <ziqianlu@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 201 ++++++++++++++++++++++++-------------------
kernel/sched/sched.h | 1 +
2 files changed, 112 insertions(+), 90 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 60eb5329bf526..ab403ff7d53c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5607,7 +5607,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
return se;
}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -5832,8 +5832,49 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static void throttle_cfs_rq_work(struct callback_head *work)
{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ struct rq_flags rf;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ rq = task_rq_lock(p, &rf);
+
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ goto out_unlock;
+
+ /*
+ * If not in limbo, then either replenish has happened or this task got
+ * migrated out of the throttled cfs_rq, move along
+ */
+ if (!cfs_rq->throttle_count)
+ goto out_unlock;
+
+ update_rq_clock(rq);
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+ resched_curr(rq);
+
+out_unlock:
+ task_rq_unlock(rq, p, &rf);
}
void init_cfs_throttle_work(struct task_struct *p)
@@ -5873,32 +5914,81 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ if (task_has_throttle_work(p))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p;
+ struct rb_node *node;
+
+ cfs_rq->throttle_count++;
+ if (cfs_rq->throttle_count > 1)
+ return 0;
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
- SCHED_WARN_ON(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
+ SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_queued)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
+ /*
+ * rq_lock is held, current is (obviously) executing this in kernelspace.
+ *
+ * All other tasks enqueued on this rq have their saved PC at the
+ * context switch, so they will go through the kernel before returning
+ * to userspace. Thus, there are no tasks-in-userspace to handle, just
+ * install the task_work on all of them.
+ */
+ node = rb_first(&cfs_rq->tasks_timeline.rb_root);
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ if (!entity_is_task(se))
+ goto next;
+
+ p = task_of(se);
+ task_throttle_setup_work(p);
+next:
+ node = rb_next(node);
+ }
+
+ /* curr is not in the timeline tree */
+ if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
+ p = task_of(cfs_rq->curr);
+ task_throttle_setup_work(p);
}
- cfs_rq->throttle_count++;
return 0;
}
-static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta, dequeue = 1;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5919,74 +6009,13 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_b->lock);
if (!dequeue)
- return false; /* Throttle no longer required. */
-
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ return; /* Throttle no longer required. */
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, queued_delta);
-
- /* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
- dl_server_stop(&rq->fair_server);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5995,7 +6024,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
SCHED_WARN_ON(cfs_rq->throttled_clock);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
- return true;
+ return;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -6471,22 +6500,22 @@ static void sync_throttle(struct task_group *tg, int cpu)
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return false;
+ return;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return false;
+ return;
/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return true;
+ return;
- return throttle_cfs_rq(cfs_rq);
+ throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -6582,6 +6611,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -7117,10 +7147,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
-
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
slice = cfs_rq_min_slice(cfs_rq);
@@ -7157,10 +7183,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
}
sub_nr_running(rq, h_nr_queued);
@@ -8869,8 +8891,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8bfa3d708081..5c2af5a70163c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -742,6 +742,7 @@ struct cfs_rq {
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
+ struct list_head throttled_limbo_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
--
2.39.5