[PATCH 5/5] sched/fair: Unify cfs_rq throttling via account_cfs_rq_runtime()

From: K Prateek Nayak

Date: Thu May 28 2026 - 06:04:16 EST

From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>

assign_cfs_rq_runtime() during update_curr() sets the resched indicator
and relies on check_cfs_rq_runtime() during pick_next_task() /
put_prev_entity() to throttle the hierarchy once the current task is
preempted / blocks.

Per-task throttle, on the other hand, uses throttle_cfs_rq() to simply
propagate the throttle signals, and then relies on task work to
individually throttle the runnable tasks on their way out to the
userspace.

Remove check_cfs_rq_runtime() and unify throttling into
account_cfs_rq_runtime(), which only sets the cfs_rq->throttled and
cfs_rq->throttle_count indicators via throttle_cfs_rq() and optionally
adds the task work to the current task (donor) if it is on the throttled
hierarchy.

throttle_cfs_rq() requests sched_cfs_bandwidth_slice() worth of
bandwidth for the current hierarchy, which enables it to continue running
uninterrupted when selected. For the rest, it requests a bare minimum of
"1" to ensure some bandwidth is available and the
"runtime_remaining > 0" checks pass once selected.

For SCHED_PROXY_EXEC, a mutex holder cannot exit to userspace without
dropping the mutex first, and mutex_unlock() ensures the proxy is stopped
before the mutex handoff. This preserves the current semantics of running
a throttled task until it exits to userspace, even if it acts as a
donor.

[ prateek: rebased on tip, comments, commit message. ]

Not-yet-signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
---
kernel/sched/fair.c | 110 ++++++++++++++++++++++----------------------
1 file changed, 55 insertions(+), 55 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c48eaf2d7919..a481647f0f0f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -525,7 +525,7 @@ static int se_is_idle(struct sched_entity *se)
#endif /* !CONFIG_FAIR_GROUP_SCHED */

static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -6359,8 +6359,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
return se;
}

-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
@@ -6370,9 +6368,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
if (prev->on_rq)
update_curr(cfs_rq);

- /* throttle cfs_rqs exceeding runtime */
- check_cfs_rq_runtime(cfs_rq);
-
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
@@ -6507,41 +6502,32 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
return cfs_rq->runtime_remaining > 0;
}

-/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
- guard(raw_spinlock)(&cfs_b->lock);
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq);

- return __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
-}
-
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+static bool __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;

if (likely(cfs_rq->runtime_remaining > 0))
- return;
+ return false;

if (cfs_rq->throttled)
- return;
+ return true;
/*
- * if we're unable to extend our runtime we resched so that the active
- * hierarchy can be throttled
+ * throttle_cfs_rq() will try to extend the runtime first
+ * before throttling the hierarchy.
*/
- if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_curr(rq_of(cfs_rq));
+ return throttle_cfs_rq(cfs_rq);
}

static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
- return;
+ return false;

- __account_cfs_rq_runtime(cfs_rq, delta_exec);
+ return __account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -6829,10 +6815,24 @@ static int tg_throttle_down(struct task_group *tg, void *data)

static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
- struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ struct sched_entity *curr = cfs_rq->curr;
+ struct rq *rq = rq_of(cfs_rq);

scoped_guard(raw_spinlock, &cfs_b->lock) {
+ u64 target_runtime = 1;
+
+ /*
+ * If cfs_rq->curr is still runnable, we are here from an
+ * update_curr(). Request sysctl_sched_cfs_bandwidth_slice
+ * worth of bandwidth to continue running.
+ *
+ * If the curr is not runnable, just request enough bandwidth
+ * to be runnable next time the pick selects this cfs_rq.
+ */
+ if (curr && curr->on_rq)
+ target_runtime = sched_cfs_bandwidth_slice();
+
/*
* Check if We have raced with bandwidth becoming available. If
* we actually throttled the timer might not unthrottle us for
@@ -6843,7 +6843,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
*
* This will start the period timer if necessary.
*/
- if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1))
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, target_runtime))
return false;

/*
@@ -6864,6 +6864,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
*/
cfs_rq->throttled = 1;
WARN_ON_ONCE(cfs_rq->throttled_clock);
+
+ /*
+ * If current hierarchy was throttled, add throttle work to the
+ * current donor. In case of proxy-execution, the execution
+ * context cannot exit to the userspace while holding a mutex
+ * and the rule of throttle deferral to only throttle the
+ * throttled context at exit to userspace is still preserved.
+ */
+ if (curr && curr->on_rq)
+ task_throttle_setup_work(rq->donor);
+
return true;
}

@@ -7245,7 +7256,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;

- /* an active group must be handled by the update_curr()->put() path */
+ /* an active group must be handled by the update_curr() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;

@@ -7255,8 +7266,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)

/* update runtime allocation */
account_cfs_rq_runtime(cfs_rq, 0);
- if (cfs_rq->runtime_remaining <= 0)
- throttle_cfs_rq(cfs_rq);
}

static void sync_throttle(struct task_group *tg, int cpu)
@@ -7286,25 +7295,6 @@ static void sync_throttle(struct task_group *tg, int cpu)
cfs_rq->pelt_clock_throttled = 1;
}

-/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- if (!cfs_bandwidth_used())
- return false;
-
- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return false;
-
- /*
- * it's possible for a throttled entity to be forced into a running
- * state (e.g. set_curr_task), in this case we're finished.
- */
- if (cfs_rq_throttled(cfs_rq))
- return true;
-
- return throttle_cfs_rq(cfs_rq);
-}
-
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -7559,8 +7549,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)

#else /* !CONFIG_CFS_BANDWIDTH: */

-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
+static bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -9893,8 +9882,15 @@ static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
/* Might not have done put_prev_entity() */
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
-
- throttled |= check_cfs_rq_runtime(cfs_rq);
+ /*
+ * For the current hierarchy, update_curr() above would
+ * have set the throttle indicators if the cfs_rq has
+ * run out of bandwidth. For others, enqueue / last
+ * update_curr() for the cfs_rq would have ensured the
+ * throttle indicators are set if bandwidth was not
+ * available.
+ */
+ throttled |= cfs_rq_throttled(cfs_rq);

se = pick_next_entity(rq, cfs_rq, true);
if (!se)
@@ -14868,8 +14864,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
+ struct cfs_rq *cfs_rq;

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -15074,15 +15070,19 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
+ bool throttled = false;

for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);

set_next_entity(cfs_rq, se, first);
/* ensure bandwidth has been allocated on our new cfs_rq */
- account_cfs_rq_runtime(cfs_rq, 0);
+ throttled |= account_cfs_rq_runtime(cfs_rq, 0);
}

+ if (throttled)
+ task_throttle_setup_work(p);
+
__set_next_task_fair(rq, p, first);
}

--
2.43.0