Re: [PATCH v2 10/10] sched/eevdf: Move to a single runqueue

From: Peter Zijlstra

Date: Tue May 12 2026 - 07:10:33 EST

On Mon, May 11, 2026 at 09:51:57PM +0530, K Prateek Nayak wrote:
> Hello Peter,
>
> On 5/11/2026 5:01 PM, Peter Zijlstra wrote:
> > @@ -9291,34 +9206,25 @@ static void wakeup_preempt_fair(struct r
> > + se = pick_next_entity(rq, true);
> > + if (!se)
> > + goto again;
> >
> > p = task_of(se);
> > - if (unlikely(throttled))
> > + if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se))))
> > task_throttle_setup_work(p);
>
> I think this bit should also be replicated in set_next_task() after
> account_cfs_rq_runtime() since any part of the hierarchy may get
> throttled as a result of failing to grab runtime.
>
> Also check_cfs_rq_runtime() only sees if the cfs_rq is throttled
> but the task can fail to run if it is on a throttled_hierarchy() too
> so that should be the correct check here.
>
> Something like below (only build tested on queue/sched/flat):
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e54da4c6c945..950c072244b2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9224,7 +9224,19 @@ struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
> goto again;
>
> p = task_of(se);
> - if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se))))
> + /*
> + * For cases where prev is picked again after
> + * being throttled, entity_tick() would have
> + * already marked its hierarchy as throttled.
> + *
> + * Add throttle work here since
> + * put_prev_set_next_task() is skipped on
> + * same task's selection.
> + *
> + * For other case, set_next_task_fair() will
> + * handle adding the throttle work.
> + */
> + if (throttled_hierarchy(cfs_rq_of(se)))
> task_throttle_setup_work(p);

Ah, right: because we've not accumulated runtime, it doesn't make sense
to use check_cfs_rq_runtime() at pick time; all we need to do is check
if the task should be throttled.

However, since set_next_task_fair() will walk the entire hierarchy
anyway, we can remove the throttle check here entirely and rely fully
on that.

> return p;
>
> @@ -13819,6 +13831,12 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
> if (on_rq)
> weight = __calc_prop_weight(cfs_rq, se, weight);
> }
> + /*
> + * Add throttle work if the bandwidth allocation above failed
> + * to grab any runtime and throttled the task's hierarchy.
> + */
> + if (throttled_hierarchy(task_cfs_rq(p)))
> + task_throttle_setup_work(p);

We already call into account_cfs_rq_runtime(), which basically does all
we need.

I think the distinction between account_cfs_rq_runtime() and
check_cfs_rq_runtime() no longer makes sense. We can throttle a cfs_rq
at any point now, since we no longer remove the cfs_rq; instead, we
make the tasks suspend themselves until the cfs_rq naturally dequeues
once it is empty.

Something like so perhaps?

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -488,7 +488,7 @@ static int se_is_idle(struct sched_entit
#endif /* !CONFIG_FAIR_GROUP_SCHED */

static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -1420,12 +1420,22 @@ static void update_curr(struct cfs_rq *c
}
}

+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+static inline void task_throttle_setup_work(struct task_struct *p);
+
static void update_curr_fair(struct rq *rq)
{
struct sched_entity *se = &rq->donor->se;
+ bool throttled = false;

- for_each_sched_entity(se)
- update_curr(cfs_rq_of(se));
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+ throttled |= cfs_rq_throttled(cfs_rq);
+ }
+
+ if (throttled)
+ task_throttle_setup_work(rq->donor);
}

static inline void
@@ -5627,7 +5637,6 @@ place_entity(struct cfs_rq *cfs_rq, stru
}

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -5830,8 +5839,6 @@ pick_next_entity(struct rq *rq, bool pro
return se;
}

-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
@@ -5841,9 +5848,6 @@ static void put_prev_entity(struct cfs_r
if (prev->on_rq)
update_curr(cfs_rq);

- /* throttle cfs_rqs exceeding runtime */
- check_cfs_rq_runtime(cfs_rq);
-
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
@@ -5976,44 +5980,29 @@ static int __assign_cfs_rq_runtime(struc
return cfs_rq->runtime_remaining > 0;
}

-/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- int ret;
-
- raw_spin_lock(&cfs_b->lock);
- ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
- raw_spin_unlock(&cfs_b->lock);
-
- return ret;
-}
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq);

-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+static bool __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;

if (likely(cfs_rq->runtime_remaining > 0))
- return;
+ return false;

if (cfs_rq->throttled)
- return;
- /*
- * if we're unable to extend our runtime we resched so that the active
- * hierarchy can be throttled
- */
- if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->h_curr))
- resched_curr(rq_of(cfs_rq));
+ return true;
+
+ return throttle_cfs_rq(cfs_rq);
}

static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
- return;
+ return false;

- __account_cfs_rq_runtime(cfs_rq, delta_exec);
+ return __account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -6284,9 +6273,9 @@ static bool throttle_cfs_rq(struct cfs_r
* We have raced with bandwidth becoming available, and if we
* actually throttled the timer might not unthrottle us for an
* entire period. We additionally needed to make sure that any
- * subsequent check_cfs_rq_runtime calls agree not to throttle
- * us, as we may commit to do cfs put_prev+pick_next, so we ask
- * for 1ns of runtime rather than just check cfs_b.
+ * subsequent account_cfs_rq_runtime() calls agree not to
+ * throttle us, as we may commit to do cfs put_prev+pick_next,
+ * so we ask for 1ns of runtime rather than just check cfs_b.
*/
dequeue = 0;
} else {
@@ -6711,8 +6700,6 @@ static void check_enqueue_throttle(struc

/* update runtime allocation */
account_cfs_rq_runtime(cfs_rq, 0);
- if (cfs_rq->runtime_remaining <= 0)
- throttle_cfs_rq(cfs_rq);
}

static void sync_throttle(struct task_group *tg, int cpu)
@@ -6742,25 +6729,6 @@ static void sync_throttle(struct task_gr
cfs_rq->pelt_clock_throttled = 1;
}

-/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- if (!cfs_bandwidth_used())
- return false;
-
- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return false;
-
- /*
- * it's possible for a throttled entity to be forced into a running
- * state (e.g. set_curr_task), in this case we're finished.
- */
- if (cfs_rq_throttled(cfs_rq))
- return true;
-
- return throttle_cfs_rq(cfs_rq);
-}
-
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -7015,8 +6983,7 @@ static void sched_fair_update_stop_tick(

#else /* !CONFIG_CFS_BANDWIDTH: */

-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
+static bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -9208,7 +9175,6 @@ struct task_struct *pick_task_fair(struc
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
- struct task_struct *p;
int new_tasks;

again:
@@ -9223,10 +9189,7 @@ struct task_struct *pick_task_fair(struc
if (!se)
goto again;

- p = task_of(se);
- if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se))))
- task_throttle_setup_work(p);
- return p;
+ return task_of(se);

idle:
new_tasks = sched_balance_newidle(rq, rf);
@@ -13618,6 +13581,7 @@ static void task_tick_fair(struct rq *rq
{
struct sched_entity *se = &curr->se;
unsigned long weight = NICE_0_LOAD;
+ bool throttled = false;
struct cfs_rq *cfs_rq;

for_each_sched_entity(se) {
@@ -13625,8 +13589,13 @@ static void task_tick_fair(struct rq *rq
entity_tick(cfs_rq, se, queued);

weight = __calc_prop_weight(cfs_rq, se, weight);
+
+ throttled |= cfs_rq_throttled(cfs_rq);
}

+ if (throttled)
+ task_throttle_setup_work(curr);
+
se = &curr->se;
reweight_eevdf(cfs_rq, se, weight, se->on_rq);

@@ -13800,6 +13769,7 @@ static void set_next_task_fair(struct rq
struct cfs_rq *cfs_rq = &rq->cfs;
unsigned long weight = NICE_0_LOAD;
bool on_rq = se->on_rq;
+ bool throttled = false;

clear_buddies(cfs_rq, se);

@@ -13814,12 +13784,15 @@ static void set_next_task_fair(struct rq
set_next_entity(cfs_rq, se);

/* ensure bandwidth has been allocated on our new cfs_rq */
- account_cfs_rq_runtime(cfs_rq, 0);
+ throttled |= account_cfs_rq_runtime(cfs_rq, 0);

if (on_rq)
weight = __calc_prop_weight(cfs_rq, se, weight);
}

+ if (throttled)
+ task_throttle_setup_work(p);
+
se = &p->se;
cfs_rq->curr = se;