Re: [PATCH v2 10/10] sched/eevdf: Move to a single runqueue

From: Vincent Guittot

Date: Wed May 20 2026 - 12:44:48 EST


Le mardi 19 mai 2026 à 12:38:10 (+0200), Vincent Guittot a écrit :
> On Mon, 11 May 2026 at 14:07, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> >
> > Change fair/cgroup to a single runqueue.
> >
> > Infamously fair/cgroup isn't working for a number of people; typically
> > the complaint is latencies and/or overhead. The latency issue is due
> > to the intermediate entries that represent a combination of tasks and
> > thereby obfuscate the runnability of tasks.
> >
> > The approach here is to leave the cgroup hierarchy as is; including
> > the intermediate enqueue/dequeue but move the actual EEVDF runqueue
> > outside. This means things like the shares_weight approximation are
> > fully preserved.
> >
> > That is, given a hierarchy like:
> >
> > R
> > |
> > se--G1
> > / \
> > G2--se se--G3
> > / \ |
> > T1--se se--T2 se--T3
> >
> > This is fully maintained for load tracking, however the EEVDF parts of
> > cfs_rq/se go unused for the intermediates and are instead connected
> > like:
> >
> > _R_
> > / | \
> > T1 T2 T3
> >
> > Since the effective weight of the entities is determined by the
> > hierarchy, this gets recomputed on enqueue, set_next_task and tick.
> >
> > Notably, the effective weight (se->h_load) is computed from the
> > hierarchical fraction: se->load / cfs_rq->load.
> >
> > Since EEVDF is now exclusively operating on rq->cfs, it needs to
> > consider cfs_rq->h_nr_queued rather than cfs_rq->nr_queued. Similarly,
> > only tasks can get delayed, simplifying some of the cgroup cleanup.
> >
> > One place where additional information was required was
> > set_next_task() / put_prev_task(), where we need to track 'current'
> > both in the hierarchical sense (cfs_rq->h_curr) and in the flat sense
> > (cfs_rq->curr).
> >
> > As a result of only having a single level to pick from, much of the
> > complications in pick_next_task() and preemption go away.
> >
> > Since many of the hierarchical operations are still there, this won't
> > immediately fix the performance issues, but hopefully it will fix some
> > of the latency issues.
> >
> > TODO: split struct cfs_rq / struct sched_entity
> > TODO: try and get rid of h_curr

I finally found the root cause of the regression: the update of entity lag happened
after the task had been dequeued, which broke update_entity_lag():

update_entity_lag() must be called after updating curr and cfs_rq, and before
clearing on_rq.

With the fix below I'm back to original hackbench figures and maybe even a bit better.
I haven't checked scheduling latency yet.

---
kernel/sched/fair.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77d0e1937f2c..32fe57004f27 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5753,6 +5753,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

update_stats_dequeue_fair(cfs_rq, se, flags);

+ if (entity_is_task(se))
+ update_entity_lag(&rq_of(cfs_rq)->cfs, se);
+
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);

@@ -7423,6 +7426,7 @@ static bool __dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
update_load_avg(cfs_rq_of(se), se, 0);
+ update_entity_lag(cfs_rq, se);
set_delayed(se);
return false;
}
@@ -7430,7 +7434,6 @@ static bool __dequeue_task(struct rq *rq, struct task_struct *p, int flags)

dequeue_hierarchy(p, flags);

- update_entity_lag(cfs_rq, se);
if (sched_feat(PLACE_REL_DEADLINE) && !task_sleep) {
se->deadline -= se->vruntime;
se->rel_deadline = 1;
--
2.43.0




> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> > ---
> > include/linux/sched.h | 1
> > kernel/sched/core.c | 5
> > kernel/sched/debug.c | 9
> > kernel/sched/fair.c | 789 +++++++++++++++++++++-----------------------------
> > kernel/sched/pelt.c | 6
> > kernel/sched/sched.h | 26 -
> > 6 files changed, 366 insertions(+), 470 deletions(-)
> >
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -575,6 +575,7 @@ struct sched_statistics {
> > struct sched_entity {
> > /* For load-balancing: */
> > struct load_weight load;
> > + struct load_weight h_load;
> > struct rb_node run_node;
> > u64 deadline;
> > u64 min_vruntime;
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -5539,11 +5539,8 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
> > */
> > static inline void prefetch_curr_exec_start(struct task_struct *p)
> > {
> > -#ifdef CONFIG_FAIR_GROUP_SCHED
> > - struct sched_entity *curr = p->se.cfs_rq->curr;
> > -#else
> > struct sched_entity *curr = task_rq(p)->cfs.curr;
> > -#endif
> > +
> > prefetch(curr);
> > prefetch(&curr->exec_start);
> > }
> > --- a/kernel/sched/debug.c
> > +++ b/kernel/sched/debug.c
> > @@ -911,10 +911,11 @@ print_task(struct seq_file *m, struct rq
> > else
> > SEQ_printf(m, " %c", task_state_to_char(p));
> >
> > - SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
> > + SEQ_printf(m, " %15s %5d %10ld %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
> > p->comm, task_pid_nr(p),
> > + p->se.h_load.weight,
> > SPLIT_NS(p->se.vruntime),
> > - entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
> > + entity_eligible(&rq->cfs, &p->se) ? 'E' : 'N',
> > SPLIT_NS(p->se.deadline),
> > p->se.custom_slice ? 'S' : ' ',
> > SPLIT_NS(p->se.slice),
> > @@ -943,7 +944,7 @@ static void print_rq(struct seq_file *m,
> >
> > SEQ_printf(m, "\n");
> > SEQ_printf(m, "runnable tasks:\n");
> > - SEQ_printf(m, " S task PID vruntime eligible "
> > + SEQ_printf(m, " S task PID weight vruntime eligible "
> > "deadline slice sum-exec switches "
> > "prio wait-time sum-sleep sum-block"
> > #ifdef CONFIG_NUMA_BALANCING
> > @@ -1051,6 +1052,8 @@ void print_cfs_rq(struct seq_file *m, in
> > cfs_rq->tg_load_avg_contrib);
> > SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
> > atomic_long_read(&cfs_rq->tg->load_avg));
> > + SEQ_printf(m, " .%-30s: %lu\n", "h_load",
> > + cfs_rq->h_load);
> > #endif /* CONFIG_FAIR_GROUP_SCHED */
> > #ifdef CONFIG_CFS_BANDWIDTH
> > SEQ_printf(m, " .%-30s: %d\n", "throttled",
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -296,8 +296,8 @@ static u64 __calc_delta(u64 delta_exec,
> > */
> > static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
> > {
> > - if (unlikely(se->load.weight != NICE_0_LOAD))
> > - delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
> > + if (se->h_load.weight != NICE_0_LOAD)
> > + delta = __calc_delta(delta, NICE_0_LOAD, &se->h_load);
> >
> > return delta;
> > }
> > @@ -427,38 +427,6 @@ static inline struct sched_entity *paren
> > return se->parent;
> > }
> >
> > -static void
> > -find_matching_se(struct sched_entity **se, struct sched_entity **pse)
> > -{
> > - int se_depth, pse_depth;
> > -
> > - /*
> > - * preemption test can be made between sibling entities who are in the
> > - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
> > - * both tasks until we find their ancestors who are siblings of common
> > - * parent.
> > - */
> > -
> > - /* First walk up until both entities are at same depth */
> > - se_depth = (*se)->depth;
> > - pse_depth = (*pse)->depth;
> > -
> > - while (se_depth > pse_depth) {
> > - se_depth--;
> > - *se = parent_entity(*se);
> > - }
> > -
> > - while (pse_depth > se_depth) {
> > - pse_depth--;
> > - *pse = parent_entity(*pse);
> > - }
> > -
> > - while (!is_same_group(*se, *pse)) {
> > - *se = parent_entity(*se);
> > - *pse = parent_entity(*pse);
> > - }
> > -}
> > -
> > static int tg_is_idle(struct task_group *tg)
> > {
> > return tg->idle > 0;
> > @@ -502,11 +470,6 @@ static inline struct sched_entity *paren
> > return NULL;
> > }
> >
> > -static inline void
> > -find_matching_se(struct sched_entity **se, struct sched_entity **pse)
> > -{
> > -}
> > -
> > static inline int tg_is_idle(struct task_group *tg)
> > {
> > return 0;
> > @@ -685,7 +648,7 @@ static inline unsigned long avg_vruntime
> > static inline void
> > __sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > - unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> > + unsigned long weight = avg_vruntime_weight(cfs_rq, se->h_load.weight);
> > s64 w_vruntime, key = entity_key(cfs_rq, se);
> >
> > w_vruntime = key * weight;
> > @@ -702,7 +665,7 @@ sum_w_vruntime_add_paranoid(struct cfs_r
> > s64 key, tmp;
> >
> > again:
> > - weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> > + weight = avg_vruntime_weight(cfs_rq, se->h_load.weight);
> > key = entity_key(cfs_rq, se);
> >
> > if (check_mul_overflow(key, weight, &key))
> > @@ -748,7 +711,7 @@ sum_w_vruntime_add(struct cfs_rq *cfs_rq
> > static void
> > sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > - unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> > + unsigned long weight = avg_vruntime_weight(cfs_rq, se->h_load.weight);
> > s64 key = entity_key(cfs_rq, se);
> >
> > cfs_rq->sum_w_vruntime -= key * weight;
> > @@ -790,7 +753,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
> > s64 runtime = cfs_rq->sum_w_vruntime;
> >
> > if (curr) {
> > - unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
> > + unsigned long w = avg_vruntime_weight(cfs_rq, curr->h_load.weight);
> >
> > runtime += entity_key(cfs_rq, curr) * w;
> > weight += w;
> > @@ -861,8 +824,6 @@ bool update_entity_lag(struct cfs_rq *cf
> > u64 avruntime = avg_vruntime(cfs_rq);
> > s64 vlag = entity_lag(cfs_rq, se, avruntime);
> >
> > - WARN_ON_ONCE(!se->on_rq);
> > -
> > if (se->sched_delayed) {
> > /* previous vlag < 0 otherwise se would not be delayed */
> > vlag = max(vlag, se->vlag);
> > @@ -898,7 +859,7 @@ static int vruntime_eligible(struct cfs_
> > long load = cfs_rq->sum_weight;
> >
> > if (curr && curr->on_rq) {
> > - unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
> > + unsigned long weight = avg_vruntime_weight(cfs_rq, curr->h_load.weight);
> >
> > avg += entity_key(cfs_rq, curr) * weight;
> > load += weight;
> > @@ -1039,6 +1000,9 @@ RB_DECLARE_CALLBACKS(static, min_vruntim
> > */
> > static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > + WARN_ON_ONCE(&rq_of(cfs_rq)->cfs != cfs_rq);
> > + WARN_ON_ONCE(!entity_is_task(se));
> > +
> > sum_w_vruntime_add(cfs_rq, se);
> > se->min_vruntime = se->vruntime;
> > se->min_slice = se->slice;
> > @@ -1048,6 +1012,9 @@ static void __enqueue_entity(struct cfs_
> >
> > static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > + WARN_ON_ONCE(&rq_of(cfs_rq)->cfs != cfs_rq);
> > + WARN_ON_ONCE(!entity_is_task(se));
> > +
> > rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
> > &min_vruntime_cb);
> > sum_w_vruntime_sub(cfs_rq, se);
> > @@ -1144,7 +1111,7 @@ static struct sched_entity *pick_eevdf(s
> > * We can safely skip eligibility check if there is only one entity
> > * in this cfs_rq, saving some cycles.
> > */
> > - if (cfs_rq->nr_queued == 1)
> > + if (cfs_rq->h_nr_queued == 1)
> > return curr && curr->on_rq ? curr : se;
> >
> > /*
> > @@ -1391,8 +1358,6 @@ static s64 update_se(struct rq *rq, stru
> > return delta_exec;
> > }
> >
> > -static void set_next_buddy(struct sched_entity *se);
> > -
> > /*
> > * Used by other classes to account runtime.
> > */
> > @@ -1412,7 +1377,7 @@ static void update_curr(struct cfs_rq *c
> > * not necessarily be the actual task running
> > * (rq->curr.se). This is easy to confuse!
> > */
> > - struct sched_entity *curr = cfs_rq->curr;
> > + struct sched_entity *curr = cfs_rq->h_curr;
> > struct rq *rq = rq_of(cfs_rq);
> > s64 delta_exec;
> > bool resched;
> > @@ -1424,26 +1389,29 @@ static void update_curr(struct cfs_rq *c
> > if (unlikely(delta_exec <= 0))
> > return;
> >
> > + account_cfs_rq_runtime(cfs_rq, delta_exec);
> > +
> > + if (!entity_is_task(curr))
> > + return;
> > +
> > + cfs_rq = &rq->cfs;
> > +
> > curr->vruntime += calc_delta_fair(delta_exec, curr);
> > resched = update_deadline(cfs_rq, curr);
> >
> > - if (entity_is_task(curr)) {
> > - /*
> > - * If the fair_server is active, we need to account for the
> > - * fair_server time whether or not the task is running on
> > - * behalf of fair_server or not:
> > - * - If the task is running on behalf of fair_server, we need
> > - * to limit its time based on the assigned runtime.
> > - * - Fair task that runs outside of fair_server should account
> > - * against fair_server such that it can account for this time
> > - * and possibly avoid running this period.
> > - */
> > - dl_server_update(&rq->fair_server, delta_exec);
> > - }
> > -
> > - account_cfs_rq_runtime(cfs_rq, delta_exec);
> > + /*
> > + * If the fair_server is active, we need to account for the
> > + * fair_server time whether or not the task is running on
> > + * behalf of fair_server or not:
> > + * - If the task is running on behalf of fair_server, we need
> > + * to limit its time based on the assigned runtime.
> > + * - Fair task that runs outside of fair_server should account
> > + * against fair_server such that it can account for this time
> > + * and possibly avoid running this period.
> > + */
> > + dl_server_update(&rq->fair_server, delta_exec);
> >
> > - if (cfs_rq->nr_queued == 1)
> > + if (cfs_rq->h_nr_queued == 1)
> > return;
> >
> > if (resched || !protect_slice(curr)) {
> > @@ -1454,7 +1422,10 @@ static void update_curr(struct cfs_rq *c
> >
> > static void update_curr_fair(struct rq *rq)
> > {
> > - update_curr(cfs_rq_of(&rq->donor->se));
> > + struct sched_entity *se = &rq->donor->se;
> > +
> > + for_each_sched_entity(se)
> > + update_curr(cfs_rq_of(se));
> > }
> >
> > static inline void
> > @@ -1530,7 +1501,7 @@ update_stats_enqueue_fair(struct cfs_rq
> > * Are we enqueueing a waiting task? (for current tasks
> > * a dequeue/enqueue event is a NOP)
> > */
> > - if (se != cfs_rq->curr)
> > + if (se != cfs_rq->h_curr)
> > update_stats_wait_start_fair(cfs_rq, se);
> >
> > if (flags & ENQUEUE_WAKEUP)
> > @@ -1548,7 +1519,7 @@ update_stats_dequeue_fair(struct cfs_rq
> > * Mark the end of the wait period if dequeueing a
> > * waiting task:
> > */
> > - if (se != cfs_rq->curr)
> > + if (se != cfs_rq->h_curr)
> > update_stats_wait_end_fair(cfs_rq, se);
> >
> > if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
> > @@ -3875,6 +3846,7 @@ static inline void update_scan_period(st
> > static void
> > account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > + WARN_ON_ONCE(cfs_rq != cfs_rq_of(se));
> > update_load_add(&cfs_rq->load, se->load.weight);
> > if (entity_is_task(se)) {
> > struct rq *rq = rq_of(cfs_rq);
> > @@ -3888,6 +3860,7 @@ account_entity_enqueue(struct cfs_rq *cf
> > static void
> > account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > + WARN_ON_ONCE(cfs_rq != cfs_rq_of(se));
> > update_load_sub(&cfs_rq->load, se->load.weight);
> > if (entity_is_task(se)) {
> > account_numa_dequeue(rq_of(cfs_rq), task_of(se));
> > @@ -3965,7 +3938,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq,
> > static void
> > rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot)
> > {
> > - unsigned long old_weight = se->load.weight;
> > + long old_weight = se->h_load.weight;
> >
> > /*
> > * VRUNTIME
> > @@ -4065,16 +4038,17 @@ rescale_entity(struct sched_entity *se,
> > se->vprot = div64_long(se->vprot * old_weight, weight);
> > }
> >
> > -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
> > - unsigned long weight)
> > +static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
> > + unsigned long weight, bool on_rq)
> > {
> > bool curr = cfs_rq->curr == se;
> > bool rel_vprot = false;
> > u64 avruntime = 0;
> >
> > - if (se->on_rq) {
> > - /* commit outstanding execution time */
> > - update_curr(cfs_rq);
> > + if (se->h_load.weight == weight)
> > + return;
> > +
> > + if (on_rq) {
> > avruntime = avg_vruntime(cfs_rq);
> > se->vlag = entity_lag(cfs_rq, se, avruntime);
> > se->deadline -= avruntime;
> > @@ -4084,46 +4058,90 @@ static void reweight_entity(struct cfs_r
> > rel_vprot = true;
> > }
> >
> > - cfs_rq->nr_queued--;
> > + cfs_rq->h_nr_queued--;
> > if (!curr)
> > __dequeue_entity(cfs_rq, se);
> > - update_load_sub(&cfs_rq->load, se->load.weight);
> > }
> > - dequeue_load_avg(cfs_rq, se);
> >
> > rescale_entity(se, weight, rel_vprot);
> >
> > - update_load_set(&se->load, weight);
> > + update_load_set(&se->h_load, weight);
> >
> > - do {
> > - u32 divider = get_pelt_divider(&se->avg);
> > - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
> > - } while (0);
> > -
> > - enqueue_load_avg(cfs_rq, se);
> > - if (se->on_rq) {
> > + if (on_rq) {
> > if (rel_vprot)
> > se->vprot += avruntime;
> > se->deadline += avruntime;
> > se->rel_deadline = 0;
> > se->vruntime = avruntime - se->vlag;
> >
> > - update_load_add(&cfs_rq->load, se->load.weight);
> > if (!curr)
> > __enqueue_entity(cfs_rq, se);
> > - cfs_rq->nr_queued++;
> > + cfs_rq->h_nr_queued++;
> > }
> > }
> >
> > +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
> > + unsigned long weight)
> > +{
> > + if (se->load.weight == weight)
> > + return;
> > +
> > + if (se->on_rq) {
> > + WARN_ON_ONCE(cfs_rq != cfs_rq_of(se));
> > + update_load_sub(&cfs_rq->load, se->load.weight);
> > + }
> > + dequeue_load_avg(cfs_rq, se);
> > +
> > + update_load_set(&se->load, weight);
> > +
> > + do {
> > + u32 divider = get_pelt_divider(&se->avg);
> > + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
> > + } while (0);
> > +
> > + enqueue_load_avg(cfs_rq, se);
> > +
> > + if (se->on_rq)
> > + update_load_add(&cfs_rq->load, se->load.weight);
> > +}
> > +
> > +/*
> > + * weight = NICE_0_LOAD;
> > + * for_each_entity_se(se)
> > + * weight = __calc_prop_weight(cfs_rq_of(se), se, weight);
> > + */
> > +static __always_inline
> > +unsigned long __calc_prop_weight(struct cfs_rq *cfs_rq, struct sched_entity *se,
> > + unsigned long weight)
> > +{
> > + weight *= se->load.weight;
> > + if (parent_entity(se))
> > + weight /= cfs_rq->load.weight;
> > + else
> > + weight /= NICE_0_LOAD;
> > +
> > + return max(weight, MIN_SHARES);
> > +}
> > +
> > static void reweight_task_fair(struct rq *rq, struct task_struct *p,
> > const struct load_weight *lw)
> > {
> > struct sched_entity *se = &p->se;
> > - struct cfs_rq *cfs_rq = cfs_rq_of(se);
> > - struct load_weight *load = &se->load;
> > + unsigned long weight = NICE_0_LOAD;
> > +
> > + if (se->on_rq)
> > + update_curr_fair(rq);
> > +
> > + reweight_entity(cfs_rq_of(se), se, lw->weight);
> > + se->load.inv_weight = lw->inv_weight;
> > +
> > + if (!se->on_rq)
> > + return;
> > +
> > + for_each_sched_entity(se)
> > + weight = __calc_prop_weight(cfs_rq_of(se), se, weight);
> >
> > - reweight_entity(cfs_rq, se, lw->weight);
> > - load->inv_weight = lw->inv_weight;
> > + reweight_eevdf(&rq->cfs, &p->se, weight, p->se.on_rq);
> > }
> >
> > static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
> > @@ -4331,7 +4349,6 @@ static long calc_group_shares(struct cfs
> > static void update_cfs_group(struct sched_entity *se)
> > {
> > struct cfs_rq *gcfs_rq = group_cfs_rq(se);
> > - long shares;
> >
> > /*
> > * When a group becomes empty, preserve its weight. This matters for
> > @@ -4340,9 +4357,7 @@ static void update_cfs_group(struct sche
> > if (!gcfs_rq || !gcfs_rq->load.weight)
> > return;
> >
> > - shares = calc_group_shares(gcfs_rq);
> > - if (unlikely(se->load.weight != shares))
> > - reweight_entity(cfs_rq_of(se), se, shares);
> > + reweight_entity(cfs_rq_of(se), se, calc_group_shares(gcfs_rq));
> > }
> >
> > #else /* !CONFIG_FAIR_GROUP_SCHED: */
> > @@ -4460,7 +4475,7 @@ static inline bool cfs_rq_is_decayed(str
> > * differential update where we store the last value we propagated. This in
> > * turn allows skipping updates if the differential is 'small'.
> > *
> > - * Updating tg's load_avg is necessary before update_cfs_share().
> > + * Updating tg's load_avg is necessary before update_cfs_group().
> > */
> > static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
> > {
> > @@ -4926,7 +4941,7 @@ static void migrate_se_pelt_lag(struct s
> > * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
> > * avg. The immediate corollary is that all (fair) tasks must be attached.
> > *
> > - * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
> > + * cfs_rq->avg is used for task_h_load() and update_cfs_group() for example.
> > *
> > * Return: true if the load decayed or we removed load.
> > *
> > @@ -5475,6 +5490,7 @@ static void
> > place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> > {
> > u64 vslice, vruntime = avg_vruntime(cfs_rq);
> > + unsigned int nr_queued = cfs_rq->h_nr_queued;
> > bool update_zero = false;
> > s64 lag = 0;
> >
> > @@ -5482,6 +5498,9 @@ place_entity(struct cfs_rq *cfs_rq, stru
> > se->slice = sysctl_sched_base_slice;
> > vslice = calc_delta_fair(se->slice, se);
> >
> > + if (flags & ENQUEUE_QUEUED)
> > + nr_queued -= 1;
> > +
> > /*
> > * Due to how V is constructed as the weighted average of entities,
> > * adding tasks with positive lag, or removing tasks with negative lag
> > @@ -5490,7 +5509,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
> > *
> > * EEVDF: placement strategy #1 / #2
> > */
> > - if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
> > + if (sched_feat(PLACE_LAG) && nr_queued && se->vlag) {
> > struct sched_entity *curr = cfs_rq->curr;
> > long load, weight;
> >
> > @@ -5550,9 +5569,9 @@ place_entity(struct cfs_rq *cfs_rq, stru
> > */
> > load = cfs_rq->sum_weight;
> > if (curr && curr->on_rq)
> > - load += avg_vruntime_weight(cfs_rq, curr->load.weight);
> > + load += avg_vruntime_weight(cfs_rq, curr->h_load.weight);
> >
> > - weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> > + weight = avg_vruntime_weight(cfs_rq, se->h_load.weight);
> > lag *= load + weight;
> > if (WARN_ON_ONCE(!load))
> > load = 1;
> > @@ -5611,22 +5630,8 @@ static void check_enqueue_throttle(struc
> > static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
> >
> > static void
> > -requeue_delayed_entity(struct sched_entity *se);
> > -
> > -static void
> > enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> > {
> > - bool curr = cfs_rq->curr == se;
> > -
> > - /*
> > - * If we're the current task, we must renormalise before calling
> > - * update_curr().
> > - */
> > - if (curr)
> > - place_entity(cfs_rq, se, flags);
> > -
> > - update_curr(cfs_rq);
> > -
> > /*
> > * When enqueuing a sched_entity, we must:
> > * - Update loads to have both entity and cfs_rq synced with now.
> > @@ -5645,13 +5650,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
> > */
> > update_cfs_group(se);
> >
> > - /*
> > - * XXX now that the entity has been re-weighted, and it's lag adjusted,
> > - * we can place the entity.
> > - */
> > - if (!curr)
> > - place_entity(cfs_rq, se, flags);
> > -
> > account_entity_enqueue(cfs_rq, se);
> >
> > /* Entity has migrated, no longer consider this task hot */
> > @@ -5660,8 +5658,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
> >
> > check_schedstat_required();
> > update_stats_enqueue_fair(cfs_rq, se, flags);
> > - if (!curr)
> > - __enqueue_entity(cfs_rq, se);
> > se->on_rq = 1;
> >
> > if (cfs_rq->nr_queued == 1) {
> > @@ -5679,21 +5675,19 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
> > }
> > }
> >
> > -static void __clear_buddies_next(struct sched_entity *se)
> > +static void set_next_buddy(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > - for_each_sched_entity(se) {
> > - struct cfs_rq *cfs_rq = cfs_rq_of(se);
> > - if (cfs_rq->next != se)
> > - break;
> > -
> > - cfs_rq->next = NULL;
> > - }
> > + if (WARN_ON_ONCE(!se->on_rq || se->sched_delayed))
> > + return;
> > + if (se_is_idle(se))
> > + return;
> > + cfs_rq->next = se;
> > }
> >
> > static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > if (cfs_rq->next == se)
> > - __clear_buddies_next(se);
> > + cfs_rq->next = NULL;
> > }
> >
> > static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
> > @@ -5704,7 +5698,7 @@ static void set_delayed(struct sched_ent
> >
> > /*
> > * Delayed se of cfs_rq have no tasks queued on them.
> > - * Do not adjust h_nr_runnable since dequeue_entities()
> > + * Do not adjust h_nr_runnable since __dequeue_task()
> > * will account it for blocked tasks.
> > */
> > if (!entity_is_task(se))
> > @@ -5737,37 +5731,11 @@ static void clear_delayed(struct sched_e
> > }
> > }
> >
> > -static bool
> > +static void
> > dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> > {
> > - bool sleep = flags & DEQUEUE_SLEEP;
> > int action = UPDATE_TG;
> >
> > - update_curr(cfs_rq);
> > - clear_buddies(cfs_rq, se);
> > -
> > - if (flags & DEQUEUE_DELAYED) {
> > - WARN_ON_ONCE(!se->sched_delayed);
> > - } else {
> > - bool delay = sleep;
> > - /*
> > - * DELAY_DEQUEUE relies on spurious wakeups, special task
> > - * states must not suffer spurious wakeups, excempt them.
> > - */
> > - if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
> > - delay = false;
> > -
> > - WARN_ON_ONCE(delay && se->sched_delayed);
> > -
> > - if (sched_feat(DELAY_DEQUEUE) && delay &&
> > - !entity_eligible(cfs_rq, se)) {
> > - update_load_avg(cfs_rq, se, 0);
> > - update_entity_lag(cfs_rq, se);
> > - set_delayed(se);
> > - return false;
> > - }
> > - }
> > -
> > if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
> > action |= DO_DETACH;
> >
> > @@ -5785,14 +5753,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> >
> > update_stats_dequeue_fair(cfs_rq, se, flags);
> >
> > - update_entity_lag(cfs_rq, se);
> > - if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
> > - se->deadline -= se->vruntime;
> > - se->rel_deadline = 1;
> > - }
> > -
> > - if (se != cfs_rq->curr)
> > - __dequeue_entity(cfs_rq, se);
> > se->on_rq = 0;
> > account_entity_dequeue(cfs_rq, se);
> >
> > @@ -5801,9 +5761,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> >
> > update_cfs_group(se);
> >
> > - if (flags & DEQUEUE_DELAYED)
> > - clear_delayed(se);
> > -
> > if (cfs_rq->nr_queued == 0) {
> > update_idle_cfs_rq_clock_pelt(cfs_rq);
> > #ifdef CONFIG_CFS_BANDWIDTH
> > @@ -5816,15 +5773,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> > }
> > #endif
> > }
> > -
> > - return true;
> > }
> >
> > static void
> > -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first)
> > +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > - clear_buddies(cfs_rq, se);
> > -
> > /* 'current' is not kept within the tree. */
> > if (se->on_rq) {
> > /*
> > @@ -5833,16 +5786,12 @@ set_next_entity(struct cfs_rq *cfs_rq, s
> > * runqueue.
> > */
> > update_stats_wait_end_fair(cfs_rq, se);
> > - __dequeue_entity(cfs_rq, se);
> > update_load_avg(cfs_rq, se, UPDATE_TG);
> > -
> > - if (first)
> > - set_protect_slice(cfs_rq, se);
> > }
> >
> > update_stats_curr_start(cfs_rq, se);
> > - WARN_ON_ONCE(cfs_rq->curr);
> > - cfs_rq->curr = se;
> > + WARN_ON_ONCE(cfs_rq->h_curr);
> > + cfs_rq->h_curr = se;
> >
> > /*
> > * Track our maximum slice length, if the CPU's load is at
> > @@ -5862,23 +5811,17 @@ set_next_entity(struct cfs_rq *cfs_rq, s
> > se->prev_sum_exec_runtime = se->sum_exec_runtime;
> > }
> >
> > -static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
> > +static bool __dequeue_task(struct rq *rq, struct task_struct *p, int flags);
> >
> > -/*
> > - * Pick the next process, keeping these things in mind, in this order:
> > - * 1) keep things fair between processes/task groups
> > - * 2) pick the "next" process, since someone really wants that to run
> > - * 3) pick the "last" process, for cache locality
> > - * 4) do not run the "skip" process, if something else is available
> > - */
> > static struct sched_entity *
> > -pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
> > +pick_next_entity(struct rq *rq, bool protect)
> > {
> > + struct cfs_rq *cfs_rq = &rq->cfs;
> > struct sched_entity *se;
> >
> > se = pick_eevdf(cfs_rq, protect);
> > if (se->sched_delayed) {
> > - dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> > + __dequeue_task(rq, task_of(se), DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> > /*
> > * Must not reference @se again, see __block_task().
> > */
> > @@ -5903,13 +5846,11 @@ static void put_prev_entity(struct cfs_r
> >
> > if (prev->on_rq) {
> > update_stats_wait_start_fair(cfs_rq, prev);
> > - /* Put 'current' back into the tree. */
> > - __enqueue_entity(cfs_rq, prev);
> > /* in !on_rq case, update occurred at dequeue */
> > update_load_avg(cfs_rq, prev, 0);
> > }
> > - WARN_ON_ONCE(cfs_rq->curr != prev);
> > - cfs_rq->curr = NULL;
> > + WARN_ON_ONCE(cfs_rq->h_curr != prev);
> > + cfs_rq->h_curr = NULL;
> > }
> >
> > static void
> > @@ -6062,7 +6003,7 @@ static void __account_cfs_rq_runtime(str
> > * if we're unable to extend our runtime we resched so that the active
> > * hierarchy can be throttled
> > */
> > - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
> > + if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->h_curr))
> > resched_curr(rq_of(cfs_rq));
> > }
> >
> > @@ -6420,7 +6361,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cf
> > assert_list_leaf_cfs_rq(rq);
> >
> > /* Determine whether we need to wake up potentially idle CPU: */
> > - if (rq->curr == rq->idle && rq->cfs.nr_queued)
> > + if (rq->curr == rq->idle && rq->cfs.h_nr_queued)
> > resched_curr(rq);
> > }
> >
> > @@ -6761,7 +6702,7 @@ static void check_enqueue_throttle(struc
> > return;
> >
> > /* an active group must be handled by the update_curr()->put() path */
> > - if (!cfs_rq->runtime_enabled || cfs_rq->curr)
> > + if (!cfs_rq->runtime_enabled || cfs_rq->h_curr)
> > return;
> >
> > /* ensure the group is not already throttled */
> > @@ -7156,7 +7097,7 @@ static void hrtick_start_fair(struct rq
> > resched_curr(rq);
> > return;
> > }
> > - delta = (se->load.weight * vdelta) / NICE_0_LOAD;
> > + delta = (se->h_load.weight * vdelta) / NICE_0_LOAD;
> >
> > /*
> > * Correct for instantaneous load of other classes.
> > @@ -7256,10 +7197,8 @@ static int choose_idle_cpu(int cpu, stru
> > }
> >
> > static void
> > -requeue_delayed_entity(struct sched_entity *se)
> > +requeue_delayed_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > - struct cfs_rq *cfs_rq = cfs_rq_of(se);
> > -
> > /*
> > * se->sched_delayed should imply: se->on_rq == 1.
> > * Because a delayed entity is one that is still on
> > @@ -7269,19 +7208,58 @@ requeue_delayed_entity(struct sched_enti
> > WARN_ON_ONCE(!se->on_rq);
> >
> > if (update_entity_lag(cfs_rq, se)) {
> > - cfs_rq->nr_queued--;
> > + cfs_rq->h_nr_queued--;
> > if (se != cfs_rq->curr)
> > __dequeue_entity(cfs_rq, se);
> > place_entity(cfs_rq, se, 0);
> > if (se != cfs_rq->curr)
> > __enqueue_entity(cfs_rq, se);
> > - cfs_rq->nr_queued++;
> > + cfs_rq->h_nr_queued++;
> > }
> >
> > update_load_avg(cfs_rq, se, 0);
> > clear_delayed(se);
> > }
> >
> > +static unsigned long enqueue_hierarchy(struct task_struct *p, int flags)
> > +{
> > + unsigned long weight = NICE_0_LOAD;
> > + int task_new = !(flags & ENQUEUE_WAKEUP);
> > + struct sched_entity *se = &p->se;
> > + int h_nr_idle = task_has_idle_policy(p);
> > + int h_nr_runnable = 1;
> > +
> > + if (task_new && se->sched_delayed)
> > + h_nr_runnable = 0;
> > +
> > + for_each_sched_entity(se) {
> > + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> > +
> > + update_curr(cfs_rq);
> > +
> > + if (!se->on_rq) {
> > + enqueue_entity(cfs_rq, se, flags);
> > + } else {
> > + update_load_avg(cfs_rq, se, UPDATE_TG);
> > + se_update_runnable(se);
> > + update_cfs_group(se);
> > + }
> > +
> > + cfs_rq->h_nr_runnable += h_nr_runnable;
> > + cfs_rq->h_nr_queued++;
> > + cfs_rq->h_nr_idle += h_nr_idle;
> > +
> > + if (cfs_rq_is_idle(cfs_rq))
> > + h_nr_idle = 1;
> > +
> > + weight = __calc_prop_weight(cfs_rq, se, weight);
> > +
> > + flags = ENQUEUE_WAKEUP;
> > + }
> > +
> > + return weight;
> > +}
> > +
> > /*
> > * The enqueue_task method is called before nr_running is
> > * increased. Here we update the fair scheduling stats and
> > @@ -7290,13 +7268,12 @@ requeue_delayed_entity(struct sched_enti
> > static void
> > enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > {
> > - struct cfs_rq *cfs_rq;
> > - struct sched_entity *se = &p->se;
> > - int h_nr_idle = task_has_idle_policy(p);
> > - int h_nr_runnable = 1;
> > - int task_new = !(flags & ENQUEUE_WAKEUP);
> > int rq_h_nr_queued = rq->cfs.h_nr_queued;
> > - u64 slice = 0;
> > + int task_new = !(flags & ENQUEUE_WAKEUP);
> > + struct sched_entity *se = &p->se;
> > + struct cfs_rq *cfs_rq = &rq->cfs;
> > + unsigned long weight;
> > + bool curr;
> >
> > if (task_is_throttled(p) && enqueue_throttled_task(p))
> > return;
> > @@ -7308,10 +7285,10 @@ enqueue_task_fair(struct rq *rq, struct
> > * estimated utilization, before we update schedutil.
> > */
> > if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
> > - util_est_enqueue(&rq->cfs, p);
> > + util_est_enqueue(cfs_rq, p);
> >
> > if (flags & ENQUEUE_DELAYED) {
> > - requeue_delayed_entity(se);
> > + requeue_delayed_entity(cfs_rq, se);
> > return;
> > }
> >
> > @@ -7323,57 +7300,22 @@ enqueue_task_fair(struct rq *rq, struct
> > if (p->in_iowait)
> > cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> >
> > - if (task_new && se->sched_delayed)
> > - h_nr_runnable = 0;
> > -
> > - for_each_sched_entity(se) {
> > - if (se->on_rq) {
> > - if (se->sched_delayed)
> > - requeue_delayed_entity(se);
> > - break;
> > - }
> > - cfs_rq = cfs_rq_of(se);
> > -
> > - /*
> > - * Basically set the slice of group entries to the min_slice of
> > - * their respective cfs_rq. This ensures the group can service
> > - * its entities in the desired time-frame.
> > - */
> > - if (slice) {
> > - se->slice = slice;
> > - se->custom_slice = 1;
> > - }
> > - enqueue_entity(cfs_rq, se, flags);
> > - slice = cfs_rq_min_slice(cfs_rq);
> > -
> > - cfs_rq->h_nr_runnable += h_nr_runnable;
> > - cfs_rq->h_nr_queued++;
> > - cfs_rq->h_nr_idle += h_nr_idle;
> > -
> > - if (cfs_rq_is_idle(cfs_rq))
> > - h_nr_idle = 1;
> > -
> > - flags = ENQUEUE_WAKEUP;
> > - }
> > -
> > - for_each_sched_entity(se) {
> > - cfs_rq = cfs_rq_of(se);
> > -
> > - update_load_avg(cfs_rq, se, UPDATE_TG);
> > - se_update_runnable(se);
> > - update_cfs_group(se);
> > + /*
> > + * XXX comment on the curr thing
> > + */
> > + curr = (cfs_rq->curr == se);
> > + if (curr)
> > + place_entity(cfs_rq, se, flags);
> >
> > - se->slice = slice;
> > - if (se != cfs_rq->curr)
> > - min_vruntime_cb_propagate(&se->run_node, NULL);
> > - slice = cfs_rq_min_slice(cfs_rq);
> > + if (se->on_rq && se->sched_delayed)
> > + requeue_delayed_entity(cfs_rq, se);
> >
> > - cfs_rq->h_nr_runnable += h_nr_runnable;
> > - cfs_rq->h_nr_queued++;
> > - cfs_rq->h_nr_idle += h_nr_idle;
> > + weight = enqueue_hierarchy(p, flags);
> >
> > - if (cfs_rq_is_idle(cfs_rq))
> > - h_nr_idle = 1;
> > + if (!curr) {
> > + reweight_eevdf(cfs_rq, se, weight, false);
> > + place_entity(cfs_rq, se, flags | ENQUEUE_QUEUED);
> > + __enqueue_entity(cfs_rq, se);
> > }
> >
> > if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
> > @@ -7404,105 +7346,107 @@ enqueue_task_fair(struct rq *rq, struct
> > hrtick_update(rq);
> > }
> >
> > -/*
> > - * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> > - * failing half-way through and resume the dequeue later.
> > - *
> > - * Returns:
> > - * -1 - dequeue delayed
> > - * 0 - dequeue throttled
> > - * 1 - dequeue complete
> > - */
> > -static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> > +static void dequeue_hierarchy(struct task_struct *p, int flags)
> > {
> > - bool was_sched_idle = sched_idle_rq(rq);
> > + struct sched_entity *se = &p->se;
> > bool task_sleep = flags & DEQUEUE_SLEEP;
> > bool task_delayed = flags & DEQUEUE_DELAYED;
> > bool task_throttled = flags & DEQUEUE_THROTTLE;
> > - struct task_struct *p = NULL;
> > - int h_nr_idle = 0;
> > - int h_nr_queued = 0;
> > int h_nr_runnable = 0;
> > - struct cfs_rq *cfs_rq;
> > - u64 slice = 0;
> > + int h_nr_idle = task_has_idle_policy(p);
> > + bool dequeue = true;
> >
> > - if (entity_is_task(se)) {
> > - p = task_of(se);
> > - h_nr_queued = 1;
> > - h_nr_idle = task_has_idle_policy(p);
> > - if (task_sleep || task_delayed || !se->sched_delayed)
> > - h_nr_runnable = 1;
> > - }
> > + if (task_sleep || task_delayed || !se->sched_delayed)
> > + h_nr_runnable = 1;
> >
> > for_each_sched_entity(se) {
> > - cfs_rq = cfs_rq_of(se);
> > + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> >
> > - if (!dequeue_entity(cfs_rq, se, flags)) {
> > - if (p && &p->se == se)
> > - return -1;
> > + update_curr(cfs_rq);
> >
> > - slice = cfs_rq_min_slice(cfs_rq);
> > - break;
> > + if (dequeue) {
> > + dequeue_entity(cfs_rq, se, flags);
> > + /* Don't dequeue parent if it has other entities besides us */
> > + if (cfs_rq->load.weight)
> > + dequeue = false;
> > + } else {
> > + update_load_avg(cfs_rq, se, UPDATE_TG);
> > + se_update_runnable(se);
> > + update_cfs_group(se);
> > }
> >
> > cfs_rq->h_nr_runnable -= h_nr_runnable;
> > - cfs_rq->h_nr_queued -= h_nr_queued;
> > + cfs_rq->h_nr_queued--;
> > cfs_rq->h_nr_idle -= h_nr_idle;
> >
> > if (cfs_rq_is_idle(cfs_rq))
> > - h_nr_idle = h_nr_queued;
> > + h_nr_idle = 1;
> >
> > if (throttled_hierarchy(cfs_rq) && task_throttled)
> > record_throttle_clock(cfs_rq);
> >
> > - /* Don't dequeue parent if it has other entities besides us */
> > - if (cfs_rq->load.weight) {
> > - slice = cfs_rq_min_slice(cfs_rq);
> > -
> > - /* Avoid re-evaluating load for this entity: */
> > - se = parent_entity(se);
> > - /*
> > - * Bias pick_next to pick a task from this cfs_rq, as
> > - * p is sleeping when it is within its sched_slice.
> > - */
> > - if (task_sleep && se)
> > - set_next_buddy(se);
> > - break;
> > - }
> > flags |= DEQUEUE_SLEEP;
> > flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
> > }
> > +}
> >
> > - for_each_sched_entity(se) {
> > - cfs_rq = cfs_rq_of(se);
> > +/*
> > + * The part of dequeue_task_fair() that is needed to dequeue delayed tasks.
> > + *
> > + * Returns:
> > + * true - dequeued
> > + * false - delayed
> > + */
> > +static bool __dequeue_task(struct rq *rq, struct task_struct *p, int flags)
> > +{
> > + struct sched_entity *se = &p->se;
> > + struct cfs_rq *cfs_rq = &rq->cfs;
> > + bool was_sched_idle = sched_idle_rq(rq);
> > + bool task_sleep = flags & DEQUEUE_SLEEP;
> > + bool task_delayed = flags & DEQUEUE_DELAYED;
> >
> > - update_load_avg(cfs_rq, se, UPDATE_TG);
> > - se_update_runnable(se);
> > - update_cfs_group(se);
> > + clear_buddies(cfs_rq, se);
> >
> > - se->slice = slice;
> > - if (se != cfs_rq->curr)
> > - min_vruntime_cb_propagate(&se->run_node, NULL);
> > - slice = cfs_rq_min_slice(cfs_rq);
> > + if (flags & DEQUEUE_DELAYED) {
> > + WARN_ON_ONCE(!se->sched_delayed);
> > + } else {
> > + bool delay = task_sleep;
> > + /*
> > + * DELAY_DEQUEUE relies on spurious wakeups, special task
> > + * states must not suffer spurious wakeups, exempt them.
> > + */
> > + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
> > + delay = false;
> >
> > - cfs_rq->h_nr_runnable -= h_nr_runnable;
> > - cfs_rq->h_nr_queued -= h_nr_queued;
> > - cfs_rq->h_nr_idle -= h_nr_idle;
> > + WARN_ON_ONCE(delay && se->sched_delayed);
> >
> > - if (cfs_rq_is_idle(cfs_rq))
> > - h_nr_idle = h_nr_queued;
> > + if (sched_feat(DELAY_DEQUEUE) && delay &&
> > + !entity_eligible(cfs_rq, se)) {
> > + update_load_avg(cfs_rq_of(se), se, 0);
>
> update_entity_lag(cfs_rq, se); is missing here. Unfortunately, adding it
> doesn't fix my regression.
>
> > + set_delayed(se);
> > + return false;
> > + }
> > + }
> >
> > - if (throttled_hierarchy(cfs_rq) && task_throttled)
> > - record_throttle_clock(cfs_rq);
> > + dequeue_hierarchy(p, flags);
> > +
> > + update_entity_lag(cfs_rq, se);
> > + if (sched_feat(PLACE_REL_DEADLINE) && !task_sleep) {
> > + se->deadline -= se->vruntime;
> > + se->rel_deadline = 1;
> > }
> > + if (se != cfs_rq->curr)
> > + __dequeue_entity(cfs_rq, se);
> >
> > - sub_nr_running(rq, h_nr_queued);
> > + sub_nr_running(rq, 1);
> >
> > /* balance early to pull high priority tasks */
> > if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
> > rq->next_balance = jiffies;
> >
> > - if (p && task_delayed) {
> > + if (task_delayed) {
> > + clear_delayed(se);
> > +
> > WARN_ON_ONCE(!task_sleep);
> > WARN_ON_ONCE(p->on_rq != 1);
> >
> > @@ -7514,7 +7458,7 @@ static int dequeue_entities(struct rq *r
> > __block_task(rq, p);
> > }
> >
> > - return 1;
> > + return true;
> > }
> >
> > /*
> > @@ -7533,11 +7477,11 @@ static bool dequeue_task_fair(struct rq
> > util_est_dequeue(&rq->cfs, p);
> >
> > util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
> > - if (dequeue_entities(rq, &p->se, flags) < 0)
> > + if (!__dequeue_task(rq, p, flags))
> > return false;
> >
> > /*
> > - * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
> > + * Must not reference @p after __dequeue_task(DEQUEUE_DELAYED).
> > */
> > return true;
> > }
> > @@ -9021,19 +8965,6 @@ static void migrate_task_rq_fair(struct
> > static void task_dead_fair(struct task_struct *p)
> > {
> > struct sched_entity *se = &p->se;
> > -
> > - if (se->sched_delayed) {
> > - struct rq_flags rf;
> > - struct rq *rq;
> > -
> > - rq = task_rq_lock(p, &rf);
> > - if (se->sched_delayed) {
> > - update_rq_clock(rq);
> > - dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> > - }
> > - task_rq_unlock(rq, p, &rf);
> > - }
> > -
> > remove_entity_load_avg(se);
> > }
> >
> > @@ -9067,21 +8998,10 @@ static void set_cpus_allowed_fair(struct
> > set_task_max_allowed_capacity(p);
> > }
> >
> > -static void set_next_buddy(struct sched_entity *se)
> > -{
> > - for_each_sched_entity(se) {
> > - if (WARN_ON_ONCE(!se->on_rq))
> > - return;
> > - if (se_is_idle(se))
> > - return;
> > - cfs_rq_of(se)->next = se;
> > - }
> > -}
> > -
> > enum preempt_wakeup_action {
> > PREEMPT_WAKEUP_NONE, /* No preemption. */
> > PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */
> > - PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */
> > + PREEMPT_WAKEUP_PICK, /* Let pick_eevdf() decide. */
> > PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */
> > };
> >
> > @@ -9098,7 +9018,7 @@ set_preempt_buddy(struct cfs_rq *cfs_rq,
> > if (cfs_rq->next && entity_before(cfs_rq->next, pse))
> > return false;
> >
> > - set_next_buddy(pse);
> > + set_next_buddy(cfs_rq, pse);
> > return true;
> > }
> >
> > @@ -9188,7 +9108,6 @@ static void wakeup_preempt_fair(struct r
> > if (!sched_feat(WAKEUP_PREEMPTION))
> > return;
> >
> > - find_matching_se(&se, &pse);
> > WARN_ON_ONCE(!pse);
> >
> > cse_is_idle = se_is_idle(se);
> > @@ -9216,8 +9135,7 @@ static void wakeup_preempt_fair(struct r
> > if (unlikely(!normal_policy(p->policy)))
> > return;
> >
> > - cfs_rq = cfs_rq_of(se);
> > - update_curr(cfs_rq);
> > + update_curr_fair(rq);
> > /*
> > * If @p has a shorter slice than current and @p is eligible, override
> > * current's slice protection in order to allow preemption.
> > @@ -9261,18 +9179,15 @@ static void wakeup_preempt_fair(struct r
> > }
> >
> > pick:
> > - nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT);
> > - /* If @p has become the most eligible task, force preemption */
> > - if (nse == pse)
> > - goto preempt;
> > -
> > - /*
> > - * Because p is enqueued, nse being null can only mean that we
> > - * dequeued a delayed task. If there are still entities queued in
> > - * cfs, check if the next one will be p.
> > - */
> > - if (!nse && cfs_rq->nr_queued)
> > - goto pick;
> > + if (cfs_rq->h_nr_queued) {
> > + nse = pick_next_entity(rq, preempt_action != PREEMPT_WAKEUP_SHORT);
> > + if (unlikely(!nse))
> > + goto pick;
> > +
> > + /* If @p has become the most eligible task, force preemption */
> > + if (nse == pse)
> > + goto preempt;
> > + }
> >
> > if (sched_feat(RUN_TO_PARITY))
> > update_protect_slice(cfs_rq, se);
> > @@ -9291,34 +9206,25 @@ static void wakeup_preempt_fair(struct r
> > struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
> > __must_hold(__rq_lockp(rq))
> > {
> > + struct cfs_rq *cfs_rq = &rq->cfs;
> > struct sched_entity *se;
> > - struct cfs_rq *cfs_rq;
> > struct task_struct *p;
> > - bool throttled;
> > int new_tasks;
> >
> > again:
> > - cfs_rq = &rq->cfs;
> > - if (!cfs_rq->nr_queued)
> > + if (!cfs_rq->h_nr_queued)
> > goto idle;
> >
> > - throttled = false;
> > -
> > - do {
> > - /* Might not have done put_prev_entity() */
> > - if (cfs_rq->curr && cfs_rq->curr->on_rq)
> > - update_curr(cfs_rq);
> > -
> > - throttled |= check_cfs_rq_runtime(cfs_rq);
> > + /* Might not have done put_prev_entity() */
> > + if (cfs_rq->curr && cfs_rq->curr->on_rq)
> > + update_curr(cfs_rq);
> >
> > - se = pick_next_entity(rq, cfs_rq, true);
> > - if (!se)
> > - goto again;
> > - cfs_rq = group_cfs_rq(se);
> > - } while (cfs_rq);
> > + se = pick_next_entity(rq, true);
> > + if (!se)
> > + goto again;
> >
> > p = task_of(se);
> > - if (unlikely(throttled))
> > + if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se))))
> > task_throttle_setup_work(p);
> > return p;
> >
> > @@ -9353,7 +9259,7 @@ void fair_server_init(struct rq *rq)
> > static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next)
> > {
> > struct sched_entity *se = &prev->se;
> > - struct cfs_rq *cfs_rq;
> > + struct cfs_rq *cfs_rq = &rq->cfs;
> > struct sched_entity *nse = NULL;
> >
> > #ifdef CONFIG_FAIR_GROUP_SCHED
> > @@ -9363,7 +9269,7 @@ static void put_prev_task_fair(struct rq
> >
> > while (se) {
> > cfs_rq = cfs_rq_of(se);
> > - if (!nse || cfs_rq->curr)
> > + if (!nse || cfs_rq->h_curr)
> > put_prev_entity(cfs_rq, se);
> > #ifdef CONFIG_FAIR_GROUP_SCHED
> > if (nse) {
> > @@ -9382,6 +9288,14 @@ static void put_prev_task_fair(struct rq
> > #endif
> > se = parent_entity(se);
> > }
> > +
> > + /* Put 'current' back into the tree. */
> > + cfs_rq = &rq->cfs;
> > + se = &prev->se;
> > + WARN_ON_ONCE(cfs_rq->curr != se);
> > + cfs_rq->curr = NULL;
> > + if (se->on_rq)
> > + __enqueue_entity(cfs_rq, se);
> > }
> >
> > /*
> > @@ -9390,8 +9304,8 @@ static void put_prev_task_fair(struct rq
> > static void yield_task_fair(struct rq *rq)
> > {
> > struct task_struct *curr = rq->donor;
> > - struct cfs_rq *cfs_rq = task_cfs_rq(curr);
> > struct sched_entity *se = &curr->se;
> > + struct cfs_rq *cfs_rq = &rq->cfs;
> >
> > /*
> > * Are we the only task in the tree?
> > @@ -9432,11 +9346,11 @@ static bool yield_to_task_fair(struct rq
> > struct sched_entity *se = &p->se;
> >
> > /* !se->on_rq also covers throttled task */
> > - if (!se->on_rq)
> > + if (!se->on_rq || se->sched_delayed)
> > return false;
> >
> > /* Tell the scheduler that we'd really like se to run next. */
> > - set_next_buddy(se);
> > + set_next_buddy(&task_rq(p)->cfs, se);
> >
> > yield_task_fair(rq);
> >
> > @@ -9762,15 +9676,10 @@ static inline long migrate_degrades_loca
> > */
> > static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu)
> > {
> > - struct cfs_rq *dst_cfs_rq;
> > + struct cfs_rq *dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
> >
> > -#ifdef CONFIG_FAIR_GROUP_SCHED
> > - dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
> > -#else
> > - dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
> > -#endif
> > - if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued &&
> > - !entity_eligible(task_cfs_rq(p), &p->se))
> > + if (sched_feat(PLACE_LAG) && dst_cfs_rq->h_nr_queued &&
> > + !entity_eligible(&task_rq(p)->cfs, &p->se))
> > return 1;
> >
> > return 0;
> > @@ -10240,7 +10149,7 @@ static void update_cfs_rq_h_load(struct
> > while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
> > load = cfs_rq->h_load;
> > load = div64_ul(load * se->avg.load_avg,
> > - cfs_rq_load_avg(cfs_rq) + 1);
> > + cfs_rq_load_avg(cfs_rq) + 1);
> > cfs_rq = group_cfs_rq(se);
> > cfs_rq->h_load = load;
> > cfs_rq->last_h_load_update = now;
> > @@ -13459,7 +13368,7 @@ static inline void task_tick_core(struct
> > * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
> > * if we need to give up the CPU.
> > */
> > - if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
> > + if (rq->core->core_forceidle_count && rq->cfs.h_nr_queued == 1 &&
> > __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
> > resched_curr(rq);
> > }
> > @@ -13668,30 +13577,8 @@ bool cfs_prio_less(const struct task_str
> >
> > WARN_ON_ONCE(task_rq(b)->core != rq->core);
> >
> > -#ifdef CONFIG_FAIR_GROUP_SCHED
> > - /*
> > - * Find an se in the hierarchy for tasks a and b, such that the se's
> > - * are immediate siblings.
> > - */
> > - while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
> > - int sea_depth = sea->depth;
> > - int seb_depth = seb->depth;
> > -
> > - if (sea_depth >= seb_depth)
> > - sea = parent_entity(sea);
> > - if (sea_depth <= seb_depth)
> > - seb = parent_entity(seb);
> > - }
> > -
> > - se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
> > - se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
> > -
> > - cfs_rqa = sea->cfs_rq;
> > - cfs_rqb = seb->cfs_rq;
> > -#else /* !CONFIG_FAIR_GROUP_SCHED: */
> > cfs_rqa = &task_rq(a)->cfs;
> > cfs_rqb = &task_rq(b)->cfs;
> > -#endif /* !CONFIG_FAIR_GROUP_SCHED */
> >
> > /*
> > * Find delta after normalizing se's vruntime with its cfs_rq's
> > @@ -13729,14 +13616,20 @@ static inline void task_tick_core(struct
> > */
> > static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
> > {
> > - struct cfs_rq *cfs_rq;
> > struct sched_entity *se = &curr->se;
> > + unsigned long weight = NICE_0_LOAD;
> > + struct cfs_rq *cfs_rq;
> >
> > for_each_sched_entity(se) {
> > cfs_rq = cfs_rq_of(se);
> > entity_tick(cfs_rq, se, queued);
> > +
> > + weight = __calc_prop_weight(cfs_rq, se, weight);
> > }
> >
> > + se = &curr->se;
> > + reweight_eevdf(cfs_rq, se, weight, se->on_rq);
> > +
> > if (queued)
> > return;
> >
> > @@ -13772,7 +13665,7 @@ prio_changed_fair(struct rq *rq, struct
> > if (p->prio == oldprio)
> > return;
> >
> > - if (rq->cfs.nr_queued == 1)
> > + if (rq->cfs.h_nr_queued == 1)
> > return;
> >
> > /*
> > @@ -13901,29 +13794,40 @@ static void switched_to_fair(struct rq *
> > }
> > }
> >
> > -/*
> > - * Account for a task changing its policy or group.
> > - *
> > - * This routine is mostly called to set cfs_rq->curr field when a task
> > - * migrates between groups/classes.
> > - */
> > static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
> > {
> > struct sched_entity *se = &p->se;
> > + struct cfs_rq *cfs_rq = &rq->cfs;
> > + unsigned long weight = NICE_0_LOAD;
> > + bool on_rq = se->on_rq;
> > +
> > + clear_buddies(cfs_rq, se);
> > +
> > + if (on_rq)
> > + __dequeue_entity(cfs_rq, se);
> >
> > for_each_sched_entity(se) {
> > - struct cfs_rq *cfs_rq = cfs_rq_of(se);
> > + cfs_rq = cfs_rq_of(se);
> >
> > - if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) &&
> > - first && cfs_rq->curr)
> > - break;
> > + if (!IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) ||
> > + !first || !cfs_rq->h_curr)
> > + set_next_entity(cfs_rq, se);
> >
> > - set_next_entity(cfs_rq, se, first);
> > /* ensure bandwidth has been allocated on our new cfs_rq */
> > account_cfs_rq_runtime(cfs_rq, 0);
> > +
> > + if (on_rq)
> > + weight = __calc_prop_weight(cfs_rq, se, weight);
> > }
> >
> > se = &p->se;
> > + cfs_rq->curr = se;
> > +
> > + if (on_rq) {
> > + reweight_eevdf(cfs_rq, se, weight, se->on_rq);
> > + if (first)
> > + set_protect_slice(cfs_rq, se);
> > + }
> >
> > if (task_on_rq_queued(p)) {
> > /*
> > @@ -14054,17 +13958,8 @@ void unregister_fair_sched_group(struct
> > struct sched_entity *se = tg->se[cpu];
> > struct rq *rq = cpu_rq(cpu);
> >
> > - if (se) {
> > - if (se->sched_delayed) {
> > - guard(rq_lock_irqsave)(rq);
> > - if (se->sched_delayed) {
> > - update_rq_clock(rq);
> > - dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> > - }
> > - list_del_leaf_cfs_rq(cfs_rq);
> > - }
> > + if (se)
> > remove_entity_load_avg(se);
> > - }
> >
> > /*
> > * Only empty task groups can be destroyed; so we can speculatively
> > --- a/kernel/sched/pelt.c
> > +++ b/kernel/sched/pelt.c
> > @@ -206,7 +206,7 @@ ___update_load_sum(u64 now, struct sched
> > /*
> > * running is a subset of runnable (weight) so running can't be set if
> > * runnable is clear. But there are some corner cases where the current
> > - * se has been already dequeued but cfs_rq->curr still points to it.
> > + * se has been already dequeued but cfs_rq->h_curr still points to it.
> > * This means that weight will be 0 but not running for a sched_entity
> > * but also for a cfs_rq if the latter becomes idle. As an example,
> > * this happens during sched_balance_newidle() which calls
> > @@ -307,7 +307,7 @@ int __update_load_avg_blocked_se(u64 now
> > int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
> > - cfs_rq->curr == se)) {
> > + cfs_rq->h_curr == se)) {
> >
> > ___update_load_avg(&se->avg, se_weight(se));
> > cfs_se_util_change(&se->avg);
> > @@ -323,7 +323,7 @@ int __update_load_avg_cfs_rq(u64 now, st
> > if (___update_load_sum(now, &cfs_rq->avg,
> > scale_load_down(cfs_rq->load.weight),
> > cfs_rq->h_nr_runnable,
> > - cfs_rq->curr != NULL)) {
> > + cfs_rq->h_curr != NULL)) {
> >
> > ___update_load_avg(&cfs_rq->avg, 1);
> > trace_pelt_cfs_tp(cfs_rq);
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -528,21 +528,8 @@ struct task_group {
> >
> > };
> >
> > -#ifdef CONFIG_GROUP_SCHED_WEIGHT
> > #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
> >
> > -/*
> > - * A weight of 0 or 1 can cause arithmetics problems.
> > - * A weight of a cfs_rq is the sum of weights of which entities
> > - * are queued on this cfs_rq, so a weight of a entity should not be
> > - * too large, so as the shares value of a task group.
> > - * (The default weight is 1024 - so there's no practical
> > - * limitation from this.)
> > - */
> > -#define MIN_SHARES (1UL << 1)
> > -#define MAX_SHARES (1UL << 18)
> > -#endif
> > -
> > typedef int (*tg_visitor)(struct task_group *, void *);
> >
> > extern int walk_tg_tree_from(struct task_group *from,
> > @@ -629,6 +616,17 @@ static inline bool cfs_task_bw_constrain
> >
> > #endif /* !CONFIG_CGROUP_SCHED */
> >
> > +/*
> > + * A weight of 0 or 1 can cause arithmetic problems.
> > + * The weight of a cfs_rq is the sum of the weights of the entities
> > + * queued on it, so the weight of an entity should not be too
> > + * large, and neither should the shares value of a task group.
> > + * (The default weight is 1024 - so there's no practical
> > + * limitation from this.)
> > + */
> > +#define MIN_SHARES (1UL << 1)
> > +#define MAX_SHARES (1UL << 18)
> > +
> > extern void unregister_rt_sched_group(struct task_group *tg);
> > extern void free_rt_sched_group(struct task_group *tg);
> > extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
> > @@ -707,6 +705,7 @@ struct cfs_rq {
> > /*
> > * CFS load tracking
> > */
> > + struct sched_entity *h_curr;
> > struct sched_avg avg;
> > #ifndef CONFIG_64BIT
> > u64 last_update_time_copy;
> > @@ -2509,6 +2508,7 @@ extern const u32 sched_prio_to_wmult[40
> > #define ENQUEUE_MIGRATED 0x00040000
> > #define ENQUEUE_INITIAL 0x00080000
> > #define ENQUEUE_RQ_SELECTED 0x00100000
> > +#define ENQUEUE_QUEUED 0x00200000
> >
> > #define RETRY_TASK ((void *)-1UL)
> >
> >
> >