Re: [PATCH 10/24] sched/uclamp: Handle delayed dequeue

From: Luis Machado
Date: Tue Sep 10 2024 - 07:04:49 EST


Peter,

On 9/6/24 11:45, Peter Zijlstra wrote:
> On Thu, Sep 05, 2024 at 04:53:54PM +0200, Peter Zijlstra wrote:
>
>>> But then, like you said, __update_load_avg_cfs_rq() needs correct
>>> cfs_rq->h_nr_running.
>>
>> Uff. So yes __update_load_avg_cfs_rq() needs a different number, but
>> I'll contest that h_nr_running is in fact correct, albeit no longer
>> suitable for this purpose.
>>
>> We can track h_nr_delayed I suppose, and subtract that.
>
> Something like so?
>
> ---
> kernel/sched/debug.c | 1 +
> kernel/sched/fair.c | 49 ++++++++++++++++++++++++++++++++++++++++++++-----
> kernel/sched/pelt.c | 2 +-
> kernel/sched/sched.h | 7 +++++--
> 4 files changed, 51 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 01ce9a76164c..3d3c5be78075 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -829,6 +829,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
> SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
> SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
> + SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
> SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
> cfs_rq->idle_nr_running);
> SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 11e890486c1b..629b46308960 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5456,9 +5456,31 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
>
> static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
>
> -static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
> +static void set_delayed(struct sched_entity *se)
> +{
> + se->sched_delayed = 1;
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + cfs_rq->h_nr_delayed++;
> + if (cfs_rq_throttled(cfs_rq))
> + break;
> + }
> +}
> +
> +static void clear_delayed(struct sched_entity *se)
> {
> se->sched_delayed = 0;
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + cfs_rq->h_nr_delayed--;
> + if (cfs_rq_throttled(cfs_rq))
> + break;
> + }
> +}
> +
> +static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
> +{
> + clear_delayed(se);
> if (sched_feat(DELAY_ZERO) && se->vlag > 0)
> se->vlag = 0;
> }
> @@ -5488,7 +5510,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> update_load_avg(cfs_rq, se, 0);
> - se->sched_delayed = 1;
> + set_delayed(se);
> return false;
> }
> }
> @@ -5907,7 +5929,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
> struct rq *rq = rq_of(cfs_rq);
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> struct sched_entity *se;
> - long task_delta, idle_task_delta, dequeue = 1;
> + long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
> long rq_h_nr_running = rq->cfs.h_nr_running;
>
> raw_spin_lock(&cfs_b->lock);
> @@ -5940,6 +5962,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
>
> task_delta = cfs_rq->h_nr_running;
> idle_task_delta = cfs_rq->idle_h_nr_running;
> + delayed_delta = cfs_rq->h_nr_delayed;
> for_each_sched_entity(se) {
> struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> int flags;
> @@ -5963,6 +5986,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
>
> qcfs_rq->h_nr_running -= task_delta;
> qcfs_rq->idle_h_nr_running -= idle_task_delta;
> + qcfs_rq->h_nr_delayed -= delayed_delta;
>
> if (qcfs_rq->load.weight) {
> /* Avoid re-evaluating load for this entity: */
> @@ -5985,6 +6009,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
>
> qcfs_rq->h_nr_running -= task_delta;
> qcfs_rq->idle_h_nr_running -= idle_task_delta;
> + qcfs_rq->h_nr_delayed -= delayed_delta;
> }
>
> /* At this point se is NULL and we are at root level*/
> @@ -6010,7 +6035,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> struct rq *rq = rq_of(cfs_rq);
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> struct sched_entity *se;
> - long task_delta, idle_task_delta;
> + long task_delta, idle_task_delta, delayed_delta;
> long rq_h_nr_running = rq->cfs.h_nr_running;
>
> se = cfs_rq->tg->se[cpu_of(rq)];
> @@ -6046,6 +6071,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>
> task_delta = cfs_rq->h_nr_running;
> idle_task_delta = cfs_rq->idle_h_nr_running;
> + delayed_delta = cfs_rq->h_nr_delayed;
> for_each_sched_entity(se) {
> struct cfs_rq *qcfs_rq = cfs_rq_of(se);
>
> @@ -6060,6 +6086,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>
> qcfs_rq->h_nr_running += task_delta;
> qcfs_rq->idle_h_nr_running += idle_task_delta;
> + qcfs_rq->h_nr_delayed += delayed_delta;
>
> /* end evaluation on encountering a throttled cfs_rq */
> if (cfs_rq_throttled(qcfs_rq))
> @@ -6077,6 +6104,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>
> qcfs_rq->h_nr_running += task_delta;
> qcfs_rq->idle_h_nr_running += idle_task_delta;
> + qcfs_rq->h_nr_delayed += delayed_delta;
>
> /* end evaluation on encountering a throttled cfs_rq */
> if (cfs_rq_throttled(qcfs_rq))
> @@ -6930,7 +6958,7 @@ requeue_delayed_entity(struct sched_entity *se)
> }
>
> update_load_avg(cfs_rq, se, 0);
> - se->sched_delayed = 0;
> + clear_delayed(se);
> }
>
> /*
> @@ -6944,6 +6972,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> struct cfs_rq *cfs_rq;
> struct sched_entity *se = &p->se;
> int idle_h_nr_running = task_has_idle_policy(p);
> + int h_nr_delayed = 0;
> int task_new = !(flags & ENQUEUE_WAKEUP);
> int rq_h_nr_running = rq->cfs.h_nr_running;
> u64 slice = 0;
> @@ -6953,6 +6982,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> return;
> }
>
> + if (task_new)
> + h_nr_delayed = !!se->sched_delayed;
> +
> /*
> * The code below (indirectly) updates schedutil which looks at
> * the cfs_rq utilization to select a frequency.
> @@ -6991,6 +7023,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> cfs_rq->h_nr_running++;
> cfs_rq->idle_h_nr_running += idle_h_nr_running;
> + cfs_rq->h_nr_delayed += h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = 1;
> @@ -7014,6 +7047,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> cfs_rq->h_nr_running++;
> cfs_rq->idle_h_nr_running += idle_h_nr_running;
> + cfs_rq->h_nr_delayed += h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = 1;
> @@ -7076,6 +7110,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> struct task_struct *p = NULL;
> int idle_h_nr_running = 0;
> int h_nr_running = 0;
> + int h_nr_delayed = 0;
> struct cfs_rq *cfs_rq;
> u64 slice = 0;
>
> @@ -7083,6 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> p = task_of(se);
> h_nr_running = 1;
> idle_h_nr_running = task_has_idle_policy(p);
> + if (!task_sleep && !task_delayed)
> + h_nr_delayed = !!se->sched_delayed;
> } else {
> cfs_rq = group_cfs_rq(se);
> slice = cfs_rq_min_slice(cfs_rq);
> @@ -7100,6 +7137,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>
> cfs_rq->h_nr_running -= h_nr_running;
> cfs_rq->idle_h_nr_running -= idle_h_nr_running;
> + cfs_rq->h_nr_delayed -= h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = h_nr_running;
> @@ -7138,6 +7176,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>
> cfs_rq->h_nr_running -= h_nr_running;
> cfs_rq->idle_h_nr_running -= idle_h_nr_running;
> + cfs_rq->h_nr_delayed -= h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = h_nr_running;
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index fa52906a4478..21e3ff5eb77a 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
> {
> if (___update_load_sum(now, &cfs_rq->avg,
> scale_load_down(cfs_rq->load.weight),
> - cfs_rq->h_nr_running,
> + cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
> cfs_rq->curr != NULL)) {
>
> ___update_load_avg(&cfs_rq->avg, 1);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 3744f16a1293..d91360b0cca1 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -603,6 +603,7 @@ struct cfs_rq {
> unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
> unsigned int idle_nr_running; /* SCHED_IDLE */
> unsigned int idle_h_nr_running; /* SCHED_IDLE */
> + unsigned int h_nr_delayed;
>
> s64 avg_vruntime;
> u64 avg_load;
> @@ -813,8 +814,10 @@ struct dl_rq {
>
> static inline void se_update_runnable(struct sched_entity *se)
> {
> - if (!entity_is_task(se))
> - se->runnable_weight = se->my_q->h_nr_running;
> + if (!entity_is_task(se)) {
> + struct cfs_rq *cfs_rq = se->my_q;
> + se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
> + }
> }
>
> static inline long se_runnable(struct sched_entity *se)

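Just to spell out my reading of the accounting change: a delayed-dequeued entity stays in
h_nr_running, so __update_load_avg_cfs_rq() and se_update_runnable() now feed PELT
h_nr_running minus h_nr_delayed. A trivial standalone model of that difference (the struct
and helper names below are only for illustration, they are not part of the patch):

#include <stdio.h>

/* Toy model of the per-cfs_rq counters touched by the patch above. */
struct cfs_rq_counts {
        unsigned int h_nr_running;   /* still includes delayed-dequeue entities */
        unsigned int h_nr_delayed;   /* entities with se->sched_delayed set */
};

/* The quantity the patch now passes as the runnable contribution. */
static unsigned int runnable_contrib(const struct cfs_rq_counts *c)
{
        return c->h_nr_running - c->h_nr_delayed;
}

int main(void)
{
        /* e.g. 3 hierarchically enqueued entities, 1 of them delayed */
        struct cfs_rq_counts c = { .h_nr_running = 3, .h_nr_delayed = 1 };

        printf("runnable contribution: %u\n", runnable_contrib(&c));
        return 0;
}
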
I gave the above patch a try on our Android workload running on the Pixel 6 with a 6.8-based kernel.

First, I'd like to confirm that Dietmar's fix that went into tip:sched/core ("Fix util_est
accounting for DELAY_DEQUEUE") helps bring the frequencies and power use down to more sensible levels.

With the above changes applied, unfortunately I'm seeing high frequencies and high power usage again.
The pattern looks similar to what we observed with the uclamp inc/dec imbalance.

I haven't investigated this in depth yet, but I'll go stare at some traces and the code, and hopefully
something will ring a bell.