Re: [PATCH v4] Sched/fair: Block nohz tick_stop when cfs bandwidth in use

From: Peter Zijlstra
Date: Fri Jun 30 2023 - 11:07:08 EST


On Fri, Jun 30, 2023 at 09:57:14AM -0400, Phil Auld wrote:

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a68d1276bab0..2685373e12f1 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1229,6 +1229,16 @@ bool sched_can_stop_tick(struct rq *rq)
> if (rq->nr_running > 1)
> return false;
>
> + /*
> + * If there is one task and it has CFS runtime bandwidth constraints
> + * and it's on the cpu now we don't want to stop the tick.
> + */
> + if (sched_feat(HZ_BW) && rq->nr_running == 1 && rq->curr
> + && rq->curr->sched_class == &fair_sched_class && task_on_rq_queued(rq->curr)) {

&& goes at the end of the previous line

rq->curr is never NULL

But surely you can find a saner way to write this?

> + if (sched_cfs_bandwidth_active(rq->curr))
> + return false;
> + }
> +
> return true;
> }
> #endif /* CONFIG_NO_HZ_FULL */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 373ff5f55884..125b1ec4476f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6139,6 +6139,50 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
> rcu_read_unlock();
> }
>
> +#ifdef CONFIG_NO_HZ_FULL
> +static inline bool cfs_se_bandwidth_enabled(struct sched_entity *se)
> +{
> + int ret = 0;
> +
> + for_each_sched_entity(se)
> + ret += cfs_rq_of(se)->runtime_enabled;
> +
> + return ret != 0;
> +}
> +
> +bool sched_cfs_bandwidth_active(struct task_struct *p)
> +{
> + if (cfs_bandwidth_used() && cfs_se_bandwidth_enabled(&p->se))
> + return true;
> +
> + return false;
> +}
> +
> +/* called from pick_next_task_fair() */
> +static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
> +{
> + int cpu = cpu_of(rq);
> +
> + if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
> + return;
> +
> + if (!tick_nohz_full_cpu(cpu))
> + return;
> +
> + if (rq->nr_running != 1)
> + return;
> +
> + /*
> + * We know there is only one task runnable and we've just picked it. The
> + * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
> + * be otherwise able to stop the tick. Just need to check if we are using
> + * bandwidth control.
> + */
> + if (cfs_se_bandwidth_enabled(&p->se))
> + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
> +}

Yeah, I think not; pick_next_task_fair() just walked the cgroup
hierarchy and now you do it again.

Also, why does this code exist at all? Both enqueue/dequeue already end
up in sched_update_tick_depenency() and should be able to handle the
nr_running==1 with bandwidth crap, no?