Re: [GIT PULL] sched_ext: Initial pull request for v6.11

From: Peter Zijlstra
Date: Tue Jul 30 2024 - 05:12:13 EST


On Thu, Jul 25, 2024 at 02:19:07AM +0100, Qais Yousef wrote:

> We really shouldn't change how schedutil works. The governor is supposed to
> behave in a certain way, and we need to ensure consistency. I think you should
> look at how you make your scheduler compatible with it. Adding hooks to say
> apply this perf value that I want is a recipe for randomness.

That would be this part, right?

> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index eece6244f9d2..e683e5d08daa 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -197,8 +197,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
>
> static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
> {
> - unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
> + unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);
>
> + if (!scx_switched_all())
> + util += cpu_util_cfs_boost(sg_cpu->cpu);
> util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
> util = max(util, boost);
> sg_cpu->bw_min = min;
> @@ -325,16 +327,35 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
> }
>
> #ifdef CONFIG_NO_HZ_COMMON
> -static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
> +static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
> {
> - unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
> - bool ret = idle_calls == sg_cpu->saved_idle_calls;
> + unsigned long idle_calls;
> + bool ret;
> +
> + /*
> + * The heuristics in this function is for the fair class. For SCX, the
> + * performance target comes directly from the BPF scheduler. Let's just
> + * follow it.
> + */
> + if (scx_switched_all())
> + return false;

This one does seem really weird. It makes schedutil behave significantly
differently from the BPF point of view, depending on whether you have this
partial mode on or not.

So I would really like this to be reconsidered, as I agree with Qais:
things should be consistent.

> + /* if capped by uclamp_max, always update to be in compliance */
> + if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
> + return false;
> +
> + /*
> + * Maintain the frequency if the CPU has not been idle recently, as
> + * reduction is likely to be premature.
> + */
> + idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
> + ret = idle_calls == sg_cpu->saved_idle_calls;
>
> sg_cpu->saved_idle_calls = idle_calls;
> return ret;
> }
> #else
> -static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
> +static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
> #endif /* CONFIG_NO_HZ_COMMON */
>
> /*
> @@ -382,14 +403,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
> return;
>
> next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
> - /*
> - * Do not reduce the frequency if the CPU has not been idle
> - * recently, as the reduction is likely to be premature then.
> - *
> - * Except when the rq is capped by uclamp_max.
> - */
> - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
> - sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
> +
> + if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
> !sg_policy->need_freq_update) {
> next_f = sg_policy->next_freq;
>
> @@ -436,14 +451,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
> if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
> return;
>
> - /*
> - * Do not reduce the target performance level if the CPU has not been
> - * idle recently, as the reduction is likely to be premature then.
> - *
> - * Except when the rq is capped by uclamp_max.
> - */
> - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
> - sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
> + if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
> sg_cpu->util = prev_util;
>
> cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,