Re: [PATCH V2 1/2] sched/core: Rename and move schedutil_cpu_util() to core.c

From: Vincent Guittot
Date: Fri Oct 23 2020 - 06:43:49 EST


On Fri, 23 Oct 2020 at 12:20, Viresh Kumar <viresh.kumar@xxxxxxxxxx> wrote:
>
> There is nothing schedutil specific in schedutil_cpu_util(), move it to
> core.c and rename it to sched_cpu_util(), so it can be used from other

I wonder if pelt.c would be a better place for this than core.c ?

> parts of the kernel as well.
>
> The cpufreq_cooling stuff will make use of this in a later commit.
>
> Signed-off-by: Viresh Kumar <viresh.kumar@xxxxxxxxxx>
> ---
> include/linux/sched.h | 19 +++++
> kernel/sched/core.c | 113 ++++++++++++++++++++++++++++++
> kernel/sched/cpufreq_schedutil.c | 116 +------------------------------
> kernel/sched/fair.c | 6 +-
> kernel/sched/sched.h | 29 +-------
> 5 files changed, 140 insertions(+), 143 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 393db0690101..3c27c10141cb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1930,6 +1930,25 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
> #define TASK_SIZE_OF(tsk) TASK_SIZE
> #endif
>
> +/**
> + * enum cpu_util_type - CPU utilization type
> + * @FREQUENCY_UTIL: Utilization used to select frequency
> + * @ENERGY_UTIL: Utilization used during energy calculation
> + *
> + * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
> + * need to be aggregated differently depending on the usage made of them. This
> + * enum is used within sched_cpu_util() to differentiate the types of
> + * utilization expected by the callers, and adjust the aggregation accordingly.
> + */
> +enum cpu_util_type {
> + FREQUENCY_UTIL,
> + ENERGY_UTIL,
> +};
> +
> +/* Returns effective CPU utilization, as seen by the scheduler */
> +unsigned long sched_cpu_util(int cpu, enum cpu_util_type type,
> + unsigned long max);
> +
> #ifdef CONFIG_RSEQ
>
> /*
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d2003a7d5ab5..369ff54d11d4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5117,6 +5117,119 @@ struct task_struct *idle_task(int cpu)
> return cpu_rq(cpu)->idle;
> }
>
> +/*
> + * This function computes an effective utilization for the given CPU, to be
> + * used for frequency selection given the linear relation: f = u * f_max.
> + *
> + * The scheduler tracks the following metrics:
> + *
> + * cpu_util_{cfs,rt,dl,irq}()
> + * cpu_bw_dl()
> + *
> + * Where the cfs,rt and dl util numbers are tracked with the same metric and
> + * synchronized windows and are thus directly comparable.
> + *
> + * The cfs,rt,dl utilization are the running times measured with rq->clock_task
> + * which excludes things like IRQ and steal-time. These latter are then accrued
> + * in the irq utilization.
> + *
> + * The DL bandwidth number otoh is not a measured metric but a value computed
> + * based on the task model parameters and gives the minimal utilization
> + * required to meet deadlines.
> + */
> +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
> + unsigned long max, enum cpu_util_type type,
> + struct task_struct *p)
> +{
> + unsigned long dl_util, util, irq;
> + struct rq *rq = cpu_rq(cpu);
> +
> + if (!uclamp_is_used() &&
> + type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
> + return max;
> + }
> +
> + /*
> + * Early check to see if IRQ/steal time saturates the CPU, can be
> + * because of inaccuracies in how we track these -- see
> + * update_irq_load_avg().
> + */
> + irq = cpu_util_irq(rq);
> + if (unlikely(irq >= max))
> + return max;
> +
> + /*
> + * Because the time spend on RT/DL tasks is visible as 'lost' time to
> + * CFS tasks and we use the same metric to track the effective
> + * utilization (PELT windows are synchronized) we can directly add them
> + * to obtain the CPU's actual utilization.
> + *
> + * CFS and RT utilization can be boosted or capped, depending on
> + * utilization clamp constraints requested by currently RUNNABLE
> + * tasks.
> + * When there are no CFS RUNNABLE tasks, clamps are released and
> + * frequency will be gracefully reduced with the utilization decay.
> + */
> + util = util_cfs + cpu_util_rt(rq);
> + if (type == FREQUENCY_UTIL)
> + util = uclamp_rq_util_with(rq, util, p);
> +
> + dl_util = cpu_util_dl(rq);
> +
> + /*
> + * For frequency selection we do not make cpu_util_dl() a permanent part
> + * of this sum because we want to use cpu_bw_dl() later on, but we need
> + * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
> + * that we select f_max when there is no idle time.
> + *
> + * NOTE: numerical errors or stop class might cause us to not quite hit
> + * saturation when we should -- something for later.
> + */
> + if (util + dl_util >= max)
> + return max;
> +
> + /*
> + * OTOH, for energy computation we need the estimated running time, so
> + * include util_dl and ignore dl_bw.
> + */
> + if (type == ENERGY_UTIL)
> + util += dl_util;
> +
> + /*
> + * There is still idle time; further improve the number by using the
> + * irq metric. Because IRQ/steal time is hidden from the task clock we
> + * need to scale the task numbers:
> + *
> + * max - irq
> + * U' = irq + --------- * U
> + * max
> + */
> + util = scale_irq_capacity(util, irq, max);
> + util += irq;
> +
> + /*
> + * Bandwidth required by DEADLINE must always be granted while, for
> + * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
> + * to gracefully reduce the frequency when no tasks show up for longer
> + * periods of time.
> + *
> + * Ideally we would like to set bw_dl as min/guaranteed freq and util +
> + * bw_dl as requested freq. However, cpufreq is not yet ready for such
> + * an interface. So, we only do the latter for now.
> + */
> + if (type == FREQUENCY_UTIL)
> + util += cpu_bw_dl(rq);
> +
> + return min(max, util);
> +}
> +
> +unsigned long sched_cpu_util(int cpu, enum cpu_util_type type,
> + unsigned long max)
> +{
> + return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max, type,
> + NULL);
> +}
> +
> /**
> * find_process_by_pid - find a process with a matching PID value.
> * @pid: the pid in question.
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index 5ae7b4e6e8d6..0c5c61a095f6 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -169,122 +169,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
> return cpufreq_driver_resolve_freq(policy, freq);
> }
>
> -/*
> - * This function computes an effective utilization for the given CPU, to be
> - * used for frequency selection given the linear relation: f = u * f_max.
> - *
> - * The scheduler tracks the following metrics:
> - *
> - * cpu_util_{cfs,rt,dl,irq}()
> - * cpu_bw_dl()
> - *
> - * Where the cfs,rt and dl util numbers are tracked with the same metric and
> - * synchronized windows and are thus directly comparable.
> - *
> - * The cfs,rt,dl utilization are the running times measured with rq->clock_task
> - * which excludes things like IRQ and steal-time. These latter are then accrued
> - * in the irq utilization.
> - *
> - * The DL bandwidth number otoh is not a measured metric but a value computed
> - * based on the task model parameters and gives the minimal utilization
> - * required to meet deadlines.
> - */
> -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
> - unsigned long max, enum schedutil_type type,
> - struct task_struct *p)
> -{
> - unsigned long dl_util, util, irq;
> - struct rq *rq = cpu_rq(cpu);
> -
> - if (!uclamp_is_used() &&
> - type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
> - return max;
> - }
> -
> - /*
> - * Early check to see if IRQ/steal time saturates the CPU, can be
> - * because of inaccuracies in how we track these -- see
> - * update_irq_load_avg().
> - */
> - irq = cpu_util_irq(rq);
> - if (unlikely(irq >= max))
> - return max;
> -
> - /*
> - * Because the time spend on RT/DL tasks is visible as 'lost' time to
> - * CFS tasks and we use the same metric to track the effective
> - * utilization (PELT windows are synchronized) we can directly add them
> - * to obtain the CPU's actual utilization.
> - *
> - * CFS and RT utilization can be boosted or capped, depending on
> - * utilization clamp constraints requested by currently RUNNABLE
> - * tasks.
> - * When there are no CFS RUNNABLE tasks, clamps are released and
> - * frequency will be gracefully reduced with the utilization decay.
> - */
> - util = util_cfs + cpu_util_rt(rq);
> - if (type == FREQUENCY_UTIL)
> - util = uclamp_rq_util_with(rq, util, p);
> -
> - dl_util = cpu_util_dl(rq);
> -
> - /*
> - * For frequency selection we do not make cpu_util_dl() a permanent part
> - * of this sum because we want to use cpu_bw_dl() later on, but we need
> - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
> - * that we select f_max when there is no idle time.
> - *
> - * NOTE: numerical errors or stop class might cause us to not quite hit
> - * saturation when we should -- something for later.
> - */
> - if (util + dl_util >= max)
> - return max;
> -
> - /*
> - * OTOH, for energy computation we need the estimated running time, so
> - * include util_dl and ignore dl_bw.
> - */
> - if (type == ENERGY_UTIL)
> - util += dl_util;
> -
> - /*
> - * There is still idle time; further improve the number by using the
> - * irq metric. Because IRQ/steal time is hidden from the task clock we
> - * need to scale the task numbers:
> - *
> - * max - irq
> - * U' = irq + --------- * U
> - * max
> - */
> - util = scale_irq_capacity(util, irq, max);
> - util += irq;
> -
> - /*
> - * Bandwidth required by DEADLINE must always be granted while, for
> - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
> - * to gracefully reduce the frequency when no tasks show up for longer
> - * periods of time.
> - *
> - * Ideally we would like to set bw_dl as min/guaranteed freq and util +
> - * bw_dl as requested freq. However, cpufreq is not yet ready for such
> - * an interface. So, we only do the latter for now.
> - */
> - if (type == FREQUENCY_UTIL)
> - util += cpu_bw_dl(rq);
> -
> - return min(max, util);
> -}
> -
> static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
> {
> - struct rq *rq = cpu_rq(sg_cpu->cpu);
> - unsigned long util = cpu_util_cfs(rq);
> - unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
> -
> - sg_cpu->max = max;
> - sg_cpu->bw_dl = cpu_bw_dl(rq);
> + sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
> + sg_cpu->bw_dl = cpu_bw_dl(cpu_rq(sg_cpu->cpu));
>
> - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
> + return sched_cpu_util(sg_cpu->cpu, FREQUENCY_UTIL, sg_cpu->max);
> }
>
> /**
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index aa4c6227cd6d..52e2d866e875 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6499,7 +6499,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
> * is already enough to scale the EM reported power
> * consumption at the (eventually clamped) cpu_capacity.
> */
> - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
> + sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap,
> ENERGY_UTIL, NULL);
>
> /*
> @@ -6509,7 +6509,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
> * NOTE: in case RT tasks are running, by default the
> * FREQUENCY_UTIL's utilization can be max OPP.
> */
> - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
> + cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap,
> FREQUENCY_UTIL, tsk);
> max_util = max(max_util, cpu_util);
> }
> @@ -6607,7 +6607,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> * IOW, placing the task there would make the CPU
> * overutilized. Take uclamp into account to see how
> * much capacity we can get out of the CPU; this is
> - * aligned with schedutil_cpu_util().
> + * aligned with sched_cpu_util().
> */
> util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
> if (!fits_capacity(util, cpu_cap))
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index df80bfcea92e..0f0439344eec 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2486,25 +2486,8 @@ static inline unsigned long capacity_orig_of(int cpu)
> }
> #endif
>
> -/**
> - * enum schedutil_type - CPU utilization type
> - * @FREQUENCY_UTIL: Utilization used to select frequency
> - * @ENERGY_UTIL: Utilization used during energy calculation
> - *
> - * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
> - * need to be aggregated differently depending on the usage made of them. This
> - * enum is used within schedutil_freq_util() to differentiate the types of
> - * utilization expected by the callers, and adjust the aggregation accordingly.
> - */
> -enum schedutil_type {
> - FREQUENCY_UTIL,
> - ENERGY_UTIL,
> -};
> -
> -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
> -
> -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
> - unsigned long max, enum schedutil_type type,
> +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
> + unsigned long max, enum cpu_util_type type,
> struct task_struct *p);
>
> static inline unsigned long cpu_bw_dl(struct rq *rq)
> @@ -2533,14 +2516,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
> {
> return READ_ONCE(rq->avg_rt.util_avg);
> }
> -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
> -static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
> - unsigned long max, enum schedutil_type type,
> - struct task_struct *p)
> -{
> - return 0;
> -}
> -#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
>
> #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
> static inline unsigned long cpu_util_irq(struct rq *rq)
> --
> 2.25.0.rc1.19.g042ed3e048af
>