Re: [RFC PATCH v7] sched/fair: select idle cpu from idle cpumask for task wakeup
From: Vincent Guittot
Date: Mon Dec 14 2020 - 04:19:35 EST
On Fri, 11 Dec 2020 at 18:45, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Thu, Dec 10, 2020 at 12:58:33PM +0000, Mel Gorman wrote:
> > The prerequisite patch to make that approach work was rejected though
> > as, on its own, it's not very helpful, and Vincent didn't like that the
> > load_balance_mask was abused to make it effective.
>
> So last time I poked at all this, I found that using more masks was a
> performance issue as well due to added cache misses.
>
> Anyway, I finally found some time to look at all this, and while reading
> over the whole SiS crud to refresh the brain came up with the below...
>
> It's still naff, but at least it gets rid of a bunch of duplicate
> scanning and the LoC count decreases, so it should be awesome. Of
> course, as always, benchmarks will totally ruin everything; we'll see,
> I started some.
>
> It goes on top of Mel's first two patches (which is about as far as I
> got)...
We have several different things that Aubrey's and Mel's patches are
trying to achieve:

Aubrey wants to avoid scanning busy cpus:
- The patch works well on a busy system: I see a significant benefit with
  hackbench with a large number of groups on my 2 nodes * 28 cores * 4 smt
  system:
  hackbench -l 2000 -g 128
  tip       3.334
  w/ patch  2.978 (+12%)
- Aubrey also tried not to scan the cpus that have only been idle for a
  short duration (when the tick is not stopped), but there are problems
  because a tick that is not stopped doesn't imply a short idle. In fact,
  something similar to find_idlest_group_cpu() should be done in this
  case, but then it's no longer a fast-path search.

Mel wants to minimize looping over the cpus:
- Patch 4 is an obvious win on a lightly loaded system because it avoids
  looping twice over the cpus mask when some cpus are idle but no core is
  (see the toy sketch after this list).
- But patch 3 generates a perf regression:
  hackbench -l 2000 -g 1
  tip                                        12.293
  w/ all Mel's patches                       15.163 (-14%)
  w/ Aubrey's + Mel's patches minus patch 3  12.788 (+3.8%)
  But I think that Aubrey's patch doesn't help here; tests without
  Aubrey's patch are running.
- He also tried to use load_balance_mask to do something similar to the
  below.
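
To make the "looping twice" point concrete, here is a small standalone
toy model (illustrative only; the cpu layout and counts are made up and
this is neither Mel's patch nor kernel code) that counts how many cpus
get touched when an idle-core pass over the LLC fails and a separate
idle-cpu pass follows, versus a single merged pass:

/* Toy model: when no core is fully idle but some cpus are, a two-pass
 * flow scans the LLC once looking for an idle core (and fails) and then
 * once more looking for any idle cpu; a merged pass touches far fewer
 * cpus before finding one.
 */
#include <stdbool.h>
#include <stdio.h>

#define NCPU 16        /* 8 cores * 2 smt, made-up numbers */

static bool idle[NCPU];

/* one cpu of each 2-thread core is busy: no idle core, but idle cpus */
static void setup(void)
{
        for (int i = 0; i < NCPU; i++)
                idle[i] = (i % 2) != 0;
}

static int two_pass(int *touched)
{
        /* pass 1: look for a fully idle core (always fails here) */
        for (int core = 0; core < NCPU / 2; core++) {
                *touched += 2;
                if (idle[2 * core] && idle[2 * core + 1])
                        return 2 * core;
        }
        /* pass 2: look for any idle cpu */
        for (int cpu = 0; cpu < NCPU; cpu++) {
                (*touched)++;
                if (idle[cpu])
                        return cpu;
        }
        return -1;
}

static int one_pass(int *touched)
{
        /* single scan: take the first idle cpu */
        for (int cpu = 0; cpu < NCPU; cpu++) {
                (*touched)++;
                if (idle[cpu])
                        return cpu;
        }
        return -1;
}

int main(void)
{
        int t2 = 0, t1 = 0;

        setup();
        printf("two passes: cpu %d after touching %d cpus\n", two_pass(&t2), t2);
        printf("one pass:   cpu %d after touching %d cpus\n", one_pass(&t1), t1);
        return 0;
}

With this layout the two-pass flow touches 18 cpus before returning cpu 1,
while the single pass touches only 2.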
>
> ---
> fair.c | 124 ++++++++++++++++++++++++-----------------------------------------
> 1 file changed, 47 insertions(+), 77 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6021,11 +6021,13 @@ static inline void set_idle_cores(int cp
>
> static inline bool test_idle_cores(int cpu, bool def)
> {
> - struct sched_domain_shared *sds;
> + if (static_branch_likely(&sched_smt_present)) {
> + struct sched_domain_shared *sds;
>
> - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> - if (sds)
> - return READ_ONCE(sds->has_idle_cores);
> + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> + if (sds)
> + return READ_ONCE(sds->has_idle_cores);
> + }
>
> return def;
> }
> @@ -6059,77 +6061,39 @@ void __update_idle_core(struct rq *rq)
> rcu_read_unlock();
> }
>
> -/*
> - * Scan the entire LLC domain for idle cores; this dynamically switches off if
> - * there are no idle cores left in the system; tracked through
> - * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
> - */
> -static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
> +static int __select_idle_core(int core, struct cpumask *cpus, int *idle_cpu)
> {
> - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> - int core, cpu;
> -
> - if (!static_branch_likely(&sched_smt_present))
> - return -1;
> -
> - if (!test_idle_cores(target, false))
> - return -1;
> -
> - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> -
> - for_each_cpu_wrap(core, cpus, target) {
> - bool idle = true;
> + bool idle = true;
> + int cpu;
>
> - for_each_cpu(cpu, cpu_smt_mask(core)) {
> - if (!available_idle_cpu(cpu)) {
> - idle = false;
> - break;
> - }
> + for_each_cpu(cpu, cpu_smt_mask(core)) {
> + if (!available_idle_cpu(cpu)) {
> + idle = false;
> + continue;
By not breaking on the first non-idle cpu of the core, you largely
increase the number of loop iterations. On my system, it increases 4
times, from 28 up to 112.
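
For illustration, a minimal standalone sketch of the two loop shapes
(a toy model assuming the first sibling of every core is busy, matching
the 28-core * 4-smt example above; not the kernel code):

/* Toy model: count sibling checks when no core is fully idle.
 * Breaking at the first busy sibling checks ~1 cpu per core; continuing
 * (to be able to record an idle sibling) checks every SMT sibling of
 * every core.
 */
#include <stdbool.h>
#include <stdio.h>

#define CORES 28
#define SMT   4

/* assume the first sibling of each core is busy, the rest are idle */
static bool sibling_is_idle(int smt)
{
        return smt != 0;
}

static int scan(bool break_early, int *checks)
{
        for (int core = 0; core < CORES; core++) {
                bool idle = true;

                for (int smt = 0; smt < SMT; smt++) {
                        (*checks)++;
                        if (!sibling_is_idle(smt)) {
                                idle = false;
                                if (break_early)
                                        break;  /* old select_idle_core() shape */
                                /* new shape: keep walking the siblings */
                        }
                }
                if (idle)
                        return core;
        }
        return -1;
}

int main(void)
{
        int with_break = 0, without_break = 0;

        scan(true, &with_break);
        scan(false, &without_break);
        printf("checks with break: %d, without break: %d\n",
               with_break, without_break);
        return 0;
}

This prints 28 checks with the early break and 112 without it.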
> }
> -
> - if (idle)
> - return core;
> -
> - cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
> + if (idle_cpu)
> + *idle_cpu = cpu;
> }
>
> - /*
> - * Failed to find an idle core; stop looking for one.
> - */
> - set_idle_cores(target, 0);
> + if (idle)
> + return core;
>
> + cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
> return -1;
> }
>
> -/*
> - * Scan the local SMT mask for idle CPUs.
> - */
> -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
> -{
> - int cpu;
> -
> - if (!static_branch_likely(&sched_smt_present))
> - return -1;
> -
> - for_each_cpu(cpu, cpu_smt_mask(target)) {
> - if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> - !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> - continue;
> - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> - return cpu;
> - }
> +#else /* CONFIG_SCHED_SMT */
>
> - return -1;
> +static inline void set_idle_cores(int cpu, int val)
> +{
> }
>
> -#else /* CONFIG_SCHED_SMT */
> -
> -static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
> +static inline bool test_idle_cores(int cpu, bool def)
> {
> - return -1;
> + return def;
> }
>
> -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
> +static inline int __select_idle_core(int core, struct cpumask *cpus, int *idle_cpu)
> {
> return -1;
> }
> @@ -6144,10 +6108,11 @@ static inline int select_idle_smt(struct
> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
> {
> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> + int this = smp_processor_id();
> + bool smt = test_idle_cores(this, false);
> + int i, cpu, idle_cpu = -1, nr = INT_MAX;
> struct sched_domain *this_sd;
> u64 time;
> - int this = smp_processor_id();
> - int cpu, nr = INT_MAX;
>
> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
> if (!this_sd)
> @@ -6155,7 +6120,7 @@ static int select_idle_cpu(struct task_s
>
> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
>
> - if (sched_feat(SIS_PROP)) {
> + if (sched_feat(SIS_PROP) && !smt) {
> u64 avg_cost, avg_idle, span_avg;
>
> /*
> @@ -6175,18 +6140,31 @@ static int select_idle_cpu(struct task_s
> }
>
> for_each_cpu_wrap(cpu, cpus, target) {
> - if (!--nr)
> - return -1;
> - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> - break;
> + if (smt) {
sds->has_idle_core being set doesn't mean that there is an idle core,
so for the same real HW state (i.e. no idle core):
- if sds->has_idle_core is set, we will loop over all cpus here and
  never get a chance to return a sched_idle_cpu(),
- but if sds->has_idle_core is clear, we will loop over only a limited
  number of cpus and test sched_idle_cpu().
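
A minimal standalone toy model of that asymmetry (illustrative only:
made-up cpu counts, a single cpu running only SCHED_IDLE tasks, and a
simplified scan; not the kernel code):

/* Toy model: same HW state (no fully idle core, one cpu running only
 * SCHED_IDLE tasks), different outcome depending on a possibly stale
 * has_idle_core flag.
 */
#include <stdio.h>

#define CORES 4
#define SMT   2
#define NCPU  (CORES * SMT)

enum state { BUSY, IDLE, SCHED_IDLE };

static enum state cpu_state[NCPU];

static int scan(int has_idle_core, int nr)
{
        for (int core = 0; core < CORES; core++) {
                if (has_idle_core) {
                        /* smt path: only a fully idle core can be returned */
                        int all_idle = 1;

                        for (int smt = 0; smt < SMT; smt++)
                                if (cpu_state[core * SMT + smt] != IDLE)
                                        all_idle = 0;
                        if (all_idle)
                                return core * SMT;
                } else {
                        /* !smt path: a SCHED_IDLE cpu is acceptable too */
                        for (int smt = 0; smt < SMT; smt++) {
                                int cpu = core * SMT + smt;

                                if (!nr--)
                                        return -1;
                                if (cpu_state[cpu] != BUSY)
                                        return cpu;
                        }
                }
        }
        return -1;
}

int main(void)
{
        for (int i = 0; i < NCPU; i++)
                cpu_state[i] = BUSY;
        cpu_state[5] = SCHED_IDLE;      /* one cpu runs only SCHED_IDLE tasks */

        printf("has_idle_core set:   %d\n", scan(1, 4));  /* scans all cores, returns -1 */
        printf("has_idle_core clear: %d\n", scan(0, 8));  /* returns cpu 5 */
        return 0;
}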
> + i = __select_idle_core(cpu, cpus, &idle_cpu);
> + if ((unsigned)i < nr_cpumask_bits)
> + return i;
> +
> + } else {
> + if (!--nr)
> + return -1;
> +
> + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) {
> + idle_cpu = cpu;
> + break;
> + }
> + }
> }
>
> - if (sched_feat(SIS_PROP)) {
> + if (smt)
> + set_idle_cores(this, false);
> +
> + if (sched_feat(SIS_PROP) && !smt) {
> time = cpu_clock(this) - time;
> update_avg(&this_sd->avg_scan_cost, time);
> }
>
> - return cpu;
> + return idle_cpu;
> }
>
> /*
> @@ -6315,18 +6293,10 @@ static int select_idle_sibling(struct ta
> if (!sd)
> return target;
>
> - i = select_idle_core(p, sd, target);
> - if ((unsigned)i < nr_cpumask_bits)
> - return i;
> -
> i = select_idle_cpu(p, sd, target);
> if ((unsigned)i < nr_cpumask_bits)
> return i;
>
> - i = select_idle_smt(p, sd, target);
> - if ((unsigned)i < nr_cpumask_bits)
> - return i;
> -
> return target;
> }
>