Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

From: Vincent Guittot
Date: Wed Apr 07 2021 - 05:42:28 EST


On Wed, Apr 07, 2021 at 09:17:18AM +0200, Peter Zijlstra wrote:
> On Tue, Apr 06, 2021 at 11:26:37AM -0400, Rik van Riel wrote:
> > I would be happy to pull the static branch out of select_idle_smt()
> > and place it into this if condition, though. You are right that
> > would save some overhead on non-smt systems.
> >
> > Peter, would you prefer a follow-up patch for that or a version 4
> > of the patch?
>
> Sorry, I was side-tracked with that core scheduling crap.. Something
> like the below then?
>
> (Also fixed that stray line-wrap)
>
> ---
> Subject: sched/fair: Bring back select_idle_smt(), but differently
> From: Rik van Riel <riel@xxxxxxxxxxx>
> Date: Fri, 26 Mar 2021 15:19:32 -0400
>
> From: Rik van Riel <riel@xxxxxxxxxxx>
>
> Mel Gorman did some nice work in 9fe1f127b913 ("sched/fair: Merge
> select_idle_core/cpu()"), resulting in the kernel being more efficient
> at finding an idle CPU, and in tasks spending less time waiting to be
> run, both according to the schedstats run_delay numbers, and according
> to measured application latencies. Yay.
>
> The flip side of this is that we see more task migrations (about 30%
> more), higher cache misses, higher memory bandwidth utilization, and
> higher CPU use, for the same number of requests/second.
>
> This is most pronounced on a memcache type workload, which saw a
> consistent 1-3% increase in total CPU use on the system, due to those
> increased task migrations leading to higher L2 cache miss numbers, and
> higher memory utilization. The exclusive L3 cache on Skylake does us
> no favors there.
>
> On our web serving workload, that effect is usually negligible.
>
> It appears that the increased number of CPU migrations is generally a
> good thing, since it leads to lower cpu_delay numbers, reflecting the
> fact that tasks get to run faster. However, the reduced locality and
> the corresponding increase in L2 cache misses hurts a little.
>
> The patch below appears to fix the regression, while keeping the
> benefit of the lower cpu_delay numbers, by reintroducing
> select_idle_smt with a twist: when a socket has no idle cores, check
> to see if the sibling of "prev" is idle, before searching all the
> other CPUs.
>
> This fixes both the occasional 9% regression on the web serving
> workload, and the continuous 2% CPU use regression on the memcache
> type workload.
>
> With Mel's patches and this patch together, task migrations are still
> high, but L2 cache misses, memory bandwidth, and CPU time used are
> back down to what they were before. The p95 and p99 response times for
> the memcache type application improve by about 10% over what they were
> before Mel's patches got merged.
>
> Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> Link: https://lkml.kernel.org/r/20210326151932.2c187840@xxxxxxxxxxxxxxxxxxxx
> ---
> kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++--
> 1 file changed, 37 insertions(+), 2 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6112,6 +6112,27 @@ static int select_idle_core(struct task_
> return -1;
> }
>
> +/*
> + * Scan the local SMT mask for idle CPUs.
> + */
> +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
> +{
> + int cpu;
> +
> + if (!static_branch_likely(&sched_smt_present))
> + return -1;
> +
> + for_each_cpu(cpu, cpu_smt_mask(target)) {
> + if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> + !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> + continue;
> + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> + return cpu;
> + }
> +
> + return -1;
> +}
> +
> #else /* CONFIG_SCHED_SMT */
>
> static inline void set_idle_cores(int cpu, int val)
> @@ -6128,6 +6149,11 @@ static inline int select_idle_core(struc
> return __select_idle_cpu(core);
> }
>
> +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
> +{
> + return -1;
> +}
> +
> #endif /* CONFIG_SCHED_SMT */
>
> /*
> @@ -6135,7 +6161,7 @@ static inline int select_idle_core(struc
> * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
> * average idle time for this rq (as found in rq->avg_idle).
> */
> -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
> +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int prev, int target)
> {
> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> int i, cpu, idle_cpu = -1, nr = INT_MAX;
> @@ -6148,6 +6174,15 @@ static int select_idle_cpu(struct task_s
> if (!this_sd)
> return -1;
>
> + /* If we have SMT but there are no idle cores */
> + if (static_branch_likely(&sched_smt_present) && !smt) {
> + if (cpus_share_cache(prev, target)) {
> + i = select_idle_smt(p, sd, prev);
> + if ((unsigned int)i < nr_cpumask_bits)
> + return i;
> + }
> + }
> +
> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
>
> if (sched_feat(SIS_PROP) && !smt) {
> @@ -6321,7 +6356,7 @@ static int select_idle_sibling(struct ta
> if (!sd)
> return target;
>
> - i = select_idle_cpu(p, sd, target);
> + i = select_idle_cpu(p, sd, prev, target);
> if ((unsigned)i < nr_cpumask_bits)
> return i;

I would really prefer to keep this out of select_idle_cpu(), which aims to
merge the whole walk of sd_llc into one single loop; in the case of
select_idle_smt(), the check on prev's sibling happens outside that loop
anyway.

I would prefer the below, which also removes a number of useless, duplicated
static_branch_likely(&sched_smt_present) checks in test_idle_cores() and
select_idle_smt().
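
Schematically, the point is that the static branch is tested once in the
caller, and the helpers simply rely on it. A compilable sketch, with every
scheduler internal replaced by a stand-in of my own naming (this is not the
kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for static_branch_likely(&sched_smt_present). */
static bool smt_present = true;

/* Stand-in for test_idle_cores(): no SMT test of its own any more. */
static bool test_idle_cores_stub(void)
{
	return false;			/* pretend the LLC has no idle core */
}

/* Stand-in for select_idle_smt(): also relies on the caller's test. */
static int select_idle_smt_stub(int prev)
{
	return prev ^ 1;		/* pretend prev's 2-way SMT sibling is idle */
}

static int select_idle_sibling_stub(int prev, int target)
{
	bool smt = false;

	if (smt_present) {		/* the single remaining SMT test */
		smt = test_idle_cores_stub();
		if (!smt) {
			int i = select_idle_smt_stub(prev);
			if (i >= 0)
				return i;
		}
	}
	/* ...otherwise fall through to the select_idle_cpu() scan... */
	return target;
}

int main(void)
{
	printf("picked CPU %d\n", select_idle_sibling_stub(2, 5));
	return 0;
}

The diff below does exactly this hoisting, and passes the computed smt down
to select_idle_cpu() so it is not recomputed there.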

---
kernel/sched/fair.c | 48 +++++++++++++++++++++++++++++++++++++--------
1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6d73bdbb2d40..01d0bacedc8d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6038,11 +6038,9 @@ static inline bool test_idle_cores(int cpu, bool def)
{
struct sched_domain_shared *sds;

- if (static_branch_likely(&sched_smt_present)) {
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds)
- return READ_ONCE(sds->has_idle_cores);
- }
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ return READ_ONCE(sds->has_idle_cores);

return def;
}
@@ -6112,6 +6110,24 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
return -1;
}

+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu;
+
+ for_each_cpu(cpu, cpu_smt_mask(target)) {
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ return cpu;
+ }
+
+ return -1;
+}
+
#else /* CONFIG_SCHED_SMT */

static inline void set_idle_cores(int cpu, int val)
@@ -6128,6 +6145,11 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
return __select_idle_cpu(core);
}

+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
#endif /* CONFIG_SCHED_SMT */

/*
@@ -6135,11 +6157,10 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool smt, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
- bool smt = test_idle_cores(target, false);
int this = smp_processor_id();
struct sched_domain *this_sd;
u64 time;
@@ -6242,6 +6263,7 @@ static inline bool asym_fits_capacity(int task_util, int cpu)
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
+ bool smt = false;
struct sched_domain *sd;
unsigned long task_util;
int i, recent_used_cpu;
@@ -6317,11 +6339,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
}

+ if (static_branch_likely(&sched_smt_present)) {
+ smt = test_idle_cores(target, false);
+ if (!smt && cpus_share_cache(prev, target)) {
+ /* No idle core. Check if prev has an idle sibling. */
+ i = select_idle_smt(p, sd, prev);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ }
+ }
+
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;

- i = select_idle_cpu(p, sd, target);
+ i = select_idle_cpu(p, sd, smt, target);
if ((unsigned)i < nr_cpumask_bits)
return i;

--
2.17.1
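
For completeness, the resulting order of checks -- target, then prev, then
prev's SMT sibling when there is no idle core, then the full LLC scan -- can
be sketched as a small standalone program. All helpers here are simplified
stand-ins for the real predicates, chosen only to show the ordering:

#include <stdbool.h>
#include <stdio.h>

static bool idle[8] = { [3] = true };	/* pretend only CPU 3 is idle */

static bool cpu_is_idle(int cpu)       { return idle[cpu]; }
static bool shares_cache(int a, int b) { return a / 4 == b / 4; } /* 4 CPUs per LLC */
static bool has_idle_core(void)        { return false; }          /* no fully idle core */
static int  smt_sibling(int cpu)       { return cpu ^ 1; }        /* 2-way SMT */

static int full_llc_scan(int target)
{
	int base = (target / 4) * 4;

	for (int cpu = base; cpu < base + 4; cpu++)
		if (cpu_is_idle(cpu))
			return cpu;
	return target;
}

static int select_idle_sibling(int prev, int target)
{
	if (cpu_is_idle(target))
		return target;
	if (prev != target && shares_cache(prev, target) && cpu_is_idle(prev))
		return prev;
	/* the new step: try prev's SMT sibling before the full scan */
	if (!has_idle_core() && shares_cache(prev, target) &&
	    cpu_is_idle(smt_sibling(prev)))
		return smt_sibling(prev);
	return full_llc_scan(target);
}

int main(void)
{
	/* prev = 2, target = 0: CPU 3 (prev's sibling) wins before the scan */
	printf("picked CPU %d\n", select_idle_sibling(2, 0));
	return 0;
}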
