Re: [PATCH v3 9/9] sched/topology: Define and use shortcut pointers for wakeup sd_flag scan
From: Vincent Guittot
Date: Thu Apr 16 2020 - 03:47:08 EST
On Wed, 15 Apr 2020 at 23:05, Valentin Schneider
<valentin.schneider@xxxxxxx> wrote:
>
> Reworking select_task_rq_fair()'s domain walk exposed that !want_affine
> wakeups only look for highest sched_domain with the required sd_flag
> set. This is something we can cache at sched domain build time to slightly
> optimize select_task_rq_fair(). Note that this isn't a "free" optimization:
> it costs us 3 pointers per CPU.
>
> Add cached per-CPU pointers for the highest domains with SD_BALANCE_WAKE,
> SD_BALANCE_EXEC and SD_BALANCE_FORK. Use them in select_task_rq_fair().
>
> Signed-off-by: Valentin Schneider <valentin.schneider@xxxxxxx>
> ---
> kernel/sched/fair.c | 25 +++++++++++++------------
> kernel/sched/sched.h | 3 +++
> kernel/sched/topology.c | 12 ++++++++++++
> 3 files changed, 28 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6f8cdb99f4a0..db4fb29a88d9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6631,17 +6631,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
> int want_affine = 0;
> int sd_flag;
>
> - switch (wake_flags & (WF_TTWU | WF_FORK | WF_EXEC)) {
> - case WF_TTWU:
> - sd_flag = SD_BALANCE_WAKE;
> - break;
> - case WF_FORK:
> - sd_flag = SD_BALANCE_FORK;
> - break;
> - default:
> - sd_flag = SD_BALANCE_EXEC;
> - }
> -
> if (wake_flags & WF_TTWU) {
> record_wakee(p);
>
> @@ -6657,7 +6646,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>
> rcu_read_lock();
>
> - sd = highest_flag_domain(cpu, sd_flag);
> + switch (wake_flags & (WF_TTWU | WF_FORK | WF_EXEC)) {
> + case WF_TTWU:
> + sd_flag = SD_BALANCE_WAKE;
> + sd = rcu_dereference(per_cpu(sd_balance_wake, cpu));
It's worth having a direct pointer for the fast path which we always
try to keep short but the other paths are already slow and will not
get any benefit of this per cpu pointer.
We should keep the loop for the slow paths
> + break;
> + case WF_FORK:
> + sd_flag = SD_BALANCE_FORK;
> + sd = rcu_dereference(per_cpu(sd_balance_fork, cpu));
> + break;
> + default:
> + sd_flag = SD_BALANCE_EXEC;
> + sd = rcu_dereference(per_cpu(sd_balance_exec, cpu));
> + }
>
> /*
> * If !want_affine, we just look for the highest domain where
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 448f5d630544..4b014103affb 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1423,6 +1423,9 @@ DECLARE_PER_CPU(int, sd_llc_size);
> DECLARE_PER_CPU(int, sd_llc_id);
> DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_balance_wake);
> +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_balance_fork);
> +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_balance_exec);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> extern struct static_key_false sched_asym_cpucapacity;
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 1d7b446fac7d..66763c539bbd 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -618,6 +618,9 @@ DEFINE_PER_CPU(int, sd_llc_size);
> DEFINE_PER_CPU(int, sd_llc_id);
> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_balance_wake);
> +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_balance_fork);
> +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_balance_exec);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> @@ -644,6 +647,15 @@ static void update_top_cache_domain(int cpu)
> sd = lowest_flag_domain(cpu, SD_NUMA);
> rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
>
> + sd = highest_flag_domain(cpu, SD_BALANCE_WAKE);
> + rcu_assign_pointer(per_cpu(sd_balance_wake, cpu), sd);
> +
> + sd = highest_flag_domain(cpu, SD_BALANCE_FORK);
> + rcu_assign_pointer(per_cpu(sd_balance_fork, cpu), sd);
> +
> + sd = highest_flag_domain(cpu, SD_BALANCE_EXEC);
> + rcu_assign_pointer(per_cpu(sd_balance_exec, cpu), sd);
> +
> sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
> rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
>
> --
> 2.24.0
>