[PATCH v3 2/2] sched: reduce the overhead of obtaining the factor

From: Michael Wang
Date: Thu Jul 04 2013 - 00:57:00 EST


From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>

Smart wake-affine currently uses the node size as the factor, but the mask
operation needed to obtain it is expensive.

Thus, this patch introduces 'sd_llc_size', a per-cpu variable that records
the size of the highest cache-sharing domain, and uses it as the new factor,
which reduces the overhead and also makes the factor more reasonable.

We expect this to be especially beneficial on large platforms.
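
For illustration only (not part of the patch), a minimal user-space sketch
of the idea follows: count the CPUs in the LLC-sharing mask once, when the
topology is built, and cache the result per cpu, so the wakeup path only
does a cheap per-cpu read. The toy cpumask and helper names below are
illustrative stand-ins, not the kernel API.

  /* Illustrative user-space sketch, not kernel code. */
  #include <stdio.h>

  #define NR_CPUS 64

  struct cpumask { unsigned long bits; };       /* toy 64-bit cpumask */

  static int sd_llc_size[NR_CPUS];              /* cached per-cpu LLC size */

  static int cpumask_weight(const struct cpumask *m)
  {
          return __builtin_popcountl(m->bits);  /* the costly mask operation */
  }

  /* Done once per cpu when the sched domains are (re)built. */
  static void update_top_cache_domain(int cpu, const struct cpumask *llc_span)
  {
          sd_llc_size[cpu] = llc_span ? cpumask_weight(llc_span) : 1;
  }

  /* Done on every wakeup: now just a cached per-cpu read. */
  static int wake_wide_factor(int cpu)
  {
          return sd_llc_size[cpu];
  }

  int main(void)
  {
          struct cpumask llc = { .bits = 0x3f };  /* 6 cpus share the LLC */

          update_top_cache_domain(0, &llc);
          printf("factor for cpu0: %d\n", wake_wide_factor(0));
          return 0;
  }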

Test:
Tested on a 12-CPU x86 server running tip 3.10.0-rc7.

pgbench                base        smart + optimization

| db_size | clients |  tps  |      |  tps  |
+---------+---------+-------+      +-------+
| 22 MB   |       1 | 10598 |      | 10781 |
| 22 MB   |       2 | 21257 |      | 21328 |
| 22 MB   |       4 | 41386 |      | 41622 |
| 22 MB   |       8 | 51253 |      | 60351 |
| 22 MB   |      12 | 48570 |      | 54255 |
| 22 MB   |      16 | 46748 |      | 55534 |    +18.79%
| 22 MB   |      24 | 44346 |      | 55976 |    +26.23%
| 22 MB   |      32 | 43460 |      | 55279 |    +27.20%
| 7484 MB |       1 |  8951 |      |  9054 |
| 7484 MB |       2 | 19233 |      | 19252 |
| 7484 MB |       4 | 37239 |      | 37354 |
| 7484 MB |       8 | 46087 |      | 51218 |
| 7484 MB |      12 | 42054 |      | 49510 |
| 7484 MB |      16 | 40765 |      | 52151 |    +27.93%
| 7484 MB |      24 | 37651 |      | 52720 |    +40.02%
| 7484 MB |      32 | 37056 |      | 51094 |    +37.88%
| 15 GB   |       1 |  8845 |      |  9139 |
| 15 GB   |       2 | 19094 |      | 19379 |
| 15 GB   |       4 | 36979 |      | 37077 |
| 15 GB   |       8 | 46087 |      | 50490 |
| 15 GB   |      12 | 41901 |      | 48235 |
| 15 GB   |      16 | 40147 |      | 51878 |    +29.22%
| 15 GB   |      24 | 37250 |      | 52676 |    +41.41%
| 15 GB   |      32 | 36470 |      | 50198 |    +37.64%

CC: Ingo Molnar <mingo@xxxxxxxxxx>
CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
CC: Mike Galbraith <efault@xxxxxx>
Signed-off-by: Michael Wang <wangyun@xxxxxxxxxxxxxxxxxx>
---
 kernel/sched/core.c  | 7 ++++++-
 kernel/sched/fair.c  | 2 +-
 kernel/sched/sched.h | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8b3350..8fcca57 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5648,18 +5648,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * two cpus are in the same cache domain, see cpus_share_cache().
  */
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain *sd;
 	int id = cpu;
+	int size = 1;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-	if (sd)
+	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
+		size = cpumask_weight(sched_domain_span(sd));
+	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
 }

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a4ddbf5..86c4b86 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3129,7 +3129,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 static int wake_wide(struct task_struct *p)
 {
-	int factor = nr_cpus_node(cpu_to_node(smp_processor_id()));
+	int factor = this_cpu_read(sd_llc_size);
 
 	/*
 	 * Yeah, it's the switching-frequency, could means many wakee or
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce39224..3227948 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -582,6 +582,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 }
 
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 
 struct sched_group_power {
--
1.7.4.1
