[RESEND PATCH] sched/fair: consider RT/IRQ pressure in select_idle_sibling

From: Rohit Jain
Date: Mon Jan 29 2018 - 18:23:04 EST


Currently, the fast path in the scheduler looks for an idle CPU to schedule
threads on. Capacity is taken into account in 'select_task_rq_fair' when it
calls 'wake_cap'; however, that check looks only at the original capacity
and ignores the instantaneous capacity. Furthermore, the select_idle_sibling
path ignores RT/IRQ threads that are already running on the CPUs it is
looking to schedule fair threads on.

We don't necessarily have to force the code into the slow path (by
modifying wake_cap); instead, we can make a better selection of the CPU
within the current domain itself (i.e. in the fast path).

This patch makes the fast path aware of capacity, resulting in overall
performance improvements as shown in the test results.
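
The core of the change is a preference order among idle CPUs: an idle CPU
counts as having "full capacity" when capacity_of() is at least 3/4 of
capacity_orig_of(), i.e. it is not losing much capacity to RT/IRQ work. As a
condensed sketch of that policy (the helper name pick_idle_cpu() is made up
purely for illustration; the real changes are spread across
select_idle_core(), select_idle_smt(), select_idle_cpu() and
select_idle_sibling() in the diff below):

/*
 * Illustrative only: prefer an idle CPU that still has full capacity;
 * otherwise fall back to the idle CPU with the most capacity left after
 * RT/IRQ pressure.
 */
static int pick_idle_cpu(struct task_struct *p, const struct cpumask *mask)
{
	int cpu, best_cpu = -1;
	unsigned long best_cap = 0;

	for_each_cpu(cpu, mask) {
		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
			continue;
		if (!idle_cpu(cpu))
			continue;
		if (full_capacity(cpu))
			return cpu;		/* idle and unpressured: stop scanning */
		if (capacity_of(cpu) > best_cap) {
			best_cap = capacity_of(cpu);
			best_cpu = cpu;
		}
	}

	return best_cpu;			/* -1 if no idle CPU was found */
}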

1) KVM Test:
-----------------------------------------------------------------------
In this test, KVM is configured with an Ubuntu VM (unchanged kernel, Ubuntu
Server 16.04) which runs a ping workload in one taskset along with hackbench
in a separate taskset. The VM is a virtio VM (which means the host is taking
and processing the interrupts). In this case, we want to avoid scheduling
vCPUs on CPUs which are very busy processing interrupts if there is a better
choice available.

The host is a 2-socket, 20-core, 40-CPU Intel x86 machine.

lscpu output:
CPU(s): 40
On-line CPU(s) list: 0-39
Thread(s) per core: 2
Core(s) per socket: 10
Socket(s): 2
NUMA node(s): 2
NUMA node0 CPUs: 0-9,20-29
NUMA node1 CPUs: 10-19,30-39

The setup is meant to mirror realistic use cases: KVM is bound to a full
NUMA node, with vCPUs bound to whole cores, i.e. vCPUs 0 and 1 are bound to
core 0, vCPUs 2 and 3 to core 1, and so on.

virsh vcpupin output:
VCPU: CPU Affinity
----------------------------------
0: 0,20
1: 0,20
2: 1,21
3: 1,21
4: 2,22
5: 2,22
6: 3,23
7: 3,23
8: 4,24
9: 4,24
10: 5,25
11: 5,25
12: 6,26
13: 6,26
14: 7,27
15: 7,27
16: 8,28
17: 8,28
18: 9,29
19: 9,29

Here are the results with ping and hackbench running inside the KVM guest.
Note: hackbench was run for 10000 loops (lower is better).
Improvement is shown in brackets (+ve is good, -ve is bad).
+-------------------+-----------------+---------------------------+
|                   | Without patch   | With patch                |
+---+-------+-------+-------+---------+-----------------+---------+
|FD | Groups| Tasks | Mean  | Std Dev | Mean            | Std Dev |
+---+-------+-------+-------+---------+-----------------+---------+
|4  | 1     | 4     | 0.059 | 0.021   | 0.034 (+42.37%) | 0.008   |
|4  | 2     | 8     | 0.087 | 0.031   | 0.075 (+13.79%) | 0.021   |
|4  | 4     | 16    | 0.124 | 0.022   | 0.089 (+28.23%) | 0.013   |
|4  | 8     | 32    | 0.149 | 0.031   | 0.126 (+15.43%) | 0.022   |
+---+-------+-------+-------+---------+-----------------+---------+
|8  | 1     | 8     | 0.212 | 0.025   | 0.211 (+0.47%)  | 0.023   |
|8  | 2     | 16    | 0.246 | 0.055   | 0.225 (+8.54%)  | 0.024   |
|8  | 4     | 32    | 0.298 | 0.047   | 0.294 (+1.34%)  | 0.022   |
|8  | 8     | 64    | 0.407 | 0.03    | 0.378 (+7.13%)  | 0.032   |
+---+-------+-------+-------+---------+-----------------+---------+
|40 | 1     | 40    | 1.703 | 0.133   | 1.451 (+14.80%) | 0.072   |
|40 | 2     | 80    | 2.912 | 0.204   | 2.431 (+16.52%) | 0.075   |
+---+-------+-------+-------+---------+-----------------+---------+

2) ping + hackbench test on x86 machine:
-----------------------------------------------------------------------

Here hackbench is running in threaded mode, along with ping running on
CPUs 0 and 1 as:
'ping -l 10000 -q -s 10 -f hostX'

This test was run on a 2-socket, 20-core, 40-thread Intel x86 machine;
runtime is in seconds (lower is better).

+-----------------------------+----------------+---------------------------+
|                             | Without patch  | With patch                |
+----------+----+------+------+-------+--------+----------------+----------+
|Loops     | FD |Groups|Tasks | Mean  | Std Dev|Mean            | Std Dev  |
+----------+----+------+------+-------+--------+----------------+----------+
|1,000,000 | 4  |1     |4     | 2.375 | 0.818  |1.785 (+24.84%) | 0.21     |
|1,000,000 | 4  |2     |8     | 2.748 | 0.694  |2.102 (+23.51%) | 0.239    |
|1,000,000 | 4  |4     |16    | 3.237 | 0.256  |2.922 (+9.73%)  | 0.169    |
|1,000,000 | 4  |8     |32    | 3.786 | 0.185  |3.637 (+3.94%)  | 0.471    |
+----------+----+------+------+-------+--------+----------------+----------+
|1,000,000 | 8  |1     |8     | 7.287 | 1.03   |5.364 (+26.39%) | 1.085    |
|1,000,000 | 8  |2     |16    | 7.963 | 0.951  |6.474 (+18.70%) | 0.397    |
|1,000,000 | 8  |4     |32    | 8.991 | 0.618  |8.32 (+7.46%)   | 0.69     |
|1,000,000 | 8  |8     |64    | 13.868| 1.195  |12.881 (+7.12%) | 0.722    |
+----------+----+------+------+-------+--------+----------------+----------+
|10,000    | 40 |1     |40    | 0.828 | 0.032  |0.784 (+5.31%)  | 0.010    |
|10,000    | 40 |2     |80    | 1.087 | 0.246  |0.980 (+9.84%)  | 0.037    |
|10,000    | 40 |4     |160   | 1.611 | 0.055  |1.591 (+1.24%)  | 0.039    |
|10,000    | 40 |8     |320   | 2.827 | 0.031  |2.817 (+0.35%)  | 0.025    |
|10,000    | 40 |16    |640   | 5.107 | 0.085  |5.087 (+0.39%)  | 0.045    |
|10,000    | 40 |25    |1000  | 7.503 | 0.143  |7.468 (+0.46%)  | 0.045    |
+----------+----+------+------+-------+--------+----------------+----------+

Sanity tests:
-----------------------------------------------------------------------
schbench results on a 2-socket, 44-core, 88-thread Intel x86 machine, with
2 message threads (lower is better):

+----------+-------------+----------+------------------+
|Threads   | Latency     | Without  | With Patch       |
|          | percentiles | Patch    |                  |
|          |             | (usec)   | (usec)           |
+----------+-------------+----------+------------------+
|16        | 50.0000th   | 60       | 62 (-3.33%)      |
|16        | 75.0000th   | 72       | 68 (+5.56%)      |
|16        | 90.0000th   | 81       | 80 (+2.46%)      |
|16        | 95.0000th   | 88       | 83 (+5.68%)      |
|16        | *99.0000th  | 100      | 92 (+8.00%)      |
|16        | 99.5000th   | 105      | 97 (+7.62%)      |
|16        | 99.9000th   | 110      | 100 (+9.09%)     |
+----------+-------------+----------+------------------+
|32        | 50.0000th   | 62       | 62 (0%)          |
|32        | 75.0000th   | 80       | 81 (0%)          |
|32        | 90.0000th   | 93       | 94 (-1.07%)      |
|32        | 95.0000th   | 103      | 105 (-1.94%)     |
|32        | *99.0000th  | 121      | 121 (0%)         |
|32        | 99.5000th   | 127      | 125 (+1.57%)     |
|32        | 99.9000th   | 143      | 135 (+5.59%)     |
+----------+-------------+----------+------------------+
|44        | 50.0000th   | 79       | 79 (0%)          |
|44        | 75.0000th   | 104      | 104 (0%)         |
|44        | 90.0000th   | 126      | 125 (+0.79%)     |
|44        | 95.0000th   | 138      | 137 (+0.72%)     |
|44        | *99.0000th  | 163      | 163 (0%)         |
|44        | 99.5000th   | 174      | 171 (+1.72%)     |
|44        | 99.9000th   | 10832    | 11248 (-3.84%)   |
+----------+-------------+----------+------------------+

I also ran uperf and sysbench MySQL workloads, but saw no statistically
significant change.

Signed-off-by: Rohit Jain <rohit.k.jain@xxxxxxxxxx>
---
kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++----------
1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 26a71eb..ce5ccf8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5625,6 +5625,11 @@ static unsigned long capacity_orig_of(int cpu)
return cpu_rq(cpu)->cpu_capacity_orig;
}

+static inline bool full_capacity(int cpu)
+{
+ return capacity_of(cpu) >= (capacity_orig_of(cpu)*3)/4;
+}
+
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -6081,7 +6086,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int

for_each_cpu(cpu, cpu_smt_mask(core)) {
cpumask_clear_cpu(cpu, cpus);
- if (!idle_cpu(cpu))
+ if (!idle_cpu(cpu) || !full_capacity(cpu))
idle = false;
}

@@ -6102,7 +6107,8 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
*/
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
- int cpu;
+ int cpu, rcpu = -1;
+ unsigned long max_cap = 0;

if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -6110,11 +6116,13 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
- if (idle_cpu(cpu))
- return cpu;
+ if (idle_cpu(cpu) && (capacity_of(cpu) > max_cap)) {
+ max_cap = capacity_of(cpu);
+ rcpu = cpu;
+ }
}

- return -1;
+ return rcpu;
}

#else /* CONFIG_SCHED_SMT */
@@ -6143,6 +6151,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 time, cost;
s64 delta;
int cpu, nr = INT_MAX;
+ int best_cpu = -1;
+ unsigned int best_cap = 0;

this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6173,8 +6183,15 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
return -1;
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
- if (idle_cpu(cpu))
- break;
+ if (idle_cpu(cpu)) {
+ if (full_capacity(cpu)) {
+ best_cpu = cpu;
+ break;
+ } else if (capacity_of(cpu) > best_cap) {
+ best_cap = capacity_of(cpu);
+ best_cpu = cpu;
+ }
+ }
}

time = local_clock() - time;
@@ -6182,7 +6199,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
delta = (s64)(time - cost) / 8;
this_sd->avg_scan_cost += delta;

- return cpu;
+ return best_cpu;
}

/*
@@ -6193,13 +6210,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i;

- if (idle_cpu(target))
+ if (idle_cpu(target) && full_capacity(target))
return target;

/*
* If the previous cpu is cache affine and idle, don't be stupid.
*/
- if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev) &&
+ full_capacity(prev))
return prev;

sd = rcu_dereference(per_cpu(sd_llc, target));
--
2.7.4