[PATCH] Reduce rq lock contention in load_balance()
From: chenying
Date: Thu Nov 24 2022 - 04:08:10 EST
From: chenying <chenying.kernel@xxxxxxxxxxxxx>
When doing newidle load balancing, we may see lock contention on rq->lock
when multiple cpus find the same busiest rq. However, it is often the case
that after the first load balancing, the busiest rq is not the busiest
anymore, so the other cpus wait for its lock pointlessly. For this case,
we want to use trylock to reduce rq lock contention in load_balance().
We add rq->lb_lock for the load balancing path and use trylock to
acquire the busiest rq's lb_lock. If that fails, we clear the busiest
rq's cpu from load_balance_mask and goto refind to pick another rq; if
no cpu of the group is left in the mask, we goto out_balanced instead.
The test results show that this patch reduces rq lock contentions by ~35%
(25996 -> 17105) with no noticeable change in scheduling latency.
unpatched:
lock_stat version 0.4
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg
acq-bounces acquisitions holdtime-min holdtime-max holdtime-total
holdtime-avg
.............................................................................................................................................................................................................................
&rq->lock: 24906 25996
0.08 27.77 43122.87 1.66
1216316 6601547 0.05 41.59 10224267.38
1.55
---------
&rq->lock 1210
[<000000000fe88813>] scheduler_tick+0x4f/0xf0
&rq->lock 1885
[<00000000de367e3c>] _nohz_idle_balance+0x116/0x250
&rq->lock 15111 [<00000000daf6fa95>]
update_blocked_averages+0x30/0x6f0
&rq->lock 1156
[<00000000d5c71b0e>] __schedule+0xa9/0x800
---------
&rq->lock 15542
[<00000000daf6fa95>] update_blocked_averages+0x30/0x6f0
&rq->lock 733
[<000000000fe88813>] scheduler_tick+0x4f/0xf0
&rq->lock 3066
[<000000000bc2ee47>] try_to_wake_up+0x206/0x710
&rq->lock 1272
[<00000000d5c71b0e>] __schedule+0xa9/0x800
patched:
lock_stat version 0.4
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg
acq-bounces acquisitions holdtime-min holdtime-max holdtime-total
holdtime-avg
.............................................................................................................................................................................................................................
&rq->lock: 16174 17105
0.07 33.13 31154.45 1.82
1162817 6602803 0.05 64.68 10141979.28
1.54
---------
&rq->lock 11665
[<00000000ce27c902>] update_blocked_averages+0x30/0x700
&rq->lock 1457
[<00000000a6302c24>] try_to_wake_up+0x206/0x710
&rq->lock 1159
[<000000009f2bc605>] __schedule+0xa9/0x810
&rq->lock 1411
[<00000000aa0a6e31>] _nohz_idle_balance+0x116/0x250
---------
&rq->lock 3032
[<00000000a6302c24>] try_to_wake_up+0x206/0x710
&rq->lock 248
[<000000008bd7e827>] load_balance+0x571/0xe80
&rq->lock 11502
[<00000000ce27c902>] update_blocked_averages+0x30/0x700
&rq->lock 1253
[<000000009f2bc605>] __schedule+0xa9/0x810
unpatched:
# ./runqlat 60 1
usecs : count distribution
0 -> 1 : 1172 |
|
2 -> 3 : 210063 |************************
|
4 -> 7 : 337576
|****************************************|
8 -> 15 : 24555 |**
|
16 -> 31 : 13598 |*
|
32 -> 63 : 779 |
|
64 -> 127 : 230 |
|
128 -> 255 : 83 |
|
256 -> 511 : 54 |
|
512 -> 1023 : 62 |
|
1024 -> 2047 : 123 |
|
2048 -> 4095 : 283 |
|
4096 -> 8191 : 1362 |
|
8192 -> 16383 : 2775 |
|
16384 -> 32767 : 52352 |******
|
32768 -> 65535 : 14 |
|
65536 -> 131071 : 140 |
|
patched:
# ./runqlat 60 1
usecs : count distribution
0 -> 1 : 1091 |
|
2 -> 3 : 205259 |***********************
|
4 -> 7 : 351620
|****************************************|
8 -> 15 : 27812 |***
|
16 -> 31 : 13971 |*
|
32 -> 63 : 727 |
|
64 -> 127 : 198 |
|
128 -> 255 : 103 |
|
256 -> 511 : 61 |
|
512 -> 1023 : 45 |
|
1024 -> 2047 : 108 |
|
2048 -> 4095 : 271 |
|
4096 -> 8191 : 1342 |
|
8192 -> 16383 : 2732 |
|
16384 -> 32767 : 49367 |*****
|
32768 -> 65535 : 8 |
|
65536 -> 131071 : 183 |
|
test script:
#!/bin/bash
# Reproducer: run two sysbench CPU workloads in two cpuset cgroups whose
# CPU lists overlap (every CPU of test1 is also in test2).

# test1: CPUs 12,14,...,22; memory node 0.
mkdir /sys/fs/cgroup/cpuset/test1
echo 12,14,16,18,20,22 > /sys/fs/cgroup/cpuset/test1/cpuset.cpus
echo 0 > /sys/fs/cgroup/cpuset/test1/cpuset.mems

# test2: even-numbered CPUs 0..46; memory node 0.
# The redirect must stay on the same logical line as the echo, otherwise
# the CPU list is printed to stdout and cpuset.cpus is only truncated.
mkdir /sys/fs/cgroup/cpuset/test2
echo 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46 \
	> /sys/fs/cgroup/cpuset/test2/cpuset.cpus
echo 0 > /sys/fs/cgroup/cpuset/test2/cpuset.mems

# Start both workloads in the background: 24 threads in test1 and
# 96 threads in test2.
cgexec -g cpuset:test1 sysbench --test=cpu --cpu-max-prime=200000 run \
	--num-threads=24 --rate=100 --time=6000 &
cgexec -g cpuset:test2 sysbench --test=cpu --cpu-max-prime=200000 run \
	--num-threads=96 --rate=100 --time=6000 &
Suggested-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
Signed-off-by: chenying <chenying.kernel@xxxxxxxxxxxxx>
---
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 12 ++++++++++++
kernel/sched/sched.h | 1 +
3 files changed, 14 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index daff72f00385..d41f1a9c7d5f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9697,6 +9697,7 @@ void __init sched_init(void)
rq = cpu_rq(i);
raw_spin_lock_init(&rq->__lock);
+ raw_spin_lock_init(&rq->lb_lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4a0b8bd941c..d92c42671b99 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10295,6 +10295,7 @@ static int load_balance(int this_cpu, struct rq
*this_rq,
goto out_balanced;
}
+refind:
busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
@@ -10303,6 +10304,14 @@ static int load_balance(int this_cpu, struct rq
*this_rq,
WARN_ON_ONCE(busiest == env.dst_rq);
+ if (!raw_spin_trylock(&busiest->lb_lock)) {
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
+ if (cpumask_intersects(sched_group_span(group), cpus))
+ goto refind;
+
+ goto out_balanced;
+ }
+
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
@@ -10403,6 +10412,8 @@ static int load_balance(int this_cpu, struct rq
*this_rq,
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
+ raw_spin_unlock(&busiest->lb_lock);
+
__cpumask_clear_cpu(cpu_of(busiest), cpus);
/*
* Attempting to continue load balancing at the current
@@ -10420,6 +10431,7 @@ static int load_balance(int this_cpu, struct rq
*this_rq,
goto out_all_pinned;
}
}
+ raw_spin_unlock(&busiest->lb_lock);
if (!ld_moved) {
schedstat_inc(sd->lb_failed[idle]);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4a20046e586..384690bda8c3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -954,6 +954,7 @@ struct balance_callback {
struct rq {
/* runqueue lock: */
raw_spinlock_t __lock;
+ raw_spinlock_t lb_lock;
/*
* nr_running and cpu_load should be in the same cacheline because
--
2.11.0