Re: [PATCH 10/10] sched/fair: Implement an EEVDF like policy

From: Peter Zijlstra
Date: Thu Mar 09 2023 - 11:53:16 EST


On Thu, Mar 09, 2023 at 04:29:04PM +0100, Peter Zijlstra wrote:

> So if I add TICK_NSEC based sleeper bonus (/2 for gentle), then starve
> works -- this is the absolutely minimal amount required. It sucks a bit
> it's HZ dependent, but alas.

Fixes starve, sucks for schbench and hackbench :/

Clearly more thinking is required...
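
For a rough sense of scale (HZ=1000 and a 24ms sysctl_sched_latency are
assumptions for illustration, not values read off the box below), the
gentle sleeper bonus shrinks from ~12ms to ~0.5ms of credit, before the
calc_delta_fair() weight scaling the hunk below adds:

/*
 * Back-of-the-envelope comparison of the two bonus sizes; HZ and the
 * latency tunable are assumed, not measured.
 */
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		1000			/* assumed CONFIG_HZ */
#define TICK_NSEC	(NSEC_PER_SEC / HZ)

int main(void)
{
	unsigned long long sched_latency = 24000000ULL;	/* assumed sysctl_sched_latency */

	/* GENTLE_FAIR_SLEEPERS halves the credit in both cases. */
	printf("old gentle bonus: %llu ns\n", sched_latency >> 1);	/* 12000000 */
	printf("new gentle bonus: %llu ns\n", TICK_NSEC >> 1);		/*   500000 */
	return 0;
}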

root@ivb-ep:~/bench# echo NO_FAIR_SLEEPERS > /debug/sched/features
root@ivb-ep:~/bench# ./doit-schbench.sh ; ./doit-hackbench-series.sh
Latency percentiles (usec)
50.0000th: 83
75.0000th: 102
90.0000th: 109
95.0000th: 114
*99.0000th: 450
99.5000th: 723
99.9000th: 985
min=0, max=1067
1: 0.55355 +- 0.00290 seconds time elapsed ( +- 0.52% )
2: 0.79591 +- 0.00545 seconds time elapsed ( +- 0.68% )
5: 1.5804 +- 0.0102 seconds time elapsed ( +- 0.65% )
10: 2.5674 +- 0.0110 seconds time elapsed ( +- 0.43% )
20: 4.6116 +- 0.0160 seconds time elapsed ( +- 0.35% )
40: 9.5965 +- 0.0167 seconds time elapsed ( +- 0.17% )
root@ivb-ep:~/bench# time taskset -c 3 ./starve/starve 1000000
expecting to receive 1000000 signals
^C

real 0m32.999s
user 0m0.000s
sys 0m0.719s
root@ivb-ep:~/bench# echo FAIR_SLEEPERS > /debug/sched/features
root@ivb-ep:~/bench# ./doit-schbench.sh ; ./doit-hackbench-series.sh
Latency percentiles (usec)
50.0000th: 87
75.0000th: 103
90.0000th: 111
95.0000th: 116
*99.0000th: 163
99.5000th: 697
99.9000th: 1110
min=0, max=1522
1: 0.59076 +- 0.00577 seconds time elapsed ( +- 0.98% )
2: 0.86093 +- 0.00407 seconds time elapsed ( +- 0.47% )
5: 2.1018 +- 0.0129 seconds time elapsed ( +- 0.61% )
10: 3.6378 +- 0.0395 seconds time elapsed ( +- 1.09% )
20: 5.56884 +- 0.00979 seconds time elapsed ( +- 0.18% )
40: 10.8570 +- 0.0207 seconds time elapsed ( +- 0.19% )
root@ivb-ep:~/bench# time taskset -c 3 ./starve/starve 1000000
expecting to receive 1000000 signals

real 0m5.651s
user 0m0.604s
sys 0m4.047s


---

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4938,17 +4938,22 @@ place_entity(struct cfs_rq *cfs_rq, stru
 {
 	u64 vruntime = avg_vruntime(cfs_rq);
 
+	if (sched_feat(PRESERVE_LAG))
+		vruntime -= se->lag;
+
 	if (sched_feat(FAIR_SLEEPERS)) {
-		u64 sleep_time;
+//		u64 sleep_time;
 
 		/* sleeps up to a single latency don't count. */
 		if (!initial) {
-			unsigned long thresh;
+			unsigned long thresh = TICK_NSEC;
 
-			if (se_is_idle(se))
-				thresh = sysctl_sched_min_granularity;
-			else
-				thresh = sysctl_sched_latency;
+			if (!sched_feat(EEVDF)) {
+				if (se_is_idle(se))
+					thresh = sysctl_sched_min_granularity;
+				else
+					thresh = sysctl_sched_latency;
+			}
 
 			/*
 			 * Halve their sleep time's effect, to allow
@@ -4957,7 +4962,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
 			if (sched_feat(GENTLE_FAIR_SLEEPERS))
 				thresh >>= 1;
 
-			vruntime -= thresh;
+			vruntime -= calc_delta_fair(thresh, se);
 		}
 
 		/*
@@ -4966,15 +4971,12 @@ place_entity(struct cfs_rq *cfs_rq, stru
 		 * slept for a long time, don't even try to compare its vruntime with
 		 * the base as it may be too far off and the comparison may get
 		 * inversed due to s64 overflow.
-		 */
 		sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
 		if ((s64)sleep_time < 60LL * NSEC_PER_SEC)
+		 */
 			vruntime = max_vruntime(se->vruntime, vruntime);
 	}
 
-	if (sched_feat(PRESERVE_LAG))
-		vruntime -= se->lag;
-
 	se->vruntime = vruntime;
 	set_slice(cfs_rq, se);
 }
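
FWIW, with PRESERVE_LAG, FAIR_SLEEPERS, GENTLE_FAIR_SLEEPERS and EEVDF
all enabled, the above boils down to roughly the following placement
rule. Simplified user-space sketch, not the kernel code: the struct,
NICE_0_LOAD, TICK_NSEC and the calc_delta_fair() stand-in are
illustrative assumptions, and the u64 wrap handling done by
max_vruntime() is ignored.

#define NICE_0_LOAD	1024ULL
#define TICK_NSEC	1000000ULL		/* assumes HZ=1000 */

struct entity {
	unsigned long long	vruntime;
	long long		lag;		/* lag saved when the entity left the tree */
	unsigned long long	weight;		/* se->load.weight */
};

/* Stand-in for calc_delta_fair(): scale a delta by NICE_0_LOAD / weight. */
static unsigned long long scale_delta(unsigned long long delta, struct entity *se)
{
	return delta * NICE_0_LOAD / se->weight;
}

static void place(unsigned long long avg_vruntime, struct entity *se, int initial)
{
	unsigned long long vruntime = avg_vruntime;

	/* PRESERVE_LAG: start from whatever lag the entity had at dequeue. */
	vruntime -= se->lag;

	/*
	 * FAIR_SLEEPERS: fixed TICK_NSEC credit for wakeups, halved for
	 * GENTLE_FAIR_SLEEPERS and scaled by the entity's weight.
	 */
	if (!initial)
		vruntime -= scale_delta(TICK_NSEC >> 1, se);

	/*
	 * The long-sleep check is commented out above, so the clamp
	 * against the entity's old vruntime is now unconditional.
	 */
	if (se->vruntime > vruntime)
		vruntime = se->vruntime;

	se->vruntime = vruntime;
}

IOW a waking nice-0 task gets at most ~0.5ms (at HZ=1000) of vruntime
credit relative to the queue average, on top of whatever lag it left
with.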