Re: schbench v1.0

From: Peter Zijlstra
Date: Thu Apr 20 2023 - 11:06:00 EST


On Mon, Apr 17, 2023 at 10:10:25AM +0200, Chris Mason wrote:

> F128 N10 EEVDF Linus
> Wakeup (usec): 99.0th: 755 1,266
> Request (usec): 99.0th: 25,632 22,304
> RPS (count): 50.0th: 4,280 4,376
>
> F128 N10 no-locking EEVDF Linus
> Wakeup (usec): 99.0th: 823 1,118
> Request (usec): 99.0th: 17,184 14,192
> RPS (count): 50.0th: 4,440 4,456

With the below fixlet (against queue/sched/eevdf) on my measly IVB-EP
(2*10*2):

./schbench -F128 -n10 -C

Request Latencies percentiles (usec) runtime 30 (s) (153800 total samples)
90.0th: 6376 (35699 samples)
* 99.0th: 6440 (9055 samples)
99.9th: 7048 (1345 samples)

CFS

schbench -m2 -F128 -n10 -r90 OTHER BATCH
Wakeup (usec): 99.0th: 6600 6328
Request (usec): 99.0th: 35904 14640
RPS (count): 50.0th: 5368 6104

EEVDF base_slice = 3000[us] (default)

schbench -m2 -F128 -n10 -r90 OTHER BATCH
Wakeup (usec): 99.0th: 3820 6968
Request (usec): 99.0th: 30496 24608
RPS (count): 50.0th: 3836 5496

EEVDF base_slice = 6440[us] (per the calibrate run)

schbench -m2 -F128 -n10 -r90 OTHER BATCH
Wakeup (usec): 99.0th: 9136 6232
Request (usec): 99.0th: 21984 12944
RPS (count): 50.0th: 4968 6184


With base_slice >= request and BATCH (which disables wakeup preemption), the
EEVDF thing should turn into a FIFO queue, which is close to ideal for
your workload.

For giggles:

echo 6440000 > /debug/sched/base_slice_ns
echo NO_PLACE_LAG > /debug/sched/features
chrt -b 0 ./schbench -m2 -F128 -n10 -r90

gets me:

Wakeup Latencies percentiles (usec) runtime 90 (s) (526553 total samples)
50.0th: 2084 (158080 samples)
90.0th: 5320 (210675 samples)
* 99.0th: 6232 (47643 samples)
99.9th: 6648 (4297 samples)
min=1, max=13105
Request Latencies percentiles (usec) runtime 90 (s) (526673 total samples)
50.0th: 7544 (157171 samples)
90.0th: 10992 (210461 samples)
* 99.0th: 12944 (48069 samples)
99.9th: 15088 (3716 samples)
min=3841, max=32882
RPS percentiles (requests) runtime 90 (s) (9 total samples)
20.0th: 6184 (9 samples)
* 50.0th: 6184 (0 samples)
90.0th: 6184 (0 samples)
min=6173, max=6180
average rps: 6195.77

FWIW, your RPS stats are broken; note how all the bucket values are above the
reported max, and so is the average.

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 050e98c97ba3..931102b00786 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1071,6 +1071,8 @@ void set_latency_fair(struct sched_entity *se, int prio)
se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
}

+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
/*
* XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
* this is probably good enough.
@@ -1084,6 +1086,14 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
* EEVDF: vd_i = ve_i + r_i / w_i
*/
se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
+
+ /*
+ * The task has consumed its request, reschedule.
+ */
+ if (cfs_rq->nr_running > 1) {
+ resched_curr(rq_of(cfs_rq));
+ clear_buddies(cfs_rq, se);
+ }
}

#include "pelt.h"
@@ -3636,6 +3646,13 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
* we need to scale se->vlag when w_i changes.
*/
se->vlag = div_s64(se->vlag * old_weight, weight);
+ } else {
+ /*
+ * When the weight changes the virtual time slope changes and
+ * we should adjust the virtual deadline. For now, punt and
+ * simply reset.
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
}

#ifdef CONFIG_SMP
@@ -5225,22 +5256,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_idle_cfs_rq_clock_pelt(cfs_rq);
}

-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- if (pick_eevdf(cfs_rq) != curr) {
- resched_curr(rq_of(cfs_rq));
- /*
- * The current task ran long enough, ensure it doesn't get
- * re-elected due to buddy favours.
- */
- clear_buddies(cfs_rq, curr);
- }
-}
-
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -5353,9 +5384,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
-
- if (cfs_rq->nr_running > 1)
- check_preempt_tick(cfs_rq, curr);
}