[PATCH] sched/cputime: Make cputime_adjust() more accurate

From: Ma Xing
Date: Mon Mar 27 2023 - 22:49:07 EST


In the current algorithm of cputime_adjust(), the accumulated stime and
utime are used to divide the accumulated rtime. When the value is very
large, it is easy for the stime or utime not to be updated. It can cause
sys or user utilization to be zero for long time.

A better and intuitive way is to save the last stime and utime, and
divide the rtime increment proportionally according to the tick
increment.

I wrote the test-case: refer to
https://lore.kernel.org/lkml/20190718131834.GA22211@xxxxxxxxxx/

int main(int argc, char ** argv) {
struct task_cputime c;
struct prev_cputime p;
u64 st, pst, cst;
u64 ut, put, cut;
u64 x, y;
int i = -1; // one step not printed

if (argc != 2) {
printf("usage: %s <start_in_seconds>\n", argv[0]);
return 1;
}
x = strtoull(argv[1], NULL, 0) * SEC;
printf("start=%lld\n", x);

p.stime = p.stick = x;
p.utime = p.utick = x;

c.stime = p.stime;
c.utime = p.utime;
c.sum_exec_runtime = p.stime + p.utime;

while (i++ < NSTEPS) {
y = STEP;
c.stime += 4*y;
c.utime += y;
c.sum_exec_runtime += y;
pst = nsec_to_clock_t(p.stime);
put = nsec_to_clock_t(p.utime);
cputime_adjust( & c, & p, & ut, & st);
cst = nsec_to_clock_t(st);
cut = nsec_to_clock_t(ut);
if (i)
printf("ut(diff)/st(diff): %20lld (%4lld) %20lld (%4lld)\n",
cut, cut - put, cst, cst - pst);
}
printf("\n\n");

while (i++ < 2*NSTEPS) {
y = STEP;
c.stime += y;
c.utime += 4*y;
c.sum_exec_runtime += y;
pst = nsec_to_clock_t(p.stime);
put = nsec_to_clock_t(p.utime);
cputime_adjust( & c, & p, & ut, & st);
cst = nsec_to_clock_t(st);
cut = nsec_to_clock_t(ut);
if (i)
printf("ut(diff)/st(diff): %20lld (%4lld) %20lld (%4lld)\n",
cut, cut - put, cst, cst - pst);
}
}

For example, the new patch works well when cfs based rtime disagrees
with tick based stime/utime, the root reason is it converges fast:

# ./a.out 300000
start=300000000000000
ut(diff)/st(diff): 300000400 ( 200) 300001600 ( 800)
ut(diff)/st(diff): 300000600 ( 200) 300002400 ( 800)
ut(diff)/st(diff): 300000800 ( 200) 300003200 ( 800)
ut(diff)/st(diff): 300001000 ( 200) 300004000 ( 800)
ut(diff)/st(diff): 300001200 ( 200) 300004800 ( 800)
ut(diff)/st(diff): 300001400 ( 200) 300005600 ( 800)
ut(diff)/st(diff): 300001600 ( 200) 300006400 ( 800)
ut(diff)/st(diff): 300001800 ( 200) 300007200 ( 800)
ut(diff)/st(diff): 300002000 ( 200) 300008000 ( 800)
ut(diff)/st(diff): 300002200 ( 200) 300008800 ( 800)
ut(diff)/st(diff): 300002400 ( 200) 300009600 ( 800)
ut(diff)/st(diff): 300002600 ( 200) 300010400 ( 800)
ut(diff)/st(diff): 300002800 ( 200) 300011200 ( 800)
ut(diff)/st(diff): 300003000 ( 200) 300012000 ( 800)
ut(diff)/st(diff): 300003200 ( 200) 300012800 ( 800)
ut(diff)/st(diff): 300003400 ( 200) 300013600 ( 800)
ut(diff)/st(diff): 300003600 ( 200) 300014400 ( 800)
ut(diff)/st(diff): 300003800 ( 200) 300015200 ( 800)
ut(diff)/st(diff): 300004000 ( 200) 300016000 ( 800)
ut(diff)/st(diff): 300004200 ( 200) 300016800 ( 800)

ut(diff)/st(diff): 300005000 ( 800) 300017000 ( 200)
ut(diff)/st(diff): 300005800 ( 800) 300017200 ( 200)
ut(diff)/st(diff): 300006600 ( 800) 300017400 ( 200)
ut(diff)/st(diff): 300007400 ( 800) 300017600 ( 200)
ut(diff)/st(diff): 300008200 ( 800) 300017800 ( 200)
ut(diff)/st(diff): 300009000 ( 800) 300018000 ( 200)
ut(diff)/st(diff): 300009800 ( 800) 300018200 ( 200)
ut(diff)/st(diff): 300010600 ( 800) 300018400 ( 200)
ut(diff)/st(diff): 300011400 ( 800) 300018600 ( 200)
ut(diff)/st(diff): 300012200 ( 800) 300018800 ( 200)
ut(diff)/st(diff): 300013000 ( 800) 300019000 ( 200)
ut(diff)/st(diff): 300013800 ( 800) 300019200 ( 200)
ut(diff)/st(diff): 300014600 ( 800) 300019400 ( 200)
ut(diff)/st(diff): 300015400 ( 800) 300019600 ( 200)
ut(diff)/st(diff): 300016200 ( 800) 300019800 ( 200)
ut(diff)/st(diff): 300017000 ( 800) 300020000 ( 200)
ut(diff)/st(diff): 300017800 ( 800) 300020200 ( 200)
ut(diff)/st(diff): 300018600 ( 800) 300020400 ( 200)
ut(diff)/st(diff): 300019400 ( 800) 300020600 ( 200)

while the old cputime_adjust has the following problem, when sum_exec_runtime is 300000 secs.

# ./a.out 300000
start=300000000000000
ut(diff)/st(diff): 300000000 ( 0) 300002000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300003000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300004000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300005000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300006000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300007000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300008000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300009000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300010000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300011000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300012000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300013000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300014000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300015000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300016000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300017000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300018000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300019000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300020000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300021000 (1000)

ut(diff)/st(diff): 300000000 ( 0) 300022000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300023000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300024000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300025000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300026000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300027000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300028000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300029000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300030000 (1000)
ut(diff)/st(diff): 300000000 ( 0) 300031000 (1000)
ut(diff)/st(diff): 300001000 (1000) 300031000 ( 0)
ut(diff)/st(diff): 300002000 (1000) 300031000 ( 0)
ut(diff)/st(diff): 300003000 (1000) 300031000 ( 0)
ut(diff)/st(diff): 300004000 (1000) 300031000 ( 0)
ut(diff)/st(diff): 300005000 (1000) 300031000 ( 0)
ut(diff)/st(diff): 300006000 (1000) 300031000 ( 0)
ut(diff)/st(diff): 300007000 (1000) 300031000 ( 0)
ut(diff)/st(diff): 300008000 (1000) 300031000 ( 0)
ut(diff)/st(diff): 300009000 (1000) 300031000 ( 0)

In addition, this patch has been running stably for 2 months and no problems have been found.

Signed-off-by: Ma Xing <maxing.lan@xxxxxxxxxxxxx>
---
include/linux/sched.h | 2 ++
include/linux/sched/cputime.h | 1 +
kernel/sched/cputime.c | 38 +++++++++++++++++++++++++----------
3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6d654eb4cabd..e1bac4ee48ba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -326,6 +326,8 @@ struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
u64 utime;
u64 stime;
+ u64 utick;
+ u64 stick;
raw_spinlock_t lock;
#endif
};
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 5f8fd5b24a2e..855503bbd067 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -173,6 +173,7 @@ static inline void prev_cputime_init(struct prev_cputime *prev)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
prev->utime = prev->stime = 0;
+ prev->utick = prev->stick = 0;
raw_spin_lock_init(&prev->lock);
#endif
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index af7952f12e6c..ee8084957578 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -559,6 +559,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
u64 *ut, u64 *st)
{
u64 rtime, stime, utime;
+ s64 delta_rtime, delta_stime, delta_utime;
unsigned long flags;

/* Serialize concurrent callers such that we can honour our guarantees */
@@ -579,22 +580,36 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
stime = curr->stime;
utime = curr->utime;

+
+ delta_rtime = rtime - prev->stime - prev->utime;
+ delta_stime = stime - prev->stick;
+ delta_utime = utime - prev->utick;
+
+ prev->stick = stime;
+ prev->utick = utime;
+
/*
* If either stime or utime are 0, assume all runtime is userspace.
* Once a task gets some ticks, the monotonicity code at 'update:'
* will ensure things converge to the observed ratio.
*/
if (stime == 0) {
- utime = rtime;
+ delta_utime = delta_rtime;
goto update;
}

if (utime == 0) {
- stime = rtime;
+ delta_stime = delta_rtime;
goto update;
}

- stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
+ if (delta_stime <= 0)
+ goto update;
+
+ if (delta_utime <= 0)
+ goto update;
+
+ delta_stime = mul_u64_u64_div_u64(delta_stime, delta_rtime, delta_stime + delta_utime);

update:
/*
@@ -606,21 +621,22 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
* = (rtime_i+1 - rtime_i) + utime_i
* >= utime_i
*/
- if (stime < prev->stime)
- stime = prev->stime;
- utime = rtime - stime;
+ if (delta_stime <= 0)
+ delta_stime = 0;
+ delta_utime = delta_rtime - delta_stime;
+

/*
* Make sure utime doesn't go backwards; this still preserves
* monotonicity for stime, analogous argument to above.
*/
- if (utime < prev->utime) {
- utime = prev->utime;
- stime = rtime - utime;
+ if (delta_utime <= 0) {
+ delta_utime = 0;
+ delta_stime = delta_rtime;
}

- prev->stime = stime;
- prev->utime = utime;
+ prev->stime += delta_stime;
+ prev->utime += delta_utime;
out:
*ut = prev->utime;
*st = prev->stime;
--
2.20.1