[PATCH] sched/cputime: make scale_stime() more precise

From: Oleg Nesterov
Date: Thu Jul 18 2019 - 09:18:38 EST


People report that utime and stime from /proc/<pid>/stat become very wrong
when the numbers are big enough. In particular, the monitored application
can run all the time in user-space but only stime grows.

This is because scale_stime() is very inaccurate. It tries to minimize the
relative error, but the absolute error can be huge.

Andrew wrote the test-case:

int main(int argc, char **argv)
{
struct task_cputime c;
struct prev_cputime p;
u64 st, pst, cst;
u64 ut, put, cut;
u64 x;
int i = -1; // one step not printed

if (argc != 2)
{
printf("usage: %s <start_in_seconds>\n", argv[0]);
return 1;
}
x = strtoull(argv[1], NULL, 0) * SEC;
printf("start=%lld\n", x);

p.stime = 0;
p.utime = 0;

while (i++ < NSTEPS)
{
x += STEP;
c.stime = x;
c.utime = x;
c.sum_exec_runtime = x + x;
pst = cputime_to_clock_t(p.stime);
put = cputime_to_clock_t(p.utime);
cputime_adjust(&c, &p, &ut, &st);
cst = cputime_to_clock_t(st);
cut = cputime_to_clock_t(ut);
if (i)
printf("ut(diff)/st(diff): %20lld (%4lld) %20lld (%4lld)\n",
cut, cut - put, cst, cst - pst);
}
}

For example,

$ ./stime 300000
start=300000000000000
ut(diff)/st(diff): 299994875 ( 0) 300009124 (2000)
ut(diff)/st(diff): 299994875 ( 0) 300011124 (2000)
ut(diff)/st(diff): 299994875 ( 0) 300013124 (2000)
ut(diff)/st(diff): 299994875 ( 0) 300015124 (2000)
ut(diff)/st(diff): 299994875 ( 0) 300017124 (2000)
ut(diff)/st(diff): 299994875 ( 0) 300019124 (2000)
ut(diff)/st(diff): 299994875 ( 0) 300021124 (2000)
ut(diff)/st(diff): 299994875 ( 0) 300023124 (2000)
ut(diff)/st(diff): 299994875 ( 0) 300025124 (2000)
ut(diff)/st(diff): 299994875 ( 0) 300027124 (2000)
ut(diff)/st(diff): 299994875 ( 0) 300029124 (2000)
ut(diff)/st(diff): 299996875 (2000) 300029124 ( 0)
ut(diff)/st(diff): 299998875 (2000) 300029124 ( 0)
ut(diff)/st(diff): 300000875 (2000) 300029124 ( 0)
ut(diff)/st(diff): 300002875 (2000) 300029124 ( 0)
ut(diff)/st(diff): 300004875 (2000) 300029124 ( 0)
ut(diff)/st(diff): 300006875 (2000) 300029124 ( 0)
ut(diff)/st(diff): 300008875 (2000) 300029124 ( 0)
ut(diff)/st(diff): 300010875 (2000) 300029124 ( 0)
ut(diff)/st(diff): 300012055 (1180) 300029944 ( 820)
ut(diff)/st(diff): 300012055 ( 0) 300031944 (2000)
ut(diff)/st(diff): 300012055 ( 0) 300033944 (2000)
ut(diff)/st(diff): 300012055 ( 0) 300035944 (2000)
ut(diff)/st(diff): 300012055 ( 0) 300037944 (2000)

shows the problem even when sum_exec_runtime is not that big: 300000 secs.

The new implementation of scale_stime() does the additional div64_u64_rem()
in a loop but see the comment, as long it is used by cputime_adjust() this
can happen only once.

Reported-by: Andrew Fox <afox@xxxxxxxxxx>
Signed-off-by: Oleg Nesterov <oleg@xxxxxxxxxx>
---
kernel/sched/cputime.c | 66 ++++++++++++++++++++++++++++----------------------
1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2305ce8..ad055a3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -525,47 +525,55 @@ void account_idle_ticks(unsigned long ticks)
}

/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
+ * Perform (stime * rtime) / total, but avoid multiplication overflow
+ * by losing precision when the numbers are big.
+ *
+ * NOTE! currently the only user is cputime_adjust() and thus
+ *
+ * stime < total && rtime > total
+ *
+ * this means that the end result is always precise and the additional
+ * div64_u64_rem() inside the main loop is called at most once.
*/
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
- u64 scaled;
+ u64 res = 0, div, rem;

- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
+ /* can stime * rtime overflow ? */
+ while (ilog2(stime) + ilog2(rtime) > 62) {
if (stime > rtime)
swap(rtime, stime);

- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
-
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
+ if (rtime >= total) {
+ /*
+ * (rtime * stime) / total is equal to
+ *
+ * (rtime / total) * stime +
+ * (rtime % total) * stime / total
+ *
+ * if nothing overflows. Can the 1st multiplication
+ * overflow? Yes, but we do not care: this can only
+ * happen if the end result can't fit in u64 anyway.
+ *
+ * So the code below does
+ *
+ * res += (rtime / total) * stime;
+ * rtime = rtime % total;
+ */
+ div = div64_u64_rem(rtime, total, &rem);
+ res += div * stime;
+ rtime = rem;
+ continue;
+ }

-drop_precision:
- /* We drop from rtime, it has more bits than stime */
+ /* drop precision */
rtime >>= 1;
total >>= 1;
+ if (!total)
+ return res;
}

- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return scaled;
+ return res + div64_u64(stime * rtime, total);
}

/*
--
2.5.0