[PATCH] sched/cputime: Make cputime_adjust() more accurate

From: Ma Xing
Date: Sun Dec 25 2022 - 22:10:33 EST


In the current cputime_adjust() algorithm, the accumulated rtime is
split according to the ratio of the accumulated stime and utime tick
counts. When these accumulated values are very large, it is easy for
the reported stime or utime not to advance at all, because a few new
ticks barely change the overall ratio and the result is then clamped
for monotonicity.
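
For illustration, with hypothetical numbers (not taken from a real
trace): suppose the tick counters currently read stime = 101 and
utime = 1000000, rtime = 1100010, and the previously reported values
are prev->stime = 1000 and prev->utime = 1099000 (prev->stime was
pushed above its proportional share earlier by the monotonicity
clamp). The current code computes

    stime = rtime * stime / (stime + utime)
          = 1100010 * 101 / 1000101
         ~= 111

which is below prev->stime, so it is clamped back to 1000 and the
whole rtime increment is accounted to utime, even though the task has
just received a system tick; stime stays frozen until the accumulated
ratio catches up.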

A better and more intuitive way is to remember the stime and utime
tick counts from the last call, and split only the rtime increment,
proportionally to the increments of those tick counts.
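
Expressed as a formula, using the prev->stick/prev->utick fields added
below and the delta_* naming from the new code:

    delta_rtime = rtime - prev->stime - prev->utime
    delta_stime = delta_rtime * (stime - prev->stick) /
                  ((stime - prev->stick) + (utime - prev->utick))
    delta_utime = delta_rtime - delta_stime

so only the new runtime gets distributed, according to how the tick
counters moved since the previous call, and the accumulated history
can no longer swamp a fresh tick.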

Signed-off-by: Ma Xing <maxing.lan@xxxxxxxxxxxxx>
---
include/linux/sched.h | 2 ++
include/linux/sched/cputime.h | 1 +
kernel/sched/cputime.c | 38 +++++++++++++++++++++++++----------
3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 853d08f7562b..41b69ea8b717 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -325,6 +325,8 @@ struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
u64 utime;
u64 stime;
+ u64 utick;
+ u64 stick;
raw_spinlock_t lock;
#endif
};
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index ce3c58286062..db5c32d4a467 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -182,6 +182,7 @@ static inline void prev_cputime_init(struct prev_cputime *prev)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
prev->utime = prev->stime = 0;
+ prev->utick = prev->stick = 0;
raw_spin_lock_init(&prev->lock);
#endif
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 95fc77853743..d94e9a306478 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,6 +555,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
u64 *ut, u64 *st)
{
u64 rtime, stime, utime;
+ s64 delta_rtime, delta_stime, delta_utime;
unsigned long flags;

/* Serialize concurrent callers such that we can honour our guarantees */
@@ -575,22 +576,36 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
stime = curr->stime;
utime = curr->utime;

+
+ delta_rtime = rtime - prev->stime - prev->utime;
+ delta_stime = stime - prev->stick;
+ delta_utime = utime - prev->utick;
+
+ prev->stick = stime;
+ prev->utick = utime;
+
/*
* If either stime or utime are 0, assume all runtime is userspace.
* Once a task gets some ticks, the monotonicity code at 'update:'
* will ensure things converge to the observed ratio.
*/
if (stime == 0) {
- utime = rtime;
+ delta_utime = delta_rtime;
goto update;
}

if (utime == 0) {
- stime = rtime;
+ delta_stime = delta_rtime;
goto update;
}

- stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
+ if (delta_stime <= 0)
+ goto update;
+
+ if (delta_utime <= 0)
+ goto update;
+
+ delta_stime = mul_u64_u64_div_u64(delta_stime, delta_rtime, delta_stime + delta_utime);

update:
/*
@@ -602,21 +617,22 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
* = (rtime_i+1 - rtime_i) + utime_i
* >= utime_i
*/
- if (stime < prev->stime)
- stime = prev->stime;
- utime = rtime - stime;
+ if (delta_stime <= 0)
+ delta_stime = 0;
+ delta_utime = delta_rtime - delta_stime;
+

/*
* Make sure utime doesn't go backwards; this still preserves
* monotonicity for stime, analogous argument to above.
*/
- if (utime < prev->utime) {
- utime = prev->utime;
- stime = rtime - utime;
+ if (delta_utime <= 0) {
+ delta_utime = 0;
+ delta_stime = delta_rtime - delta_utime;
}

- prev->stime = stime;
- prev->utime = utime;
+ prev->stime += delta_stime;
+ prev->utime += delta_utime;
out:
*ut = prev->utime;
*st = prev->stime;
--
2.20.1