[PATCH v2] sched/cputime: Ensure accurate utime and stime ratio in cputime_adjust()

From: Xunlei Pang
Date: Mon Jul 09 2018 - 10:59:22 EST


When users read "/proc/pid/stat", they expect the utime and stime ratio
of the current SAMPLE period, but cputime_adjust() currently always
calculates with the ratio over the WHOLE lifetime of the process.

This results in inaccurate utime and stime in "/proc/pid/stat". For
example, a process runs for a while with "50% usr, 0% sys", and then
switches to "100% sys". For the later period, the following is expected:
0.0 usr, 100.0 sys
but we got:
10.0 usr, 90.0 sys

This patch uses the accurate ratio in cputime_adjust() to address the
issue. A new field of type task_cputime is added to prev_cputime to record
the previous task_cputime, so that the times elapsed since the previous
sample can be used to compute the accurate ratio.

Signed-off-by: Xunlei Pang <xlpang@xxxxxxxxxxxxxxxxx>
---
v1->v2:
- Rewrite the changelog.

include/linux/sched.h | 34 ++++++++++++------------
include/linux/sched/cputime.h | 12 ++++++++-
kernel/sched/cputime.c | 61 ++++++++++++++++---------------------------
3 files changed, 52 insertions(+), 55 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 87bf02d93a27..9cb76005b638 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -223,10 +223,27 @@ extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);

+/**
+ * struct task_cputime - collected CPU time counts
+ * @utime: time spent in user mode, in nanoseconds
+ * @stime: time spent in kernel mode, in nanoseconds
+ * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
+ *
+ * This structure groups together three kinds of CPU time that are tracked for
+ * threads and thread groups. Most things considering CPU time want to group
+ * these counts together and treat all three of them in parallel.
+ */
+struct task_cputime {
+ u64 utime;
+ u64 stime;
+ unsigned long long sum_exec_runtime;
+};
+
/**
* struct prev_cputime - snapshot of system and user cputime
* @utime: time spent in user mode
* @stime: time spent in system mode
+ * @cputime: previous task_cputime to calculate utime/stime
* @lock: protects the above two fields
*
* Stores previous user/system time values such that we can guarantee
@@ -236,26 +253,11 @@ struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
u64 utime;
u64 stime;
+ struct task_cputime cputime;
raw_spinlock_t lock;
#endif
};

-/**
- * struct task_cputime - collected CPU time counts
- * @utime: time spent in user mode, in nanoseconds
- * @stime: time spent in kernel mode, in nanoseconds
- * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
- *
- * This structure groups together three kinds of CPU time that are tracked for
- * threads and thread groups. Most things considering CPU time want to group
- * these counts together and treat all three of them in parallel.
- */
-struct task_cputime {
- u64 utime;
- u64 stime;
- unsigned long long sum_exec_runtime;
-};
-
/* Alternate field names when used on cache expirations: */
#define virt_exp utime
#define prof_exp stime
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 53f883f5a2fd..49f8fd2564ed 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -175,10 +175,20 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
}

-static inline void prev_cputime_init(struct prev_cputime *prev)
+static inline void prev_cputime_clear(struct prev_cputime *prev)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
prev->utime = prev->stime = 0;
+ prev->cputime.utime = 0;
+ prev->cputime.stime = 0;
+ prev->cputime.sum_exec_runtime = 0;
+#endif
+}
+
+static inline void prev_cputime_init(struct prev_cputime *prev)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ prev_cputime_clear(prev);
raw_spin_lock_init(&prev->lock);
#endif
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0796f938c4f0..a68483ee3ad7 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -590,69 +590,54 @@ static u64 scale_stime(u64 stime, u64 rtime, u64 total)
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
u64 *ut, u64 *st)
{
- u64 rtime, stime, utime;
+ u64 rtime_delta, stime_delta, utime_delta;
unsigned long flags;

/* Serialize concurrent callers such that we can honour our guarantees */
raw_spin_lock_irqsave(&prev->lock, flags);
- rtime = curr->sum_exec_runtime;

/*
* This is possible under two circumstances:
- * - rtime isn't monotonic after all (a bug);
+ * - task_cputime isn't monotonic after all (a bug);
* - we got reordered by the lock.
*
* In both cases this acts as a filter such that the rest of the code
* can assume it is monotonic regardless of anything else.
*/
- if (prev->stime + prev->utime >= rtime)
+ if (prev->cputime.utime > curr->utime ||
+ prev->cputime.stime > curr->stime ||
+ prev->cputime.sum_exec_runtime >= curr->sum_exec_runtime)
goto out;

- stime = curr->stime;
- utime = curr->utime;
+ stime_delta = curr->stime - prev->cputime.stime;
+ utime_delta = curr->utime - prev->cputime.utime;
+ rtime_delta = curr->sum_exec_runtime - prev->cputime.sum_exec_runtime;

/*
- * If either stime or utime are 0, assume all runtime is userspace.
- * Once a task gets some ticks, the monotonicy code at 'update:'
- * will ensure things converge to the observed ratio.
+ * If either the stime or the utime increase is 0, attribute all of
+ * the runtime increase to the other one. Once a task gets some ticks,
+ * later sample periods are scaled by the observed ratio.
*/
- if (stime == 0) {
- utime = rtime;
+ if (stime_delta == 0) {
+ utime_delta = rtime_delta;
goto update;
}

- if (utime == 0) {
- stime = rtime;
+ if (utime_delta == 0) {
+ stime_delta = rtime_delta;
goto update;
}

- stime = scale_stime(stime, rtime, stime + utime);
+ stime_delta = scale_stime(stime_delta, rtime_delta,
+ stime_delta + utime_delta);
+ if (stime_delta > rtime_delta)
+ stime_delta = rtime_delta;
+ utime_delta = rtime_delta - stime_delta;

update:
- /*
- * Make sure stime doesn't go backwards; this preserves monotonicity
- * for utime because rtime is monotonic.
- *
- * utime_i+1 = rtime_i+1 - stime_i
- * = rtime_i+1 - (rtime_i - utime_i)
- * = (rtime_i+1 - rtime_i) + utime_i
- * >= utime_i
- */
- if (stime < prev->stime)
- stime = prev->stime;
- utime = rtime - stime;
-
- /*
- * Make sure utime doesn't go backwards; this still preserves
- * monotonicity for stime, analogous argument to above.
- */
- if (utime < prev->utime) {
- utime = prev->utime;
- stime = rtime - utime;
- }
-
- prev->stime = stime;
- prev->utime = utime;
+ prev->cputime = *curr;
+ prev->utime += utime_delta;
+ prev->stime += stime_delta;
out:
*ut = prev->utime;
*st = prev->stime;
--
2.14.1.40.g8e62ba1