Re: [PATCH 1/1] cputime: Make the reported utime+stime correspond to the actual runtime.
From: Fredrik MarkstrÃm
Date: Mon Jun 29 2015 - 11:29:23 EST
Hello Peter, the locking part looks good, I don't have a strong
opinion on per task/signal lock vs global lock.
But with the patch we still update prev->utime and prev->stime
independently, which was the original problem. But maybe the locking
and monoticity/sum issue should be addressed by two separate patches
since they are separate bugs ?
The part I'm referring to is the change below from my original patch
(maybe without the WARN_ON:s ?):
â
- cputime_advance(&prev->stime, stime);
- cputime_advance(&prev->utime, utime);
+ if (stime < prev->stime) {
+ stime = prev->stime;
+ utime = rtime - stime;
+ } else if (utime < prev->utime) {
+ utime = prev->utime;
+ stime = rtime - utime;
+ }
+ WARN_ON(stime < prev->stime);
+ WARN_ON(utime < prev->utime);
+ WARN_ON(stime + utime != rtime);
I can prepare and test such a patch set (based on yours) if you would
like me to, or you can do it yourself. Either way I'll be happy.
/Fredrik
On Mon, Jun 29, 2015 at 4:58 PM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> Sorry for the delay, I seem to have gotten distracted...
>
> On Mon, Jun 15, 2015 at 05:34:11PM +0200, Fredrik MarkstrÃm wrote:
>> Hello Peter, your patch helps with some of the cases but not all:
>
> Indeed, and barring cmpxchg_double(), which is not available on all
> platforms, the thing needs a lock indeed.
>
> Now, while you're probably right in that contention is unlikely for sane
> behaviour, I could imagine some perverted apps hammering
> getrusage() just because they can.
>
> Therefore, find attached a version that has a per task/signal lock.
>
> ---
> Subject: sched,cputime: Serialize cputime_adjust()
>
> Fredrik reports that top and other tools can occasionally observe >100%
> cpu usage and reports that this is because cputime_adjust() callers are
> not serialized.
>
> This means that when the s/u-time sample values are small, and change
> can shift the balance quickly, concurrent updaters can race such that
> one advances stime while the other advances utime such that the sum will
> be vastly larger than the initial rtime.
>
> There is also an issue with calculating utime as rtime - stime, where is
> if the computed stime is smaller then the previously returned stime,
> our utime will be larger than it should be, making the above problem
> worse.
>
> Cc: Jason Low <jason.low2@xxxxxx>
> Cc: Rik van Riel <riel@xxxxxxxxxx>
> Cc: Frederic Weisbecker <fweisbec@xxxxxxxxx>
> Suggested-by: Fredrik Markstrom <fredrik.markstrom@xxxxxxxxx>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> ---
> include/linux/init_task.h | 10 ++++++++++
> include/linux/sched.h | 23 +++++++++++++++--------
> kernel/fork.c | 7 ++++---
> kernel/sched/cputime.c | 34 +++++++++-------------------------
> 4 files changed, 38 insertions(+), 36 deletions(-)
>
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index bb9b075f0eb0..e38681f4912d 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -39,6 +39,14 @@ extern struct fs_struct init_fs;
> #define INIT_CPUSET_SEQ(tsk)
> #endif
>
> +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> +#define INIT_PREV_CPUTIME(x) .prev_cputime = { \
> + .lock = __RAW_SPIN_LOCK_UNLOCKED(x.prev_cputime.lock), \
> +},
> +#else
> +#define INIT_PREV_CPUTIME(x)
> +#endif
> +
> #define INIT_SIGNALS(sig) { \
> .nr_threads = 1, \
> .thread_head = LIST_HEAD_INIT(init_task.thread_node), \
> @@ -53,6 +61,7 @@ extern struct fs_struct init_fs;
> .cputime_atomic = INIT_CPUTIME_ATOMIC, \
> .running = 0, \
> }, \
> + INIT_PREV_CPUTIME(sig) \
> .cred_guard_mutex = \
> __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
> INIT_GROUP_RWSEM(sig) \
> @@ -254,6 +263,7 @@ extern struct task_group root_task_group;
> INIT_TASK_RCU_TASKS(tsk) \
> INIT_CPUSET_SEQ(tsk) \
> INIT_RT_MUTEXES(tsk) \
> + INIT_PREV_CPUTIME(tsk) \
> INIT_VTIME(tsk) \
> INIT_NUMA_BALANCING(tsk) \
> INIT_KASAN(tsk) \
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 6633e83e608a..5dfbe92ce04e 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -531,17 +531,28 @@ struct cpu_itimer {
> };
>
> /**
> - * struct cputime - snaphsot of system and user cputime
> + * struct prev_cputime - snaphsot of system and user cputime
> * @utime: time spent in user mode
> * @stime: time spent in system mode
> *
> * Gathers a generic snapshot of user and system time.
> */
> -struct cputime {
> +struct prev_cputime {
> +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> cputime_t utime;
> cputime_t stime;
> + raw_spinlock_t lock;
> +#endif
> };
>
> +static inline void prev_cputime_init(struct prev_cputime *prev)
> +{
> +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> + prev->utime = prev->stime = 0;
> + raw_spin_lock_init(&prev->lock);
> +#endif
> +}
> +
> /**
> * struct task_cputime - collected CPU time counts
> * @utime: time spent in user mode, in &cputime_t units
> @@ -716,9 +727,7 @@ struct signal_struct {
> cputime_t utime, stime, cutime, cstime;
> cputime_t gtime;
> cputime_t cgtime;
> -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> - struct cputime prev_cputime;
> -#endif
> + struct prev_cputime prev_cputime;
> unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
> unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
> unsigned long inblock, oublock, cinblock, coublock;
> @@ -1494,9 +1503,7 @@ struct task_struct {
>
> cputime_t utime, stime, utimescaled, stimescaled;
> cputime_t gtime;
> -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> - struct cputime prev_cputime;
> -#endif
> + struct prev_cputime prev_cputime;
> #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> seqlock_t vtime_seqlock;
> unsigned long long vtime_snap;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index c9507afd4a7d..306481d3a0e0 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1067,6 +1067,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
> rcu_assign_pointer(tsk->sighand, sig);
> if (!sig)
> return -ENOMEM;
> +
> atomic_set(&sig->count, 1);
> memcpy(sig->action, current->sighand->action, sizeof(sig->action));
> return 0;
> @@ -1128,6 +1129,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
> init_sigpending(&sig->shared_pending);
> INIT_LIST_HEAD(&sig->posix_timers);
> seqlock_init(&sig->stats_lock);
> + prev_cputime_init(&sig->prev_cputime);
>
> hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> sig->real_timer.function = it_real_fn;
> @@ -1338,9 +1340,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>
> p->utime = p->stime = p->gtime = 0;
> p->utimescaled = p->stimescaled = 0;
> -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> - p->prev_cputime.utime = p->prev_cputime.stime = 0;
> -#endif
> + prev_cputime_init(&p->prev_cputime);
> +
> #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> seqlock_init(&p->vtime_seqlock);
> p->vtime_snap = 0;
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index f5a64ffad176..3acfab426e4f 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -555,32 +555,16 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
> }
>
> /*
> - * Atomically advance counter to the new value. Interrupts, vcpu
> - * scheduling, and scaling inaccuracies can cause cputime_advance
> - * to be occasionally called with a new value smaller than counter.
> - * Let's enforce atomicity.
> - *
> - * Normally a caller will only go through this loop once, or not
> - * at all in case a previous caller updated counter the same jiffy.
> - */
> -static void cputime_advance(cputime_t *counter, cputime_t new)
> -{
> - cputime_t old;
> -
> - while (new > (old = READ_ONCE(*counter)))
> - cmpxchg_cputime(counter, old, new);
> -}
> -
> -/*
> * Adjust tick based cputime random precision against scheduler
> * runtime accounting.
> */
> static void cputime_adjust(struct task_cputime *curr,
> - struct cputime *prev,
> + struct prev_cputime *prev,
> cputime_t *ut, cputime_t *st)
> {
> cputime_t rtime, stime, utime;
>
> + raw_spin_lock(&prev->lock);
> /*
> * Tick based cputime accounting depend on random scheduling
> * timeslices of a task to be interrupted or not by the timer.
> @@ -609,19 +593,19 @@ static void cputime_adjust(struct task_cputime *curr,
> } else if (stime == 0) {
> utime = rtime;
> } else {
> - cputime_t total = stime + utime;
> -
> - stime = scale_stime((__force u64)stime,
> - (__force u64)rtime, (__force u64)total);
> + stime = scale_stime((__force u64)stime, (__force u64)rtime,
> + (__force u64)(stime + utime));
> + if (stime < prev->stime)
> + stime = prev->stime;
> utime = rtime - stime;
> }
>
> - cputime_advance(&prev->stime, stime);
> - cputime_advance(&prev->utime, utime);
> -
> + prev->stime = max(prev->stime, stime);
> + prev->utime = max(prev->utime, utime);
> out:
> *ut = prev->utime;
> *st = prev->stime;
> + raw_spin_unlock(&prev->lock);
> }
>
> void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
--
/Fredrik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/