Re: [PATCH 1/1] cputime: Make the reported utime+stime correspond to the actual runtime.
From: Fredrik Markström
Date: Tue Jun 30 2015 - 07:50:56 EST
Excellent,
The reason I replaced the early bail with that last test is that I
believe it needs to be done within the lock and I wanted to keep that
region short. To be honest I'm not sure this test is needed at all
anymore, but I couldn't make sense of the comment above the early bail
so I didn't dare to remove it.
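
For reference, the check and comment I mean are, if I am reading the
current cputime_adjust() right, these lines near the top of the function:

        /*
         * Update userspace visible utime/stime values only if actual execution
         * time is bigger than already exported. Note that can happen, that we
         * provided bigger values due to scaling inaccuracy on big numbers.
         */
        if (prev->stime + prev->utime >= rtime)
                goto out;
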
Regarding the lock, have you considered how many cores you would need
hammering at rusage to introduce any substantial contention? On ARM this
region is ~10 CPU cycles long, and I measure a null system call (lmbench:
lat_syscall null) at ~1000 cycles (with some kernel debug turned on).
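
Just to put a rough number on that, a quick user-space hammer along these
lines (a sketch, untested) could show whether the per-call cost climbs with
the thread count, since getrusage(RUSAGE_SELF) should end up in
cputime_adjust() and take the new lock:

/* rusage_hammer.c -- build with: gcc -O2 -pthread rusage_hammer.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>
#include <time.h>

#define ITERS 1000000L

static void *hammer(void *arg)
{
        struct rusage ru;
        long i;

        for (i = 0; i < ITERS; i++)
                getrusage(RUSAGE_SELF, &ru);
        return NULL;
}

int main(int argc, char **argv)
{
        int i, nthreads = argc > 1 ? atoi(argv[1]) : 4;
        pthread_t tid[64];
        struct timespec t0, t1;
        double secs;

        if (nthreads < 1 || nthreads > 64)
                nthreads = 4;

        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (i = 0; i < nthreads; i++)
                pthread_create(&tid[i], NULL, hammer, NULL);
        for (i = 0; i < nthreads; i++)
                pthread_join(tid[i], NULL);
        clock_gettime(CLOCK_MONOTONIC, &t1);

        secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
        /*
         * Each thread makes ITERS calls concurrently; if the lock is
         * contended, this per-call figure should climb with nthreads.
         */
        printf("%d threads: ~%.0f ns per getrusage() call\n",
               nthreads, secs * 1e9 / ITERS);
        return 0;
}
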
Sorry for not letting this go (I know I should), but I always feel bad
about introducing per-thread data.
/Fredrik
On Tue, Jun 30, 2015 at 11:30 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Mon, Jun 29, 2015 at 05:28:42PM +0200, Fredrik Markström wrote:
>> Hello Peter, the locking part looks good; I don't have a strong
>> opinion on a per-task/signal lock vs. a global lock.
>>
>> But with the patch we still update prev->utime and prev->stime
>> independently, which was the original problem.
>
> Ah,..
>
>> > @@ -609,19 +593,19 @@ static void cputime_adjust(struct task_cputime *curr,
>> > } else if (stime == 0) {
>> > utime = rtime;
>> > } else {
>> > + stime = scale_stime((__force u64)stime, (__force u64)rtime,
>> > + (__force u64)(stime + utime));
>> > + if (stime < prev->stime)
>> > + stime = prev->stime;
>> > utime = rtime - stime;
>> > }
>> >
>> > + prev->stime = max(prev->stime, stime);
>> > + prev->utime = max(prev->utime, utime);
>> > out:
>> > *ut = prev->utime;
>> > *st = prev->stime;
>> > + raw_spin_unlock(&prev->lock);
>> > }
>
> So the things we want from this function are that:
>
> - stime + utime == rtime
> - stime_i+1 >= stime_i, utime_i+1 >= utime_i
>
> Under the assumption that rtime_i+1 >= rtime_i.
>
> There are 3 cases:
>
> 1) utime == 0 -> stime = rtime
> 2) stime == 0 -> utime = rtime
>
> Both these trivially meet the above conditions, and:
>
> 3) stime_i+1 = max((stime * rtime) / (stime + utime), stime_i)
> utime = rtime - stime
>
> Which I thought would meet them, because we keep stime from shrinking and
> therefore utime from growing too big. But there is indeed the other side
> to consider: what if stime grows too big and makes the new utime too
> small? Then we update stime but not utime and break the first condition.
>
> Now, you propose:
>
>> + if (stime < prev->stime) {
>> + stime = prev->stime;
>> + utime = rtime - stime;
>
> Right, I have this branch; it keeps stime in check as per the above.
>
> Since we set stime_i+1 = stime_i, utime_i+1 = rtime - stime_i >= utime_i
> since rtime_i+1 >= rtime_i.
>
>> + } else if (utime < prev->utime) {
>> + utime = prev->utime;
>> + stime = rtime - utime;
>> + }
>
> And that deals with the other side, similar proof.
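
To make the failure mode concrete with made-up numbers, here is a quick
user-space mock of the two clamps (illustration only, not the kernel code):

/* clamp_demo.c -- made-up numbers showing why both clamps are needed */
#include <stdio.h>

static unsigned long prev_stime = 50, prev_utime = 50;  /* state after pass i */

static void adjust(unsigned long rtime, unsigned long stime,
                   unsigned long utime)
{
        /* scale_stime(stime, rtime, stime + utime), plain integer math */
        stime = stime * rtime / (stime + utime);

        if (stime < prev_stime)         /* keep stime from going backwards */
                stime = prev_stime;
        utime = rtime - stime;

        if (utime < prev_utime) {       /* keep utime from going backwards */
                utime = prev_utime;
                stime = rtime - utime;
        }

        prev_stime = stime;
        prev_utime = utime;
        printf("rtime=%lu -> stime=%lu utime=%lu (sum %lu)\n",
               rtime, stime, utime, stime + utime);
}

int main(void)
{
        /*
         * Pass i ended with prev = {50, 50} out of rtime = 100.  On pass
         * i+1 rtime only grew to 101, but the raw tick split moved a lot:
         * stime = 60, utime = 41.  The first clamp does not fire (60 >= 50),
         * so without the second one utime would drop from 50 to 41; with
         * both we report stime = 51, utime = 50 and the invariants hold.
         */
        adjust(101, 60, 41);
        return 0;
}
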
>
>
>> + if (prev->stime + prev->utime < rtime) {
>> + prev->stime = stime;
>> + prev->utime = utime;
>> + }
>
> This I don't really agree with; we can leave the inverse check as is and
> simply bail early.
>
> Folding this all together gives me
>
> ---
> include/linux/init_task.h | 10 ++++++++++
> include/linux/sched.h | 23 +++++++++++++--------
> kernel/fork.c | 7 ++++---
> kernel/sched/cputime.c | 51 ++++++++++++++++++++++-------------------------
> 4 files changed, 53 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index bb9b075f0eb0..e38681f4912d 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -39,6 +39,14 @@ extern struct fs_struct init_fs;
> #define INIT_CPUSET_SEQ(tsk)
> #endif
>
> +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> +#define INIT_PREV_CPUTIME(x) .prev_cputime = { \
> + .lock = __RAW_SPIN_LOCK_UNLOCKED(x.prev_cputime.lock), \
> +},
> +#else
> +#define INIT_PREV_CPUTIME(x)
> +#endif
> +
> #define INIT_SIGNALS(sig) { \
> .nr_threads = 1, \
> .thread_head = LIST_HEAD_INIT(init_task.thread_node), \
> @@ -53,6 +61,7 @@ extern struct fs_struct init_fs;
> .cputime_atomic = INIT_CPUTIME_ATOMIC, \
> .running = 0, \
> }, \
> + INIT_PREV_CPUTIME(sig) \
> .cred_guard_mutex = \
> __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
> INIT_GROUP_RWSEM(sig) \
> @@ -254,6 +263,7 @@ extern struct task_group root_task_group;
> INIT_TASK_RCU_TASKS(tsk) \
> INIT_CPUSET_SEQ(tsk) \
> INIT_RT_MUTEXES(tsk) \
> + INIT_PREV_CPUTIME(tsk) \
> INIT_VTIME(tsk) \
> INIT_NUMA_BALANCING(tsk) \
> INIT_KASAN(tsk) \
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 6633e83e608a..5dfbe92ce04e 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -531,17 +531,28 @@ struct cpu_itimer {
> };
>
> /**
> - * struct cputime - snaphsot of system and user cputime
> + * struct prev_cputime - snaphsot of system and user cputime
> * @utime: time spent in user mode
> * @stime: time spent in system mode
> *
> * Gathers a generic snapshot of user and system time.
> */
> -struct cputime {
> +struct prev_cputime {
> +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> cputime_t utime;
> cputime_t stime;
> + raw_spinlock_t lock;
> +#endif
> };
>
> +static inline void prev_cputime_init(struct prev_cputime *prev)
> +{
> +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> + prev->utime = prev->stime = 0;
> + raw_spin_lock_init(&prev->lock);
> +#endif
> +}
> +
> /**
> * struct task_cputime - collected CPU time counts
> * @utime: time spent in user mode, in &cputime_t units
> @@ -716,9 +727,7 @@ struct signal_struct {
> cputime_t utime, stime, cutime, cstime;
> cputime_t gtime;
> cputime_t cgtime;
> -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> - struct cputime prev_cputime;
> -#endif
> + struct prev_cputime prev_cputime;
> unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
> unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
> unsigned long inblock, oublock, cinblock, coublock;
> @@ -1494,9 +1503,7 @@ struct task_struct {
>
> cputime_t utime, stime, utimescaled, stimescaled;
> cputime_t gtime;
> -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> - struct cputime prev_cputime;
> -#endif
> + struct prev_cputime prev_cputime;
> #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> seqlock_t vtime_seqlock;
> unsigned long long vtime_snap;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index c9507afd4a7d..306481d3a0e0 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1067,6 +1067,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
> rcu_assign_pointer(tsk->sighand, sig);
> if (!sig)
> return -ENOMEM;
> +
> atomic_set(&sig->count, 1);
> memcpy(sig->action, current->sighand->action, sizeof(sig->action));
> return 0;
> @@ -1128,6 +1129,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
> init_sigpending(&sig->shared_pending);
> INIT_LIST_HEAD(&sig->posix_timers);
> seqlock_init(&sig->stats_lock);
> + prev_cputime_init(&sig->prev_cputime);
>
> hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> sig->real_timer.function = it_real_fn;
> @@ -1338,9 +1340,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>
> p->utime = p->stime = p->gtime = 0;
> p->utimescaled = p->stimescaled = 0;
> -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> - p->prev_cputime.utime = p->prev_cputime.stime = 0;
> -#endif
> + prev_cputime_init(&p->prev_cputime);
> +
> #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> seqlock_init(&p->vtime_seqlock);
> p->vtime_snap = 0;
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index f5a64ffad176..f3af6d86f38b 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -555,32 +555,16 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
> }
>
> /*
> - * Atomically advance counter to the new value. Interrupts, vcpu
> - * scheduling, and scaling inaccuracies can cause cputime_advance
> - * to be occasionally called with a new value smaller than counter.
> - * Let's enforce atomicity.
> - *
> - * Normally a caller will only go through this loop once, or not
> - * at all in case a previous caller updated counter the same jiffy.
> - */
> -static void cputime_advance(cputime_t *counter, cputime_t new)
> -{
> - cputime_t old;
> -
> - while (new > (old = READ_ONCE(*counter)))
> - cmpxchg_cputime(counter, old, new);
> -}
> -
> -/*
> * Adjust tick based cputime random precision against scheduler
> * runtime accounting.
> */
> static void cputime_adjust(struct task_cputime *curr,
> - struct cputime *prev,
> + struct prev_cputime *prev,
> cputime_t *ut, cputime_t *st)
> {
> cputime_t rtime, stime, utime;
>
> + raw_spin_lock(&prev->lock);
> /*
> * Tick based cputime accounting depend on random scheduling
> * timeslices of a task to be interrupted or not by the timer.
> @@ -606,22 +590,35 @@ static void cputime_adjust(struct task_cputime *curr,
>
> if (utime == 0) {
> stime = rtime;
> - } else if (stime == 0) {
> - utime = rtime;
> - } else {
> - cputime_t total = stime + utime;
> + goto update;
> + }
>
> - stime = scale_stime((__force u64)stime,
> - (__force u64)rtime, (__force u64)total);
> - utime = rtime - stime;
> + if (stime == 0) {
> + utime = rtime;
> + goto update;
> }
>
> - cputime_advance(&prev->stime, stime);
> - cputime_advance(&prev->utime, utime);
> + stime = scale_stime((__force u64)stime, (__force u64)rtime,
> + (__force u64)(stime + utime));
> +
> + /* make sure stime doesn't go backwards */
> + if (stime < prev->stime)
> + stime = prev->stime;
> + utime = rtime - stime;
> +
> + /* make sure utime doesn't go backwards */
> + if (utime < prev->utime) {
> + utime = prev->utime;
> + stime = rtime - utime;
> + }
>
> +update:
> + prev->stime = stime;
> + prev->utime = utime;
> out:
> *ut = prev->utime;
> *st = prev->stime;
> + raw_spin_unlock(&prev->lock);
> }
>
> void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
--
/Fredrik