Re: [BUG nohz]: wrong user and system time accounting

From: Wanpeng Li
Date: Thu Apr 13 2017 - 00:31:19 EST


2017-04-12 22:57 GMT+08:00 Thomas Gleixner <tglx@xxxxxxxxxxxxx>:
> On Wed, 12 Apr 2017, Frederic Weisbecker wrote:
>> On Tue, Apr 11, 2017 at 04:22:48PM +0200, Thomas Gleixner wrote:
>> > It's not different from the current jiffies based stuff at all. Same
>> > failure mode.
>>
>> Yes you're right, I got confused again. So to fix this we could do our snapshots
>> at a frequency lower than HZ but still high enough to avoid overhead.
>>
>> Something like TICK_NSEC / 2 ?
>
> If you are using TSC anyway then you can do proper accumulation for both
> system and user and only account the data when the accumulation is more
> than a jiffie.

So I implement it as below:

- HZ=1000.
1) two cpu hogs on cpu in nohz_full mode, 100% user time
2) Luzi's testcase, ~95% user time, ~5% idle time (as we expected)
- HZ=250
1) two cpu hogs on cpu in nohz_full mode, 100% user time
2) Luzi's testcase, 100% idle

So the codes below still not work correctly for HZ=250, any suggestions?

-------------------------------------->8-----------------------------------------------------

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee8..6a11771 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -668,6 +668,8 @@ struct task_struct {
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqcount_t vtime_seqcount;
unsigned long long vtime_snap;
+ u64 vtime_acct_utime;
+ u64 vtime_acct_stime;
enum {
/* Task is sleeping or running in a CPU with VTIME inactive: */
VTIME_INACTIVE = 0,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b..f8e54ba 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -674,20 +674,41 @@ void thread_group_cputime_adjusted(struct
task_struct *p, u64 *ut, u64 *st)
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct task_struct *tsk, bool user)
{
- unsigned long now = READ_ONCE(jiffies);
+ u64 delta, ret = 0;

- if (time_before(now, (unsigned long)tsk->vtime_snap))
- return 0;
+ delta = sched_clock() - tsk->vtime_snap;

- return jiffies_to_nsecs(now - tsk->vtime_snap);
+ if (is_idle_task(tsk)) {
+ if (delta >= TICK_NSEC)
+ ret = delta;
+ } else {
+ if (user) {
+ tsk->vtime_acct_utime += delta;
+ if (tsk->vtime_acct_utime >= TICK_NSEC)
+ ret = tsk->vtime_acct_utime;
+ } else {
+ tsk->vtime_acct_stime += delta;
+ if (tsk->vtime_acct_utime >= TICK_NSEC)
+ ret = tsk->vtime_acct_stime;
+ }
+ }
+
+ return ret;
}

-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct task_struct *tsk, bool user)
{
- unsigned long now = READ_ONCE(jiffies);
- u64 delta, other;
+ u64 delta = vtime_delta(tsk, user);
+ u64 other;
+
+ if (!is_idle_task(tsk)) {
+ if (user)
+ tsk->vtime_acct_utime = 0;
+ else
+ tsk->vtime_acct_stime = 0;
+ }

/*
* Unlike tick based timing, vtime based timing never has lost
@@ -696,22 +717,21 @@ static u64 get_vtime_delta(struct task_struct *tsk)
* elapsed time. Limit account_other_time to prevent rounding
* errors from causing elapsed vtime to go negative.
*/
- delta = jiffies_to_nsecs(now - tsk->vtime_snap);
other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
- tsk->vtime_snap = now;
+ tsk->vtime_snap += delta;

return delta - other;
}

static void __vtime_account_system(struct task_struct *tsk)
{
- account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+ account_system_time(tsk, irq_count(), get_vtime_delta(tsk, false));
}

void vtime_account_system(struct task_struct *tsk)
{
- if (!vtime_delta(tsk))
+ if (!vtime_delta(tsk, false))
return;

write_seqcount_begin(&tsk->vtime_seqcount);
@@ -723,15 +743,15 @@ void vtime_account_user(struct task_struct *tsk)
{
write_seqcount_begin(&tsk->vtime_seqcount);
tsk->vtime_snap_whence = VTIME_SYS;
- if (vtime_delta(tsk))
- account_user_time(tsk, get_vtime_delta(tsk));
+ if (vtime_delta(tsk, true))
+ account_user_time(tsk, get_vtime_delta(tsk, true));
write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
+ if (vtime_delta(tsk, false))
__vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER;
write_seqcount_end(&tsk->vtime_seqcount);
@@ -747,7 +767,7 @@ void vtime_guest_enter(struct task_struct *tsk)
* that can thus safely catch up with a tickless delta.
*/
write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
+ if (vtime_delta(tsk, false))
__vtime_account_system(tsk);
current->flags |= PF_VCPU;
write_seqcount_end(&tsk->vtime_seqcount);
@@ -765,7 +785,7 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit);

void vtime_account_idle(struct task_struct *tsk)
{
- account_idle_time(get_vtime_delta(tsk));
+ account_idle_time(get_vtime_delta(tsk, false));
}

void arch_vtime_task_switch(struct task_struct *prev)
@@ -776,7 +796,7 @@ void arch_vtime_task_switch(struct task_struct *prev)

write_seqcount_begin(&current->vtime_seqcount);
current->vtime_snap_whence = VTIME_SYS;
- current->vtime_snap = jiffies;
+ current->vtime_snap = sched_clock_cpu(smp_processor_id());
write_seqcount_end(&current->vtime_seqcount);
}

@@ -787,7 +807,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags);
write_seqcount_begin(&t->vtime_seqcount);
t->vtime_snap_whence = VTIME_SYS;
- t->vtime_snap = jiffies;
+ t->vtime_snap = sched_clock_cpu(cpu);
write_seqcount_end(&t->vtime_seqcount);
local_irq_restore(flags);
}
@@ -805,7 +825,7 @@ u64 task_gtime(struct task_struct *t)

gtime = t->gtime;
if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
- gtime += vtime_delta(t);
+ gtime += vtime_delta(t, false);

} while (read_seqcount_retry(&t->vtime_seqcount, seq));

@@ -819,7 +839,6 @@ u64 task_gtime(struct task_struct *t)
*/
void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
- u64 delta;
unsigned int seq;

if (!vtime_accounting_enabled()) {
@@ -838,16 +857,14 @@ void task_cputime(struct task_struct *t, u64
*utime, u64 *stime)
if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
continue;

- delta = vtime_delta(t);
-
/*
* Task runs either in user or kernel space, add pending nohz time to
* the right place.
*/
if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
- *utime += delta;
+ *utime += vtime_delta(t, true);
else if (t->vtime_snap_whence == VTIME_SYS)
- *stime += delta;
+ *stime += vtime_delta(t, false);
} while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */