[tip:timers/nohz] sched/cputime: Count actually elapsed irq & softirq time

From: tip-bot for Rik van Riel
Date: Thu Jul 14 2016 - 06:38:00 EST


Commit-ID: 57430218317e5b280a80582a139b26029c25de6c
Gitweb: http://git.kernel.org/tip/57430218317e5b280a80582a139b26029c25de6c
Author: Rik van Riel <riel@xxxxxxxxxx>
AuthorDate: Wed, 13 Jul 2016 16:50:01 +0200
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Thu, 14 Jul 2016 10:42:34 +0200

sched/cputime: Count actually elapsed irq & softirq time

Currently, if there was any irq or softirq time during 'ticks'
jiffies, the entire period will be accounted as irq or softirq
time.

This is inaccurate if only a subset of the time was actually spent
handling irqs, and could conceivably mis-count all of the ticks during
a period as irq time, when there was some irq and some softirq time.

This can actually happen when irqtime_account_process_tick is called
from account_idle_ticks, which can pass a larger number of ticks down
all at once.

Fix this by changing irqtime_account_hi_update(), irqtime_account_si_update(),
and steal_account_process_ticks() to work with cputime_t time units, and
return the amount of time spent in each mode.

Rename steal_account_process_ticks() to steal_account_process_time(), to
reflect that time is now accounted in cputime_t, instead of ticks.

Additionally, have irqtime_account_process_tick() take into account how
much time was spent in each of steal, irq, and softirq time.

The latter could help improve the accuracy of cputime
accounting when returning from idle on a NO_HZ_IDLE CPU.

Properly accounting how much time was spent in hardirq and
softirq time will also allow the NO_HZ_FULL code to re-use
these same functions for hardirq and softirq accounting.

Signed-off-by: Rik van Riel <riel@xxxxxxxxxx>
[ Make nsecs_to_cputime64() actually return cputime64_t. ]
Signed-off-by: Frederic Weisbecker <fweisbec@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Mike Galbraith <efault@xxxxxx>
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Radim Krcmar <rkrcmar@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Wanpeng Li <wanpeng.li@xxxxxxxxxxx>
Link: http://lkml.kernel.org/r/1468421405-20056-2-git-send-email-fweisbec@xxxxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
include/asm-generic/cputime_nsecs.h | 2 +
kernel/sched/cputime.c | 124 ++++++++++++++++++++++--------------
2 files changed, 79 insertions(+), 47 deletions(-)

diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index 0f1c6f3..a84e28e 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -50,6 +50,8 @@ typedef u64 __nocast cputime64_t;
(__force u64)(__ct)
#define nsecs_to_cputime(__nsecs) \
(__force cputime_t)(__nsecs)
+#define nsecs_to_cputime64(__nsecs) \
+ (__force cputime64_t)(__nsecs)


/*
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 3d60e5d..db82ae1 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -79,40 +79,50 @@ void irqtime_account_irq(struct task_struct *curr)
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);

-static int irqtime_account_hi_update(void)
+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ cputime_t irq_cputime;

local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_hardirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
- ret = 1;
+ irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
+ cpustat[CPUTIME_IRQ];
+ irq_cputime = min(irq_cputime, maxtime);
+ cpustat[CPUTIME_IRQ] += irq_cputime;
local_irq_restore(flags);
- return ret;
+ return irq_cputime;
}

-static int irqtime_account_si_update(void)
+static cputime_t irqtime_account_si_update(cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ cputime_t softirq_cputime;

local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_softirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
- ret = 1;
+ softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
+ cpustat[CPUTIME_SOFTIRQ];
+ softirq_cputime = min(softirq_cputime, maxtime);
+ cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
local_irq_restore(flags);
- return ret;
+ return softirq_cputime;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime (0)

+static cputime_t irqtime_account_hi_update(cputime_t dummy)
+{
+ return 0;
+}
+
+static cputime_t irqtime_account_si_update(cputime_t dummy)
+{
+ return 0;
+}
+
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,32 +267,45 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}

-static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
+static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
+ cputime_t steal_cputime;
u64 steal;
- unsigned long steal_jiffies;

steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;

- /*
- * steal is in nsecs but our caller is expecting steal
- * time in jiffies. Lets cast the result to jiffies
- * granularity and account the rest on the next rounds.
- */
- steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
- this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
+ steal_cputime = min(nsecs_to_cputime(steal), maxtime);
+ account_steal_time(steal_cputime);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);

- account_steal_time(jiffies_to_cputime(steal_jiffies));
- return steal_jiffies;
+ return steal_cputime;
}
#endif
return 0;
}

/*
+ * Account how much elapsed time was spent in steal, irq, or softirq time.
+ */
+static inline cputime_t account_other_time(cputime_t max)
+{
+ cputime_t accounted;
+
+ accounted = steal_account_process_time(max);
+
+ if (accounted < max)
+ accounted += irqtime_account_hi_update(max - accounted);
+
+ if (accounted < max)
+ accounted += irqtime_account_si_update(max - accounted);
+
+ return accounted;
+}
+
+/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
* tasks (sum on group iteration) belonging to @tsk's group.
*/
@@ -342,21 +365,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int ticks)
{
- cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
- u64 cputime = (__force u64) cputime_one_jiffy;
- u64 *cpustat = kcpustat_this_cpu->cpustat;
+ u64 cputime = (__force u64) cputime_one_jiffy * ticks;
+ cputime_t scaled, other;

- if (steal_account_process_tick(ULONG_MAX))
+ /*
+ * When returning from idle, many ticks can get accounted at
+ * once, including some ticks of steal, irq, and softirq time.
+ * Subtract those ticks from the amount of time accounted to
+ * idle, or potentially user or system time. Due to rounding,
+ * other time can exceed ticks occasionally.
+ */
+ other = account_other_time(cputime);
+ if (other >= cputime)
return;
+ cputime -= other;
+ scaled = cputime_to_scaled(cputime);

- cputime *= ticks;
- scaled *= ticks;
-
- if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += cputime;
- } else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += cputime;
- } else if (this_cpu_ksoftirqd() == p) {
+ if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
@@ -466,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t cputime, scaled, steal;
struct rq *rq = this_rq();

if (vtime_accounting_cpu_enabled())
@@ -477,16 +502,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}

- if (steal_account_process_tick(ULONG_MAX))
+ cputime = cputime_one_jiffy;
+ steal = steal_account_process_time(cputime);
+
+ if (steal >= cputime)
return;

+ cputime -= steal;
+ scaled = cputime_to_scaled(cputime);
+
if (user_tick)
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
- one_jiffy_scaled);
+ account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
else
- account_idle_time(cputime_one_jiffy);
+ account_idle_time(cputime);
}

/*
@@ -681,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
static cputime_t get_vtime_delta(struct task_struct *tsk)
{
unsigned long now = READ_ONCE(jiffies);
- unsigned long delta_jiffies, steal_jiffies;
+ cputime_t delta, steal;

- delta_jiffies = now - tsk->vtime_snap;
- steal_jiffies = steal_account_process_tick(delta_jiffies);
+ delta = jiffies_to_cputime(now - tsk->vtime_snap);
+ steal = steal_account_process_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap = now;

- return jiffies_to_cputime(delta_jiffies - steal_jiffies);
+ return delta - steal;
}

static void __vtime_account_system(struct task_struct *tsk)