[GIT pull] timers/nohz for v7.2-rc1

From: Thomas Gleixner

Date: Sat Jun 13 2026 - 17:25:55 EST

Linus,

please pull the latest timers/nohz branch from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers-nohz-2026-06-13

up to: 6199f9999a9b: sched/cputime: Handle dyntick-idle steal time correctly

Updates for the NOHZ subsystem:

- Fix a long standing TOCTOU in get_cpu_sleep_time_us()

- Make the CPU offline NOHZ handling more robust by disabling NOHZ on the
outgoing CPU early instead of creating unneeded state which needs to be
undone.

- Unify idle CPU time accounting instead of having two different
accounting mechanisms. These two different mechanisms are not really
independent, but the different properties can in the worst case cause
that gloabl idle time can be observed going backwards.

- Consolidate the idle/iowait time retrieval interfaces instead of
converting back and forth between them.

- Make idle interrupt time accounting more robust. The original code
assumes that interrupt time accouting is enabled and therefore stops
elapsing idle time while an interrupt is handled in NOHZ dyntick
state. That assumption is not correct as interrupt time accounting can
be disabled at compile and runtime.

- Fix an accounting error between dyntick idle time and dyntick idle
steal time. The stolen time is not accounted and therefore idle time
becomes inaccurate. The stolen time is now accounted after the fact as
there is no way to predict the steal time upfront.

Thanks,

tglx

------------------>
Frederic Weisbecker (15):
tick/sched: Fix TOCTOU in nohz idle time fetch
sched/idle: Handle offlining first in idle loop
sched/cputime: Remove superfluous and error prone kcpustat_field() parameter
sched/cputime: Correctly support generic vtime idle time
powerpc/time: Prepare to stop elapsing in dynticks-idle
s390/time: Prepare to stop elapsing in dynticks-idle
tick/sched: Unify idle cputime accounting
tick/sched: Remove nohz disabled special case in cputime fetch
tick/sched: Move dyntick-idle cputime accounting to cputime code
tick/sched: Remove unused fields
tick/sched: Account tickless idle cputime only when tick is stopped
tick/sched: Consolidate idle time fetching APIs
sched/cputime: Provide get_cpu_[idle|iowait]_time_us() off-case
sched/cputime: Handle idle irqtime gracefully
sched/cputime: Handle dyntick-idle steal time correctly

arch/powerpc/kernel/time.c | 41 +++++
arch/s390/include/asm/idle.h | 2 +
arch/s390/kernel/idle.c | 5 +-
arch/s390/kernel/vtime.c | 75 ++++++++-
drivers/cpufreq/cpufreq.c | 29 +---
drivers/cpufreq/cpufreq_governor.c | 6 +-
drivers/macintosh/rack-meter.c | 2 +-
fs/proc/stat.c | 40 +----
fs/proc/uptime.c | 8 +-
include/linux/kernel_stat.h | 76 +++++++--
include/linux/tick.h | 4 -
include/linux/vtime.h | 22 ++-
kernel/rcu/tree.c | 9 +-
kernel/rcu/tree_stall.h | 7 +-
kernel/sched/core.c | 6 +-
kernel/sched/cputime.c | 308 +++++++++++++++++++++++++++++++------
kernel/sched/idle.c | 13 +-
kernel/time/tick-sched.c | 212 ++++++-------------------
kernel/time/tick-sched.h | 12 --
kernel/time/timer_list.c | 6 +-
scripts/gdb/linux/timerlist.py | 4 -
21 files changed, 529 insertions(+), 358 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index b4472288e0d4..3460d1a5a97c 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -376,6 +376,47 @@ void vtime_task_switch(struct task_struct *prev)
acct->starttime = acct0->starttime;
}
}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/**
+ * vtime_reset - Fast forward vtime entry clocks
+ *
+ * Called from dynticks idle IRQ entry to fast-forward the clocks to current time
+ * so that the IRQ time is still accounted by vtime while nohz cputime is paused.
+ */
+void vtime_reset(void)
+{
+ struct cpu_accounting_data *acct = get_accounting(current);
+
+ acct->starttime = mftb();
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
+ acct->startspurr = read_spurr(acct->starttime);
+#endif
+}
+
+/**
+ * vtime_dyntick_start - Inform vtime about entry to idle-dynticks
+ *
+ * Called when idle enters in dyntick mode. The idle cputime that elapsed so far
+ * is accumulated and the tick subsystem takes over the idle cputime accounting.
+ */
+void vtime_dyntick_start(void)
+{
+ vtime_account_idle(current);
+}
+
+/**
+ * vtime_dyntick_stop - Inform vtime about exit from idle-dynticks
+ *
+ * Called when idle exits from dyntick mode. The vtime entry clocks are
+ * fast-forward to current time so that idle accounting restarts elapsing from
+ * now.
+ */
+void vtime_dyntick_stop(void)
+{
+ vtime_reset();
+}
+#endif /* CONFIG_NO_HZ_COMMON */
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

void __no_kcsan __delay(unsigned long loops)
diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h
index 32536ee34aa0..e4ad09a22400 100644
--- a/arch/s390/include/asm/idle.h
+++ b/arch/s390/include/asm/idle.h
@@ -8,10 +8,12 @@
#ifndef _S390_IDLE_H
#define _S390_IDLE_H

+#include <linux/percpu-defs.h>
#include <linux/types.h>
#include <linux/device.h>

struct s390_idle_data {
+ bool idle_dyntick;
unsigned long idle_count;
unsigned long idle_time;
unsigned long clock_idle_enter;
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 1f1b06b6b4ef..4685d7c5bc51 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -31,7 +31,10 @@ void account_idle_time_irq(void)
/* Account time spent with enabled wait psw loaded as idle time. */
__atomic64_add(idle_time, &idle->idle_time);
__atomic64_add_const(1, &idle->idle_count);
- account_idle_time(cputime_to_nsecs(idle_time));
+
+ /* Dyntick idle time accounted by nohz/scheduler */
+ if (!idle->idle_dyntick)
+ account_idle_time(cputime_to_nsecs(idle_time));
}

void noinstr arch_cpu_idle(void)
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index bf48744d0912..d1102a6f80bd 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -17,6 +17,7 @@
#include <asm/vtimer.h>
#include <asm/vtime.h>
#include <asm/cpu_mf.h>
+#include <asm/idle.h>
#include <asm/smp.h>

#include "entry.h"
@@ -110,6 +111,16 @@ static void account_system_index_scaled(struct task_struct *p, u64 cputime,
account_system_index_time(p, cputime_to_nsecs(cputime), index);
}

+static inline void vtime_reset_last_update(struct lowcore *lc)
+{
+ asm volatile(
+ " stpt %0\n" /* Store current cpu timer value */
+ " stckf %1" /* Store current tod clock value */
+ : "=Q" (lc->last_update_timer),
+ "=Q" (lc->last_update_clock)
+ : : "cc");
+}
+
/*
* Update process times based on virtual cpu times stored by entry.S
* to the lowcore fields user_timer, system_timer & steal_clock.
@@ -121,17 +132,16 @@ static int do_account_vtime(struct task_struct *tsk)

timer = lc->last_update_timer;
clock = lc->last_update_clock;
- asm volatile(
- " stpt %0\n" /* Store current cpu timer value */
- " stckf %1" /* Store current tod clock value */
- : "=Q" (lc->last_update_timer),
- "=Q" (lc->last_update_clock)
- : : "cc");
+
+ vtime_reset_last_update(lc);
+
clock = lc->last_update_clock - clock;
timer -= lc->last_update_timer;

if (hardirq_count())
lc->hardirq_timer += timer;
+ else if (in_serving_softirq())
+ lc->softirq_timer += timer;
else
lc->system_timer += timer;

@@ -231,13 +241,62 @@ EXPORT_SYMBOL_GPL(vtime_account_kernel);

void vtime_account_softirq(struct task_struct *tsk)
{
- get_lowcore()->softirq_timer += vtime_delta();
+ if (!__this_cpu_read(s390_idle.idle_dyntick))
+ get_lowcore()->softirq_timer += vtime_delta();
+ else
+ vtime_flush(tsk);
}

void vtime_account_hardirq(struct task_struct *tsk)
{
- get_lowcore()->hardirq_timer += vtime_delta();
+ if (!__this_cpu_read(s390_idle.idle_dyntick)) {
+ get_lowcore()->hardirq_timer += vtime_delta();
+ } else {
+ /*
+ * In dynticks mode, the idle cputime is accounted by the nohz
+ * subsystem. Therefore the s390 timer/clocks are reset on IRQ
+ * entry and steal time must be accounted now.
+ */
+ vtime_flush(tsk);
+ }
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/**
+ * vtime_reset - Fast forward vtime entry clocks
+ *
+ * Called from dynticks idle IRQ entry to fast-forward the clocks to current time
+ * so that the IRQ time is still accounted by vtime while nohz cputime is paused.
+ */
+void vtime_reset(void)
+{
+ vtime_reset_last_update(get_lowcore());
+}
+
+/**
+ * vtime_dyntick_start - Inform vtime about entry to idle-dynticks
+ *
+ * Called when idle enters in dyntick mode. The idle cputime that elapsed so far
+ * is flushed and the tick subsystem takes over the idle cputime accounting.
+ */
+void vtime_dyntick_start(void)
+{
+ __this_cpu_write(s390_idle.idle_dyntick, true);
+ vtime_flush(current);
+}
+
+/**
+ * vtime_dyntick_stop - Inform vtime about exit from idle-dynticks
+ *
+ * Called when idle exits from dyntick mode. The vtime entry clocks are
+ * fast-forward to current time and idle accounting resumes.
+ */
+void vtime_dyntick_stop(void)
+{
+ vtime_reset_last_update(get_lowcore());
+ __this_cpu_write(s390_idle.idle_dyntick, false);
}
+#endif /* CONFIG_NO_HZ_COMMON */

/*
* Sorted add to a list. List is linear searched until first bigger
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 44eb1b7e7fc1..dda0d34d3c02 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -130,38 +130,11 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
}
EXPORT_SYMBOL_GPL(get_governor_parent_kobj);

-static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
-{
- struct kernel_cpustat kcpustat;
- u64 cur_wall_time;
- u64 idle_time;
- u64 busy_time;
-
- cur_wall_time = jiffies64_to_nsecs(get_jiffies_64());
-
- kcpustat_cpu_fetch(&kcpustat, cpu);
-
- busy_time = kcpustat.cpustat[CPUTIME_USER];
- busy_time += kcpustat.cpustat[CPUTIME_SYSTEM];
- busy_time += kcpustat.cpustat[CPUTIME_IRQ];
- busy_time += kcpustat.cpustat[CPUTIME_SOFTIRQ];
- busy_time += kcpustat.cpustat[CPUTIME_STEAL];
- busy_time += kcpustat.cpustat[CPUTIME_NICE];
-
- idle_time = cur_wall_time - busy_time;
- if (wall)
- *wall = div_u64(cur_wall_time, NSEC_PER_USEC);
-
- return div_u64(idle_time, NSEC_PER_USEC);
-}
-
u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy)
{
u64 idle_time = get_cpu_idle_time_us(cpu, io_busy ? wall : NULL);

- if (idle_time == -1ULL)
- return get_cpu_idle_time_jiffy(cpu, wall);
- else if (!io_busy)
+ if (!io_busy)
idle_time += get_cpu_iowait_time_us(cpu, wall);

return idle_time;
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 86f35e451914..3c4a1f9af3ae 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -105,7 +105,7 @@ void gov_update_cpu_data(struct dbs_data *dbs_data)
j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_update_time,
dbs_data->io_is_busy);
if (dbs_data->ignore_nice_load)
- j_cdbs->prev_cpu_nice = kcpustat_field(&kcpustat_cpu(j), CPUTIME_NICE, j);
+ j_cdbs->prev_cpu_nice = kcpustat_field(CPUTIME_NICE, j);
}
}
}
@@ -165,7 +165,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
j_cdbs->prev_cpu_idle = cur_idle_time;

if (ignore_nice) {
- u64 cur_nice = kcpustat_field(&kcpustat_cpu(j), CPUTIME_NICE, j);
+ u64 cur_nice = kcpustat_field(CPUTIME_NICE, j);

idle_time += div_u64(cur_nice - j_cdbs->prev_cpu_nice, NSEC_PER_USEC);
j_cdbs->prev_cpu_nice = cur_nice;
@@ -539,7 +539,7 @@ int cpufreq_dbs_governor_start(struct cpufreq_policy *policy)
j_cdbs->prev_load = 0;

if (ignore_nice)
- j_cdbs->prev_cpu_nice = kcpustat_field(&kcpustat_cpu(j), CPUTIME_NICE, j);
+ j_cdbs->prev_cpu_nice = kcpustat_field(CPUTIME_NICE, j);
}

gov->start(policy);
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c
index 8a1e2c08b096..26cb93191ede 100644
--- a/drivers/macintosh/rack-meter.c
+++ b/drivers/macintosh/rack-meter.c
@@ -87,7 +87,7 @@ static inline u64 get_cpu_idle_time(unsigned int cpu)
kcpustat->cpustat[CPUTIME_IOWAIT];

if (rackmeter_ignore_nice)
- retval += kcpustat_field(kcpustat, CPUTIME_NICE, cpu);
+ retval += kcpustat_field(CPUTIME_NICE, cpu);

return retval;
}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 8b444e862319..c00468a83f64 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -22,38 +22,6 @@
#define arch_irq_stat() 0
#endif

-u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
-{
- u64 idle, idle_usecs = -1ULL;
-
- if (cpu_online(cpu))
- idle_usecs = get_cpu_idle_time_us(cpu, NULL);
-
- if (idle_usecs == -1ULL)
- /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcs->cpustat[CPUTIME_IDLE];
- else
- idle = idle_usecs * NSEC_PER_USEC;
-
- return idle;
-}
-
-static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
-{
- u64 iowait, iowait_usecs = -1ULL;
-
- if (cpu_online(cpu))
- iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
-
- if (iowait_usecs == -1ULL)
- /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
- iowait = kcs->cpustat[CPUTIME_IOWAIT];
- else
- iowait = iowait_usecs * NSEC_PER_USEC;
-
- return iowait;
-}
-
static void show_irq_gap(struct seq_file *p, unsigned int gap)
{
static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
@@ -105,8 +73,8 @@ static int show_stat(struct seq_file *p, void *v)
user += cpustat[CPUTIME_USER];
nice += cpustat[CPUTIME_NICE];
system += cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(&kcpustat, i);
- iowait += get_iowait_time(&kcpustat, i);
+ idle += cpustat[CPUTIME_IDLE];
+ iowait += cpustat[CPUTIME_IOWAIT];
irq += cpustat[CPUTIME_IRQ];
softirq += cpustat[CPUTIME_SOFTIRQ];
steal += cpustat[CPUTIME_STEAL];
@@ -146,8 +114,8 @@ static int show_stat(struct seq_file *p, void *v)
user = cpustat[CPUTIME_USER];
nice = cpustat[CPUTIME_NICE];
system = cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(&kcpustat, i);
- iowait = get_iowait_time(&kcpustat, i);
+ idle = cpustat[CPUTIME_IDLE];
+ iowait = cpustat[CPUTIME_IOWAIT];
irq = cpustat[CPUTIME_IRQ];
softirq = cpustat[CPUTIME_SOFTIRQ];
steal = cpustat[CPUTIME_STEAL];
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index b5343d209381..433aa947cd57 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -18,12 +18,8 @@ static int uptime_proc_show(struct seq_file *m, void *v)
int i;

idle_nsec = 0;
- for_each_possible_cpu(i) {
- struct kernel_cpustat kcs;
-
- kcpustat_cpu_fetch(&kcs, i);
- idle_nsec += get_idle_time(&kcs, i);
- }
+ for_each_possible_cpu(i)
+ idle_nsec += kcpustat_field(CPUTIME_IDLE, i);

ktime_get_boottime_ts64(&uptime);
timens_add_boottime(&uptime);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b97ce2df376f..fce1392e2140 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -34,7 +34,14 @@ enum cpu_usage_stat {
};

struct kernel_cpustat {
- u64 cpustat[NR_STATS];
+#ifdef CONFIG_NO_HZ_COMMON
+ bool idle_dyntick;
+ bool idle_elapse;
+ seqcount_t idle_sleeptime_seq;
+ u64 idle_entrytime;
+ u64 idle_stealtime[2];
+#endif
+ u64 cpustat[NR_STATS];
};

struct kernel_stat {
@@ -99,23 +106,68 @@ static inline unsigned long kstat_cpu_irqs_sum(unsigned int cpu)
return kstat_cpu(cpu).irqs_sum;
}

+#ifdef CONFIG_NO_HZ_COMMON
+extern void kcpustat_dyntick_start(u64 now);
+extern void kcpustat_dyntick_stop(u64 now);
+extern void kcpustat_irq_enter(u64 now);
+extern void kcpustat_irq_exit(u64 now);
+extern u64 kcpustat_field_idle(int cpu);
+extern u64 kcpustat_field_iowait(int cpu);
+
+static inline bool kcpustat_idle_dyntick(void)
+{
+ return __this_cpu_read(kernel_cpustat.idle_dyntick);
+}
+#else
+static inline u64 kcpustat_field_idle(int cpu)
+{
+ return kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+}
+static inline u64 kcpustat_field_iowait(int cpu)
+{
+ return kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+}
+
+static inline bool kcpustat_idle_dyntick(void)
+{
+ return false;
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
+extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
+
+/* Fetch cputime values when vtime is disabled on a CPU */
+static inline u64 kcpustat_field_default(enum cpu_usage_stat usage, int cpu)
+{
+ if (usage == CPUTIME_IDLE)
+ return kcpustat_field_idle(cpu);
+ if (usage == CPUTIME_IOWAIT)
+ return kcpustat_field_iowait(cpu);
+ return kcpustat_cpu(cpu).cpustat[usage];
+}
+
+static inline void kcpustat_cpu_fetch_default(struct kernel_cpustat *dst, int cpu)
+{
+ *dst = kcpustat_cpu(cpu);
+ dst->cpustat[CPUTIME_IDLE] = kcpustat_field_idle(cpu);
+ dst->cpustat[CPUTIME_IOWAIT] = kcpustat_field_iowait(cpu);
+}
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-extern u64 kcpustat_field(struct kernel_cpustat *kcpustat,
- enum cpu_usage_stat usage, int cpu);
+extern u64 kcpustat_field(enum cpu_usage_stat usage, int cpu);
extern void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu);
#else
-static inline u64 kcpustat_field(struct kernel_cpustat *kcpustat,
- enum cpu_usage_stat usage, int cpu)
+static inline u64 kcpustat_field(enum cpu_usage_stat usage, int cpu)
{
- return kcpustat->cpustat[usage];
+ return kcpustat_field_default(usage, cpu);
}

static inline void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
- *dst = kcpustat_cpu(cpu);
+ kcpustat_cpu_fetch_default(dst, cpu);
}
-
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */

extern void account_user_time(struct task_struct *, u64);
extern void account_guest_time(struct task_struct *, u64);
@@ -124,19 +176,17 @@ extern void account_system_index_time(struct task_struct *, u64,
enum cpu_usage_stat);
extern void account_steal_time(u64);
extern void account_idle_time(u64);
-extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline void account_process_tick(struct task_struct *tsk, int user)
{
- vtime_flush(tsk);
+ if (!kcpustat_idle_dyntick())
+ vtime_flush(tsk);
}
#else
extern void account_process_tick(struct task_struct *, int user);
#endif

-extern void account_idle_ticks(unsigned long ticks);
-
#ifdef CONFIG_SCHED_CORE
extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
#endif
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 738007d6f577..1cf4651f09ad 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -139,8 +139,6 @@ extern bool tick_nohz_idle_got_tick(void);
extern ktime_t tick_nohz_get_next_hrtimer(void);
extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next);
extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
-extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
-extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
#else /* !CONFIG_NO_HZ_COMMON */
#define tick_nohz_enabled (0)
static inline bool tick_nohz_is_active(void) { return false; }
@@ -162,8 +160,6 @@ static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
*delta_next = TICK_NSEC;
return *delta_next;
}
-static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
-static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
#endif /* !CONFIG_NO_HZ_COMMON */

/*
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 29dd5b91dd7d..9dc25b04a119 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -10,7 +10,6 @@
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void vtime_account_kernel(struct task_struct *tsk);
-extern void vtime_account_idle(struct task_struct *tsk);
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -27,16 +26,33 @@ static inline void vtime_guest_exit(struct task_struct *tsk) { }
static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
#endif

+static inline bool vtime_generic_enabled_cpu(int cpu)
+{
+ return context_tracking_enabled_cpu(cpu);
+}
+
+static inline bool vtime_generic_enabled_this_cpu(void)
+{
+ return context_tracking_enabled_this_cpu();
+}
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+extern void vtime_account_idle(struct task_struct *tsk);
extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
extern void vtime_account_softirq(struct task_struct *tsk);
extern void vtime_account_hardirq(struct task_struct *tsk);
extern void vtime_flush(struct task_struct *tsk);
+extern void vtime_reset(void);
+extern void vtime_dyntick_start(void);
+extern void vtime_dyntick_stop(void);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
static inline void vtime_account_softirq(struct task_struct *tsk) { }
static inline void vtime_account_hardirq(struct task_struct *tsk) { }
static inline void vtime_flush(struct task_struct *tsk) { }
+static inline void vtime_reset(void) { }
+static inline void vtime_dyntick_start(void) { }
+static inline void vtime_dyntick_stop(void) { }
#endif

/*
@@ -74,12 +90,12 @@ static inline bool vtime_accounting_enabled(void)

static inline bool vtime_accounting_enabled_cpu(int cpu)
{
- return context_tracking_enabled_cpu(cpu);
+ return vtime_generic_enabled_cpu(cpu);
}

static inline bool vtime_accounting_enabled_this_cpu(void)
{
- return context_tracking_enabled_this_cpu();
+ return vtime_generic_enabled_this_cpu();
}

extern void vtime_task_switch_generic(struct task_struct *prev);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 55df6d37145e..3cbf79bee976 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -969,14 +969,11 @@ static int rcu_watching_snap_recheck(struct rcu_data *rdp)
if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
int cpu = rdp->cpu;
struct rcu_snap_record *rsrp;
- struct kernel_cpustat *kcsp;
-
- kcsp = &kcpustat_cpu(cpu);

rsrp = &rdp->snap_record;
- rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
- rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
- rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+ rsrp->cputime_irq = kcpustat_field(CPUTIME_IRQ, cpu);
+ rsrp->cputime_softirq = kcpustat_field(CPUTIME_SOFTIRQ, cpu);
+ rsrp->cputime_system = kcpustat_field(CPUTIME_SYSTEM, cpu);
rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu);
rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu);
rsrp->nr_csw = nr_context_switches_cpu(cpu);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b67532cb8770..cf7ae51cba40 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -479,7 +479,6 @@ static void print_cpu_stat_info(int cpu)
{
struct rcu_snap_record rsr, *rsrp;
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu);

if (!rcu_cpu_stall_cputime)
return;
@@ -488,9 +487,9 @@ static void print_cpu_stat_info(int cpu)
if (rsrp->gp_seq != rdp->gp_seq)
return;

- rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
- rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
- rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+ rsr.cputime_irq = kcpustat_field(CPUTIME_IRQ, cpu);
+ rsr.cputime_softirq = kcpustat_field(CPUTIME_SOFTIRQ, cpu);
+ rsr.cputime_system = kcpustat_field(CPUTIME_SYSTEM, cpu);

pr_err("\t hardirqs softirqs csw/system\n");
pr_err("\t number: %8lld %10d %12lld\n",
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b8871449d3c6..d797d6696c58 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5518,7 +5518,11 @@ void sched_exec(void)
}

DEFINE_PER_CPU(struct kernel_stat, kstat);
-DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat) = {
+#ifdef CONFIG_NO_HZ_COMMON
+ .idle_sleeptime_seq = SEQCNT_ZERO(kernel_cpustat.idle_sleeptime_seq)
+#endif
+};

EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index fbf31db0d2f3..244b57417240 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,6 +2,7 @@
/*
* Simple CPU accounting cgroup controller
*/
+#include <linux/sched/clock.h>
#include <linux/sched/cputime.h>
#include <linux/tsacct_kern.h>
#include "sched.h"
@@ -46,7 +47,8 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
u64_stats_update_begin(&irqtime->sync);
cpustat[idx] += delta;
irqtime->total += delta;
- irqtime->tick_delta += delta;
+ if (!kcpustat_idle_dyntick())
+ irqtime->tick_delta += delta;
u64_stats_update_end(&irqtime->sync);
}

@@ -414,16 +416,219 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
}
}

-static void irqtime_account_idle_ticks(int ticks)
-{
- irqtime_account_process_tick(current, 0, ticks);
-}
#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
-static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
int nr_ticks) { }
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

+#ifdef CONFIG_NO_HZ_COMMON
+static void kcpustat_idle_stop(struct kernel_cpustat *kc, u64 now)
+{
+ u64 *cpustat = kc->cpustat;
+ u64 delta, steal, steal_delta;
+ int iowait;
+
+ if (!kc->idle_elapse)
+ return;
+
+ iowait = nr_iowait_cpu(smp_processor_id()) > 0;
+ delta = now - kc->idle_entrytime;
+ steal = steal_account_process_time(delta);
+
+ /*
+ * Record the idle time after substracting the steal time from
+ * previous update sequence. Don't substract the steal time from
+ * the current update sequence to avoid readers moving backward.
+ */
+ write_seqcount_begin(&kc->idle_sleeptime_seq);
+ steal_delta = min_t(u64, kc->idle_stealtime[iowait], delta);
+ delta -= steal_delta;
+ kc->idle_stealtime[iowait] -= steal_delta;
+
+ if (iowait)
+ cpustat[CPUTIME_IOWAIT] += delta;
+ else
+ cpustat[CPUTIME_IDLE] += delta;
+
+ kc->idle_stealtime[iowait] += steal;
+ kc->idle_entrytime = now;
+ kc->idle_elapse = false;
+ write_seqcount_end(&kc->idle_sleeptime_seq);
+}
+
+static void kcpustat_idle_start(struct kernel_cpustat *kc, u64 now)
+{
+ /* Irqtime accounting might have been enabled in the middle of the IRQ */
+ if (kc->idle_elapse)
+ return;
+
+ write_seqcount_begin(&kc->idle_sleeptime_seq);
+ kc->idle_entrytime = now;
+ kc->idle_elapse = true;
+ write_seqcount_end(&kc->idle_sleeptime_seq);
+}
+
+void kcpustat_dyntick_stop(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ if (!vtime_generic_enabled_this_cpu()) {
+ WARN_ON_ONCE(!kc->idle_dyntick);
+ kcpustat_idle_stop(kc, now);
+ kc->idle_dyntick = false;
+ vtime_dyntick_stop();
+ }
+}
+
+void kcpustat_dyntick_start(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ if (!vtime_generic_enabled_this_cpu()) {
+ vtime_dyntick_start();
+ kc->idle_dyntick = true;
+ kcpustat_idle_start(kc, now);
+ }
+}
+
+void kcpustat_irq_enter(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ if (!vtime_generic_enabled_this_cpu() &&
+ (irqtime_enabled() || vtime_accounting_enabled_this_cpu()))
+ kcpustat_idle_stop(kc, now);
+}
+
+void kcpustat_irq_exit(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ /*
+ * Generic vtime already does its own idle accounting.
+ * But irqtime accounting or arch vtime which also accounts IRQs
+ * need to pause nohz accounting. Resume nohz accounting as long
+ * as the irqtime config is enabled to handle case where irqtime
+ * accounting got runtime disabled in the middle of an IRQ.
+ */
+ if (!vtime_generic_enabled_this_cpu() &&
+ (IS_ENABLED(CONFIG_IRQ_TIME_ACCOUNTING) || vtime_accounting_enabled_this_cpu()))
+ kcpustat_idle_start(kc, now);
+}
+
+static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx,
+ bool compute_delta, u64 now)
+{
+ struct kernel_cpustat *kc = &kcpustat_cpu(cpu);
+ int iowait = idx == CPUTIME_IOWAIT;
+ u64 *cpustat = kc->cpustat;
+ unsigned int seq;
+ u64 idle;
+
+ do {
+ seq = read_seqcount_begin(&kc->idle_sleeptime_seq);
+
+ idle = cpustat[idx];
+
+ if (kc->idle_elapse && compute_delta && now > kc->idle_entrytime) {
+ u64 delta = now - kc->idle_entrytime;
+
+ delta -= min_t(u64, kc->idle_stealtime[iowait], delta);
+ idle += delta;
+ }
+ } while (read_seqcount_retry(&kc->idle_sleeptime_seq, seq));
+
+ return idle;
+}
+
+u64 kcpustat_field_idle(int cpu)
+{
+ return kcpustat_field_dyntick(cpu, CPUTIME_IDLE,
+ !nr_iowait_cpu(cpu), ktime_get());
+}
+EXPORT_SYMBOL_GPL(kcpustat_field_idle);
+
+u64 kcpustat_field_iowait(int cpu)
+{
+ return kcpustat_field_dyntick(cpu, CPUTIME_IOWAIT,
+ nr_iowait_cpu(cpu), ktime_get());
+}
+EXPORT_SYMBOL_GPL(kcpustat_field_iowait);
+#else
+static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx,
+ bool compute_delta, ktime_t now)
+{
+ return kcpustat_cpu(cpu).cpustat[idx];
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx,
+ bool compute_delta, u64 *last_update_time)
+{
+ ktime_t now = ktime_get();
+ u64 res;
+
+ if (vtime_generic_enabled_cpu(cpu))
+ res = kcpustat_field(idx, cpu);
+ else
+ res = kcpustat_field_dyntick(cpu, idx, compute_delta, now);
+
+ do_div(res, NSEC_PER_USEC);
+
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+ return res;
+}
+
+/**
+ * get_cpu_idle_time_us - get the total idle time of a CPU
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cumulative idle time (since boot) for a given
+ * CPU, in microseconds. Note that this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * Return: total idle time of the @cpu
+ */
+u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+{
+ return get_cpu_sleep_time_us(cpu, CPUTIME_IDLE,
+ !nr_iowait_cpu(cpu), last_update_time);
+}
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
+
+/**
+ * get_cpu_iowait_time_us - get the total iowait time of a CPU
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cumulative iowait time (since boot) for a given
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * Return: total iowait time of @cpu
+ */
+u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
+{
+ return get_cpu_sleep_time_us(cpu, CPUTIME_IOWAIT,
+ nr_iowait_cpu(cpu), last_update_time);
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+
/*
* Use precise platform statistics if available:
*/
@@ -437,11 +642,15 @@ void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
vtime_account_hardirq(tsk);
} else if (pc & SOFTIRQ_OFFSET) {
vtime_account_softirq(tsk);
- } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
- is_idle_task(tsk)) {
- vtime_account_idle(tsk);
+ } else if (!kcpustat_idle_dyntick()) {
+ if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
+ is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ } else {
+ vtime_account_kernel(tsk);
+ }
} else {
- vtime_account_kernel(tsk);
+ vtime_reset();
}
}

@@ -483,6 +692,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (vtime_accounting_enabled_this_cpu())
return;

+ if (kcpustat_idle_dyntick())
+ return;
+
if (irqtime_enabled()) {
irqtime_account_process_tick(p, user_tick, 1);
return;
@@ -504,29 +716,6 @@ void account_process_tick(struct task_struct *p, int user_tick)
account_idle_time(cputime);
}

-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
- u64 cputime, steal;
-
- if (irqtime_enabled()) {
- irqtime_account_idle_ticks(ticks);
- return;
- }
-
- cputime = ticks * TICK_NSEC;
- steal = steal_account_process_time(ULONG_MAX);
-
- if (steal >= cputime)
- return;
-
- cputime -= steal;
- account_idle_time(cputime);
-}
-
/*
* Adjust tick based cputime random precision against scheduler runtime
* accounting.
@@ -773,9 +962,9 @@ void vtime_guest_exit(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);

-void vtime_account_idle(struct task_struct *tsk)
+static void __vtime_account_idle(struct vtime *vtime)
{
- account_idle_time(get_vtime_delta(&tsk->vtime));
+ account_idle_time(get_vtime_delta(vtime));
}

void vtime_task_switch_generic(struct task_struct *prev)
@@ -784,7 +973,7 @@ void vtime_task_switch_generic(struct task_struct *prev)

write_seqcount_begin(&vtime->seqcount);
if (vtime->state == VTIME_IDLE)
- vtime_account_idle(prev);
+ __vtime_account_idle(vtime);
else
__vtime_account_kernel(prev, vtime);
vtime->state = VTIME_INACTIVE;
@@ -926,6 +1115,7 @@ static int kcpustat_field_vtime(u64 *cpustat,
int cpu, u64 *val)
{
struct vtime *vtime = &tsk->vtime;
+ struct rq *rq = cpu_rq(cpu);
unsigned int seq;

do {
@@ -967,6 +1157,14 @@ static int kcpustat_field_vtime(u64 *cpustat,
if (state == VTIME_GUEST && task_nice(tsk) > 0)
*val += vtime->gtime + vtime_delta(vtime);
break;
+ case CPUTIME_IDLE:
+ if (state == VTIME_IDLE && !atomic_read(&rq->nr_iowait))
+ *val += vtime_delta(vtime);
+ break;
+ case CPUTIME_IOWAIT:
+ if (state == VTIME_IDLE && atomic_read(&rq->nr_iowait) > 0)
+ *val += vtime_delta(vtime);
+ break;
default:
break;
}
@@ -975,16 +1173,15 @@ static int kcpustat_field_vtime(u64 *cpustat,
return 0;
}

-u64 kcpustat_field(struct kernel_cpustat *kcpustat,
- enum cpu_usage_stat usage, int cpu)
+u64 kcpustat_field(enum cpu_usage_stat usage, int cpu)
{
- u64 *cpustat = kcpustat->cpustat;
+ u64 *cpustat = kcpustat_cpu(cpu).cpustat;
u64 val = cpustat[usage];
struct rq *rq;
int err;

- if (!vtime_accounting_enabled_cpu(cpu))
- return val;
+ if (!vtime_generic_enabled_cpu(cpu))
+ return kcpustat_field_default(usage, cpu);

rq = cpu_rq(cpu);

@@ -1030,8 +1227,8 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
*dst = *src;
cpustat = dst->cpustat;

- /* Task is sleeping, dead or idle, nothing to add */
- if (state < VTIME_SYS)
+ /* Task is sleeping or dead, nothing to add */
+ if (state < VTIME_IDLE)
continue;

delta = vtime_delta(vtime);
@@ -1040,15 +1237,17 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
* Task runs either in user (including guest) or kernel space,
* add pending nohz time to the right place.
*/
- if (state == VTIME_SYS) {
+ switch (state) {
+ case VTIME_SYS:
cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
- } else if (state == VTIME_USER) {
+ break;
+ case VTIME_USER:
if (task_nice(tsk) > 0)
cpustat[CPUTIME_NICE] += vtime->utime + delta;
else
cpustat[CPUTIME_USER] += vtime->utime + delta;
- } else {
- WARN_ON_ONCE(state != VTIME_GUEST);
+ break;
+ case VTIME_GUEST:
if (task_nice(tsk) > 0) {
cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
cpustat[CPUTIME_NICE] += vtime->gtime + delta;
@@ -1056,6 +1255,15 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
cpustat[CPUTIME_USER] += vtime->gtime + delta;
}
+ break;
+ case VTIME_IDLE:
+ if (atomic_read(&cpu_rq(cpu)->nr_iowait) > 0)
+ cpustat[CPUTIME_IOWAIT] += delta;
+ else
+ cpustat[CPUTIME_IDLE] += delta;
+ break;
+ default:
+ WARN_ON_ONCE(1);
}
} while (read_seqcount_retry(&vtime->seqcount, seq));

@@ -1068,8 +1276,8 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
struct rq *rq;
int err;

- if (!vtime_accounting_enabled_cpu(cpu)) {
- *dst = *src;
+ if (!vtime_generic_enabled_cpu(cpu)) {
+ kcpustat_cpu_fetch_default(dst, cpu);
return;
}

@@ -1082,7 +1290,7 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
curr = rcu_dereference(rq->curr);
if (WARN_ON_ONCE(!curr)) {
rcu_read_unlock();
- *dst = *src;
+ kcpustat_cpu_fetch_default(dst, cpu);
return;
}

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index a83be0c834dd..aa7e3dc59856 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -280,6 +280,14 @@ static void do_idle(void)
int cpu = smp_processor_id();
bool got_tick = false;

+ if (cpu_is_offline(cpu)) {
+ local_irq_disable();
+ /* All per-CPU kernel threads should be done by now. */
+ WARN_ON_ONCE(need_resched());
+ cpuhp_report_idle_dead();
+ arch_cpu_idle_dead();
+ }
+
/*
* Check if we need to update blocked load
*/
@@ -331,11 +339,6 @@ static void do_idle(void)
*/
local_irq_disable();

- if (cpu_is_offline(cpu)) {
- cpuhp_report_idle_dead();
- arch_cpu_idle_dead();
- }
-
arch_cpu_idle_enter();
rcu_nocb_flush_deferred_wakeup();

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cbbb87a0c6e7..c1ee0b256445 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -285,8 +285,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
touch_softlockup_watchdog_sched();
- if (is_idle_task(current))
- ts->idle_jiffies++;
/*
* In case the current tick fired too early past its expected
* expiration, make sure we don't bypass the next clock reprogramming
@@ -751,119 +749,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog_sched();
}

-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
- ktime_t delta;
-
- if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
- return;
-
- delta = ktime_sub(now, ts->idle_entrytime);
-
- write_seqcount_begin(&ts->idle_sleeptime_seq);
- if (nr_iowait_cpu(smp_processor_id()) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
- else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-
- ts->idle_entrytime = now;
- tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
- write_seqcount_end(&ts->idle_sleeptime_seq);
-
- sched_clock_idle_wakeup_event();
-}
-
-static void tick_nohz_start_idle(struct tick_sched *ts)
-{
- write_seqcount_begin(&ts->idle_sleeptime_seq);
- ts->idle_entrytime = ktime_get();
- tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
- write_seqcount_end(&ts->idle_sleeptime_seq);
-
- sched_clock_idle_sleep_event();
-}
-
-static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
- bool compute_delta, u64 *last_update_time)
-{
- ktime_t now, idle;
- unsigned int seq;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
-
- do {
- seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
-
- if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- idle = ktime_add(*sleeptime, delta);
- } else {
- idle = *sleeptime;
- }
- } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
-
- return ktime_to_us(idle);
-
-}
-
-/**
- * get_cpu_idle_time_us - get the total idle time of a CPU
- * @cpu: CPU number to query
- * @last_update_time: variable to store update time in. Do not update
- * counters if NULL.
- *
- * Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds. Note that this is partially broken due to
- * the counter of iowait tasks that can be remotely updated without
- * any synchronization. Therefore it is possible to observe backward
- * values within two consecutive reads.
- *
- * This time is measured via accounting rather than sampling,
- * and is as accurate as ktime_get() is.
- *
- * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
- */
-u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
-{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-
- return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
- !nr_iowait_cpu(cpu), last_update_time);
-}
-EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
-
-/**
- * get_cpu_iowait_time_us - get the total iowait time of a CPU
- * @cpu: CPU number to query
- * @last_update_time: variable to store update time in. Do not update
- * counters if NULL.
- *
- * Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds. Note this is partially broken due to
- * the counter of iowait tasks that can be remotely updated without
- * any synchronization. Therefore it is possible to observe backward
- * values within two consecutive reads.
- *
- * This time is measured via accounting rather than sampling,
- * and is as accurate as ktime_get() is.
- *
- * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
- */
-u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
-{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-
- return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
- nr_iowait_cpu(cpu), last_update_time);
-}
-EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
-
/* Simplified variant of hrtimer_forward_now() */
static ktime_t tick_forward_now(ktime_t expires, ktime_t now)
{
@@ -1273,7 +1158,7 @@ void tick_nohz_idle_stop_tick(void)
ts->idle_expires = expires;

if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
- ts->idle_jiffies = ts->last_jiffies;
+ kcpustat_dyntick_start(ts->idle_entrytime);
nohz_balance_enter_idle(cpu);
}
} else {
@@ -1286,6 +1171,20 @@ void tick_nohz_idle_retain_tick(void)
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
}

+static void tick_nohz_clock_sleep(struct tick_sched *ts)
+{
+ tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
+ sched_clock_idle_sleep_event();
+}
+
+static void tick_nohz_clock_wakeup(struct tick_sched *ts)
+{
+ if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) {
+ tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
+ sched_clock_idle_wakeup_event();
+ }
+}
+
/**
* tick_nohz_idle_enter - prepare for entering idle on the current CPU
*
@@ -1300,11 +1199,10 @@ void tick_nohz_idle_enter(void)
local_irq_disable();

ts = this_cpu_ptr(&tick_cpu_sched);
-
WARN_ON_ONCE(ts->timer_expires_base);
-
tick_sched_flag_set(ts, TS_FLAG_INIDLE);
- tick_nohz_start_idle(ts);
+ ts->idle_entrytime = ktime_get();
+ tick_nohz_clock_sleep(ts);

local_irq_enable();
}
@@ -1332,10 +1230,14 @@ void tick_nohz_irq_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

- if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
- tick_nohz_start_idle(ts);
- else
+ if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) {
+ tick_nohz_clock_sleep(ts);
+ ts->idle_entrytime = ktime_get();
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
+ kcpustat_irq_exit(ts->idle_entrytime);
+ } else {
tick_nohz_full_update_tick(ts);
+ }
}

/**
@@ -1429,36 +1331,20 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
return ts->idle_calls;
}

-static void tick_nohz_account_idle_time(struct tick_sched *ts,
- ktime_t now)
-{
- unsigned long ticks;
-
- ts->idle_exittime = now;
-
- if (vtime_accounting_enabled_this_cpu())
- return;
- /*
- * We stopped the tick in idle. update_process_times() would miss the
- * time we slept, as it does only a 1 tick accounting.
- * Enforce that this is accounted to idle !
- */
- ticks = jiffies - ts->idle_jiffies;
- /*
- * We might be one off. Do not randomly account a huge number of ticks!
- */
- if (ticks && ticks < LONG_MAX)
- account_idle_ticks(ticks);
-}
-
void tick_nohz_idle_restart_tick(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
- ktime_t now = ktime_get();
- tick_nohz_restart_sched_tick(ts, now);
- tick_nohz_account_idle_time(ts, now);
+ /*
+ * Update entrytime here in case the tick restart is due to temporary
+ * polling on forced broadcast. The tick may be stopped again later within
+ * the same idle trip. The idle_entrytime was updated recently but make sure
+ * no tiny amount of idle time is accounted twice.
+ */
+ ts->idle_entrytime = ktime_get();
+ kcpustat_dyntick_stop(ts->idle_entrytime);
+ tick_nohz_restart_sched_tick(ts, ts->idle_entrytime);
}
}

@@ -1468,8 +1354,6 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
__tick_nohz_full_update_tick(ts, now);
else
tick_nohz_restart_sched_tick(ts, now);
-
- tick_nohz_account_idle_time(ts, now);
}

/**
@@ -1491,7 +1375,6 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
void tick_nohz_idle_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- bool idle_active, tick_stopped;
ktime_t now;

local_irq_disable();
@@ -1500,17 +1383,13 @@ void tick_nohz_idle_exit(void)
WARN_ON_ONCE(ts->timer_expires_base);

tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
- idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
- tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
+ tick_nohz_clock_wakeup(ts);

- if (idle_active || tick_stopped)
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
now = ktime_get();
-
- if (idle_active)
- tick_nohz_stop_idle(ts, now);
-
- if (tick_stopped)
+ kcpustat_dyntick_stop(now);
tick_nohz_idle_update_tick(ts, now);
+ }

local_irq_enable();
}
@@ -1565,11 +1444,14 @@ static inline void tick_nohz_irq_enter(void)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;

- if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE))
+ tick_nohz_clock_wakeup(ts);
+
+ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED))
return;
+
now = ktime_get();
- if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
- tick_nohz_stop_idle(ts, now);
+ kcpustat_irq_enter(now);
+
/*
* If all CPUs are idle we may need to update a stale jiffies value.
* Note nohz_full is a special case: a timekeeper is guaranteed to stay
@@ -1577,8 +1459,7 @@ static inline void tick_nohz_irq_enter(void)
* rare case (typically stop machine). So we must make sure we have a
* last resort.
*/
- if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
- tick_nohz_update_jiffies(now);
+ tick_nohz_update_jiffies(now);
}

#else
@@ -1648,20 +1529,15 @@ void tick_setup_sched_timer(bool hrtimer)
void tick_sched_timer_dying(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t idle_sleeptime, iowait_sleeptime;
unsigned long idle_calls, idle_sleeps;

/* This must happen before hrtimers are migrated! */
if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
hrtimer_cancel(&ts->sched_timer);

- idle_sleeptime = ts->idle_sleeptime;
- iowait_sleeptime = ts->iowait_sleeptime;
idle_calls = ts->idle_calls;
idle_sleeps = ts->idle_sleeps;
memset(ts, 0, sizeof(*ts));
- ts->idle_sleeptime = idle_sleeptime;
- ts->iowait_sleeptime = iowait_sleeptime;
ts->idle_calls = idle_calls;
ts->idle_sleeps = idle_sleeps;
}
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index b4a7822f495d..79b9252047b1 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -44,9 +44,7 @@ struct tick_device {
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
- * @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_waketime: Time when the idle was interrupted
- * @idle_sleeptime_seq: sequence counter for data consistency
* @idle_entrytime: Time when the idle call was entered
* @last_jiffies: Base jiffies snapshot when next event was last computed
* @timer_expires_base: Base time clock monotonic for @timer_expires
@@ -55,9 +53,6 @@ struct tick_device {
* @idle_expires: Next tick in idle, for debugging purpose only
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_exittime: Time when the idle state was left
- * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
- * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
* @check_clocks: Notification mechanism about clocksource changes
*/
@@ -73,12 +68,10 @@ struct tick_sched {
struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
- unsigned long idle_jiffies;
ktime_t idle_waketime;
unsigned int got_idle_tick;

/* Idle entry */
- seqcount_t idle_sleeptime_seq;
ktime_t idle_entrytime;

/* Tick stop */
@@ -90,11 +83,6 @@ struct tick_sched {
unsigned long idle_calls;
unsigned long idle_sleeps;

- /* Idle exit */
- ktime_t idle_exittime;
- ktime_t idle_sleeptime;
- ktime_t iowait_sleeptime;
-
/* Full dynticks handling */
atomic_t tick_dep_mask;

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 427d7ddea3af..514802def1e0 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -152,14 +152,10 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P_flag(highres, TS_FLAG_HIGHRES);
P_ns(last_tick);
P_flag(tick_stopped, TS_FLAG_STOPPED);
- P(idle_jiffies);
P(idle_calls);
P(idle_sleeps);
P_ns(idle_entrytime);
P_ns(idle_waketime);
- P_ns(idle_exittime);
- P_ns(idle_sleeptime);
- P_ns(iowait_sleeptime);
P(last_jiffies);
P(next_timer);
P_ns(idle_expires);
@@ -256,7 +252,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)

static inline void timer_list_header(struct seq_file *m, u64 now)
{
- SEQ_printf(m, "Timer List Version: v0.10\n");
+ SEQ_printf(m, "Timer List Version: v0.11\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py
index 9fb3436a217c..744b032e4d38 100644
--- a/scripts/gdb/linux/timerlist.py
+++ b/scripts/gdb/linux/timerlist.py
@@ -90,14 +90,10 @@ def print_cpu(hrtimer_bases, cpu, max_clock_bases):
text += f" .{'nohz':15s}: {int(bool(ts['flags'] & TS_FLAG_NOHZ))}\n"
text += f" .{'last_tick':15s}: {ts['last_tick']}\n"
text += f" .{'tick_stopped':15s}: {int(bool(ts['flags'] & TS_FLAG_STOPPED))}\n"
- text += f" .{'idle_jiffies':15s}: {ts['idle_jiffies']}\n"
text += f" .{'idle_calls':15s}: {ts['idle_calls']}\n"
text += f" .{'idle_sleeps':15s}: {ts['idle_sleeps']}\n"
text += f" .{'idle_entrytime':15s}: {ts['idle_entrytime']} nsecs\n"
text += f" .{'idle_waketime':15s}: {ts['idle_waketime']} nsecs\n"
- text += f" .{'idle_exittime':15s}: {ts['idle_exittime']} nsecs\n"
- text += f" .{'idle_sleeptime':15s}: {ts['idle_sleeptime']} nsecs\n"
- text += f" .{'iowait_sleeptime':15s}: {ts['iowait_sleeptime']} nsecs\n"
text += f" .{'last_jiffies':15s}: {ts['last_jiffies']}\n"
text += f" .{'next_timer':15s}: {ts['next_timer']}\n"
text += f" .{'idle_expires':15s}: {ts['idle_expires']} nsecs\n"