[PATCH] irqtime: exclude steal time on paravirt

From: Xiang Lin
Date: Thu Jul 18 2024 - 20:00:17 EST


On a paravirt guest, the measured irq time may include some steal
time, so the steal time delta needs to be subtracted when accounting
irqtime. In addition, user and system time are computed by excluding
both irq time and steal time; if irq time already includes some steal
time, that steal time is effectively subtracted twice, which makes
user or system time smaller than the actual value.

Use the steps below to reproduce the problem:
1. Run qemu and pin the vCPU to a physical CPU (e.g., cpu15):
taskset 0x8000 qemu-system-x86_64 -nographic -enable-kvm \
-kernel bzImage -append "console=ttyS0,115200 nokaslr" \
-initrd rootfs.cpio.gz -nic user,hostfwd=tcp::2222-:22
2. Run the command below on the guest console to generate some irq load:
(guest) top -d 1 -b
3. ssh to the guest and monitor its CPU usage:
(guest) top -d 1 -b | grep ^%Cpu
4. Run a stress workload on the host, pinned to the same physical CPU as the vCPU:
(host) chrt -f 2 stress-ng --cpu 1 --cpu-load 90 --taskset 15

Before the patch, the reported irq usage fluctuates wildly:
%Cpu(s):0.0 us,0.0 sy,0.0 ni,5.6 id,0.0 wa, 16.7 hi, 0.0 si, 77.8 st
%Cpu(s):0.0 us,0.0 sy,0.0 ni,8.1 id,0.0 wa, 2.0 hi, 0.0 si, 89.9 st
%Cpu(s):0.0 us,0.7 sy,0.0 ni,6.2 id,0.0 wa, 15.9 hi, 0.0 si, 77.2 st
%Cpu(s):0.0 us,0.0 sy,0.0 ni,8.7 id,0.0 wa, 2.9 hi, 0.0 si, 88.5 st
%Cpu(s):0.6 us,0.0 sy,0.0 ni,5.0 id,0.0 wa, 16.7 hi, 0.0 si, 77.8 st
%Cpu(s):0.0 us,0.8 sy,0.0 ni,9.4 id,0.0 wa, 0.0 hi, 0.0 si, 89.8 st
%Cpu(s):0.0 us,0.0 sy,0.0 ni,6.9 id,0.0 wa, 15.9 hi, 0.0 si, 77.2 st
....

After the patch, the reported irq usage is steady:
%Cpu(s):0.0 us,0.0 sy,0.0 ni,7.2 id,0.0 wa, 2.4 hi, 0.0 si, 90.4 st
%Cpu(s):0.0 us,0.8 sy,0.0 ni,7.9 id,0.0 wa, 1.6 hi, 0.0 si, 89.7 st
%Cpu(s):0.8 us,0.0 sy,0.0 ni,7.2 id,0.0 wa, 2.4 hi, 0.0 si, 89.6 st
%Cpu(s):0.0 us,0.8 sy,0.0 ni,8.0 id,0.0 wa, 1.6 hi, 0.0 si, 89.6 st
%Cpu(s):0.0 us,0.0 sy,0.0 ni,7.2 id,0.0 wa, 2.4 hi, 0.0 si, 90.4 st
%Cpu(s):0.0 us,0.8 sy,0.0 ni,7.9 id,0.0 wa, 1.6 hi, 0.0 si, 89.7 st
%Cpu(s):0.0 us,0.0 sy,0.0 ni,7.2 id,0.0 wa, 2.4 hi, 0.0 si, 90.4 st
....

Signed-off-by: Xiang Lin <myd.xia@xxxxxxxxx>
---
kernel/sched/cputime.c | 20 +++++++++++++++++++-
kernel/sched/sched.h | 3 +++
2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a5e00293ae43..bc00296f8f9b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -46,6 +46,23 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
u64_stats_update_end(&irqtime->sync);
}

+static u64 steal_irqtime_account(bool irq_entry)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_key_false(&paravirt_steal_enabled)) {
+ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ u64 delta, steal;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ delta = steal - irqtime->steal_start_time;
+ irqtime->steal_start_time += delta;
+
+ return irq_entry ? 0 : delta;
+ }
+#endif
+ return 0;
+}
+
/*
* Called after incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
@@ -54,7 +71,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
unsigned int pc;
- s64 delta;
+ u64 delta;
int cpu;

if (!sched_clock_irqtime)
@@ -64,6 +81,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
pc = irq_count() - offset;
+ delta -= min(delta, steal_irqtime_account(!pc));

/*
* We do not account for softirq time from ksoftirqd here.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4c36cc680361..b5389bc8062f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2981,6 +2981,9 @@ struct irqtime {
u64 total;
u64 tick_delta;
u64 irq_start_time;
+#ifdef CONFIG_PARAVIRT
+ u64 steal_start_time;
+#endif
struct u64_stats_sync sync;
};

--
2.45.2