[PATCH v2 4/4] sched: Fix cgroup irq accounting for CONFIG_IRQ_TIME_ACCOUNTING

From: Yafang Shao
Date: Tue Oct 08 2024 - 02:50:46 EST


After enabling CONFIG_IRQ_TIME_ACCOUNTING to monitor IRQ pressure in our
container environment, we observed several noticeable behavioral changes.

One of our IRQ-heavy services, Redis, reported a significant reduction in
CPU usage after upgrading to the new kernel with CONFIG_IRQ_TIME_ACCOUNTING
enabled. However, adding more threads to handle an increased workload did
not raise the CPU usage any further. In other words, even though the
container’s CPU usage appeared low, the service could not make use of the
seemingly idle CPU to process more work, which caused problems.

This behavior can be demonstrated using netperf:

function start_server() {
    for j in `seq 1 3`; do
        netserver -p $[12345+j] > /dev/null &
    done
}

server_ip=$1
function start_client() {
    # The same behavior can be reproduced with cgroup2 as well.
    mkdir -p /sys/fs/cgroup/cpuacct/test
    echo $$ > /sys/fs/cgroup/cpuacct/test/cgroup.procs
    for j in `seq 1 3`; do
        port=$[12345+j]
        taskset -c 0 netperf -H ${server_ip} -l ${run_time:-30000} \
            -t TCP_STREAM -p $port -- -D -m 1k -M 1K -s 8k -S 8k \
            > /dev/null &
    done
}

start_server
start_client

We can verify the CPU usage of the test cgroup using cpuacct.stat. The
output shows:

system: 53
user: 2
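
A minimal sketch of how such per-second figures can be sampled is below. It
assumes the cgroup v1 path created by start_client above, and relies on
cpuacct.stat reporting cumulative user/system time in USER_HZ ticks
(typically 100 per second), so a one-second delta is roughly a percentage
of one CPU:

function sample_cpuacct() {
    # Path created by start_client above; adjust for your cgroup mount.
    local stat=/sys/fs/cgroup/cpuacct/test/cpuacct.stat
    local u1 s1 u2 s2
    u1=$(awk '/^user/ {print $2}' $stat)
    s1=$(awk '/^system/ {print $2}' $stat)
    sleep 1
    u2=$(awk '/^user/ {print $2}' $stat)
    s2=$(awk '/^system/ {print $2}' $stat)
    # One-second deltas in USER_HZ ticks, i.e. roughly percent of one CPU.
    echo "system: $((s2 - s1))"
    echo "user: $((u2 - u1))"
}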

The CPU usage of the cgroup is relatively low at around 55%, but it does
not increase even when more netperf tasks are added. The reason is that
CPU0 is already at 100% utilization, as confirmed by mpstat:

02:56:22 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
02:56:23 PM 0 0.99 0.00 55.45 0.00 0.99 42.57 0.00 0.00 0.00 0.00

02:56:23 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
02:56:24 PM 0 2.00 0.00 55.00 0.00 0.00 43.00 0.00 0.00 0.00 0.00
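
For reference, per-CPU figures like the above can be collected with
one-second mpstat samples for CPU0, e.g.:

    mpstat -P 0 1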

The remaining ~43% of CPU0 is spent in softirq (plus a little hardirq)
context servicing the netperf traffic, yet none of that time is charged to
the cgroup, so cpuacct reports only ~55% even though the CPU is fully
saturated. This behavior is unexpected. We should account the IRQ time to
the cgroup so that it reflects the pressure the group is under.

After a thorough analysis, I discovered that this change in behavior is due
to commit 305e6835e055 ("sched: Do not account irq time to current task"),
which stopped charging IRQ time to the interrupted task. While I agree that
a task should not be penalized by random interrupts, the task itself cannot
make progress while it is interrupted. Therefore, the interrupted time
should still be reported to the user.

The system metric in cpuacct.stat is crucial for indicating whether a
container is under heavy system pressure, including IRQ/softirq activity.
Hence, IRQ/softirq time should be accounted in the cpuacct system usage;
the same applies to cgroup2’s rstat.
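
As a rough cgroup2 counterpart of the cpuacct check above (the mount point
and the "test" group name are assumptions based on a default cgroup2
setup), the IRQ/softirq time accounted by this patch is expected to show up
in the system_usec field of cpu.stat:

mkdir -p /sys/fs/cgroup/test
echo $$ > /sys/fs/cgroup/test/cgroup.procs
# ... run the netperf workload as in start_client above ...
grep -E 'usage_usec|user_usec|system_usec' /sys/fs/cgroup/test/cpu.stat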

This patch reintroduces IRQ/softirq accounting to cgroups.

Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
---
kernel/sched/core.c | 39 +++++++++++++++++++++++++++++++++++++--
kernel/sched/psi.c | 15 +++------------
kernel/sched/stats.h | 7 ++++---
3 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b633a14a60f..533e015f8777 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5587,7 +5587,24 @@ void sched_tick(void)
rq_lock(rq, &rf);

curr = rq->curr;
- psi_account_irqtime(rq, curr, NULL);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (static_branch_likely(&sched_clock_irqtime)) {
+ u64 now, irq;
+ s64 delta;
+
+ now = cpu_clock(cpu);
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta > 0) {
+ rq->psi_irq_time = irq;
+ psi_account_irqtime(rq, curr, NULL, now, delta);
+ cgroup_account_cputime(curr, delta);
+ /* Both irq and softirq time are accounted as softirq */
+ cgroup_account_cputime_field(curr, CPUTIME_SOFTIRQ, delta);
+ }
+ }
+#endif

update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
@@ -6667,7 +6684,25 @@ static void __sched notrace __schedule(int sched_mode)
++*switch_count;

migrate_disable_switch(rq, prev);
- psi_account_irqtime(rq, prev, next);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (static_branch_likely(&sched_clock_irqtime)) {
+ u64 now, irq;
+ s64 delta;
+
+ now = cpu_clock(cpu);
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta > 0) {
+ rq->psi_irq_time = irq;
+ psi_account_irqtime(rq, prev, next, now, delta);
+ cgroup_account_cputime(prev, delta);
+ /* Both irq and softirq time are accounted as softirq */
+ cgroup_account_cputime_field(prev, CPUTIME_SOFTIRQ, delta);
+ }
+ }
+#endif
+
psi_sched_switch(prev, next, !task_on_rq_queued(prev));

trace_sched_switch(preempt, prev, next, prev_state);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 49d9c75be0c8..ffa8aa372fbd 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -992,16 +992,14 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev,
+ u64 now, s64 delta)
{
int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- u64 now, irq;
- s64 delta;

- if (static_branch_likely(&psi_disabled) ||
- !static_branch_likely(&sched_clock_irqtime))
+ if (static_branch_likely(&psi_disabled))
return;

if (!curr->pid)
@@ -1012,13 +1010,6 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
if (prev && task_psi_group(prev) == group)
return;

- now = cpu_clock(cpu);
- irq = irq_time_read(cpu);
- delta = (s64)(irq - rq->psi_irq_time);
- if (delta < 0)
- return;
- rq->psi_irq_time = irq;
-
do {
if (!group->enabled)
continue;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 237780aa3c53..7c5979761021 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -111,10 +111,11 @@ void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev, u64 now, s64 delta);
#else
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
- struct task_struct *prev) {}
+ struct task_struct *prev, u64 now, s64 delta) {}
#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
@@ -197,7 +198,7 @@ static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
- struct task_struct *prev) {}
+ struct task_struct *prev, u64 now, s64 delta) {}
#endif /* CONFIG_PSI */

#ifdef CONFIG_SCHED_INFO
--
2.43.5