[PATCH v2 08/10] sched/psi: add PSI_IRQ to track IRQ/SOFTIRQ pressure
From: Chengming Zhou
Date: Mon Aug 08 2022 - 07:05:59 EST
Now PSI already tracked workload pressure stall information for
CPU, memory and IO. Apart from these, IRQ/SOFTIRQ could have
obvious impact on some workload productivity, such as web service
workload.
When CONFIG_IRQ_TIME_ACCOUNTING, we can get IRQ/SOFTIRQ delta time
from update_rq_clock_task(), in which we can record that delta
to CPU curr task's cgroups as PSI_IRQ_FULL status.
Note we don't use PSI_IRQ_SOME since IRQ/SOFTIRQ always happen in
the current task on the CPU, make nothing productive could run
even if it were runnable, so we only use PSI_IRQ_FULL.
Signed-off-by: Chengming Zhou <zhouchengming@xxxxxxxxxxxxx>
---
Documentation/admin-guide/cgroup-v2.rst | 6 +++
include/linux/psi_types.h | 6 ++-
kernel/cgroup/cgroup.c | 27 ++++++++++
kernel/sched/core.c | 1 +
kernel/sched/psi.c | 65 ++++++++++++++++++++++++-
kernel/sched/stats.h | 2 +
6 files changed, 103 insertions(+), 4 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 176298f2f4de..dd84e34bc051 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -968,6 +968,12 @@ All cgroup core files are prefixed with "cgroup."
killing cgroups is a process directed operation, i.e. it affects
the whole thread-group.
+ irq.pressure
+ A read-write nested-keyed file.
+
+ Shows pressure stall information for IRQ/SOFTIRQ. See
+ :ref:`Documentation/accounting/psi.rst <psi>` for details.
+
Controllers
===========
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 54cb74946db4..4677655f6ca1 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -42,7 +42,8 @@ enum psi_res {
PSI_IO,
PSI_MEM,
PSI_CPU,
- NR_PSI_RESOURCES = 3,
+ PSI_IRQ,
+ NR_PSI_RESOURCES = 4,
};
/*
@@ -58,9 +59,10 @@ enum psi_states {
PSI_MEM_FULL,
PSI_CPU_SOME,
PSI_CPU_FULL,
+ PSI_IRQ_FULL,
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
- NR_PSI_STATES = 7,
+ NR_PSI_STATES = 8,
};
/* Use one bit in the state mask to track TSK_ONCPU */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f88117fc81e..91de8ff7fa50 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3692,6 +3692,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
@@ -5088,6 +5105,16 @@ static struct cftype cgroup_base_files[] = {
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .flags = CFTYPE_PRESSURE,
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
#endif /* CONFIG_PSI */
{ } /* terminate */
};
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3aa401689f7e..4cfb6ab32142 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -708,6 +708,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+ psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1c675715ed33..58f8092c938f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -910,6 +910,34 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
}
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+ int cpu = task_cpu(task);
+ void *iter = NULL;
+ struct psi_group *group;
+ struct psi_group_cpu *groupc;
+ u64 now;
+
+ if (!task->pid)
+ return;
+
+ now = cpu_clock(cpu);
+
+ while ((group = iterate_groups(task, &iter))) {
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ write_seqcount_begin(&groupc->seq);
+
+ record_times(groupc, now);
+ groupc->times[PSI_IRQ_FULL] += delta;
+
+ write_seqcount_end(&groupc->seq);
+
+ if (group->poll_states & (1 << PSI_IRQ_FULL))
+ psi_schedule_poll_work(group, 1);
+ }
+}
+
/**
* psi_memstall_enter - mark the beginning of a memory stall section
* @flags: flags to handle nested sections
@@ -1078,7 +1106,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
- for (full = 0; full < 2; full++) {
+ for (full = 0; full < 2 - (res == PSI_IRQ); full++) {
unsigned long avg[3] = { 0, };
u64 total = 0;
int w;
@@ -1092,7 +1120,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
}
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
- full ? "full" : "some",
+ full || (res == PSI_IRQ) ? "full" : "some",
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1120,6 +1148,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
else
return ERR_PTR(-EINVAL);
+ if ((res == PSI_IRQ) && (--state != PSI_IRQ_FULL))
+ return ERR_PTR(-EINVAL);
+
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
@@ -1404,6 +1435,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
.proc_release = psi_fop_release,
};
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_irq_show);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+ .proc_open = psi_irq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_irq_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+#endif
+
static int __init psi_proc_init(void)
{
if (psi_enable) {
@@ -1411,6 +1469,9 @@ static int __init psi_proc_init(void)
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
}
return 0;
}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index e930b8fa6253..8b6cfc7a56f5 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -111,6 +111,7 @@ void psi_change_groups(struct task_struct *task, int clear, int set);
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
/*
* PSI tracks state that persists across sleeps, such as iowaits and
@@ -203,6 +204,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
--
2.36.1