[PATCH 1/3] psi: fix cpu.pressure for cpu.max and competing cgroups
From: Johannes Weiner
Date: Mon Mar 16 2020 - 15:13:49 EST
For simplicity, cpu pressure is defined as having more than one
runnable task on a given CPU. This works on the system-level, but it
has limitations in a cgrouped reality: When cpu.max is in use, it
doesn't capture the time in which a task is not executing on the CPU
due to throttling. Likewise, it doesn't capture the time in which a
competing cgroup is occupying the CPU - meaning it only reflects
cgroup-internal competitive pressure, not outside pressure.
Enable tracking of currently executing tasks, and then change the
definition of cpu pressure in a cgroup from
NR_RUNNING > 1
to
NR_RUNNING > ON_CPU
which will capture the effects of cpu.max as well as competition from
outside the cgroup.
After this patch, a cgroup running `stress -c 1` with a cpu.max
setting of 5000 10000 shows ~50% continuous CPU pressure.
Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
---
include/linux/psi_types.h | 10 +++++++++-
kernel/sched/core.c | 2 ++
kernel/sched/psi.c | 12 +++++++-----
kernel/sched/stats.h | 28 ++++++++++++++++++++++++++++
4 files changed, 46 insertions(+), 6 deletions(-)
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 07aaf9b82241..4b7258495a04 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -14,13 +14,21 @@ enum psi_task_count {
NR_IOWAIT,
NR_MEMSTALL,
NR_RUNNING,
- NR_PSI_TASK_COUNTS = 3,
+ /*
+ * This can't have values other than 0 or 1 and could be
+ * implemented as a bit flag. But for now we still have room
+ * in the first cacheline of psi_group_cpu, and this way we
+ * don't have to special case any state tracking for it.
+ */
+ NR_ONCPU,
+ NR_PSI_TASK_COUNTS = 4,
};
/* Task state bitmasks */
#define TSK_IOWAIT (1 << NR_IOWAIT)
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING)
+#define TSK_ONCPU (1 << NR_ONCPU)
/* Resources that workloads could be stalled on */
enum psi_res {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1a9983da4408..f920fbf5dcd1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4074,6 +4074,8 @@ static void __sched notrace __schedule(bool preempt)
*/
++*switch_count;
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+
trace_sched_switch(preempt, prev, next);
/* Also unlocks the rq: */
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 028520702717..50128297a4f9 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
case PSI_MEM_FULL:
return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
case PSI_CPU_SOME:
- return tasks[NR_RUNNING] > 1;
+ return tasks[NR_RUNNING] > tasks[NR_ONCPU];
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -695,10 +695,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
if (!(m & (1 << t)))
continue;
if (groupc->tasks[t] == 0 && !psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- clear, set);
+ groupc->tasks[3], clear, set);
psi_bug = 1;
}
groupc->tasks[t]--;
@@ -916,9 +916,11 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
rq = task_rq_lock(task, &rf);
- if (task_on_rq_queued(task))
+ if (task_on_rq_queued(task)) {
task_flags = TSK_RUNNING;
- else if (task->in_iowait)
+ if (task_current(rq, task))
+ task_flags |= TSK_ONCPU;
+ } else if (task->in_iowait)
task_flags = TSK_IOWAIT;
if (task->flags & PF_MEMSTALL)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index ba683fe81a6e..6ff0ac1a803f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -93,6 +93,14 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
if (p->flags & PF_MEMSTALL)
clear |= TSK_MEMSTALL;
} else {
+ /*
+ * When a task sleeps, schedule() dequeues it before
+ * switching to the next one. Merge the clearing of
+ * TSK_RUNNING and TSK_ONCPU to save an unnecessary
+ * psi_task_change() call in psi_sched_switch().
+ */
+ clear |= TSK_ONCPU;
+
if (p->in_iowait)
set |= TSK_IOWAIT;
}
@@ -126,6 +134,23 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
}
}
+static inline void psi_sched_switch(struct task_struct *prev,
+ struct task_struct *next,
+ bool sleep)
+{
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ /*
+ * Clear the TSK_ONCPU state if the task was preempted. If
+ * it's a voluntary sleep, dequeue will have taken care of it.
+ */
+ if (!sleep)
+ psi_task_change(prev, TSK_ONCPU, 0);
+
+ psi_task_change(next, 0, TSK_ONCPU);
+}
+
static inline void psi_task_tick(struct rq *rq)
{
if (static_branch_likely(&psi_disabled))
@@ -138,6 +163,9 @@ static inline void psi_task_tick(struct rq *rq)
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_sched_switch(struct task_struct *prev,
+ struct task_struct *next,
+ bool sleep) {}
static inline void psi_task_tick(struct rq *rq) {}
#endif /* CONFIG_PSI */
--
2.25.1