[PATCH] sched/core: add forced idle accounting for cgroups

From: Josh Don
Date: Thu May 12 2022 - 20:54:55 EST


Commit 4feee7d12603 ("sched/core: Forced idle accounting") previously added
per-task forced idle accounting. This patch extends this to also include
cgroups.

rstat is used for cgroup accounting, except for the root, which uses
kcpustat in order to bypass the need for doing an rstat flush when
reading root stats.

Only cgroup v2 is supported. As with the per-task accounting, the cgroup
accounting requires that schedstats is enabled.

Signed-off-by: Josh Don <joshdon@xxxxxxxxxx>
---
include/linux/kernel_stat.h | 1 +
kernel/sched/core.c | 15 ++++++++-
kernel/sched/core_sched.c | 62 +++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 18 +++++++++++
4 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 69ae6b278464..2e9b3c7d2f18 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,6 +28,7 @@ enum cpu_usage_stat {
CPUTIME_STEAL,
CPUTIME_GUEST,
CPUTIME_GUEST_NICE,
+ CPUTIME_FORCEIDLE,
NR_STATS,
};

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 48cfad152b86..a29cb4029818 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10828,12 +10828,18 @@ static struct cftype cpu_legacy_files[] = {
{ } /* Terminate */
};

+static void cpu_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ sched_core_rstat_flush(css_tg(css), cpu);
+}
+
static int cpu_extra_stat_show(struct seq_file *sf,
struct cgroup_subsys_state *css)
{
+ struct task_group __maybe_unused *tg = css_tg(css);
+
#ifdef CONFIG_CFS_BANDWIDTH
{
- struct task_group *tg = css_tg(css);
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
u64 throttled_usec, burst_usec;

@@ -10851,6 +10857,12 @@ static int cpu_extra_stat_show(struct seq_file *sf,
throttled_usec, cfs_b->nr_burst, burst_usec);
}
#endif
+
+#ifdef CONFIG_SCHED_CORE
+ /* already updated stats via rstat flush */
+ seq_printf(sf, "forceidle_usec %llu\n",
+ sched_core_forceidle_sum(tg) / NSEC_PER_USEC);
+#endif
return 0;
}

@@ -11031,6 +11043,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
+ .css_rstat_flush = cpu_cgroup_css_rstat_flush,
.css_extra_stat_show = cpu_extra_stat_show,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 38a2cec21014..ccfeef6542dc 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -277,7 +277,16 @@ void __sched_core_account_forceidle(struct rq *rq)
if (p == rq_i->idle)
continue;

+ /* thread accounting */
__schedstat_add(p->stats.core_forceidle_sum, delta);
+
+ /* root accounting */
+ kcpustat_cpu(i).cpustat[CPUTIME_FORCEIDLE] += delta;
+
+ /* cgroup accounting */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ task_group(p)->cfs_rq[i]->forceidle_sum += delta;
+#endif
}
}

@@ -292,4 +301,57 @@ void __sched_core_tick(struct rq *rq)
__sched_core_account_forceidle(rq);
}

+void sched_core_rstat_flush(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ struct task_group *parent = tg->parent;
+ u64 delta, curr_sum;
+
+ /* root uses cpustat */
+ if (!parent)
+ return;
+
+ /*
+ * Note: cgroup_rstat_lock protects cfs_rq->forceidle_sum_prev and
+ * tg->{forceidle_sum, forceidle_sum_pending}.
+ */
+
+ delta = tg->forceidle_sum_pending;
+ if (delta)
+ tg->forceidle_sum_pending = 0;
+
+ /* rq lock not held; value may change concurrently */
+ curr_sum = READ_ONCE(cfs_rq->forceidle_sum);
+ if (curr_sum != cfs_rq->forceidle_sum_prev) {
+ delta += curr_sum - cfs_rq->forceidle_sum_prev;
+ cfs_rq->forceidle_sum_prev = curr_sum;
+ }
+
+ if (!delta)
+ return;
+
+ tg->forceidle_sum += delta;
+ parent->forceidle_sum_pending += delta;
+}
+
+/* REQUIRES: If tg is not root, an rstat flush was recently done. */
+u64 sched_core_forceidle_sum(struct task_group *tg)
+{
+ if (!tg->parent) {
+ u64 sum = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
+ sum += kcpustat.cpustat[CPUTIME_FORCEIDLE];
+ }
+
+ return sum;
+ } else {
+ return tg->forceidle_sum;
+ }
+}
+
#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7f338c53ce42..36bef97b9e2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -425,6 +425,12 @@ struct task_group {
struct uclamp_se uclamp[UCLAMP_CNT];
#endif

+#ifdef CONFIG_SCHED_CORE
+ /* used with rstat */
+ u64 forceidle_sum;
+ u64 forceidle_sum_pending;
+#endif
+
};

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -526,6 +532,10 @@ struct cfs_rq {
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
u64 min_vruntime_fi;
+
+ /* for accounting with rstat */
+ u64 forceidle_sum;
+ u64 forceidle_sum_prev;
#endif

#ifndef CONFIG_64BIT
@@ -1849,12 +1859,20 @@ static inline void sched_core_tick(struct rq *rq)
__sched_core_tick(rq);
}

+extern void sched_core_rstat_flush(struct task_group *tg, int cpu);
+
+extern u64 sched_core_forceidle_sum(struct task_group *tg);
+
#else

static inline void sched_core_account_forceidle(struct rq *rq) {}

static inline void sched_core_tick(struct rq *rq) {}

+static inline void sched_core_rstat_flush(struct task_group *tg, int cpu) {}
+
+static inline u64 sched_core_forceidle_sum(struct task_group *tg) { return 0; }
+
#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */

#ifdef CONFIG_CGROUP_SCHED
--
2.36.0.512.ge40c2bad7a-goog