[PATCH v4 2/2] cgroup/rstat: Add run_delay accounting for cgroups

From: Abel Wu
Date: Sun Feb 09 2025 - 01:14:11 EST


The "some" field of the cpu.pressure indicator may lose insight into
how severely one cgroup is stalled on a certain cpu, because PSI tracks
stall time for each cpu as:

tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)

which reduces nr_delayed_tasks[cpu] to a boolean value. So, together
with this cgroup-level run_delay accounting, the scheduling info of
cgroups will be better illustrated.

The task- and cpu-level accounting is already tracked through the
following two holders respectively:

struct task_struct::sched_info if SCHED_INFO
struct rq::rq_sched_info if SCHEDSTATS

When extending this to cgroups, the minimal requirement would be:

root: relies on rq::rq_sched_info, hence SCHEDSTATS
non-root: relies on task's, hence SCHED_INFO

It might be too demanding to require both, while collecting data for
root cgroup from different holders according to different configs would
also be confusing and error-prone. In order to keep things simple, let
us rely on the cputime infrastructure to do the accounting as the other
cputimes do.

Only cgroup v2 is supported and CONFIG_SCHED_INFO is required.

Signed-off-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
---
include/linux/cgroup-defs.h | 3 +++
include/linux/kernel_stat.h | 7 +++++++
kernel/cgroup/rstat.c | 25 +++++++++++++++++++++++++
kernel/sched/cputime.c | 10 ++++++++++
kernel/sched/stats.h | 3 +++
5 files changed, 48 insertions(+)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b20d2d8ef7c..287366e60414 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -328,6 +328,9 @@ struct cgroup_base_stat {
u64 forceidle_sum;
#endif
u64 ntime;
+#ifdef CONFIG_SCHED_INFO
+ u64 run_delay;
+#endif
};

/*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b97ce2df376f..ddd59fea10ad 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -29,6 +29,9 @@ enum cpu_usage_stat {
CPUTIME_GUEST_NICE,
#ifdef CONFIG_SCHED_CORE
CPUTIME_FORCEIDLE,
+#endif
+#ifdef CONFIG_SCHED_INFO
+ CPUTIME_RUN_DELAY,
#endif
NR_STATS,
};
@@ -141,4 +144,8 @@ extern void account_idle_ticks(unsigned long ticks);
extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
#endif

+#ifdef CONFIG_SCHED_INFO
+extern void account_run_delay_time(struct task_struct *tsk, u64 delta);
+#endif
+
#endif /* _LINUX_KERNEL_STAT_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c2784c317cdd..53984cdf7f9b 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -445,6 +445,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
dst_bstat->ntime += src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+ dst_bstat->run_delay += src_bstat->run_delay;
+#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -457,6 +460,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
dst_bstat->ntime -= src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+ dst_bstat->run_delay -= src_bstat->run_delay;
+#endif
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -551,6 +557,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_FORCEIDLE:
rstatc->bstat.forceidle_sum += delta_exec;
break;
+#endif
+#ifdef CONFIG_SCHED_INFO
+ case CPUTIME_RUN_DELAY:
+ rstatc->bstat.run_delay += delta_exec;
+ break;
#endif
default:
break;
@@ -596,6 +607,9 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
bstat->ntime += cpustat[CPUTIME_NICE];
+#ifdef CONFIG_SCHED_INFO
+ bstat->run_delay += cpustat[CPUTIME_RUN_DELAY];
+#endif
}
}

@@ -610,6 +624,16 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
#endif
}

+static void cgroup_run_delay_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+{
+#ifdef CONFIG_SCHED_INFO
+ u64 run_delay = bstat->run_delay;
+
+ do_div(run_delay, NSEC_PER_USEC);
+ seq_printf(seq, "run_delay_usec %llu\n", run_delay);
+#endif
+}
+
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
@@ -640,6 +664,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
bstat.ntime);

cgroup_force_idle_show(seq, &bstat);
+ cgroup_run_delay_show(seq, &bstat);
}

/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5d9143dd0879..42af602c10a6 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -243,6 +243,16 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
}
#endif

+#ifdef CONFIG_SCHED_INFO
+/*
+ * Account for run_delay time spent waiting in rq.
+ */
+void account_run_delay_time(struct task_struct *p, u64 delta)
+{
+ task_group_account_field(p, CPUTIME_RUN_DELAY, delta);
+}
+#endif
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 19cdbe96f93d..fdfd04a89b05 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -252,7 +252,9 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
t->sched_info.max_run_delay = delta;
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
+
rq_sched_info_dequeue(rq, delta);
+ account_run_delay_time(t, delta);
}

/*
@@ -279,6 +281,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.min_run_delay = delta;

rq_sched_info_arrive(rq, delta);
+ account_run_delay_time(t, delta);
}

/*
--
2.37.3