[PATCH v3 7/7] sched, rt: support schedstats for RT sched class
From: Yafang Shao
Date: Tue Aug 24 2021 - 07:30:46 EST
We want to measure the latency of RT tasks in our production
environment with the schedstats facility, but currently schedstats is only
supported for the fair sched class. This patch enables it for the RT sched
class as well.
Now that struct sched_statistics and its helpers have been made
independent of the fair sched class, we can easily use the schedstats
facility for the RT sched class.
The schedstat usage in the RT sched class is similar to that in the fair
sched class, for example,
                 fair                        RT
enqueue          update_stats_enqueue_fair   update_stats_enqueue_rt
dequeue          update_stats_dequeue_fair   update_stats_dequeue_rt
put_prev_task    update_stats_wait_start     update_stats_wait_start_rt
set_next_task    update_stats_wait_end       update_stats_wait_end_rt
The user can get the schedstats information in the same way as for the
fair sched class. For example,
       fair                 RT
       /proc/[pid]/sched    /proc/[pid]/sched
The schedstats output of an RT task is as follows,
$ cat /proc/227408/sched
...
se.statistics.sum_sleep_runtime : 402284.476088
se.statistics.sum_block_runtime : 402272.475254
se.statistics.wait_start : 0.000000
se.statistics.sleep_start : 0.000000
se.statistics.block_start : 46903176.965093
se.statistics.sleep_max : 12.000834
se.statistics.block_max : 1446.963040
se.statistics.exec_max : 0.463806
se.statistics.slice_max : 0.000000
se.statistics.wait_max : 146.656326
se.statistics.wait_sum : 81741.944704
se.statistics.wait_count : 1004
se.statistics.iowait_sum : 77875.399958
se.statistics.iowait_count : 142
se.statistics.nr_migrations_cold : 0
se.statistics.nr_failed_migrations_affine : 0
se.statistics.nr_failed_migrations_running : 0
se.statistics.nr_failed_migrations_hot : 0
se.statistics.nr_forced_migrations : 0
se.statistics.nr_wakeups : 1003
se.statistics.nr_wakeups_sync : 0
se.statistics.nr_wakeups_migrate : 0
se.statistics.nr_wakeups_local : 351
se.statistics.nr_wakeups_remote : 652
se.statistics.nr_wakeups_affine : 0
se.statistics.nr_wakeups_affine_attempts : 0
se.statistics.nr_wakeups_passive : 0
se.statistics.nr_wakeups_idle : 0
...
The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can
be used to trace RT tasks as well. The output of these tracepoints for an
RT task is as follows,
- blocked
kworker/u113:0-230817 [000] d... 47197.452940: sched_stat_blocked: comm=stress pid=227408 delay=4096 [ns]
- iowait
kworker/3:1-222921 [003] d... 47492.211521: sched_stat_iowait: comm=stress pid=227408 delay=905187613 [ns]
- wait
stress-227400 [003] d... 47202.283021: sched_stat_wait: comm=stress pid=227408 delay=67958890 [ns]
- runtime
stress-227408 [003] d... 47202.283027: sched_stat_runtime: comm=stress pid=227408 runtime=7815 [ns] vruntime=0 [ns]
- sleep
sleep-244868 [022] dN.. 50070.614833: sched_stat_sleep: comm=sleep.sh pid=244300 delay=1001131165 [ns]
sleep-244869 [022] dN.. 50071.616222: sched_stat_sleep: comm=sleep.sh pid=244300 delay=1001100486 [ns]
sleep-244879 [022] dN.. 50072.617628: sched_stat_sleep: comm=sleep.sh pid=244300 delay=1001137198 [ns]
[ In sleep.sh, it sleeps 1 sec each time. ]
[lkp@xxxxxxxxx: reported build failure in earlier version]
Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
Cc: kernel test robot <lkp@xxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Alison Chaiken <achaiken@xxxxxxxxxxx>
---
kernel/sched/rt.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 141 insertions(+)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5d251112e51c..446164597232 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1273,6 +1273,129 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr
rt_se->on_list = 0;
}
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Look up the schedstats record for @rt_se and store a pointer to it in *@stats.
+ */
+static inline void
+__schedstats_from_sched_rt_entity(struct sched_rt_entity *rt_se,
+				  struct sched_statistics **stats)
+{
+	struct rt_rq *grp_rq;
+
+	/* A task entity uses the statistics embedded in its task_struct. */
+	if (rt_entity_is_task(rt_se)) {
+		*stats = &rt_task_of(rt_se)->stats;
+		return;
+	}
+
+	/* Otherwise use the task group's slot for this runqueue's CPU. */
+	grp_rq = group_rt_rq(rt_se);
+	*stats = grp_rq->tg->stats[cpu_of(rq_of_rt_rq(grp_rq))];
+}
+
+#else
+
+/* Built when CONFIG_RT_GROUP_SCHED is off: @rt_se is always treated as a task. */
+static inline void
+__schedstats_from_sched_rt_entity(struct sched_rt_entity *rt_se,
+				  struct sched_statistics **stats)
+{
+	struct task_struct *owner = rt_task_of(rt_se);
+
+	*stats = &owner->stats;
+}
+
+#endif
+
+/* Hand @rt_se's task (if any) and stats to __update_stats_wait_start(). */
+static inline void
+update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+	struct sched_statistics *st = NULL;
+	struct task_struct *tsk;
+
+	if (!schedstat_enabled())
+		return;
+
+	/* Non-task entities are passed on with a NULL task pointer. */
+	tsk = rt_entity_is_task(rt_se) ? rt_task_of(rt_se) : NULL;
+	__schedstats_from_sched_rt_entity(rt_se, &st);
+
+	__update_stats_wait_start(rq_of_rt_rq(rt_rq), tsk, st);
+}
+
+/* Hand @rt_se's task (if any) and stats to __update_stats_enqueue_sleeper(). */
+static inline void
+update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+	struct sched_statistics *st = NULL;
+	struct task_struct *tsk;
+
+	if (!schedstat_enabled())
+		return;
+
+	/* Non-task entities are passed on with a NULL task pointer. */
+	tsk = rt_entity_is_task(rt_se) ? rt_task_of(rt_se) : NULL;
+	__schedstats_from_sched_rt_entity(rt_se, &st);
+
+	__update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), tsk, st);
+}
+
+/* Run the sleeper accounting for @rt_se, but only for a wakeup enqueue. */
+static inline void
+update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+			int flags)
+{
+	if (!schedstat_enabled() || !(flags & ENQUEUE_WAKEUP))
+		return;
+
+	update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
+}
+
+/* Hand @rt_se's task (if any) and stats to __update_stats_wait_end(). */
+static inline void
+update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+	struct sched_statistics *st = NULL;
+	struct task_struct *tsk;
+
+	if (!schedstat_enabled())
+		return;
+
+	/* Non-task entities are passed on with a NULL task pointer. */
+	tsk = rt_entity_is_task(rt_se) ? rt_task_of(rt_se) : NULL;
+	__schedstats_from_sched_rt_entity(rt_se, &st);
+
+	__update_stats_wait_end(rq_of_rt_rq(rt_rq), tsk, st);
+}
+
+/* On a sleeping dequeue of a task, stamp when the sleep or block began. */
+static inline void
+update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+			int flags)
+{
+	struct task_struct *tsk;
+	unsigned int state;
+
+	if (!schedstat_enabled())
+		return;
+
+	/* Nothing is recorded for non-task entities or non-sleep dequeues. */
+	if (!rt_entity_is_task(rt_se) || !(flags & DEQUEUE_SLEEP))
+		return;
+
+	tsk = rt_task_of(rt_se);
+	state = READ_ONCE(tsk->__state);
+	/* The two state bits are tested independently; both may be stamped. */
+	if (state & TASK_INTERRUPTIBLE)
+		__schedstat_set(tsk->stats.sleep_start,
+				rq_clock(rq_of_rt_rq(rt_rq)));
+	if (state & TASK_UNINTERRUPTIBLE)
+		__schedstat_set(tsk->stats.block_start,
+				rq_clock(rq_of_rt_rq(rt_rq)));
+}
+
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
@@ -1346,6 +1469,8 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
+ update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
+
dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, flags);
@@ -1356,6 +1481,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
+ update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
+
dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) {
@@ -1378,6 +1505,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
+ check_schedstat_required();
+ update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
+
enqueue_rt_entity(rt_se, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -1578,7 +1708,12 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = &rq->rt;
+
p->se.exec_start = rq_clock_task(rq);
+ if (on_rt_rq(&p->rt))
+ update_stats_wait_end_rt(rt_rq, rt_se);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
@@ -1652,6 +1787,12 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (on_rt_rq(&p->rt))
+ update_stats_wait_start_rt(rt_rq, rt_se);
+
update_curr_rt(rq);
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
--
2.18.2