Re: [PATCH v3 0/7] sched: support schedstats for RT sched class

From: Yafang Shao
Date: Tue Aug 31 2021 - 09:21:55 EST


On Tue, Aug 31, 2021 at 6:44 PM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Tue, Aug 31, 2021 at 12:08:15PM +0200, Peter Zijlstra wrote:
> > On Tue, Aug 24, 2021 at 11:29:39AM +0000, Yafang Shao wrote:
>
> > > After the patchset, schedstats are organized as follows,
> > > struct task_struct {
> > >     ...
> > >     struct sched_statistics statistics;
> > >     ...
> > >     struct sched_entity *se;
> > >     struct sched_rt_entity *rt;
> > >     ...
> > > };
> > >
> > > struct task_group {                    |---> stats[0] : of CPU0
> > >     ...                                |
> > >     struct sched_statistics **stats; --|---> stats[1] : of CPU1
> > >     ...                                |
> > >                                        |---> stats[n] : of CPUn
> > > #ifdef CONFIG_FAIR_GROUP_SCHED
> > >     struct sched_entity **se;
> > > #endif
> > > #ifdef CONFIG_RT_GROUP_SCHED
> > >     struct sched_rt_entity **rt_se;
> > > #endif
> > >     ...
> > > };
> >
> > Yeah, this seems to give a terrible mess, let me see if I can come up
> > with anything less horrible.
>
> Here, isn't this *MUCH* saner ?
>

Seems like a good idea.
I will verify it.
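
To double-check that I read the container_of trick correctly, here is
a minimal standalone sketch of it (simplified types of my own, only to
illustrate the idea; it is not the kernel code):

    #include <stddef.h>
    #include <stdio.h>

    struct sched_statistics { unsigned long long exec_max; };
    struct sched_entity { int depth; };

    /*
     * Group entities are allocated as this compound, so their stats
     * sit right behind the se they belong to.
     */
    struct sched_entity_stats {
            struct sched_entity se;
            struct sched_statistics stats;
    };

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    static struct sched_statistics *stats_of(struct sched_entity *se)
    {
            return &container_of(se, struct sched_entity_stats, se)->stats;
    }

    int main(void)
    {
            struct sched_entity_stats ses = { .stats = { .exec_max = 42 } };

            printf("%llu\n", stats_of(&ses.se)->exec_max); /* prints 42 */
            return 0;
    }

IIUC, for a task se the real helper returns &task_of(se)->stats
directly, so only the group entities pay for the compound allocation.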


> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -521,7 +521,7 @@ struct sched_statistics {
> u64 nr_wakeups_passive;
> u64 nr_wakeups_idle;
> #endif
> -};
> +} ____cacheline_aligned;
>
> struct sched_entity {
> /* For load-balancing: */
> @@ -537,8 +537,6 @@ struct sched_entity {
>
> u64 nr_migrations;
>
> - struct sched_statistics statistics;
> -
> #ifdef CONFIG_FAIR_GROUP_SCHED
> int depth;
> struct sched_entity *parent;
> @@ -802,6 +800,8 @@ struct task_struct {
> struct uclamp_se uclamp[UCLAMP_CNT];
> #endif
>
> + struct sched_statistics stats;
> +

The stats member was kept close to 'struct sched_entity se' before,
because I didn't want to change the original layout of 'struct
task_struct' too much, in case the change might impact the cache-line
placement.
I'm not sure whether it is proper to place it here; I will verify it.
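
For the layout check, my plan is to compare the offsets before and
after the change with pahole (assuming a vmlinux built with debug
info), e.g.:

    $ pahole -C task_struct vmlinux

pahole also annotates the cacheline boundaries, so it should show
whether moving the stats out of 'struct sched_entity se' pushes any
hot member across a boundary.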

> #ifdef CONFIG_PREEMPT_NOTIFIERS
> /* List of struct preempt_notifier: */
> struct hlist_head preempt_notifiers;
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3489,11 +3489,11 @@ ttwu_stat(struct task_struct *p, int cpu
> #ifdef CONFIG_SMP
> if (cpu == rq->cpu) {
> __schedstat_inc(rq->ttwu_local);
> - __schedstat_inc(p->se.statistics.nr_wakeups_local);
> + __schedstat_inc(p->stats.nr_wakeups_local);
> } else {
> struct sched_domain *sd;
>
> - __schedstat_inc(p->se.statistics.nr_wakeups_remote);
> + __schedstat_inc(p->stats.nr_wakeups_remote);
> rcu_read_lock();
> for_each_domain(rq->cpu, sd) {
> if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
> @@ -3505,14 +3505,14 @@ ttwu_stat(struct task_struct *p, int cpu
> }
>
> if (wake_flags & WF_MIGRATED)
> - __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
> + __schedstat_inc(p->stats.nr_wakeups_migrate);
> #endif /* CONFIG_SMP */
>
> __schedstat_inc(rq->ttwu_count);
> - __schedstat_inc(p->se.statistics.nr_wakeups);
> + __schedstat_inc(p->stats.nr_wakeups);
>
> if (wake_flags & WF_SYNC)
> - __schedstat_inc(p->se.statistics.nr_wakeups_sync);
> + __schedstat_inc(p->stats.nr_wakeups_sync);
> }
>
> /*
> @@ -4196,7 +4196,7 @@ static void __sched_fork(unsigned long c
>
> #ifdef CONFIG_SCHEDSTATS
> /* Even if schedstat is disabled, there should not be garbage */
> - memset(&p->se.statistics, 0, sizeof(p->se.statistics));
> + memset(&p->stats, 0, sizeof(p->stats));
> #endif
>
> RB_CLEAR_NODE(&p->dl.rb_node);
> @@ -9619,9 +9619,9 @@ void normalize_rt_tasks(void)
> continue;
>
> p->se.exec_start = 0;
> - schedstat_set(p->se.statistics.wait_start, 0);
> - schedstat_set(p->se.statistics.sleep_start, 0);
> - schedstat_set(p->se.statistics.block_start, 0);
> + schedstat_set(p->stats.wait_start, 0);
> + schedstat_set(p->stats.sleep_start, 0);
> + schedstat_set(p->stats.block_start, 0);
>
> if (!dl_task(p) && !rt_task(p)) {
> /*
> @@ -10467,7 +10467,7 @@ static int cpu_cfs_stat_show(struct seq_
> int i;
>
> for_each_possible_cpu(i)
> - ws += schedstat_val(tg->se[i]->statistics.wait_sum);
> + ws += schedstat_val(tg->stats[i]->wait_sum);
>
> seq_printf(sf, "wait_sum %llu\n", ws);
> }
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1265,8 +1265,8 @@ static void update_curr_dl(struct rq *rq
> return;
> }
>
> - schedstat_set(curr->se.statistics.exec_max,
> - max(curr->se.statistics.exec_max, delta_exec));
> + schedstat_set(curr->stats.exec_max,
> + max(curr->stats.exec_max, delta_exec));
>
> curr->se.sum_exec_runtime += delta_exec;
> account_group_exec_runtime(curr, delta_exec);
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -819,6 +819,21 @@ static void update_tg_load_avg(struct cf
> }
> #endif /* CONFIG_SMP */
>
> +struct sched_entity_stats {
> + struct sched_entity se;
> + struct sched_statistics stats;
> +} __no_randomize_layout;
> +
> +static inline struct sched_statistics *
> +__schedstats_from_se(struct sched_entity *se)
> +{
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> + if (!entity_is_task(se))
> + return &container_of(se, struct sched_entity_stats, se)->stats;
> +#endif
> + return &task_of(se)->stats;
> +}
> +
> /*
> * Update the current task's runtime statistics.
> */
> @@ -837,8 +852,10 @@ static void update_curr(struct cfs_rq *c
>
> curr->exec_start = now;
>
> - schedstat_set(curr->statistics.exec_max,
> - max(delta_exec, curr->statistics.exec_max));
> + if (schedstat_enabled()) {
> + struct sched_statistics *stats = __schedstats_from_se(curr);
> + __schedstat_set(stats->exec_max, max(delta_exec, stats->exec_max));
> + }
>
> curr->sum_exec_runtime += delta_exec;
> schedstat_add(cfs_rq->exec_clock, delta_exec);
> @@ -866,39 +883,45 @@ static inline void
> update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> u64 wait_start, prev_wait_start;
> + struct sched_statistics *stats;
>
> if (!schedstat_enabled())
> return;
>
> + stats = __schedstats_from_se(se);
> +
> wait_start = rq_clock(rq_of(cfs_rq));
> - prev_wait_start = schedstat_val(se->statistics.wait_start);
> + prev_wait_start = schedstat_val(stats->wait_start);
>
> if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
> likely(wait_start > prev_wait_start))
> wait_start -= prev_wait_start;
>
> - __schedstat_set(se->statistics.wait_start, wait_start);
> + __schedstat_set(stats->wait_start, wait_start);
> }
>
> static inline void
> update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - struct task_struct *p;
> + struct sched_statistics *stats;
> + struct task_struct *p = NULL;
> u64 delta;
>
> if (!schedstat_enabled())
> return;
>
> + stats = __schedstats_from_se(se);
> +
> /*
> * When the sched_schedstat changes from 0 to 1, some sched se
> * maybe already in the runqueue, the se->statistics.wait_start
> * will be 0.So it will let the delta wrong. We need to avoid this
> * scenario.
> */
> - if (unlikely(!schedstat_val(se->statistics.wait_start)))
> + if (unlikely(!schedstat_val(stats->wait_start)))
> return;
>
> - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
> + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start);
>
> if (entity_is_task(se)) {
> p = task_of(se);
> @@ -908,30 +931,33 @@ update_stats_wait_end(struct cfs_rq *cfs
> * time stamp can be adjusted to accumulate wait time
> * prior to migration.
> */
> - __schedstat_set(se->statistics.wait_start, delta);
> + __schedstat_set(stats->wait_start, delta);
> return;
> }
> trace_sched_stat_wait(p, delta);
> }
>
> - __schedstat_set(se->statistics.wait_max,
> - max(schedstat_val(se->statistics.wait_max), delta));
> - __schedstat_inc(se->statistics.wait_count);
> - __schedstat_add(se->statistics.wait_sum, delta);
> - __schedstat_set(se->statistics.wait_start, 0);
> + __schedstat_set(stats->wait_max,
> + max(schedstat_val(stats->wait_max), delta));
> + __schedstat_inc(stats->wait_count);
> + __schedstat_add(stats->wait_sum, delta);
> + __schedstat_set(stats->wait_start, 0);
> }
>
> static inline void
> update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> + struct sched_statistics *stats;
> struct task_struct *tsk = NULL;
> u64 sleep_start, block_start;
>
> if (!schedstat_enabled())
> return;
>
> - sleep_start = schedstat_val(se->statistics.sleep_start);
> - block_start = schedstat_val(se->statistics.block_start);
> + stats = __schedstats_from_se(se);
> +
> + sleep_start = schedstat_val(stats->sleep_start);
> + block_start = schedstat_val(stats->block_start);
>
> if (entity_is_task(se))
> tsk = task_of(se);
> @@ -942,11 +968,11 @@ update_stats_enqueue_sleeper(struct cfs_
> if ((s64)delta < 0)
> delta = 0;
>
> - if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
> - __schedstat_set(se->statistics.sleep_max, delta);
> + if (unlikely(delta > schedstat_val(stats->sleep_max)))
> + __schedstat_set(stats->sleep_max, delta);
>
> - __schedstat_set(se->statistics.sleep_start, 0);
> - __schedstat_add(se->statistics.sum_sleep_runtime, delta);
> + __schedstat_set(stats->sleep_start, 0);
> + __schedstat_add(stats->sum_sleep_runtime, delta);
>
> if (tsk) {
> account_scheduler_latency(tsk, delta >> 10, 1);
> @@ -959,16 +985,16 @@ update_stats_enqueue_sleeper(struct cfs_
> if ((s64)delta < 0)
> delta = 0;
>
> - if (unlikely(delta > schedstat_val(se->statistics.block_max)))
> - __schedstat_set(se->statistics.block_max, delta);
> + if (unlikely(delta > schedstat_val(stats->block_max)))
> + __schedstat_set(stats->block_max, delta);
>
> - __schedstat_set(se->statistics.block_start, 0);
> - __schedstat_add(se->statistics.sum_sleep_runtime, delta);
> + __schedstat_set(stats->block_start, 0);
> + __schedstat_add(stats->sum_sleep_runtime, delta);
>
> if (tsk) {
> if (tsk->in_iowait) {
> - __schedstat_add(se->statistics.iowait_sum, delta);
> - __schedstat_inc(se->statistics.iowait_count);
> + __schedstat_add(stats->iowait_sum, delta);
> + __schedstat_inc(stats->iowait_count);
> trace_sched_stat_iowait(tsk, delta);
> }
>
> @@ -1030,10 +1056,10 @@ update_stats_dequeue(struct cfs_rq *cfs_
> /* XXX racy against TTWU */
> state = READ_ONCE(tsk->__state);
> if (state & TASK_INTERRUPTIBLE)
> - __schedstat_set(se->statistics.sleep_start,
> + __schedstat_set(tsk->stats.sleep_start,
> rq_clock(rq_of(cfs_rq)));
> if (state & TASK_UNINTERRUPTIBLE)
> - __schedstat_set(se->statistics.block_start,
> + __schedstat_set(tsk->stats.block_start,
> rq_clock(rq_of(cfs_rq)));
> }
> }
> @@ -4502,9 +4528,10 @@ set_next_entity(struct cfs_rq *cfs_rq, s
> */
> if (schedstat_enabled() &&
> rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
> - __schedstat_set(se->statistics.slice_max,
> - max((u64)schedstat_val(se->statistics.slice_max),
> - se->sum_exec_runtime - se->prev_sum_exec_runtime));
> + struct sched_statistics *stats = __schedstats_from_se(se);
> + __schedstat_set(stats->slice_max,
> + max((u64)stats->slice_max,
> + se->sum_exec_runtime - se->prev_sum_exec_runtime));
> }
>
> se->prev_sum_exec_runtime = se->sum_exec_runtime;
> @@ -5993,12 +6020,12 @@ static int wake_affine(struct sched_doma
> if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
> target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
>
> - schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
> + schedstat_inc(p->stats.nr_wakeups_affine_attempts);
> if (target == nr_cpumask_bits)
> return prev_cpu;
>
> schedstat_inc(sd->ttwu_move_affine);
> - schedstat_inc(p->se.statistics.nr_wakeups_affine);
> + schedstat_inc(p->stats.nr_wakeups_affine);
> return target;
> }
>
> @@ -7802,7 +7829,7 @@ int can_migrate_task(struct task_struct
> if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
> int cpu;
>
> - schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
> + schedstat_inc(p->stats.nr_failed_migrations_affine);
>
> env->flags |= LBF_SOME_PINNED;
>
> @@ -7836,7 +7863,7 @@ int can_migrate_task(struct task_struct
> env->flags &= ~LBF_ALL_PINNED;
>
> if (task_running(env->src_rq, p)) {
> - schedstat_inc(p->se.statistics.nr_failed_migrations_running);
> + schedstat_inc(p->stats.nr_failed_migrations_running);
> return 0;
> }
>
> @@ -7858,12 +7885,12 @@ int can_migrate_task(struct task_struct
> env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
> if (tsk_cache_hot == 1) {
> schedstat_inc(env->sd->lb_hot_gained[env->idle]);
> - schedstat_inc(p->se.statistics.nr_forced_migrations);
> + schedstat_inc(p->stats.nr_forced_migrations);
> }
> return 1;
> }
>
> - schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
> + schedstat_inc(p->stats.nr_failed_migrations_hot);
> return 0;
> }
>
> @@ -11390,7 +11417,7 @@ int alloc_fair_sched_group(struct task_g
> if (!cfs_rq)
> goto err;
>
> - se = kzalloc_node(sizeof(struct sched_entity),
> + se = kzalloc_node(sizeof(struct sched_entity_stats),
> GFP_KERNEL, cpu_to_node(i));
> if (!se)
> goto err_free_rq;
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1009,8 +1009,8 @@ static void update_curr_rt(struct rq *rq
> if (unlikely((s64)delta_exec <= 0))
> return;
>
> - schedstat_set(curr->se.statistics.exec_max,
> - max(curr->se.statistics.exec_max, delta_exec));
> + schedstat_set(curr->stats.exec_max,
> + max(curr->stats.exec_max, delta_exec));
>
> curr->se.sum_exec_runtime += delta_exec;
> account_group_exec_runtime(curr, delta_exec);
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -41,6 +41,7 @@ rq_sched_info_dequeue(struct rq *rq, uns
> #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
>
> #else /* !CONFIG_SCHEDSTATS: */
> +
> static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
> static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { }
> static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
> @@ -53,6 +54,7 @@ static inline void rq_sched_info_depart
> # define schedstat_set(var, val) do { } while (0)
> # define schedstat_val(var) 0
> # define schedstat_val_or_zero(var) 0
> +
> #endif /* CONFIG_SCHEDSTATS */
>
> #ifdef CONFIG_PSI
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -78,8 +78,8 @@ static void put_prev_task_stop(struct rq
> if (unlikely((s64)delta_exec < 0))
> delta_exec = 0;
>
> - schedstat_set(curr->se.statistics.exec_max,
> - max(curr->se.statistics.exec_max, delta_exec));
> + schedstat_set(curr->stats.exec_max,
> + max(curr->stats.exec_max, delta_exec));
>
> curr->se.sum_exec_runtime += delta_exec;
> account_group_exec_runtime(curr, delta_exec);
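
Besides the build and layout checks, I will also confirm that the
per-task numbers are still exported as before (assuming
CONFIG_SCHEDSTATS is set and sched_schedstats is enabled), e.g.:

    $ grep wait_sum /proc/self/sched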



--
Thanks
Yafang