Re: [RFC PATCH 2/5] sched/events: Introduce cfs_rq load tracking trace event

From: Peter Zijlstra
Date: Tue Mar 28 2017 - 12:46:35 EST


On Tue, Mar 28, 2017 at 10:46:00AM -0400, Steven Rostedt wrote:
> On Tue, 28 Mar 2017 07:35:38 +0100
> Dietmar Eggemann <dietmar.eggemann@xxxxxxx> wrote:
>
> > /* This part must be outside protection */
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 03adf9fb48b1..ac19ab6ced8f 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -2950,6 +2950,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> > sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
> > }
> >
> > + if (cfs_rq)
> > + trace_sched_load_cfs_rq(cfs_rq);
> > +
>
> Please use TRACE_EVENT_CONDITION(), and test for cfs_rq not NULL.
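
For reference, such a conditional event would look roughly like the
sketch below (the field list is illustrative only, not taken from
Dietmar's patch; the interesting bit is TP_CONDITION()):

	/* sketch; goes in a trace header such as include/trace/events/sched.h */
	TRACE_EVENT_CONDITION(sched_load_cfs_rq,

		TP_PROTO(struct cfs_rq *cfs_rq),

		TP_ARGS(cfs_rq),

		/* suppress the event entirely when cfs_rq is NULL */
		TP_CONDITION(cfs_rq),

		TP_STRUCT__entry(
			__field(	unsigned long,	load_avg	)
			__field(	unsigned long,	util_avg	)
		),

		TP_fast_assign(
			__entry->load_avg	= cfs_rq->avg.load_avg;
			__entry->util_avg	= cfs_rq->avg.util_avg;
		),

		TP_printk("load_avg=%lu util_avg=%lu",
			  __entry->load_avg, __entry->util_avg)
	);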

I too suggested that; but then I looked again at that code and we can
actually do it this way: cfs_rq can be constant-propagated and the if
determined at build time.
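
To illustrate the mechanism with a toy example (names made up, not the
actual scheduler code): because the inner function is __always_inline,
a literal NULL argument is a compile-time constant at the call site and
the compiler folds the test away.

	struct foo { unsigned long cnt; };

	static __always_inline int __helper(int x, struct foo *f)
	{
		if (f)			/* folded away when f is a literal NULL */
			f->cnt += x;

		return x * 2;
	}

	static int helper_nofoo(int x)
	{
		return __helper(x, NULL);	/* no test, no store emitted */
	}

	static int helper_foo(int x, struct foo *f)
	{
		return __helper(x, f);		/* test stays, f not constant here */
	}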

It's not immediately obvious from the current code, but if we do
something like the below, it should be clearer.

---
Subject: sched/fair: Explicitly generate __update_load_avg() instances
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Tue Mar 28 11:08:20 CEST 2017

The __update_load_avg() function is __always_inline because it is used
with constant propagation to generate different variants of the code
without having to duplicate it (which would be prone to bugs).
Explicitly instantiate the 3 variants.

Note that most of this is called from rather hot paths, so reducing
branches is good.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2849,7 +2849,7 @@ static u32 __compute_runnable_contrib(u6
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
u64 delta, scaled_delta, periods;
@@ -2953,6 +2953,26 @@ __update_load_avg(u64 now, int cpu, stru
return decayed;
}

+static int
+__update_load_avg_blocked_se(u64 now, int cpu, struct sched_avg *sa)
+{
+ return ___update_load_avg(now, cpu, sa, 0, 0, NULL);
+}
+
+static int
+__update_load_avg_se(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running)
+{
+ return ___update_load_avg(now, cpu, sa, weight, running, NULL);
+}
+
+static int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+ return ___update_load_avg(now, cpu, sa, weight, running, cfs_rq);
+}
+
/*
* Signed add and clamp on underflow.
*
@@ -3014,6 +3034,9 @@ static inline void update_tg_load_avg(st
void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next)
{
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
if (!sched_feat(ATTACH_AGE_LOAD))
return;

@@ -3024,11 +3047,11 @@ void set_task_rq_fair(struct sched_entit
* time. This will result in the wakee task is less decayed, but giving
* the wakee more load sounds not bad.
*/
- if (se->avg.last_update_time && prev) {
- u64 p_last_update_time;
- u64 n_last_update_time;
+ if (!(se->avg.last_update_time && prev))
+ return;

#ifndef CONFIG_64BIT
+ {
u64 p_last_update_time_copy;
u64 n_last_update_time_copy;

@@ -3043,14 +3066,15 @@ void set_task_rq_fair(struct sched_entit

} while (p_last_update_time != p_last_update_time_copy ||
n_last_update_time != n_last_update_time_copy);
+ }
#else
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
#endif
- __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
- &se->avg, 0, 0, NULL);
- se->avg.last_update_time = n_last_update_time;
- }
+ __update_load_avg_blocked_se(p_last_update_time,
+ cpu_of(rq_of(prev)),
+ &se->avg);
+ se->avg.last_update_time = n_last_update_time;
}

/* Take into account change of utilization of a child task group */
@@ -3329,9 +3353,9 @@ static inline void update_load_avg(struc
* track group sched_entity load average for task_h_load calc in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
- __update_load_avg(now, cpu, &se->avg,
+ __update_load_avg_se(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
+ cfs_rq->curr == se);
}

decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
@@ -3437,7 +3461,7 @@ void sync_entity_load_avg(struct sched_e
u64 last_update_time;

last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg);
}

/*