Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()
From: Peter Zijlstra
Date: Fri Mar 31 2017 - 07:24:25 EST
On Fri, Mar 31, 2017 at 02:58:57AM -0700, Paul Turner wrote:
> > So lets pull it out again -- but I don't think we need to undo all of
> > yuyang's patches for that. So please, look at the patch I proposed for
> > the problem you spotted. Lets fix the current state and take it from
> > there, ok?
>
> Oh, absolutely. I'm really not proposing re-vulcanizing any
> rubber here. Just saying that this particular problem is just one
> facet of the weight mixing. 100% agree on fixing this as is and
> iterating. I was actually thinking that these changes (which really
> nicely simplify the calculation) greatly simplify restoring the
> separation there.
So on top of the previous patch, I've arrived (without testing, and
without considering numerical overflow much) at the following patch.
It's known to be broken for things like update_cfs_rq_load_avg(), where
doing this will unconditionally flatten load_sum, which seems undesirable.
Need to think more on this.
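For reference, a standalone sketch of what "flatten" means here (userspace
only, illustrative weights, not kernel code): update_cfs_rq_load_avg()
removes a dead entity's *weighted* load_avg contribution, and with an
unweighted load_sum there is no matching weighted term to subtract from
the sum, so it has to be rebuilt from the average:

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX    47742   /* saturated value of the geometric sum */

int main(void)
{
    /* cfs_rq with two fully decayed-in entities, weights 1024 and 2048 */
    uint64_t weight   = 1024 + 2048;
    uint32_t load_sum = LOAD_AVG_MAX;   /* unweighted sum, as per this patch */
    uint64_t load_avg = weight * load_sum / LOAD_AVG_MAX;

    /* the weight-2048 entity goes away; fixing up the average is easy */
    load_avg -= 2048;
    weight   -= 2048;

    /*
     * ... but load_sum no longer carries weight, so there is nothing
     * weighted left to subtract (the current code subtracts roughly
     * r * LOAD_AVG_MAX there); all we can do is rebuild the sum from
     * the average, i.e. flatten it, discarding its actual decay state.
     */
    load_sum = weight ? (uint32_t)(load_avg * LOAD_AVG_MAX / weight) : 0;

    printf("load_avg=%llu load_sum=%u\n",
           (unsigned long long)load_avg, (unsigned)load_sum);
    return 0;
}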
Also, I've pulled out the cpufreq scaling, which I'm not sure is the
right thing to do. Vincent wants to scale time, which would be persistent
in the history, unlike this approach.
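To make that difference concrete, a rough userspace sketch (the decay
constant, the loop, and the reading of "scale time" as putting capacity
into the accumulated sum are all simplifications of mine):

#include <stdio.h>

int main(void)
{
    double y = 0.978572;            /* per-period decay, y^32 ~= 0.5 */
    double scale_freq = 0.5;        /* ran at half capacity so far */
    double sum_in_history = 0.0;    /* capacity folded into the sum */
    double sum_raw_time = 0.0;      /* this patch: raw runnable time */
    int p;

    /* 100 fully-runnable 1024us periods at 50% capacity */
    for (p = 0; p < 100; p++) {
        sum_in_history = sum_in_history * y + 1024.0 * scale_freq;
        sum_raw_time   = sum_raw_time   * y + 1024.0;
    }

    /* the frequency now jumps back to full capacity */
    scale_freq = 1.0;

    /*
     * Folding capacity into the sum keeps the old 50% visible until it
     * decays out; applying the (current) capacity at *_avg time rescales
     * the entire history immediately.
     */
    printf("persistent-in-history sum: %.0f\n", sum_in_history);
    printf("scaled-at-avg-time sum:    %.0f\n", sum_raw_time * scale_freq);
    return 0;
}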
But yes, this looks doable and simpler than before.
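The gist of the rearrangement, as a trivial sketch (illustrative numbers
only): keep the geometric sum a pure function of runnable time and apply
weight once, when the sum is turned into an average. That is also why
load_sum can shrink to a u32 and why the init/propagate sites below can
use a bare LOAD_AVG_MAX.

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX    47742

int main(void)
{
    uint64_t weight = 3072;     /* illustrative entity/cfs_rq weight */

    /* old scheme: weight folded into every term of the sum */
    uint64_t old_sum = weight * LOAD_AVG_MAX;   /* can exceed 32 bits */
    uint64_t old_avg = old_sum / LOAD_AVG_MAX;

    /* new scheme: the sum is pure (here: saturated) runnable time ... */
    uint32_t new_sum = LOAD_AVG_MAX;            /* always fits in a u32 */
    /* ... and weight is applied once, when forming the average */
    uint64_t new_avg = weight * new_sum / LOAD_AVG_MAX;

    printf("old_avg=%llu new_avg=%llu\n",
           (unsigned long long)old_avg, (unsigned long long)new_avg);
    return 0;
}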
---
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -313,7 +313,7 @@ struct load_weight {
*/
struct sched_avg {
u64 last_update_time;
- u64 load_sum;
+ u32 load_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -747,8 +747,8 @@ void init_entity_runnable_average(struct
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
- sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+ sa->load_avg = se->load.weight;
+ sa->load_sum = LOAD_AVG_MAX;
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
*/
@@ -2820,15 +2820,11 @@ static u32 __accumulate_pelt_segments(u6
*/
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ bool load, bool running, struct cfs_rq *cfs_rq)
{
- unsigned long scale_freq, scale_cpu;
- u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+ u32 contrib = (u32)delta;
u64 periods;
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
delta += sa->period_contrib;
periods = delta / 1024; /* A period is 1024us (~1ms) */
@@ -2852,14 +2848,13 @@ accumulate_sum(u64 delta, int cpu, struc
}
sa->period_contrib = delta;
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
+ if (load) {
+ sa->load_sum += contrib;
if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
+ cfs_rq->runnable_load_sum += contrib;
}
if (running)
- sa->util_sum += contrib * scale_cpu;
+ sa->util_sum += contrib;
return periods;
}
@@ -2894,9 +2889,10 @@ accumulate_sum(u64 delta, int cpu, struc
*/
static __always_inline int
___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ unsigned long weight, bool running, struct cfs_rq *cfs_rq)
{
- u64 delta;
+ unsigned long scale_freq, scale_cpu;
+ u64 delta, load;
delta = now - sa->last_update_time;
/*
@@ -2924,18 +2920,22 @@ ___update_load_avg(u64 now, int cpu, str
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ if (!accumulate_sum(delta, cpu, sa, !!weight, running, cfs_rq))
return 0;
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ load = cap_scale(weight, scale_freq);
+ sa->load_avg = div_u64(load * sa->load_sum, LOAD_AVG_MAX);
if (cfs_rq) {
cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ div_u64(load * cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
}
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ sa->util_avg = (cap_scale(scale_cpu, scale_freq) * sa->util_sum) / LOAD_AVG_MAX;
return 1;
}
@@ -2950,7 +2950,7 @@ static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return ___update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
+ se->on_rq * se->load.weight,
cfs_rq->curr == se, NULL);
}
@@ -2958,8 +2958,7 @@ static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
return ___update_load_avg(now, cpu, &cfs_rq->avg,
- scale_load_down(cfs_rq->load.weight),
- cfs_rq->curr != NULL, cfs_rq);
+ cfs_rq->load.weight, cfs_rq->curr != NULL, cfs_rq);
}
/*
@@ -3130,11 +3129,11 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq
/* Set new sched_entity's load */
se->avg.load_avg = load;
- se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+ se->avg.load_sum = LOAD_AVG_MAX;
/* Update parent cfs_rq load */
add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+ cfs_rq->avg.load_sum = LOAD_AVG_MAX;
/*
* If the sched_entity is already enqueued, we also have to update the
@@ -3143,7 +3142,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq
if (se->on_rq) {
/* Update parent cfs_rq runnable_load_avg */
add_positive(&cfs_rq->runnable_load_avg, delta);
- cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ cfs_rq->runnable_load_sum = LOAD_AVG_MAX;
}
}