Re: [RFC][PATCH 03/14] sched/fair: Remove se->load.weight from se->avg.load_sum
From: Vincent Guittot
Date: Wed May 17 2017 - 05:51:21 EST
On Wednesday 17 May 2017 at 09:04:47 (+0200), Vincent Guittot wrote:
> Hi Peter,
>
> On 12 May 2017 at 18:44, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> > Remove the load from the load_sum for sched_entities, basically
> > turning load_sum into runnable_sum. This prepares for better
> > reweighting of group entities.
> >
> > Since we now have different rules for computing load_avg, split
> > ___update_load_avg() into two parts, ___update_load_sum() and
> > ___update_load_avg().
> >
> > So for se:
> >
> > ___update_load_sum(.weight = 1)
> > ___update_load_avg(.weight = se->load.weight)
> >
> > and for cfs_rq:
> >
> > ___update_load_sum(.weight = cfs_rq->load.weight)
> > ___update_load_avg(.weight = 1)
> >
> > Since the primary consumable is load_avg, most things will not be
> > affected. Only those few sites that initialize/modify load_sum need
> > attention.
>
> I wonder if there is a problem with this new way of computing the se's
> load_avg and the cfs_rq's load_avg when a task changes its nice prio just
> before migrating to another CPU.
>
> The se's load_avg is now: runnable x current weight,
> but the cfs_rq's load_avg keeps the history built with the se's previous
> weight. When we detach the se, we remove its up-to-date load_avg from a
> cfs_rq whose own load_avg doesn't reflect that new weight yet.
> So if the se's weight decreases just before migrating, some load_avg stays
> in the prev cfs_rq, and if its weight increases, we remove too much
> load_avg and can make the cfs_rq's load_avg null while other tasks are
> still running.
>
> Thoughts?
>
> I'm able to reproduce the problem with a simple rt-app use case (after
> adding a new feature in rt-app)
>
> Vincent
>
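To make the scenario quoted above concrete, here is a rough user-space sketch
of the arithmetic (not kernel code; the weights come from
sched_prio_to_weight[], a fully runnable task is assumed, and the numbers are
only illustrative, not taken from a trace):

#include <stdio.h>

int main(void)
{
	/* the task built its history on the prev cfs_rq at nice 0 ... */
	unsigned long old_weight = 1024;	/* nice   0 */
	/* ... and is reniced to -10 right before migrating */
	unsigned long new_weight = 9548;	/* nice -10 */

	/* its contribution still present in prev cfs_rq->avg.load_avg */
	unsigned long cfs_rq_contrib = old_weight;
	/* se->avg.load_avg, already scaled by the new weight */
	unsigned long se_load_avg = new_weight;

	printf("removed on detach: %lu, actually present: %lu\n",
	       se_load_avg, cfs_rq_contrib);
	printf("difference clamped away by sub_positive(): %lu\n",
	       se_load_avg - cfs_rq_contrib);
	return 0;
}

With a weight increase the detach (detach_entity_load_avg()) removes ~9548
where only ~1024 is present, so sub_positive() can clamp the prev cfs_rq's
load_avg to 0 even though other tasks are still runnable; with a weight
decrease the opposite happens and roughly (old - new) weight worth of
load_avg is left behind.
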
The hack below fixes the problem mentioned above: it applies to tasks what is
already done when updating a group entity's weight.
---
kernel/sched/core.c | 26 +++++++++++++++++++-------
kernel/sched/fair.c | 20 ++++++++++++++++++++
2 files changed, 39 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 09e0996..f327ab6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -732,7 +732,9 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-static void set_load_weight(struct task_struct *p)
+extern void reweight_task(struct task_struct *p, int prio);
+
+static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
@@ -746,8 +748,18 @@ static void set_load_weight(struct task_struct *p)
return;
}
- load->weight = scale_load(sched_prio_to_weight[prio]);
- load->inv_weight = sched_prio_to_wmult[prio];
+ /*
+ * SCHED_OTHER tasks have to update their load when changing their
+ * weight
+ */
+ if (update_load) {
+ reweight_task(p, prio);
+ } else {
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
+ }
+
+
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -2373,7 +2385,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = __normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, false);
/*
* We don't need the reset flag anymore after the fork. It has
@@ -3854,7 +3866,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
+ set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
delta = p->prio - old_prio;
@@ -4051,7 +4063,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, fair_policy(policy));
}
/* Actually do priority change: must hold pi & rq lock. */
@@ -6152,7 +6164,7 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
- set_load_weight(&init_task);
+ set_load_weight(&init_task, false);
/*
* The boot idle thread does lazy MMU switching as well:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bd2f9f5..3853e34 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2825,6 +2825,26 @@ __sub_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
}
+void reweight_task(struct task_struct *p, int prio)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct load_weight *load = &p->se.load;
+
+ u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+
+ __sub_load_avg(cfs_rq, se);
+
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
+
+ se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
+ se->avg.runnable_load_avg =
+ div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
+
+ __add_load_avg(cfs_rq, se);
+}
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight, unsigned long runnable)
{
--
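
For reference, the rescaling in reweight_task() is the usual PELT relation
between _sum and _avg: the _sum is now weight-free, so the _avg is simply
weight * sum divided by the maximum sum reachable at this point of the
current 1024us period. A minimal user-space sketch of that step, assuming
the kernel's LOAD_AVG_MAX of 47742 (rescale_load_avg() is only a stand-in
name, not a kernel function):

#include <stdint.h>
#include <stdio.h>

#define LOAD_AVG_MAX	47742	/* maximum PELT sum */

static uint64_t rescale_load_avg(uint64_t load_sum, unsigned long weight,
				 uint32_t period_contrib)
{
	/* same divider as in reweight_task() above */
	uint32_t divider = LOAD_AVG_MAX - 1024 + period_contrib;

	return (weight * load_sum) / divider;
}

int main(void)
{
	/* a fully runnable se, half-way into its current period */
	uint32_t contrib = 512;
	uint64_t sum = LOAD_AVG_MAX - 1024 + contrib;

	printf("nice   0: load_avg = %llu\n",
	       (unsigned long long)rescale_load_avg(sum, 1024, contrib));
	printf("nice -10: load_avg = %llu\n",
	       (unsigned long long)rescale_load_avg(sum, 9548, contrib));
	return 0;
}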