[PATCH v2 2/2] sched: make it possible to account fair class load avg consistently
From: byungchul.park
Date: Wed Oct 14 2015 - 04:44:21 EST
From: Byungchul Park <byungchul.park@xxxxxxx>
Thanks to ATTACH_AGE_LOAD, the current code can account the fair class
load avg for the time a task spent in another class, e.g. rt or dl.
However, it does not work when a migration or a group change happens
while the task is in one of those other classes.
This patch introduces a more general solution so that the fair class
load avg is accounted consistently, and removes the odd coupling
between se->avg.last_update_time and the check for whether a migration
happened.

Additionally, it cleans up the code a little.
Signed-off-by: Byungchul Park <byungchul.park@xxxxxxx>
---
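Note for reviewers (not for the changelog): below is a minimal stand-alone
sketch of the hand-off this patch sets up, using made-up types and with
__atomic_exchange_n() standing in for the kernel's xchg(). It only models how
migrate_task_rq_fair() marks the task, how enqueue_task_fair() consumes the
mark into ENQUEUE_MIGRATED, and how enqueue_entity_load_avg() then tells
"migrated" and "newly created" apart from the ordinary aging path.

	#include <stdio.h>

	#define ENQUEUE_MIGRATED	16

	struct task_model {
		int migrated;				/* set at migration, cleared at the next enqueue */
		unsigned long long last_update_time;	/* 0 until the first attach */
	};

	/* Models migrate_task_rq_fair(): only record that a migration happened. */
	static void model_migrate(struct task_model *p)
	{
		p->migrated = 1;
	}

	/* Models enqueue_task_fair(): consume the mark exactly once (xchg in the patch). */
	static int model_enqueue_flags(struct task_model *p, int flags)
	{
		int was_migrated = __atomic_exchange_n(&p->migrated, 0, __ATOMIC_SEQ_CST);

		return flags | (was_migrated ? ENQUEUE_MIGRATED : 0);
	}

	/* Models enqueue_entity_load_avg(): age only when neither migrated nor newly created. */
	static void model_enqueue_load_avg(struct task_model *p, int flags)
	{
		int migrated = flags & ENQUEUE_MIGRATED;
		int created = !p->last_update_time;

		if (!migrated && !created)
			printf("age the load avg against the old cfs_rq clock\n");
		else
			printf("attach the load avg to the new cfs_rq\n");

		p->last_update_time = 1;	/* pretend we synced to the cfs_rq clock */
	}

	int main(void)
	{
		struct task_model p = { 0, 0 };

		model_enqueue_load_avg(&p, model_enqueue_flags(&p, 0));	/* new task: attach */
		model_enqueue_load_avg(&p, model_enqueue_flags(&p, 0));	/* ordinary enqueue: age */
		model_migrate(&p);
		model_enqueue_load_avg(&p, model_enqueue_flags(&p, 0));	/* after migration: attach */
		return 0;
	}

The xchg() makes the mark one-shot, so the enqueue path no longer leans on
se->avg.last_update_time being zeroed, which is exactly the coupling the
changelog above complains about.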
include/linux/sched.h | 3 +++
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 72 +++++++++++++++++++++++++++++++++----------------
kernel/sched/sched.h | 1 +
4 files changed, 54 insertions(+), 23 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 699228b..a104c72 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1379,6 +1379,9 @@ struct task_struct {
#endif
int on_rq;
+ /* Set when the task has migrated; cleared at the next enqueue. */
+ int migrated;
+
int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a91df61..57f4300 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2068,6 +2068,7 @@ void __dl_clear_params(struct task_struct *p)
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->on_rq = 0;
+ p->migrated = 0;
p->se.on_rq = 0;
p->se.exec_start = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 07882c2..52e7d85 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2771,14 +2771,15 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/* Add the load generated by se into cfs_rq's load average */
static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
struct sched_avg *sa = &se->avg;
u64 now = cfs_rq_clock_task(cfs_rq);
- int migrated, decayed;
+ int decayed;
+ int migrated = flags & ENQUEUE_MIGRATED;
+ int created = !(sa->last_update_time);
- migrated = !sa->last_update_time;
- if (!migrated) {
+ if (!migrated && !created) {
__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
@@ -2789,10 +2790,10 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated)
+ if (migrated || created)
attach_entity_load_avg(cfs_rq, se);
- if (decayed || migrated)
+ if (decayed || migrated || created)
update_tg_load_avg(cfs_rq, 0);
}
@@ -2808,15 +2809,10 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
+/* Safe to use even when rq->lock may not be held. */
+static inline u64 get_last_update_time(struct cfs_rq *cfs_rq)
{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 last_update_time;
-
#ifndef CONFIG_64BIT
u64 last_update_time_copy;
@@ -2828,6 +2824,17 @@ void remove_entity_load_avg(struct sched_entity *se)
#else
last_update_time = cfs_rq->avg.last_update_time;
#endif
+ return last_update_time;
+}
+
+/*
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
+ */
+void remove_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time = get_last_update_time(cfs_rq);
__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
@@ -2868,7 +2875,7 @@ static int idle_balance(struct rq *this_rq);
static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
@@ -3008,7 +3015,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- enqueue_entity_load_avg(cfs_rq, se);
+ enqueue_entity_load_avg(cfs_rq, se, flags);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -4136,6 +4143,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ flags = flags | (xchg(&p->migrated, 0) ? ENQUEUE_MIGRATED : 0);
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -5021,7 +5029,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
remove_entity_load_avg(&p->se);
/* Tell new CPU we are migrated */
- p->se.avg.last_update_time = 0;
+ p->migrated = 1;
/* We have migrated, no longer consider this task hot */
p->se.exec_start = 0;
@@ -8080,20 +8088,38 @@ static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
set_task_rq(p, task_cpu(p));
-
-#ifdef CONFIG_SMP
- /* Tell se's cfs_rq has been changed -- migrated */
- p->se.avg.last_update_time = 0;
-#endif
attach_task_cfs_rq(p);
}
+/*
+ * Called immediately before a task is set to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu. However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including the state of rq->lock, should be made.
+ */
static void set_task_rq_fair(struct task_struct *p, unsigned int cpu)
{
struct task_group *tg = task_group(p);
+ struct sched_entity *se = &p->se;
+
+ /*
+ * The cfs_rq used as the base for aging/decaying is about to
+ * change. Age the load here against the old cfs_rq, while it is
+ * still in place.
+ */
+ if (se->avg.last_update_time)
+ __update_load_avg(get_last_update_time(cfs_rq_of(se)), task_cpu(p), &se->avg,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq_of(se)->curr == se, NULL);
+
+ se->cfs_rq = tg->cfs_rq[cpu];
+ se->parent = tg->se[cpu];
- p->se.cfs_rq = tg->cfs_rq[cpu];
- p->se.parent = tg->se[cpu];
+ /*
+ * se's cfs_rq has changed, so its base time for aging/decaying
+ * must be updated as well.
+ */
+ se->avg.last_update_time = get_last_update_time(cfs_rq_of(se));
}
void free_fair_sched_group(struct task_group *tg)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7f73e89..1897c85 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1095,6 +1095,7 @@ static const u32 prio_to_wmult[40] = {
#define ENQUEUE_WAKING 0
#endif
#define ENQUEUE_REPLENISH 8
+#define ENQUEUE_MIGRATED 16
#define DEQUEUE_SLEEP 1
--
1.7.9.5