[PATCH 14/14] sched,fair: propagate sum_exec_runtime up the hierarchy

From: Rik van Riel
Date: Mon Jul 22 2019 - 13:35:02 EST


Now that enqueue_task_fair and dequeue_task_fair no longer iterate up
the hierarchy all the time, a method to lazily propagate sum_exec_runtime
up the hierarchy is necessary.

Once a tick, propagate the newly accumulated exec_runtime up the hierarchy,
and feed it into CFS bandwidth control.

Remove the pointless call to account_cfs_rq_runtime from update_curr,
which is always called with a root cfs_rq.

Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
---
include/linux/sched.h | 1 +
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 23 +++++++++++++++++++++--
3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 901c710363e7..bdca15b3afe7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -454,6 +454,7 @@ struct sched_entity {
int depth;
unsigned long enqueued_h_load;
unsigned long enqueued_h_weight;
+ u64 propagated_exec_runtime;
struct load_weight h_load;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fbd96900f715..9915d20e84a9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2137,6 +2137,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.propagated_exec_runtime = 0;
p->se.cfs_rq = NULL;
#endif

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4c7e1818efba..0bb8a7e92f07 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -898,8 +898,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cgroup_account_cputime(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
-
- account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static void update_curr_fair(struct rq *rq)
@@ -3405,6 +3403,20 @@ static inline bool skip_blocked_update(struct sched_entity *se)
return true;
}

+static void propagate_exec_runtime(struct cfs_rq *cfs_rq,
+ struct sched_entity *se)
+{
+ struct sched_entity *parent = se->parent;
+ u64 diff = se->sum_exec_runtime - se->propagated_exec_runtime;
+
+ if (parent) {
+ parent->sum_exec_runtime += diff;
+ account_cfs_rq_runtime(cfs_rq, diff);
+ }
+
+ se->propagated_exec_runtime = se->sum_exec_runtime;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */

static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -3416,6 +3428,11 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)

static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}

+static void propagate_exec_runtime(struct cfs_rq *cfs_rq,
+ struct sched_entity *se);
+{
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */

/**
@@ -10140,9 +10157,11 @@ static void propagate_entity_cfs_rq(struct sched_entity *se, int flags)
if (cfs_rq_throttled(cfs_rq))
break;

+ /* Walk the hierarchy while PELT says there is work to do. */
if (!update_load_avg(cfs_rq, se, flags))
break;

+ propagate_exec_runtime(cfs_rq, se);
update_cfs_group(se);
}
}
--
2.20.1