[RFC 50/60] cosched: Propagate load changes across hierarchy levels

From: Jan H. Schönherr
Date: Fri Sep 07 2018 - 17:49:38 EST


The weight of an SD-SE is defined to be the average weight of all
runqueues that it represents. Hence, its weight should change whenever
one of the child runqueues changes its weight. However, as the SD-SE
and its child runqueues live on different hierarchy levels, they are
protected by different locks. To reduce lock contention, we want to
avoid holding higher-level locks for prolonged periods of time, if
possible.
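
For example (illustrative numbers only): if an SD-SE spans four CPUs
whose child runqueues currently carry weights 1024, 2048, 0, and 1024,
the SD-SE weight should end up at (1024 + 2048 + 0 + 1024) / 4 = 1024.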

Therefore, we update an aggregated weight -- sdrq->sdse_load -- in a
lock-free manner during enqueue and dequeue at the lower level, and
defer the actual SD-SE weight adjustment to update_sdse_load(), which
runs once we hold the higher-level lock anyway.
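
As a rough userspace model of that scheme (illustrative only; the
struct and function names below are simplified stand-ins for the
kernel's atomic64_t, struct sdrq and reweight_entity(), and the
calculation assumes a homogeneous topology, like the patch's
update_sdse_load() does):

	#include <stdatomic.h>
	#include <stdio.h>

	/* Stand-in for struct sdrq: aggregated child weight plus span sizes. */
	struct sdrq_model {
		_Atomic long sdse_load;    /* sum of child runqueue weights, updated lock-free */
		unsigned long span_weight; /* number of CPUs covered by this level */
		unsigned long child_span;  /* number of CPUs covered by one child */
	};

	/* Lower level, under the child runqueue's lock only. */
	static void model_enqueue(struct sdrq_model *parent, long weight)
	{
		atomic_fetch_add(&parent->sdse_load, weight);
	}

	static void model_dequeue(struct sdrq_model *parent, long weight)
	{
		atomic_fetch_sub(&parent->sdse_load, weight);
	}

	/*
	 * Higher level, run once the higher-level lock is actually held:
	 * fold the aggregate into the SD-SE weight (average over the span).
	 */
	static long model_update_sdse_load(struct sdrq_model *sdrq)
	{
		long load = atomic_load(&sdrq->sdse_load);

		return load * sdrq->child_span / sdrq->span_weight;
	}

	int main(void)
	{
		struct sdrq_model sd = { .span_weight = 4, .child_span = 1 };

		model_enqueue(&sd, 1024);
		model_enqueue(&sd, 2048);
		/* prints 768, i.e. (1024 + 2048) / 4 */
		printf("SD-SE weight: %ld\n", model_update_sdse_load(&sd));
		return 0;
	}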

At some point in the future (the code isn't there yet), this will
allow software combining, where not all CPUs have to walk up the
full hierarchy on enqueue/dequeue.

Signed-off-by: Jan H. Schönherr <jschoenh@xxxxxxxxx>
---
kernel/sched/fair.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 55 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0dc4d289497c..1eee262ecf88 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2740,6 +2740,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+#ifdef CONFIG_COSCHEDULING
+ if (!cfs_rq->sdrq.is_root && !cfs_rq->throttled)
+ atomic64_add(se->load.weight, &cfs_rq->sdrq.sd_parent->sdse_load);
+#endif
update_load_add(&cfs_rq->load, se->load.weight);
if (!parent_entity(se) || is_sd_se(parent_entity(se)))
update_load_add(&hrq_of(cfs_rq)->load, se->load.weight);
@@ -2757,6 +2761,10 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+#ifdef CONFIG_COSCHEDULING
+ if (!cfs_rq->sdrq.is_root && !cfs_rq->throttled)
+ atomic64_sub(se->load.weight, &cfs_rq->sdrq.sd_parent->sdse_load);
+#endif
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se) || is_sd_se(parent_entity(se)))
update_load_sub(&hrq_of(cfs_rq)->load, se->load.weight);
@@ -3083,6 +3091,35 @@ static inline void update_cfs_group(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */

+#ifdef CONFIG_COSCHEDULING
+static void update_sdse_load(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct sdrq *sdrq = &cfs_rq->sdrq;
+ unsigned long load;
+
+ if (!is_sd_se(se))
+ return;
+
+ /* FIXME: the load calculation assumes a homogeneous topology */
+ load = atomic64_read(&sdrq->sdse_load);
+
+ if (!list_empty(&sdrq->children)) {
+ struct sdrq *entry;
+
+ entry = list_first_entry(&sdrq->children, struct sdrq, siblings);
+ load *= entry->data->span_weight;
+ }
+
+ load /= sdrq->data->span_weight;
+
+ /* FIXME: Use a proper runnable */
+ reweight_entity(cfs_rq, se, load, load);
+}
+#else /* !CONFIG_COSCHEDULING */
+static void update_sdse_load(struct sched_entity *se) { }
+#endif /* !CONFIG_COSCHEDULING */
+
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = hrq_of(cfs_rq);
@@ -4527,6 +4564,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)

se = cfs_rq->my_se;

+#ifdef CONFIG_COSCHEDULING
+ if (!cfs_rq->sdrq.is_root && !cfs_rq->throttled)
+ atomic64_sub(cfs_rq->load.weight,
+ &cfs_rq->sdrq.sd_parent->sdse_load);
+#endif
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
@@ -4538,6 +4580,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_rq *qcfs_rq = cfs_rq_of(se);

rq_chain_lock(&rc, se);
+ update_sdse_load(se);
+
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
break;
@@ -4590,6 +4634,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->my_se;

cfs_rq->throttled = 0;
+#ifdef CONFIG_COSCHEDULING
+ if (!cfs_rq->sdrq.is_root && !cfs_rq->throttled)
+ atomic64_add(cfs_rq->load.weight,
+ &cfs_rq->sdrq.sd_parent->sdse_load);
+#endif

update_rq_clock(rq);

@@ -4608,6 +4657,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
rq_chain_init(&rc, rq);
for_each_sched_entity(se) {
rq_chain_lock(&rc, se);
+ update_sdse_load(se);
if (se->on_rq)
enqueue = 0;

@@ -5152,6 +5202,7 @@ bool enqueue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
rq_chain_init(&rc, rq);
for_each_sched_entity(se) {
rq_chain_lock(&rc, se);
+ update_sdse_load(se);
if (se->on_rq)
break;
cfs_rq = cfs_rq_of(se);
@@ -5173,6 +5224,7 @@ bool enqueue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
for_each_sched_entity(se) {
/* FIXME: taking locks up to the top is bad */
rq_chain_lock(&rc, se);
+ update_sdse_load(se);
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running += task_delta;

@@ -5235,6 +5287,7 @@ bool dequeue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
rq_chain_init(&rc, rq);
for_each_sched_entity(se) {
rq_chain_lock(&rc, se);
+ update_sdse_load(se);
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);

@@ -5269,6 +5322,7 @@ bool dequeue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
for_each_sched_entity(se) {
/* FIXME: taking locks up to the top is bad */
rq_chain_lock(&rc, se);
+ update_sdse_load(se);
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running -= task_delta;

@@ -9897,6 +9951,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)

for_each_sched_entity(se) {
rq_chain_lock(&rc, se);
+ update_sdse_load(se);
cfs_rq = cfs_rq_of(se);

if (cfs_rq_throttled(cfs_rq))
--
2.9.3.1.gcba166c.dirty