[rfc patch] sched/fair: Use instantaneous load for fork/exec balancing

From: Mike Galbraith
Date: Tue Jun 14 2016 - 03:58:40 EST


SUSE's regression testing noticed that...

0905f04eb21f sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task()

...introduced a hackbench regression, and indeed it did. I think this
regression has more to do with randomness than anything else, but in
general...

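While averaging calms down load balancing, helping to keep migrations
down to a dull roar, it's not completely wonderful when it comes to
things that live in the here and now, hackbench being one such.

So add an LB_INSTANTANEOUS_LOAD scheduler feature (default off). When
enabled, fork/exec balancing (find_idlest_group() and find_idlest_cpu())
uses the instantaneous cfs_rq->load.weight instead of runnable_load_avg.
All other callers of weighted_cpuload(), source_load() and target_load()
pass LOAD_AVERAGE and keep using the averaged load.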

time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real 0m55.397s
user 0m8.320s
sys 5m40.789s

echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features

real 0m48.049s
user 0m6.510s
sys 5m6.291s

Signed-off-by: Mike Galbraith <umgwanakikbuti@xxxxxxxxx>
---
kernel/sched/fair.c | 54 ++++++++++++++++++++++++------------------------
kernel/sched/features.h | 1
kernel/sched/sched.h | 6 +++++
3 files changed, 35 insertions(+), 26 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -738,7 +738,7 @@ void post_init_entity_util_avg(struct sc
}
}

-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq, int avg);
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
@@ -1229,9 +1229,9 @@ bool should_numa_migrate_memory(struct t
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}

-static unsigned long weighted_cpuload(const int cpu);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long weighted_cpuload(const int cpu, int avg);
+static unsigned long source_load(int cpu, int type, int avg);
+static unsigned long target_load(int cpu, int type, int avg);
static unsigned long capacity_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);

@@ -1261,7 +1261,7 @@ static void update_numa_stats(struct num
struct rq *rq = cpu_rq(cpu);

ns->nr_running += rq->nr_running;
- ns->load += weighted_cpuload(cpu);
+ ns->load += weighted_cpuload(cpu, LOAD_AVERAGE);
ns->compute_capacity += capacity_of(cpu);

cpus++;
@@ -3102,8 +3102,10 @@ void remove_entity_load_avg(struct sched
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}

-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq, int avg)
{
+ if (sched_feat(LB_INSTANTANEOUS_LOAD) && avg == LOAD_INSTANT)
+ return cfs_rq->load.weight;
return cfs_rq->runnable_load_avg;
}

@@ -4701,9 +4703,9 @@ static void cpu_load_update(struct rq *t
}

/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(const int cpu, int avg)
{
- return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+ return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs, avg);
}

#ifdef CONFIG_NO_HZ_COMMON
@@ -4748,7 +4750,7 @@ static void cpu_load_update_idle(struct
/*
* bail if there's load or we're actually up-to-date.
*/
- if (weighted_cpuload(cpu_of(this_rq)))
+ if (weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE))
return;

cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -4769,7 +4771,7 @@ void cpu_load_update_nohz_start(void)
* concurrently we'll exit nohz. And cpu_load write can race with
* cpu_load_update_idle() but both updater would be writing the same.
*/
- this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+ this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
}

/*
@@ -4784,7 +4786,7 @@ void cpu_load_update_nohz_stop(void)
if (curr_jiffies == this_rq->last_load_update_tick)
return;

- load = weighted_cpuload(cpu_of(this_rq));
+ load = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
raw_spin_lock(&this_rq->lock);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -4810,7 +4812,7 @@ static void cpu_load_update_periodic(str
*/
void cpu_load_update_active(struct rq *this_rq)
{
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ unsigned long load = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);

if (tick_nohz_tick_stopped())
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -4825,10 +4827,10 @@ void cpu_load_update_active(struct rq *t
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
-static unsigned long source_load(int cpu, int type)
+static unsigned long source_load(int cpu, int type, int avg)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(cpu, avg);

if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -4840,10 +4842,10 @@ static unsigned long source_load(int cpu
* Return a high guess at the load of a migration-target cpu weighted
* according to the scheduling class and "nice" value.
*/
-static unsigned long target_load(int cpu, int type)
+static unsigned long target_load(int cpu, int type, int avg)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(cpu, avg);

if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -4865,7 +4867,7 @@ static unsigned long cpu_avg_load_per_ta
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(cpu);
+ unsigned long load_avg = weighted_cpuload(cpu, LOAD_AVERAGE);

if (nr_running)
return load_avg / nr_running;
@@ -5047,8 +5049,8 @@ static int wake_affine(struct sched_doma
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
- load = source_load(prev_cpu, idx);
- this_load = target_load(this_cpu, idx);
+ load = source_load(prev_cpu, idx, LOAD_AVERAGE);
+ this_load = target_load(this_cpu, idx, LOAD_AVERAGE);

/*
* If sync wakeup then subtract the (maximum possible)
@@ -5136,9 +5138,9 @@ find_idlest_group(struct sched_domain *s
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
- load = source_load(i, load_idx);
+ load = source_load(i, load_idx, LOAD_INSTANT);
else
- load = target_load(i, load_idx);
+ load = target_load(i, load_idx, LOAD_INSTANT);

avg_load += load;
}
@@ -5197,7 +5199,7 @@ find_idlest_cpu(struct sched_group *grou
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
+ load = weighted_cpuload(i, LOAD_INSTANT);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
least_loaded_cpu = i;
@@ -6982,9 +6984,9 @@ static inline void update_sg_lb_stats(st

/* Bias balancing toward cpus of our domain */
if (local_group)
- load = target_load(i, load_idx);
+ load = target_load(i, load_idx, LOAD_AVERAGE);
else
- load = source_load(i, load_idx);
+ load = source_load(i, load_idx, LOAD_AVERAGE);

sgs->group_load += load;
sgs->group_util += cpu_util(i);
@@ -6998,7 +7000,7 @@ static inline void update_sg_lb_stats(st
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
- sgs->sum_weighted_load += weighted_cpuload(i);
+ sgs->sum_weighted_load += weighted_cpuload(i, LOAD_AVERAGE);
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -7510,7 +7512,7 @@ static struct rq *find_busiest_queue(str

capacity = capacity_of(i);

- wl = weighted_cpuload(i);
+ wl = weighted_cpuload(i, LOAD_AVERAGE);

/*
* When comparing with imbalance, use weighted_cpuload()
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(LB_INSTANTANEOUS_LOAD, false)

/*
* Decrement CPU capacity based on time not spent running tasks
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1630,6 +1630,12 @@ static inline void double_rq_unlock(stru
__release(rq2->lock);
}

+/*
+ * Tell load balancing functions whether we want instant or average load
+ */
+#define LOAD_INSTANT 0
+#define LOAD_AVERAGE 1
+
#else /* CONFIG_SMP */

/*