[RFC][PATCH 10/10] sched, fair: Rewrite group_imb trigger

From: Peter Zijlstra
Date: Mon Aug 19 2013 - 12:17:25 EST


Change the group_imb detection from the old 'load-spike' detector to
an actual imbalance detector. We set it from the lower domain balance
pass when it fails to create a balance in the presence of task
affinities.

The advantage is that this should no longer generate the false
positive group_imb conditions generated by transient load spikes from
the normal balancing/bulk-wakeup etc. behaviour.

While I haven't actually observed those they could happen.

I'm not entirely happy with this patch; it somehow feels a little
fragile.

Nor does it solve the biggest issue I have with the group_imb code; it
it still a fragile construct in that once we 'fixed' the imbalance
we'll not detect the group_imb again and could end up re-creating it.

That said, this patch does seem to preserve behaviour for the
described degenerate case. In particular on my 2*6*2 wsm-ep:

taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done'

ends up with 9 spinners, each on their own CPU; whereas if you disable
the group_imb code that typically doesn't happen (you'll get one pair
sharing a CPU most of the time).

Signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 90 +++++++++++++++++----------------------------------
kernel/sched/sched.h | 1
2 files changed, 31 insertions(+), 60 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3906,7 +3906,8 @@ static unsigned long __read_mostly max_l

#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
-#define LBF_SOME_PINNED 0x04
+#define LBF_DST_PINNED 0x04
+#define LBF_SOME_PINNED 0x08

struct lb_env {
struct sched_domain *sd;
@@ -3997,6 +3998,8 @@ int can_migrate_task(struct task_struct

schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

+ env->flags |= LBF_SOME_PINNED;
+
/*
* Remember if this task can be migrated to any other cpu in
* our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4008,13 @@ int can_migrate_task(struct task_struct
* Also avoid computing new_dst_cpu if we have already computed
* one in current iteration.
*/
- if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+ if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
return 0;

/* Prevent to re-select dst_cpu via env's cpus */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
- env->flags |= LBF_SOME_PINNED;
+ env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
}
@@ -4535,13 +4538,12 @@ fix_small_capacity(struct sched_domain *
* cpu 3 and leave one of the cpus in the second group unused.
*
* The current solution to this issue is detecting the skew in the first group
- * by noticing it has a cpu that is overloaded while the remaining cpus are
- * idle -- or rather, there's a distinct imbalance in the cpus; see
- * sg_imbalanced().
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
*
* When this is so detected; this group becomes a candidate for busiest; see
* update_sd_pick_busiest(). And calculcate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditional to allow it
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
* to create an effective group imbalance.
*
* This is a somewhat tricky proposition since the next run might not find the
@@ -4549,49 +4551,9 @@ fix_small_capacity(struct sched_domain *
* subtle and fragile situation.
*/

-struct sg_imb_stats {
- unsigned long max_nr_running, min_nr_running;
- unsigned long max_cpu_load, min_cpu_load;
-};
-
-static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+static inline int sg_imbalanced(struct sched_group *group)
{
- sgi->max_cpu_load = sgi->max_nr_running = 0UL;
- sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
-}
-
-static inline void
-update_sg_imb_stats(struct sg_imb_stats *sgi,
- unsigned long load, unsigned long nr_running)
-{
- if (load > sgi->max_cpu_load)
- sgi->max_cpu_load = load;
- if (sgi->min_cpu_load > load)
- sgi->min_cpu_load = load;
-
- if (nr_running > sgi->max_nr_running)
- sgi->max_nr_running = nr_running;
- if (sgi->min_nr_running > nr_running)
- sgi->min_nr_running = nr_running;
-}
-
-static inline int
-sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
-{
- /*
- * Consider the group unbalanced when the imbalance is larger
- * than the average weight of a task.
- *
- * APZ: with cgroup the avg task weight can vary wildly and
- * might not be a suitable number - should we keep a
- * normalized nr_running number somewhere that negates
- * the hierarchy?
- */
- if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
- (sgi->max_nr_running - sgi->min_nr_running) > 1)
- return 1;
-
- return 0;
+ return group->sgp->imbalance;
}

/**
@@ -4606,25 +4568,20 @@ static inline void update_sg_lb_stats(st
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs)
{
- struct sg_imb_stats sgi;
unsigned long nr_running;
unsigned long load;
int i;

- init_sg_imb_stats(&sgi);
-
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);

nr_running = rq->nr_running;

/* Bias balancing toward cpus of our domain */
- if (local_group) {
+ if (local_group)
load = target_load(i, load_idx);
- } else {
+ else
load = source_load(i, load_idx);
- update_sg_imb_stats(&sgi, load, nr_running);
- }

sgs->group_load += load;
sgs->sum_nr_running += nr_running;
@@ -4644,7 +4601,7 @@ static inline void update_sg_lb_stats(st
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

- sgs->group_imb = sg_imbalanced(sgs, &sgi);
+ sgs->group_imb = sg_imbalanced(group);

sgs->group_capacity =
DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
@@ -5167,6 +5124,7 @@ static int load_balance(int this_cpu, st
int *should_balance)
{
int ld_moved, cur_ld_moved, active_balance = 0;
+ struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
@@ -5269,11 +5227,11 @@ static int load_balance(int this_cpu, st
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
- env.flags &= ~LBF_SOME_PINNED;
+ env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;

@@ -5287,6 +5245,18 @@ static int load_balance(int this_cpu, st
goto more_balance;
}

+ /*
+ * We failed to reach balance because of affinity.
+ */
+ if (sd_parent) {
+ int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+ *group_imbalance = 1;
+ } else if (*group_imbalance)
+ *group_imbalance = 0;
+ }
+
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5690,7 +5660,7 @@ static void rebalance_domains(int cpu, e
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (load_balance(cpu, rq, sd, idle, &should_balance)) {
/*
- * The LBF_SOME_PINNED logic could have changed
+ * The LBF_DST_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -605,6 +605,7 @@ struct sched_group_power {
*/
unsigned int power, power_orig;
unsigned long next_update;
+ int imbalance; /* XXX unrelated to power but shared group state */
/*
* Number of busy cpus in this group.
*/


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/