Re: [PATCH RESEND] sched: prefer an idle cpu vs an idle sibling for BALANCE_WAKE

From: Mike Galbraith
Date: Sun Jul 05 2015 - 06:29:39 EST


On Fri, 2015-07-03 at 08:40 +0200, Mike Galbraith wrote:

> Hm. Seems what this load should like best is if we detect 1:N, skip all
> of the routine gyrations, ie move the N (workers) infrequently, expend
> search cycles frequently only on the 1 (dispatch).
>
> Ponder..

Since it was too hot to do outside chores (any excuse will do;)...

If we're (read /me) on track, the bellow should help. Per my tracing,
it may want a wee bit of toning down actually, though when I trace
virgin source I expect to see the same, namely Xorg and friends having
"wide-load" tattooed across their hindquarters earlier than they should.
It doesn't seem to hurt anything, but then demolishing a single llc box
is a tad more difficult than demolishing a NUMA box.


sched: beef up wake_wide()

Josef Bacik reported that Facebook sees better performance with their
1:N load (1 dispatch/node, N workers/node) when carrying an old patch
to try very hard to wake to an idle CPU. While looking at wake_wide(),
I noticed that it doesn't pay attention to wakeup of the 1:N waker,
returning 1 only when waking one of its N minions.

Correct that, and give the user the option to do an expensive balance IFF
select_idle_sibling() doesn't find an idle CPU, and IFF the wakee is the
the 1:N dispatcher of work, thus worth some extra effort.

Not-Signed-off-by: Mike Galbraith <umgwanakikbuti@xxxxxxxxx>
---
kernel/sched/fair.c | 89 +++++++++++++++++++++++++-----------------------
kernel/sched/features.h | 6 +++
2 files changed, 54 insertions(+), 41 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -666,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *c
}

#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int cpu, void *clear);
static unsigned long task_h_load(struct task_struct *p);

static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -1375,7 +1375,7 @@ static void task_numa_compare(struct tas
* Call select_idle_sibling to maybe find a better one.
*/
if (!cur)
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu, NULL);

assign:
task_numa_assign(env, cur, imp);
@@ -4730,26 +4730,30 @@ static long effective_load(struct task_g

#endif

+/*
+ * Detect 1:N waker/wakee relationship via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees. In order
+ * to determine whether we should let the load spread vs consolodating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of lls_size higher frequency in the other. With
+ * both conditions met, we can be relatively sure that we are seeing a 1:N
+ * relationship, and that load size exceeds socket size.
+ */
static int wake_wide(struct task_struct *p)
{
- int factor = this_cpu_read(sd_llc_size);
-
- /*
- * Yeah, it's the switching-frequency, could means many wakee or
- * rapidly switch, use factor here will just help to automatically
- * adjust the loose-degree, so bigger node will lead to more pull.
- */
- if (p->wakee_flips > factor) {
- /*
- * wakee is somewhat hot, it needs certain amount of cpu
- * resource, so if waker is far more hot, prefer to leave
- * it alone.
- */
- if (current->wakee_flips > (factor * p->wakee_flips))
- return 1;
+ unsigned long waker_flips = current->wakee_flips;
+ unsigned long wakee_flips = p->wakee_flips;
+ int factor = this_cpu_read(sd_llc_size), ret = 1;
+
+ if (waker_flips < wakee_flips) {
+ swap(waker_flips, wakee_flips);
+ /* Tell the caller that we're waking a 1:N waker */
+ ret += sched_feat(WAKE_WIDE_BALANCE);
}
-
- return 0;
+ if (wakee_flips < factor || waker_flips < wakee_flips * factor)
+ return 0;
+ return ret;
}

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4761,13 +4765,6 @@ static int wake_affine(struct sched_doma
unsigned long weight;
int balanced;

- /*
- * If we wake multiple tasks be careful to not bounce
- * ourselves around too much.
- */
- if (wake_wide(p))
- return 0;
-
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -4935,20 +4932,22 @@ find_idlest_cpu(struct sched_group *grou
/*
* Try and locate an idle CPU in the sched_domain.
*/
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int target, void *clear)
{
struct sched_domain *sd;
struct sched_group *sg;
int i = task_cpu(p);

if (idle_cpu(target))
- return target;
+ goto done;

/*
* If the prevous cpu is cache affine and idle, don't be stupid.
*/
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) {
+ target = i;
+ goto done;
+ }

/*
* Otherwise, iterate the domains and find an elegible idle cpu.
@@ -4973,7 +4972,11 @@ static int select_idle_sibling(struct ta
sg = sg->next;
} while (sg != sd->groups);
}
+ return target;
done:
+ if (clear)
+ *(void **)clear = 0;
+
return target;
}
/*
@@ -5021,14 +5024,19 @@ select_task_rq_fair(struct task_struct *
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int new_cpu = cpu;
- int want_affine = 0;
+ int new_cpu = prev_cpu;
+ int want_affine = 0, want_balance = 0;
int sync = wake_flags & WF_SYNC;

- if (sd_flag & SD_BALANCE_WAKE)
- want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
-
rcu_read_lock();
+ if (sd_flag & SD_BALANCE_WAKE) {
+ want_affine = wake_wide(p);
+ want_balance = want_affine > 1;
+ want_affine = !want_affine && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ if (!want_affine && !want_balance)
+ goto select;
+ }
+
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
continue;
@@ -5043,23 +5051,23 @@ select_task_rq_fair(struct task_struct *
break;
}

- if (tmp->flags & sd_flag)
+ if (tmp->flags & sd_flag || want_balance)
sd = tmp;
}

if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
+ new_cpu = cpu;

if (sd_flag & SD_BALANCE_WAKE) {
- new_cpu = select_idle_sibling(p, prev_cpu);
- goto unlock;
+select:
+ new_cpu = select_idle_sibling(p, new_cpu, &sd);
}

while (sd) {
struct sched_group *group;
int weight;

- if (!(sd->flags & sd_flag)) {
+ if (!(sd->flags & sd_flag) && !want_balance) {
sd = sd->child;
continue;
}
@@ -5089,7 +5097,6 @@ select_task_rq_fair(struct task_struct *
}
/* while loop will break here if sd == NULL */
}
-unlock:
rcu_read_unlock();

return new_cpu;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -96,3 +96,9 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
*/
SCHED_FEAT(NUMA_RESIST_LOWER, false)
#endif
+
+/*
+ * Perform expensive full wake balance for 1:N wakers when the
+ * selected cpu is not completely idle.
+ */
+SCHED_FEAT(WAKE_WIDE_BALANCE, false)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/