Re: [PATCH RESEND] sched: prefer an idle cpu vs an idle sibling for BALANCE_WAKE

From: Mike Galbraith
Date: Sun Jul 05 2015 - 04:24:38 EST


On Sat, 2015-07-04 at 17:57 +0200, Mike Galbraith wrote:

> If we're (read /me) on track, the below should help. Per my tracing,
> it may want a wee bit of toning down actually, though when I trace
> virgin source I expect to see the same, namely Xorg and friends having
> "wide-load" tattooed across their hindquarters earlier than they should.

That's true; the only difference is that the virgin kernel will still
try to pull the multi-waker when it is being awakened, which I can
easily imagine going either way performance-wise, depending on a pile of
variables... so let's just ask your box.

V2: drop select_idle_sibling() changes (ick), try to save some cycles.


sched: beef up wake_wide()

Josef Bacik reported that Facebook sees better performance with their
1:N load (1 dispatch/node, N workers/node) when carrying an old patch
to try very hard to wake to an idle CPU. While looking at wake_wide(),
I noticed that it doesn't pay attention to wakeup of the 1:N waker,
returning 1 only when the 1:N waker is waking one of its minions.

Correct that, and give the user the option to do an expensive balance IFF
select_idle_sibling() doesn't find an idle CPU, and IFF the wakee is the
1:N waker, the dispatcher of work, thus worth some extra effort. Don't
drill down through all domains though; stop searching at the highest one.
We'll either have found the desired completely idle CPU or, if heavily
loaded, the least loaded CPU of the least loaded group, which should still
add up to an average scheduling latency improvement (knock wood).


Not-Signed-off-by: Mike Galbraith <umgwanakikbuti@xxxxxxxxx>
---
include/linux/sched.h | 7 ++-
kernel/sched/fair.c | 86 ++++++++++++++++++++++++++++--------------------
kernel/sched/features.h | 6 +++
3 files changed, 61 insertions(+), 38 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -962,7 +962,8 @@ extern void wake_up_q(struct wake_q_head
#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
-#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
+#define SD_BALANCE_WAKE_IDLE 0x0020 /* Balance on wakeup, searching for an idle CPU */
+#define SD_WAKE_AFFINE 0x0040 /* Wake task to waking CPU */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
@@ -1353,9 +1354,9 @@ struct task_struct {
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
- struct task_struct *last_wakee;
- unsigned long wakee_flips;
+ unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
+ struct task_struct *last_wakee;

int wake_cpu;
#endif
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4730,26 +4730,30 @@ static long effective_load(struct task_g

#endif

+/*
+ * Detect 1:N waker/wakee relationship via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees. In order
+ * to determine whether we should let the load spread vs consolidating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher frequency in the other. With
+ * both conditions met, we can be relatively sure that we are seeing a 1:N
+ * relationship, and that load size exceeds socket size.
+ */
static int wake_wide(struct task_struct *p)
{
- int factor = this_cpu_read(sd_llc_size);
-
- /*
- * Yeah, it's the switching-frequency, could means many wakee or
- * rapidly switch, use factor here will just help to automatically
- * adjust the loose-degree, so bigger node will lead to more pull.
- */
- if (p->wakee_flips > factor) {
- /*
- * wakee is somewhat hot, it needs certain amount of cpu
- * resource, so if waker is far more hot, prefer to leave
- * it alone.
- */
- if (current->wakee_flips > (factor * p->wakee_flips))
- return 1;
+ unsigned int waker_flips = current->wakee_flips;
+ unsigned int wakee_flips = p->wakee_flips;
+ int factor = this_cpu_read(sd_llc_size), ret = 1;
+
+ if (waker_flips < wakee_flips) {
+ swap(waker_flips, wakee_flips);
+ /* Tell the caller that we're waking a 1:N waker */
+ ret += sched_feat(WAKE_WIDE_IDLE);
}
-
- return 0;
+ if (wakee_flips < factor || waker_flips < wakee_flips * factor)
+ return 0;
+ return ret;
}

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4761,13 +4765,6 @@ static int wake_affine(struct sched_doma
unsigned long weight;
int balanced;

- /*
- * If we wake multiple tasks be careful to not bounce
- * ourselves around too much.
- */
- if (wake_wide(p))
- return 0;
-
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -4865,6 +4862,10 @@ find_idlest_group(struct sched_domain *s
load = target_load(i, load_idx);

avg_load += load;
+
+ if (sd_flag & SD_BALANCE_WAKE_IDLE && idle_cpu(i) &&
+ cpumask_test_cpu(1, tsk_cpus_allowed(p)))
+ return group;
}

/* Adjust by relative CPU capacity of the group */
@@ -5021,14 +5022,21 @@ select_task_rq_fair(struct task_struct *
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int new_cpu = cpu;
- int want_affine = 0;
+ int new_cpu = prev_cpu;
+ int want_affine = 0, want_idle = 0;
int sync = wake_flags & WF_SYNC;

- if (sd_flag & SD_BALANCE_WAKE)
- want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
-
rcu_read_lock();
+ if (sd_flag & SD_BALANCE_WAKE) {
+ want_idle = wake_wide(p);
+ want_affine = !want_idle && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_idle = want_idle > 1;
+ if (!want_affine && !want_idle)
+ goto select;
+ if (want_idle)
+ sd_flag |= SD_BALANCE_WAKE_IDLE;
+ }
+
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
continue;
@@ -5043,23 +5051,25 @@ select_task_rq_fair(struct task_struct *
break;
}

- if (tmp->flags & sd_flag)
+ if (tmp->flags & sd_flag || want_idle)
sd = tmp;
}

if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
+ new_cpu = cpu;

if (sd_flag & SD_BALANCE_WAKE) {
- new_cpu = select_idle_sibling(p, prev_cpu);
- goto unlock;
+select:
+ new_cpu = select_idle_sibling(p, new_cpu);
+ if (want_idle && (new_cpu != prev_cpu || idle_cpu(prev_cpu)))
+ sd = NULL;
}

while (sd) {
struct sched_group *group;
int weight;

- if (!(sd->flags & sd_flag)) {
+ if (!(sd->flags & sd_flag) && !want_idle) {
sd = sd->child;
continue;
}
@@ -5077,6 +5087,13 @@ select_task_rq_fair(struct task_struct *
continue;
}

+ /*
+ * We've either found an idle CPU, or the least loaded CPU in
+ * the least loaded group of the highest domain. Good enough.
+ */
+ if (want_idle)
+ break;
+
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
weight = sd->span_weight;
@@ -5089,7 +5106,6 @@ select_task_rq_fair(struct task_struct *
}
/* while loop will break here if sd == NULL */
}
-unlock:
rcu_read_unlock();

return new_cpu;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -96,3 +96,9 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
*/
SCHED_FEAT(NUMA_RESIST_LOWER, false)
#endif
+
+/*
+ * Perform expensive full wake balance for 1:N wakers when the
+ * selected cpu is not completely idle.
+ */
+SCHED_FEAT(WAKE_WIDE_IDLE, false)



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/