[patch 2/2] sched: fix select_idle_sibling() logic in select_task_rq_fair()

From: Suresh Siddha
Date: Fri Mar 05 2010 - 13:47:33 EST


Performance improvements with this patch:
"lat_ctx -s 0 2" ~22usec (before-this-patch) ~5usec (after-this-patch)

There are number of things wrong with the select_idle_sibling() logic

a) Once we select the idle sibling, we use that domain (spanning the cpu that
the task is currently woken-up on and the idle sibling that we found) in our
wake_affine() comparisons. This domain is completely different from the
domain (which we are supposed to use) that spans the cpu that the task is
currently woken-up on and the cpu where the task previously ran.

b) We do select_idle_sibling() check only for the cpu that the task is
currently woken-up on. If the wake_affine makes the decision of selecting
the cpu where the task previously ran, doing a select_idle_sibling() check
for that cpu also helps and we don't do this currently.

c) Also, select_idle_sibling() should also treat the current cpu as an idle
cpu if it is a sync wakeup and we have only one task running.

Fixing all this improves the lat_ctx performance. Also, there might be
other workloads where select_idle_sibling() check on previously ran cpu
will also help.

Signed-off-by: Suresh Siddha <suresh.b.siddha@xxxxxxxxx>
---
kernel/sched_fair.c | 73 +++++++++++++++++++++++++++++-----------------------
1 file changed, 42 insertions(+), 31 deletions(-)

Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1411,28 +1411,49 @@ find_idlest_cpu(struct sched_group *grou
* Try and locate an idle CPU in the sched_domain.
*/
static int
-select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+select_idle_sibling(struct task_struct *p, int target, int sync)
{
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
int i;
+ struct sched_domain *sd;
+
+ /*
+ * If the task is going to be woken-up on this cpu and if it is
+ * already idle or going to be idle, then it is the right target.
+ */
+ if (target == cpu && (!cpu_rq(cpu)->cfs.nr_running ||
+ (sync && cpu_rq(cpu)->cfs.nr_running == 1)))
+ return cpu;

/*
- * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
- * test in select_task_rq_fair) and the prev_cpu is idle then that's
- * always a better target than the current cpu.
+ * If the task is going to be woken-up on the cpu where it previously
+ * ran and if it is currently idle, then it is the right target.
*/
- if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+ if (target == prev_cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
return prev_cpu;

/*
- * Otherwise, iterate the domain and find an elegible idle cpu.
+ * Otherwise, iterate the domains and find an eligible idle cpu.
*/
- for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
- if (!cpu_rq(i)->cfs.nr_running) {
- target = i;
+ for_each_domain(target, sd) {
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break;
+
+ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+ if (!cpu_rq(i)->cfs.nr_running) {
+ target = i;
+ break;
+ }
}
+
+ /*
+ * Lets stop looking for an idle sibling when we reached
+ * the domain that spans the current cpu and prev_cpu.
+ */
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+ break;
}

return target;
@@ -1496,32 +1517,17 @@ static int select_task_rq_fair(struct ta

/*
* While iterating the domains looking for a spanning
- * WAKE_AFFINE domain, adjust the affine target to any idle cpu
- * in cache sharing domains along the way.
+ * WAKE_AFFINE domain.
*/
if (want_affine) {
- int target = -1;
-
/*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
- if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
- target = cpu;
-
- /*
- * If there's an idle sibling in this domain, make that
- * the wake_affine target instead of the current cpu.
- */
- if (tmp->flags & SD_SHARE_PKG_RESOURCES)
- target = select_idle_sibling(p, tmp, target);
-
- if (target >= 0) {
- if (tmp->flags & SD_WAKE_AFFINE) {
- affine_sd = tmp;
- want_affine = 0;
- }
- cpu = target;
+ if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))
+ && (tmp->flags & SD_WAKE_AFFINE)) {
+ affine_sd = tmp;
+ want_affine = 0;
}
}

@@ -1549,8 +1555,13 @@ static int select_task_rq_fair(struct ta
update_shares(tmp);
}

- if (affine_sd && wake_affine(affine_sd, p, sync))
- return cpu;
+ if (affine_sd) {
+ int target;
+
+ target = wake_affine(affine_sd, p, sync) ? cpu : prev_cpu;
+
+ return select_idle_sibling(p, target, sync);
+ }

while (sd) {
int load_idx = sd->forkexec_idx;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/