Re: [RFC PATCH 0/2] sched: simplify the select_task_rq_fair()

From: Mike Galbraith
Date: Wed Jan 23 2013 - 04:32:40 EST


On Wed, 2013-01-23 at 10:18 +0100, Mike Galbraith wrote:
> On Wed, 2013-01-23 at 17:00 +0800, Michael Wang wrote:
> > On 01/23/2013 04:49 PM, Mike Galbraith wrote:
> > > On Wed, 2013-01-23 at 16:30 +0800, Michael Wang wrote:
> > >> On 01/23/2013 04:20 PM, Mike Galbraith wrote:
> > >>> On Wed, 2013-01-23 at 15:10 +0800, Michael Wang wrote:
> > >>>> On 01/23/2013 02:28 PM, Mike Galbraith wrote:
> > >>>
> > >>>>> Abbreviated test run:
> > >>>>> Tasks   jobs/min  jti  jobs/min/task    real       cpu
> > >>>>>   640  158044.01   81       246.9438   24.54    577.66  Wed Jan 23 07:14:33 2013
> > >>>>>  1280   50434.33   39        39.4018  153.80   5737.57  Wed Jan 23 07:17:07 2013
> > >>>>>  2560   47214.07   34        18.4430  328.58  12715.56  Wed Jan 23 07:22:36 2013
> > >>>>
> > >>>> So it still doesn't work... and not going into the balance path
> > >>>> on wakeup would fix it; that looks like the only choice if no
> > >>>> error in the balance path can be found... the benchmark wins
> > >>>> again, I'm feeling bad...
> > >>>>
> > >>>> I will summarize the info we collected and make a v3 later.
> > >>>
> > >>> FWIW, I hacked virgin to do a full balance if an idle CPU was not
> > >>> found, leaving the preference to wake cache-affine intact though,
> > >>> turned on WAKE_BALANCE in all domains, and it did not collapse. In
> > >>> fact, the high-load end, where the idle search will frequently be
> > >>> a waste of cycles, actually improved a bit. Things that make ya go
> > >>> hmmm.
> > >>
> > >> Oh, does that mean the old balance path is good while the new one
> > >> is really broken? I mean, comparing this with the previous results,
> > >> could we say that all the collapse was caused just by the change of
> > >> the balance path?
> > >
> > > That's a good supposition. I'll see if it holds.
> >
> > I just noticed that, according to your debug info, there is no sd
> > that supports the WAKE flag at all, is there?
>
> There is, I turned it on in all domains.

For your patches, I had to turn it on at birth (i.e. when the domains
are built), but doing that, plus restoring the full balance path to its
original form, killed the collapse. A rough sketch of the virgin hack
follows the numbers.

Tasks   jobs/min  jti  jobs/min/task    real      cpu
  640  152452.83   97       238.2075   25.44   613.48  Wed Jan 23 10:22:12 2013
 1280  190491.16   97       148.8212   40.72  1223.74  Wed Jan 23 10:22:53 2013
 2560  219397.54   95        85.7022   70.71  2422.46  Wed Jan 23 10:24:04 2013
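
The virgin hack, for reference -- a minimal sketch only, reconstructed
against the 3.8-era select_task_rq_fair() (the affine_sd/unlock names
are from that code; exact placement is approximate, the idle_cpu() test
is the hack): keep the wake-affine preference, but only take the
select_idle_sibling() pick if it is actually idle, otherwise fall
through into the full find_idlest_group() walk, with SD_BALANCE_WAKE
enabled in all domains.

        if (affine_sd) {
                new_cpu = select_idle_sibling(p, prev_cpu);
                if (idle_cpu(new_cpu))
                        goto unlock;    /* found an idle CPU, take it */
                /*
                 * No idle CPU found: rather than settle for a busy
                 * CPU, fall through to the balance path below.
                 */
        }

The adaptation to your patches is below.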

---
 include/linux/topology.h |    6 ++---
 kernel/sched/core.c      |   41 ++++++++++++++++++++++++++++++-------
 kernel/sched/fair.c      |   52 +++++++++++++++++++++++++++++------------------
 3 files changed, 70 insertions(+), 29 deletions(-)

--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -95,7 +95,7 @@ int arch_update_cpu_topology(void);
| 1*SD_BALANCE_NEWIDLE \
| 1*SD_BALANCE_EXEC \
| 1*SD_BALANCE_FORK \
- | 0*SD_BALANCE_WAKE \
+ | 1*SD_BALANCE_WAKE \
| 1*SD_WAKE_AFFINE \
| 1*SD_SHARE_CPUPOWER \
| 1*SD_SHARE_PKG_RESOURCES \
@@ -126,7 +126,7 @@ int arch_update_cpu_topology(void);
| 1*SD_BALANCE_NEWIDLE \
| 1*SD_BALANCE_EXEC \
| 1*SD_BALANCE_FORK \
- | 0*SD_BALANCE_WAKE \
+ | 1*SD_BALANCE_WAKE \
| 1*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 1*SD_SHARE_PKG_RESOURCES \
@@ -156,7 +156,7 @@ int arch_update_cpu_topology(void);
| 1*SD_BALANCE_NEWIDLE \
| 1*SD_BALANCE_EXEC \
| 1*SD_BALANCE_FORK \
- | 0*SD_BALANCE_WAKE \
+ | 1*SD_BALANCE_WAKE \
| 1*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5609,11 +5609,39 @@ static void update_top_cache_domain(int
static int sbm_max_level;
DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_balance_map, sbm_array);

+static void debug_sched_balance_map(int cpu)
+{
+        int i, type, level = 0;
+        struct sched_balance_map *sbm = &per_cpu(sbm_array, cpu);
+
+        printk("WYT: sbm of cpu %d\n", cpu);
+
+        for (type = 0; type < SBM_MAX_TYPE; type++) {
+                if (type == SBM_EXEC_TYPE)
+                        printk("WYT: \t exec map\n");
+                else if (type == SBM_FORK_TYPE)
+                        printk("WYT: \t fork map\n");
+                else if (type == SBM_WAKE_TYPE)
+                        printk("WYT: \t wake map\n");
+
+                for (level = 0; level < sbm_max_level; level++) {
+                        if (sbm->sd[type][level])
+                                printk("WYT: \t\t sd %p, idx %d, level %d, weight %d\n", sbm->sd[type][level], level, sbm->sd[type][level]->level, sbm->sd[type][level]->span_weight);
+                }
+        }
+
+        printk("WYT: \t affine map\n");
+
+        for_each_possible_cpu(i) {
+                if (sbm->affine_map[i])
+                        printk("WYT: \t\t affine with cpu %d in sd %p, weight %d\n", i, sbm->affine_map[i], sbm->affine_map[i]->span_weight);
+        }
+}
+
 static void build_sched_balance_map(int cpu)
 {
         struct sched_balance_map *sbm = &per_cpu(sbm_array, cpu);
         struct sched_domain *sd = cpu_rq(cpu)->sd;
-        struct sched_domain *top_sd = NULL;
         int i, type, level = 0;
 
         memset(sbm->top_level, 0, sizeof((*sbm).top_level));
@@ -5656,11 +5684,9 @@ static void build_sched_balance_map(int
          * fill the hole to get lower level sd easily.
          */
         for (type = 0; type < SBM_MAX_TYPE; type++) {
-                level = sbm->top_level[type];
-                top_sd = sbm->sd[type][level];
-                if ((++level != sbm_max_level) && top_sd) {
-                        for (; level < sbm_max_level; level++)
-                                sbm->sd[type][level] = top_sd;
+                for (level = 1; level < sbm_max_level; level++) {
+                        if (!sbm->sd[type][level])
+                                sbm->sd[type][level] = sbm->sd[type][level - 1];
                 }
         }
 }
@@ -5719,6 +5745,7 @@ cpu_attach_domain(struct sched_domain *s
          * destroy_sched_domains() already do the work.
          */
         build_sched_balance_map(cpu);
+//MIKE  debug_sched_balance_map(cpu);
         rcu_assign_pointer(rq->sbm, sbm);
 }

@@ -6220,7 +6247,7 @@ sd_numa_init(struct sched_domain_topolog
| 1*SD_BALANCE_NEWIDLE
| 0*SD_BALANCE_EXEC
| 0*SD_BALANCE_FORK
- | 0*SD_BALANCE_WAKE
+ | 1*SD_BALANCE_WAKE
| 0*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUPOWER
| 0*SD_SHARE_PKG_RESOURCES
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3312,7 +3312,7 @@ static int select_idle_sibling(struct ta
 static int
 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 {
-        struct sched_domain *sd = NULL;
+        struct sched_domain *sd = NULL, *tmp;
         int cpu = smp_processor_id();
         int prev_cpu = task_cpu(p);
         int new_cpu = cpu;
@@ -3376,31 +3376,45 @@ select_task_rq_fair(struct task_struct *

 balance_path:
         new_cpu = (sd_flag & SD_BALANCE_WAKE) ? prev_cpu : cpu;
-        sd = sbm->sd[type][sbm->top_level[type]];
+        sd = tmp = sbm->sd[type][sbm->top_level[type]];
 
         while (sd) {
                 int load_idx = sd->forkexec_idx;
-                struct sched_group *sg = NULL;
+                struct sched_group *group;
+                int weight;
+
+                if (!(sd->flags & sd_flag)) {
+                        sd = sd->child;
+                        continue;
+                }
 
                 if (sd_flag & SD_BALANCE_WAKE)
                         load_idx = sd->wake_idx;
 
-                sg = find_idlest_group(sd, p, cpu, load_idx);
-                if (!sg)
-                        goto next_sd;
-
-                new_cpu = find_idlest_cpu(sg, p, cpu);
-                if (new_cpu != -1)
-                        cpu = new_cpu;
-next_sd:
-                if (!sd->level)
-                        break;
-
-                sbm = cpu_rq(cpu)->sbm;
-                if (!sbm)
-                        break;
-
-                sd = sbm->sd[type][sd->level - 1];
+                group = find_idlest_group(sd, p, cpu, load_idx);
+                if (!group) {
+                        sd = sd->child;
+                        continue;
+                }
+
+                new_cpu = find_idlest_cpu(group, p, cpu);
+                if (new_cpu == -1 || new_cpu == cpu) {
+                        /* Now try balancing at a lower domain level of cpu */
+                        sd = sd->child;
+                        continue;
+                }
+
+                /* Now try balancing at a lower domain level of new_cpu */
+                cpu = new_cpu;
+                weight = sd->span_weight;
+                sd = NULL;
+                for_each_domain(cpu, tmp) {
+                        if (weight <= tmp->span_weight)
+                                break;
+                        if (tmp->flags & sd_flag)
+                                sd = tmp;
+                }
+                /* while loop will break here if sd == NULL */
         }
 
 unlock:

