[RFC][PATCH 04/14] sched: Change NODE sched_domain group creation

From: Peter Zijlstra
Date: Mon Mar 14 2011 - 11:29:55 EST


The NODE sched_domain is 'special' in that it allocates sched_groups
per CPU, instead of sharing the sched_groups between all CPUs.

While this might have some benefits on large NUMA and avoid remote
memory accesses when iterating the sched_groups, this does break
current code that assumes sched_groups are shared between all
sched_domains (since the dynamic cpu_power patches).

So refactor the NODE groups to behave like all other groups.

(The ALLNODES domain again shared its groups across the CPUs for some
reason).

If someone does measure a performance decrease due to this change we
need to revisit this and come up with another way to have both dynamic
cpu_power and NUMA work nice together.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
LKML-Reference: <new-submission>
---
kernel/sched.c | 231 ++++++++-------------------------------------------------
1 file changed, 33 insertions(+), 198 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -6837,29 +6837,18 @@ struct static_sched_domain {
struct s_data {
#ifdef CONFIG_NUMA
int sd_allnodes;
- cpumask_var_t domainspan;
- cpumask_var_t covered;
- cpumask_var_t notcovered;
#endif
cpumask_var_t nodemask;
cpumask_var_t send_covered;
cpumask_var_t tmpmask;
- struct sched_group **sched_group_nodes;
struct root_domain *rd;
};

enum s_alloc {
- sa_sched_groups = 0,
sa_rootdomain,
sa_tmpmask,
sa_send_covered,
sa_nodemask,
- sa_sched_group_nodes,
-#ifdef CONFIG_NUMA
- sa_notcovered,
- sa_covered,
- sa_domainspan,
-#endif
sa_none,
};

@@ -6955,18 +6944,10 @@ cpu_to_phys_group(int cpu, const struct
}

#ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
-static struct sched_group ***sched_group_nodes_bycpu;
-
-static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_node);

-static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+static int cpu_to_node_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg,
struct cpumask *nodemask)
{
@@ -6976,142 +6957,27 @@ static int cpu_to_allnodes_group(int cpu
group = cpumask_first(nodemask);

if (sg)
- *sg = &per_cpu(sched_group_allnodes, group).sg;
+ *sg = &per_cpu(sched_group_node, group).sg;
return group;
}

-static void init_numa_sched_groups_power(struct sched_group *group_head)
-{
- struct sched_group *sg = group_head;
- int j;
-
- if (!sg)
- return;
- do {
- for_each_cpu(j, sched_group_cpus(sg)) {
- struct sched_domain *sd;
-
- sd = &per_cpu(phys_domains, j).sd;
- if (j != group_first_cpu(sd->groups)) {
- /*
- * Only add "power" once for each
- * physical package.
- */
- continue;
- }
-
- sg->cpu_power += sd->groups->cpu_power;
- }
- sg = sg->next;
- } while (sg != group_head);
-}
+static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);

-static int build_numa_sched_groups(struct s_data *d,
- const struct cpumask *cpu_map, int num)
+static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+ struct sched_group **sg,
+ struct cpumask *nodemask)
{
- struct sched_domain *sd;
- struct sched_group *sg, *prev;
- int n, j;
+ int group;

- cpumask_clear(d->covered);
- cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
- if (cpumask_empty(d->nodemask)) {
- d->sched_group_nodes[num] = NULL;
- goto out;
- }
-
- sched_domain_node_span(num, d->domainspan);
- cpumask_and(d->domainspan, d->domainspan, cpu_map);
-
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING "Can not alloc domain group for node %d\n",
- num);
- return -ENOMEM;
- }
- d->sched_group_nodes[num] = sg;
-
- for_each_cpu(j, d->nodemask) {
- sd = &per_cpu(node_domains, j).sd;
- sd->groups = sg;
- }
-
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->nodemask);
- sg->next = sg;
- cpumask_or(d->covered, d->covered, d->nodemask);
-
- prev = sg;
- for (j = 0; j < nr_node_ids; j++) {
- n = (num + j) % nr_node_ids;
- cpumask_complement(d->notcovered, d->covered);
- cpumask_and(d->tmpmask, d->notcovered, cpu_map);
- cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
- if (cpumask_empty(d->tmpmask))
- break;
- cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
- if (cpumask_empty(d->tmpmask))
- continue;
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", j);
- return -ENOMEM;
- }
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->tmpmask);
- sg->next = prev->next;
- cpumask_or(d->covered, d->covered, d->tmpmask);
- prev->next = sg;
- prev = sg;
- }
-out:
- return 0;
-}
-#endif /* CONFIG_NUMA */
+ cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
+ group = cpumask_first(nodemask);

-#ifdef CONFIG_NUMA
-/* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
- int cpu, i;
-
- for_each_cpu(cpu, cpu_map) {
- struct sched_group **sched_group_nodes
- = sched_group_nodes_bycpu[cpu];
-
- if (!sched_group_nodes)
- continue;
-
- for (i = 0; i < nr_node_ids; i++) {
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
- cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- if (cpumask_empty(nodemask))
- continue;
-
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
- }
- kfree(sched_group_nodes);
- sched_group_nodes_bycpu[cpu] = NULL;
- }
-}
-#else /* !CONFIG_NUMA */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
+ if (sg)
+ *sg = &per_cpu(sched_group_allnodes, group).sg;
+ return group;
}
+
#endif /* CONFIG_NUMA */

/*
@@ -7212,9 +7078,6 @@ static void __free_domain_allocs(struct
const struct cpumask *cpu_map)
{
switch (what) {
- case sa_sched_groups:
- free_sched_groups(cpu_map, d->tmpmask); /* fall through */
- d->sched_group_nodes = NULL;
case sa_rootdomain:
free_rootdomain(d->rd); /* fall through */
case sa_tmpmask:
@@ -7223,16 +7086,6 @@ static void __free_domain_allocs(struct
free_cpumask_var(d->send_covered); /* fall through */
case sa_nodemask:
free_cpumask_var(d->nodemask); /* fall through */
- case sa_sched_group_nodes:
-#ifdef CONFIG_NUMA
- kfree(d->sched_group_nodes); /* fall through */
- case sa_notcovered:
- free_cpumask_var(d->notcovered); /* fall through */
- case sa_covered:
- free_cpumask_var(d->covered); /* fall through */
- case sa_domainspan:
- free_cpumask_var(d->domainspan); /* fall through */
-#endif
case sa_none:
break;
}
@@ -7241,24 +7094,8 @@ static void __free_domain_allocs(struct
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
const struct cpumask *cpu_map)
{
-#ifdef CONFIG_NUMA
- if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
- return sa_none;
- if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
- return sa_domainspan;
- if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
- return sa_covered;
- /* Allocate the per-node list of sched groups */
- d->sched_group_nodes = kcalloc(nr_node_ids,
- sizeof(struct sched_group *), GFP_KERNEL);
- if (!d->sched_group_nodes) {
- printk(KERN_WARNING "Can not alloc sched group node list\n");
- return sa_notcovered;
- }
- sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
-#endif
if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
- return sa_sched_group_nodes;
+ return sa_none;
if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
return sa_nodemask;
if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
@@ -7298,6 +7135,7 @@ static struct sched_domain *__build_numa
if (parent)
parent->child = sd;
cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
+ cpu_to_node_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
return sd;
}
@@ -7410,6 +7248,13 @@ static void build_sched_groups(struct s_
d->send_covered, d->tmpmask);
break;
#ifdef CONFIG_NUMA
+ case SD_LV_NODE:
+ sd = &per_cpu(node_domains, cpu).sd;
+ if (cpu == cpumask_first(sched_domain_span(sd)))
+ init_sched_build_groups(sched_domain_span(sd), cpu_map,
+ &cpu_to_node_group,
+ d->send_covered, d->tmpmask);
+
case SD_LV_ALLNODES:
init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
d->send_covered, d->tmpmask);
@@ -7438,7 +7283,6 @@ static int __build_sched_domains(const s
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
- alloc_state = sa_sched_groups;

/*
* Set up domains for cpus specified by the cpu_map.
@@ -7462,16 +7306,13 @@ static int __build_sched_domains(const s
build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
build_sched_groups(&d, SD_LV_MC, cpu_map, i);
build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
+ build_sched_groups(&d, SD_LV_NODE, cpu_map, i);
}

#ifdef CONFIG_NUMA
/* Set up node groups */
if (d.sd_allnodes)
build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
-
- for (i = 0; i < nr_node_ids; i++)
- if (build_numa_sched_groups(&d, cpu_map, i))
- goto error;
#endif

/* Calculate CPU power for physical packages and nodes */
@@ -7500,15 +7341,16 @@ static int __build_sched_domains(const s
}

#ifdef CONFIG_NUMA
- for (i = 0; i < nr_node_ids; i++)
- init_numa_sched_groups_power(d.sched_group_nodes[i]);
+ for_each_cpu(i, cpu_map) {
+ sd = &per_cpu(node_domains, i).sd;
+ init_sched_groups_power(i, sd);
+ }

if (d.sd_allnodes) {
- struct sched_group *sg;
-
- cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
- d.tmpmask);
- init_numa_sched_groups_power(sg);
+ for_each_cpu(i, cpu_map) {
+ sd = &per_cpu(allnodes_domains, i).sd;
+ init_sched_groups_power(i, sd);
+ }
}
#endif

@@ -7526,7 +7368,6 @@ static int __build_sched_domains(const s
cpu_attach_domain(sd, d.rd, i);
}

- d.sched_group_nodes = NULL; /* don't free this we still need it */
__free_domain_allocs(&d, sa_tmpmask, cpu_map);
return 0;

@@ -7612,7 +7453,6 @@ static int init_sched_domains(const stru
static void destroy_sched_domains(const struct cpumask *cpu_map,
struct cpumask *tmpmask)
{
- free_sched_groups(cpu_map, tmpmask);
}

/*
@@ -7889,11 +7729,6 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);

-#if defined(CONFIG_NUMA)
- sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
- GFP_KERNEL);
- BUG_ON(sched_group_nodes_bycpu == NULL);
-#endif
get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/