Re: [PATCH 02/12] sched: Separate out allocation/free/goto-hell from __build_sched_domains

From: Andreas Herrmann
Date: Tue Aug 18 2009 - 09:36:49 EST


On Tue, Aug 18, 2009 at 02:57:10PM +0200, Peter Zijlstra wrote:
> On Tue, 2009-08-18 at 12:53 +0200, Andreas Herrmann wrote:
> > @@ -8213,6 +8213,23 @@ struct s_data {
> > struct root_domain *rd;
> > };
> >
> > +enum s_alloc {
> > +	sa_sched_groups = 0,
> > +	sa_rootdomain,
> > +	sa_tmpmask,
> > +	sa_send_covered,
> > +	sa_this_core_map,
> > +	sa_this_sibling_map,
> > +	sa_nodemask,
> > +	sa_sched_group_nodes,
> > +#ifdef CONFIG_NUMA
> > +	sa_notcovered,
> > +	sa_covered,
> > +	sa_domainspan,
> > +#endif
> > +	sa_none,
> > +};
> > +
> > /*
> > * SMT sched-domains:
> > */
> > @@ -8500,6 +8517,77 @@ static void set_domain_attribute(struct sched_domain *sd,
> > }
> > }
> >
> > +static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
> > +				 const struct cpumask *cpu_map)
> > +{
> > +	switch (what) {
> > +	case sa_sched_groups:
> > +		free_sched_groups(cpu_map, d->tmpmask);
> > +		d->sched_group_nodes = NULL; /* fall through */
> > +	case sa_rootdomain:
> > +		free_rootdomain(d->rd); /* fall through */
> > +	case sa_tmpmask:
> > +		free_cpumask_var(d->tmpmask); /* fall through */
> > +	case sa_send_covered:
> > +		free_cpumask_var(d->send_covered); /* fall through */
> > +	case sa_this_core_map:
> > +		free_cpumask_var(d->this_core_map); /* fall through */
> > +	case sa_this_sibling_map:
> > +		free_cpumask_var(d->this_sibling_map); /* fall through */
> > +	case sa_nodemask:
> > +		free_cpumask_var(d->nodemask); /* fall through */
> > +	case sa_sched_group_nodes:
> > +#ifdef CONFIG_NUMA
> > +		kfree(d->sched_group_nodes); /* fall through */
> > +	case sa_notcovered:
> > +		free_cpumask_var(d->notcovered); /* fall through */
> > +	case sa_covered:
> > +		free_cpumask_var(d->covered); /* fall through */
> > +	case sa_domainspan:
> > +		free_cpumask_var(d->domainspan); /* fall through */
> > +#endif
> > +	case sa_none:
> > +		break;
> > +	}
> > +}
> > +
> > +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
> > +						   const struct cpumask *cpu_map)
> > +{
> > +#ifdef CONFIG_NUMA
> > +	if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
> > +		return sa_none;
> > +	if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
> > +		return sa_domainspan;
> > +	if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
> > +		return sa_covered;
> > +	/* Allocate the per-node list of sched groups */
> > +	d->sched_group_nodes = kcalloc(nr_node_ids,
> > +				       sizeof(struct sched_group *), GFP_KERNEL);
> > +	if (!d->sched_group_nodes) {
> > +		printk(KERN_WARNING "Can not alloc sched group node list\n");
> > +		return sa_notcovered;
> > +	}
> > +	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
> > +#endif
> > +	if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
> > +		return sa_sched_group_nodes;
> > +	if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
> > +		return sa_nodemask;
> > +	if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
> > +		return sa_this_sibling_map;
> > +	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
> > +		return sa_this_core_map;
> > +	if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
> > +		return sa_send_covered;
> > +	d->rd = alloc_rootdomain();
> > +	if (!d->rd) {
> > +		printk(KERN_WARNING "Cannot alloc root domain\n");
> > +		return sa_tmpmask;
> > +	}
> > +	return sa_rootdomain;
> > +}
>
> Code like this makes me wonder if the decomposition you chose is the
> right one.

It was the quickest decomposition to get a lot of code out of the way
while working on this huge function -- and without introducing (too
many) regressions.
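
The enum/switch pair itself is just the usual staged-cleanup idiom: the
allocation side returns how far it got, and the free side enters the
switch at that state and falls through, so a single call releases
exactly what was allocated. A stand-alone sketch of the pattern (made-up
names, plain user space, nothing scheduler-specific):

	#include <stdlib.h>

	enum s_alloc { sa_all, sa_b, sa_a, sa_none };

	struct s_data { void *a; void *b; void *c; };

	static void free_allocs(struct s_data *d, enum s_alloc what)
	{
		switch (what) {
		case sa_all:
			free(d->c);	/* fall through */
		case sa_b:
			free(d->b);	/* fall through */
		case sa_a:
			free(d->a);	/* fall through */
		case sa_none:
			break;
		}
	}

	static enum s_alloc alloc_all(struct s_data *d)
	{
		if (!(d->a = malloc(16)))
			return sa_none;	/* nothing allocated yet */
		if (!(d->b = malloc(16)))
			return sa_a;	/* undo a */
		if (!(d->c = malloc(16)))
			return sa_b;	/* undo b, then a */
		return sa_all;		/* complete success */
	}

	int main(void)
	{
		struct s_data d = { 0 };
		enum s_alloc state = alloc_all(&d);

		if (state != sa_all) {
			free_allocs(&d, state);	/* unwinds what succeeded */
			return 1;
		}
		/* ... use d ... */
		free_allocs(&d, sa_all);
		return 0;
	}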

> I'd much rather see something that keeps the various domain levels fully
> isolated. That is, the numa code should not need to know anything about
> the multi-core code etc.

The question is how feasible that is.

There are various dependencies in the current code; e.g. the
degeneration step is done at the very end. I'm not sure at the moment
whether all steps (i.e. initial creation, building groups, setting
power) can be performed one domain level at a time. But it could
probably work.
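
(By "the degeneration step" I mean the pass at the end of
cpu_attach_domain() that splices out levels which contribute nothing to
scheduling -- from memory, roughly the following, simplified:)

	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;
		if (sd_parent_degenerate(tmp, parent)) {
			tmp->parent = parent->parent;	/* splice parent out */
			if (parent->parent)
				parent->parent->child = tmp;
		} else
			tmp = tmp->parent;
	}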

> In the above we still have everything mixed into one.

Yep.

> Maybe something along the lines of (skipping lots of fun detail):
>
> struct domain_constructor {
> 	struct sched_domain *(*func)(const struct cpumask *cpu_map,
> 				     struct sched_domain_attr *attr);
> };
>
> struct domain_constructor domain_constructors[] = {
> 	{ &construct_numa_domain },
> 	{ &construct_mc_domain },
> 	{ &construct_cpu_domain },
> 	{ &construct_smt_domain },
> };
>
> static int construct_sched_domains(const struct cpumask *cpu_map,
> 				   struct sched_domain_attr *attr)
> {
> 	int i, ret = 0;
> 	struct sched_domain *top = NULL, *parent = NULL, *sd;
>
> 	for (i = 0; i < ARRAY_SIZE(domain_constructors); i++) {
> 		sd = domain_constructors[i].func(cpu_map, attr);
> 		if (!sd)
> 			continue;
> 		if (IS_ERR(sd)) {
> 			ret = PTR_ERR(sd);
> 			goto fail;
> 		}
> 		if (!top)
> 			top = sd;
>
> 		if (degenerate_domain(parent, sd)) {
> 			fold_domain(parent, sd);
> 			sd->destroy();
> 			continue;
> 		}
>
> 		sd->parent = parent;
> 		parent = sd;
> 	}
>
> 	ret = attach_domain(sd);
> 	if (ret)
> 		goto fail;
>
> out:
> 	return ret;
>
> fail:
> 	for (sd = parent; sd; sd = parent) {
> 		parent = sd->parent;
> 		sd->destroy();
> 	}
>
> 	goto out;
> }

Yes, it would be interesting to see this implemented ;-)
At least there's room for improvement in the domain creation code.
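
The control flow of such a loop can at least be exercised stand-alone.
Below is a user-space mock-up -- made-up constructors, simplified
stand-ins for the kernel's ERR_PTR helpers, and no degeneration
folding; not the scheduler code itself:

	#include <errno.h>
	#include <stdlib.h>

	struct sched_domain {
		struct sched_domain *parent;
	};

	/* simplified stand-ins for the kernel's ERR_PTR machinery */
	#define MAX_ERRNO	4095
	static void *ERR_PTR(long error) { return (void *)error; }
	static long PTR_ERR(const void *ptr) { return (long)ptr; }
	static int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	/* mock constructors: domain, NULL = level not present, ERR_PTR = error */
	static struct sched_domain *construct_numa_domain(void)
	{
		return NULL;			/* pretend: no NUMA on this box */
	}

	static struct sched_domain *construct_mc_domain(void)
	{
		struct sched_domain *sd = calloc(1, sizeof(*sd));
		return sd ? sd : ERR_PTR(-ENOMEM);
	}

	static struct sched_domain *construct_smt_domain(void)
	{
		struct sched_domain *sd = calloc(1, sizeof(*sd));
		return sd ? sd : ERR_PTR(-ENOMEM);
	}

	static struct sched_domain *(*domain_constructors[])(void) = {
		construct_numa_domain,
		construct_mc_domain,
		construct_smt_domain,
	};

	#define ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))

	static int construct_sched_domains(void)
	{
		struct sched_domain *parent = NULL, *sd;
		unsigned int i;
		int ret;

		for (i = 0; i < ARRAY_SIZE(domain_constructors); i++) {
			sd = domain_constructors[i]();
			if (!sd)
				continue;		/* level not present */
			if (IS_ERR(sd)) {
				ret = PTR_ERR(sd);
				goto fail;
			}
			sd->parent = parent;	/* child points at next-larger level */
			parent = sd;
		}
		/* attach_domain(parent) would take over the hierarchy here */
		return 0;
	fail:
		for (sd = parent; sd; sd = parent) {	/* unwind what was built */
			parent = sd->parent;
			free(sd);
		}
		return ret;
	}

	int main(void)
	{
		return construct_sched_domains() ? EXIT_FAILURE : EXIT_SUCCESS;
	}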


Regards,
Andreas

--
Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. München, Germany
Research | Geschäftsführer: Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis München
(OSRC) | Registergericht München, HRB Nr. 43632

