[PATCH] cpuset: Rework sched domains and CPU hotplug handling

From: Max Krasnyansky
Date: Wed Jul 23 2008 - 18:33:21 EST


This is an updated version of my previous cpuset patch:
"Make rebuild_sched_domains() usable from any context (take 2)"

rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().

In order to support that properly we need to rework cpuset locking
rules to avoid circular depencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.

The rest of the cpuset code now offloads sched domains rebuilds to
the single threaded workqueue. As suggested by both Paul J. and Paul M.

This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
__rebuild_sched_domains() has been renamed to async_rebuild_sched_domains().

I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.

The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bring cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.

Signed-off-by: Max Krasnyansky <maxk@xxxxxxxxxxxx>
Cc: mingo@xxxxxxx
Cc: pj@xxxxxxx
Cc: menage@xxxxxxxxxx
Cc: a.p.zijlstra@xxxxxxxxx
Cc: vegard.nossum@xxxxxxxxx
---
kernel/cpuset.c | 328 ++++++++++++++++++++++++++++++++-----------------------
1 files changed, 191 insertions(+), 137 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3c3ef02..8121770 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* 2006 Rework by Paul Menage to use generic cgroups
+ * 2008 Rework of the scheduler domains and CPU hotplug handling
+ * by Max Krasnyansky
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
@@ -59,6 +61,11 @@
#include <linux/cgroup.h>

/*
+ * Workqueue for cpuset related tasks.
+ */
+static struct workqueue_struct *cpuset_wq;
+
+/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
* short circuit some hooks.
@@ -241,9 +248,11 @@ static struct cpuset top_cpuset = {

static DEFINE_MUTEX(callback_mutex);

-/* This is ugly, but preserves the userspace API for existing cpuset
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead */
+ * silently switch it to mount "cgroup" instead
+ */
static int cpuset_get_sb(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data, struct vfsmount *mnt)
@@ -478,10 +487,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
}

/*
- * Helper routine for rebuild_sched_domains().
+ * Helper routine for generate_sched_domains().
* Do cpusets a, b have overlapping cpus_allowed masks?
*/
-
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -498,21 +506,41 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
}

/*
- * rebuild_sched_domains()
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
+ * Helper routine for generate_sched_domains().
+ * Generates single, full, sched domain that includes all online
+ * CPUs in the system.
+ */
+static int generate_single_sched_domain(cpumask_t **domains,
+ struct sched_domain_attr **attributes)
+{
+ struct sched_domain_attr *dattr;
+ cpumask_t *doms;
+
+ doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+ if (!doms)
+ return -ENOMEM;
+ dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
+ if (dattr) {
+ *dattr = SD_ATTR_INIT;
+ update_domain_attr(dattr, &top_cpuset);
+ }
+ *doms = top_cpuset.cpus_allowed;
+
+ *domains = doms;
+ *attributes = dattr;
+ return 1;
+}
+
+/*
+ * generate_sched_domains()
*
- * This routine builds a partial partition of the systems CPUs
- * (the set of non-overlappping cpumask_t's in the array 'part'
- * below), and passes that partial partition to the kernel/sched.c
- * partition_sched_domains() routine, which will rebuild the
- * schedulers load balancing domains (sched domains) as specified
- * by that partial partition. A 'partial partition' is a set of
- * non-overlapping subsets whose union is a subset of that set.
+ * This function builds a partial partition of the systems CPUs
+ * A 'partial partition' is a set of non-overlapping subsets whose
+ * union is a subset of that set.
+ * The output of this function needs to be passed to kernel/sched.c
+ * partition_sched_domains() routine, which will rebuild the scheduler's
+ * load balancing domains (sched domains) as specified by that partial
+ * partition.
*
* See "What is sched_load_balance" in Documentation/cpusets.txt
* for a background explanation of this.
@@ -522,13 +550,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Call with cgroup_mutex held. May take callback_mutex during
- * call due to the kfifo_alloc() and kmalloc() calls. May nest
- * a call to the get_online_cpus()/put_online_cpus() pair.
- * Must not be called holding callback_mutex, because we must not
- * call get_online_cpus() while holding callback_mutex. Elsewhere
- * the kernel nests callback_mutex inside get_online_cpus() calls.
- * So the reverse nesting would risk an ABBA deadlock.
+ * Must be called with cgroup_lock held.
*
* The three key local variables below are:
* q - a kfifo queue of cpuset pointers, used to implement a
@@ -563,8 +585,8 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
* element of the partition (one sched domain) to be passed to
* partition_sched_domains().
*/
-
-void rebuild_sched_domains(void)
+static int generate_sched_domains(cpumask_t **domains,
+ struct sched_domain_attr **attributes)
{
struct kfifo *q; /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
@@ -576,32 +598,23 @@ void rebuild_sched_domains(void)
int ndoms; /* number of sched domains in result */
int nslot; /* next empty doms[] cpumask_t slot */

- q = NULL;
- csa = NULL;
- doms = NULL;
+ doms = NULL;
dattr = NULL;

/* Special case for the 99% of systems with one, full, sched domain */
- if (is_sched_load_balance(&top_cpuset)) {
- ndoms = 1;
- doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
- if (!doms)
- goto rebuild;
- dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
- if (dattr) {
- *dattr = SD_ATTR_INIT;
- update_domain_attr(dattr, &top_cpuset);
- }
- *doms = top_cpuset.cpus_allowed;
- goto rebuild;
- }
+ if (is_sched_load_balance(&top_cpuset))
+ return generate_single_sched_domain(domains, attributes);

q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
- if (IS_ERR(q))
- goto done;
+ if (!q || IS_ERR(q))
+ return 0;
+
csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
- if (!csa)
- goto done;
+ if (!csa) {
+ kfifo_free(q);
+ return 0;
+ }
+
csn = 0;

cp = &top_cpuset;
@@ -644,61 +657,119 @@ restart:
}
}

- /* Convert <csn, csa> to <ndoms, doms> */
+ /*
+ * Now we know how many domains to create.
+ * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+ */
doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
- if (!doms)
- goto rebuild;
+ if (!doms) {
+ ndoms = 0;
+ goto done;
+ }
dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

for (nslot = 0, i = 0; i < csn; i++) {
struct cpuset *a = csa[i];
int apn = a->pn;
+ cpumask_t *dp = doms + nslot;

- if (apn >= 0) {
- cpumask_t *dp = doms + nslot;
-
- if (nslot == ndoms) {
- static int warnings = 10;
- if (warnings) {
- printk(KERN_WARNING
- "rebuild_sched_domains confused:"
- " nslot %d, ndoms %d, csn %d, i %d,"
- " apn %d\n",
- nslot, ndoms, csn, i, apn);
- warnings--;
- }
- continue;
+ /* Skip partitions we've already looked at */
+ if (apn < 0)
+ continue;
+
+ if (nslot == ndoms) {
+ static int warnings = 10;
+ if (warnings) {
+ printk(KERN_WARNING
+ "rebuild_sched_domains confused:"
+ " nslot %d, ndoms %d, csn %d, i %d,"
+ " apn %d\n",
+ nslot, ndoms, csn, i, apn);
+ warnings--;
}
+ continue;
+ }

- cpus_clear(*dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
- for (j = i; j < csn; j++) {
- struct cpuset *b = csa[j];
+ cpus_clear(*dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ for (j = i; j < csn; j++) {
+ struct cpuset *b = csa[j];

- if (apn == b->pn) {
- cpus_or(*dp, *dp, b->cpus_allowed);
- b->pn = -1;
- update_domain_attr(dattr, b);
- }
+ if (apn == b->pn) {
+ cpus_or(*dp, *dp, b->cpus_allowed);
+ update_domain_attr(dattr, b);
+
+ /* Done with this partition */
+ b->pn = -1;
}
- nslot++;
}
+ nslot++;
}
BUG_ON(nslot != ndoms);

-rebuild:
- /* Have scheduler rebuild sched domains */
+done:
+ kfifo_free(q);
+ kfree(csa);
+
+ *domains = doms;
+ *attributes = dattr;
+ return ndoms;
+}
+
+/*
+ * Rebuilds scheduler domains. See generate_sched_domains() description
+ * for details.
+ *
+ * Must be called under get_online_cpus(). This is an external interface
+ * and must not be used inside the cpuset code to avoid circular locking
+ * dependency. cpuset code should use async_rebuild_sched_domains() instead.
+ */
+void rebuild_sched_domains(void)
+{
+ struct sched_domain_attr *attr;
+ cpumask_t *doms;
+ int ndoms;
+
+ /*
+ * We have to iterate cgroup hierarch which requires cgroup lock.
+ */
+ cgroup_lock();
+ ndoms = generate_sched_domains(&doms, &attr);
+ cgroup_unlock();
+
+ /* Have scheduler rebuild the domains */
+ partition_sched_domains(ndoms, doms, attr);
+}
+
+/*
+ * Simply calls rebuild_sched_domains() with correct locking rules.
+ */
+static void rebuild_domains_callback(struct work_struct *work)
+{
get_online_cpus();
- partition_sched_domains(ndoms, doms, dattr);
+ rebuild_sched_domains();
put_online_cpus();
+}
+static DECLARE_WORK(rebuild_domains_work, rebuild_domains_callback);

-done:
- if (q && !IS_ERR(q))
- kfifo_free(q);
- kfree(csa);
- /* Don't kfree(doms) -- partition_sched_domains() does that. */
- /* Don't kfree(dattr) -- partition_sched_domains() does that. */
+/*
+ * Internal helper for rebuilding sched domains when something changes.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * rebuild_sched_domains() must be called under get_online_cpus() and
+ * it needs to take cgroup_lock(). But most of the cpuset code is already
+ * holding cgroup_lock(). In order to avoid incorrect lock nesting we
+ * delegate the actual domain rebuilding to the workqueue.
+ */
+static void async_rebuild_sched_domains(void)
+{
+ queue_work(cpuset_wq, &rebuild_domains_work);
}

static inline int started_after_time(struct task_struct *t1,
@@ -831,7 +902,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
heap_free(&heap);

if (is_load_balanced)
- rebuild_sched_domains();
+ async_rebuild_sched_domains();
return 0;
}

@@ -1042,7 +1113,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)

if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
- rebuild_sched_domains();
+ async_rebuild_sched_domains();
}

return 0;
@@ -1083,7 +1154,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex);

if (cpus_nonempty && balance_flag_changed)
- rebuild_sched_domains();
+ async_rebuild_sched_domains();

return 0;
}
@@ -1479,6 +1550,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
default:
BUG();
}
+
+ /* Unreachable but makes gcc happy */
+ return 0;
}

static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1491,6 +1565,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
default:
BUG();
}
+
+ /* Unrechable but makes gcc happy */
+ return 0;
}


@@ -1677,15 +1754,9 @@ static struct cgroup_subsys_state *cpuset_create(
}

/*
- * Locking note on the strange update_flag() call below:
- *
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains(). The get_online_cpus()
- * call in rebuild_sched_domains() must not be made while holding
- * callback_mutex. Elsewhere the kernel nests callback_mutex inside
- * get_online_cpus() calls. So the reverse nesting would risk an
- * ABBA deadlock.
+ * will call rebuild_sched_domains().
*/

static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1796,7 +1867,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
}

/*
- * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
@@ -1884,35 +1955,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
}

/*
- * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug event.
- *
- * Since there are two callers of this routine, one for CPU hotplug
- * events and one for memory node hotplug events, we could have coded
- * two separate routines here. We code it as a single common routine
- * in order to minimize text size.
- */
-
-static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
-{
- cgroup_lock();
-
- top_cpuset.cpus_allowed = cpu_online_map;
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- scan_for_empty_cpusets(&top_cpuset);
-
- /*
- * Scheduler destroys domains on hotplug events.
- * Rebuild them based on the current settings.
- */
- if (rebuild_sd)
- rebuild_sched_domains();
-
- cgroup_unlock();
-}
-
-/*
* The top_cpuset tracks what CPUs and Memory Nodes are online,
* period. This is necessary in order to make cpusets transparent
* (of no affect) on systems that are actively using CPU hotplug
@@ -1921,39 +1963,48 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
* This routine ensures that top_cpuset.cpus_allowed tracks
* cpu_online_map on each CPU hotplug (cpuhp) event.
*/
-
-static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
+static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
unsigned long phase, void *unused_cpu)
{
+ struct sched_domain_attr *attr;
+ cpumask_t *doms;
+ int ndoms;
+
switch (phase) {
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- common_cpu_mem_hotplug_unplug(1);
break;
+
default:
return NOTIFY_DONE;
}

+ cgroup_lock();
+ top_cpuset.cpus_allowed = cpu_online_map;
+ scan_for_empty_cpusets(&top_cpuset);
+ ndoms = generate_sched_domains(&doms, &attr);
+ cgroup_unlock();
+
+ /* Have scheduler rebuild the domains */
+ partition_sched_domains(ndoms, doms, attr);
+
return NOTIFY_OK;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after you change
- * node_states[N_HIGH_MEMORY].
- * See also the previous routine cpuset_handle_cpuhp().
+ * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
+ * See also the previous routine cpuset_track_online_cpus().
*/
-
void cpuset_track_online_nodes(void)
{
- common_cpu_mem_hotplug_unplug(0);
+ cgroup_lock();
+ top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+ scan_for_empty_cpusets(&top_cpuset);
+ cgroup_unlock();
}
#endif

@@ -1968,7 +2019,10 @@ void __init cpuset_init_smp(void)
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

- hotcpu_notifier(cpuset_handle_cpuhp, 0);
+ hotcpu_notifier(cpuset_track_online_cpus, 0);
+
+ cpuset_wq = create_singlethread_workqueue("cpuset");
+ BUG_ON(!cpuset_wq);
}

/**
--
1.5.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/