[PATCH -mm] cpuhotplug: introduce try_get_online_cpus() take 3

From: Lai Jiangshan
Date: Tue Jun 09 2009 - 08:06:04 EST


It's for -mm tree.

It also works for mainline if you apply this at first:
http://lkml.org/lkml/2009/2/17/58

Subject: [PATCH -mm] cpuhotplug: introduce try_get_online_cpus() take 3

get_online_cpus() is a typically coarsely granular lock.
It's a source of ABBA deadlock.

Thanks to the CPU notifiers, Some subsystem's global lock will
be required after cpu_hotplug.lock. Subsystem's global lock
is coarsely granular lock too, thus a lot's of lock in kernel
should be required after cpu_hotplug.lock(if we need
cpu_hotplug.lock held too)

Otherwise it may come to a ABBA deadlock like this:

thread 1 | thread 2
_cpu_down() | Lock a-kernel-lock.
cpu_hotplug_begin() |
down_write(&cpu_hotplug.lock) |
__raw_notifier_call_chain(CPU_DOWN_PREPARE) | get_online_cpus()
------------------------------------------------------------------------
Lock a-kernel-lock.(wait thread2) | down_read(&cpu_hotplug.lock)
(wait thread 1)

But CPU online/offline are happened very rarely, get_online_cpus()
returns success quickly in all probability.
So it's an asinine behavior that get_online_cpus() is not allowed
to be required after we had held "a-kernel-lock".

To dispel the ABBA deadlock, this patch introduces
try_get_online_cpus(). It returns fail very rarely. It gives the
caller a chance to select an alternative way to finish works,
instead of sleeping or deadlock.

Changed from V1
Lockless for get_online_cpus()'s fast path

Changed from V2
Fix patch as Oleg Nesterov valuable suggestions.

Suggested-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Lai Jiangshan <laijs@xxxxxxxxxxxxxx>
---
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4d668e0..eeb9ca5 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -99,6 +99,7 @@ extern struct sysdev_class cpu_sysdev_class;

extern void get_online_cpus(void);
extern void put_online_cpus(void);
+extern int try_get_online_cpus(void);
#define hotcpu_notifier(fn, pri) { \
static struct notifier_block fn##_nb __cpuinitdata = \
{ .notifier_call = fn, .priority = pri }; \
@@ -112,6 +113,7 @@ int cpu_down(unsigned int cpu);

#define get_online_cpus() do { } while (0)
#define put_online_cpus() do { } while (0)
+static inline int try_get_online_cpus(void) { return 1; }
#define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
/* These aren't inline functions due to a GCC bug. */
#define register_hotcpu_notifier(nb) ({ (void)(nb); 0; })
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 609fae1..afecc95 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,8 @@
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
+#include <asm/atomic.h>
+#include <linux/wait.h>

#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -26,16 +28,23 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
*/
static int cpu_hotplug_disabled;

+/*
+ * @cpu_hotplug is a special read-write semaphore with these semantics:
+ * 1) It is read-preference and allows reader-in-reader recursion.
+ * 2) It allows writer to downgrade to a reader when required.
+ * (allows reader-in-writer recursion.)
+ * 3) It allows only one thread to require the write-side lock at most.
+ * (cpu_add_remove_lock ensures it.)
+ */
static struct {
struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
- /*
- * Also blocks the new readers during
- * an ongoing cpu hotplug operation.
- */
- int refcount;
+ wait_queue_head_t sleeping_readers;
+ /* refcount = 0 means the writer owns the lock. */
+ atomic_t refcount;
} cpu_hotplug = {
- .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
+ NULL,
+ __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.sleeping_readers),
+ ATOMIC_INIT(1),
};

#ifdef CONFIG_HOTPLUG_CPU
@@ -45,10 +54,9 @@ void get_online_cpus(void)
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
- mutex_lock(&cpu_hotplug.lock);
- cpu_hotplug.refcount++;
- mutex_unlock(&cpu_hotplug.lock);

+ wait_event(cpu_hotplug.sleeping_readers,
+ likely(atomic_inc_not_zero(&cpu_hotplug.refcount)));
}
EXPORT_SYMBOL_GPL(get_online_cpus);

@@ -56,14 +64,27 @@ void put_online_cpus(void)
{
if (cpu_hotplug.active_writer == current)
return;
- mutex_lock(&cpu_hotplug.lock);
- if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
- wake_up_process(cpu_hotplug.active_writer);
- mutex_unlock(&cpu_hotplug.lock);

+ if (unlikely(atomic_dec_and_test(&cpu_hotplug.refcount))) {
+ /* atomic_dec_and_test() implies smp_mb__after_atomic_dec() */
+ BUG_ON(!cpu_hotplug.active_writer);
+ wake_up_process(cpu_hotplug.active_writer);
+ }
}
EXPORT_SYMBOL_GPL(put_online_cpus);

+int try_get_online_cpus(void)
+{
+ if (cpu_hotplug.active_writer == current)
+ return 1;
+
+ if (likely(atomic_inc_not_zero(&cpu_hotplug.refcount)))
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(try_get_online_cpus);
+
#endif /* CONFIG_HOTPLUG_CPU */

/*
@@ -81,46 +102,42 @@ void cpu_maps_update_done(void)
}

/*
- * This ensures that the hotplug operation can begin only when the
- * refcount goes to zero.
+ * This ensures that the hotplug operation can begin only when
+ * there is no ongoing reader.
*
* Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
+ * will be blocked and queued at cpu_hotplug.sleeping_readers.
*
* Since cpu_hotplug_begin() is always called after invoking
* cpu_maps_update_begin(), we can be sure that only one writer is active.
*
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- * writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- * non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
*/
static void cpu_hotplug_begin(void)
{
cpu_hotplug.active_writer = current;

+ /* atomic_dec_and_test() implies smp_mb__before_atomic_dec() */
+ if (atomic_dec_and_test(&cpu_hotplug.refcount))
+ return;
+
for (;;) {
- mutex_lock(&cpu_hotplug.lock);
- if (likely(!cpu_hotplug.refcount))
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&cpu_hotplug.refcount))
break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&cpu_hotplug.lock);
schedule();
}
+
+ __set_current_state(TASK_RUNNING);
}

static void cpu_hotplug_done(void)
{
+ atomic_inc(&cpu_hotplug.refcount);
+ wake_up_all(&cpu_hotplug.sleeping_readers);
+
cpu_hotplug.active_writer = NULL;
- mutex_unlock(&cpu_hotplug.lock);
}
+
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
{

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/