[RFC] new locking primitive (pulled from fs_pin)

From: Al Viro
Date: Mon Apr 13 2015 - 15:05:58 EST


New structure. Intended use: embed it into an object that will
always be freed with an RCU delay.

Initialize with init_kill_once(&object->kill_once).

Use by grabbing rcu_read_lock(), finding the object, then
	if (needs_killing(&object->kill_once)) {
		// do whatever actions you want, including
		// removal of references from the places
		// where they could be found
		finished_killing(&object->kill_once);
		// arrange for RCU-delayed freeing
	}
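
To make that concrete, here is a minimal sketch of the full
lifecycle. The object type, the hash list and the function names
are invented for the example - only init_kill_once(),
needs_killing() and finished_killing() come from this patch:

	struct foo {
		struct kill_once kill_once;
		struct hlist_node hash;	/* some RCU-protected lookup structure */
		struct rcu_head rcu;
	};

	static void foo_init(struct foo *p)
	{
		init_kill_once(&p->kill_once);
		/* ... hlist_add_head_rcu() etc. to make it findable ... */
	}

	/* called under rcu_read_lock(), after the object has been found */
	static void foo_kill(struct foo *p)
	{
		if (needs_killing(&p->kill_once)) {
			/* we won the race; nobody frees p until we allow it */
			hlist_del_init_rcu(&p->hash);
			finished_killing(&p->kill_once);
			kfree_rcu(p, rcu);
		}
		/* rcu_read_lock() has been dropped either way; p may
		 * already be gone and must not be touched */
	}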

If several threads attempt that, only the first one will
see needs_killing() return true *and* all the others will
wait in needs_killing() until the first one is past
finished_killing(). Note that they might end up returning
only after the object gets freed - needs_killing() itself is
very careful about dereferencing it, and its caller MUST NOT
touch the object after getting false from needs_killing().

needs_killing() must be called with rcu_read_lock() held and
drops it in all cases.
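
To spell out the resulting ordering, a rough timeline for two CPUs
that found the same object (reusing the made-up foo from the sketch
above):

	/*
	 * CPU0: rcu_read_lock();
	 * CPU1: rcu_read_lock();
	 * CPU0: needs_killing() -> true, rcu_read_lock() dropped
	 * CPU1: needs_killing() blocks, rcu_read_lock() dropped
	 * CPU0: hlist_del_init_rcu(&p->hash);
	 * CPU0: finished_killing(&p->kill_once);	<- wakes CPU1
	 * CPU0: kfree_rcu(p, rcu);
	 * CPU1: needs_killing() -> false; p is off limits and may
	 *	 already have been freed by the time the caller looks
	 */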

This thing used to be the locking side of fs/fs_pin.c, but
IMO it might make sense to separate it from fs_pin completely -
it could be useful elsewhere...

Comments (and especially suggestions on better names) are welcome...

Signed-off-by: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
---
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index b06c987..b124faf 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -12,10 +12,7 @@ void pin_remove(struct fs_pin *pin)
 	hlist_del(&pin->m_list);
 	hlist_del(&pin->s_list);
 	spin_unlock(&pin_lock);
-	spin_lock_irq(&pin->wait.lock);
-	pin->done = 1;
-	wake_up_locked(&pin->wait);
-	spin_unlock_irq(&pin->wait.lock);
+	finished_killing(&pin->head);
 }
 
 void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
@@ -34,43 +31,12 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m)

 void pin_kill(struct fs_pin *p)
 {
-	wait_queue_t wait;
-
 	if (!p) {
 		rcu_read_unlock();
 		return;
 	}
-	init_wait(&wait);
-	spin_lock_irq(&p->wait.lock);
-	if (likely(!p->done)) {
-		p->done = -1;
-		spin_unlock_irq(&p->wait.lock);
-		rcu_read_unlock();
+	if (needs_killing(&p->head))
 		p->kill(p);
-		return;
-	}
-	if (p->done > 0) {
-		spin_unlock_irq(&p->wait.lock);
-		rcu_read_unlock();
-		return;
-	}
-	__add_wait_queue(&p->wait, &wait);
-	while (1) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		spin_unlock_irq(&p->wait.lock);
-		rcu_read_unlock();
-		schedule();
-		rcu_read_lock();
-		if (likely(list_empty(&wait.task_list)))
-			break;
-		/* OK, we know p couldn't have been freed yet */
-		spin_lock_irq(&p->wait.lock);
-		if (p->done > 0) {
-			spin_unlock_irq(&p->wait.lock);
-			break;
-		}
-	}
-	rcu_read_unlock();
 }
 
 void mnt_pin_kill(struct mount *m)
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
index 9dc4e03..f65daad 100644
--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -1,8 +1,7 @@
-#include <linux/wait.h>
+#include <linux/kill_once.h>
 
 struct fs_pin {
-	wait_queue_head_t wait;
-	int done;
+	struct kill_once head;
 	struct hlist_node s_list;
 	struct hlist_node m_list;
 	void (*kill)(struct fs_pin *);
@@ -12,7 +11,7 @@ struct vfsmount;
 
 static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
 {
-	init_waitqueue_head(&p->wait);
+	init_kill_once(&p->head);
 	p->kill = kill;
 }

diff --git a/include/linux/kill_once.h b/include/linux/kill_once.h
new file mode 100644
index 0000000..03a3717
--- /dev/null
+++ b/include/linux/kill_once.h
@@ -0,0 +1,28 @@
+#include <linux/wait.h>
+
+/*
+ * Intended use:
+ *	rcu_read_lock();
+ *	p = <....>
+ *	if (needs_killing(p)) {
+ *		kill it
+ *		finished_killing(p);
+ *		arrange for rcu-delayed freeing
+ *	} else {
+ *		we are guaranteed that it is an ex-parrot
+ *	}
+ */
+
+struct kill_once {
+	wait_queue_head_t wait;
+	int done;
+};
+
+static inline void init_kill_once(struct kill_once *p)
+{
+	init_waitqueue_head(&p->wait);
+	p->done = 0;
+}
+
+bool needs_killing(struct kill_once *);
+void finished_killing(struct kill_once *);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index de7a416..c404207 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@

-obj-y += mutex.o semaphore.o rwsem.o
+obj-y += mutex.o semaphore.o rwsem.o kill_once.o
 
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/kill_once.c b/kernel/locking/kill_once.c
new file mode 100644
index 0000000..f59ad4b
--- /dev/null
+++ b/kernel/locking/kill_once.c
@@ -0,0 +1,59 @@
+#include <linux/sched.h>
+#include <linux/kill_once.h>
+
+void finished_killing(struct kill_once *p)
+{
+	spin_lock_irq(&p->wait.lock);
+	p->done = 1;
+	wake_up_locked(&p->wait);
+	spin_unlock_irq(&p->wait.lock);
+}
+
+bool needs_killing(struct kill_once *p)
+{
+	wait_queue_t wait;
+
+	init_wait(&wait);
+	spin_lock_irq(&p->wait.lock);
+	if (likely(!p->done)) {
+		p->done = -1;
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		return true;
+	}
+	if (p->done > 0) {
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		return false;
+	}
+	__add_wait_queue(&p->wait, &wait);
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		schedule();
+		rcu_read_lock();
+		/*
+		 * racy, but we are OK with false negatives -
+		 * if we observe anything other than an empty
+		 * wait.task_list after taking rcu_read_lock(),
+		 * we know that the RCU grace period started after
+		 * finished_killing() couldn't have ended yet and
+		 * dereferencing p is still safe.
+		 */
+		if (likely(list_empty(&wait.task_list)))
+			break;
+		/*
+		 * OK, we know p couldn't have been freed yet and
+		 * can take that spinlock safely
+		 */
+		spin_lock_irq(&p->wait.lock);
+		/* now we can check p->done */
+		if (p->done > 0) {
+			spin_unlock_irq(&p->wait.lock);
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return false;
+}