[PATCH 1/2] sched: Introduce rcuwait machinery
From: Davidlohr Bueso
Date: Wed Jan 11 2017 - 10:22:58 EST
rcuwait provides support for (single) rcu-safe task wait/wake functionality,
with the caveat that it must not be called after exit_notify(), such that
we avoid racing with rcu delayed_put_task_struct callbacks, task_struct
being rcu unaware in this context -- for which we similarly have
task_rcu_dereference() magic, but with different return semantics, which
can conflict with the wakeup side.
The interfaces are quite straightforward:
rcuwait_wait_event()
rcuwait_wake_up()
More details are in the comments, but it's perhaps worth mentioning at least,
that users must provide proper serialization when waiting on a condition, and
avoid corrupting a concurrent waiter. Also care must be taken between the task
and the condition for when calling the wakeup -- we cannot miss wakeups. When
porting users, this is for example, a given when using waitqueues in that
everything is done under the q->lock. As such, it can remove sources of non
preemptable unbounded work for realtime.
Reviewed-by: Oleg Nesterov <oleg@xxxxxxxxxx>
Signed-off-by: Davidlohr Bueso <dbueso@xxxxxxx>
---
include/linux/rcuwait.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++
kernel/exit.c | 30 +++++++++++++++++++++++
2 files changed, 93 insertions(+)
create mode 100644 include/linux/rcuwait.h
diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
new file mode 100644
index 000000000000..0e93d56c7ab2
--- /dev/null
+++ b/include/linux/rcuwait.h
@@ -0,0 +1,63 @@
+#ifndef _LINUX_RCUWAIT_H_
+#define _LINUX_RCUWAIT_H_
+
+#include <linux/rcupdate.h>
+
+/*
+ * rcuwait provides a way of blocking and waking up a single
+ * task in an rcu-safe manner; where it is forbidden to use
+ * after exit_notify(). task_struct is not properly rcu protected,
+ * unless dealing with rcu-aware lists, ie: find_task_by_*().
+ *
+ * Alternatively we have task_rcu_dereference(), but the return
+ * semantics have different implications which would break the
+ * wakeup side. The only time @task is non-nil is when a user is
+ * blocked (or checking if it needs to) on a condition, and reset
+ * as soon as we know that the condition has succeeded and are
+ * awoken.
+ */
+struct rcuwait {
+ struct task_struct *task;
+};
+
+#define __RCUWAIT_INITIALIZER(name) \
+ { .task = NULL, }
+
+static inline void rcuwait_init(struct rcuwait *w)
+{
+ w->task = NULL;
+}
+
+extern void rcuwait_wake_up(struct rcuwait *w);
+
+/*
+ * The caller is responsible for locking around rcuwait_wait_event(),
+ * such that writes to @task are properly serialized.
+ */
+#define rcuwait_wait_event(w, condition) \
+({ \
+ /* \
+ * Complain if we are called after do_exit()/exit_notify(), \
+ * as we cannot rely on the rcu critical region for the \
+ * wakeup side. \
+ */ \
+ WARN_ON(current->exit_state); \
+ \
+ rcu_assign_pointer((w)->task, current); \
+ for (;;) { \
+ /* \
+ * Implicit barrier (A) pairs with (B) in \
+ * rcuwait_trywake(). \
+ */ \
+ set_current_state(TASK_UNINTERRUPTIBLE); \
+ if (condition) \
+ break; \
+ \
+ schedule(); \
+ } \
+ \
+ WRITE_ONCE((w)->task, NULL); \
+ __set_current_state(TASK_RUNNING); \
+})
+
+#endif /* _LINUX_RCUWAIT_H_ */
diff --git a/kernel/exit.c b/kernel/exit.c
index 8f14b866f9f6..3e0aa08bdf4e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,6 +55,7 @@
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/random.h>
+#include <linux/rcuwait.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -282,6 +283,35 @@ struct task_struct *task_rcu_dereference(struct task_struct **ptask)
return task;
}
+void rcuwait_wake_up(struct rcuwait *w)
+{
+ struct task_struct *task;
+
+ rcu_read_lock();
+
+ /*
+ * Order condition vs @task, such that everything prior to the load
+ * of @task is visible. This is the condition as to why the user called
+ * rcuwait_trywake() in the first place. Pairs with set_current_state()
+ * barrier (A) in rcuwait_wait_event().
+ *
+ * WAIT WAKE
+ * [S] tsk = current [S] cond = true
+ * MB (A) MB (B)
+ * [L] cond [L] tsk
+ */
+ smp_rmb(); /* (B) */
+
+ /*
+ * Avoid using task_rcu_dereference() magic as long as we are careful,
+ * see comment in rcuwait_wait_event() regarding ->exit_state.
+ */
+ task = rcu_dereference(w->task);
+ if (task)
+ wake_up_process(task);
+ rcu_read_unlock();
+}
+
struct task_struct *try_get_task_struct(struct task_struct **ptask)
{
struct task_struct *task;
--
2.6.6