From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Waiting for spinlocks and rwlocks on non RT enabled kernels is task::state
preserving. Any wakeup which matches the state is valid.
RT enabled kernels substitutes them with 'sleeping' spinlocks. This creates
an issue vs. task::state.
In order to block on the lock the task has to overwrite task::state and a
consecutive wakeup issued by the unlocker sets the state back to
TASK_RUNNING. As a consequence the task loses the state which was set
before the lock acquire and also any regular wakeup targeted at the task
while it is blocked on the lock.
To handle this gracefully add a 'saved_state' member to task_struct which
is used in the following way:
1) When a task blocks on a 'sleeping' spinlock, the current state is saved
in task::saved_state before it is set to TASK_RTLOCK_WAIT.
2) When the task unblocks and after acquiring the lock, it restores the saved
state.
3) When a regular wakeup happens for a task while it is blocked then the
state change of that wakeup is redirected to operate on task::saved_state.
This is also required when the task state is running because the task
might have been woken up from the lock wait and has not yet restored
the saved state.
To make it complete provide the necessary helpers to save and restore the
saved state along with the necessary documentation how the RT lock blocking
is supposed to work.
For non-RT kernels there is no functional change.
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
include/linux/sched.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/core.c | 33 +++++++++++++++++++++++
2 files changed, 103 insertions(+)
---
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -155,6 +155,27 @@ struct task_group;
WRITE_ONCE(current->__state, (state_value)); \
raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \
} while (0)
+
+
+#define current_save_and_set_rtlock_wait_state() \
+ do { \
+ raw_spin_lock(¤t->pi_lock); \
+ current->saved_state = current->__state; \
+ current->saved_state_change = current->task_state_change;\
+ current->task_state_change = _THIS_IP_; \
+ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \
+ raw_spin_unlock(¤t->pi_lock); \
+ } while (0);
+
+#define current_restore_rtlock_saved_state() \
+ do { \
+ raw_spin_lock(¤t->pi_lock); \
+ current->task_state_change = current->saved_state_change;\
+ WRITE_ONCE(current->__state, current->saved_state); \
+ current->saved_state = TASK_RUNNING; \
+ raw_spin_unlock(¤t->pi_lock); \
+ } while (0);
+
#else
/*
* set_current_state() includes a barrier so that the write of current->state
@@ -213,6 +234,47 @@ struct task_group;
raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \
} while (0)
+/*
+ * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
+ *
+ * RT's spin/rwlock substitutions are state preserving. The state of the
+ * task when blocking on the lock is saved in task_struct::saved_state and
+ * restored after the lock has been acquired. These operations are
+ * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
+ * lock related wakeups while the task is blocked on the lock are
+ * redirected to operate on task_struct::saved_state to ensure that these
+ * are not dropped. On restore task_struct::saved_state is set to
+ * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
+ *
+ * The lock operation looks like this:
+ *
+ * current_save_and_set_rtlock_wait_state();
+ * for (;;) {
+ * if (try_lock())
+ * break;
+ * raw_spin_unlock_irq(&lock->wait_lock);
+ * schedule_rtlock();
+ * raw_spin_lock_irq(&lock->wait_lock);
+ * set_current_state(TASK_RTLOCK_WAIT);
+ * }
+ * current_restore_rtlock_saved_state();
+ */
+#define current_save_and_set_rtlock_wait_state() \
+ do { \
+ raw_spin_lock(¤t->pi_lock); \
+ current->saved_state = current->state; \
+ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \
+ raw_spin_unlock(¤t->pi_lock); \
+ } while (0);
+
+#define current_restore_rtlock_saved_state() \
+ do { \
+ raw_spin_lock(¤t->pi_lock); \
+ WRITE_ONCE(current->__state, current->saved_state); \
+ current->saved_state = TASK_RUNNING; \
+ raw_spin_unlock(¤t->pi_lock); \
+ } while (0);
+
#endif