[RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
From: Mathieu Desnoyers
Date: Tue Sep 03 2019 - 16:11:52 EST
The membarrier_state field is located within the mm_struct, which
is not guaranteed to exist when accessed from the membarrier system
call's runqueue-lock-free iteration over the runqueues.
Copy the membarrier_state from the mm_struct into the next task_struct
during the scheduler's prepare task switch. Upon membarrier registration,
iterate over each runqueue and copy the membarrier_state from the
mm_struct into the task_struct of every currently running task which has
the same mm as the current task.
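In short, the expedited iteration stops dereferencing p->mm (which may
disappear under it) and instead reads a per-task copy that the scheduler
refreshes on every task switch. A condensed illustration, abridged from
the hunks below:

  /* On each task switch, snapshot the mm's state into the task: */
  static inline void membarrier_prepare_task_switch(struct task_struct *t)
  {
          if (!t->mm)
                  return;
          atomic_set(&t->membarrier_state,
                     atomic_read(&t->mm->membarrier_state));
  }

  /*
   * The expedited iteration then reads the task-local copy rather than
   * dereferencing p->mm, which may be freed concurrently:
   */
  p = task_rcu_dereference(&cpu_rq(cpu)->curr);
  if (p && (atomic_read(&p->membarrier_state) &
            MEMBARRIER_STATE_GLOBAL_EXPEDITED))
          ...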
Suggested-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Oleg Nesterov <oleg@xxxxxxxxxx>
Cc: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Russell King - ARM Linux admin <linux@xxxxxxxxxxxxxxx>
Cc: Chris Metcalf <cmetcalf@xxxxxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxx>
Cc: Kirill Tkhai <tkhai@xxxxxxxxx>
Cc: Mike Galbraith <efault@xxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
---
include/linux/sched.h | 4 ++
include/linux/sched/mm.h | 13 +++++++
kernel/sched/core.c | 1 +
kernel/sched/membarrier.c | 78 ++++++++++++++++++++++++++++-----------
4 files changed, 74 insertions(+), 22 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9f51932bd543..e24d52a4c37a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1130,6 +1130,10 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_MEMBARRIER
+ atomic_t membarrier_state;
+#endif
+
#ifdef CONFIG_RSEQ
struct rseq __user *rseq;
u32 rseq_sig;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 4a7944078cc3..3577cd7b3dbb 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -371,7 +371,17 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
static inline void membarrier_execve(struct task_struct *t)
{
atomic_set(&t->mm->membarrier_state, 0);
+ atomic_set(&t->membarrier_state, 0);
}
+
+static inline void membarrier_prepare_task_switch(struct task_struct *t)
+{
+ if (!t->mm)
+ return;
+ atomic_set(&t->membarrier_state,
+ atomic_read(&t->mm->membarrier_state));
+}
+
#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -386,6 +396,9 @@ static inline void membarrier_execve(struct task_struct *t)
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
+static inline void membarrier_prepare_task_switch(struct task_struct *t)
+{
+}
#endif
#endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 010d578118d6..8d4f1f20db15 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3038,6 +3038,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
+ membarrier_prepare_task_switch(next);
prepare_task(next);
prepare_arch_switch(next);
}
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index aa8d75804108..d564ca1b5d69 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -72,8 +72,8 @@ static int membarrier_global_expedited(void)
rcu_read_lock();
p = task_rcu_dereference(&cpu_rq(cpu)->curr);
- if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
- MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+ if (p && (atomic_read(&p->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
if (!fallback)
__cpumask_set_cpu(cpu, tmpmask);
else
@@ -177,6 +177,46 @@ static int membarrier_private_expedited(int flags)
return 0;
}
+static void sync_other_runqueues_membarrier_state(struct mm_struct *mm)
+{
+ int cpu;
+
+ if (num_online_cpus() == 1)
+ return;
+
+ /*
+ * For multi-mm user threads, we need to ensure all future scheduler
+ * executions will observe @mm's new membarrier state.
+ */
+ synchronize_rcu();
+
+ /*
+ * For each cpu runqueue (except the current cpu), if the running task's
+ * mm matches @mm, ensure that all of @mm's set membarrier state bits are
+ * also set in the runqueue's current task's membarrier state.
+ *
+ * Use an atomic_or() to set the task membarrier state, thus ensuring
+ * this operation is always additive. This is important in case many
+ * different membarrier registration commands are invoked concurrently,
+ * given that they do not hold the mmap_sem.
+ */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /* Skip current CPU. */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm == mm)
+ atomic_or(atomic_read(&mm->membarrier_state),
+ &p->membarrier_state);
+ rcu_read_unlock();
+ }
+ cpus_read_unlock();
+}
+
static int membarrier_register_global_expedited(void)
{
struct task_struct *p = current;
@@ -186,6 +226,8 @@ static int membarrier_register_global_expedited(void)
MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
return 0;
atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &p->membarrier_state);
+
if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
/*
* For single mm user, single threaded process, we can
@@ -196,12 +238,7 @@ static int membarrier_register_global_expedited(void)
*/
smp_mb();
} else {
- /*
- * For multi-mm user threads, we need to ensure all
- * future scheduler executions will observe the new
- * thread flag state for this mm.
- */
- synchronize_rcu();
+ sync_other_runqueues_membarrier_state(mm);
}
atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
&mm->membarrier_state);
@@ -213,12 +250,14 @@ static int membarrier_register_private_expedited(int flags)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
- int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+ int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
- state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ ready_state =
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
}
/*
@@ -226,20 +265,15 @@ static int membarrier_register_private_expedited(int flags)
* groups, which use the same mm. (CLONE_VM but not
* CLONE_THREAD).
*/
- if (atomic_read(&mm->membarrier_state) & state)
+ if ((atomic_read(&mm->membarrier_state) & ready_state))
return 0;
- atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
if (flags & MEMBARRIER_FLAG_SYNC_CORE)
- atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
- &mm->membarrier_state);
- if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
- /*
- * Ensure all future scheduler executions will observe the
- * new thread flag state for this process.
- */
- synchronize_rcu();
- }
- atomic_or(state, &mm->membarrier_state);
+ set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+ atomic_or(set_state, &mm->membarrier_state);
+ atomic_or(set_state, &p->membarrier_state);
+ if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1))
+ sync_other_runqueues_membarrier_state(mm);
+ atomic_or(ready_state, &mm->membarrier_state);
return 0;
}
--
2.17.1