[RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity()

From: Peter Zijlstra
Date: Fri May 15 2015 - 11:50:57 EST


Because sched_setaffinity() checks p->flags & PF_NO_SETAFFINITY
without locks, a caller might observe an old value of the flag and race
with the set_cpus_allowed_ptr() call from __kthread_bind(), effectively
undoing the binding:

  __kthread_bind()
    do_set_cpus_allowed()
                                        <SYSCALL>
                                          sched_setaffinity()
                                            if (p->flags & PF_NO_SETAFFINITY)
                                          set_cpus_allowed_ptr()
    p->flags |= PF_NO_SETAFFINITY

Fix the issue by doing both the bind and the affinity update under the
regular scheduler locks, and by re-checking PF_NO_SETAFFINITY once those
locks are held.

This also closes a hole in the serialization of
task_struct::{nr_,}cpus_allowed.

Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Oleg Nesterov <oleg@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
 include/linux/kthread.h |  1 +
 include/linux/sched.h   |  7 -------
 kernel/kthread.c        | 21 ++++++++++++++++++---
 kernel/sched/core.c     | 30 ++++++++++++++++++++++++++++++----
 kernel/workqueue.c      | 11 ++---------
5 files changed, 47 insertions(+), 23 deletions(-)
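
Not part of the patch, but for illustration: a minimal sketch of how the
new kthread_bind_mask() helper is meant to be used, modeled on the
workqueue conversion below. The thread function and names here are
made-up placeholders:

	#include <linux/kthread.h>
	#include <linux/cpumask.h>
	#include <linux/sched.h>
	#include <linux/err.h>

	/* Placeholder thread function: sleep until asked to stop. */
	static int example_thread_fn(void *data)
	{
		set_current_state(TASK_INTERRUPTIBLE);
		while (!kthread_should_stop()) {
			schedule();
			set_current_state(TASK_INTERRUPTIBLE);
		}
		__set_current_state(TASK_RUNNING);
		return 0;
	}

	static struct task_struct *example_spawn_bound(const struct cpumask *mask)
	{
		struct task_struct *tsk;

		tsk = kthread_create(example_thread_fn, NULL, "example");
		if (IS_ERR(tsk))
			return tsk;

		/*
		 * The freshly created thread sleeps until first woken, so
		 * kthread_bind_mask() can set the cpumask and PF_NO_SETAFFINITY
		 * under ->pi_lock before example_thread_fn() ever runs; a
		 * racing sched_setaffinity() can no longer undo the bind.
		 */
		kthread_bind_mask(tsk, mask);
		wake_up_process(tsk);

		return tsk;
	}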

--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cp
})

void kthread_bind(struct task_struct *k, unsigned int cpu);
+void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
int kthread_stop(struct task_struct *k);
bool kthread_should_stop(void);
bool kthread_should_park(void);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2215,13 +2215,6 @@ static inline void calc_load_enter_idle(
static inline void calc_load_exit_idle(void) { }
#endif /* CONFIG_NO_HZ_COMMON */

-#ifndef CONFIG_CPUMASK_OFFSTACK
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
-{
- return set_cpus_allowed_ptr(p, &new_mask);
-}
-#endif
-
/*
* Do not use outside of architecture code which knows its limitations.
*
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_no
}
EXPORT_SYMBOL(kthread_create_on_node);

-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
+ unsigned long flags;
+
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+
/* It's safe because the task is inactive. */
- do_set_cpus_allowed(p, cpumask_of(cpu));
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+ __kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+ __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
}

/**
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4044,6 +4044,9 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pi
return retval;
}

+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check);
+
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
cpumask_var_t cpus_allowed, new_mask;
@@ -4110,7 +4113,7 @@ long sched_setaffinity(pid_t pid, const
}
#endif
again:
- retval = set_cpus_allowed_ptr(p, new_mask);
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);

if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
@@ -4638,7 +4641,8 @@ void init_idle(struct task_struct *idle,
struct rq *rq = cpu_rq(cpu);
unsigned long flags;

- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_lock(&rq->lock);

__sched_fork(0, idle);
idle->state = TASK_RUNNING;
@@ -4664,7 +4668,8 @@ void init_idle(struct task_struct *idle,
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);

/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
@@ -4788,6 +4793,8 @@ static struct rq *move_queued_task(struc

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
+ lockdep_assert_held(&p->pi_lock);
+
if (p->sched_class->set_cpus_allowed)
p->sched_class->set_cpus_allowed(p, new_mask);

@@ -4818,7 +4825,8 @@ void do_set_cpus_allowed(struct task_str
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
{
unsigned long flags;
struct rq *rq;
@@ -4827,6 +4835,15 @@ int set_cpus_allowed_ptr(struct task_str

rq = task_rq_lock(p, &flags);

+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;

@@ -4856,6 +4873,11 @@ int set_cpus_allowed_ptr(struct task_str

return ret;
}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, new_mask, false);
+}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

/*
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1619,11 +1619,7 @@ static void worker_attach_to_pool(struct
{
mutex_lock(&pool->attach_mutex);

- /*
- * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
- * online CPUs. It'll be re-applied when any of the CPUs come up.
- */
- set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+ kthread_bind_mask(worker->task, pool->attrs->cpumask);

/*
* The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
@@ -1708,9 +1704,6 @@ static struct worker *create_worker(stru

set_user_nice(worker->task, pool->attrs->nice);

- /* prevent userland from meddling with cpumask of workqueue workers */
- worker->task->flags |= PF_NO_SETAFFINITY;
-
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);

@@ -3832,7 +3825,7 @@ struct workqueue_struct *__alloc_workque
}

wq->rescuer = rescuer;
- rescuer->task->flags |= PF_NO_SETAFFINITY;
+ kthread_bind_mask(rescuer->task, cpu_possible_mask);
wake_up_process(rescuer->task);
}
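
Side note, illustrative only and not part of the patch: once a kthread is
bound (PF_NO_SETAFFINITY set), sched_setaffinity(2) from userspace fails
with EINVAL, and with the re-check under the scheduler locks a racing call
can no longer silently undo the binding either. A rough userspace sketch
for poking at a worker thread; the pid has to be supplied by hand:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/types.h>

	int main(int argc, char **argv)
	{
		cpu_set_t set;
		pid_t pid;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <kworker-pid>\n", argv[0]);
			return 1;
		}
		pid = (pid_t)atoi(argv[1]);

		CPU_ZERO(&set);
		CPU_SET(0, &set);

		/* Expected to fail with EINVAL for a PF_NO_SETAFFINITY thread. */
		if (sched_setaffinity(pid, sizeof(set), &set) == -1)
			printf("sched_setaffinity(%d): %s\n", (int)pid, strerror(errno));
		else
			printf("sched_setaffinity(%d) unexpectedly succeeded\n", (int)pid);

		return 0;
	}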


