Re: sched_setscheduler() vs idle_balance() race

From: Peter Zijlstra
Date: Thu May 28 2015 - 09:54:09 EST


On Thu, May 28, 2015 at 09:43:52AM +0200, Mike Galbraith wrote:
> Hi Peter,
>
> I'm not seeing what prevents pull_task() from yanking a task out from
> under __sched_setscheduler(). A box sprinkling smoldering 3.0 kernel
> wreckage all over my bugzilla mbox isn't seeing it either ;-)

Say, how easily can that thing be reproduced?

The patch below is compile-tested only, but it might just work if I didn't
miss anything :-)


---
kernel/sched/core.c | 137 +++++++++++++++++++++++++++++-----------------------
1 file changed, 77 insertions(+), 60 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4eec60757b16..28f1ddc0bef2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1000,22 +1000,6 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}

-/*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
- */
-static inline void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
-{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
- prev_class->switched_from(rq, p);
- /* Possble rq->lock 'hole'. */
- p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio || dl_task(p))
- p->sched_class->prio_changed(rq, p, oldprio);
-}
-
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
const struct sched_class *class;
@@ -3075,12 +3059,38 @@ void rt_mutex_setprio(struct task_struct *p, int prio)

p->prio = prio;

+ if (prev_class != p->sched_class) {
+ prev_class->switched_from(rq, p);
+ /*
+ * switched_from() is allowed to drop @rq->lock; which opens a
+ * race against load-balancing, however since @p is not
+ * currently enqueued it is invisible to the load-balancer.
+ *
+ * double check @p is still where we thought it was.
+ */
+ WARN_ON_ONCE(task_rq(p) != rq);
+ }
+
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, enqueue_flag);

- check_class_changed(rq, p, prev_class, oldprio);
+ /*
+ * Both switched_to() and prio_changed() are allowed to drop @rq->lock;
+ * which opens a race against load-balancing, and since @p is now
+ * enqueued it can indeed be subject to this.
+ *
+ * This means that any balancing done by these functions must double
+ * check a task's rq.
+ */
+ if (prev_class != p->sched_class)
+ p->sched_class->switched_to(rq, p);
+ else if (oldprio != p->prio || dl_task(p))
+ p->sched_class->prio_changed(rq, p, oldprio);
+ /*
+ * It further means we should not rely on @p's rq from here on.
+ */
out_unlock:
__task_rq_unlock(rq);
}
@@ -3420,7 +3430,7 @@ static bool dl_param_changed(struct task_struct *p,

static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
- bool user)
+ bool user, bool pi)
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3606,18 +3616,20 @@ static int __sched_setscheduler(struct task_struct *p,
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;

- /*
- * Take priority boosted tasks into account. If the new
- * effective priority is unchanged, we just store the new
- * normal parameters and do not touch the scheduler class and
- * the runqueue. This will be done when the task deboost
- * itself.
- */
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- if (new_effective_prio == oldprio) {
- __setscheduler_params(p, attr);
- task_rq_unlock(rq, p, &flags);
- return 0;
+ if (pi) {
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ if (new_effective_prio == oldprio) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
}

queued = task_on_rq_queued(p);
@@ -3628,7 +3640,19 @@ static int __sched_setscheduler(struct task_struct *p,
put_prev_task(rq, p);

prev_class = p->sched_class;
- __setscheduler(rq, p, attr, true);
+ __setscheduler(rq, p, attr, pi);
+
+ if (prev_class != p->sched_class) {
+ prev_class->switched_from(rq, p);
+ /*
+ * switched_from() is allowed to drop @rq->lock; which opens a
+ * race against load-balancing, however since @p is not
+ * currently enqueued it is invisible to the load-balancer.
+ *
+ * double check @p is still where we thought it was.
+ */
+ WARN_ON_ONCE(task_rq(p) != rq);
+ }

if (running)
p->sched_class->set_curr_task(rq);
@@ -3640,10 +3664,25 @@ static int __sched_setscheduler(struct task_struct *p,
enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
}

- check_class_changed(rq, p, prev_class, oldprio);
+ /*
+ * Both switched_to() and prio_changed() are allowed to drop @rq->lock;
+ * which opens a race against load-balancing, and since @p is now
+ * enqueued it can indeed be subject to this.
+ *
+ * This means that any balancing done by these functions must double
+ * check a task's rq.
+ */
+ if (prev_class != p->sched_class)
+ p->sched_class->switched_to(rq, p);
+ else if (oldprio != p->prio || dl_task(p))
+ p->sched_class->prio_changed(rq, p, oldprio);
+ /*
+ * It further means we should not rely on @p's rq from here on.
+ */
task_rq_unlock(rq, p, &flags);

- rt_mutex_adjust_pi(p);
+ if (pi)
+ rt_mutex_adjust_pi(p);

return 0;
}
@@ -3664,7 +3703,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
attr.sched_policy = policy;
}

- return __sched_setscheduler(p, &attr, check);
+ return __sched_setscheduler(p, &attr, check, true);
}
/**
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3685,7 +3724,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);

int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
- return __sched_setscheduler(p, attr, true);
+ return __sched_setscheduler(p, attr, true, true);
}
EXPORT_SYMBOL_GPL(sched_setattr);

@@ -7346,32 +7385,12 @@ EXPORT_SYMBOL(___might_sleep);
#endif

#ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
{
- const struct sched_class *prev_class = p->sched_class;
+ struct task_struct *g, *p;
struct sched_attr attr = {
.sched_policy = SCHED_NORMAL,
};
- int old_prio = p->prio;
- int queued;
-
- queued = task_on_rq_queued(p);
- if (queued)
- dequeue_task(rq, p, 0);
- __setscheduler(rq, p, &attr, false);
- if (queued) {
- enqueue_task(rq, p, 0);
- resched_curr(rq);
- }
-
- check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
- struct task_struct *g, *p;
- unsigned long flags;
- struct rq *rq;

read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
@@ -7398,9 +7417,7 @@ void normalize_rt_tasks(void)
continue;
}

- rq = task_rq_lock(p, &flags);
- normalize_task(rq, p);
- task_rq_unlock(rq, p, &flags);
+ __sched_setscheduler(p, &attr, false, false);
}
read_unlock(&tasklist_lock);
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/