Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks

From: Peter Zijlstra
Date: Fri Apr 01 2016 - 17:52:00 EST


On Fri, Apr 01, 2016 at 09:34:24PM +0800, Xunlei Pang wrote:

> >> I checked the code, currently only deadline accesses the
> >> pi_waiters/pi_waiters_leftmost
> >> without pi_lock held via rt_mutex_get_top_task(), other cases all have
> >> pi_lock held.

> Any better ideas is welcome.

Something like the below _might_ work; but its late and I haven't looked
at the PI code in a while. This basically caches a pointer to the top
waiter task in the running task_struct, under pi_lock and rq->lock, and
therefore we can use it with only rq->lock held.

Since the task is blocked, and cannot unblock without taking itself from
the block chain -- which would cause rt_mutex_setprio() to set another
top waiter task, the lifetime rules should be good.

Having this top waiter pointer around might also be useful to further
implement bandwidth inheritance or such, but I've not thought about that
too much.

Lots of code deleted because we move the entire task->prio mapping muck
into the scheduler, because we now pass a pi_task, not a prio.

---
include/linux/sched.h | 1 +
include/linux/sched/rt.h | 20 +-------------------
kernel/locking/rtmutex.c | 45 +++++----------------------------------------
kernel/sched/core.c | 34 +++++++++++++++++++++++-----------
kernel/sched/deadline.c | 2 +-
5 files changed, 31 insertions(+), 71 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52c4847b05e2..30169f38cb24 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1615,6 +1615,7 @@ struct task_struct {

/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;
+ struct task_struct *pi_task;

struct wake_q_node wake_q;

diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index a30b172df6e1..0a8f071b16e3 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -16,31 +16,13 @@ static inline int rt_task(struct task_struct *p)
}

#ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
return tsk->pi_blocked_on != NULL;
}
#else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
- return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
- int newprio)
-{
- return newprio;
-}
-
-static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
- return NULL;
-}
# define rt_mutex_adjust_pi(p) do { } while (0)
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e746607abe5..13b6b5922d3c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -257,53 +257,18 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
}

/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
-
- return min(task_top_pi_waiter(task)->prio,
- task->normal_prio);
-}
-
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return NULL;
-
- return task_top_pi_waiter(task)->task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
- if (!task_has_pi_waiters(task))
- return newprio;
-
- if (task_top_pi_waiter(task)->task->prio <= newprio)
- return task_top_pi_waiter(task)->task->prio;
- return newprio;
-}
-
-/*
* Adjust the priority of a task, after its pi_waiters got modified.
*
* This can be both boosting and unboosting. task->pi_lock must be held.
*/
static void __rt_mutex_adjust_prio(struct task_struct *task)
{
- int prio = rt_mutex_getprio(task);
+ struct task_struct *pi_task = task;
+
+ if (unlikely(task_has_pi_waiters(task)))
+ pi_task = task_top_pi_waiter(task)->task;

- if (task->prio != prio || dl_prio(prio))
- rt_mutex_setprio(task, prio);
+ rt_mutex_setprio(task, pi_task);
}

/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b489fcac37b..7d7e3a0eaeb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3392,7 +3392,7 @@ EXPORT_SYMBOL(default_wake_function);
/*
* rt_mutex_setprio - set the current priority of a task
* @p: task
- * @prio: prio value (kernel-internal form)
+ * @pi_task: top waiter, donating state
*
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
@@ -3400,13 +3400,20 @@ EXPORT_SYMBOL(default_wake_function);
* Used by the rt_mutex code to implement priority inheritance
* logic. Call site only calls if the priority of the task changed.
*/
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
- struct rq *rq;
+ int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *prev_class;
+ struct rq *rq;

- BUG_ON(prio > MAX_PRIO);
+ /*
+ * For FIFO/RR we simply donate prio; for DL things are
+ * more interesting.
+ */
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = min(pi_task->prio, p->normal_prio);
+ if (p->prio == prio && !dl_prio(prio))
+ return;

rq = __task_rq_lock(p);

@@ -3442,6 +3449,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
put_prev_task(rq, p);

+ if (pi_task == p)
+ pi_task = NULL;
+ p->pi_task = pi_task;
+
/*
* Boosting condition are:
* 1. -rt task is running and holds mutex A
@@ -3452,7 +3463,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* running task
*/
if (dl_prio(prio)) {
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
@@ -3727,10 +3737,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
* Keep a potential priority boosting if called from
* sched_setscheduler().
*/
- if (keep_boost)
- p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
- else
- p->prio = normal_prio(p);
+ p->prio = normal_prio(p);
+ if (keep_boost && p->pi_task)
+ p->prio = min(p->prio, p->pi_task->prio);

if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -4017,7 +4026,10 @@ static int __sched_setscheduler(struct task_struct *p,
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ new_effective_prio = newprio;
+ if (p->pi_task)
+ new_effective_prio = min(new_effective_prio, p->pi_task->prio);
+
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index affd97ec9f65..6c5aa4612eb6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -932,7 +932,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)

static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ struct task_struct *pi_task = p->pi_task;
struct sched_dl_entity *pi_se = &p->dl;

/*