Re: [PATCH] sched/rt: Fix double enqueue caused by rt_effective_prio

From: Peter Zijlstra
Date: Thu Jul 08 2021 - 06:06:48 EST


On Thu, Jul 01, 2021 at 11:14:31AM +0200, Juri Lelli wrote:
> Double enqueues in rt runqueues (list) have been reported while running
> a simple test that spawns a number of threads doing a short sleep/run
> pattern while being concurrently setscheduled between rt and fair class.

> __sched_setscheduler() uses rt_effective_prio() to handle proper queuing
> of priority boosted tasks that are setscheduled while being boosted.
> rt_effective_prio() is however called twice per each
> __sched_setscheduler() call: first directly by __sched_setscheduler()
> before dequeuing the task and then by __setscheduler() to actually do
> the priority change. If the priority of the pi_top_task is concurrently
> being changed however, it might happen that the two calls return
> different results. If, for example, the first call returned the same rt
> priority the task was running at and the second one a fair priority, the
> task won't be removed by the rt list (on_list still set) and then
> enqueued in the fair runqueue. When eventually setscheduled back to rt
> it will be seen as enqueued already and the WARNING/BUG be issued.
>
> Fix this by calling rt_effective_prio() only once and then reusing the
> return value. Concurrent priority inheritance handling is still safe and
> will eventually converge to a new state by following the inheritance
> chain(s).
>
> Fixes: ff77e46853598 ("sched/rt: Fix PI handling vs. sched_setscheduler()")

You sure? AFAICT that commit only adds the warning that's triggered
here, but the double rt_effective_prio() call seems to predate that. Or
am I missing something about the on_rq changes in that patch?

Commit 0782e63bc6fe seems to introduce the double call..

> ---
> kernel/sched/core.c | 12 ++++++++----
> 1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0c22cd026440..c84ac1d675f4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6823,7 +6823,8 @@ static void __setscheduler_params(struct task_struct *p,
>
> /* Actually do priority change: must hold pi & rq lock. */
> static void __setscheduler(struct rq *rq, struct task_struct *p,
> - const struct sched_attr *attr, bool keep_boost)
> + const struct sched_attr *attr, bool keep_boost,
> + int new_effective_prio)
> {
> /*
> * If params can't change scheduling class changes aren't allowed
> @@ -6840,7 +6841,7 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
> */
> p->prio = normal_prio(p);
> if (keep_boost)
> - p->prio = rt_effective_prio(p, p->prio);
> + p->prio = new_effective_prio;
>
> if (dl_prio(p->prio))
> p->sched_class = &dl_sched_class;
> @@ -6873,7 +6874,7 @@ static int __sched_setscheduler(struct task_struct *p,
> int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
> MAX_RT_PRIO - 1 - attr->sched_priority;
> int retval, oldprio, oldpolicy = -1, queued, running;
> - int new_effective_prio, policy = attr->sched_policy;
> + int new_effective_prio = newprio, policy = attr->sched_policy;
> const struct sched_class *prev_class;
> struct callback_head *head;
> struct rq_flags rf;
> @@ -7072,6 +7073,9 @@ static int __sched_setscheduler(struct task_struct *p,
> oldprio = p->prio;
>
> if (pi) {
> + newprio = fair_policy(attr->sched_policy) ?
> + NICE_TO_PRIO(attr->sched_nice) : newprio;

*groan*.... That completes our local copy of normal_prio() based off of
attr.

> /*
> * Take priority boosted tasks into account. If the new
> * effective priority is unchanged, we just store the new
> @@ -7093,7 +7097,7 @@ static int __sched_setscheduler(struct task_struct *p,
>
> prev_class = p->sched_class;
>
> - __setscheduler(rq, p, attr, pi);
> + __setscheduler(rq, p, attr, pi, new_effective_prio);
> __setscheduler_uclamp(p, attr);
>
> if (queued) {

Slightly larger patch, but perhaps a little cleaner.. still pondering if
we can share a little more between __sched_setscheduler() and
rt_mutex_setprio().

---
kernel/sched/core.c | 51 +++++++++++++++++++++------------------------------
1 file changed, 21 insertions(+), 30 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 970b95ff2952..ede10642612c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1983,12 +1983,18 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}

-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+static inline int __normal_prio(int policy, int rt_prio, int nice)
{
- return p->static_prio;
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
}

/*
@@ -2000,15 +2006,7 @@ static inline int __normal_prio(struct task_struct *p)
*/
static inline int normal_prio(struct task_struct *p)
{
- int prio;
-
- if (task_has_dl_policy(p))
- prio = MAX_DL_PRIO-1;
- else if (task_has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}

/*
@@ -4101,7 +4099,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);

- p->prio = p->normal_prio = __normal_prio(p);
+ p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);

/*
@@ -6828,7 +6826,7 @@ static void __setscheduler_params(struct task_struct *p,

/* Actually do priority change: must hold pi & rq lock. */
static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr, bool keep_boost)
+ const struct sched_attr *attr, int newprio)
{
/*
* If params can't change scheduling class changes aren't allowed
@@ -6839,13 +6837,7 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,

__setscheduler_params(p, attr);

- /*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
- */
- p->prio = normal_prio(p);
- if (keep_boost)
- p->prio = rt_effective_prio(p, p->prio);
+ p->prio = newprio;

if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -6875,10 +6867,8 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
struct callback_head *head;
struct rq_flags rf;
@@ -7076,6 +7066,7 @@ static int __sched_setscheduler(struct task_struct *p,
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;

+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
if (pi) {
/*
* Take priority boosted tasks into account. If the new
@@ -7084,8 +7075,8 @@ static int __sched_setscheduler(struct task_struct *p,
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_effective_prio(p, newprio);
- if (new_effective_prio == oldprio)
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}

@@ -7098,7 +7089,7 @@ static int __sched_setscheduler(struct task_struct *p,

prev_class = p->sched_class;

- __setscheduler(rq, p, attr, pi);
+ __setscheduler(rq, p, attr, newprio);
__setscheduler_uclamp(p, attr);

if (queued) {