Re: [PATCH RFC 1/3] sched/pe: Exclude balance callback queuing during proxy()'s migrate

From: Dietmar Eggemann
Date: Fri Dec 09 2022 - 10:07:49 EST


On 23/11/2022 02:21, Joel Fernandes (Google) wrote:
> In commit 565790d28b1e ("sched: Fix balance_callback()"), it is clear
> that the rq lock needs to be held from when balance callbacks are
> queued to when __balance_callbacks() in schedule() is called.
>
> This has to be done without dropping the runqueue lock in between. If
> it is dropped between queuing and executing callbacks, it is possible
> that another CPU, say in __sched_setscheduler(), can queue balancing
> callbacks and cause issues as shown in that commit.
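
[ For reference, the invariant from that commit, roughly sketched:
  queue and run in one rq->lock critical section (my sketch, not code
  from this patch):

	rq_lock(rq, &rf);
	queue_balance_callback(rq, head, func);
	/* ... rq->lock must stay held throughout ... */
	__balance_callbacks(rq);	/* from schedule() */
	rq_unlock(rq, &rf);
]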
>
> This is precisely what happens in proxy(). During a proxy(), the
> current CPU's rq lock is temporarily dropped when moving the tasks in
> the migrate list to the owner CPU.
>
> This commit therefore makes proxy() exclude balance callback queuing
> on other CPUs, in the section where proxy() temporarily drops the rq
> lock of the current CPU.
>
> CPUs that acquire a remote CPU's rq lock but may later queue a balance
> callback are made to call the new helpers in this commit to check
> whether balance_lock is held. If it is held, then the rq lock is
> released and the acquisition is retried, in the hope that the ban on
> balance callback queuing has been lifted by then.
>
> Reported-by: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
> Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
> ---
> kernel/sched/core.c | 72 ++++++++++++++++++++++++++++++++++++++++++--
> kernel/sched/sched.h | 7 ++++-
> 2 files changed, 76 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 88a5fa34dc06..aba90b3dc3ef 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -633,6 +633,29 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
> }
> }
>
> +/*
> + * Helper to call __task_rq_lock safely, in scenarios where we might be about to
> + * queue a balance callback on a remote CPU. That CPU might be in proxy(), and
> + * could have released its rq lock while holding balance_lock. So release rq
> + * lock in such a situation to avoid deadlock, and retry.
> + */
> +struct rq *__task_rq_lock_balance(struct task_struct *p, struct rq_flags *rf)
> +{
> + struct rq *rq;
> + bool locked = false;
> +
> + do {
> + if (locked) {
> + __task_rq_unlock(rq, rf);
> + cpu_relax();
> + }
> + rq = __task_rq_lock(p, rf);
> + locked = true;
> + } while (raw_spin_is_locked(&rq->balance_lock));
> +
> + return rq;
> +}
> +
> /*
> * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
> */
> @@ -675,6 +698,29 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
> }
> }
>
> +/*
> + * Helper to call task_rq_lock safely, in scenarios where we might be about to
> + * queue a balance callback on a remote CPU. That CPU might be in proxy(), and
> + * could have released its rq lock while holding balance_lock. So release rq
> + * lock in such a situation to avoid deadlock, and retry.
> + */
> +struct rq *task_rq_lock_balance(struct task_struct *p, struct rq_flags *rf)
> +{
> + struct rq *rq;
> + bool locked = false;
> +
> + do {
> + if (locked) {
> + task_rq_unlock(rq, p, rf);
> + cpu_relax();
> + }
> + rq = task_rq_lock(p, rf);
> + locked = true;
> + } while (raw_spin_is_locked(&rq->balance_lock));
> +
> + return rq;
> +}
> +
> /*
> * RQ-clock updating methods:
> */
> @@ -6739,6 +6785,12 @@ proxy(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
> p->wake_cpu = wake_cpu;
> }
>
> + /*
> + * Prevent other CPUs from queuing balance callbacks while we migrate
> + * tasks in the migrate_list with the rq lock released.
> + */
> + raw_spin_lock(&rq->balance_lock);
> +
> rq_unpin_lock(rq, rf);
> raw_spin_rq_unlock(rq);
> raw_spin_rq_lock(that_rq);
> @@ -6758,7 +6810,21 @@ proxy(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
> }
>
> raw_spin_rq_unlock(that_rq);
> +
> + /*
> + * This may make lockdep unhappy as we acquire rq->lock with
> + * balance_lock held. But that should be a false positive, as the
> + * following pattern happens only on the current CPU with interrupts
> + * disabled:
> + * rq_lock()
> + * balance_lock();
> + * rq_unlock();
> + * rq_lock();
> + */
> raw_spin_rq_lock(rq);
> +
> + raw_spin_unlock(&rq->balance_lock);
> +
> rq_repin_lock(rq, rf);
>
> return NULL; /* Retry task selection on _this_ CPU. */
> @@ -7489,7 +7555,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
> if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
> return;
>
> - rq = __task_rq_lock(p, &rf);
> + rq = __task_rq_lock_balance(p, &rf);
> update_rq_clock(rq);
> /*
> * Set under pi_lock && rq->lock, such that the value can be used under
> @@ -8093,7 +8159,8 @@ static int __sched_setscheduler(struct task_struct *p,
> * To be able to change p->policy safely, the appropriate
> * runqueue lock must be held.
> */
> - rq = task_rq_lock(p, &rf);
> + rq = task_rq_lock_balance(p, &rf);
> +
> update_rq_clock(rq);
>
> /*

You consider rt_mutex_setprio() and __sched_setscheduler() versus
proxy(), but what about all the other places, like load_balance(),
update_blocked_averages(), __set_cpus_allowed_ptr() and many more,
in which we take the rq lock (via task_rq_lock() or rq_lock{_xxx}())?
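
Presumably each of those call sites would then need the same retry
treatment, e.g. a rq_lock() variant mirroring your helpers (my
hypothetical sketch, not part of the patch):

	static inline void rq_lock_balance(struct rq *rq, struct rq_flags *rf)
	{
		bool locked = false;

		do {
			if (locked) {
				rq_unlock(rq, rf);
				cpu_relax();
			}
			rq_lock(rq, rf);
			locked = true;
		} while (raw_spin_is_locked(&rq->balance_lock));
	}

and similarly for the rq_lock_irq()/rq_lock_irqsave() users.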

With your changes to locktorture in {2-3}/3 you still run CFS
lock_torture_writers, but you should see the issue popping up in
__set_cpus_allowed_ptr() (from torture_shuffle()), for example.

Tried with:

insmod /lib/modules/torture.ko
insmod /lib/modules/locktorture.ko torture_type=mutex_lock rt_boost=1 rt_boost_factor=1 nlocks=3
                                                                      ^^^^^^^^^^^^^^^^^

When changing all lock_torture_writers to FIFO, it becomes even more
visible.

-->8--

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index e4529c2166e9..ea75d525fe7c 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -683,7 +683,8 @@ static int lock_torture_writer(void *arg)
DEFINE_TORTURE_RANDOM(rand);

VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, MAX_NICE);
+ if (!rt_task(current))
+ set_user_nice(current, MAX_NICE);

do {
if ((torture_random(&rand) & 0xfffff) == 0)
diff --git a/kernel/torture.c b/kernel/torture.c
index 1d0dd88369e3..55d8ac417a4a 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -57,6 +57,9 @@ module_param(verbose_sleep_duration, int, 0444);
static int random_shuffle;
module_param(random_shuffle, int, 0444);

+static int lock_torture_writer_fifo;
+module_param(lock_torture_writer_fifo, int, 0444);
+
static char *torture_type;
static int verbose;

@@ -734,7 +737,7 @@ bool stutter_wait(const char *title)
cond_resched_tasks_rcu_qs();
spt = READ_ONCE(stutter_pause_test);
for (; spt; spt = READ_ONCE(stutter_pause_test)) {
- if (!ret) {
+ if (!ret && !rt_task(current)) {
sched_set_normal(current, MAX_NICE);
ret = true;
}
@@ -944,6 +947,11 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
*tp = NULL;
return ret;
}
+
+ if (lock_torture_writer_fifo &&
+ !strncmp(s, "lock_torture_writer", strlen(s)))
+ sched_set_fifo(*tp);
+
wake_up_process(*tp); // Process is sleeping, so ordering provided.
torture_shuffle_task_register(*tp);
return ret;
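
With the above, the writers can be made FIFO at load time via the new
module parameter, e.g.:

insmod /lib/modules/torture.ko lock_torture_writer_fifo=1
insmod /lib/modules/locktorture.ko torture_type=mutex_lock rt_boost=1 rt_boost_factor=1 nlocks=3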

[...]