Re: [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

From: Valentin Schneider
Date: Thu Oct 15 2020 - 09:55:05 EST



On 15/10/20 12:05, Peter Zijlstra wrote:
> +static int affine_move_task(struct rq *rq, struct rq_flags *rf,
> +			    struct task_struct *p, int dest_cpu, unsigned int flags)
> +{
> +	struct set_affinity_pending my_pending = { }, *pending = NULL;
> +	struct migration_arg arg = {
> +		.task = p,
> +		.dest_cpu = dest_cpu,
> +	};
> +	bool complete = false;
> +
> +	/* Can the task run on the task's current CPU? If so, we're done */
> +	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
> +		pending = p->migration_pending;
> +		if (pending) {
> +			p->migration_pending = NULL;
> +			complete = true;

Deciphering my TLA+ deadlock traces leads me to think this needs

refcount_inc(&pending->refs);

because the 'goto do_complete' leads us to an unconditional decrement (see the sketch at the end of this mail).

> +		}
> +		task_rq_unlock(rq, p, rf);
> +
> +		if (complete)
> +			goto do_complete;
                        ^^^^
that here

> +
> +		return 0;
> +	}
> +

[...]

> +do_complete:
> +		if (complete)
> +			complete_all(&pending->done);
> +	}
> +
> +	wait_for_completion(&pending->done);
> +
> +	if (refcount_dec_and_test(&pending->refs))
            ^^^^^^^^^^^^^^^^^^^^^^^
leads to this guy there

> +		wake_up_var(&pending->refs);
> +
> +	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
> +
> +	return 0;
> +}
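
IOW, something like the below is what my traces point at -- just a sketch
of that one extra refcount_inc(), completely untested:

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
		pending = p->migration_pending;
		if (pending) {
			/* Pin it: do_complete drops a ref this path never took */
			refcount_inc(&pending->refs);
			p->migration_pending = NULL;
			complete = true;
		}
		task_rq_unlock(rq, p, rf);

		if (complete)
			goto do_complete;

		return 0;
	}

If I'm reading the refcounting right, that way the refcount_dec_and_test()
after wait_for_completion() releases a reference this path actually owns,
rather than eating one belonging to whichever task installed
p->migration_pending.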