Re: [PATCH -v2 1/5] sched: Fix ttwu() race

From: Peter Zijlstra
Date: Thu Jul 23 2020 - 14:29:09 EST


On Wed, Jul 22, 2020 at 10:57:56AM +0100, Chris Wilson wrote:

> Perhaps more damning is that I can replace WF_ON_CPU with p->on_cpu to
> suppress the warning:

*argh*, I'm starting to go mad...

Chris, could you please try the below patch?

Can you also confirm that if you do:

$ echo NO_TTWU_QUEUE_ON_CPU > /debug/sched_features

(or wherever else system-doofus mounts debugfs these days) the issue no longer
manifests? Because if I don't get a handle on this soon, we might have to
disable this thing for now :/
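
For reference, debugfs is usually mounted at /sys/kernel/debug rather than
/debug, so the same toggle would look roughly like this (paths assumed, adjust
to wherever debugfs actually lives on your box):

$ mount | grep debugfs                                 # find the actual mount point
$ cat /sys/kernel/debug/sched_features                 # TTWU_QUEUE_ON_CPU should be listed
$ echo NO_TTWU_QUEUE_ON_CPU > /sys/kernel/debug/sched_features   # needs root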


---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a2a244af9a537..8218779734288 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2430,13 +2430,15 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
         return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+static inline bool ttwu_queue_cond(struct task_struct *p, int cpu, int wake_flags)
 {
+        int this_cpu = smp_processor_id();
+
         /*
          * If the CPU does not share cache, then queue the task on the
          * remote rqs wakelist to avoid accessing remote data.
          */
-        if (!cpus_share_cache(smp_processor_id(), cpu))
+        if (!cpus_share_cache(this_cpu, cpu))
                 return true;
 
         /*
@@ -2445,15 +2447,30 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
          * the soon-to-be-idle CPU as the current CPU is likely busy.
          * nr_running is checked to avoid unnecessary task stacking.
          */
-        if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
-                return true;
+        if (wake_flags & WF_ON_CPU) {
+
+                if (unlikely(cpu == this_cpu)) {
+                        int on_cpu = READ_ONCE(p->on_cpu);
+                        int cpu1 = task_cpu(p);
+
+                        smp_rmb();
+                        smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+                        pr_alert("ttwu-IPI-self: %d==%d, p->on_cpu=%d;0, task_cpu(p)=%d;%d\n",
+                                 cpu, this_cpu, on_cpu, cpu1, task_cpu(p));
+
+                        return false;
+                }
+
+                return cpu_rq(cpu)->nr_running <= 1;
+        }
 
         return false;
 }
 
 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
-        if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+        if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu, wake_flags)) {
                 if (WARN_ON_ONCE(cpu == smp_processor_id()))
                         return false;
 
@@ -2713,7 +2730,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
          * to ensure we observe the correct CPU on which the task is currently
          * scheduling.
          */
-        if (smp_load_acquire(&p->on_cpu) &&
+        if (sched_feat(TTWU_QUEUE_ON_CPU) && smp_load_acquire(&p->on_cpu) &&
             ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
                 goto unlock;
 
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7481cd96f3915..b231a840c3eba 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -50,6 +50,7 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
  * using the scheduler IPI. Reduces rq->lock contention/bounces.
  */
 SCHED_FEAT(TTWU_QUEUE, true)
+SCHED_FEAT(TTWU_QUEUE_ON_CPU, true)
 
 /*
  * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
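
If the pr_alert() in the patch ever fires, it lands in the kernel log; a rough
way to watch for it while reproducing (assuming a dmesg that supports -w /
--follow):

$ dmesg -w | grep ttwu-IPI-self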