Re: [PATCH 00/13] Reconcile NUMA balancing decisions with the load balancer v6

From: Peter Zijlstra
Date: Fri May 15 2020 - 07:18:15 EST


On Fri, May 15, 2020 at 09:47:40AM +0100, Mel Gorman wrote:

> However, the wakeups are so rapid that the wakeup
> happens while the server is descheduling. That forces the waker to spin
> on smp_cond_load_acquire for longer. In this case, it can be cheaper to
> add the task to the rq->wake_list even if that potentially requires an IPI.

Right, I think Rik ran into that as well at some point. He wanted to
make ->on_cpu do a hand-off, but simply queueing the wakeup on the prev
cpu (which is currently in the middle of schedule()) should be an easier
proposition.

Maybe something like this untested thing... could explode most mighty,
didn't thing too hard.

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa6c19d38e82..c07b92a0ee5d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2312,7 +2312,7 @@ static void wake_csd_func(void *info)
sched_ttwu_pending();
}

-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+static void __ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);

@@ -2354,6 +2354,17 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
+
+static bool ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+{
+ if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+ __ttwu_queue_remote(p, cpu, wake_flags);
+ return true;
+ }
+
+ return false;
+}
#endif /* CONFIG_SMP */

static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2362,11 +2373,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
struct rq_flags rf;

#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
- ttwu_queue_remote(p, cpu, wake_flags);
+ if (ttwu_queue_remote(p, cpu, wake_flags))
return;
- }
#endif

rq_lock(rq, &rf);
@@ -2550,7 +2558,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->on_rq && ttwu_remote(p, wake_flags))
goto unlock;

+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
#ifdef CONFIG_SMP
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
+
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
@@ -2581,15 +2597,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_cond_load_acquire(&p->on_cpu, !VAL);
-
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
- p->state = TASK_WAKING;
+ if (READ_ONCE(p->on_cpu) && __ttwu_queue_remote(p, cpu, wake_flags))
+ goto unlock;

- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
+ smp_cond_load_acquire(&p->on_cpu, !VAL);

cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
@@ -2597,14 +2608,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
-
-#else /* CONFIG_SMP */
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
#endif /* CONFIG_SMP */

ttwu_queue(p, cpu, wake_flags);