Re: [RFC][PATCH 0/8] sched,idle: need resched polling rework

From: Peter Zijlstra
Date: Tue Jun 03 2014 - 06:44:02 EST


On Mon, Jun 02, 2014 at 11:40:26PM -0700, Andy Lutomirski wrote:
> You're testing polling on the task being woken, which cannot possibly
> succeed: the only tasks that have any business polling are the idle
> tasks. Something like this seems to help:
>
>  static void ttwu_queue_remote(struct task_struct *p, int cpu)
>  {
> +	struct rq *rq = cpu_rq(cpu);
> +
>  	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
> -		if (!set_nr_if_polling(p))
> +		if (!set_nr_if_polling(rq->idle))
>  			smp_send_reschedule(cpu);
> +		else
> +			trace_sched_wake_polling_cpu(cpu);
>  	}
>  }
>
> If you don't beat me to it, I'll send real patches in the morning.
> I'll also send some followup patches to make it even better. Fully
> fixed up, this gets rid of almost all of my rescheduling interrupts
> except for interrupts from the timer tick.

We need rq->curr: rq->idle 'sleeps' with polling set and nr clear, but
setting need_resched on it obviously has no effect if it's not actually
the current task.

Touching rq->curr needs holding rcu_read_lock() though, to make sure
the task stays around; still, that shouldn't be a problem.
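
(Purely as an illustration, not part of the patch below: a minimal
user-space model of the set_nr_if_polling() operation the patch adds,
using C11 atomics in place of the kernel's cmpxchg() on the thread_info
flags word. The flag values, the bare flags pointer and main() are
invented for the example.)

/* User-space model of set_nr_if_polling(); the TIF_-style flag values
 * are arbitrary stand-ins for the real thread_info bits.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TIF_NEED_RESCHED	(1UL << 0)
#define TIF_POLLING_NRFLAG	(1UL << 1)

/*
 * Set NEED_RESCHED only if POLLING is set and NEED_RESCHED is not
 * already set.  Returns true when the polling CPU will notice the
 * flag on its own (no IPI needed), false when the caller must IPI.
 */
static bool set_nr_if_polling(atomic_ulong *flags)
{
	unsigned long val = atomic_load(flags);

	for (;;) {
		if ((val & (TIF_NEED_RESCHED | TIF_POLLING_NRFLAG)) !=
		    TIF_POLLING_NRFLAG)
			return false;
		/* on failure 'val' is reloaded with the current value */
		if (atomic_compare_exchange_weak(flags, &val,
						 val | TIF_NEED_RESCHED))
			return true;
	}
}

int main(void)
{
	atomic_ulong polling = TIF_POLLING_NRFLAG;	/* idle and polling */
	atomic_ulong running = 0;			/* busy, not polling */

	printf("polling target: %s\n", set_nr_if_polling(&polling) ?
	       "flag set, skip the IPI" : "send the IPI");
	printf("running target: %s\n", set_nr_if_polling(&running) ?
	       "flag set, skip the IPI" : "send the IPI");
	return 0;
}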

> Also, grr, I still think this would be clearer if polling and
> need_resched were per cpu instead of per task -- they only make sense
> on a running task. I guess that need_resched being in
> thread_info->flags is helpful because it streamlines the interrupt
> exit code. Oh, well.

Yes, and the fact that traditionally we didn't have per-cpu storage. But
yes, the interrupt/system-call return path is a good reason to keep all
this.

---
Subject: sched: Optimize ttwu IPI
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Thu May 22 17:19:22 CEST 2014

When enqueueing tasks on remote LLC domains, we send an IPI to do the
work 'locally' and avoid bouncing all the cachelines over.

However, when the remote CPU is idle (and polling, say x86 mwait), we
don't need to send an IPI; we can simply kick the TIF word to wake it
up and have the 'idle' loop do the work.

So when _TIF_POLLING_NRFLAG is set, but _TIF_NEED_RESCHED is not (yet)
set, set _TIF_NEED_RESCHED and avoid sending the IPI.
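
As an illustration only (not part of the patch), a small user-space
sketch of that idea: one thread stands in for the polling idle CPU,
spinning on its flags word much like mwait monitors the TIF cacheline,
and the waker merely stores the flag instead of sending an IPI.  All
names and flag values are invented for the example, and the waker's
unconditional fetch_or mirrors set_nr_and_not_polling() rather than
set_nr_if_polling().

/* User-space sketch: wake a "polling idle CPU" by writing its flags
 * word; no IPI/signal is involved.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define TIF_NEED_RESCHED	(1UL << 0)
#define TIF_POLLING_NRFLAG	(1UL << 1)

static atomic_ulong cpu_flags;		/* models the idle task's TIF word */

static void *idle_cpu(void *arg)
{
	(void)arg;

	/* "Go idle": advertise that we are polling the flags word. */
	atomic_fetch_or(&cpu_flags, TIF_POLLING_NRFLAG);

	/* Stand-in for mwait: wait until someone writes NEED_RESCHED. */
	while (!(atomic_load(&cpu_flags) & TIF_NEED_RESCHED))
		;	/* mwait would sleep until the cacheline is written */

	/* Leave idle: stop polling, then do the pending work. */
	atomic_fetch_and(&cpu_flags, ~TIF_POLLING_NRFLAG);
	printf("idle cpu: saw NEED_RESCHED, doing the wakeup locally\n");
	return NULL;
}

int main(void)
{
	pthread_t idle;

	pthread_create(&idle, NULL, idle_cpu, NULL);

	/* Wait until the "CPU" is actually polling before waking it. */
	while (!(atomic_load(&cpu_flags) & TIF_POLLING_NRFLAG))
		;

	/* The waker: kick the TIF word instead of sending an IPI. */
	if (atomic_fetch_or(&cpu_flags, TIF_NEED_RESCHED) & TIF_POLLING_NRFLAG)
		printf("waker: target was polling, no IPI needed\n");
	else
		printf("waker: target was not polling, would have to IPI\n");

	pthread_join(idle, NULL);
	return 0;
}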

Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Mike Galbraith <umgwanakikbuti@xxxxxxxxx>
Much-Requested-by: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
kernel/sched/core.c | 61 +++++++++++++++++++++++++++++++++++++--------------
kernel/sched/idle.c | 3 ++
kernel/sched/sched.h | 6 +++++
3 files changed, 54 insertions(+), 16 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -535,7 +535,7 @@ static inline void init_hrtick(void)
 	__old;							\
 })
 
-#ifdef TIF_POLLING_NRFLAG
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
 /*
  * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
  * this avoids any races wrt polling state changes and thereby avoids
@@ -546,12 +546,40 @@ static bool set_nr_and_not_polling(struc
 	struct thread_info *ti = task_thread_info(p);
 	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
 }
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+	struct thread_info *ti = task_thread_info(p);
+	typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+
+	for (;;) {
+		if ((val & (_TIF_NEED_RESCHED | _TIF_POLLING_NRFLAG)) !=
+		    _TIF_POLLING_NRFLAG)
+			return false;
+		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+		if (old == val)
+			break;
+		val = old;
+	}
+	return true;
+}
+
 #else
 static bool set_nr_and_not_polling(struct task_struct *p)
 {
 	set_tsk_need_resched(p);
 	return true;
 }
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+	return false;
+}
+#endif
 #endif
 
 /*
@@ -652,16 +680,7 @@ static void wake_up_idle_cpu(int cpu)
 	if (rq->curr != rq->idle)
 		return;
 
-	/*
-	 * We can set TIF_RESCHED on the idle task of the other CPU
-	 * lockless. The worst case is that the other CPU runs the
-	 * idle task through an additional NOOP schedule()
-	 */
-	set_tsk_need_resched(rq->idle);
-
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(rq->idle))
+	if (set_nr_and_not_polling(rq->idle))
 		smp_send_reschedule(cpu);
 }
 
@@ -1521,13 +1540,17 @@ static int ttwu_remote(struct task_struc
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
 {
 	struct rq *rq = this_rq();
 	struct llist_node *llist = llist_del_all(&rq->wake_list);
 	struct task_struct *p;
+	unsigned long flags;
+
+	if (!llist)
+		return;
 
-	raw_spin_lock(&rq->lock);
+	raw_spin_lock_irqsave(&rq->lock, flags);
 
 	while (llist) {
 		p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1535,7 +1558,7 @@ static void sched_ttwu_pending(void)
 		ttwu_do_activate(rq, p, 0);
 	}
 
-	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 void scheduler_ipi(void)
@@ -1581,8 +1604,14 @@ void scheduler_ipi(void)

 static void ttwu_queue_remote(struct task_struct *p, int cpu)
 {
-	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
-		smp_send_reschedule(cpu);
+	struct rq *rq = cpu_rq(cpu);
+
+	if (llist_add(&p->wake_entry, &rq->wake_list)) {
+		rcu_read_lock();
+		if (!set_nr_if_polling(rq->curr))
+			smp_send_reschedule(cpu);
+		rcu_read_unlock();
+	}
 }
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -12,6 +12,8 @@

 #include <trace/events/power.h>
 
+#include "sched.h"
+
 static int __read_mostly cpu_idle_force_poll;
 
 void cpu_idle_poll_ctrl(bool enable)
@@ -218,6 +220,7 @@ static void cpu_idle_loop(void)
 		 */
 		preempt_set_need_resched();
 		tick_nohz_idle_exit();
+		sched_ttwu_pending();
 		schedule_preempt_disabled();
 	}
 }
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -670,6 +670,8 @@ extern int migrate_swap(struct task_stru

 #ifdef CONFIG_SMP
 
+extern void sched_ttwu_pending(void);
+
 #define rcu_dereference_check_sched_domain(p) \
 	rcu_dereference_check((p), \
 			      lockdep_is_held(&sched_domains_mutex))
@@ -787,6 +789,10 @@ static inline unsigned int group_first_c

 extern int group_balance_cpu(struct sched_group *sg);
 
+#else
+
+static inline void sched_ttwu_pending(void) { }
+
 #endif /* CONFIG_SMP */
 
 #include "stats.h"
