Re: [RFC][PATCH 0/8] sched,idle: need resched polling rework
From: Peter Zijlstra
Date: Thu May 22 2014 - 08:58:29 EST
On Fri, Apr 11, 2014 at 08:00:23AM -0700, Andy Lutomirski wrote:
>
> That being said, I think that this addresses only one of the two major
> issues. While the race you're fixing is more interesting, I think its
> impact is dwarfed by the fact that ttwu_queue_remote completely
> ignores polling. (NB: I haven't actually tested this patch set, but I
> did try to instrument this stuff a while ago.)
>
> To fix this, presumably the wake-from-idle path needs a
> sched_ttwu_pending call, and ttwu_queue_remote could use resched_task.
> sched_ttwu_pending could benefit from a straightforward optimization:
> it doesn't need rq->lock if llist is empty.
>
> If you're not planning on trying to fix that, I can try to write up a
> patch in the next day or two.
>
> Even with all of this fixed, what happens when ttwu_queue_remote is
> called with a task that has lower priority than whatever is currently
> running on the targeted cpu? I think the result is an IPI that serves
> very little purpose other than avoiding taking a spinlock in the
> waking thread. This may be a bad tradeoff. I doubt that this matters
> for my particular workload, though.
>
Ok, now that that's all settled, how about something like this on top?
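
The key new helper below is set_nr_if_polling(): it only sets
TIF_NEED_RESCHED when the target advertises TIF_POLLING_NRFLAG, so the
waker can skip the IPI entirely. For illustration only, a minimal
userspace sketch of that cmpxchg loop using C11 atomics (the flag
values and the standalone flags word here are made up, not the
kernel's):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define POLLING		0x1UL	/* stands in for _TIF_POLLING_NRFLAG */
#define NEED_RESCHED	0x2UL	/* stands in for _TIF_NEED_RESCHED */

static _Atomic unsigned long flags = POLLING;

/* Set NEED_RESCHED only if the target is (still) polling. */
static bool set_nr_if_polling_sketch(void)
{
	unsigned long val = atomic_load(&flags);

	for (;;) {
		if (!(val & POLLING))
			return false;	/* not polling: caller sends the IPI */
		if (val & NEED_RESCHED)
			return true;	/* already set: nothing to do */
		/* on failure, val is reloaded with the current flags */
		if (atomic_compare_exchange_weak(&flags, &val,
						 val | NEED_RESCHED))
			return true;
	}
}

int main(void)
{
	printf("%s\n", set_nr_if_polling_sketch() ?
	       "no IPI needed" : "send IPI");
	return 0;
}

The weak compare-exchange reloads val on failure, which is what the
kernel loop in the patch does by hand with cmpxchg().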
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4ea7b3f1a087..a5da85fb3570 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -546,12 +546,38 @@ static bool set_nr_and_not_polling(struct task_struct *p)
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = ti->flags;
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ return true;
+ val = old;
+ }
+}
+
#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
+
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
#endif
/*
@@ -652,16 +678,7 @@ static void wake_up_idle_cpu(int cpu)
if (rq->curr != rq->idle)
return;
- /*
- * We can set TIF_RESCHED on the idle task of the other CPU
- * lockless. The worst case is that the other CPU runs the
- * idle task through an additional NOOP schedule()
- */
- set_tsk_need_resched(rq->idle);
-
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(rq->idle))
+ if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
}
@@ -1521,12 +1538,15 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
+ if (!llist)
+ return;
+
raw_spin_lock(&rq->lock);
while (llist) {
@@ -1581,8 +1601,10 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
- smp_send_reschedule(cpu);
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(cpu_rq(cpu)->idle))
+ smp_send_reschedule(cpu);
+ }
}
bool cpus_share_cache(int this_cpu, int that_cpu)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 34083c9ac976..4286f2e59136 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -12,6 +12,8 @@
#include <trace/events/power.h>
+#include "sched.h"
+
static int __read_mostly cpu_idle_force_poll;
void cpu_idle_poll_ctrl(bool enable)
@@ -223,6 +225,7 @@ static void cpu_idle_loop(void)
*/
preempt_set_need_resched();
tick_nohz_idle_exit();
+ sched_ttwu_pending();
schedule_preempt_disabled();
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2cbe81308af..2b35ec6df6b3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
#ifdef CONFIG_SMP
+extern void sched_ttwu_pending(void);
+
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
+#else
+
+static inline void sched_ttwu_pending(void) { }
+
#endif /* CONFIG_SMP */
#include "stats.h"