[PATCH] sched: Add logic to handle parallel try_to_wake_up() of the same task

From: Kirill Tkhai
Date: Sat Jul 13 2013 - 11:46:03 EST


This patch optimizes try_to_wake_up() for the case when the
system performs parallel wake-ups of the same task on different
cpus. It also adds accounting of statistics for these situations.

We check the state of the task we want to wake up.
If it is TASK_WAKING, the task is already being handled
by try_to_wake_up() on another cpu, and shortly after this
check it will be queued and its state set to TASK_RUNNING.
So we can simply return early once we are sure the task
will be TASK_RUNNING in the near future (maybe right after
the check). The benefit is that we do not spin waiting for
the pi_lock.
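
In code, the fast path boils down to the following check before
p->pi_lock is taken (condensed from the core.c hunk below):

#ifdef CONFIG_SMP
	if (p->state == TASK_WAKING) {
		/*
		 * Another cpu is between setting TASK_WAKING and
		 * setting TASK_RUNNING in ttwu_do_wakeup(), so the
		 * task is guaranteed to become TASK_RUNNING; report
		 * success without contending on p->pi_lock.
		 */
		smp_rmb();
		inc_nr_parallel_wakeups(p);	/* schedstat accounting */
		return 1;
	}
#endif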

Returning early must not cause any problems, because the
scheduler already does the same thing when it queues a task
on another cpu's wake_list to be woken up there.
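
For reference, that remote-queueing path looks roughly like this
in kernel/sched/core.c of this era (paraphrased, so details may
differ slightly): the waker returns success as soon as the task
is on the remote cpu's wake_list, before the wakeup has actually
completed there.

static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
	/* Queue the task on the remote cpu's wake_list ... */
	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
		smp_send_reschedule(cpu);
	/*
	 * ... and return immediately; the remote cpu completes
	 * the wakeup from its scheduler IPI handler.
	 */
}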

Parallel wake-ups are not an unrealistic situation. Here are
statistics from my 2-cpu laptop:

~# grep 'nr_wakeups_parallel.*[1-9]' -B100 -h /proc/*/sched | grep 'threads\|parallel\|wakeups ' | sed 's/(.*)//g'

rcu_sched
se.statistics.nr_wakeups : 102
se.statistics.nr_wakeups_parallel : 2
Xorg
se.statistics.nr_wakeups : 36030
se.statistics.nr_wakeups_parallel : 56
gnome-terminal
se.statistics.nr_wakeups : 70573
se.statistics.nr_wakeups_parallel : 55
rcu_preempt
se.statistics.nr_wakeups : 68101
se.statistics.nr_wakeups_parallel : 1368

These numbers are from just after boot (5-10 minutes of uptime).
Later the ratio goes down:

rcu_sched
se.statistics.nr_wakeups : 102
se.statistics.nr_wakeups_parallel : 2
Xorg
se.statistics.nr_wakeups : 49057
se.statistics.nr_wakeups_parallel : 74
gnome-terminal
se.statistics.nr_wakeups : 1495463
se.statistics.nr_wakeups_parallel : 99
rcu_preempt
se.statistics.nr_wakeups : 2015010
se.statistics.nr_wakeups_parallel : 3738

Signed-off-by: Kirill Tkhai <tkhai@xxxxxxxxx>
CC: Steven Rostedt <rostedt@xxxxxxxxxxx>
CC: Ingo Molnar <mingo@xxxxxxxxxx>
CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
include/linux/sched.h | 1 +
kernel/sched/core.c | 29 +++++++++++++++++++++++++----
kernel/sched/debug.c | 7 +++++++
kernel/sched/stats.h | 16 ++++++++++++++++
4 files changed, 49 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fc09d21..235a466 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -964,6 +964,7 @@ struct sched_statistics {
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
+ atomic_t nr_wakeups_parallel;
};
#endif

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9d06ad6..1e1475f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1336,6 +1336,11 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)

p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
+ /*
+ * Pairs with the TASK_WAKING check in try_to_wake_up().
+ */
+ smp_wmb();
+
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);

@@ -1487,20 +1492,37 @@ static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
- int cpu, success = 0;
+ int cpu, success = 1;

+ /*
+ * See commentary for commit 04e2f1741d235ba599037734878d72e57cb302b5.
+ */
smp_wmb();
+#ifdef CONFIG_SMP
+ if (p->state == TASK_WAKING) {
+ /*
+ * Pairs with the stores to p->state below and in ttwu_do_wakeup().
+ */
+ smp_rmb();
+ inc_nr_parallel_wakeups(p);
+ return success;
+ }
+#endif
raw_spin_lock_irqsave(&p->pi_lock, flags);
- if (!(p->state & state))
+ if (!(p->state & state)) {
+ success = 0;
goto out;
+ }

- success = 1; /* we're going to change ->state */
cpu = task_cpu(p);

if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;

#ifdef CONFIG_SMP
+ p->state = TASK_WAKING;
+ smp_wmb();
+
/*
* If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
@@ -1513,7 +1535,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
smp_rmb();

p->sched_contributes_to_load = !!task_contributes_to_load(p);
- p->state = TASK_WAKING;

if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e076bdd..f18736d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -542,6 +542,13 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.statistics.nr_wakeups_idle);

{
+ int nr = get_nr_parallel_wakeups(p);
+
+ SEQ_printf(m, "%-45s:%21d\n",
+ "se.statistics.nr_wakeups_parallel", nr);
+ }
+
+ {
u64 avg_atom, avg_per_cpu;

avg_atom = p->se.sum_exec_runtime;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 5aef494..dbbc6e9 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -155,6 +155,22 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */

+#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_SMP)
+static inline void
+inc_nr_parallel_wakeups(struct task_struct *t)
+{
+ atomic_inc(&t->se.statistics.nr_wakeups_parallel);
+}
+static inline int
+get_nr_parallel_wakeups(struct task_struct *t)
+{
+ return atomic_read(&t->se.statistics.nr_wakeups_parallel);
+}
+#else
+#define inc_nr_parallel_wakeups(t) do { } while (0)
+#define get_nr_parallel_wakeups(t) (0)
+#endif /* CONFIG_SCHEDSTATS && CONFIG_SMP */
+
/*
* The following are functions that support scheduler-internal time accounting.
* These functions are generally called at the timer tick. None of this depends
--