[PATCH 1/3] sched/cputime: Improve scalability of times()/clock_gettime() on 32-bit CPUs
From: Stanislaw Gruszka
Date: Thu Sep 01 2016 - 05:31:47 EST
My previous commit:
a1eb1411b4e4 ("sched/cputime: Improve scalability by not accounting thread group tasks pending runtime")
helped to achieve good performance of SYS_times() and
SYS_clock_gettime(CLOCK_PROCESS_CPUTIME_ID) on 64-bit architectures.
However, taking task_rq_lock() when reading t->se.sum_exec_runtime on
32-bit architectures still makes those syscalls slow.
The reason we take the lock is to keep reads of the 64-bit sum_exec_runtime
variable consistent. On 32-bit architectures a u64 is loaded and stored in
two halves, so a reader racing with an update can observe a torn value,
e.g. one half from before an increment and the other half from after it.
While such an inconsistency is very unlikely, I assume it may still happen
on at least some 32-bit architectures.
To protect the variable I introduced a new seqcount; the access pattern is
the usual seqcount retry loop, sketched below.
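For clarity, this is essentially what the new helpers added to
include/linux/sched.h below do, condensed (32-bit case only):

	/* writer side, called from the update_curr() paths */
	write_seqcount_begin(&se->sum_exec_runtime_seqcount);
	se->sum_exec_runtime += delta;
	write_seqcount_end(&se->sum_exec_runtime_seqcount);

	/* reader side, retries if it raced with a writer */
	do {
		seq = read_seqcount_begin(&se->sum_exec_runtime_seqcount);
		ns = se->sum_exec_runtime;
	} while (read_seqcount_retry(&se->sum_exec_runtime_seqcount, seq));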
Performance improvements on a machine with 32 cores (32-bit CPUs),
measured by the benchmarks described in commit:
6075620b0590 ("sched/cputime: Mitigate performance regression in times()/clock_gettime()")
are shown below:
syscall-nr_threads: not patched: patched:
clock_gettime-2 3.42 ( 0.00%) 2.24 ( 34.29%)
clock_gettime-5 3.72 ( 0.00%) 1.41 ( 61.98%)
clock_gettime-8 3.76 ( 0.00%) 1.25 ( 66.74%)
clock_gettime-12 3.68 ( 0.00%) 1.14 ( 68.91%)
clock_gettime-21 4.24 ( 0.00%) 1.02 ( 75.88%)
clock_gettime-30 4.56 ( 0.00%) 0.89 ( 80.43%)
clock_gettime-48 5.98 ( 0.00%) 0.87 ( 85.38%)
clock_gettime-79 8.87 ( 0.00%) 0.87 ( 90.18%)
clock_gettime-110 12.30 ( 0.00%) 0.88 ( 92.82%)
clock_gettime-128 14.98 ( 0.00%) 0.89 ( 94.02%)
times-2 4.03 ( 0.00%) 2.50 ( 38.08%)
times-5 3.71 ( 0.00%) 1.61 ( 56.55%)
times-8 3.77 ( 0.00%) 1.47 ( 61.04%)
times-12 3.79 ( 0.00%) 2.30 ( 39.37%)
times-21 4.32 ( 0.00%) 2.62 ( 39.25%)
times-30 4.58 ( 0.00%) 2.87 ( 37.31%)
times-48 5.98 ( 0.00%) 2.75 ( 53.98%)
times-79 8.96 ( 0.00%) 2.82 ( 68.55%)
times-110 12.83 ( 0.00%) 2.88 ( 77.59%)
times-128 15.39 ( 0.00%) 2.82 ( 81.67%)
I also tested some alternatives to this patch: using vtime_seqcount
instead of a new seqcount, and making sum_exec_runtime an atomic64_t
variable. The performance results of both were worse:
syscall-nr_threads: New seqcount: vtime_seqcount: atomic64_t
clock_gettime-5 1.41 ( 0.00%) 1.89 (-33.43%) 2.37 (-67.70%)
clock_gettime-8 1.25 ( 0.00%) 1.64 (-31.39%) 2.25 (-79.82%)
clock_gettime-12 1.14 ( 0.00%) 1.52 (-32.63%) 2.36 (-106.56%)
clock_gettime-21 1.02 ( 0.00%) 1.30 (-27.17%) 2.51 (-145.85%)
clock_gettime-30 0.89 ( 0.00%) 1.09 (-22.17%) 3.06 (-243.11%)
clock_gettime-48 0.87 ( 0.00%) 1.05 (-20.14%) 3.81 (-336.16%)
clock_gettime-79 0.87 ( 0.00%) 1.00 (-14.47%) 4.61 (-429.51%)
clock_gettime-110 0.88 ( 0.00%) 0.98 (-11.33%) 5.28 (-497.73%)
clock_gettime-128 0.89 ( 0.00%) 1.00 (-11.96%) 5.64 (-530.39%)
times-2 2.50 ( 0.00%) 2.66 ( -6.65%) 3.45 (-38.19%)
times-5 1.61 ( 0.00%) 1.52 ( 5.40%) 2.33 (-44.54%)
times-8 1.47 ( 0.00%) 1.67 (-13.83%) 2.43 (-65.67%)
times-12 2.30 ( 0.00%) 2.07 ( 10.02%) 2.24 ( 2.35%)
times-21 2.62 ( 0.00%) 2.48 ( 5.53%) 2.49 ( 5.26%)
times-30 2.87 ( 0.00%) 2.71 ( 5.75%) 3.13 ( -8.88%)
times-48 2.75 ( 0.00%) 2.70 ( 2.03%) 3.78 (-37.46%)
times-79 2.82 ( 0.00%) 3.01 ( -6.92%) 4.53 (-60.55%)
times-110 2.88 ( 0.00%) 2.76 ( 4.17%) 5.22 (-81.57%)
times-128 2.82 ( 0.00%) 2.90 ( -2.87%) 5.48 (-94.43%)
The atomic64_t results are especially disappointing; it looks like the
atomic64 operation variants used on this machine (atomic64_*_cx8()) are
slow. A sketch of that variant follows.
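For reference, the atomic64_t variant boils down to something like the
sketch below (an illustration of the approach, not necessarily the exact
code that was benchmarked):

	/* with sum_exec_runtime declared as atomic64_t in struct sched_entity */
	static inline void update_sum_exec_runtime(struct sched_entity *se, u64 delta)
	{
		atomic64_add(delta, &se->sum_exec_runtime);
	}

	static inline u64 read_sum_exec_runtime(struct task_struct *t)
	{
		return atomic64_read(&t->se.sum_exec_runtime);
	}

On 32-bit x86 both atomic64_add() and atomic64_read() go through the
cmpxchg8b based atomic64_*_cx8() helpers, which is presumably where the
extra cost comes from.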
Signed-off-by: Stanislaw Gruszka <sgruszka@xxxxxxxxxx>
---
include/linux/sched.h | 39 +++++++++++++++++++++++++++++++++++++--
kernel/sched/core.c | 17 +++++------------
kernel/sched/cputime.c | 22 +---------------------
kernel/sched/deadline.c | 2 +-
kernel/sched/fair.c | 2 +-
kernel/sched/rt.c | 2 +-
kernel/time/posix-cpu-timers.c | 3 ++-
7 files changed, 48 insertions(+), 39 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eb64fcd..65714bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1334,6 +1334,10 @@ struct sched_entity {
u64 nr_migrations;
+#ifndef CONFIG_64BIT
+ seqcount_t sum_exec_runtime_seqcount;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics statistics;
#endif
@@ -2505,8 +2509,39 @@ static inline void enable_sched_clock_irqtime(void) {}
static inline void disable_sched_clock_irqtime(void) {}
#endif
-extern unsigned long long
-task_sched_runtime(struct task_struct *task);
+extern void update_sched_runtime(struct task_struct *task);
+
+#ifdef CONFIG_64BIT
+static inline void update_sum_exec_runtime(struct sched_entity *se, u64 delta)
+{
+ se->sum_exec_runtime += delta;
+}
+
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ return t->se.sum_exec_runtime;
+}
+#else
+static inline void update_sum_exec_runtime(struct sched_entity *se, u64 delta)
+{
+ write_seqcount_begin(&se->sum_exec_runtime_seqcount);
+ se->sum_exec_runtime += delta;
+ write_seqcount_end(&se->sum_exec_runtime_seqcount);
+}
+
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ u64 ns;
+ int seq;
+
+ do {
+ seq = read_seqcount_begin(&t->se.sum_exec_runtime_seqcount);
+ ns = t->se.sum_exec_runtime;
+ } while (read_seqcount_retry(&t->se.sum_exec_runtime_seqcount, seq));
+
+ return ns;
+}
+#endif
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 556cb07..9f87e0c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2991,20 +2991,17 @@ static inline void prefetch_curr_exec_start(struct task_struct *p)
}
/*
- * Return accounted runtime for the task.
- * In case the task is currently running, return the runtime plus current's
+ * In case the task is currently running, update scheduler stats to include
* pending runtime that have not been accounted yet.
*/
-unsigned long long task_sched_runtime(struct task_struct *p)
+void update_sched_runtime(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
- u64 ns;
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
- * 64-bit doesn't need locks to atomically read a 64bit value.
- * So we have a optimization chance when the task's delta_exec is 0.
+ * Optimization to avoid taking the task lock.
+ *
* Reading ->on_cpu is racy, but this is ok.
*
* If we race with it leaving cpu, we'll take a lock. So we're correct.
@@ -3014,8 +3011,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* been accounted, so we're correct here as well.
*/
if (!p->on_cpu || !task_on_rq_queued(p))
- return p->se.sum_exec_runtime;
-#endif
+ return;
rq = task_rq_lock(p, &rf);
/*
@@ -3028,10 +3024,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
- ns = p->se.sum_exec_runtime;
task_rq_unlock(rq, p, &rf);
-
- return ns;
}
/*
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b93c72d..026c1c4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -306,26 +306,6 @@ static inline cputime_t account_other_time(cputime_t max)
return accounted;
}
-#ifdef CONFIG_64BIT
-static inline u64 read_sum_exec_runtime(struct task_struct *t)
-{
- return t->se.sum_exec_runtime;
-}
-#else
-static u64 read_sum_exec_runtime(struct task_struct *t)
-{
- u64 ns;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(t, &rf);
- ns = t->se.sum_exec_runtime;
- task_rq_unlock(rq, t, &rf);
-
- return ns;
-}
-#endif
-
/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
* tasks (sum on group iteration) belonging to @tsk's group.
@@ -347,7 +327,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* other scheduler action.
*/
if (same_thread_group(current, tsk))
- (void) task_sched_runtime(current);
+ update_sched_runtime(current);
rcu_read_lock();
/* Attempt a lockless read on the first round. */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d091f4a..3e75329 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -742,7 +742,7 @@ static void update_curr_dl(struct rq *rq)
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
- curr->se.sum_exec_runtime += delta_exec;
+ update_sum_exec_runtime(&curr->se, delta_exec);
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 61d4854..2e88bb5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -802,7 +802,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
schedstat_set(curr->statistics.exec_max,
max(delta_exec, curr->statistics.exec_max));
- curr->sum_exec_runtime += delta_exec;
+ update_sum_exec_runtime(curr, delta_exec);
schedstat_add(cfs_rq, exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d5690b7..74990a5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -964,7 +964,7 @@ static void update_curr_rt(struct rq *rq)
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
- curr->se.sum_exec_runtime += delta_exec;
+ update_sum_exec_runtime(&curr->se, delta_exec);
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 39008d7..4d8466b 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -190,7 +190,8 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
*sample = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- *sample = task_sched_runtime(p);
+ update_sched_runtime(p);
+ *sample = read_sum_exec_runtime(p);
break;
}
return 0;
--
1.8.3.1