[PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

From: John Stultz
Date: Wed Jun 12 2024 - 21:58:59 EST


I recently got a bug report that
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
5.10 and 6.1. It's not a huge regression in absolute time
(~30-40ns), but it is a >10% change.
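
(For reference, a minimal microbenchmark for this kind of
regression might look roughly like the below. This is just a
sketch, not the reporter's actual test case.)

/* Time ITERS back-to-back CLOCK_THREAD_CPUTIME_ID reads and
 * print the average per-call cost in nanoseconds.
 */
#include <stdio.h>
#include <time.h>

#define ITERS 1000000

int main(void)
{
        struct timespec ts, start, end;
        long long delta_ns;
        int i;

        clock_gettime(CLOCK_MONOTONIC, &start);
        for (i = 0; i < ITERS; i++)
                clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
        clock_gettime(CLOCK_MONOTONIC, &end);

        delta_ns = (end.tv_sec - start.tv_sec) * 1000000000LL +
                   (end.tv_nsec - start.tv_nsec);
        printf("avg per call: %lld ns\n", delta_ns / ITERS);
        return 0;
}

Pinning the test to a single cpu and averaging several runs
helps keep the numbers stable.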

I narrowed the cause down to the addition of
psi_account_irqtime() to update_rq_clock_task() in commit
52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
pressure").

So that explains the behavior change, but it also seems odd that
we're doing psi irq accounting from a syscall that is just
trying to read the thread's cputime.

Thinking about it more, it seems the re-use of update_rq_clock()
to handle accounting for any in-progress time for the current
task has the potential for side effects and unnecessary work.

So instead, rework the logic so that we calculate the current
CPU runtime in a read-only fashion.

This has the side benefit of improving
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) performance by ~12%
over the 5.10 behavior, and by ~21% over the 6.1 behavior.

NOTE: I'm not 100% sure this is correct yet. There may be some
edge cases I've overlooked, so I'd greatly appreciate any
review or feedback.

Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Frederic Weisbecker <frederic@xxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Ben Segall <bsegall@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
Cc: Valentin Schneider <vschneid@xxxxxxxxxx>
Cc: Qais Yousef <qyousef@xxxxxxxxxxx>
Cc: Joel Fernandes <joel@xxxxxxxxxxxxxxxxx>
Cc: kernel-team@xxxxxxxxxxx
Signed-off-by: John Stultz <jstultz@xxxxxxxxxx>
---
kernel/sched/core.c | 82 ++++++++++++++++++++++++++-------------------
1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4cc0522..b29cde5ded84 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -692,16 +692,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* RQ-clock updating methods:
*/

-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
- s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+static inline s64 get_irq_delta(struct rq *rq, s64 delta)
+{
+ s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into
@@ -720,7 +715,45 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
*/
if (irq_delta > delta)
irq_delta = delta;
+ return irq_delta;
+}
+#else
+static inline s64 get_irq_delta(struct rq *rq, s64 delta)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+static inline s64 get_steal_time(struct rq *rq, s64 delta)
+{
+ s64 steal;

+ if (!static_key_false(&paravirt_steal_rq_enabled))
+ return 0;
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+ if (unlikely(steal > delta))
+ steal = delta;
+ return steal;
+}
+#else
+static inline s64 get_steal_time(struct rq *rq, s64 delta)
+{
+ return 0;
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+ s64 __maybe_unused steal = 0, irq_delta = 0;
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ irq_delta = get_irq_delta(rq, delta);
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
psi_account_irqtime(rq->curr, irq_delta);
@@ -728,12 +761,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
- steal -= rq->prev_steal_time_rq;
-
- if (unlikely(steal > delta))
- steal = delta;
-
+ steal = get_steal_time(rq, delta);
rq->prev_steal_time_rq += steal;
delta -= steal;
}
@@ -5547,23 +5575,6 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

-/*
- * The function fair_sched_class.update_curr accesses the struct curr
- * and its field curr->exec_start; when called from task_sched_runtime(),
- * we observe a high rate of cache misses in practice.
- * Prefetching this data results in improved performance.
- */
-static inline void prefetch_curr_exec_start(struct task_struct *p)
-{
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct sched_entity *curr = (&p->se)->cfs_rq->curr;
-#else
- struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
-#endif
- prefetch(curr);
- prefetch(&curr->exec_start);
-}
-
/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
@@ -5573,6 +5584,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
+ s64 delta_exec = 0;
u64 ns;

#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
@@ -5598,11 +5610,11 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
- prefetch_curr_exec_start(p);
- update_rq_clock(rq);
- p->sched_class->update_curr(rq);
+ delta_exec = sched_clock_cpu(cpu_of(rq)) - p->se.exec_start;
+ delta_exec -= get_irq_delta(rq, delta_exec);
+ delta_exec -= get_steal_time(rq, delta_exec);
}
- ns = p->se.sum_exec_runtime;
+ ns = p->se.sum_exec_runtime + delta_exec;
task_rq_unlock(rq, p, &rf);

return ns;
--
2.45.2.505.gda0bf45e8d-goog