Re: [PATCH] tick/nohz: Reduce the critical region for jiffies_seq

From: Thomas Gleixner
Date: Mon Nov 16 2020 - 09:15:22 EST


On Mon, Nov 16 2020 at 21:24, Yunfeng Ye wrote:
> On 2020/11/16 19:29, Thomas Gleixner wrote:
>> There are quite some other inefficiencies in that code and the seqcount
>> held time can be reduced way further. Let me stare at it.
>>
> I think it is enough for the write seqcount to protect only
> last_jiffies_update/jiffies_64/tick_next_period. The modification,
> which has not been tested, looks like this:

Yes, but it can be simplified further. I was doing that before I got
distracted. The patch below is uncompiled and untested.
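
For reference, the reason why jiffies_64 and last_jiffies_update have to
stay under the same seqcount write side is that the idle code reads both
under one read section. tick_nohz_next_event() does roughly the following
(a sketch, not a verbatim copy):

	/* Read jiffies and the time when jiffies were updated last */
	do {
		seq = read_seqcount_begin(&jiffies_seq);
		basemono = last_jiffies_update;
		basejiff = jiffies;
	} while (read_seqcount_retry(&jiffies_seq, seq));

Anything which only needs tick_next_period can take jiffies_lock alone,
which is what the new tick_get_next_period() helper below does.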

A split up patch series is here: https://tglx.de/~tglx/patches-jiffies.tar

Thanks,

tglx
---
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -193,7 +193,6 @@ extern int try_to_del_timer_sync(struct
#define del_singleshot_timer_sync(t) del_timer_sync(t)

extern void init_timers(void);
-extern void run_local_timers(void);
struct hrtimer;
extern enum hrtimer_restart it_real_fn(struct hrtimer *);

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1693,29 +1693,6 @@ void timer_clear_idle(void)
}
#endif

-/*
- * Called from the timer interrupt handler to charge one tick to the current
- * process. user_tick is 1 if the tick is user time, 0 for system.
- */
-void update_process_times(int user_tick)
-{
- struct task_struct *p = current;
-
- PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
-
- /* Note: this timer irq context must be accounted for as well. */
- account_process_tick(p, user_tick);
- run_local_timers();
- rcu_sched_clock_irq(user_tick);
-#ifdef CONFIG_IRQ_WORK
- if (in_irq())
- irq_work_tick();
-#endif
- scheduler_tick();
- if (IS_ENABLED(CONFIG_POSIX_TIMERS))
- run_posix_cpu_timers();
-}
-
/**
* __run_timers - run all expired timers (if any) on this CPU.
* @base: the timer vector to be processed.
@@ -1765,7 +1742,7 @@ static __latent_entropy void run_timer_s
/*
* Called by the local, per-CPU timer interrupt on SMP.
*/
-void run_local_timers(void)
+static void run_local_timers(void)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

@@ -1783,6 +1760,29 @@ void run_local_timers(void)
}

/*
+ * Called from the timer interrupt handler to charge one tick to the current
+ * process. user_tick is 1 if the tick is user time, 0 for system.
+ */
+void update_process_times(int user_tick)
+{
+ struct task_struct *p = current;
+
+ PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
+
+ /* Note: this timer irq context must be accounted for as well. */
+ account_process_tick(p, user_tick);
+ run_local_timers();
+ rcu_sched_clock_irq(user_tick);
+#ifdef CONFIG_IRQ_WORK
+ if (in_irq())
+ irq_work_tick();
+#endif
+ scheduler_tick();
+ if (IS_ENABLED(CONFIG_POSIX_TIMERS))
+ run_posix_cpu_timers();
+}
+
+/*
* Since schedule_timeout()'s timer is defined on the stack, it must store
* the target task on the stack as well.
*/
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -877,6 +877,22 @@ static void tick_broadcast_init_next_eve
}
}

+static inline ktime_t tick_get_next_period(void)
+{
+ ktime_t next;
+
+ /*
+ * Protect against concurrent updates (store/load tearing on
+ * 32bit). It does not matter if the time is already in the
+ * past. The broadcast device which is about to be programmed will
+ * fire in any case.
+ */
+ raw_spin_lock(&jiffies_lock);
+ next = tick_next_period;
+ raw_spin_unlock(&jiffies_lock);
+ return next;
+}
+
/**
* tick_broadcast_setup_oneshot - setup the broadcast device
*/
@@ -905,10 +921,11 @@ static void tick_broadcast_setup_oneshot
tick_broadcast_oneshot_mask, tmpmask);

if (was_periodic && !cpumask_empty(tmpmask)) {
+ ktime_t nextevt = tick_get_next_period();
+
clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
- tick_broadcast_init_next_event(tmpmask,
- tick_next_period);
- tick_broadcast_set_event(bc, cpu, tick_next_period);
+ tick_broadcast_init_next_event(tmpmask, nextevt);
+ tick_broadcast_set_event(bc, cpu, nextevt);
} else
bc->next_event = KTIME_MAX;
} else {
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -27,7 +27,9 @@
*/
DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
/*
- * Tick next event: keeps track of the tick time
+ * Tick next event: keeps track of the tick time. It's updated by the
+ * CPU which handles the tick and protected by jiffies_lock. There is
+ * no requirement to write hold the jiffies seqcount for it.
*/
ktime_t tick_next_period;
ktime_t tick_period;
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -44,7 +44,9 @@ struct tick_sched *tick_get_tick_sched(i

#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
/*
- * The time, when the last jiffy update happened. Protected by jiffies_lock.
+ * The time, when the last jiffy update happened. Write access must hold
+ * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
+ * consistent view of jiffies and last_jiffies_update.
*/
static ktime_t last_jiffies_update;

@@ -53,50 +55,59 @@ static ktime_t last_jiffies_update;
*/
static void tick_do_update_jiffies64(ktime_t now)
{
- unsigned long ticks = 0;
+ unsigned long ticks = 1;
ktime_t delta;

/*
- * Do a quick check without holding jiffies_lock:
- * The READ_ONCE() pairs with two updates done later in this function.
+ * Do a quick check without holding jiffies_lock. The READ_ONCE()
+ * pairs with the update done later in this function.
*/
- delta = ktime_sub(now, READ_ONCE(last_jiffies_update));
- if (delta < tick_period)
+ if (ktime_before(now, READ_ONCE(tick_next_period)))
return;

/* Reevaluate with jiffies_lock held */
raw_spin_lock(&jiffies_lock);
+ if (ktime_before(now, tick_next_period)) {
+ raw_spin_unlock(&jiffies_lock);
+ return;
+ }
+
write_seqcount_begin(&jiffies_seq);

- delta = ktime_sub(now, last_jiffies_update);
- if (delta >= tick_period) {
+ delta = ktime_sub(now, tick_next_period);
+ if (unlikely(delta >= tick_period)) {
+ /* Slow path for long idle sleep times */
+ s64 incr = ktime_to_ns(tick_period);

- delta = ktime_sub(delta, tick_period);
- /* Pairs with the lockless read in this function. */
- WRITE_ONCE(last_jiffies_update,
- ktime_add(last_jiffies_update, tick_period));
-
- /* Slow path for long timeouts */
- if (unlikely(delta >= tick_period)) {
- s64 incr = ktime_to_ns(tick_period);
-
- ticks = ktime_divns(delta, incr);
-
- /* Pairs with the lockless read in this function. */
- WRITE_ONCE(last_jiffies_update,
- ktime_add_ns(last_jiffies_update,
- incr * ticks));
- }
- do_timer(++ticks);
+ ticks += ktime_divns(delta, incr);

- /* Keep the tick_next_period variable up to date */
- tick_next_period = ktime_add(last_jiffies_update, tick_period);
+ last_jiffies_update = ktime_add_ns(last_jiffies_update,
+ incr * ticks);
} else {
- write_seqcount_end(&jiffies_seq);
- raw_spin_unlock(&jiffies_lock);
- return;
+ last_jiffies_update = ktime_add(last_jiffies_update,
+ tick_period);
}
+
+ /*
+ * Keep the tick_next_period variable up to date. WRITE_ONCE()
+ * pairs with the READ_ONCE() in the lockless quick check above.
+ * Updating it here keeps that quick check cheap.
+ */
+ WRITE_ONCE(tick_next_period,
+ ktime_add(last_jiffies_update, tick_period));
+
+ /* Advance jiffies to complete the jiffies_seq protected job */
+ jiffies_64 += ticks;
+
+ /*
+ * Release the sequence count. calc_global_load() below is not
+ * protected by it, but jiffies_lock needs to be held to prevent
+ * concurrent invocations.
+ */
write_seqcount_end(&jiffies_seq);
+
+ calc_global_load();
+
raw_spin_unlock(&jiffies_lock);
update_wall_time();
}