[tip:timers/core] hrtimer: Allow hrtimer::function() to free the timer
From: tip-bot for Peter Zijlstra
Date: Thu Jun 18 2015 - 18:20:42 EST
Commit-ID: 887d9dc989eb0154492e41e7c07492edbb088ba1
Gitweb: http://git.kernel.org/tip/887d9dc989eb0154492e41e7c07492edbb088ba1
Author: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
AuthorDate: Thu, 11 Jun 2015 14:46:48 +0200
Committer: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
CommitDate: Fri, 19 Jun 2015 00:09:56 +0200
hrtimer: Allow hrtimer::function() to free the timer
Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would result in a clear use-after-free.
Solve this by using a scheme similar to regular timers; track the
current running timer in hrtimer_clock_base::running.
Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Cc: ktkhai@xxxxxxxxxxxxx
Cc: rostedt@xxxxxxxxxxx
Cc: juri.lelli@xxxxxxxxx
Cc: pang.xunlei@xxxxxxxxxx
Cc: wanpeng.li@xxxxxxxxxxxxxxx
Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Paul McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Oleg Nesterov <oleg@xxxxxxxxxx>
Cc: umgwanakikbuti@xxxxxxxxx
Link: http://lkml.kernel.org/r/20150611124743.471563047@xxxxxxxxxxxxx
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
include/linux/hrtimer.h | 41 +++++++----------
kernel/time/hrtimer.c | 114 ++++++++++++++++++++++++++++++++++++++----------
2 files changed, 107 insertions(+), 48 deletions(-)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2f9e57d..5db0558 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,30 +53,25 @@ enum hrtimer_restart {
*
* 0x00 inactive
* 0x01 enqueued into rbtree
- * 0x02 callback function running
- * 0x04 timer is migrated to another cpu
*
- * Special cases:
- * 0x03 callback function running and enqueued
- * (was requeued on another CPU)
- * 0x05 timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
*
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ * timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
* queued a signal. Between dropping the lock which protects the posix timer
* and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
+ * signal and rearm the timer.
*
* All state transitions are protected by cpu_base->lock.
*/
#define HRTIMER_STATE_INACTIVE 0x00
#define HRTIMER_STATE_ENQUEUED 0x01
-#define HRTIMER_STATE_CALLBACK 0x02
/**
* struct hrtimer - the basic hrtimer structure
@@ -163,6 +158,8 @@ enum hrtimer_base_type {
* struct hrtimer_cpu_base - the per cpu clock bases
* @lock: lock protecting the base and associated clock bases
* and timers
+ * @seq: seqcount around __run_hrtimer
+ * @running: pointer to the currently running hrtimer
* @cpu: cpu number
* @active_bases: Bitfield to mark bases with active timers
* @clock_was_set_seq: Sequence counter of clock was set events
@@ -184,6 +181,8 @@ enum hrtimer_base_type {
*/
struct hrtimer_cpu_base {
raw_spinlock_t lock;
+ seqcount_t seq;
+ struct hrtimer *running;
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
@@ -391,15 +390,7 @@ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
extern u64 hrtimer_get_next_event(void);
-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
- return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);
/*
* Helper function to check, whether the timer is on one of the queues
@@ -415,7 +406,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
*/
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
- return timer->state & HRTIMER_STATE_CALLBACK;
+ return timer->base->cpu_base->running == timer;
}
/* Forward a hrtimer so it expires after now: */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1604157..f026413 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+ .seq = SEQCNT_ZERO(hrtimer_bases.seq),
.clock_base =
{
{
@@ -111,6 +112,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
#ifdef CONFIG_SMP
/*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+ .seq = SEQCNT_ZERO(migration_cpu_base),
+ .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base migration_cpu_base.clock_base[0]
+
+/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
* locked, and the base itself is locked too.
@@ -119,8 +132,8 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
* be found on the lists/queues.
*
* When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
*/
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
for (;;) {
base = timer->base;
- if (likely(base != NULL)) {
+ if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
return base;
@@ -194,8 +207,8 @@ again:
if (unlikely(hrtimer_callback_running(timer)))
return base;
- /* See the comment in lock_timer_base() */
- timer->base = NULL;
+ /* See the comment in lock_hrtimer_base() */
+ timer->base = &migration_base;
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
@@ -838,11 +851,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
base->cpu_base->active_bases |= 1 << base->index;
- /*
- * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
- * state of a possibly running callback.
- */
- timer->state |= HRTIMER_STATE_ENQUEUED;
+ timer->state = HRTIMER_STATE_ENQUEUED;
return timerqueue_add(&base->active, &timer->node);
}
@@ -907,14 +916,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
- if (!restart) {
- /*
- * We must preserve the CALLBACK state flag here,
- * otherwise we could move the timer base in
- * switch_hrtimer_base.
- */
- state &= HRTIMER_STATE_CALLBACK;
- }
+ if (!restart)
+ state = HRTIMER_STATE_INACTIVE;
+
__remove_hrtimer(timer, base, state, reprogram);
return 1;
}
@@ -1115,6 +1119,51 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
}
EXPORT_SYMBOL_GPL(hrtimer_init);
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ *
+ * It is important for this function to not return a false negative.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+ struct hrtimer_cpu_base *cpu_base;
+ unsigned int seq;
+
+ do {
+ cpu_base = READ_ONCE(timer->base->cpu_base);
+ seq = raw_read_seqcount_begin(&cpu_base->seq);
+
+ if (timer->state != HRTIMER_STATE_INACTIVE ||
+ cpu_base->running == timer)
+ return true;
+
+ } while (read_seqcount_retry(&cpu_base->seq, seq) ||
+ cpu_base != READ_ONCE(timer->base->cpu_base));
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ * - queued: the timer is queued
+ * - callback: the timer is being ran
+ * - post: the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consequtive
+ * __run_hrtimer() invocations.
+ */
+
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
struct hrtimer *timer, ktime_t *now)
@@ -1122,10 +1171,21 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_held(&cpu_base->lock);
debug_deactivate(timer);
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ cpu_base->running = timer;
+
+ /*
+ * Separate the ->running assignment from the ->state assignment.
+ *
+ * As with a regular write barrier, this ensures the read side in
+ * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * timer->state == INACTIVE.
+ */
+ raw_write_seqcount_barrier(&cpu_base->seq);
+
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
@@ -1141,7 +1201,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
raw_spin_lock(&cpu_base->lock);
/*
- * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+ * Note: We clear the running state after enqueue_hrtimer and
* we do not reprogramm the event hardware. Happens either in
* hrtimer_start_range_ns() or in hrtimer_interrupt()
*
@@ -1153,9 +1213,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
!(timer->state & HRTIMER_STATE_ENQUEUED))
enqueue_hrtimer(timer, base);
- WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+ /*
+ * Separate the ->running assignment from the ->state assignment.
+ *
+ * As with a regular write barrier, this ensures the read side in
+ * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * timer->state == INACTIVE.
+ */
+ raw_write_seqcount_barrier(&cpu_base->seq);
- timer->state &= ~HRTIMER_STATE_CALLBACK;
+ WARN_ON_ONCE(cpu_base->running != timer);
+ cpu_base->running = NULL;
}
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/