Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer

From: Peter Zijlstra
Date: Thu Jun 04 2015 - 06:49:25 EST


On Thu, Jun 04, 2015 at 12:07:03PM +0300, Kirill Tkhai wrote:
> > --- a/include/linux/hrtimer.h
> > +++ b/include/linux/hrtimer.h
> > @@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
> >   * A timer is active, when it is enqueued into the rbtree or the
> >   * callback function is running or it's in the state of being migrated
> >   * to another cpu.
> > + *
> > + * See __run_hrtimer().
> >   */
> > -static inline int hrtimer_active(const struct hrtimer *timer)
> > +static inline bool hrtimer_active(const struct hrtimer *timer)
> >  {
> > -	return timer->state != HRTIMER_STATE_INACTIVE ||
> > -		timer->base->running == timer;
> > +	if (timer->state != HRTIMER_STATE_INACTIVE)
> > +		return true;
> > +
> > +	smp_rmb(); /* C matches A */
> > +
> > +	if (timer->base->running == timer)
> > +		return true;
> > +
> > +	smp_rmb(); /* D matches B */
> > +
> > +	if (timer->state != HRTIMER_STATE_INACTIVE)
> > +		return true;
> > +
> > +	return false;
>
> This races with two sequential timer handlers. hrtimer_active() is
> preemptible everywhere, and there is no guarantee that all three "if"
> conditions check the same timer tick.

Indeed.
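
Spelling it out (my illustration, assuming a timer whose callback
re-arms it): the three tests can each sample a different expiry
generation, e.g.:

	hrtimer_active()		expiry side

	state != INACTIVE ? no		/* callback #1 running */
	  <preempted>
					base->running = NULL;	/* #1 done */
					/* re-armed: state = ENQUEUED */
	base->running == timer ? no	/* the gap between #1 and #2 */
	  <preempted>
					base->running = timer;	/* #2 starts */
					state = INACTIVE;
	state != INACTIVE ? no		/* callback #2 running */
	return false;			/* wrong; active throughout */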

> How about transforming hrtimer_bases.lock from a raw_spinlock_t into a
> seqlock_t?

Ingo will like that because it means we already need to touch cpu_base.

But I think there's a problem there with timer migration: the timer can
migrate between bases while we do the seq read loop, and then you can
get false positives from comparing the seqcount numbers of different
bases.
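
That is, a plain seqcount read-side (just a sketch, not from any patch)
would be:

	unsigned int seq;
	bool active;

	/*
	 * Broken under migration: timer->base can change between these
	 * reads, so begin and retry can sample two *different*
	 * cpu_base->seq counters, whose values are unrelated, making the
	 * comparison meaningless.
	 */
	do {
		seq = read_seqcount_begin(&timer->base->cpu_base->seq);
		active = timer->state != HRTIMER_STATE_INACTIVE ||
			 timer->base->cpu_base->running == timer;
	} while (read_seqcount_retry(&timer->base->cpu_base->seq, seq));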

We could of course do something like the below, but hrtimer_active() is
turning into quite the monster.

Needs more comments at the very least; it's full of trickery.
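
Rough idea of the below (my summary, for whoever writes those comments):
the seqcount write section covers exactly the window in which ->running
and ->state can disagree, so any hrtimer_active() reader overlapping it
retries, and the final cpu_base recheck catches the timer migrating to
another base while we were reading:

	cpu_base->running = timer;
	write_seqcount_begin(&cpu_base->seq);
	timer->state = HRTIMER_STATE_INACTIVE;	/* __remove_hrtimer() */
	fn(timer);
	timer->state = HRTIMER_STATE_ENQUEUED;	/* if restarted */
	write_seqcount_end(&cpu_base->seq);
	cpu_base->running = NULL;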

---
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -59,7 +59,9 @@ enum hrtimer_restart {
  * mean touching the timer after the callback, this makes it impossible to free
  * the timer from the callback function.
  *
- * Therefore we track the callback state in timer->base->running == timer.
+ * Therefore we track the callback state in:
+ *
+ *	timer->base->cpu_base->running == timer
  *
  * On SMP it is possible to have a "callback function running and enqueued"
  * status. It happens for example when a posix timer expired and the callback
@@ -144,7 +146,6 @@ struct hrtimer_clock_base {
 	struct timerqueue_head	active;
 	ktime_t			(*get_time)(void);
 	ktime_t			offset;
-	struct hrtimer		*running;
 } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
 
 enum hrtimer_base_type {
@@ -159,6 +160,8 @@ enum hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
+ * @seq:		seqcount around __run_hrtimer
+ * @running:		pointer to the currently running hrtimer
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
@@ -180,6 +183,8 @@ enum hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
+	seqcount_t			seq;
+	struct hrtimer			*running;
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
@@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
  */
 static inline bool hrtimer_active(const struct hrtimer *timer)
 {
-	return timer->state != HRTIMER_STATE_INACTIVE ||
-		timer->base->running == timer;
+	struct hrtimer_cpu_base *cpu_base;
+	unsigned int seq;
+	bool active;
+
+	do {
+		active = false;
+		cpu_base = READ_ONCE(timer->base->cpu_base);
+		seqcount_lockdep_reader_access(&cpu_base->seq);
+		seq = raw_read_seqcount(&cpu_base->seq);
+
+		if (timer->state != HRTIMER_STATE_INACTIVE ||
+		    cpu_base->running == timer)
+			active = true;
+
+	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
+		 cpu_base != READ_ONCE(timer->base->cpu_base));
+
+	return active;
 }
 
 /*
@@ -412,7 +433,7 @@ static inline int hrtimer_is_queued(stru
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->base->running == timer;
+	return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
 	.clock_base =
 	{
 		{
@@ -113,9 +114,15 @@ static inline int hrtimer_clockid_to_bas
 /*
  * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
  * such that hrtimer_callback_running() can unconditionally dereference
- * timer->base.
+ * timer->base->cpu_base
  */
-static struct hrtimer_clock_base migration_base;
+static struct hrtimer_cpu_base migration_cpu_base = {
+	.seq = SEQCNT_ZERO(migration_cpu_base),
+};
+
+static struct hrtimer_clock_base migration_base = {
+	.cpu_base = &migration_cpu_base,
+};
 
 /*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
@@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_held(&cpu_base->lock);
 
 	debug_deactivate(timer);
-	base->running = timer;
+	cpu_base->running = timer;
+
+	/*
+	 * separate the ->running assignment from the ->state assignment
+	 */
+	write_seqcount_begin(&cpu_base->seq);
+
 	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
@@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
-	WARN_ON_ONCE(base->running != timer);
-	base->running = NULL;
+	/*
+	 * separate the ->running assignment from the ->state assignment
+	 */
+	write_seqcount_end(&cpu_base->seq);
+
+	WARN_ON_ONCE(cpu_base->running != timer);
+	cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
--