[RFC patch 3/5] hrtimer: Add support for deferrable hrtimers

From: Thomas Gleixner
Date: Fri Feb 21 2014 - 12:56:24 EST


Deferrable timers have a relaxed expiry mode. The timers are not
guaranteed to expire at the programmed expiry time. They are always
batched with the expiry of non deferrable timers. If the system goes
idle, deferrable timers are not taken into account for the
calculation of the next timer expiry. This helps for power saving as
the deferrable timers do not wake an idle system.

So far we only have support for deferrable timers in the timer wheel.
User space applications want to optimize their timer usage for power
consumption as well, but the user space interfaces are based on
hrtimers.

There is no way to bring back timer wheel timers to user space
interfaces as they would reintroduce the problems of CLOCK_REALTIME
and clock setting again and add quite some mess to the various
interfaces.

Add deferrable hrtimer support instead. The deferrable hrtimers are
stored in separate hrtimer bases which have the same underlying rules
as the non deferrable standard bases. The deferrable mode is selected
by the new HRTIMER_MODE_DEFERRABLE flag, which is ORed with
HRTIMER_MODE_REL/ABS.

The new deferrable bases are not taken into account when the
underlying clock event device is programmed in high resolution mode
and they are not accounted for when the system retrieves the next
expiring timer for an extended idle sleep.

There is no impact of the deferrable hrtimers on the non deferrable
ones aside from a slightly larger memory footprint.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>

---
include/linux/hrtimer.h | 8 ++++-
kernel/hrtimer.c | 72 ++++++++++++++++++++++++++++++++++++++++++------
2 files changed, 69 insertions(+), 11 deletions(-)

Index: tip/include/linux/hrtimer.h
===================================================================
--- tip.orig/include/linux/hrtimer.h
+++ tip/include/linux/hrtimer.h
@@ -36,6 +36,7 @@ enum hrtimer_mode {
HRTIMER_MODE_PINNED = 0x02, /* Timer is bound to CPU */
HRTIMER_MODE_ABS_PINNED = 0x02,
HRTIMER_MODE_REL_PINNED = 0x03,
+ HRTIMER_MODE_DEFERRABLE = 0x04, /* Timer is deferrable */
};

/*
@@ -158,7 +159,8 @@ enum hrtimer_base_type {
HRTIMER_BASE_REALTIME,
HRTIMER_BASE_BOOTTIME,
HRTIMER_BASE_TAI,
- HRTIMER_MAX_CLOCK_BASES,
+ HRTIMER_MAX_STD_BASES,
+ HRTIMER_MAX_CLOCK_BASES = 2 * HRTIMER_MAX_STD_BASES,
};

/*
@@ -175,7 +177,9 @@ enum hrtimer_base_type {
* @nr_retries: Total number of hrtimer interrupt retries
* @nr_hangs: Total number of hrtimer interrupt hangs
* @max_hang_time: Maximum time spent in hrtimer_interrupt
- * @clock_base: array of clock bases for this cpu
+ * @clock_base: array of clock bases for this cpu. The array size is
+ * twice the MAX_STD_BASES size. The second part is
+ * a duplication of the first for deferrable timers.
*/
struct hrtimer_cpu_base {
raw_spinlock_t lock;
Index: tip/kernel/hrtimer.c
===================================================================
--- tip.orig/kernel/hrtimer.c
+++ tip/kernel/hrtimer.c
@@ -92,6 +92,30 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base,
.get_time = &ktime_get_clocktai,
.resolution = KTIME_LOW_RES,
},
+ {
+ .index = HRTIMER_BASE_MONOTONIC + HRTIMER_MAX_STD_BASES,
+ .clockid = CLOCK_MONOTONIC,
+ .get_time = &ktime_get,
+ .resolution = KTIME_LOW_RES,
+ },
+ {
+ .index = HRTIMER_BASE_REALTIME + HRTIMER_MAX_STD_BASES,
+ .clockid = CLOCK_REALTIME,
+ .get_time = &ktime_get_real,
+ .resolution = KTIME_LOW_RES,
+ },
+ {
+ .index = HRTIMER_BASE_BOOTTIME + HRTIMER_MAX_STD_BASES,
+ .clockid = CLOCK_BOOTTIME,
+ .get_time = &ktime_get_boottime,
+ .resolution = KTIME_LOW_RES,
+ },
+ {
+ .index = HRTIMER_BASE_TAI + HRTIMER_MAX_STD_BASES,
+ .clockid = CLOCK_TAI,
+ .get_time = &ktime_get_clocktai,
+ .resolution = KTIME_LOW_RES,
+ },
}
};

@@ -194,7 +218,9 @@ hrtimer_check_target(struct hrtimer *tim
#ifdef CONFIG_HIGH_RES_TIMERS
ktime_t expires;

- if (!new_base->cpu_base->hres_active)
+ /* We do not touch hardware for deferrable timers */
+ if (!new_base->cpu_base->hres_active ||
+ new_base->index >= HRTIMER_MAX_STD_BASES)
return 0;

expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
@@ -556,7 +582,7 @@ hrtimer_force_reprogram(struct hrtimer_c

expires_next.tv64 = KTIME_MAX;

- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ for (i = 0; i < HRTIMER_MAX_STD_BASES; i++, base++) {
struct hrtimer *timer;
struct timerqueue_node *next;

@@ -615,6 +641,13 @@ static int hrtimer_reprogram(struct hrti
return 0;

/*
+ * Deferrable timers are not touching the underlying
+ * hardware.
+ */
+ if (base->index >= HRTIMER_MAX_STD_BASES)
+ return 0;
+
+ /*
* CLOCK_REALTIME timer might be requested with an absolute
* expiry time which is less than base->offset. Nothing wrong
* about that, just avoid to call into the tick code, which
@@ -924,7 +957,10 @@ static void __remove_hrtimer(struct hrti

expires = ktime_sub(hrtimer_get_expires(timer),
base->offset);
- if (base->cpu_base->expires_next.tv64 == expires.tv64)
+
+ /* We only care about non deferrable timers here */
+ if (base->index < HRTIMER_MAX_STD_BASES &&
+ base->cpu_base->expires_next.tv64 == expires.tv64)
hrtimer_force_reprogram(base->cpu_base, 1);
}
#endif
@@ -1152,7 +1188,8 @@ ktime_t hrtimer_get_next_event(void)
raw_spin_lock_irqsave(&cpu_base->lock, flags);

if (!hrtimer_hres_active()) {
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ /* We only care about non deferrable timers here */
+ for (i = 0; i < HRTIMER_MAX_STD_BASES; i++, base++) {
struct hrtimer *timer;
struct timerqueue_node *next;

@@ -1190,6 +1227,9 @@ static void __hrtimer_init(struct hrtime
clock_id = CLOCK_MONOTONIC;

base = hrtimer_clockid_to_base(clock_id);
+ if (mode & HRTIMER_MODE_DEFERRABLE)
+ base += HRTIMER_MAX_STD_BASES;
+
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);

@@ -1342,8 +1382,14 @@ retry:
base->offset);
if (expires.tv64 < 0)
expires.tv64 = KTIME_MAX;
- if (expires.tv64 < expires_next.tv64)
- expires_next = expires;
+ if (expires.tv64 < expires_next.tv64) {
+ /*
+ * We do not take deferrable timers
+ * into account here:
+ */
+ if (idx < HRTIMER_MAX_STD_BASES)
+ expires_next = expires;
+ }
break;
}

@@ -1584,14 +1630,20 @@ static int update_rmtp(struct hrtimer *t
return 1;
}

+#define CLOCKID_DEFERRABLE 0x8000
+
long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
- struct hrtimer_sleeper t;
+ clockid_t clockid = restart->nanosleep.clockid & ~CLOCKID_DEFERRABLE;
+ enum hrtimer_mode mode = HRTIMER_MODE_ABS;
struct timespec __user *rmtp;
+ struct hrtimer_sleeper t;
int ret = 0;

- hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
- HRTIMER_MODE_ABS);
+ if (restart->nanosleep.clockid & CLOCKID_DEFERRABLE)
+ mode |= HRTIMER_MODE_DEFERRABLE;
+
+ hrtimer_init_on_stack(&t.timer, clockid, mode);
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);

if (do_nanosleep(&t, HRTIMER_MODE_ABS))
@@ -1643,6 +1695,8 @@ long hrtimer_nanosleep(struct timespec *
restart = &current_thread_info()->restart_block;
restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.clockid = t.timer.base->clockid;
+ if (mode & HRTIMER_MODE_DEFERRABLE)
+ restart->nanosleep.clockid |= CLOCKID_DEFERRABLE;
restart->nanosleep.rmtp = rmtp;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/