seqlock/rt: Prevent livelocks with seqlocks in RT

From: Steven Rostedt
Date: Thu Mar 01 2012 - 09:38:12 EST


With RT, seqlocks can be preempted. Worse yet, when holding a
write_seqlock, migration is disabled. If a high-priority task
preempts a task holding a write_seqlock, and that high-priority
task uses the associated read_seqlock(), it can go into an infinite
spin waiting for the write_seqlock() to finish. But since the holder
of that lock has migration disabled, and is of lower priority than
the task using the read_seqlock(), we end up with a livelock.

To prevent this, when the read_seqcount_begin() detects that a
write lock is being held, it will grab that lock and release it.
This will cause the holder of the lock to wake up and finish.
The priority inheritance will help it as well.

Because read_seqlocks are used in the VDSO area, a raw_read_seqcount_begin()
was created to allow userspace tasks to access read_seqcount(),
as grabbing the write_lock() is not allowed in VDSO code — even
referencing the lock there is not allowed.

Note, a livelock can still happen if the userspace task that
does the read_seqlock is of higher priority than the user doing
the write_lock, so userspace needs to be careful.

Signed-off-by: Steven Rostedt <rostedt@xxxxxxxxxxx>
---
arch/x86/vdso/vclock_gettime.c | 8 +++---
include/linux/seqlock.h | 59 +++++++++++++++++++++++++++++++++------
2 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index d8511fb..01d0c2c 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -86,7 +86,7 @@ notrace static noinline int do_realtime(struct timespec *ts)
{
unsigned long seq, ns;
do {
- seq = read_seqcount_begin(&gtod->seq);
+ seq = raw_read_seqcount_begin(&gtod->seq);
ts->tv_sec = gtod->wall_time_sec;
ts->tv_nsec = gtod->wall_time_nsec;
ns = vgetns();
@@ -99,7 +99,7 @@ notrace static noinline int do_monotonic(struct timespec *ts)
{
unsigned long seq, ns, secs;
do {
- seq = read_seqcount_begin(&gtod->seq);
+ seq = raw_read_seqcount_begin(&gtod->seq);
secs = gtod->wall_time_sec;
ns = gtod->wall_time_nsec + vgetns();
secs += gtod->wall_to_monotonic.tv_sec;
@@ -123,7 +123,7 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts)
{
unsigned long seq;
do {
- seq = read_seqcount_begin(&gtod->seq);
+ seq = raw_read_seqcount_begin(&gtod->seq);
ts->tv_sec = gtod->wall_time_coarse.tv_sec;
ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -134,7 +134,7 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
{
unsigned long seq, ns, secs;
do {
- seq = read_seqcount_begin(&gtod->seq);
+ seq = raw_read_seqcount_begin(&gtod->seq);
secs = gtod->wall_time_coarse.tv_sec;
ns = gtod->wall_time_coarse.tv_nsec;
secs += gtod->wall_to_monotonic.tv_sec;
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 723274d..6d93b72 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -38,10 +38,32 @@
*/
typedef struct seqcount {
unsigned sequence;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ int has_lock;
+#endif
} seqcount_t;

-#define SEQCNT_ZERO { 0 }
-#define seqcount_init(x) do { *(x) = (seqcount_t) SEQCNT_ZERO; } while (0)
+#ifdef CONFIG_PREEMPT_RT_FULL
+# define SEQCNT_ZERO { 0, 0 }
+# define SEQCNT_ZERO_LOCK { 0, 1 }
+# define seqcount_init(x) do { *(x) = (seqcount_t) SEQCNT_ZERO; } while (0)
+# define seqcount_init_lock(x) do { *(x) = (seqcount_t) SEQCNT_ZERO_LOCK; } while (0)
+#else
+# define SEQCNT_ZERO { 0 }
+# define SEQCNT_ZERO_LOCK SEQCNT_ZERO
+# define seqcount_init(x) do { *(x) = (seqcount_t) SEQCNT_ZERO; } while (0)
+# define seqcount_init_lock(x) seqcount_init(x)
+#endif
+
+typedef struct {
+ struct seqcount seqcount;
+ raw_spinlock_t lock;
+} raw_seqlock_t;
+
+typedef struct {
+ struct seqcount seqcount;
+ spinlock_t lock;
+} seqlock_t;

/**
* __read_seqcount_begin - begin a seq-read critical section (without barrier)
@@ -63,6 +85,30 @@ static inline unsigned __read_seqcount_begin(const seqcount_t *s)
repeat:
ret = s->sequence;
if (unlikely(ret & 1)) {
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (s->has_lock) {
+ seqlock_t *sl;
+
+ sl = container_of(s, seqlock_t, seqcount);
+ /* This process may be blocking the writer, kick it */
+ spin_lock(&sl->lock);
+ spin_unlock(&sl->lock);
+ }
+#endif
+ cpu_relax();
+ goto repeat;
+ }
+ return ret;
+}
+
+/* Needed for VDSO areas */
+static inline unsigned raw_read_seqcount_begin(const seqcount_t *s)
+{
+ unsigned ret;
+
+repeat:
+ ret = s->sequence;
+ if (unlikely(ret & 1)) {
cpu_relax();
goto repeat;
}
@@ -150,24 +196,19 @@ static inline void write_seqcount_barrier(seqcount_t *s)
s->sequence+=2;
}

-typedef struct {
- struct seqcount seqcount;
- spinlock_t lock;
-} seqlock_t;
-
/*
* These macros triggered gcc-3.x compile-time problems. We think these are
* OK now. Be cautious.
*/
#define __SEQLOCK_UNLOCKED(lockname) \
{ \
- .seqcount = SEQCNT_ZERO, \
+ .seqcount = SEQCNT_ZERO_LOCK, \
.lock = __SPIN_LOCK_UNLOCKED(lockname) \
}

#define seqlock_init(x) \
do { \
- seqcount_init(&(x)->seqcount); \
+ seqcount_init_lock(&(x)->seqcount); \
spin_lock_init(&(x)->lock); \
} while (0)

--
1.7.3.4



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/