[GIT PULL] locking fixes

From: Ingo Molnar
Date: Fri Jun 10 2016 - 08:45:30 EST


Linus,

Please pull the latest locking-urgent-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking-urgent-for-linus

# HEAD: 077fa7aed17de5022e44bf07dbaf732078b7b5b2 futex: Calculate the futex key based on a tail page for file-based futexes

Misc fixes:

- a file-based futex fix
- one more spin_unlock_wait() fix
- a ww-mutex deadlock detection improvement/fix
- and a raw_read_seqcount_latch() barrier fix.

Thanks,

Ingo

------------------>
Chris Wilson (1):
locking/ww_mutex: Report recursive ww_mutex locking early

Mel Gorman (1):
futex: Calculate the futex key based on a tail page for file-based futexes

Peter Zijlstra (2):
locking/seqcount: Re-fix raw_read_seqcount_latch()
locking/qspinlock: Fix spin_unlock_wait() some more


include/asm-generic/qspinlock.h | 53 ++++++++++++------------------------
include/linux/seqlock.h | 7 +++--
kernel/futex.c | 14 +++++++---
kernel/locking/mutex.c | 9 ++++---
kernel/locking/qspinlock.c | 60 +++++++++++++++++++++++++++++++++++++++++
5 files changed, 99 insertions(+), 44 deletions(-)

diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index 6bd05700d8c9..05f05f17a7c2 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -22,37 +22,33 @@
#include <asm-generic/qspinlock_types.h>

/**
+ * queued_spin_unlock_wait - wait until the _current_ lock holder releases the lock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * There is a very slight possibility of live-lock if the lockers keep coming
+ * and the waiter is just unfortunate enough to not see any unlock state.
+ */
+#ifndef queued_spin_unlock_wait
+extern void queued_spin_unlock_wait(struct qspinlock *lock);
+#endif
+
+/**
* queued_spin_is_locked - is the spinlock locked?
* @lock: Pointer to queued spinlock structure
* Return: 1 if it is locked, 0 otherwise
*/
+#ifndef queued_spin_is_locked
static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
{
/*
- * queued_spin_lock_slowpath() can ACQUIRE the lock before
- * issuing the unordered store that sets _Q_LOCKED_VAL.
- *
- * See both smp_cond_acquire() sites for more detail.
- *
- * This however means that in code like:
- *
- * spin_lock(A) spin_lock(B)
- * spin_unlock_wait(B) spin_is_locked(A)
- * do_something() do_something()
- *
- * Both CPUs can end up running do_something() because the store
- * setting _Q_LOCKED_VAL will pass through the loads in
- * spin_unlock_wait() and/or spin_is_locked().
+ * See queued_spin_unlock_wait().
*
- * Avoid this by issuing a full memory barrier between the spin_lock()
- * and the loads in spin_unlock_wait() and spin_is_locked().
- *
- * Note that regular mutual exclusion doesn't care about this
- * delayed store.
+ * Any !0 state indicates it is locked, even if _Q_LOCKED_VAL
+ * isn't immediately observable.
*/
- smp_mb();
- return atomic_read(&lock->val) & _Q_LOCKED_MASK;
+ return atomic_read(&lock->val);
}
+#endif

/**
* queued_spin_value_unlocked - is the spinlock structure unlocked?
@@ -122,21 +118,6 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock)
}
#endif

-/**
- * queued_spin_unlock_wait - wait until current lock holder releases the lock
- * @lock : Pointer to queued spinlock structure
- *
- * There is a very slight possibility of live-lock if the lockers keep coming
- * and the waiter is just unfortunate enough to not see any unlock state.
- */
-static inline void queued_spin_unlock_wait(struct qspinlock *lock)
-{
- /* See queued_spin_is_locked() */
- smp_mb();
- while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
- cpu_relax();
-}
-
#ifndef virt_spin_lock
static __always_inline bool virt_spin_lock(struct qspinlock *lock)
{
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 7973a821ac58..ead97654c4e9 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -277,7 +277,10 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s)

static inline int raw_read_seqcount_latch(seqcount_t *s)
{
- return lockless_dereference(s)->sequence;
+ int seq = READ_ONCE(s->sequence);
+ /* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */
+ smp_read_barrier_depends();
+ return seq;
}

/**
@@ -331,7 +334,7 @@ static inline int raw_read_seqcount_latch(seqcount_t *s)
* unsigned seq, idx;
*
* do {
- * seq = lockless_dereference(latch)->seq;
+ * seq = raw_read_seqcount_latch(&latch->seq);
*
* idx = seq & 0x01;
* entry = data_query(latch->data[idx], ...);
diff --git a/kernel/futex.c b/kernel/futex.c
index ee25f5ba4aca..33664f70e2d2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page;
+ struct page *page, *tail;
struct address_space *mapping;
int err, ro = 0;

@@ -530,7 +530,15 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
* considered here and page lock forces unnecessarily serialization
* From this point on, mapping will be re-verified if necessary and
* page lock will be acquired only if it is unavoidable
- */
+ *
+ * Mapping checks require the head page for any compound page so the
+ * head page and mapping is looked up now. For anonymous pages, it
+ * does not matter if the page splits in the future as the key is
+ * based on the address. For filesystem-backed pages, the tail is
+ * required as the index of the page determines the key. For
+ * base pages, there is no tail page and tail == page.
+ */
+ tail = page;
page = compound_head(page);
mapping = READ_ONCE(page->mapping);

@@ -654,7 +662,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)

key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.inode = inode;
- key->shared.pgoff = basepage_index(page);
+ key->shared.pgoff = basepage_index(tail);
rcu_read_unlock();
}

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index e364b424b019..79d2d765a75f 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
if (!hold_ctx)
return 0;

- if (unlikely(ctx == hold_ctx))
- return -EALREADY;
-
if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
(ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
#ifdef CONFIG_DEBUG_MUTEXES
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
unsigned long flags;
int ret;

+ if (use_ww_ctx) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
+ return -EALREADY;
+ }
+
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e32ae1..5fc8c311b8fe 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -267,6 +267,66 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif

+/*
+ * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
+ * issuing an _unordered_ store to set _Q_LOCKED_VAL.
+ *
+ * This means that the store can be delayed, but no later than the
+ * store-release from the unlock. This means that simply observing
+ * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
+ *
+ * There are two paths that can issue the unordered store:
+ *
+ * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
+ *
+ * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
+ * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
+ *
+ * However, in both cases we have other !0 state we've set before to queue
+ * ourseves:
+ *
+ * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
+ * load is constrained by that ACQUIRE to not pass before that, and thus must
+ * observe the store.
+ *
+ * For (2) we have a more intersting scenario. We enqueue ourselves using
+ * xchg_tail(), which ends up being a RELEASE. This in itself is not
+ * sufficient, however that is followed by an smp_cond_acquire() on the same
+ * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
+ * guarantees we must observe that store.
+ *
+ * Therefore both cases have other !0 state that is observable before the
+ * unordered locked byte store comes through. This means we can use that to
+ * wait for the lock store, and then wait for an unlock.
+ */
+#ifndef queued_spin_unlock_wait
+void queued_spin_unlock_wait(struct qspinlock *lock)
+{
+ u32 val;
+
+ for (;;) {
+ val = atomic_read(&lock->val);
+
+ if (!val) /* not locked, we're done */
+ goto done;
+
+ if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
+ break;
+
+ /* not locked, but pending, wait until we observe the lock */
+ cpu_relax();
+ }
+
+ /* any unlock is good */
+ while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
+ cpu_relax();
+
+done:
+ smp_rmb(); /* CTRL + RMB -> ACQUIRE */
+}
+EXPORT_SYMBOL(queued_spin_unlock_wait);
+#endif
+
#endif /* _GEN_PV_LOCK_SLOWPATH */

/**