Re: [PATCH RFC ticketlock] Auto-queued ticketlock

From: Paul E. McKenney
Date: Tue Jun 11 2013 - 05:57:33 EST


On Mon, Jun 10, 2013 at 09:04:09PM -0400, Steven Rostedt wrote:
> On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
> > Breaking up locks is better than implementing high-contention locks, but
> > if we must have high-contention locks, why not make them automatically
> > switch between light-weight ticket locks at low contention and queued
> > locks at high contention?
> >
> > This commit therefore allows ticket locks to automatically switch between
> > pure ticketlock and queued-lock operation as needed. If too many CPUs
> > are spinning on a given ticket lock, a queue structure will be allocated
> > and the lock will switch to queued-lock operation. When the lock becomes
> > free, it will switch back into ticketlock operation. The low-order bit
> > of the head counter is used to indicate that the lock is in queued mode,
> > which forces an unconditional mismatch between the head and tail counters.
> > This approach means that the common-case code path under conditions of
> > low contention is very nearly that of a plain ticket lock.
> >
> > A fixed number of queueing structures is statically allocated in an
> > array. The ticket-lock address is used to hash into an initial element,
> > but if that element is already in use, it moves to the next element. If
> > the entire array is already in use, continue to spin in ticket mode.
> >
> > This has been only lightly tested in the kernel, though a userspace
> > implementation has survived substantial testing.
>
> I guess the point of this patch is to lower the cache ping pong effect
> of spin locks. I believe Rik was doing something similar to this as
> well.

Yep, as was Michel Lespinasse, IIRC.

> Now, when we switch from ticket to queue, we basically blow away the old
> FIFO and all the tasks do a thundering herd to get on the queue. Even if
> the task was next to get the lock, it could end up being stuck at the
> back of the queue again and have to wait. When I put my real-time hat
> on, this bothers me. Even though it's still a bound latency, as it only
> gets put to the end of the queue once, it just doubled the length of the
> worse case grabbing of a lock.

Almost.

The size of the switch-to-queue thundering herd is limited by the
ticket gap that initiates the switch. So what you are saying is that
in an RT kernel, you might want to tune down the ticket gap. Which
reminds me -- I do need to make this tuning more explicit.

Thanx, Paul

> -- Steve
>
>
>
> >
> > Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
> >
> > diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> > index 33692ea..b4a91b0 100644
> > --- a/arch/x86/include/asm/spinlock.h
> > +++ b/arch/x86/include/asm/spinlock.h
> > @@ -34,6 +34,8 @@
> > # define UNLOCK_LOCK_PREFIX
> > #endif
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > +
> > /*
> > * Ticket locks are conceptually two parts, one indicating the current head of
> > * the queue, and the other indicating the current tail. The lock is acquired
> > @@ -62,6 +64,25 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> > barrier(); /* make sure nothing creeps before the lock is taken */
> > }
> >
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> > +
> > +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> > +{
> > + register struct __raw_tickets inc = { .tail = 2 };
> > +
> > + inc = xadd(&lock->tickets, inc);
> > + for (;;) {
> > + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> > + break;
> > + inc.head = ACCESS_ONCE(lock->tickets.head);
> > + }
> > + barrier(); /* smp_mb() on Power or ARM. */
> > +}
> > +
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > {
> > arch_spinlock_t old, new;
> > @@ -70,17 +91,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > if (old.tickets.head != old.tickets.tail)
> > return 0;
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >
> > /* cmpxchg is a full barrier, so nothing can move before it */
> > return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> > }
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > +
> > static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > {
> > __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> > }
> >
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +extern void tkt_q_do_wake(arch_spinlock_t *asp);
> > +
> > +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > +{
> > + __ticket_t head = 2;
> > +
> > + head = xadd(&lock->tickets.head, 2);
> > + if (head & 0x1)
> > + tkt_q_do_wake(lock);
> > +}
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> > {
> > struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> > diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> > index ad0ad07..cdaefdd 100644
> > --- a/arch/x86/include/asm/spinlock_types.h
> > +++ b/arch/x86/include/asm/spinlock_types.h
> > @@ -7,12 +7,18 @@
> >
> > #include <linux/types.h>
> >
> > -#if (CONFIG_NR_CPUS < 256)
> > +#if (CONFIG_NR_CPUS < 128)
> > typedef u8 __ticket_t;
> > typedef u16 __ticketpair_t;
> > -#else
> > +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> > +#elif (CONFIG_NR_CPUS < 32768)
> > typedef u16 __ticket_t;
> > typedef u32 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> > +#else
> > +typedef u32 __ticket_t;
> > +typedef u64 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> > #endif
> >
> > #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> > @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> > union {
> > __ticketpair_t head_tail;
> > struct __raw_tickets {
> > +#ifdef __BIG_ENDIAN__
> > + __ticket_t tail, head;
> > +#else /* #ifdef __BIG_ENDIAN__ */
> > __ticket_t head, tail;
> > +#endif /* #else #ifdef __BIG_ENDIAN__ */
> > } tickets;
> > };
> > } arch_spinlock_t;
> > diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> > index e9ef6d6..816a87c 100644
> > --- a/include/linux/kernel.h
> > +++ b/include/linux/kernel.h
> > @@ -15,6 +15,7 @@
> > #include <asm/byteorder.h>
> > #include <uapi/linux/kernel.h>
> >
> > +#define UCHAR_MAX ((u8)(~0U))
> > #define USHRT_MAX ((u16)(~0U))
> > #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> > #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> > diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> > index 44511d1..ad9c67c 100644
> > --- a/kernel/Kconfig.locks
> > +++ b/kernel/Kconfig.locks
> > @@ -223,3 +223,21 @@ endif
> > config MUTEX_SPIN_ON_OWNER
> > def_bool y
> > depends on SMP && !DEBUG_MUTEXES
> > +
> > +config TICKET_LOCK_QUEUED
> > + bool "Dynamically switch between ticket and queued locking"
> > + default n
> > + ---help---
> > + Enable dynamic switching between ticketlock and queued locking
> > + on a per-lock basis. This option will slow down low-contention
> > + acquisition and release very slightly (additional conditional
> > + in release path), but will provide more efficient operation at
> > + high levels of lock contention. High-contention operation will
> > + not be quite as efficient as would be a pure queued lock, but
> > + this dynamic approach consumes less memory than queud locks
> > + and also runs faster at low levels of contention.
> > +
> > + Say "Y" if you are running on a large system with a workload
> > + that is likely to result in high levels of contention.
> > +
> > + Say "N" if you are unsure.
> > diff --git a/kernel/Makefile b/kernel/Makefile
> > index 271fd31..70a91f7 100644
> > --- a/kernel/Makefile
> > +++ b/kernel/Makefile
> > @@ -51,6 +51,7 @@ endif
> > obj-$(CONFIG_SMP) += spinlock.o
> > obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> > obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> > +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> > obj-$(CONFIG_UID16) += uid16.o
> > obj-$(CONFIG_MODULES) += module.o
> > obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> > diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> > new file mode 100644
> > index 0000000..f01b760
> > --- /dev/null
> > +++ b/kernel/tktqlock.c
> > @@ -0,0 +1,333 @@
> > +/*
> > + * Queued ticket spinlocks.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> > + *
> > + * Copyright IBM Corporation, 2013
> > + *
> > + * Authors: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
> > + */
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/smp.h>
> > +#include <linux/percpu.h>
> > +
> > +struct tkt_q {
> > + int cpu;
> > + __ticket_t tail;
> > + struct tkt_q *next;
> > +};
> > +
> > +struct tkt_q_head {
> > + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> > + s32 head_tkt; /* Head ticket when started queuing. */
> > + struct tkt_q *spin; /* Head of queue. */
> > + struct tkt_q **spin_tail; /* Tail of queue. */
> > +};
> > +
> > +/*
> > + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> > + * given ticket lock to motivate switching to spinning on a queue.
> > + * The reason that it is twice the number is because the bottom bit of
> > + * the ticket is reserved for the bit that indicates that a queue is
> > + * associated with the lock.
> > + */
> > +#define TKT_Q_SWITCH (16 * 2)
> > +
> > +/*
> > + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> > + * might have multiple highly contended locks, so provide more queues for
> > + * systems with larger numbers of CPUs.
> > + */
> > +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> > +
> > +/* The queues themselves. */
> > +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> > +
> > +/* Advance to the next queue slot, wrapping around to the beginning. */
> > +static int tkt_q_next_slot(int i)
> > +{
> > + return (++i < TKT_Q_NQUEUES) ? i : 0;
> > +}
> > +
> > +/* Very crude hash from lock address to queue slot number. */
> > +static unsigned long tkt_q_hash(arch_spinlock_t *asp)
> > +{
> > + return (((unsigned long)asp) >> 8) % TKT_Q_NQUEUES;
> > +}
> > +
> > +/*
> > + * Return a pointer to the queue header associated with the specified lock,
> > + * or return NULL if there is no queue for the lock or if the lock's queue
> > + * is in transition.
> > + */
> > +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> > +{
> > + int i;
> > + int start;
> > +
> > + start = i = tkt_q_hash(asp);
> > + do
> > + if (tkt_q_heads[i].ref == asp)
> > + return &tkt_q_heads[i];
> > + while ((i = tkt_q_next_slot(i)) != start);
> > + return NULL;
> > +}
> > +
> > +/*
> > + * Try to stop queuing, reverting back to normal ticket-lock operation.
> > + * We can only stop queuing when the queue is empty, which means that
> > + * we need to correctly handle races where someone shows up in the queue
> > + * just as we are trying to dispense with the queue. They win, we lose.
> > + */
> > +static bool tkt_q_try_unqueue(arch_spinlock_t *asp, struct tkt_q_head *tqhp)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > +
> > + /* Pick up the ticket values. */
> > + asold = ACCESS_ONCE(*asp);
> > + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> > +
> > + /* Attempt to mark the lock as not having a queue. */
> > + asnew = asold;
> > + asnew.tickets.head &= ~0x1;
> > + if (cmpxchg(&asp->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) == asold.head_tail) {
> > +
> > + /* Succeeded, mark the queue as unused. */
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > + return true;
> > + }
> > + }
> > +
> > + /* Failed, tell the caller there is still a queue to pass off to. */
> > + return false;
> > +}
> > +
> > +/*
> > + * Hand the lock off to the first CPU on the queue.
> > + */
> > +void tkt_q_do_wake(arch_spinlock_t *asp)
> > +{
> > + struct tkt_q_head *tqhp;
> > + struct tkt_q *tqp;
> > +
> > + /* If the queue is still being set up, wait for it. */
> > + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> > + cpu_relax();
> > +
> > + for (;;) {
> > +
> > + /* Find the first queue element. */
> > + tqp = ACCESS_ONCE(tqhp->spin);
> > + if (tqp != NULL)
> > + break; /* Element exists, hand off lock. */
> > + if (tkt_q_try_unqueue(asp, tqhp))
> > + return; /* No element, successfully removed queue. */
> > + cpu_relax();
> > + }
> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> > + ACCESS_ONCE(tqp->cpu) = -1;
> > +}
> > +
> > +/*
> > + * Given a lock that already has a queue associated with it, spin on
> > + * that queue. Return false if there was no queue (which means we do not
> > + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> > + */
> > +bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
> > +{
> > + struct tkt_q **oldtail;
> > + struct tkt_q tq;
> > + struct tkt_q_head *tqhp;
> > +
> > + /*
> > + * Ensure that accesses to queue header happen after sensing
> > + * the lock's have-queue bit.
> > + */
> > + smp_mb(); /* See above block comment. */
> > +
> > + /* If there no longer is a queue, leave. */
> > + tqhp = tkt_q_find_head(asp);
> > + if (tqhp == NULL)
> > + return false;
> > +
> > + /* Initialize our queue element. */
> > + tq.cpu = raw_smp_processor_id();
> > + tq.tail = inc.tail;
> > + tq.next = NULL;
> > +
> > + /* Check to see if we already hold the lock. */
> > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> > + /* The last holder left before queue formed, we hold lock. */
> > + tqhp->head_tkt = -1;
> > + return true;
> > + }
> > +
> > + /* Add our element to the tail of the queue. */
> > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> > + ACCESS_ONCE(*oldtail) = &tq;
> > +
> > + /* Spin until handoff. */
> > + while (ACCESS_ONCE(tq.cpu) != -1)
> > + cpu_relax();
> > +
> > + /*
> > + * Remove our element from the queue. If the queue is now empty,
> > + * update carefully so that the next acquisition will queue itself
> > + * at the head of the list.
> > + */
> > + if (tq.next == NULL) {
> > +
> > + /* Mark the queue empty. */
> > + tqhp->spin = NULL;
> > +
> > + /* Try to point the tail back at the head. */
> > + if (cmpxchg(&tqhp->spin_tail,
> > + &tq.next,
> > + &tqhp->spin) == &tq.next)
> > + return true; /* Succeeded, queue is now empty. */
> > +
> > + /* Failed, if needed, wait for the enqueue to complete. */
> > + while (tq.next == NULL)
> > + cpu_relax();
> > +
> > + /* The following code will repair the head. */
> > + }
> > + smp_mb(); /* Force ordering between handoff and critical section. */
> > +
> > + /* Advance list-head pointer. */
> > + ACCESS_ONCE(tqhp->spin) = tq.next;
> > + return true;
> > +}
> > +
> > +/*
> > + * Given a lock that does not have a queue, attempt to associate the
> > + * i-th queue with it, returning true if successful (meaning we hold
> > + * the lock) or false otherwise (meaning we do -not- hold the lock).
> > + * Note that the caller has already filled in ->ref with 0x1, so we
> > + * own the queue.
> > + */
> > +static bool
> > +tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > + struct tkt_q_head *tqhp;
> > +
> > + /* Initialize the i-th queue header. */
> > + tqhp = &tkt_q_heads[i];
> > + tqhp->spin = NULL;
> > + tqhp->spin_tail = &tqhp->spin;
> > +
> > + /* Each pass through this loop attempts to mark the lock as queued. */
> > + do {
> > + asold.head_tail = ACCESS_ONCE(asp->head_tail);
> > + asnew = asold;
> > + if (asnew.tickets.head & 0x1) {
> > +
> > + /* Someone beat us to it, back out. */
> > + smp_mb();
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > +
> > + /* Spin on the queue element they set up. */
> > + return tkt_q_do_spin(asp, inc);
> > + }
> > +
> > + /* The low-order bit in the head counter says "queued". */
> > + asnew.tickets.head |= 0x1;
> > + } while (cmpxchg(&asp->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) != asold.head_tail);
> > +
> > + /* Point the queue at the lock and go spin on it. */
> > + tqhp->head_tkt = asold.tickets.head;
> > + smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
> > + ACCESS_ONCE(tqhp->ref) = asp;
> > + return tkt_q_do_spin(asp, inc);
> > +}
> > +
> > +/*
> > + * Start handling a period of high contention by finding a queue to associate
> > + * with this lock. Returns true if successful (in which case we hold the
> > + * lock) and false otherwise (in which case we do -not- hold the lock).
> > + */
> > +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
> > +{
> > + int i;
> > + int start;
> > +
> > + /* Hash the lock address to find a starting point. */
> > + start = i = tkt_q_hash(asp);
> > +
> > + /*
> > + * Each pass through the following loop attempts to associate
> > + * the lock with the corresponding queue.
> > + */
> > + do {
> > + /*
> > + * Use 0x1 to mark the queue in use, but also avoiding
> > + * any spinners trying to use it before we get it all
> > + * initialized.
> > + */
> > + if (cmpxchg(&tkt_q_heads[i].ref,
> > + NULL,
> > + (arch_spinlock_t *)0x1) == NULL) {
> > +
> > + /* Succeeded, now go initialize it. */
> > + return tkt_q_init_contend(i, asp, inc);
> > + }
> > +
> > + /* If someone beat us to it, go spin on their queue. */
> > + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
> > + return tkt_q_do_spin(asp, inc);
> > + } while ((i = tkt_q_next_slot(i)) != start);
> > +
> > + /* All the queues are in use, revert to spinning on the ticket lock. */
> > + return false;
> > +}
> > +
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> > +{
> > + if (unlikely(inc.head & 0x1)) {
> > +
> > + /* This lock has a queue, so go spin on the queue. */
> > + if (tkt_q_do_spin(ap, inc))
> > + return true;
> > +
> > + /* Get here if the queue is in transition: Retry next time. */
> > +
> > + } else if (TICKET_T_CMP_GE(ACCESS_ONCE(ap->tickets.tail) - TKT_Q_SWITCH,
> > + ACCESS_ONCE(ap->tickets.head))) {
> > +
> > + /*
> > + * This lock has lots of spinners, but no queue.
> > + * Go create a queue to spin on.
> > + */
> > + if (tkt_q_start_contend(ap, inc))
> > + return true;
> > +
> > + /* Get here if the queue is in transition: Retry next time. */
> > + }
> > +
> > + /* Either no need for a queue or the queue is in transition. Spin. */
> > + cpu_relax();
> > + return false;
> > +}
>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/