Re: [PATCH RFC ticketlock] Auto-queued ticketlock
From: Steven Rostedt
Date: Mon Jun 10 2013 - 20:44:50 EST
On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> +
> +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> +{
> + register struct __raw_tickets inc = { .tail = 2 };
> +
> + inc = xadd(&lock->tickets, inc);
> + for (;;) {
> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> + break;
> + inc.head = ACCESS_ONCE(lock->tickets.head);
> + }
> + barrier(); /* smp_mb() on Power or ARM. */
> +}
> +
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> {
> arch_spinlock_t old, new;
> @@ -70,17 +91,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> if (old.tickets.head != old.tickets.tail)
> return 0;
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>
> /* cmpxchg is a full barrier, so nothing can move before it */
> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> }
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> +
> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> {
> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> }
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +extern void tkt_q_do_wake(arch_spinlock_t *asp);
> +
> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> +{
> + __ticket_t head = 2;
> +
> + head = xadd(&lock->tickets.head, 2);
> + if (head & 0x1)
> + tkt_q_do_wake(lock);
> +}
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> {
> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index ad0ad07..cdaefdd 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -7,12 +7,18 @@
>
> #include <linux/types.h>
>
> -#if (CONFIG_NR_CPUS < 256)
> +#if (CONFIG_NR_CPUS < 128)
> typedef u8 __ticket_t;
> typedef u16 __ticketpair_t;
> -#else
> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> +#elif (CONFIG_NR_CPUS < 32768)
> typedef u16 __ticket_t;
> typedef u32 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> +#else
> +typedef u32 __ticket_t;
> +typedef u64 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> #endif
>
> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> union {
> __ticketpair_t head_tail;
> struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__
> + __ticket_t tail, head;
> +#else /* #ifdef __BIG_ENDIAN__ */
> __ticket_t head, tail;
> +#endif /* #else #ifdef __BIG_ENDIAN__ */
> } tickets;
> };
> } arch_spinlock_t;
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e9ef6d6..816a87c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -15,6 +15,7 @@
> #include <asm/byteorder.h>
> #include <uapi/linux/kernel.h>
>
> +#define UCHAR_MAX ((u8)(~0U))
> #define USHRT_MAX ((u16)(~0U))
> #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 44511d1..ad9c67c 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -223,3 +223,21 @@ endif
> config MUTEX_SPIN_ON_OWNER
> def_bool y
> depends on SMP && !DEBUG_MUTEXES
> +
> +config TICKET_LOCK_QUEUED
> + bool "Dynamically switch between ticket and queued locking"
> + default n
> + ---help---
> + Enable dynamic switching between ticketlock and queued locking
> + on a per-lock basis. This option will slow down low-contention
> + acquisition and release very slightly (additional conditional
> + in release path), but will provide more efficient operation at
> + high levels of lock contention. High-contention operation will
> + not be quite as efficient as would be a pure queued lock, but
> + this dynamic approach consumes less memory than queud locks
> + and also runs faster at low levels of contention.
> +
> + Say "Y" if you are running on a large system with a workload
> + that is likely to result in high levels of contention.
> +
> + Say "N" if you are unsure.
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 271fd31..70a91f7 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -51,6 +51,7 @@ endif
> obj-$(CONFIG_SMP) += spinlock.o
> obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> obj-$(CONFIG_UID16) += uid16.o
> obj-$(CONFIG_MODULES) += module.o
> obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> new file mode 100644
> index 0000000..f01b760
> --- /dev/null
> +++ b/kernel/tktqlock.c
> @@ -0,0 +1,333 @@
> +/*
> + * Queued ticket spinlocks.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright IBM Corporation, 2013
> + *
> + * Authors: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
> + */
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/smp.h>
> +#include <linux/percpu.h>
> +
> +struct tkt_q {
> + int cpu;
> + __ticket_t tail;
> + struct tkt_q *next;
> +};
> +
> +struct tkt_q_head {
> + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> + s32 head_tkt; /* Head ticket when started queuing. */
> + struct tkt_q *spin; /* Head of queue. */
> + struct tkt_q **spin_tail; /* Tail of queue. */
> +};
> +
> +/*
> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> + * given ticket lock to motivate switching to spinning on a queue.
> + * The reason that it is twice the number is because the bottom bit of
> + * the ticket is reserved for the bit that indicates that a queue is
> + * associated with the lock.
> + */
> +#define TKT_Q_SWITCH (16 * 2)
> +
> +/*
> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> + * might have multiple highly contended locks, so provide more queues for
> + * systems with larger numbers of CPUs.
> + */
> +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> +
> +/* The queues themselves. */
> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> +
> +/* Advance to the next queue slot, wrapping around to the beginning. */
> +static int tkt_q_next_slot(int i)
> +{
> + return (++i < TKT_Q_NQUEUES) ? i : 0;
> +}
> +
> +/* Very crude hash from lock address to queue slot number. */
> +static unsigned long tkt_q_hash(arch_spinlock_t *asp)
> +{
> + return (((unsigned long)asp) >> 8) % TKT_Q_NQUEUES;
> +}
> +
> +/*
> + * Return a pointer to the queue header associated with the specified lock,
> + * or return NULL if there is no queue for the lock or if the lock's queue
> + * is in transition.
> + */
> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> +{
> + int i;
> + int start;
> +
> + start = i = tkt_q_hash(asp);
> + do
> + if (tkt_q_heads[i].ref == asp)
> + return &tkt_q_heads[i];
> + while ((i = tkt_q_next_slot(i)) != start);
> + return NULL;
> +}
> +
> +/*
> + * Try to stop queuing, reverting back to normal ticket-lock operation.
> + * We can only stop queuing when the queue is empty, which means that
> + * we need to correctly handle races where someone shows up in the queue
> + * just as we are trying to dispense with the queue. They win, we lose.
> + */
> +static bool tkt_q_try_unqueue(arch_spinlock_t *asp, struct tkt_q_head *tqhp)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> +
> + /* Pick up the ticket values. */
> + asold = ACCESS_ONCE(*asp);
> + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> +
> + /* Attempt to mark the lock as not having a queue. */
> + asnew = asold;
> + asnew.tickets.head &= ~0x1;
> + if (cmpxchg(&asp->head_tail,
> + asold.head_tail,
> + asnew.head_tail) == asold.head_tail) {
> +
> + /* Succeeded, mark the queue as unused. */
> + ACCESS_ONCE(tqhp->ref) = NULL;
> + return true;
> + }
> + }
> +
> + /* Failed, tell the caller there is still a queue to pass off to. */
> + return false;
> +}
> +
> +/*
> + * Hand the lock off to the first CPU on the queue.
> + */
> +void tkt_q_do_wake(arch_spinlock_t *asp)
> +{
> + struct tkt_q_head *tqhp;
> + struct tkt_q *tqp;
> +
> + /* If the queue is still being set up, wait for it. */
> + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> + cpu_relax();
> +
> + for (;;) {
> +
> + /* Find the first queue element. */
> + tqp = ACCESS_ONCE(tqhp->spin);
> + if (tqp != NULL)
> + break; /* Element exists, hand off lock. */
> + if (tkt_q_try_unqueue(asp, tqhp))
> + return; /* No element, successfully removed queue. */
> + cpu_relax();
> + }
> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> + ACCESS_ONCE(tqhp->head_tkt) = -1;
> + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> + ACCESS_ONCE(tqp->cpu) = -1;
> +}
> +
> +/*
> + * Given a lock that already has a queue associated with it, spin on
> + * that queue. Return false if there was no queue (which means we do not
> + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> + */
> +bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + struct tkt_q **oldtail;
> + struct tkt_q tq;
> + struct tkt_q_head *tqhp;
> +
> + /*
> + * Ensure that accesses to queue header happen after sensing
> + * the lock's have-queue bit.
> + */
> + smp_mb(); /* See above block comment. */
> +
> + /* If there no longer is a queue, leave. */
> + tqhp = tkt_q_find_head(asp);
> + if (tqhp == NULL)
> + return false;
> +
> + /* Initialize our queue element. */
> + tq.cpu = raw_smp_processor_id();
> + tq.tail = inc.tail;
> + tq.next = NULL;
> +
> + /* Check to see if we already hold the lock. */
> + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> + /* The last holder left before queue formed, we hold lock. */
> + tqhp->head_tkt = -1;
> + return true;
> + }
> +
> + /* Add our element to the tail of the queue. */
> + oldtail = xchg(&tqhp->spin_tail, &tq.next);
Boy this is tricky code! I thought I found a race window here, but as I
went to write my email saying "Gotcha!" I found that it wasn't a race
after all. But as I went though the effort of writing this, I figured I
would send this out as documentation for others to see. Hmm, I wonder if
we can use this email to add more comments. Anyway, here's what I
thought was wrong ;-)
OK, I originally thought there was a race window here. Let's say that an
NMI hit right here, and it happens to be a big one, where lots of things
can happen on other CPUs right now.
The scenario is that there's just one item on the queue, which is
waiting for the lock to be released, and is spinning below in the:
while (ACCESS_ONCE(tq.cpu) != -1)
cpu_relax();
And then the lock is released, where in tkt_q_do_wake() the following is
called:
ACCESS_ONCE(tqp->cpu) = -1;
Now the old queued task is released. But it's tq->next hasn't been set
yet, and is still NULL. It leaves by doing:
ACCESS_ONCE(tqhp->spin) = tq.next;
return true;
All before this task gets to set *oldtail to &tq. But, I then looked
below...
> + ACCESS_ONCE(*oldtail) = &tq;
> +
> + /* Spin until handoff. */
> + while (ACCESS_ONCE(tq.cpu) != -1)
> + cpu_relax();
> +
> + /*
> + * Remove our element from the queue. If the queue is now empty,
> + * update carefully so that the next acquisition will queue itself
> + * at the head of the list.
> + */
> + if (tq.next == NULL) {
This checks for that scenario. As if the old task were to come out
spinning, the problem would only be if it was the last one on the list,
and its tq.next was NULL. But if that was the case, then we set spin to
NULL and do the next trick, where I thought I gotcha again...
> +
> + /* Mark the queue empty. */
> + tqhp->spin = NULL;
> +
> + /* Try to point the tail back at the head. */
> + if (cmpxchg(&tqhp->spin_tail,
> + &tq.next,
> + &tqhp->spin) == &tq.next)
Here, I was thinking, oh wait, what happens if this is called right
before the xchg() above. Then we would set spin_tail but not update the
old tq.next. But wait! look at what we assign spin_tail to. It's the
address of spin, which would be what oldtail would point to above, and
then above would set spin to the new tq!
OK, I haven't found a issue here yet, but youss are beiing trickssy! We
don't like trickssy, and we must find precccciouss!!!
This code is starting to make me look like Gollum :-p
-- Steve
> + return true; /* Succeeded, queue is now empty. */
> +
> + /* Failed, if needed, wait for the enqueue to complete. */
> + while (tq.next == NULL)
> + cpu_relax();
> +
> + /* The following code will repair the head. */
> + }
> + smp_mb(); /* Force ordering between handoff and critical section. */
> +
> + /* Advance list-head pointer. */
> + ACCESS_ONCE(tqhp->spin) = tq.next;
> + return true;
> +}
> +
> +/*
> + * Given a lock that does not have a queue, attempt to associate the
> + * i-th queue with it, returning true if successful (meaning we hold
> + * the lock) or false otherwise (meaning we do -not- hold the lock).
> + * Note that the caller has already filled in ->ref with 0x1, so we
> + * own the queue.
> + */
> +static bool
> +tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> + struct tkt_q_head *tqhp;
> +
> + /* Initialize the i-th queue header. */
> + tqhp = &tkt_q_heads[i];
> + tqhp->spin = NULL;
> + tqhp->spin_tail = &tqhp->spin;
> +
> + /* Each pass through this loop attempts to mark the lock as queued. */
> + do {
> + asold.head_tail = ACCESS_ONCE(asp->head_tail);
> + asnew = asold;
> + if (asnew.tickets.head & 0x1) {
> +
> + /* Someone beat us to it, back out. */
> + smp_mb();
> + ACCESS_ONCE(tqhp->ref) = NULL;
> +
> + /* Spin on the queue element they set up. */
> + return tkt_q_do_spin(asp, inc);
> + }
> +
> + /* The low-order bit in the head counter says "queued". */
> + asnew.tickets.head |= 0x1;
> + } while (cmpxchg(&asp->head_tail,
> + asold.head_tail,
> + asnew.head_tail) != asold.head_tail);
> +
> + /* Point the queue at the lock and go spin on it. */
> + tqhp->head_tkt = asold.tickets.head;
> + smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
> + ACCESS_ONCE(tqhp->ref) = asp;
> + return tkt_q_do_spin(asp, inc);
> +}
> +
> +/*
> + * Start handling a period of high contention by finding a queue to associate
> + * with this lock. Returns true if successful (in which case we hold the
> + * lock) and false otherwise (in which case we do -not- hold the lock).
> + */
> +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + int i;
> + int start;
> +
> + /* Hash the lock address to find a starting point. */
> + start = i = tkt_q_hash(asp);
> +
> + /*
> + * Each pass through the following loop attempts to associate
> + * the lock with the corresponding queue.
> + */
> + do {
> + /*
> + * Use 0x1 to mark the queue in use, but also avoiding
> + * any spinners trying to use it before we get it all
> + * initialized.
> + */
> + if (cmpxchg(&tkt_q_heads[i].ref,
> + NULL,
> + (arch_spinlock_t *)0x1) == NULL) {
> +
> + /* Succeeded, now go initialize it. */
> + return tkt_q_init_contend(i, asp, inc);
> + }
> +
> + /* If someone beat us to it, go spin on their queue. */
> + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
> + return tkt_q_do_spin(asp, inc);
> + } while ((i = tkt_q_next_slot(i)) != start);
> +
> + /* All the queues are in use, revert to spinning on the ticket lock. */
> + return false;
> +}
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + if (unlikely(inc.head & 0x1)) {
> +
> + /* This lock has a queue, so go spin on the queue. */
> + if (tkt_q_do_spin(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> +
> + } else if (TICKET_T_CMP_GE(ACCESS_ONCE(ap->tickets.tail) - TKT_Q_SWITCH,
> + ACCESS_ONCE(ap->tickets.head))) {
> +
> + /*
> + * This lock has lots of spinners, but no queue.
> + * Go create a queue to spin on.
> + */
> + if (tkt_q_start_contend(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> + }
> +
> + /* Either no need for a queue or the queue is in transition. Spin. */
> + cpu_relax();
> + return false;
> +}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/