RE: Serious problem with ticket spinlocks on ia64

From: Luck, Tony
Date: Mon Aug 30 2010 - 14:17:34 EST


> I may tinker with this test a bit to include some short random
> amounts of hold-time for the lock, and delays between attempts
> to acquire it (to make it look more like a contended kernel lock
> and less like a continuous queue of processes trading around a
> lock that is never free).

I've been iterating ... adding new bits to try to reproduce the
kernel environment:

1) Added delays (a bigger delay while not holding the lock than while
holding it - so the amount of contention is controlled).
2) Check that locking actually works (with a critzone element that
is only modified while the lock is held).
3) Sometimes use trylock (all the odd-numbered threads do this).

Compile with -frename-registers ... and add a nop() { } function
in another file (just to make sure the compiler doesn't optimize
away the delay loops).
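
The helper file is just an empty function in a separate translation
unit, so the compiler can't prove the delay loop body is dead (file
and binary names here are only illustrative):

	/* nop.c */
	void nop(void) { }

	cc -O2 -frename-registers ticket_test.c nop.c -o ticket_test

(Some optimization level also matters, so that the unused fetchadd8 /
cmpxchg1/2/8 size branches in the pasted kernel macros are discarded
at compile time.)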

Sadly ... my user mode experiments haven't yet yielded any cases
where the ticket locks fail in the way that Petr saw them mess up
inside the kernel. This latest version has been running for ~90
minutes and has completed 25 million lock/trylock iterations (with
about a third of the ticket lock wraparounds passing through the
uncontested case (lock == 0) and the rest happening with some
processes waiting for the lock).
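
(For scale: with TICKET_BITS = 15 the next_ticket field wraps every
2^15 = 32768 acquisitions, so 25 million iterations works out to
roughly 760 wraps; the wraps[] counters in the program only record
the ones where the fetchadd actually saw lock == 0.)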

So now I'm trying to think of other ways that the kernel case
differs from my user mode mock-up.

-Tony

The current test program:
typedef struct {
	volatile unsigned int lock;
} arch_spinlock_t;

#define WORKERS 16
int workers = WORKERS;

struct sh {			/* each field padded onto its own cache line */
	arch_spinlock_t l; char pad1[60];
	long success; char pad2[56];
	unsigned long critzone; char pad3[56];
	unsigned long running; char pad4[56];
	long worker[WORKERS];
	int wraps[WORKERS];
} *s;

int me;

/* cut & paste infrastructure from kernel starts here */
typedef unsigned long __u64;
typedef unsigned int __u32;
typedef unsigned short __u16;
typedef unsigned char __u8;
#define __always_inline inline __attribute__((always_inline))
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

extern unsigned long __bad_size_for_ia64_fetch_and_add (void);
extern unsigned long __bad_increment_for_ia64_fetch_and_add (void);

#define ia64_invala() asm volatile ("invala" ::: "memory")

#define ia64_hint_pause 0

#define ia64_hint(mode) \
({ \
	switch (mode) { \
	case ia64_hint_pause: \
		asm volatile ("hint @pause" ::: "memory"); \
		break; \
	} \
})
#define cpu_relax() ia64_hint(ia64_hint_pause)

#define ia64_fetchadd4_acq(p, inc) \
({ \
	__u64 ia64_intri_res; \
	asm volatile ("fetchadd4.acq %0=[%1],%2" \
			: "=r"(ia64_intri_res) : "r"(p), "i" (inc) \
			: "memory"); \
	ia64_intri_res; \
})

#define IA64_FETCHADD(tmp,v,n,sz,sem) \
({ \
	switch (sz) { \
	case 4: \
		tmp = ia64_fetchadd4_##sem((unsigned int *) v, n); \
		break; \
	case 8: \
		tmp = ia64_fetchadd8_##sem((unsigned long *) v, n); \
		break; \
	default: \
		__bad_size_for_ia64_fetch_and_add(); \
	} \
})

#define ia64_fetchadd(i,v,sem) \
({ \
	__u64 _tmp; \
	volatile __typeof__(*(v)) *_v = (v); \
	/* Can't use a switch () here: gcc isn't always smart enough for that... */ \
	if ((i) == -16) \
		IA64_FETCHADD(_tmp, _v, -16, sizeof(*(v)), sem); \
	else if ((i) == -8) \
		IA64_FETCHADD(_tmp, _v, -8, sizeof(*(v)), sem); \
	else if ((i) == -4) \
		IA64_FETCHADD(_tmp, _v, -4, sizeof(*(v)), sem); \
	else if ((i) == -1) \
		IA64_FETCHADD(_tmp, _v, -1, sizeof(*(v)), sem); \
	else if ((i) == 1) \
		IA64_FETCHADD(_tmp, _v, 1, sizeof(*(v)), sem); \
	else if ((i) == 4) \
		IA64_FETCHADD(_tmp, _v, 4, sizeof(*(v)), sem); \
	else if ((i) == 8) \
		IA64_FETCHADD(_tmp, _v, 8, sizeof(*(v)), sem); \
	else if ((i) == 16) \
		IA64_FETCHADD(_tmp, _v, 16, sizeof(*(v)), sem); \
	else \
		_tmp = __bad_increment_for_ia64_fetch_and_add(); \
	(__typeof__(*(v))) (_tmp); /* return old value */ \
})

#define ia64_cmpxchg4_acq(ptr, new, old) \
({ \
	__u64 ia64_intri_res; \
	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old)); \
	asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv": \
			"=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory"); \
	ia64_intri_res; \
})

/*
* This function doesn't exist, so you'll get a linker error
* if something tries to do an invalid cmpxchg().
*/
extern long ia64_cmpxchg_called_with_bad_pointer (void);

/*
 * Only the 4-byte fetchadd/cmpxchg are defined above; the other size
 * branches are dead code in this test (the lock word is 4 bytes), so
 * bare declarations are enough - provided you compile with
 * optimization so the dead branches really are discarded.
 */
extern __u64 ia64_cmpxchg1_acq(__u8 *ptr, int new, __u64 old);
extern __u64 ia64_cmpxchg2_acq(__u16 *ptr, int new, __u64 old);
extern __u64 ia64_cmpxchg8_acq(__u64 *ptr, int new, __u64 old);
extern unsigned long ia64_fetchadd8_acq(unsigned long *p, int inc);

#define ia64_cmpxchg(sem,ptr,old,new,size) \
({ \
	__u64 _o_, _r_; \
	\
	switch (size) { \
	case 1: _o_ = (__u8 ) (long) (old); break; \
	case 2: _o_ = (__u16) (long) (old); break; \
	case 4: _o_ = (__u32) (long) (old); break; \
	case 8: _o_ = (__u64) (long) (old); break; \
	default: break; \
	} \
	switch (size) { \
	case 1: \
		_r_ = ia64_cmpxchg1_##sem((__u8 *) ptr, new, _o_); \
		break; \
	case 2: \
		_r_ = ia64_cmpxchg2_##sem((__u16 *) ptr, new, _o_); \
		break; \
	case 4: \
		_r_ = ia64_cmpxchg4_##sem((__u32 *) ptr, new, _o_); \
		break; \
	case 8: \
		_r_ = ia64_cmpxchg8_##sem((__u64 *) ptr, new, _o_); \
		break; \
	default: \
		_r_ = ia64_cmpxchg_called_with_bad_pointer(); \
		break; \
	} \
	(__typeof__(old)) _r_; \
})

#define TICKET_SHIFT 17	/* was 17 */
#define TICKET_BITS 15	/* was 15 */
#define TICKET_MASK ((1 << TICKET_BITS) - 1)
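
/*
 * Lock word layout (per the kernel's arch/ia64/include/asm/spinlock.h
 * that this code was pasted from): next_ticket in the low 15 bits,
 * now_serving in the top 15, and 2 pad bits between them so that
 * next_ticket can't overflow into now_serving.
 *
 *   31            17 16  15 14              0
 *  +----------------+------+-----------------+
 *  |  now_serving   | pad  |   next_ticket   |
 *  +----------------+------+-----------------+
 */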

static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
{
	int *p = (int *)&lock->lock, ticket, serve;

	ticket = ia64_fetchadd(1, p, acq);

	if (ticket == 0) s->wraps[me]++;	/* instrumentation: wrap seen with lock == 0 */

	if (!(((ticket >> TICKET_SHIFT) ^ ticket) & TICKET_MASK))
		return;

	/*
	 * Invalidate the ALAT, then spin with ld4.c.nc: the check-load
	 * only re-reads memory once another CPU's store to the lock
	 * line has knocked out the ALAT entry, so the wait loops locally.
	 */
	ia64_invala();

	for (;;) {
		asm volatile ("ld4.c.nc %0=[%1]" : "=r"(serve) : "r"(p) : "memory");

		if (!(((serve >> TICKET_SHIFT) ^ ticket) & TICKET_MASK)) {
			return;
		}
		cpu_relax();
	}
}

static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
{
	int tmp = ACCESS_ONCE(lock->lock);

	if (tmp == 0) s->wraps[me]++;	/* instrumentation: wrap seen with lock == 0 */
	if (!(((tmp >> TICKET_SHIFT) ^ tmp) & TICKET_MASK))
		return ia64_cmpxchg(acq, &lock->lock, tmp, tmp + 1, sizeof (tmp)) == tmp;
	return 0;
}

static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
{
	/*
	 * p addresses the upper halfword of the lock word, which holds
	 * now_serving in its bits 1..15; "+ 2" advances now_serving by
	 * one and "& ~1" keeps the low pad bit clear.
	 */
	unsigned short *p = (unsigned short *)&lock->lock + 1, tmp;

	asm volatile ("ld2.bias %0=[%1]" : "=r"(tmp) : "r"(p));
	ACCESS_ONCE(*p) = (tmp + 2) & ~1;
}
/* cut & paste infrastructure from kernel ends here */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>

extern void nop(void);

void
delay(int scale)
{
	int r = random();

	r &= 0xffff;	/* 0 .. 65535 iterations ... */
	r *= scale;	/* ... scaled up (workers outside the lock, 1 inside) */

	while (r-- > 0)
		nop();
}

void
work(void)
{
	unsigned long crit, mymask = (1ul << me);
	int gotlock;

	srandom(getpid());
	while (s->running) {
		delay(workers);		/* longer delay while not holding the lock */
		if (me & 1) {
			gotlock = __ticket_spin_trylock(&s->l);
		} else {
			__ticket_spin_lock(&s->l);
			gotlock = 1;
		}
		if (gotlock) {
			if ((crit = s->critzone) != 0) {
				s->running = 0;
				printf("Worker %d saw %lx at entry\n", me, crit);
				break;
			}
			s->critzone |= mymask;
			s->success++;
			s->worker[me]++;

			delay(1);	/* short delay while holding the lock */

			if ((crit = s->critzone) != mymask) {
				printf("Worker %d saw %lx at exit\n", me, crit);
				s->running = 0;
				break;
			}
			s->critzone = 0;
			__ticket_spin_unlock(&s->l);
		}
	}
}

int
main(int argc, char **argv)
{
	int i, pid;

	if (argc > 1) {
		workers = atoi(argv[1]);
		if (workers < 1 || workers > WORKERS)
			workers = WORKERS;
	}
	s = mmap(NULL, 65536, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0L);
	if (s == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	printf("shared mapping at %p\n", s);
	s->running = 1;

	for (i = 0; i < workers; i++) switch (pid = fork()) {
	case -1: perror("fork"); s->running = 0; return 1;
	case 0: me = i; work(); return 0;
	}

	while (s->running) {
		sleep(5);
		printf("%ld [lock = %.8x]\n", s->success, s->l.lock);
		for (i = 0; i < workers; i++)
			printf(" %ld %.8x\n", s->worker[i], s->wraps[i]);
	}
	return 0;
}
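
Run it with the number of workers on the command line (the default
is 16), e.g.

	$ ./ticket_test 8

Every 5 seconds the parent prints the cumulative success count and
the raw lock word, then one line per worker with its success count
and the number of wraps it saw. (The binary name is just whatever
you called the output of the build line earlier.)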