i386 spinlock fairness: bizarre test results

From: Chuck Ebbert
Date: Mon Oct 10 2005 - 23:09:00 EST


After seeing Kirill's message about spinlocks I decided to do my own
testing with the userspace program below; the results were very strange.

When using the 'mov' instruction to do the unlock I was able to reproduce
hogging of the spinlock by a single CPU even on Pentium II under some
conditions, while using 'xchg' always allowed the other CPU to get the
lock:


[me@d2 t]$ gcc -O2 -o spintest.o -Wall -DUSE_MOV=1 spintest.c ; ./spintest.o
parent CPU 1, child CPU 0, using mov instruction for unlock
parent did: 34534 of 10000001 iterations
CPU clocks: 2063591864

[me@d2 t]$ gcc -O2 -o spintest.o -Wall -DUSE_MOV=0 spintest.c ; ./spintest.o
parent CPU 1, child CPU 0, using xchg instruction for unlock
parent did: 5169760 of 10000001 iterations
CPU clocks: 2164689784


The results were dependent on the alignment of the "lock ; decl" at the start
of the spinlock code. If that 7-byte instruction spanned two cachelines then
the CPU with that code could somehow hog the spinlock. Optimizing for
Pentium II forced 16-byte alignment and made the spinlock fairer, but still
somewhat biased when using 'mov':


[me@d2 t]$ gcc -O2 -o spintest.o -Wall -DUSE_MOV=1 -mcpu=pentium2 spintest.c ; ./spintest.o
parent CPU 1, child CPU 0, using mov instruction for unlock
parent did: 4181147 of 10000001 iterations
CPU clocks: 2057436825


That test machine was a dual 350MHz Pentium II Xeon; on a dual 333MHz Pentium II
Overdrive (with very slow Socket 8 bus) I could not reproduce those results.
However, on that machine the 'xchg' instruction made the test run almost 20%
_faster_ than using 'mov'.

So I think the i386 spinlock code should be changed to always use 'xchg' to do
spin_unlock.

================================================================================
/* spinlock test
*
* Tests spinlock fairness on SMP i386 machine.
*/

/* number of tests */
#ifndef ITERS
#define ITERS 10000000
#endif

/* use "mov" instruction for spin_unlock instead of "xchg" */
#ifndef USE_MOV
#define USE_MOV 1
#endif

/* cpu to run parent process; child will use !PARENT_CPU */
#ifndef PARENT_CPU
#define PARENT_CPU 1
#endif

/* change this to match your version of glibc -- "2" means use 2 args */
#ifndef SETAFFINITY
#define SETAFFINITY 2
#endif

#if SETAFFINITY == 2
#define setaffinity(pid, mask) sched_setaffinity((pid), &(mask))
#else /* 3 args */
#define setaffinity(pid, mask) sched_setaffinity((pid), sizeof(mask), &(mask))
#endif

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sched.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <asm/user.h>

/* Branch-prediction hints wrapping GCC's __builtin_expect. */
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

/*
 * Read the CPU timestamp counter into the 64-bit lvalue r.
 * NOTE(review): the "=A" constraint is taken to mean the edx:eax pair,
 * which holds for 32-bit i386 builds -- confirm before building this
 * for any other target.
 */
#define RDTSCLL(r) asm volatile("rdtsc" : "=A" (r))

/* TSC samples taken by main() before and after its test loop. */
unsigned long long tsc1, tsc2;
/* Affinity masks: cpuset0 is set by main(), cpuset1 by do_child(). */
cpu_set_t cpuset0, cpuset1;
int test = 1; /* for testing -- starts unlocked */
int strt = 0; /* sync startup -- starts locked */
/* Memory handed to clone() as the child's stack (main passes &stk[1023]). */
int stk[1024];
/*
 * iters is the shared iteration count, incremented by both parent and
 * child while holding the "test" lock; p_iters and c_iters count the
 * iterations done by the parent and the child respectively.
 */
int iters, p_iters, c_iters;

/*
 * Acquire the spinlock at *l, spinning until it becomes available.
 *
 * Lock word encoding: 1 means unlocked, 0 or negative means locked.
 *
 *   1: atomically decrement the lock word; a non-negative result means
 *      we took the lock, so jump to 3 (done).
 *   2: otherwise spin with "rep;nop", re-reading the lock word until it
 *      is greater than zero, then go back to 1 and retry the decrement.
 *
 * The lock word is a "+m" (read-write) operand: both the decl and the
 * cmpl read it, so an output-only "=m" would wrongly tell the compiler
 * that the previous contents are dead.  The "memory" clobber keeps the
 * compiler from caching other memory across the lock acquisition.
 * The __asm__/__volatile__ spellings are used so this also builds in
 * strict ISO modes (-std=c99, -std=c11) where "asm" is not a keyword.
 */
static inline void __raw_spin_lock(int *l)
{
	__asm__ __volatile__(
		"\n1:\t"
		"lock ; decl %0\n\t"
		"jns 3f\n"
		"2:\t"
		"rep;nop\n\t"
		"cmpl $0,%0\n\t"
		"jle 2b\n\t"
		"jmp 1b\n"
		"3:\n\t"
		: "+m" (*l) : : "memory");
}

#if USE_MOV

/*
 * Release the spinlock at *l by storing 1 (unlocked) with a plain
 * "movl".  The lock word is a pure output here, so "=m" is the right
 * constraint; the "memory" clobber keeps earlier accesses from being
 * moved past the unlock by the compiler.
 */
static inline void __raw_spin_unlock(int *l)
{
	__asm__ __volatile__("movl $1,%0"
		: "=m" (*l) : : "memory");
}

#else

/*
 * Release the spinlock at *l by exchanging 1 (unlocked) into it with
 * "xchgl".  The previous lock value comes back in oldval and is
 * discarded.
 *
 * xchg both reads and writes its memory operand, so the lock word is
 * declared "+m" rather than output-only "=m".  "=q" with the matching
 * "0" input puts oldval in a register that is loaded with 1 on entry.
 */
static inline void __raw_spin_unlock(int *l)
{
	int oldval = 1;

	__asm__ __volatile__("xchgl %0,%1"
		: "=q" (oldval), "+m" (*l)
		: "0" (oldval) : "memory");
}

#endif /* USE_MOV */

/*
 * Child side of the test; entry point passed to clone() by main().
 *
 * Sets its CPU affinity to the CPU that is not PARENT_CPU, releases the
 * "strt" lock so the parent can begin, then repeatedly takes the "test"
 * lock, counts one iteration in c_iters and in the shared iters, and
 * drops the lock again until iters reaches ITERS.
 *
 * vp is the clone() argument and is not used.  Always returns 0.
 */
int do_child(void *vp)
{
/* Run on the "other" CPU; a failure is reported but not fatal. */
CPU_SET(!PARENT_CPU, &cpuset1);
if (unlikely(setaffinity(0, cpuset1)))
perror("child setaffinity");

/* Add "nop" insns as necessary to make 1st
* insn of __raw_spin_lock span cachelines.
*/
asm("nop ; nop ; nop");

__raw_spin_unlock(&strt); /* release parent */
again:
__raw_spin_lock(&test);
/* Both counters are updated while the test lock is held. */
c_iters++;
if (likely(++iters < ITERS)) {
__raw_spin_unlock(&test);
goto again;
}
/* Limit reached: drop the lock one last time and finish. */
__raw_spin_unlock(&test);

return 0;
}

int main(int argc, char * const argv[])
{
CPU_SET(PARENT_CPU, &cpuset0);
if (unlikely(setaffinity(0, cpuset0)))
perror("parent setaffinity");

clone(do_child, &stk[1023], CLONE_VM, 0);

__raw_spin_lock(&strt); /* wait for child to init */
RDTSCLL(tsc1);
again:
__raw_spin_lock(&test);
p_iters++;
if (likely(++iters < ITERS)) {
__raw_spin_unlock(&test);
goto again;
}
__raw_spin_unlock(&test);
RDTSCLL(tsc2);

printf("parent CPU %d, child CPU %d, ", PARENT_CPU, !PARENT_CPU);
printf("using %s instruction for unlock\n", USE_MOV ? "mov" : "xchg");
printf("parent did: %d of %d iterations\n", p_iters, iters);
printf("CPU clocks: %14llu\n", tsc2 - tsc1);

return 0;
}
__
Chuck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/