Alternative implementation of the generic __ffs
From: Alexander van Heukelum
Date: Fri Apr 18 2008 - 16:20:04 EST
On Mon, Apr 07, 2008 at 12:25:50PM +0200, Alexander van Heukelum wrote:
> On Sun, 6 Apr 2008 13:22:58 -0700 (PDT), "dean gaudet" <dean@xxxxxxxxxx> said:
> > On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> > > The current generic implementation of ffz is O(lg(n)) already
> >
> > it's O(lg(n)) time... the operations all depend on each other.
> >
> > the implementation i pointed to is O(lg(n)) code space... and the time
> > depends on how parallel the machine is, they're not dependent on each
> > other.
>
> Indeed. The worst dependencies are in the sum of all the partial
> results in this implementation. And addition is associative, so
> partial results can be written as ((a+b)+(c+d))+(e+f). Assuming
> perfect parallel execution this would lead to O(ln(ln(n))). Good.
Hello all,
I've implemented ffs (find first set bit) like it is shown
in http://www.hackersdelight.org/ (see revisions, page 21).
I would be interested to see how it compares with the generic
implementation of __ffs in the linux kernel, in particular
on architectures that do not have an obvious fast way of
determining the first set bit of a word.
I have included a simple benchmark program that should give
a reasonable estimate of the performance ratio of the two
implementations. Please test and report :).
Is this implementation suitable to replace the current one?
Greetings,
Alexander
P.S. Some results for some machines I could test on:
-----------
On a 2.1 GHz Athlon XP:
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original: 396 tics, 756 tics
New: 378 tics, 851 tics
Assembly: 262 tics, 383 tics
Empty loop: 128 tics, 203 tics
real 0m33.862s
user 0m33.026s
sys 0m0.344s
-----------
On a 2.33 GHz Xeon:
$ gcc -m64 -Os ffs.c && time ./a.out
Original: 277 tics, 324 tics
New: 230 tics, 236 tics
Assembly: 90 tics, 84 tics
Empty loop: 90 tics, 82 tics
real 0m14.490s
user 0m14.270s
sys 0m0.220s
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original: 303 tics, 449 tics
New: 231 tics, 394 tics
Assembly: 90 tics, 144 tics
Empty loop: 102 tics, 116 tics
real 0m18.521s
user 0m18.380s
sys 0m0.140s
-----------
On an alpha EV6.7 (21264A) operating at 667 MHz:
$ gcc -Os ffs.c && time ./a.out
Original: 622 tics, 633 tics
New: 431 tics, 465 tics
Assembly: 235 tics, 210 tics
Empty loop: 199 tics, 218 tics
50.358u 0.259s 1:14.28 68.1% 0+1k 2+0io 2pf+0w
-----------
#include <stdio.h>
#include <sys/times.h>
#define LOOPS32 (1<<(30-5-1))
#define LOOPS64 (1<<(30-6-1))
#define ATTR __attribute__((noinline))
static ATTR int __ffs32_orig(unsigned int word)
{
int num = 0;
if ((word & 0xffff) == 0) {
num += 16;
word >>= 16;
}
if ((word & 0xff) == 0) {
num += 8;
word >>= 8;
}
if ((word & 0xf) == 0) {
num += 4;
word >>= 4;
}
if ((word & 0x3) == 0) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0)
num += 1;
return num;
}
static ATTR int __ffs64_orig(unsigned long long word)
{
int num = 0;
if ((word & 0xffffffff) == 0) {
num += 32;
word >>= 32;
}
if ((word & 0xffff) == 0) {
num += 16;
word >>= 16;
}
if ((word & 0xff) == 0) {
num += 8;
word >>= 8;
}
if ((word & 0xf) == 0) {
num += 4;
word >>= 4;
}
if ((word & 0x3) == 0) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0)
num += 1;
return num;
}
static ATTR int __ffs32_new(unsigned int value)
{
int x0, x1, x2, x3, x4;
value &= -value;
x0 = (value & 0x55555555) ? 0 : 1;
x1 = (value & 0x33333333) ? 0 : 2;
x2 = (value & 0x0f0f0f0f) ? 0 : 4;
x3 = (value & 0x00ff00ff) ? 0 : 8;
x4 = (value & 0x0000ffff) ? 0 : 16;
return x0 | x1 | x2 | x3 | x4;
}
static ATTR int __ffs64_new(unsigned long long value)
{
int x0, x1, x2, x3, x4, x5;
value &= -value;
x0 = (value & 0x5555555555555555ull) ? 0 : 1;
x1 = (value & 0x3333333333333333ull) ? 0 : 2;
x2 = (value & 0x0f0f0f0f0f0f0f0full) ? 0 : 4;
x3 = (value & 0x00ff00ff00ff00ffull) ? 0 : 8;
x4 = (value & 0x0000ffff0000ffffull) ? 0 : 16;
x5 = (value & 0x00000000ffffffffull) ? 0 : 32;
return x0 | x1 | x2 | x3 | x4 | x5;
}
#ifdef __x86_64__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
int res;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
static ATTR int __ffs64_asm(unsigned long long value)
{
long res;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
#endif
#ifdef __i386__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
int res;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
static ATTR int __ffs64_asm(unsigned long long value)
{
unsigned int low, high;
int res;
low = value;
if (low) {
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (low)
);
return res;
}
high = value >> 32;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (high)
);
return res | 32;
}
#endif
#ifdef __alpha__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
int res;
asm volatile("cttz %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
static ATTR int __ffs64_asm(unsigned long long value)
{
long res;
asm volatile("cttz %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
#endif
static ATTR int __nothing32(unsigned int value)
{
return value;
}
static ATTR int __nothing64(unsigned long long value)
{
return (int)value;
}
/* Random numbers: modified version of libc mrand48 */
static unsigned long long myrand_next = 0x330eull;
static const unsigned long long myrand_a = 0x5deece66dull;
static const unsigned long long myrand_b = 0xbull;
unsigned int myrand(void)
{
myrand_next = myrand_a * myrand_next + myrand_b;
return (unsigned int)(myrand_next >> 16);
}
unsigned long long myrand64(void)
{
return ((unsigned long long)myrand() << 32) | myrand();
}
void myrand_seed(unsigned int seed)
{
int n;
myrand_next = ((unsigned long long)seed << 16) + 0x330eull;
for (n=0; n<100; n++) myrand(); /* warm it up a bit */
}
/* tic: wait for next clock tick and save current tick */
clock_t tictime;
void tic(void)
{
struct tms t;
times(&t);
tictime = t.tms_utime;
do {
times(&t);
} while (tictime == t.tms_utime);
tictime = t.tms_utime;
}
/* toc: report number of ticks since tic() */
long toc(void)
{
struct tms t;
times(&t);
return (long)(t.tms_utime - tictime);
}
__attribute__((noinline))
int bench_orig32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __ffs32_orig(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_orig64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __ffs64_orig(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_new32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __ffs32_new(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_new64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __ffs64_new(i);
i &= i - 1;
}
}
return res;
}
#ifdef ASSEMBLYVERSION
__attribute__((noinline))
int bench_asm32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __ffs32_asm(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_asm64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __ffs64_asm(i);
i &= i - 1;
}
}
return res;
}
#endif
__attribute__((noinline))
int bench_none32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __nothing32(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_none64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __nothing64(i);
i &= i - 1;
}
}
return res;
}
void test(void)
{
unsigned long long int i;
int n, res1, res2;
myrand_seed(0);
for (n=0; n<1000; n++) {
i = myrand64();
while (i) {
res1 = __ffs64_orig(i);
res2 = __ffs64_new(i);
if (res1 != res2) {
printf("%llu %i %i\n", i,
res1, res2);
}
i &= i - 1;
}
}
}
int main(void)
{
long tics;
setvbuf(stdout, NULL, _IONBF, 0);
test();
printf("%-14s", "Original:");
tic(); bench_orig32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_orig64(); tics = toc();
printf("%6lu tics\n", tics);
printf("%-14s", "New:");
tic(); bench_new32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_new64(); tics = toc();
printf("%6lu tics\n", tics);
#ifdef ASSEMBLYVERSION
printf("%-14s", "Assembly:");
tic(); bench_asm32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_asm64(); tics = toc();
printf("%6lu tics\n", tics);
#endif
printf("%-14s", "Empty loop:");
tic(); bench_none32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_none64(); tics = toc();
printf("%6lu tics\n", tics);
return 0;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/