[patch V2] lib: GCD: add binary GCD algorithm

From: zengzhaoxiu
Date: Wed Apr 27 2016 - 04:07:46 EST


From: Zhaoxiu Zeng <zhaoxiu.zeng@xxxxxxxxx>

Because some architectures (alpha, armv6, etc.) don't provide hardware division,
the mod operation is slow! Binary GCD algorithm uses simple arithmetic operations,
it replaces division with arithmetic shifts, comparisons, and subtraction.

I have compiled successfully with x86_64_defconfig and i386_defconfig.

I use the following code to test:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

#define swap(a, b) \
do { \
a ^= b; \
b ^= a; \
a ^= b; \
} while (0)

unsigned long gcd0(unsigned long a, unsigned long b)
{
unsigned long r;

if (a < b) {
swap(a, b);
}

if (b == 0)
return a;

while ((r = a % b) != 0) {
a = b;
b = r;
}

return b;
}

unsigned long gcd1(unsigned long a, unsigned long b)
{
unsigned long r = a | b;

if (!a || !b)
return r;

b >>= __builtin_ctzl(b);
for (;;) {

a >>= __builtin_ctzl(a);
if (a == b)
break;

if (a < b)
swap(a, b);
a -= b;
}

b <<= __builtin_ctzl(r);
return b;
}

unsigned long gcd2(unsigned long a, unsigned long b)
{
unsigned long r = a | b;

if (!a || !b)
return r;

r ^= (r - 1);

while (!(b & r))
b >>= 1;

for (;;) {
while (!(a & r))
a >>= 1;
if (a == b)
break;

if (a < b)
swap(a, b);

a -= b;
a >>= 1;
if (a & r)
a += b;
}

return b;
}

static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
gcd0, gcd1, gcd2,
};

#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))
#define TEST_TIMES 100
static unsigned long res[TEST_ENTRIES][TEST_TIMES];

#define TIME_T struct timespec

static inline struct timespec read_time(void)
{
struct timespec time;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
return time;
}

static uint64_t diff_time(struct timespec start, struct timespec end)
{
struct timespec temp;

if ((end.tv_nsec - start.tv_nsec) < 0) {
temp.tv_sec = end.tv_sec - start.tv_sec - 1;
temp.tv_nsec = 1000000000UL + end.tv_nsec - start.tv_nsec;
} else {
temp.tv_sec = end.tv_sec - start.tv_sec;
temp.tv_nsec = end.tv_nsec - start.tv_nsec;
}

return temp.tv_sec * 1000000000UL + temp.tv_nsec;
}

static inline unsigned long get_rand()
{
if (sizeof(long) == 8)
return (unsigned long)rand() << 32 | rand();
else
return rand();
}

int main(int argc, char **argv)
{
unsigned int seed = time(0);
unsigned int i, j;
TIME_T start, end;

for (i = 0; i < TEST_ENTRIES; i++) {
srand(seed);
start = read_time();
for (j = 0; j < TEST_TIMES; j++)
res[i][j] = gcd_func[i](get_rand(), get_rand());
end = read_time();
printf("gcd%d: elapsed %lld\n", i, diff_time(start, end));
sleep(1);
}

for (j = 0; j < TEST_TIMES; j++) {
for (i = 1; i < TEST_ENTRIES; i++) {
if (res[i][j] != res[0][j])
break;
}
if (i < TEST_ENTRIES) {
fprintf(stderr, "Error: ");
for (i = 0; i < TEST_ENTRIES; i++)
fprintf(stderr, "res%d %ld%s", i, res[i][j], i < TEST_ENTRIES - 1 ? ", " : "\n");
}
}

return 0;
}

Compiled with "-O2", on "VirtualBox 4.2.0-35-generic #40-Ubuntu x86_64" got:

zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 92281
gcd1: elapsed 55005
gcd2: elapsed 91088
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 115546
gcd1: elapsed 55928
gcd2: elapsed 91938
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 91189
gcd1: elapsed 55493
gcd2: elapsed 90078
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 157364
gcd1: elapsed 55204
gcd2: elapsed 90058
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 91667
gcd1: elapsed 54641
gcd2: elapsed 91364

Changes to V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@xxxxxxxxx>
---
lib/gcd.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 72 insertions(+), 9 deletions(-)

diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..460b414 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,19 +2,82 @@
#include <linux/gcd.h>
#include <linux/export.h>

-/* Greatest common divisor */
+/*
+ * use __ffs if the CPU has efficient __ffs
+ */
+#if (defined(CONFIG_ALPHA) && defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)) || \
+ defined(CONFIG_ARC) || \
+ (defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 5) || defined(CONFIG_ARM64) || \
+ defined(CONFIG_AVR32) || \
+ defined(CONFIG_BLACKFIN) || \
+ defined(CONFIG_C6X) || \
+ defined(CONFIG_CRIS) || \
+ defined(CONFIG_FRV) || \
+ defined(CONFIG_HEXAGON) || \
+ defined(CONFIG_IA64) || \
+ (defined(CONFIG_M68K) && \
+ (!defined(CONFIG_CPU_HAS_NO_BITFIELDS) || \
+ ((defined(__mcfisaaplus__) || defined(__mcfisac__)) && \
+ !defined(CONFIG_M68000) && !defined(CONFIG_MCPU32)))) || \
+ defined(CONFIG_MN10300) || \
+ defined(CONFIG_OPENRISC) || \
+ defined(CONFIG_POWERPC) || \
+ defined(CONFIG_S390) || \
+ defined(CONFIG_TILE) || \
+ defined(CONFIG_UNICORE32) || \
+ defined(CONFIG_X86) || \
+ defined(CONFIG_XTENSA)
+# define USE_FFS 1
+#elif defined(CONFIG_MIPS)
+# define USE_FFS (__builtin_constant_p(cpu_has_clo_clz) && cpu_has_clo_clz)
+#else
+# define USE_FFS 0
+#endif
+
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knith has noted, appears a first-century Chinese math text.)
+ */
unsigned long gcd(unsigned long a, unsigned long b)
{
- unsigned long r;
+ unsigned long r = a | b;
+
+ if (!a || !b)
+ return r;
+
+ if (USE_FFS) {
+ b >>= __ffs(b);
+ } else {
+ /* least-significant mask, equ to "(1UL << ffs(r)) - 1" */
+ r ^= (r - 1);
+
+ while (!(b & r))
+ b >>= 1;
+ }
+
+ for (;;) {
+ if (USE_FFS) {
+ a >>= __ffs(a);
+ } else {
+ while (!(a & r))
+ a >>= 1;
+ }
+ if (a == b)
+ break;

- if (a < b)
- swap(a, b);
+ if (a < b)
+ swap(a, b);
+
+ a -= b;
+ if (!USE_FFS) {
+ a >>= 1;
+ if (a & r)
+ a += b;
+ }
+ }

- if (!b)
- return a;
- while ((r = a % b) != 0) {
- a = b;
- b = r;
+ if (USE_FFS) {
+ b <<= __ffs(r);
}
return b;
}
--
2.5.0