aboutsummaryrefslogtreecommitdiffstats
path: root/arch/m68k
diff options
context:
space:
mode:
authorZhaoxiu Zeng <zhaoxiu.zeng@gmail.com>2016-05-20 20:03:57 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-05-20 20:58:30 -0400
commitfff7fb0b2d908dec779783d8eaf3d7725230f75e (patch)
tree907c5d69832b2f05c3c56272d8b6e87114e9a5f4 /arch/m68k
parent3bcadd6fa6c4fd07ace3626357c824eb532488a6 (diff)
lib/GCD.c: use binary GCD algorithm instead of Euclidean
The binary GCD algorithm is based on the following facts: 1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2) 2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b) 3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b) Even on x86 machines with reasonable division hardware, the binary algorithm runs about 25% faster (80% the execution time) than the division-based Euclidian algorithm. On platforms like Alpha and ARMv6 where division is a function call to emulation code, it's even more significant. There are two variants of the code here, depending on whether a fast __ffs (find least significant set bit) instruction is available. This allows the unpredictable branches in the bit-at-a-time shifting loop to be eliminated. If fast __ffs is not available, the "even/odd" GCD variant is used. I use the following code to benchmark: #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <string.h> #include <time.h> #include <unistd.h> #define swap(a, b) \ do { \ a ^= b; \ b ^= a; \ a ^= b; \ } while (0) unsigned long gcd0(unsigned long a, unsigned long b) { unsigned long r; if (a < b) { swap(a, b); } if (b == 0) return a; while ((r = a % b) != 0) { a = b; b = r; } return b; } unsigned long gcd1(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; b >>= __builtin_ctzl(b); for (;;) { a >>= __builtin_ctzl(a); if (a == b) return a << __builtin_ctzl(r); if (a < b) swap(a, b); a -= b; } } unsigned long gcd2(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; r &= -r; while (!(b & r)) b >>= 1; for (;;) { while (!(a & r)) a >>= 1; if (a == b) return a; if (a < b) swap(a, b); a -= b; a >>= 1; if (a & r) a += b; a >>= 1; } } unsigned long gcd3(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; b >>= __builtin_ctzl(b); if (b == 1) return r & -r; for (;;) { a >>= __builtin_ctzl(a); if (a == 1) return r & -r; if (a == b) return a << __builtin_ctzl(r); if (a < b) swap(a, b); a -= b; } } unsigned long gcd4(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; r &= -r; while (!(b & r)) b >>= 1; if (b == r) return r; for (;;) { while (!(a & r)) a >>= 1; if (a == r) return r; if (a == b) return a; if (a < b) swap(a, b); a -= b; a >>= 1; if (a & r) a += b; a >>= 1; } } static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = { gcd0, gcd1, gcd2, gcd3, gcd4, }; #define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0])) #if defined(__x86_64__) #define rdtscll(val) do { \ unsigned long __a,__d; \ __asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \ (val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \ } while(0) static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long), unsigned long a, unsigned long b, unsigned long *res) { unsigned long long start, end; unsigned long long ret; unsigned long gcd_res; rdtscll(start); gcd_res = gcd(a, b); rdtscll(end); if (end >= start) ret = end - start; else ret = ~0ULL - start + 1 + end; *res = gcd_res; return ret; } #else static inline struct timespec read_time(void) { struct timespec time; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time); return time; } static inline unsigned long long diff_time(struct timespec start, struct timespec end) { struct timespec temp; if ((end.tv_nsec - start.tv_nsec) < 0) { temp.tv_sec = end.tv_sec - start.tv_sec - 1; temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec; } else { temp.tv_sec = end.tv_sec - start.tv_sec; temp.tv_nsec = end.tv_nsec - start.tv_nsec; } return temp.tv_sec * 1000000000ULL + temp.tv_nsec; } static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long), unsigned long a, unsigned long b, unsigned long *res) { struct timespec start, end; unsigned long gcd_res; start = read_time(); gcd_res = gcd(a, b); end = read_time(); *res = gcd_res; return diff_time(start, end); } #endif static inline unsigned long get_rand() { if (sizeof(long) == 8) return (unsigned long)rand() << 32 | rand(); else return rand(); } int main(int argc, char **argv) { unsigned int seed = time(0); int loops = 100; int repeats = 1000; unsigned long (*res)[TEST_ENTRIES]; unsigned long long elapsed[TEST_ENTRIES]; int i, j, k; for (;;) { int opt = getopt(argc, argv, "n:r:s:"); /* End condition always first */ if (opt == -1) break; switch (opt) { case 'n': loops = atoi(optarg); break; case 'r': repeats = atoi(optarg); break; case 's': seed = strtoul(optarg, NULL, 10); break; default: /* You won't actually get here. */ break; } } res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops); memset(elapsed, 0, sizeof(elapsed)); srand(seed); for (j = 0; j < loops; j++) { unsigned long a = get_rand(); /* Do we have args? */ unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand(); unsigned long long min_elapsed[TEST_ENTRIES]; for (k = 0; k < repeats; k++) { for (i = 0; i < TEST_ENTRIES; i++) { unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]); if (k == 0 || min_elapsed[i] > tmp) min_elapsed[i] = tmp; } } for (i = 0; i < TEST_ENTRIES; i++) elapsed[i] += min_elapsed[i]; } for (i = 0; i < TEST_ENTRIES; i++) printf("gcd%d: elapsed %llu\n", i, elapsed[i]); k = 0; srand(seed); for (j = 0; j < loops; j++) { unsigned long a = get_rand(); unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand(); for (i = 1; i < TEST_ENTRIES; i++) { if (res[j][i] != res[j][0]) break; } if (i < TEST_ENTRIES) { if (k == 0) { k = 1; fprintf(stderr, "Error:\n"); } fprintf(stderr, "gcd(%lu, %lu): ", a, b); for (i = 0; i < TEST_ENTRIES; i++) fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n"); } } if (k == 0) fprintf(stderr, "PASS\n"); free(res); return 0; } Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got: zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10 gcd0: elapsed 10174 gcd1: elapsed 2120 gcd2: elapsed 2902 gcd3: elapsed 2039 gcd4: elapsed 2812 PASS zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10 gcd0: elapsed 9309 gcd1: elapsed 2280 gcd2: elapsed 2822 gcd3: elapsed 2217 gcd4: elapsed 2710 PASS zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10 gcd0: elapsed 9589 gcd1: elapsed 2098 gcd2: elapsed 2815 gcd3: elapsed 2030 gcd4: elapsed 2718 PASS zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10 gcd0: elapsed 9914 gcd1: elapsed 2309 gcd2: elapsed 2779 gcd3: elapsed 2228 gcd4: elapsed 2709 PASS [akpm@linux-foundation.org: avoid #defining a CONFIG_ variable] Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com> Signed-off-by: George Spelvin <linux@horizon.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/m68k')
-rw-r--r--arch/m68k/Kconfig.cpu11
1 files changed, 11 insertions, 0 deletions
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index c1beb5ae181f..8ace920ca24a 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -40,6 +40,7 @@ config M68000
40 select CPU_HAS_NO_MULDIV64 40 select CPU_HAS_NO_MULDIV64
41 select CPU_HAS_NO_UNALIGNED 41 select CPU_HAS_NO_UNALIGNED
42 select GENERIC_CSUM 42 select GENERIC_CSUM
43 select CPU_NO_EFFICIENT_FFS
43 help 44 help
44 The Freescale (was Motorola) 68000 CPU is the first generation of 45 The Freescale (was Motorola) 68000 CPU is the first generation of
45 the well known M68K family of processors. The CPU core as well as 46 the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@ config MCPU32
51 bool 52 bool
52 select CPU_HAS_NO_BITFIELDS 53 select CPU_HAS_NO_BITFIELDS
53 select CPU_HAS_NO_UNALIGNED 54 select CPU_HAS_NO_UNALIGNED
55 select CPU_NO_EFFICIENT_FFS
54 help 56 help
55 The Freescale (was then Motorola) CPU32 is a CPU core that is 57 The Freescale (was then Motorola) CPU32 is a CPU core that is
56 based on the 68020 processor. For the most part it is used in 58 based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@ config M5206
130 depends on !MMU 132 depends on !MMU
131 select COLDFIRE_SW_A7 133 select COLDFIRE_SW_A7
132 select HAVE_MBAR 134 select HAVE_MBAR
135 select CPU_NO_EFFICIENT_FFS
133 help 136 help
134 Motorola ColdFire 5206 processor support. 137 Motorola ColdFire 5206 processor support.
135 138
@@ -138,6 +141,7 @@ config M5206e
138 depends on !MMU 141 depends on !MMU
139 select COLDFIRE_SW_A7 142 select COLDFIRE_SW_A7
140 select HAVE_MBAR 143 select HAVE_MBAR
144 select CPU_NO_EFFICIENT_FFS
141 help 145 help
142 Motorola ColdFire 5206e processor support. 146 Motorola ColdFire 5206e processor support.
143 147
@@ -163,6 +167,7 @@ config M5249
163 depends on !MMU 167 depends on !MMU
164 select COLDFIRE_SW_A7 168 select COLDFIRE_SW_A7
165 select HAVE_MBAR 169 select HAVE_MBAR
170 select CPU_NO_EFFICIENT_FFS
166 help 171 help
167 Motorola ColdFire 5249 processor support. 172 Motorola ColdFire 5249 processor support.
168 173
@@ -171,6 +176,7 @@ config M525x
171 depends on !MMU 176 depends on !MMU
172 select COLDFIRE_SW_A7 177 select COLDFIRE_SW_A7
173 select HAVE_MBAR 178 select HAVE_MBAR
179 select CPU_NO_EFFICIENT_FFS
174 help 180 help
175 Freescale (Motorola) Coldfire 5251/5253 processor support. 181 Freescale (Motorola) Coldfire 5251/5253 processor support.
176 182
@@ -189,6 +195,7 @@ config M5272
189 depends on !MMU 195 depends on !MMU
190 select COLDFIRE_SW_A7 196 select COLDFIRE_SW_A7
191 select HAVE_MBAR 197 select HAVE_MBAR
198 select CPU_NO_EFFICIENT_FFS
192 help 199 help
193 Motorola ColdFire 5272 processor support. 200 Motorola ColdFire 5272 processor support.
194 201
@@ -217,6 +224,7 @@ config M5307
217 select COLDFIRE_SW_A7 224 select COLDFIRE_SW_A7
218 select HAVE_CACHE_CB 225 select HAVE_CACHE_CB
219 select HAVE_MBAR 226 select HAVE_MBAR
227 select CPU_NO_EFFICIENT_FFS
220 help 228 help
221 Motorola ColdFire 5307 processor support. 229 Motorola ColdFire 5307 processor support.
222 230
@@ -242,6 +250,7 @@ config M5407
242 select COLDFIRE_SW_A7 250 select COLDFIRE_SW_A7
243 select HAVE_CACHE_CB 251 select HAVE_CACHE_CB
244 select HAVE_MBAR 252 select HAVE_MBAR
253 select CPU_NO_EFFICIENT_FFS
245 help 254 help
246 Motorola ColdFire 5407 processor support. 255 Motorola ColdFire 5407 processor support.
247 256
@@ -251,6 +260,7 @@ config M547x
251 select MMU_COLDFIRE if MMU 260 select MMU_COLDFIRE if MMU
252 select HAVE_CACHE_CB 261 select HAVE_CACHE_CB
253 select HAVE_MBAR 262 select HAVE_MBAR
263 select CPU_NO_EFFICIENT_FFS
254 help 264 help
255 Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support. 265 Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
256 266
@@ -260,6 +270,7 @@ config M548x
260 select M54xx 270 select M54xx
261 select HAVE_CACHE_CB 271 select HAVE_CACHE_CB
262 select HAVE_MBAR 272 select HAVE_MBAR
273 select CPU_NO_EFFICIENT_FFS
263 help 274 help
264 Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support. 275 Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
265 276