aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Andi Kleen <ak@suse.de> 2006-09-26 04:52:38 -0400
committer Andi Kleen <andi@basil.nowhere.org> 2006-09-26 04:52:38 -0400
commit 0136611c62e8650e354b95c76dff6d2ce6030eff (patch)
tree 9ba66105bccc4d83b84663b8dda7e51962c22a04
parent 8380aabb99719af583447133f19a4d8074b5c337 (diff)
[PATCH] optimize hweight64 for x86_64
Based on a patch from David Rientjes <rientjes@google.com>, but changed by AK. This optimizes the 64-bit Hamming weight for x86_64 processors, assuming they have fast multiplication. It uses five fewer bitops than the generic hweight64. A benchmark on one EM64T machine showed a ~25% speedup over 2^24 consecutive calls. It also defines a new macro, ARCH_HAS_FAST_MULTIPLIER, that can be set by other architectures that can also multiply fast. Signed-off-by: Andi Kleen <ak@suse.de>
-rw-r--r-- include/asm-x86_64/bitops.h 2
-rw-r--r-- lib/hweight.c 10
2 files changed, 10 insertions, 2 deletions
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index f7ba57b1cc08..5b535eaf5309 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -399,6 +399,8 @@ static __inline__ int fls(int x)
399 return r+1; 399 return r+1;
400} 400}
401 401
402#define ARCH_HAS_FAST_MULTIPLIER 1
403
402#include <asm-generic/bitops/hweight.h> 404#include <asm-generic/bitops/hweight.h>
403 405
404#endif /* __KERNEL__ */ 406#endif /* __KERNEL__ */
diff --git a/lib/hweight.c b/lib/hweight.c
index 438257671708..360556a7803d 100644
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -1,5 +1,6 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <asm/types.h> 2#include <asm/types.h>
3#include <asm/bitops.h>
3 4
4/** 5/**
5 * hweightN - returns the hamming weight of a N-bit word 6 * hweightN - returns the hamming weight of a N-bit word
@@ -40,14 +41,19 @@ unsigned long hweight64(__u64 w)
40#if BITS_PER_LONG == 32 41#if BITS_PER_LONG == 32
41 return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w); 42 return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w);
42#elif BITS_PER_LONG == 64 43#elif BITS_PER_LONG == 64
44#ifdef ARCH_HAS_FAST_MULTIPLIER
45 w -= (w >> 1) & 0x5555555555555555ul;
46 w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul);
47 w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful;
48 return (w * 0x0101010101010101ul) >> 56;
49#else
43 __u64 res = w - ((w >> 1) & 0x5555555555555555ul); 50 __u64 res = w - ((w >> 1) & 0x5555555555555555ul);
44 res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); 51 res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
45 res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful; 52 res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful;
46 res = res + (res >> 8); 53 res = res + (res >> 8);
47 res = res + (res >> 16); 54 res = res + (res >> 16);
48 return (res + (res >> 32)) & 0x00000000000000FFul; 55 return (res + (res >> 32)) & 0x00000000000000FFul;
49#else 56#endif
50#error BITS_PER_LONG not defined
51#endif 57#endif
52} 58}
53EXPORT_SYMBOL(hweight64); 59EXPORT_SYMBOL(hweight64);