diff options
author | Andi Kleen <ak@suse.de> | 2006-09-26 04:52:38 -0400 |
---|---|---|
committer | Andi Kleen <andi@basil.nowhere.org> | 2006-09-26 04:52:38 -0400 |
commit | 0136611c62e8650e354b95c76dff6d2ce6030eff (patch) | |
tree | 9ba66105bccc4d83b84663b8dda7e51962c22a04 | |
parent | 8380aabb99719af583447133f19a4d8074b5c337 (diff) |
[PATCH] optimize hweight64 for x86_64
Based on a patch from David Rientjes <rientjes@google.com>, but
changed by AK.
Optimizes the 64-bit hamming weight for x86_64 processors assuming they
have fast multiplication. Uses five fewer bitops than the generic
hweight64. A benchmark on one EM64T machine showed a ~25% speedup over 2^24
consecutive calls.
Define a new ARCH_HAS_FAST_MULTIPLIER that can be set by other
architectures that can also multiply fast.
Signed-off-by: Andi Kleen <ak@suse.de>
-rw-r--r-- | include/asm-x86_64/bitops.h | 2 | ||||
-rw-r--r-- | lib/hweight.c | 10 |
2 files changed, 10 insertions, 2 deletions
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h index f7ba57b1cc08..5b535eaf5309 100644 --- a/include/asm-x86_64/bitops.h +++ b/include/asm-x86_64/bitops.h | |||
@@ -399,6 +399,8 @@ static __inline__ int fls(int x) | |||
399 | return r+1; | 399 | return r+1; |
400 | } | 400 | } |
401 | 401 | ||
402 | #define ARCH_HAS_FAST_MULTIPLIER 1 | ||
403 | |||
402 | #include <asm-generic/bitops/hweight.h> | 404 | #include <asm-generic/bitops/hweight.h> |
403 | 405 | ||
404 | #endif /* __KERNEL__ */ | 406 | #endif /* __KERNEL__ */ |
diff --git a/lib/hweight.c b/lib/hweight.c index 438257671708..360556a7803d 100644 --- a/lib/hweight.c +++ b/lib/hweight.c | |||
@@ -1,5 +1,6 @@ | |||
1 | #include <linux/module.h> | 1 | #include <linux/module.h> |
2 | #include <asm/types.h> | 2 | #include <asm/types.h> |
3 | #include <asm/bitops.h> | ||
3 | 4 | ||
4 | /** | 5 | /** |
5 | * hweightN - returns the hamming weight of a N-bit word | 6 | * hweightN - returns the hamming weight of a N-bit word |
@@ -40,14 +41,19 @@ unsigned long hweight64(__u64 w) | |||
40 | #if BITS_PER_LONG == 32 | 41 | #if BITS_PER_LONG == 32 |
41 | return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w); | 42 | return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w); |
42 | #elif BITS_PER_LONG == 64 | 43 | #elif BITS_PER_LONG == 64 |
44 | #ifdef ARCH_HAS_FAST_MULTIPLIER | ||
45 | w -= (w >> 1) & 0x5555555555555555ul; | ||
46 | w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul); | ||
47 | w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful; | ||
48 | return (w * 0x0101010101010101ul) >> 56; | ||
49 | #else | ||
43 | __u64 res = w - ((w >> 1) & 0x5555555555555555ul); | 50 | __u64 res = w - ((w >> 1) & 0x5555555555555555ul); |
44 | res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); | 51 | res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); |
45 | res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful; | 52 | res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful; |
46 | res = res + (res >> 8); | 53 | res = res + (res >> 8); |
47 | res = res + (res >> 16); | 54 | res = res + (res >> 16); |
48 | return (res + (res >> 32)) & 0x00000000000000FFul; | 55 | return (res + (res >> 32)) & 0x00000000000000FFul; |
49 | #else | 56 | #endif |
50 | #error BITS_PER_LONG not defined | ||
51 | #endif | 57 | #endif |
52 | } | 58 | } |
53 | EXPORT_SYMBOL(hweight64); | 59 | EXPORT_SYMBOL(hweight64); |