diff options
author | Jan Beulich <JBeulich@suse.com> | 2012-09-10 07:24:43 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2012-09-13 11:44:01 -0400 |
commit | 5870661c091e827973674cc3469b50c959008c2b (patch) | |
tree | 4043f28631c98fbc03babd7b8b32af788f6f362e | |
parent | 1edfbb4153bd29bcf8d2236676238d5237972be1 (diff) |
x86: Prefer TZCNT over BSF
Following a relatively recent compiler change, make use of the
fact that for non-zero input BSF and TZCNT produce the same
result, and that CPUs not knowing of TZCNT will treat the
instruction as BSF (i.e. ignore what looks like a REP prefix to
them). The assumption here is that TZCNT would never have worse
performance than BSF.
For the moment, only do this when the respective generic-CPU
option is selected (as there are no specific-CPU options
covering the CPUs supporting TZCNT), and don't do that when size
optimization was requested.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/504DEA1B020000780009A277@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- | arch/x86/include/asm/bitops.h | 19 |
1 file changed, 17 insertions, 2 deletions
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index ebaee695394e..b2af6645ea7e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h | |||
@@ -347,6 +347,19 @@ static int test_bit(int nr, const volatile unsigned long *addr); | |||
347 | ? constant_test_bit((nr), (addr)) \ | 347 | ? constant_test_bit((nr), (addr)) \ |
348 | : variable_test_bit((nr), (addr))) | 348 | : variable_test_bit((nr), (addr))) |
349 | 349 | ||
350 | #if (defined(CONFIG_X86_GENERIC) || defined(CONFIG_GENERIC_CPU)) \ | ||
351 | && !defined(CONFIG_CC_OPTIMIZE_FOR_SIZE) | ||
352 | /* | ||
353 | * Since BSF and TZCNT have sufficiently similar semantics for the purposes | ||
354 | * for which we use them here, BMI-capable hardware will decode the prefixed | ||
355 | * variant as 'tzcnt ...' and may execute that faster than 'bsf ...', while | ||
356 | * older hardware will ignore the REP prefix and decode it as 'bsf ...'. | ||
357 | */ | ||
358 | # define BSF_PREFIX "rep;" | ||
359 | #else | ||
360 | # define BSF_PREFIX | ||
361 | #endif | ||
362 | |||
350 | /** | 363 | /** |
351 | * __ffs - find first set bit in word | 364 | * __ffs - find first set bit in word |
352 | * @word: The word to search | 365 | * @word: The word to search |
@@ -355,7 +368,7 @@ static int test_bit(int nr, const volatile unsigned long *addr); | |||
355 | */ | 368 | */ |
356 | static inline unsigned long __ffs(unsigned long word) | 369 | static inline unsigned long __ffs(unsigned long word) |
357 | { | 370 | { |
358 | asm("bsf %1,%0" | 371 | asm(BSF_PREFIX "bsf %1,%0" |
359 | : "=r" (word) | 372 | : "=r" (word) |
360 | : "rm" (word)); | 373 | : "rm" (word)); |
361 | return word; | 374 | return word; |
@@ -369,12 +382,14 @@ static inline unsigned long __ffs(unsigned long word) | |||
369 | */ | 382 | */ |
370 | static inline unsigned long ffz(unsigned long word) | 383 | static inline unsigned long ffz(unsigned long word) |
371 | { | 384 | { |
372 | asm("bsf %1,%0" | 385 | asm(BSF_PREFIX "bsf %1,%0" |
373 | : "=r" (word) | 386 | : "=r" (word) |
374 | : "r" (~word)); | 387 | : "r" (~word)); |
375 | return word; | 388 | return word; |
376 | } | 389 | } |
377 | 390 | ||
391 | #undef BSF_PREFIX | ||
392 | |||
378 | /* | 393 | /* |
379 | * __fls: find last set bit in word | 394 | * __fls: find last set bit in word |
380 | * @word: The word to search | 395 | * @word: The word to search |