author		Linus Torvalds <torvalds@linux-foundation.org>	2010-01-12 21:16:42 -0500
committer	H. Peter Anvin <hpa@zytor.com>	2010-01-14 01:39:50 -0500
commit		bafaecd11df15ad5b1e598adc7736afcd38ee13d
tree		99b676d1ecc202358fe67acd095aa2c1f1ef2b1f
parent		5d0b7235d83eefdafda300656e97d368afcafc9a
x86-64: support native xadd rwsem implementation
This one is much faster than the spinlock-based fallback rwsem code,
with certain artificial benchmarks having shown 300%+ improvement on
threaded page faults etc.
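For context, the fast path this enables looks roughly like the sketch
below. It is a simplified illustration of the <asm/rwsem.h> conventions
(semaphore pointer passed in %rax, count adjusted with a locked xadd,
out-of-line call only on contention), not the exact header code:

	/*
	 * Sketch of the xadd-based down_write fast path.  The
	 * call_rwsem_down_write_failed stub (added below) saves the
	 * call-clobbered registers the asm does not mark as clobbered.
	 */
	static inline void __down_write(struct rw_semaphore *sem)
	{
		rwsem_count_t tmp = RWSEM_ACTIVE_WRITE_BIAS;

		asm volatile(LOCK_PREFIX "xadd %1,(%2)\n\t" /* old count -> tmp */
			     "test %1,%1\n\t"  /* zero means it was unlocked */
			     "jz 1f\n"
			     "call call_rwsem_down_write_failed\n"
			     "1:"
			     : "+m" (sem->count), "+d" (tmp)
			     : "a" (sem)
			     : "memory", "cc");
	}

The uncontended case is one locked xadd plus a predictable branch, which
is where the win over the spinlock-protected generic count manipulation
comes from.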
Again, note the 32767-thread limit here. So this really does need that
whole "make rwsem_count_t be 64-bit and fix the BIAS values to match"
extension on top of it, but that is conceptually a totally independent
issue.
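For reference, the limit falls out of the packed count layout, where the
active count lives in the low 16 bits (the 32-bit values below are the
classic i386 ones; the 64-bit column is just a sketch of the obvious
widening, not the final extension):

	/* 32-bit count: 16-bit active count => at most 32767 active threads */
	#define RWSEM_UNLOCKED_VALUE		0x00000000
	#define RWSEM_ACTIVE_BIAS		0x00000001
	#define RWSEM_ACTIVE_MASK		0x0000ffff
	#define RWSEM_WAITING_BIAS		(-0x00010000)
	#define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
	#define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

	/* Hypothetical 64-bit rwsem_count_t layout: 32-bit active count */
	#define RWSEM_ACTIVE_MASK_64		0xffffffffL
	#define RWSEM_WAITING_BIAS_64		(-0x0000000100000000L)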
NOT TESTED! The original patch that this all was based on was tested by
KAMEZAWA Hiroyuki, but maybe I screwed up something when I created the
cleaned-up series, so caveat emptor..
Also note that it _may_ be a good idea to mark some more registers
clobbered on x86-64 in the inline asms instead of saving/restoring them.
They are inline functions, but they are only used in places where there
are not a lot of live registers _anyway_, so adding, for example, the
clobbers of %r8-%r11 in the asm wouldn't make the fast-path code any
worse, and would make the slow-path code smaller (a concrete sketch
follows below).
(Not that the slow path really matters to that degree. Saving a few
unnecessary registers is the _least_ of our problems when we hit the slow
path. The instruction/cycle counting really only matters in the fast
path.)
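Concretely, the extended clobber list would look something like the
variant below (illustrative only, reusing the hypothetical __down_write
sketch from above); save_common_regs in rwsem_64.S could then stop
pushing %r8-%r11:

	static inline void __down_write(struct rw_semaphore *sem)
	{
		rwsem_count_t tmp = RWSEM_ACTIVE_WRITE_BIAS;

		asm volatile(LOCK_PREFIX "xadd %1,(%2)\n\t"
			     "test %1,%1\n\t"
			     "jz 1f\n"
			     "call call_rwsem_down_write_failed\n"
			     "1:"
			     : "+m" (sem->count), "+d" (tmp)
			     : "a" (sem)
			     /* extra clobbers: gcc keeps these free across the asm */
			     : "memory", "cc", "r8", "r9", "r10", "r11");
	}

With that, gcc simply avoids keeping values live in %r8-%r11 around the
fast path, and the slow-path stubs shrink by four pushes and four pops.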
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <alpine.LFD.2.00.1001121810410.17145@localhost.localdomain>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
 arch/x86/Kconfig.cpu    |  2 +-
 arch/x86/lib/Makefile   |  1 +
 arch/x86/lib/rwsem_64.S | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+), 1 deletion(-)
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 08e442bc3ab9..9d38a13b4ceb 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -319,7 +319,7 @@ config X86_L1_CACHE_SHIFT
 
 config X86_XADD
 	def_bool y
-	depends on X86_32 && !M386
+	depends on X86_64 || !M386
 
 config X86_PPRO_FENCE
 	bool "PentiumPro memory ordering errata workaround"
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index cffd754f3039..c80245131fdc 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -39,4 +39,5 @@ else
         lib-y += thunk_64.o clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+        lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
 endif
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
new file mode 100644
index 000000000000..15acecf0d7aa
--- /dev/null
+++ b/arch/x86/lib/rwsem_64.S
@@ -0,0 +1,81 @@
+/*
+ * x86-64 rwsem wrappers
+ *
+ * This interfaces the inline asm code to the slow-path
+ * C routines. We need to save the call-clobbered regs
+ * that the asm does not mark as clobbered, and move the
+ * argument from %rax to %rdi.
+ *
+ * NOTE! We don't need to save %rax, because the functions
+ * will always return the semaphore pointer in %rax (which
+ * is also the input argument to these helpers)
+ *
+ * The following can clobber %rdx because the asm clobbers it:
+ *	call_rwsem_down_write_failed
+ *	call_rwsem_wake
+ * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
+ */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+#define save_common_regs \
+	pushq %rdi; \
+	pushq %rsi; \
+	pushq %rcx; \
+	pushq %r8; \
+	pushq %r9; \
+	pushq %r10; \
+	pushq %r11
+
+#define restore_common_regs \
+	popq %r11; \
+	popq %r10; \
+	popq %r9; \
+	popq %r8; \
+	popq %rcx; \
+	popq %rsi; \
+	popq %rdi
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_down_read_failed)
+	save_common_regs
+	pushq %rdx
+	movq %rax,%rdi
+	call rwsem_down_read_failed
+	popq %rdx
+	restore_common_regs
+	ret
+ENDPROC(call_rwsem_down_read_failed)
+
+ENTRY(call_rwsem_down_write_failed)
+	save_common_regs
+	movq %rax,%rdi
+	call rwsem_down_write_failed
+	restore_common_regs
+	ret
+ENDPROC(call_rwsem_down_write_failed)
+
+ENTRY(call_rwsem_wake)
+	decw %dx	/* do nothing if still outstanding active readers */
+	jnz 1f
+	save_common_regs
+	movq %rax,%rdi
+	call rwsem_wake
+	restore_common_regs
+1:	ret
+ENDPROC(call_rwsem_wake)
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_downgrade_wake)
+	save_common_regs
+	pushq %rdx
+	movq %rax,%rdi
+	call rwsem_downgrade_wake
+	popq %rdx
+	restore_common_regs
+	ret
+ENDPROC(call_rwsem_downgrade_wake)
