path: root/arch/x86/lib
author    Borislav Petkov <bp@suse.de>    2015-01-12 12:19:40 -0500
committer Borislav Petkov <bp@suse.de>    2015-02-23 07:44:12 -0500
commit    090a3f615524c3f75d09fdb37f15ea1868d79f7e (patch)
tree      d349bcdaab0b39049169de5a0844ddd22f548056 /arch/x86/lib
parent    4fd4b6e5537cec5b56db0b22546dd439ebb26830 (diff)
x86/lib/copy_page_64.S: Use generic ALTERNATIVE macro
... instead of the semi-version with the spelled-out sections.

What is more, make the REP_GOOD version be the default copy_page()
version, as the majority of the relevant x86 CPUs do set
X86_FEATURE_REP_GOOD. Thus, copy_page gets compiled to:

  ffffffff8130af80 <copy_page>:
  ffffffff8130af80:  e9 0b 00 00 00     jmpq   ffffffff8130af90 <copy_page_regs>
  ffffffff8130af85:  b9 00 02 00 00     mov    $0x200,%ecx
  ffffffff8130af8a:  f3 48 a5           rep movsq %ds:(%rsi),%es:(%rdi)
  ffffffff8130af8d:  c3                 retq
  ffffffff8130af8e:  66 90              xchg   %ax,%ax

  ffffffff8130af90 <copy_page_regs>:
  ...

and after the alternatives have run, the JMP to the old, unrolled
version gets NOPed out:

  ffffffff8130af80 <copy_page>:
  ffffffff8130af80:  66 66 90           xchg   %ax,%ax
  ffffffff8130af83:  66 90              xchg   %ax,%ax
  ffffffff8130af85:  b9 00 02 00 00     mov    $0x200,%ecx
  ffffffff8130af8a:  f3 48 a5           rep movsq %ds:(%rsi),%es:(%rdi)
  ffffffff8130af8d:  c3                 retq

On modern uarches, those NOPs are cheaper than the unconditional JMP
used previously.

Signed-off-by: Borislav Petkov <bp@suse.de>
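For context, a simplified sketch of what the generic macro does. This is
not the kernel's exact expansion: the real macro in
arch/x86/include/asm/alternative-asm.h emits additional descriptor fields
(for instance, an instruction-padding length in this kernel series) and
its layout varies between kernel versions. Conceptually, the default
instruction stays inline, the (here empty) replacement goes into
.altinstr_replacement, and a descriptor in .altinstructions tells
apply_alternatives() what to patch at boot:

  /* Simplified sketch of
   * ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD */
  140:
  	jmp copy_page_regs		/* oldinstr: runs when REP_GOOD is not set */
  141:
  	.section .altinstr_replacement, "ax"
  143:
  	/* empty replacement: the JMP above is NOPed out when REP_GOOD is set */
  144:
  	.previous
  	.section .altinstructions, "a"
  	.long 140b - .			/* offset of the original instruction */
  	.long 143b - .			/* offset of the replacement */
  	.word X86_FEATURE_REP_GOOD	/* feature bit that selects patching */
  	.byte 141b - 140b		/* length of the original */
  	.byte 144b - 143b		/* length of the replacement */
  	.previous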
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--  arch/x86/lib/copy_page_64.S  37
1 file changed, 12 insertions(+), 25 deletions(-)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index f1ffdbb07755..8239dbcbf984 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use this when possible. But, don't use streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
+ * prefetch distance based on SMP/UP.
+ */
 	ALIGN
-copy_page_rep:
+ENTRY(copy_page)
 	CFI_STARTPROC
+	ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
 	movl	$4096/8, %ecx
 	rep	movsq
 	ret
 	CFI_ENDPROC
-ENDPROC(copy_page_rep)
-
-/*
- * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
- * Could vary the prefetch distance based on SMP/UP.
- */
+ENDPROC(copy_page)
 
-ENTRY(copy_page)
+ENTRY(copy_page_regs)
 	CFI_STARTPROC
 	subq	$2*8, %rsp
 	CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
 	addq	$2*8, %rsp
 	CFI_ADJUST_CFA_OFFSET -2*8
 	ret
-.Lcopy_page_end:
 	CFI_ENDPROC
-ENDPROC(copy_page)
-
-	/* Some CPUs run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-	.section .altinstr_replacement,"ax"
-1:	.byte 0xeb					/* jmp <disp8> */
-	.byte (copy_page_rep - copy_page) - (2f - 1b)	/* offset */
-2:
-	.previous
-	.section .altinstructions,"a"
-	altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,	\
-	.Lcopy_page_end-copy_page, 2b-1b, 0
-	.previous
+ENDPROC(copy_page_regs)
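The conversion pattern generalizes to other feature-gated routines that
still open-code the two sections by hand. A hypothetical sketch under the
same conventions (foo, foo_fallback and X86_FEATURE_XYZ are made-up names
for illustration, not existing kernel symbols):

  	ALIGN
  ENTRY(foo)
  	CFI_STARTPROC
  	/* The JMP to the out-of-line fallback is the default; on CPUs
  	 * that set X86_FEATURE_XYZ it is NOPed out at boot and the fast
  	 * body below runs instead. */
  	ALTERNATIVE "jmp foo_fallback", "", X86_FEATURE_XYZ
  	/* fast-path instructions for X86_FEATURE_XYZ CPUs */
  	ret
  	CFI_ENDPROC
  ENDPROC(foo)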