author		Ingo Molnar <mingo@elte.hu>	2011-05-18 14:59:27 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-05-18 14:59:30 -0400
commit		01ed58abec07633791f03684b937a7e22e00c9bb (patch)
tree		7bb5b60c102aa08e404928ffcb3edf1e8404b5a2
parent		af2d03d4aaa847ef41a229dfee098a47908437c6 (diff)
parent		26afb7c661080ae3f1f13ddf7f0c58c4f931c22b (diff)
Merge branch 'x86/mem' into perf/core
Merge reason: memcpy_64.S changes an assumption perf bench has, so merge this
here so we can fix it.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--	arch/x86/include/asm/alternative-asm.h	  9
-rw-r--r--	arch/x86/include/asm/cpufeature.h	  1
-rw-r--r--	arch/x86/include/asm/uaccess.h		  2
-rw-r--r--	arch/x86/kernel/alternative.c		  9
-rw-r--r--	arch/x86/kernel/cpu/common.c		  3
-rw-r--r--	arch/x86/kernel/cpu/intel.c		 19
-rw-r--r--	arch/x86/lib/clear_page_64.S		 33
-rw-r--r--	arch/x86/lib/copy_user_64.S		 69
-rw-r--r--	arch/x86/lib/memcpy_64.S		 45
-rw-r--r--	arch/x86/lib/memmove_64.S		 29
-rw-r--r--	arch/x86/lib/memset_64.S		 54
11 files changed, 219 insertions(+), 54 deletions(-)
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index a63a68be1cce..94d420b360d1 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -15,4 +15,13 @@
 .endm
 #endif
 
+.macro altinstruction_entry orig alt feature orig_len alt_len
+	.align 8
+	.quad \orig
+	.quad \alt
+	.word \feature
+	.byte \orig_len
+	.byte \alt_len
+.endm
+
 #endif /* __ASSEMBLY__ */
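
The record this macro emits mirrors the C-side struct that apply_alternatives() walks. A minimal sketch of the layout, with field names assumed from the era's <asm/alternative.h> (the kernel uses real pointers where fixed-width integers are shown here):

```c
#include <stdint.h>

/* One entry per altinstruction_entry invocation, 8-byte aligned: */
struct alt_instr {
	uint64_t instr;          /* .quad \orig     - address of original code  */
	uint64_t replacement;    /* .quad \alt      - address of replacement    */
	uint16_t cpuid;          /* .word \feature  - X86_FEATURE_* bit number  */
	uint8_t  instrlen;       /* .byte \orig_len - length of original code   */
	uint8_t  replacementlen; /* .byte \alt_len  - must be <= instrlen       */
};
```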
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 91f3e087cf21..7f2f7b123293 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -195,6 +195,7 @@
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions */
+#define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index abd3e0ea762a..99f0ad753f32 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -42,7 +42,7 @@
  * Returns 0 if the range is valid, nonzero otherwise.
  *
  * This is equivalent to the following test:
- * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
+ * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
  *
  * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
  */
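
In C terms, the test the comment describes can be sketched with an explicit carry check; the wraparound case is the `jc` in the assembly, and the patch's `jae`-to-`ja` change matches the comment's `>=`-to-`>`. A hedged illustration (using the GCC/clang overflow builtin), not the kernel's code:

```c
#include <stdint.h>

/* Bad if addr + size wraps (the carry the comment mentions) or if the
 * one-past-the-end address exceeds the segment limit. */
static int range_not_ok(uint64_t addr, uint64_t size, uint64_t limit)
{
	uint64_t end;

	if (__builtin_add_overflow(addr, size, &end))
		return 1;	/* carry set: jc bad_*_user */
	return end > limit;	/* strictly above: ja bad_*_user */
}
```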
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 651454b0c811..1eeeafcb4410 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -210,6 +210,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 	u8 insnbuf[MAX_PATCH_LEN];
 
 	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+	/*
+	 * The scan order should be from start to end. An alternative
+	 * scanned later can overwrite code patched by one scanned
+	 * earlier. Some kernel functions (e.g. memcpy, memset) rely on
+	 * this order to patch their code.
+	 *
+	 * So be careful if you want to change the scan order to any
+	 * other order.
+	 */
 	for (a = start; a < end; a++) {
 		u8 *instr = a->instr;
 		BUG_ON(a->replacementlen > a->instrlen);
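
A sketch of why table order matters, reusing the struct alt_instr sketch shown after the alternative-asm.h hunk (the has_feature() helper is hypothetical): entries are applied first to last, so when both REP_GOOD and ERMS are set, the ERMS replacement, listed second, is patched in last and wins.

```c
#include <stdint.h>
#include <string.h>

/* Simplified model of the scan loop: later matching entries overwrite
 * the bytes patched by earlier ones (NOP padding of the tail elided). */
static void apply_alternatives_model(struct alt_instr *start,
				     struct alt_instr *end,
				     int (*has_feature)(unsigned int))
{
	struct alt_instr *a;

	for (a = start; a < end; a++) {
		if (!has_feature(a->cpuid))
			continue;
		memcpy((void *)(uintptr_t)a->instr,
		       (const void *)(uintptr_t)a->replacement,
		       a->replacementlen);
	}
}
```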
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2ced0074a45..173f3a3fa1a6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -565,8 +565,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 
 		cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
 
-		if (eax > 0)
-			c->x86_capability[9] = ebx;
+		c->x86_capability[9] = ebx;
 	}
 
 	/* AMD-defined flags: level 0x80000001 */
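
Word 9 of x86_capability is simply the EBX output of CPUID leaf 7, subleaf 0, so X86_FEATURE_ERMS (9*32+9) is EBX bit 9. The same leaf can be probed from user space; a runnable check using GCC's <cpuid.h>:

```c
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID leaf 7, subleaf 0: structured extended feature flags */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("FSGSBASE (ebx bit 0): %u\n", (ebx >> 0) & 1);
	printf("ERMS     (ebx bit 9): %u\n", (ebx >> 9) & 1);
	return 0;
}
```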
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index df86bc8c859d..fc73a34ba8c9 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
 
 static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 {
+	u64 misc_enable;
+
 	/* Unmask CPUID levels if masked: */
 	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
-		u64 misc_enable;
-
 		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
 
 		if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 	 * (model 2) with the same problem.
 	 */
 	if (c->x86 == 15) {
-		u64 misc_enable;
-
 		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
 
 		if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 		}
 	}
 #endif
+
+	/*
+	 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
+	 * clear the fast string and enhanced fast string CPU capabilities.
+	 */
+	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+		if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+			printk(KERN_INFO "Disabled fast string operations\n");
+			setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+			setup_clear_cpu_cap(X86_FEATURE_ERMS);
+		}
+	}
 }
 
 #ifdef CONFIG_X86_32
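
IA32_MISC_ENABLE is MSR 0x1a0 and the fast-string enable is its bit 0, so the check the patch performs can be reproduced from user space with the msr driver loaded (requires root; an illustrative sketch, assuming /dev/cpu/0/msr exists):

```c
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t misc_enable;
	int fd = open("/dev/cpu/0/msr", O_RDONLY); /* needs the msr module */

	/* The msr driver uses the file offset as the MSR number. */
	if (fd < 0 || pread(fd, &misc_enable, 8, 0x1a0) != 8) {
		perror("MSR read");
		return 1;
	}
	printf("fast string operations: %s\n",
	       (misc_enable & 1) ? "enabled" : "disabled");
	close(fd);
	return 0;
}
```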
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24a..f2145cfa12a6 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
 	CFI_ENDPROC
 ENDPROC(clear_page_c)
 
+ENTRY(clear_page_c_e)
+	CFI_STARTPROC
+	movl $4096,%ecx
+	xorl %eax,%eax
+	rep stosb
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_c_e)
+
 ENTRY(clear_page)
 	CFI_STARTPROC
 	xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
 .Lclear_page_end:
 ENDPROC(clear_page)
 
-/* Some CPUs run faster using the string instructions.
-   It is also a lot simpler. Use this when possible */
+/*
+ * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+ * It is recommended to use these when possible.
+ * If enhanced REP MOVSB/STOSB is not available, fall back to the
+ * fast-string version; otherwise, use the original function.
+ *
+ */
 
 #include <asm/cpufeature.h>
 
 .section .altinstr_replacement,"ax"
 1: .byte 0xeb					/* jmp <disp8> */
    .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
-2:
+2: .byte 0xeb					/* jmp <disp8> */
+   .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
+3:
 .previous
 .section .altinstructions,"a"
-	.align 8
-	.quad clear_page
-	.quad 1b
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lclear_page_end - clear_page
-	.byte 2b - 1b
+	altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+			     .Lclear_page_end-clear_page, 2b-1b
+	altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,	\
+			     .Lclear_page_end-clear_page,3b-2b
.previous
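
The two-byte stubs above encode `jmp rel8`: opcode 0xeb plus a signed displacement counted from the end of the instruction. That is why each offset expression subtracts the stub's own length (2f - 1b, then 3f - 2b) after the stub is copied over the start of clear_page. A sketch of the arithmetic:

```c
#include <stdint.h>

/* Emit "jmp target" at 'at'; rel8 is measured from the byte after the
 * 2-byte instruction, so rel8 = target - (at + 2). */
static void emit_jmp8(uint8_t *at, const uint8_t *target)
{
	at[0] = 0xeb;				/* jmp rel8 opcode */
	at[1] = (uint8_t)(target - (at + 2));	/* signed displacement */
}
```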
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 99e482615195..024840266ba0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
-	.macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in the altinstructions section,
+ * we logically implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
 0:
 	.byte 0xe9	/* 32bit jump */
 	.long \orig-1f	/* by default jump to orig */
 1:
 	.section .altinstr_replacement,"ax"
 2:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt-1b /* offset */	/* or alternatively to alt */
+	.long \alt1-1b /* offset */	/* or alternatively to alt1 */
+3:	.byte 0xe9			/* near jump with 32bit immediate */
+	.long \alt2-1b /* offset */	/* or alternatively to alt2 */
 	.previous
+
 	.section .altinstructions,"a"
-	.align 8
-	.quad 0b
-	.quad 2b
-	.word \feature			/* when feature is set */
-	.byte 5
-	.byte 5
+	altinstruction_entry 0b,2b,\feature1,5,5
+	altinstruction_entry 0b,3b,\feature2,5,5
 	.previous
 	.endm
 
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
 	addq %rdx,%rcx
 	jc bad_to_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_to_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_to_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
 	addq %rdx,%rcx
 	jc bad_from_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_from_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_from_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
 	.previous
 	CFI_ENDPROC
 ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs provide enhanced REP MOVSB/STOSB instructions.
+ * It is recommended to use enhanced REP MOVSB/STOSB if it is enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+	CFI_STARTPROC
+	andl %edx,%edx
+	jz 2f
+	movl %edx,%ecx
+1:	rep
+	movsb
+2:	xorl %eax,%eax
+	ret
+
+	.section .fixup,"ax"
+12:	movl %ecx,%edx		/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
+
+	.section __ex_table,"a"
+	.align 8
+	.quad 1b,12b
+	.previous
+	CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)
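
The `.quad 1b,12b` pair in __ex_table is what lets the `rep movsb` touch user memory safely: if the instruction at the first address faults, the fault handler resumes at the second, which hands the remaining count to copy_user_handle_tail. Schematically (field names follow the kernel's struct of this era, which stored absolute addresses):

```c
/* One __ex_table record, as laid out by ".align 8; .quad 1b,12b": */
struct exception_table_entry {
	unsigned long insn;	/* 1b:  the rep movsb that may fault    */
	unsigned long fixup;	/* 12b: recovery code for a user fault  */
};
```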
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e35e38..daab21dae2d1 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * memcpy - Copy a memory block.
@@ -37,6 +38,23 @@
 .Lmemcpy_e:
 	.previous
 
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	rep movsb
+	ret
+.Lmemcpy_e_e:
+	.previous
+
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
@@ -171,21 +189,22 @@ ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
 	/*
-	 * Some CPUs run faster using the string copy instructions.
-	 * It is also a lot simpler. Use this when possible:
-	 */
-
-	.section .altinstructions, "a"
-	.align 8
-	.quad memcpy
-	.quad .Lmemcpy_c
-	.word X86_FEATURE_REP_GOOD
-
-	/*
+	 * Some CPUs provide the enhanced REP MOVSB/STOSB feature.
+	 * If it is supported, memcpy_c_e() is the first choice.
+	 * If enhanced rep movsb is not available, use the fast string
+	 * copy memcpy_c() when possible: it is faster and its code is
+	 * simpler than the original memcpy().
+	 * Otherwise, the original memcpy() is used.
+	 * In the .altinstructions section, the ERMS entry is placed after
+	 * the REP_GOOD entry to implement the right patch order.
+	 *
 	 * Replace only beginning, memcpy is used to apply alternatives,
 	 * so it is silly to overwrite itself with nops - reboot is the
 	 * only outcome...
 	 */
-	.byte .Lmemcpy_e - .Lmemcpy_c
-	.byte .Lmemcpy_e - .Lmemcpy_c
+	.section .altinstructions, "a"
+	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 	.previous
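
What memcpy_c_e boils down to, written as freestanding C with inline assembly; a sketch of the ERMS idea, not the kernel's symbol:

```c
#include <stddef.h>

/* On ERMS hardware a single rep movsb is the whole copy loop:
 * microcode picks the internal chunk size and alignment strategy. */
static void *erms_memcpy(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	__asm__ volatile("rep movsb"
			 : "+D"(dst), "+S"(src), "+c"(n)
			 :
			 : "memory");
	return ret;
}
```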
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0ecb8433e5a8..d0ec9c2936d7 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,7 @@
 #define _STRING_C
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 
 #undef memmove
 
@@ -24,6 +25,7 @@
  */
 ENTRY(memmove)
 	CFI_STARTPROC
+
 	/* Handle more 32bytes in loop */
 	mov %rdi, %rax
 	cmp $0x20, %rdx
@@ -31,8 +33,13 @@ ENTRY(memmove)
 
 	/* Decide forward/backward copy mode */
 	cmp %rdi, %rsi
-	jb 2f
+	jge .Lmemmove_begin_forward
+	mov %rsi, %r8
+	add %rdx, %r8
+	cmp %rdi, %r8
+	jg 2f
 
+.Lmemmove_begin_forward:
 	/*
 	 * The movsq instruction has a high startup latency,
 	 * so we handle small sizes with general registers.
@@ -78,6 +85,8 @@ ENTRY(memmove)
 	rep movsq
 	movq %r11, (%r10)
 	jmp 13f
+.Lmemmove_end_forward:
+
 	/*
 	 * Handle data backward by movsq.
 	 */
@@ -194,4 +203,22 @@ ENTRY(memmove)
 13:
 	retq
 	CFI_ENDPROC
+
+	.section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+	/* Forward moving data. */
+	movq %rdx, %rcx
+	rep movsb
+	retq
+.Lmemmove_end_forward_efs:
+	.previous
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad .Lmemmove_begin_forward
+	.quad .Lmemmove_begin_forward_efs
+	.word X86_FEATURE_ERMS
+	.byte .Lmemmove_end_forward-.Lmemmove_begin_forward
+	.byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+	.previous
 ENDPROC(memmove)
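
The new forward/backward decision reads: copy forward when the source sits at or above the destination, or when the regions do not overlap at all; only a source strictly below an overlapping destination forces the backward path. In C (byte-at-a-time purely for clarity, not the kernel's implementation):

```c
#include <stddef.h>

/* Direction choice mirroring the patched asm's jge/jg tests. */
static void *memmove_model(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (s >= d || s + n <= d) {	/* forward-safe cases */
		while (n--)
			*d++ = *s++;	/* forward copy */
	} else {
		d += n;
		s += n;
		while (n--)
			*--d = *--s;	/* backward copy */
	}
	return dst;
}
```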
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d344269652..79bd454b78a3 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
 /*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * fast string instructions to get better performance than the original
+ * function, and its code is simpler and shorter as well.
  *
  * rdi destination
  * rsi value (char)
@@ -31,6 +35,28 @@
 .Lmemset_e:
 	.previous
 
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+	movq %rdi,%r9
+	movb %sil,%al
+	movl %edx,%ecx
+	rep stosb
+	movq %r9,%rax
+	ret
+.Lmemset_e_e:
+	.previous
+
 ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
 ENDPROC(memset)
 ENDPROC(__memset)
 
-/* Some CPUs run faster using the string instructions.
-   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
+/*
+ * Some CPUs support the enhanced REP MOVSB/STOSB feature. It is
+ * recommended to use it when possible.
+ *
+ * If the enhanced REP MOVSB/STOSB feature is not available, use the
+ * fast string instructions.
+ *
+ * Otherwise, use the original memset function.
+ *
+ * In the .altinstructions section, the ERMS entry is placed after the
+ * REP_GOOD entry to implement the right patch order.
+ */
 .section .altinstructions,"a"
-	.align 8
-	.quad memset
-	.quad .Lmemset_c
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lfinal - memset
-	.byte .Lmemset_e - .Lmemset_c
+	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
+	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
.previous
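
The memset_c_e body is the same trick with rep stosb: AL carries the fill byte and RCX the count. As a standalone sketch, not the kernel's symbol:

```c
#include <stddef.h>

/* ERMS-style memset: one rep stosb stores AL into n consecutive bytes. */
static void *erms_memset(void *dst, int c, size_t n)
{
	void *ret = dst;

	__asm__ volatile("rep stosb"
			 : "+D"(dst), "+c"(n)
			 : "a"(c)	/* fill byte in AL */
			 : "memory");
	return ret;
}
```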