Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/include/asm/alternative-asm.h |  9
-rw-r--r--  arch/x86/include/asm/cpufeature.h      |  1
-rw-r--r--  arch/x86/include/asm/uaccess.h         |  2
-rw-r--r--  arch/x86/kernel/alternative.c          |  9
-rw-r--r--  arch/x86/kernel/cpu/common.c           |  3
-rw-r--r--  arch/x86/kernel/cpu/intel.c            | 19
-rw-r--r--  arch/x86/lib/clear_page_64.S           | 33
-rw-r--r--  arch/x86/lib/copy_user_64.S            | 69
-rw-r--r--  arch/x86/lib/memcpy_64.S               | 45
-rw-r--r--  arch/x86/lib/memmove_64.S              | 29
-rw-r--r--  arch/x86/lib/memset_64.S               | 54
11 files changed, 219 insertions, 54 deletions
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index a63a68be1cce..94d420b360d1 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -15,4 +15,13 @@
 .endm
 #endif
 
+.macro altinstruction_entry orig alt feature orig_len alt_len
+	.align 8
+	.quad \orig
+	.quad \alt
+	.word \feature
+	.byte \orig_len
+	.byte \alt_len
+.endm
+
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 91f3e087cf21..7f2f7b123293 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -195,6 +195,7 @@
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index abd3e0ea762a..99f0ad753f32 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -42,7 +42,7 @@
  * Returns 0 if the range is valid, nonzero otherwise.
  *
  * This is equivalent to the following test:
- * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
+ * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
  *
  * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
  */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 651454b0c811..1eeeafcb4410 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -210,6 +210,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 	u8 insnbuf[MAX_PATCH_LEN];
 
 	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+	/*
+	 * The scan order should be from start to end. An alternative
+	 * scanned later can overwrite code patched by one scanned earlier.
+	 * Some kernel functions (e.g. memcpy, memset, etc) rely on this
+	 * order to patch code.
+	 *
+	 * So be careful if you want to change the scan order to any other
+	 * order.
+	 */
 	for (a = start; a < end; a++) {
 		u8 *instr = a->instr;
 		BUG_ON(a->replacementlen > a->instrlen);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2ced0074a45..173f3a3fa1a6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -565,8 +565,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 
 		cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
 
-		if (eax > 0)
-			c->x86_capability[9] = ebx;
+		c->x86_capability[9] = ebx;
 	}
 
 	/* AMD-defined flags: level 0x80000001 */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index df86bc8c859d..fc73a34ba8c9 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
 
 static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 {
+	u64 misc_enable;
+
 	/* Unmask CPUID levels if masked: */
 	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
-		u64 misc_enable;
-
 		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
 
 		if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 	 * (model 2) with the same problem.
 	 */
 	if (c->x86 == 15) {
-		u64 misc_enable;
-
 		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
 
 		if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 		}
 	}
 #endif
+
+	/*
+	 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
+	 * clear the fast string and enhanced fast string CPU capabilities.
+	 */
+	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+		if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+			printk(KERN_INFO "Disabled fast string operations\n");
+			setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+			setup_clear_cpu_cap(X86_FEATURE_ERMS);
+		}
+	}
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24a..f2145cfa12a6 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
 	CFI_ENDPROC
 ENDPROC(clear_page_c)
 
+ENTRY(clear_page_c_e)
+	CFI_STARTPROC
+	movl $4096,%ecx
+	xorl %eax,%eax
+	rep stosb
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_c_e)
+
 ENTRY(clear_page)
 	CFI_STARTPROC
 	xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
 .Lclear_page_end:
 ENDPROC(clear_page)
 
-/* Some CPUs run faster using the string instructions.
-   It is also a lot simpler. Use this when possible */
+/*
+ * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+ * It is recommended to use this when possible.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original function.
+ *
+ */
 
 #include <asm/cpufeature.h>
 
 	.section .altinstr_replacement,"ax"
 1:	.byte 0xeb					/* jmp <disp8> */
 	.byte (clear_page_c - clear_page) - (2f - 1b)	/* offset */
-2:
+2:	.byte 0xeb					/* jmp <disp8> */
+	.byte (clear_page_c_e - clear_page) - (3f - 2b)	/* offset */
+3:
 	.previous
 	.section .altinstructions,"a"
-	.align 8
-	.quad clear_page
-	.quad 1b
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lclear_page_end - clear_page
-	.byte 2b - 1b
+	altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+			     .Lclear_page_end-clear_page, 2b-1b
+	altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,	\
+			     .Lclear_page_end-clear_page,3b-2b
 	.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 99e482615195..024840266ba0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
-	.macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in altinstructions section, we logically
+ * implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
 0:
 	.byte 0xe9	/* 32bit jump */
 	.long \orig-1f	/* by default jump to orig */
 1:
 	.section .altinstr_replacement,"ax"
 2:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt-1b	/* offset */	/* or alternatively to alt */
+	.long \alt1-1b	/* offset */	/* or alternatively to alt1 */
+3:	.byte 0xe9			/* near jump with 32bit immediate */
+	.long \alt2-1b	/* offset */	/* or alternatively to alt2 */
 	.previous
+
 	.section .altinstructions,"a"
-	.align 8
-	.quad 0b
-	.quad 2b
-	.word \feature			/* when feature is set */
-	.byte 5
-	.byte 5
+	altinstruction_entry 0b,2b,\feature1,5,5
+	altinstruction_entry 0b,3b,\feature2,5,5
 	.previous
 	.endm
 
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
 	addq %rdx,%rcx
 	jc bad_to_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_to_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_to_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
 	addq %rdx,%rcx
 	jc bad_from_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_from_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_from_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
 	.previous
 	CFI_ENDPROC
 ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+	CFI_STARTPROC
+	andl %edx,%edx
+	jz 2f
+	movl %edx,%ecx
+1:	rep
+	movsb
+2:	xorl %eax,%eax
+	ret
+
+	.section .fixup,"ax"
+12:	movl %ecx,%edx		/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
+
+	.section __ex_table,"a"
+	.align 8
+	.quad 1b,12b
+	.previous
+	CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e35e38..daab21dae2d1 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * memcpy - Copy a memory block.
@@ -37,6 +38,23 @@
 .Lmemcpy_e:
 	.previous
 
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	rep movsb
+	ret
+.Lmemcpy_e_e:
+	.previous
+
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
@@ -171,21 +189,22 @@ ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
 /*
- * Some CPUs run faster using the string copy instructions.
- * It is also a lot simpler. Use this when possible:
- */
-
-	.section .altinstructions, "a"
-	.align 8
-	.quad memcpy
-	.quad .Lmemcpy_c
-	.word X86_FEATURE_REP_GOOD
-
-/*
+ * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature.
+ * If the feature is supported, memcpy_c_e() is the first choice.
+ * If enhanced rep movsb copy is not available, use the fast string copy
+ * memcpy_c() when possible. This is faster and the code is simpler than
+ * the original memcpy().
+ * Otherwise, the original memcpy() is used.
+ * In the .altinstructions section, the ERMS feature is placed after the
+ * REP_GOOD feature to implement the right patch order.
+ *
  * Replace only beginning, memcpy is used to apply alternatives,
  * so it is silly to overwrite itself with nops - reboot is the
  * only outcome...
  */
-	.byte .Lmemcpy_e - .Lmemcpy_c
-	.byte .Lmemcpy_e - .Lmemcpy_c
+	.section .altinstructions, "a"
+	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 	.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0ecb8433e5a8..d0ec9c2936d7 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,7 @@
 #define _STRING_C
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 
 #undef memmove
 
@@ -24,6 +25,7 @@
  */
 ENTRY(memmove)
 	CFI_STARTPROC
+
 	/* Handle more 32bytes in loop */
 	mov %rdi, %rax
 	cmp $0x20, %rdx
@@ -31,8 +33,13 @@ ENTRY(memmove)
 
 	/* Decide forward/backward copy mode */
 	cmp %rdi, %rsi
-	jb 2f
+	jge .Lmemmove_begin_forward
+	mov %rsi, %r8
+	add %rdx, %r8
+	cmp %rdi, %r8
+	jg 2f
 
+.Lmemmove_begin_forward:
 	/*
 	 * movsq instruction have many startup latency
 	 * so we handle small size by general register.
@@ -78,6 +85,8 @@ ENTRY(memmove)
 	rep movsq
 	movq %r11, (%r10)
 	jmp 13f
+.Lmemmove_end_forward:
+
 	/*
 	 * Handle data backward by movsq.
 	 */
@@ -194,4 +203,22 @@ ENTRY(memmove)
 13:
 	retq
 	CFI_ENDPROC
+
+	.section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+	/* Forward moving data. */
+	movq %rdx, %rcx
+	rep movsb
+	retq
+.Lmemmove_end_forward_efs:
+	.previous
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad .Lmemmove_begin_forward
+	.quad .Lmemmove_begin_forward_efs
+	.word X86_FEATURE_ERMS
+	.byte .Lmemmove_end_forward-.Lmemmove_begin_forward
+	.byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+	.previous
 ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d344269652..79bd454b78a3 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
 /*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the original function as well.
  *
  * rdi   destination
  * rsi   value (char)
@@ -31,6 +35,28 @@
 .Lmemset_e:
 	.previous
 
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi   destination
+ * rsi   value (char)
+ * rdx   count (bytes)
+ *
+ * rax   original destination
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+	movq %rdi,%r9
+	movb %sil,%al
+	movl %edx,%ecx
+	rep stosb
+	movq %r9,%rax
+	ret
+.Lmemset_e_e:
+	.previous
+
 ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
 ENDPROC(memset)
 ENDPROC(__memset)
 
-/* Some CPUs run faster using the string instructions.
-   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
+/* Some CPUs support the enhanced REP MOVSB/STOSB feature.
+ * It is recommended to use this when possible.
+ *
+ * If the enhanced REP MOVSB/STOSB feature is not available, use fast string
+ * instructions.
+ *
+ * Otherwise, use the original memset function.
+ *
+ * In the .altinstructions section, the ERMS feature is placed after the
+ * REP_GOOD feature to implement the right patch order.
+ */
 	.section .altinstructions,"a"
-	.align 8
-	.quad memset
-	.quad .Lmemset_c
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lfinal - memset
-	.byte .Lmemset_e - .Lmemset_c
+	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
+	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
 	.previous
