 23 files changed, 433 insertions(+), 380 deletions(-)
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c22a47..524bddce0b76 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,53 @@
 .endm
 #endif
 
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
 	.long \orig - .
 	.long \alt - .
 	.word \feature
 	.byte \orig_len
 	.byte \alt_len
+	.byte \pad_len
+.endm
+
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+	\oldinstr
+141:
+	.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+	.pushsection .altinstructions,"a"
+	altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+	.popsection
+
+	.pushsection .altinstr_replacement,"ax"
+143:
+	\newinstr
+144:
+	.popsection
+.endm
+
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+	\oldinstr
+141:
+	.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+	.skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90
+142:
+
+	.pushsection .altinstructions,"a"
+	altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+	altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+	.popsection
+
+	.pushsection .altinstr_replacement,"ax"
+143:
+	\newinstr1
+144:
+	\newinstr2
+145:
+	.popsection
 .endm
 
 #endif /* __ASSEMBLY__ */
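
Note: the .skip expression in the two macros above is the heart of the series.
GAS evaluates a true relational such as "(a-b) > 0" to -1, so negating the
comparison and multiplying by the size delta emits exactly
(alt_len - orig_len) bytes of 0x90 (NOP) when the replacement is longer than
the original instruction, and nothing otherwise. A minimal C sketch of the
same arithmetic, for illustration only (not kernel code):

	/* pad bytes the assembler adds after the original instruction */
	static inline int alt_pad_bytes(int repl_len, int orig_len)
	{
		/* GAS: -(((repl-orig) > 0)) * (repl-orig), with true == -1 */
		return repl_len > orig_len ? repl_len - orig_len : 0;
	}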
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 473bdbee378a..5aef6a97d80e 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -48,8 +48,9 @@ struct alt_instr {
 	s32 repl_offset;	/* offset to replacement instruction */
 	u16 cpuid;		/* cpuid bit set for replacement */
 	u8  instrlen;		/* length of original instruction */
-	u8  replacementlen;	/* length of new instruction, <= instrlen */
-};
+	u8  replacementlen;	/* length of new instruction */
+	u8  padlen;		/* length of build-time padding */
+} __packed;
 
 extern void alternative_instructions(void);
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end)
 }
 #endif /* CONFIG_SMP */
 
-#define OLDINSTR(oldinstr)	"661:\n\t" oldinstr "\n662:\n"
+#define b_replacement(num)	"664"#num
+#define e_replacement(num)	"665"#num
 
-#define b_replacement(number)	"663"#number
-#define e_replacement(number)	"664"#number
+#define alt_end_marker		"663"
+#define alt_slen		"662b-661b"
+#define alt_pad_len		alt_end_marker"b-662b"
+#define alt_total_slen		alt_end_marker"b-661b"
+#define alt_rlen(num)		e_replacement(num)"f-"b_replacement(num)"f"
 
-#define alt_slen "662b-661b"
-#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
+#define __OLDINSTR(oldinstr, num) \
+	"661:\n\t" oldinstr "\n662:\n" \
+	".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
+		"((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
 
-#define ALTINSTR_ENTRY(feature, number)					      \
+#define OLDINSTR(oldinstr, num) \
+	__OLDINSTR(oldinstr, num) \
+	alt_end_marker ":\n"
+
+/*
+ * Pad the second replacement alternative with additional NOPs if it is
+ * additionally longer than the first replacement alternative.
+ */
+#define OLDINSTR_2(oldinstr, num1, num2) \
+	__OLDINSTR(oldinstr, num1) \
+	".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \
+		"((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n" \
+	alt_end_marker ":\n"
+
+#define ALTINSTR_ENTRY(feature, num)					      \
 	" .long 661b - .\n"				/* label           */ \
-	" .long " b_replacement(number)"f - .\n"	/* new instruction */ \
+	" .long " b_replacement(num)"f - .\n"		/* new instruction */ \
 	" .word " __stringify(feature) "\n"		/* feature bit     */ \
-	" .byte " alt_slen "\n"				/* source len      */ \
-	" .byte " alt_rlen(number) "\n"			/* replacement len */
+	" .byte " alt_total_slen "\n"			/* source len      */ \
+	" .byte " alt_rlen(num) "\n"			/* replacement len */ \
+	" .byte " alt_pad_len "\n"			/* pad len */
 
-#define DISCARD_ENTRY(number)				/* rlen <= slen */    \
-	" .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
-
-#define ALTINSTR_REPLACEMENT(newinstr, feature, number)	/* replacement */     \
-	b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
+#define ALTINSTR_REPLACEMENT(newinstr, feature, num)	/* replacement */     \
+	b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
 
 /* alternative assembly primitive: */
 #define ALTERNATIVE(oldinstr, newinstr, feature) \
-	OLDINSTR(oldinstr) \
+	OLDINSTR(oldinstr, 1) \
 	".pushsection .altinstructions,\"a\"\n" \
 	ALTINSTR_ENTRY(feature, 1) \
 	".popsection\n" \
-	".pushsection .discard,\"aw\",@progbits\n" \
-	DISCARD_ENTRY(1) \
-	".popsection\n" \
 	".pushsection .altinstr_replacement, \"ax\"\n" \
 	ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
 	".popsection"
 
 #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
-	OLDINSTR(oldinstr) \
+	OLDINSTR_2(oldinstr, 1, 2) \
 	".pushsection .altinstructions,\"a\"\n" \
 	ALTINSTR_ENTRY(feature1, 1) \
 	ALTINSTR_ENTRY(feature2, 2) \
 	".popsection\n" \
-	".pushsection .discard,\"aw\",@progbits\n" \
-	DISCARD_ENTRY(1) \
-	DISCARD_ENTRY(2) \
-	".popsection\n" \
 	".pushsection .altinstr_replacement, \"ax\"\n" \
 	ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
 	ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
@@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
 #define alternative(oldinstr, newinstr, feature)			\
 	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
 
+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+	asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+
 /*
  * Alternative inline assembly with input.
 *
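
Note: with the new padlen byte and __packed, each .altinstructions entry is
exactly 4+4+2+1+1+1 = 13 bytes, matching the records the assembler-side
altinstruction_entry macro emits by hand. The two s32 fields are PC-relative,
so every consumer resolves them against the entry's own address; a short C
sketch distilled from apply_alternatives() further down:

	/* resolving one packed entry (field names as in struct alt_instr) */
	u8 *instr       = (u8 *)&a->instr_offset + a->instr_offset; /* patch site  */
	u8 *replacement = (u8 *)&a->repl_offset  + a->repl_offset;  /* replacement */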
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efc3b22d896e..8118e94d50ab 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
 {
 	volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
 
-	alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
+	alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
		       ASM_OUTPUT2("=r" (v), "=m" (*addr)),
		       ASM_OUTPUT2("0" (v), "m" (*addr)));
 }
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 2ab1eb33106e..959e45b81fe2 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -95,13 +95,11 @@ do {
  * Stop RDTSC speculation. This is needed when you need to use RDTSC
  * (or get_cycles or vread that possibly accesses the TSC) in a defined
  * code region.
- *
- * (Could use an alternative three way for this if there was one.)
  */
 static __always_inline void rdtsc_barrier(void)
 {
-	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+		      "lfence", X86_FEATURE_LFENCE_RDTSC);
 }
 
 #endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index d6428ea5d316..0f7a5a1a8db2 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -419,6 +419,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 			 " .word %P0\n"		/* 1: do replace */
 			 " .byte 2b - 1b\n"	/* source len */
 			 " .byte 0\n"		/* replacement len */
+			 " .byte 0\n"		/* pad len */
 			 ".previous\n"
 			 /* skipping size check since replacement size = 0 */
 			 : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -433,6 +434,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 			 " .word %P0\n"		/* feature bit */
 			 " .byte 2b - 1b\n"	/* source len */
 			 " .byte 0\n"		/* replacement len */
+			 " .byte 0\n"		/* pad len */
 			 ".previous\n"
 			 /* skipping size check since replacement size = 0 */
 			 : : "i" (bit) : : t_no);
@@ -458,6 +460,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 			 " .word %P1\n"		/* feature bit */
 			 " .byte 2b - 1b\n"	/* source len */
 			 " .byte 4f - 3f\n"	/* replacement len */
+			 " .byte 0\n"		/* pad len */
 			 ".previous\n"
 			 ".section .discard,\"aw\",@progbits\n"
 			 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -484,31 +487,30 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 {
 #ifdef CC_HAVE_ASM_GOTO
-/*
- * We need to spell the jumps to the compiler because, depending on the offset,
- * the replacement jump can be bigger than the original jump, and this we cannot
- * have. Thus, we force the jump to the widest, 4-byte, signed relative
- * offset even though the last would often fit in less bytes.
- */
-	asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
+	asm_volatile_goto("1: jmp %l[t_dynamic]\n"
		 "2:\n"
+		 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+			 "((5f-4f) - (2b-1b)),0x90\n"
+		 "3:\n"
		 ".section .altinstructions,\"a\"\n"
		 " .long 1b - .\n"		/* src offset */
-		 " .long 3f - .\n"		/* repl offset */
+		 " .long 4f - .\n"		/* repl offset */
		 " .word %P1\n"			/* always replace */
-		 " .byte 2b - 1b\n"		/* src len */
-		 " .byte 4f - 3f\n"		/* repl len */
+		 " .byte 3b - 1b\n"		/* src len */
+		 " .byte 5f - 4f\n"		/* repl len */
+		 " .byte 3b - 2b\n"		/* pad len */
		 ".previous\n"
		 ".section .altinstr_replacement,\"ax\"\n"
-		 "3: .byte 0xe9\n .long %l[t_no] - 2b\n"
-		 "4:\n"
+		 "4: jmp %l[t_no]\n"
+		 "5:\n"
		 ".previous\n"
		 ".section .altinstructions,\"a\"\n"
		 " .long 1b - .\n"		/* src offset */
		 " .long 0\n"			/* no replacement */
		 " .word %P0\n"			/* feature bit */
-		 " .byte 2b - 1b\n"		/* src len */
+		 " .byte 3b - 1b\n"		/* src len */
		 " .byte 0\n"			/* repl len */
+		 " .byte 0\n"			/* pad len */
		 ".previous\n"
		 : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
		 : : t_dynamic, t_no);
@@ -528,6 +530,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 			 " .word %P2\n"		/* always replace */
 			 " .byte 2b - 1b\n"	/* source len */
 			 " .byte 4f - 3f\n"	/* replacement len */
+			 " .byte 0\n"		/* pad len */
 			 ".previous\n"
 			 ".section .discard,\"aw\",@progbits\n"
 			 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -542,6 +545,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 			 " .word %P1\n"		/* feature bit */
 			 " .byte 4b - 3b\n"	/* src len */
 			 " .byte 6f - 5f\n"	/* repl len */
+			 " .byte 0\n"		/* pad len */
 			 ".previous\n"
 			 ".section .discard,\"aw\",@progbits\n"
 			 " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c93588cef..7be2c9a6caba 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -761,10 +761,10 @@ extern char ignore_fpu_irq;
 #define ARCH_HAS_SPINLOCK_PREFETCH
 
 #ifdef CONFIG_X86_32
-# define BASE_PREFETCH		ASM_NOP4
+# define BASE_PREFETCH		""
 # define ARCH_HAS_PREFETCH
 #else
-# define BASE_PREFETCH		"prefetcht0 (%1)"
+# define BASE_PREFETCH		"prefetcht0 %P1"
 #endif
 
 /*
@@ -775,10 +775,9 @@ extern char ignore_fpu_irq;
  */
 static inline void prefetch(const void *x)
 {
-	alternative_input(BASE_PREFETCH,
-			  "prefetchnta (%1)",
+	alternative_input(BASE_PREFETCH, "prefetchnta %P1",
			  X86_FEATURE_XMM,
-			  "r" (x));
+			  "m" (*(const char *)x));
 }
 
 /*
@@ -788,10 +787,9 @@ static inline void prefetch(const void *x)
  */
 static inline void prefetchw(const void *x)
 {
-	alternative_input(BASE_PREFETCH,
-			  "prefetchw (%1)",
-			  X86_FEATURE_3DNOW,
-			  "r" (x));
+	alternative_input(BASE_PREFETCH, "prefetchw %P1",
+			  X86_FEATURE_3DNOWPREFETCH,
+			  "m" (*(const char *)x));
 }
 
 static inline void spin_lock_prefetch(const void *x)
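
Note: the prefetch helpers switch from a "r" (x) input dereferenced as "(%1)"
to a real memory operand, so the compiler knows the patched instruction
touches *x and can pick any addressing mode; the %P modifier prints the
operand without further decoration. A reduced sketch of the new operand
scheme (hypothetical helper, not the kernel macro itself):

	static inline void prefetch_sketch(const void *x)
	{
		/* "m" hands the addressing mode to the compiler; %P0 prints it */
		asm volatile("prefetchnta %P0" : : "m" (*(const char *)x));
	}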
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8d3120f4e270..ba665ebd17bb 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -27,23 +27,11 @@
 
 #ifdef CONFIG_X86_SMAP
 
 #define ASM_CLAC \
-	661: ASM_NOP3 ;						\
-	.pushsection .altinstr_replacement, "ax" ;		\
-	662: __ASM_CLAC ;					\
-	.popsection ;						\
-	.pushsection .altinstructions, "a" ;			\
-	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
-	.popsection
-
-#define ASM_STAC \
-	661: ASM_NOP3 ;						\
-	.pushsection .altinstr_replacement, "ax" ;		\
-	662: __ASM_STAC ;					\
-	.popsection ;						\
-	.pushsection .altinstructions, "a" ;			\
-	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
-	.popsection
+	ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
+
+#define ASM_STAC \
+	ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
 
 #else /* CONFIG_X86_SMAP */
 
@@ -61,20 +49,20 @@
 static __always_inline void clac(void)
 {
 	/* Note: a barrier is implicit in alternative() */
-	alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
+	alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
 }
 
 static __always_inline void stac(void)
 {
 	/* Note: a barrier is implicit in alternative() */
-	alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP);
+	alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
 }
 
 /* These macros can be used in asm() statements */
 #define ASM_CLAC \
-	ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
+	ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
 #define ASM_STAC \
-	ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP)
+	ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
 
 #else /* CONFIG_X86_SMAP */
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..af397cc98d05 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
 __setup("noreplace-paravirt", setup_noreplace_paravirt);
 #endif
 
-#define DPRINTK(fmt, ...)					\
+#define DPRINTK(fmt, args...)					\
 do {								\
 	if (debug_alternative)					\
-		printk(KERN_DEBUG fmt, ##__VA_ARGS__);		\
+		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+} while (0)
+
+#define DUMP_BYTES(buf, len, fmt, args...)			\
+do {								\
+	if (unlikely(debug_alternative)) {			\
+		int j;						\
+								\
+		if (!(len))					\
+			break;					\
+								\
+		printk(KERN_DEBUG fmt, ##args);			\
+		for (j = 0; j < (len) - 1; j++)			\
+			printk(KERN_CONT "%02hhx ", buf[j]);	\
+		printk(KERN_CONT "%02hhx\n", buf[j]);		\
+	}							\
 } while (0)
 
 /*
@@ -243,12 +258,86 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
 void *text_poke_early(void *addr, const void *opcode, size_t len);
 
-/* Replace instructions with better alternatives for this CPU type.
-   This runs before SMP is initialized to avoid SMP problems with
-   self modifying code. This implies that asymmetric systems where
-   APs have less capabilities than the boot processor are not handled.
-   Tough. Make sure you disable such features by hand. */
+/*
+ * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+	return opcode == 0xeb || opcode == 0xe9;
+}
+
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+	u8 *next_rip, *tgt_rip;
+	s32 n_dspl, o_dspl;
+	int repl_len;
+
+	if (a->replacementlen != 5)
+		return;
+
+	o_dspl = *(s32 *)(insnbuf + 1);
+
+	/* next_rip of the replacement JMP */
+	next_rip = repl_insn + a->replacementlen;
+	/* target rip of the replacement JMP */
+	tgt_rip  = next_rip + o_dspl;
+	n_dspl = tgt_rip - orig_insn;
+
+	DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
+
+	if (tgt_rip - orig_insn >= 0) {
+		if (n_dspl - 2 <= 127)
+			goto two_byte_jmp;
+		else
+			goto five_byte_jmp;
+	/* negative offset */
+	} else {
+		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+			goto two_byte_jmp;
+		else
+			goto five_byte_jmp;
+	}
+
+two_byte_jmp:
+	n_dspl -= 2;
+
+	insnbuf[0] = 0xeb;
+	insnbuf[1] = (s8)n_dspl;
+	add_nops(insnbuf + 2, 3);
+
+	repl_len = 2;
+	goto done;
+
+five_byte_jmp:
+	n_dspl -= 5;
 
+	insnbuf[0] = 0xe9;
+	*(s32 *)&insnbuf[1] = n_dspl;
+
+	repl_len = 5;
+
+done:
+
+	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
+static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+{
+	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+
+	DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
+		   instr, a->instrlen - a->padlen, a->padlen);
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have less capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ */
 void __init_or_module apply_alternatives(struct alt_instr *start,
 					  struct alt_instr *end)
 {
@@ -256,10 +345,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 	u8 *instr, *replacement;
 	u8 insnbuf[MAX_PATCH_LEN];
 
-	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+	DPRINTK("alt table %p -> %p", start, end);
 	/*
	 * The scan order should be from start to end. A later scanned
-	 * alternative code can overwrite a previous scanned alternative code.
+	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
@@ -267,29 +356,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
	 * order.
	 */
 	for (a = start; a < end; a++) {
+		int insnbuf_sz = 0;
+
 		instr = (u8 *)&a->instr_offset + a->instr_offset;
 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
-		BUG_ON(a->replacementlen > a->instrlen);
 		BUG_ON(a->instrlen > sizeof(insnbuf));
 		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
-		if (!boot_cpu_has(a->cpuid))
+		if (!boot_cpu_has(a->cpuid)) {
+			if (a->padlen > 1)
+				optimize_nops(a, instr);
+
 			continue;
+		}
+
+		DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)",
+			a->cpuid >> 5,
+			a->cpuid & 0x1f,
+			instr, a->instrlen,
+			replacement, a->replacementlen);
+
+		DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
+		DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
 
 		memcpy(insnbuf, replacement, a->replacementlen);
+		insnbuf_sz = a->replacementlen;
 
 		/* 0xe8 is a relative jump; fix the offset. */
-		if (*insnbuf == 0xe8 && a->replacementlen == 5)
+		if (*insnbuf == 0xe8 && a->replacementlen == 5) {
 			*(s32 *)(insnbuf + 1) += replacement - instr;
+			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+				*(s32 *)(insnbuf + 1),
+				(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+		}
+
+		if (a->replacementlen && is_jmp(replacement[0]))
+			recompute_jump(a, instr, replacement, insnbuf);
 
-		add_nops(insnbuf + a->replacementlen,
-			 a->instrlen - a->replacementlen);
+		if (a->instrlen > a->replacementlen) {
+			add_nops(insnbuf + a->replacementlen,
+				 a->instrlen - a->replacementlen);
+			insnbuf_sz += a->instrlen - a->replacementlen;
+		}
+		DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
 
-		text_poke_early(instr, insnbuf, a->instrlen);
+		text_poke_early(instr, insnbuf, insnbuf_sz);
 	}
 }
 
 #ifdef CONFIG_SMP
-
 static void alternatives_smp_lock(const s32 *start, const s32 *end,
 				  u8 *text, u8 *text_end)
 {
@@ -371,8 +485,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
 	smp->locks_end	= locks_end;
 	smp->text	= text;
 	smp->text_end	= text_end;
-	DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
-		__func__, smp->locks, smp->locks_end,
+	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+		smp->locks, smp->locks_end,
 		smp->text, smp->text_end, smp->name);
 
 	list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +554,7 @@ int alternatives_text_reserved(void *start, void *end)
 
 	return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PARAVIRT
 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
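
Note: recompute_jump() exists because a "jmp target" copied verbatim out of
.altinstr_replacement would still be relative to the replacement area, not to
the patch site; it also shrinks the JMP to two bytes when the displacement
fits in a signed byte. A simplified C sketch of that selection (clean range
check instead of the kernel's masking test; addresses hypothetical):

	/* 0xeb = JMP rel8, 0xe9 = JMP rel32; displacement counts from insn end */
	static int emit_jmp(u8 *buf, u8 *orig_insn, u8 *tgt_rip)
	{
		s32 n_dspl = tgt_rip - orig_insn;

		if (n_dspl - 2 >= -128 && n_dspl - 2 <= 127) {
			buf[0] = 0xeb;
			buf[1] = (s8)(n_dspl - 2);
			return 2;
		}
		buf[0] = 0xe9;
		*(s32 *)&buf[1] = n_dspl - 5;
		return 5;
	}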
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..dd9e50500297 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c)
 		set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
 
 	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+	/* 3DNow or LM implies PREFETCHW */
+	if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+		if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+			set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5bf3e38..7e0323cc0b7d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error)
 	pushl_cfi $0
 #ifdef CONFIG_X86_INVD_BUG
 	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661:	pushl_cfi $do_general_protection
-662:
-.section .altinstructions,"a"
-	altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
-.previous
-.section .altinstr_replacement,"ax"
-663:	pushl $do_simd_coprocessor_error
-664:
-.previous
+	ALTERNATIVE "pushl_cfi $do_general_protection",	\
+		    "pushl $do_simd_coprocessor_error",	\
+		    X86_FEATURE_XMM
 #else
 	pushl_cfi $do_simd_coprocessor_error
 #endif
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f2145cfa12a6..e67e579c93bd 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,31 +1,35 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
 /*
- * Zero a page.
- * rdi	page
- */
-ENTRY(clear_page_c)
+ * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
+ * recommended to use this when possible and we do use them by default.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original.
+ */
+
+/*
+ * Zero a page.
+ * %rdi - page
+ */
+ENTRY(clear_page)
 	CFI_STARTPROC
+
+	ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
+		      "jmp clear_page_c_e", X86_FEATURE_ERMS
+
 	movl $4096/8,%ecx
 	xorl %eax,%eax
 	rep stosq
 	ret
 	CFI_ENDPROC
-ENDPROC(clear_page_c)
+ENDPROC(clear_page)
 
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_orig)
 	CFI_STARTPROC
-	movl $4096,%ecx
-	xorl %eax,%eax
-	rep stosb
-	ret
-	CFI_ENDPROC
-ENDPROC(clear_page_c_e)
 
-ENTRY(clear_page)
-	CFI_STARTPROC
 	xorl %eax,%eax
 	movl $4096/64,%ecx
 	.p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
 	nop
 	ret
 	CFI_ENDPROC
-.Lclear_page_end:
-ENDPROC(clear_page)
-
-/*
- * Some CPUs support enhanced REP MOVSB/STOSB instructions.
- * It is recommended to use this when possible.
- * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
- * Otherwise, use original function.
- *
- */
+ENDPROC(clear_page_orig)
 
-#include <asm/cpufeature.h>
-
-.section .altinstr_replacement,"ax"
-1:	.byte 0xeb					/* jmp <disp8> */
-	.byte (clear_page_c - clear_page) - (2f - 1b)	/* offset */
-2:	.byte 0xeb					/* jmp <disp8> */
-	.byte (clear_page_c_e - clear_page) - (3f - 2b)	/* offset */
-3:
-.previous
-.section .altinstructions,"a"
-	altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
-			     .Lclear_page_end-clear_page, 2b-1b
-	altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,	\
-			     .Lclear_page_end-clear_page,3b-2b
-.previous
+ENTRY(clear_page_c_e)
+	CFI_STARTPROC
+	movl $4096,%ecx
+	xorl %eax,%eax
+	rep stosb
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_c_e)
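
Note: clear_page() now opens with a JMP that apply_alternatives() either NOPs
out (REP_GOOD: fall through to the REP STOSQ body) or retargets (ERMS).
Because entries are applied in order, the ERMS entry is patched last and wins
when both features are set. The equivalent run-time dispatch, written as
ordinary C for illustration (clear_page_stosq is a made-up name standing in
for the inline REP STOSQ body):

	static void clear_page_dispatch(void *page)
	{
		if (boot_cpu_has(X86_FEATURE_ERMS))
			clear_page_c_e(page);		/* REP STOSB */
		else if (boot_cpu_has(X86_FEATURE_REP_GOOD))
			clear_page_stosq(page);		/* REP STOSQ */
		else
			clear_page_orig(page);		/* unrolled stores */
	}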
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 176cca67212b..8239dbcbf984 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use this when possible. But, don't use streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
+ * prefetch distance based on SMP/UP.
+ */
 	ALIGN
-copy_page_rep:
+ENTRY(copy_page)
 	CFI_STARTPROC
+	ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
 	movl	$4096/8, %ecx
 	rep	movsq
 	ret
 	CFI_ENDPROC
-ENDPROC(copy_page_rep)
-
-/*
- * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
- * Could vary the prefetch distance based on SMP/UP.
- */
+ENDPROC(copy_page)
 
-ENTRY(copy_page)
+ENTRY(copy_page_regs)
 	CFI_STARTPROC
 	subq	$2*8, %rsp
 	CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
 	addq	$2*8, %rsp
 	CFI_ADJUST_CFA_OFFSET -2*8
 	ret
-.Lcopy_page_end:
 	CFI_ENDPROC
-ENDPROC(copy_page)
-
-/* Some CPUs run faster using the string copy instructions.
-   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-.section .altinstr_replacement,"ax"
-1:	.byte 0xeb					/* jmp <disp8> */
-	.byte (copy_page_rep - copy_page) - (2f - 1b)	/* offset */
-2:
-.previous
-.section .altinstructions,"a"
-	altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,	\
-			     .Lcopy_page_end-copy_page, 2b-1b
-.previous
+ENDPROC(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dee945d55594..fa997dfaef24 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,9 +8,6 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
@@ -19,33 +16,7 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 
-/*
- * By placing feature2 after feature1 in altinstructions section, we logically
- * implement:
- * If CPU has feature2, jmp to alt2 is used
- * else if CPU has feature1, jmp to alt1 is used
- * else jmp to orig is used.
- */
-	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
-0:
-	.byte 0xe9	/* 32bit jump */
-	.long \orig-1f	/* by default jump to orig */
-1:
-	.section .altinstr_replacement,"ax"
-2:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt1-1b	/* offset */	/* or alternatively to alt1 */
-3:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt2-1b	/* offset */	/* or alternatively to alt2 */
-	.previous
-
-	.section .altinstructions,"a"
-	altinstruction_entry 0b,2b,\feature1,5,5
-	altinstruction_entry 0b,3b,\feature2,5,5
-	.previous
-	.endm
-
 	.macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
 	/* check for bad alignment of destination */
 	movl %edi,%ecx
 	andl $7,%ecx
@@ -67,7 +38,6 @@
 
 	_ASM_EXTABLE(100b,103b)
 	_ASM_EXTABLE(101b,103b)
-#endif
 	.endm
 
 /* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
 	jc bad_to_user
 	cmpq TI_addr_limit(%rax),%rcx
 	ja bad_to_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
-		copy_user_generic_unrolled,copy_user_generic_string,	\
-		copy_user_enhanced_fast_string
+	ALTERNATIVE_2 "jmp copy_user_generic_unrolled",		\
+		      "jmp copy_user_generic_string",		\
+		      X86_FEATURE_REP_GOOD,			\
+		      "jmp copy_user_enhanced_fast_string",	\
+		      X86_FEATURE_ERMS
 	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
 	jc bad_from_user
 	cmpq TI_addr_limit(%rax),%rcx
 	ja bad_from_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
-		copy_user_generic_unrolled,copy_user_generic_string,	\
-		copy_user_enhanced_fast_string
+	ALTERNATIVE_2 "jmp copy_user_generic_unrolled",		\
+		      "jmp copy_user_generic_string",		\
+		      X86_FEATURE_REP_GOOD,			\
+		      "jmp copy_user_enhanced_fast_string",	\
+		      X86_FEATURE_ERMS
 	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
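
Note: the open-coded ALTERNATIVE_JUMP macro is gone, but its ordering rule
survives in the ALTERNATIVE_2 call sites. Entries are scanned start to end,
so a later entry may overwrite an earlier one; listing ERMS after REP_GOOD
preserves the removed comment's priority ("if CPU has feature2, alt2 is used;
else if feature1, alt1; else orig"). A C sketch of that rule, with
patch_site() as a hypothetical stand-in for the patching step:

	/* sketch of the patch-order rule from apply_alternatives() */
	static void apply_in_order(struct alt_instr *start, struct alt_instr *end)
	{
		struct alt_instr *a;

		for (a = start; a < end; a++)
			if (boot_cpu_has(a->cpuid))
				patch_site(a);	/* later entry (ERMS) overrides */
	}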
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 89b53c9968e7..b046664f5a1c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,20 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
 #include <asm/alternative-asm.h>
 
 /*
+ * We build a jump to memcpy_orig by default which gets NOPped out on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
+ * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
+ */
+
+.weak memcpy
+
+/*
  * memcpy - Copy a memory block.
  *
  * Input:
@@ -17,15 +25,11 @@
  * Output:
  * rax original destination
  */
+ENTRY(__memcpy)
+ENTRY(memcpy)
+	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
+		      "jmp memcpy_erms", X86_FEATURE_ERMS
 
-/*
- * memcpy_c() - fast string ops (REP MOVSQ) based variant.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
- */
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c:
 	movq %rdi, %rax
 	movq %rdx, %rcx
 	shrq $3, %rcx
@@ -34,29 +38,21 @@
 	movl %edx, %ecx
 	rep movsb
 	ret
-.Lmemcpy_e:
-	.previous
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
 
 /*
- * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
- * memcpy_c. Use memcpy_c_e when possible.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
+ * memcpy_erms() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy. Use memcpy_erms when possible.
  */
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c_e:
+ENTRY(memcpy_erms)
 	movq %rdi, %rax
 	movq %rdx, %rcx
 	rep movsb
 	ret
-.Lmemcpy_e_e:
-	.previous
-
-.weak memcpy
+ENDPROC(memcpy_erms)
 
-ENTRY(__memcpy)
-ENTRY(memcpy)
+ENTRY(memcpy_orig)
 	CFI_STARTPROC
 	movq %rdi, %rax
 
@@ -183,26 +179,4 @@ ENTRY(memcpy)
 .Lend:
 	retq
 	CFI_ENDPROC
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-
-/*
- * Some CPUs are adding enhanced REP MOVSB/STOSB feature
- * If the feature is supported, memcpy_c_e() is the first choice.
- * If enhanced rep movsb copy is not available, use fast string copy
- * memcpy_c() when possible. This is faster and code is simpler than
- * original memcpy().
- * In .altinstructions section, ERMS feature is placed after REG_GOOD
- * feature to implement the right patch order.
- *
- * Replace only beginning, memcpy is used to apply alternatives,
- * so it is silly to overwrite itself with nops - reboot is the
- * only outcome...
- */
-.section .altinstructions, "a"
-altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
-	.Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
-altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
-	.Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
-.previous
+ENDPROC(memcpy_orig)
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 9c4b530575da..0f8a0d0331b9 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -5,7 +5,6 @@
  * This assembly file is re-written from memmove_64.c file.
  *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
  */
-#define _STRING_C
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
 	jg	2f
 
 .Lmemmove_begin_forward:
+	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+
 	/*
	 * movsq instruction have many startup latency
	 * so we handle small size by general register.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
 13:
 	retq
 	CFI_ENDPROC
-
-	.section .altinstr_replacement,"ax"
-.Lmemmove_begin_forward_efs:
-	/* Forward moving data. */
-	movq %rdx, %rcx
-	rep movsb
-	retq
-.Lmemmove_end_forward_efs:
-	.previous
-
-	.section .altinstructions,"a"
-	altinstruction_entry .Lmemmove_begin_forward,		\
-		.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,	\
-		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
-		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
-	.previous
 ENDPROC(__memmove)
 ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 6f44935c6a60..93118fb23976 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -5,19 +5,30 @@
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+.weak memset
+
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
  * simpler and shorter than the orignal function as well.
  *
  * rdi   destination
  * rsi   value (char)
  * rdx   count (bytes)
  *
  * rax   original destination
  */
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemset_c:
+ENTRY(memset)
+ENTRY(__memset)
+	/*
+	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
+	 * to use it when possible. If not available, use fast string instructions.
+	 *
+	 * Otherwise, use original memset function.
+	 */
+	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+		      "jmp memset_erms", X86_FEATURE_ERMS
+
 	movq %rdi,%r9
 	movq %rdx,%rcx
 	andl $7,%edx
@@ -31,8 +42,8 @@
 	rep stosb
 	movq %r9,%rax
 	ret
-.Lmemset_e:
-	.previous
+ENDPROC(memset)
+ENDPROC(__memset)
 
 /*
  * ISO C memset - set a memory block to a byte value. This function uses
@@ -45,21 +56,16 @@
  *
  * rax   original destination
  */
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemset_c_e:
+ENTRY(memset_erms)
 	movq %rdi,%r9
 	movb %sil,%al
 	movq %rdx,%rcx
 	rep stosb
 	movq %r9,%rax
 	ret
-.Lmemset_e_e:
-	.previous
-
-.weak memset
+ENDPROC(memset_erms)
 
-ENTRY(memset)
-ENTRY(__memset)
+ENTRY(memset_orig)
 	CFI_STARTPROC
 	movq %rdi,%r10
 
@@ -134,23 +140,4 @@ ENTRY(__memset)
 	jmp .Lafter_bad_alignment
 .Lfinal:
 	CFI_ENDPROC
-ENDPROC(memset)
-ENDPROC(__memset)
-
-/* Some CPUs support enhanced REP MOVSB/STOSB feature.
- * It is recommended to use this when possible.
- *
- * If enhanced REP MOVSB/STOSB feature is not available, use fast string
- * instructions.
- *
- * Otherwise, use original memset function.
- *
- * In .altinstructions section, ERMS feature is placed after REG_GOOD
- * feature to implement the right patch order.
- */
-.section .altinstructions,"a"
-altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
-	.Lfinal-__memset,.Lmemset_e-.Lmemset_c
-altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
-	.Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
-.previous
+ENDPROC(memset_orig)
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..8ffd2146fa6a 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -64,8 +64,8 @@
  */
 static inline void rdtsc_barrier(void)
 {
-	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+		      "lfence", X86_FEATURE_LFENCE_RDTSC);
 }
 
 #endif
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
index d66ab799b35f..8c0c1a2770c8 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -1,12 +1,12 @@
 
-MEMCPY_FN(__memcpy,
+MEMCPY_FN(memcpy_orig,
	  "x86-64-unrolled",
	  "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
 
-MEMCPY_FN(memcpy_c,
+MEMCPY_FN(__memcpy,
	  "x86-64-movsq",
	  "movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
 
-MEMCPY_FN(memcpy_c_e,
+MEMCPY_FN(memcpy_erms,
	  "x86-64-movsb",
	  "movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
index fcd9cf00600a..e4c2c30143b9 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -1,8 +1,6 @@
 #define memcpy MEMCPY /* don't hide glibc's memcpy() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
-#define Lmemcpy_c globl memcpy_c; memcpy_c
-#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
 #include "../../../arch/x86/lib/memcpy_64.S"
 /*
  * We need to provide note.GNU-stack section, saying that we want
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index db1d3a29d97f..d3dfb7936dcd 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -36,7 +36,7 @@ static const struct option options[] = {
		    "Specify length of memory to copy. "
		    "Available units: B, KB, MB, GB and TB (upper and lower)"),
	OPT_STRING('r', "routine", &routine, "default",
-		   "Specify routine to copy"),
+		   "Specify routine to copy, \"all\" runs all available routines"),
	OPT_INTEGER('i', "iterations", &iterations,
		    "repeat memcpy() invocation this number of times"),
	OPT_BOOLEAN('c', "cycle", &use_cycle,
@@ -135,55 +135,16 @@ struct bench_mem_info {
 	const char *const *usage;
 };
 
-static int bench_mem_common(int argc, const char **argv,
-			    const char *prefix __maybe_unused,
-			    struct bench_mem_info *info)
+static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen)
 {
-	int i;
-	size_t len;
-	double totallen;
+	const struct routine *r = &info->routines[r_idx];
 	double result_bps[2];
 	u64 result_cycle[2];
 
-	argc = parse_options(argc, argv, options,
-			     info->usage, 0);
-
-	if (no_prefault && only_prefault) {
-		fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
-		return 1;
-	}
-
-	if (use_cycle)
-		init_cycle();
-
-	len = (size_t)perf_atoll((char *)length_str);
-	totallen = (double)len * iterations;
-
 	result_cycle[0] = result_cycle[1] = 0ULL;
 	result_bps[0] = result_bps[1] = 0.0;
 
-	if ((s64)len <= 0) {
-		fprintf(stderr, "Invalid length:%s\n", length_str);
-		return 1;
-	}
-
-	/* same to without specifying either of prefault and no-prefault */
-	if (only_prefault && no_prefault)
-		only_prefault = no_prefault = false;
-
-	for (i = 0; info->routines[i].name; i++) {
-		if (!strcmp(info->routines[i].name, routine))
-			break;
-	}
-	if (!info->routines[i].name) {
-		printf("Unknown routine:%s\n", routine);
-		printf("Available routines...\n");
-		for (i = 0; info->routines[i].name; i++) {
-			printf("\t%s ... %s\n",
-			       info->routines[i].name, info->routines[i].desc);
-		}
-		return 1;
-	}
+	printf("Routine %s (%s)\n", r->name, r->desc);
 
 	if (bench_format == BENCH_FORMAT_DEFAULT)
 		printf("# Copying %s Bytes ...\n\n", length_str);
@@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv,
 	if (!only_prefault && !no_prefault) {
 		/* show both of results */
 		if (use_cycle) {
-			result_cycle[0] =
-				info->do_cycle(&info->routines[i], len, false);
-			result_cycle[1] =
-				info->do_cycle(&info->routines[i], len, true);
+			result_cycle[0] = info->do_cycle(r, len, false);
+			result_cycle[1] = info->do_cycle(r, len, true);
 		} else {
-			result_bps[0] =
-				info->do_gettimeofday(&info->routines[i],
-						len, false);
-			result_bps[1] =
-				info->do_gettimeofday(&info->routines[i],
-						len, true);
+			result_bps[0] = info->do_gettimeofday(r, len, false);
+			result_bps[1] = info->do_gettimeofday(r, len, true);
 		}
 	} else {
-		if (use_cycle) {
-			result_cycle[pf] =
-				info->do_cycle(&info->routines[i],
-					len, only_prefault);
-		} else {
-			result_bps[pf] =
-				info->do_gettimeofday(&info->routines[i],
-					len, only_prefault);
-		}
+		if (use_cycle)
+			result_cycle[pf] = info->do_cycle(r, len, only_prefault);
+		else
+			result_bps[pf] = info->do_gettimeofday(r, len, only_prefault);
 	}
 
 	switch (bench_format) {
@@ -265,6 +215,60 @@
 		die("unknown format: %d\n", bench_format);
 		break;
 	}
+}
+
+static int bench_mem_common(int argc, const char **argv,
+			    const char *prefix __maybe_unused,
+			    struct bench_mem_info *info)
223 | { | ||
224 | int i; | ||
225 | size_t len; | ||
226 | double totallen; | ||
227 | |||
228 | argc = parse_options(argc, argv, options, | ||
229 | info->usage, 0); | ||
230 | |||
231 | if (no_prefault && only_prefault) { | ||
232 | fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n"); | ||
233 | return 1; | ||
234 | } | ||
235 | |||
236 | if (use_cycle) | ||
237 | init_cycle(); | ||
238 | |||
239 | len = (size_t)perf_atoll((char *)length_str); | ||
240 | totallen = (double)len * iterations; | ||
241 | |||
242 | if ((s64)len <= 0) { | ||
243 | fprintf(stderr, "Invalid length:%s\n", length_str); | ||
244 | return 1; | ||
245 | } | ||
246 | |||
247 | /* the same as specifying neither prefault nor no-prefault */ | ||
248 | if (only_prefault && no_prefault) | ||
249 | only_prefault = no_prefault = false; | ||
250 | |||
251 | if (!strncmp(routine, "all", 3)) { | ||
252 | for (i = 0; info->routines[i].name; i++) | ||
253 | __bench_mem_routine(info, i, len, totallen); | ||
254 | return 0; | ||
255 | } | ||
256 | |||
257 | for (i = 0; info->routines[i].name; i++) { | ||
258 | if (!strcmp(info->routines[i].name, routine)) | ||
259 | break; | ||
260 | } | ||
261 | if (!info->routines[i].name) { | ||
262 | printf("Unknown routine:%s\n", routine); | ||
263 | printf("Available routines...\n"); | ||
264 | for (i = 0; info->routines[i].name; i++) { | ||
265 | printf("\t%s ... %s\n", | ||
266 | info->routines[i].name, info->routines[i].desc); | ||
267 | } | ||
268 | return 1; | ||
269 | } | ||
270 | |||
271 | __bench_mem_routine(info, i, len, totallen); | ||
268 | 272 | ||
269 | return 0; | 273 | return 0; |
270 | } | 274 | } |
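Only the usage member of struct bench_mem_info is visible in these hunks; the rest of its shape can be inferred from the call sites, where result_cycle[] is u64 and result_bps[] is double. A sketch of that inferred layout, not quoted from the file:

	#include <stdbool.h>
	#include <stddef.h>

	typedef unsigned long long u64;	/* stand-in for the tools/perf u64 */

	struct routine;			/* name/desc/fn entries; a NULL name
					 * terminates the table, as the lookup
					 * loop in bench_mem_common() shows */

	struct bench_mem_info {
		const struct routine *routines;
		u64 (*do_cycle)(const struct routine *r, size_t len,
				bool prefault);
		double (*do_gettimeofday)(const struct routine *r, size_t len,
					  bool prefault);
		const char *const *usage;
	};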
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h index a71dff97c1f5..f02d028771d9 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h | |||
@@ -1,12 +1,12 @@ | |||
1 | 1 | ||
2 | MEMSET_FN(__memset, | 2 | MEMSET_FN(memset_orig, |
3 | "x86-64-unrolled", | 3 | "x86-64-unrolled", |
4 | "unrolled memset() in arch/x86/lib/memset_64.S") | 4 | "unrolled memset() in arch/x86/lib/memset_64.S") |
5 | 5 | ||
6 | MEMSET_FN(memset_c, | 6 | MEMSET_FN(__memset, |
7 | "x86-64-stosq", | 7 | "x86-64-stosq", |
8 | "movsq-based memset() in arch/x86/lib/memset_64.S") | 8 | "movsq-based memset() in arch/x86/lib/memset_64.S") |
9 | 9 | ||
10 | MEMSET_FN(memset_c_e, | 10 | MEMSET_FN(memset_erms, |
11 | "x86-64-stosb", | 11 | "x86-64-stosb", |
12 | "movsb-based memset() in arch/x86/lib/memset_64.S") | 12 | "movsb-based memset() in arch/x86/lib/memset_64.S") |
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S index 9e5af89ed13a..de278784c866 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm.S +++ b/tools/perf/bench/mem-memset-x86-64-asm.S | |||
@@ -1,8 +1,6 @@ | |||
1 | #define memset MEMSET /* don't hide glibc's memset() */ | 1 | #define memset MEMSET /* don't hide glibc's memset() */ |
2 | #define altinstr_replacement text | 2 | #define altinstr_replacement text |
3 | #define globl p2align 4; .globl | 3 | #define globl p2align 4; .globl |
4 | #define Lmemset_c globl memset_c; memset_c | ||
5 | #define Lmemset_c_e globl memset_c_e; memset_c_e | ||
6 | #include "../../../arch/x86/lib/memset_64.S" | 4 | #include "../../../arch/x86/lib/memset_64.S" |
7 | 5 | ||
8 | /* | 6 | /* |
diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h index 6789d788d494..3a3a0f16456a 100644 --- a/tools/perf/util/include/asm/alternative-asm.h +++ b/tools/perf/util/include/asm/alternative-asm.h | |||
@@ -4,5 +4,6 @@ | |||
4 | /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ | 4 | /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ |
5 | 5 | ||
6 | #define altinstruction_entry # | 6 | #define altinstruction_entry # |
7 | #define ALTERNATIVE_2 # | ||
7 | 8 | ||
8 | #endif | 9 | #endif |
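The new ALTERNATIVE_2 stub disables the macro the same way the existing altinstruction_entry stub does: # starts a comment in x86 GNU assembler syntax, so after preprocessing the entire invocation collapses into a comment and assembles to nothing. Illustratively, with a made-up operand list (the real one lives in memcpy_64.S and is not shown here):

	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, "jmp memcpy_erms", X86_FEATURE_ERMS
	# "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, ...	<- what the assembler sees

For the perf build the kernel's boot-time patching thus simply disappears, and each variant is measured as plain code.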