author     Ingo Molnar <mingo@kernel.org>    2015-03-04 00:33:49 -0500
committer  Ingo Molnar <mingo@kernel.org>    2015-03-04 00:36:15 -0500
commit     f8e92fb4b0ffc4d62279ab39f34e798e37e90b0b (patch)
tree       9caa8df664792e64ddcb4ea03fd418a8a529c82e
parent     d2c032e3dc58137a7261a7824d3acce435db1d66 (diff)
parent     dfecb95cdfeaf7872d83a96bec3a606e9cd95c8d (diff)
Merge tag 'alternatives_padding' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp into x86/asm
Pull alternative instructions framework improvements from Borislav Petkov:

 "A more involved rework of the alternatives framework to be able to
  pad instructions and thus make using the alternatives macros more
  straightforward and without having to figure out old and new
  instruction sizes but have the toolchain figure that out for us.

  Furthermore, it optimizes JMPs used so that fetch and decode can be
  relieved with smaller versions of the JMPs, where possible.

  Some stats:

    x86_64 defconfig:

    Alternatives sites total:        2478
    Total padding added (in Bytes):  6051

  The padding is currently done for:

    X86_FEATURE_ALWAYS
    X86_FEATURE_ERMS
    X86_FEATURE_LFENCE_RDTSC
    X86_FEATURE_MFENCE_RDTSC
    X86_FEATURE_SMAP

  This is with the latest version of the patchset. Of course, on each
  machine the alternatives sites actually being patched are a proper
  subset of the total number."

Signed-off-by: Ingo Molnar <mingo@kernel.org>
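For orientation, this is the call-site pattern the series converts to; a minimal sketch lifted from the smap.h and barrier.h hunks below, not extra code added by this merge:

    /* from <asm/smap.h>: before, the old instruction was hand-padded */
    alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP);

    /*
     * after: the macros compute and emit the padding NOPs at build
     * time, so an empty old instruction is fine
     */
    alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);

    /* from <asm/barrier.h>: two replacements for one site, the longer
     * replacement determines the pad length
     */
    alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
                  "lfence", X86_FEATURE_LFENCE_RDTSC);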
-rw-r--r--  arch/x86/include/asm/alternative-asm.h        |  43
-rw-r--r--  arch/x86/include/asm/alternative.h            |  65
-rw-r--r--  arch/x86/include/asm/apic.h                   |   2
-rw-r--r--  arch/x86/include/asm/barrier.h                |   6
-rw-r--r--  arch/x86/include/asm/cpufeature.h             |  30
-rw-r--r--  arch/x86/include/asm/processor.h              |  16
-rw-r--r--  arch/x86/include/asm/smap.h                   |  30
-rw-r--r--  arch/x86/kernel/alternative.c                 | 158
-rw-r--r--  arch/x86/kernel/cpu/amd.c                     |   5
-rw-r--r--  arch/x86/kernel/entry_32.S                    |  12
-rw-r--r--  arch/x86/lib/clear_page_64.S                  |  66
-rw-r--r--  arch/x86/lib/copy_page_64.S                   |  37
-rw-r--r--  arch/x86/lib/copy_user_64.S                   |  46
-rw-r--r--  arch/x86/lib/memcpy_64.S                      |  68
-rw-r--r--  arch/x86/lib/memmove_64.S                     |  19
-rw-r--r--  arch/x86/lib/memset_64.S                      |  61
-rw-r--r--  arch/x86/um/asm/barrier.h                     |   4
-rw-r--r--  tools/perf/bench/mem-memcpy-x86-64-asm-def.h  |   6
-rw-r--r--  tools/perf/bench/mem-memcpy-x86-64-asm.S      |   2
-rw-r--r--  tools/perf/bench/mem-memcpy.c                 | 128
-rw-r--r--  tools/perf/bench/mem-memset-x86-64-asm-def.h  |   6
-rw-r--r--  tools/perf/bench/mem-memset-x86-64-asm.S      |   2
-rw-r--r--  tools/perf/util/include/asm/alternative-asm.h |   1
23 files changed, 433 insertions(+), 380 deletions(-)
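For reference while reading the hunks, the alternatives entry gains a pad-length byte and becomes packed; the full struct after this series looks as follows (the first field is unchanged pre-existing code, shown here only for context):

    struct alt_instr {
            s32 instr_offset;       /* original instruction */
            s32 repl_offset;        /* offset to replacement instruction */
            u16 cpuid;              /* cpuid bit set for replacement */
            u8  instrlen;           /* length of original instruction */
            u8  replacementlen;     /* length of new instruction */
            u8  padlen;             /* length of build-time padding */
    } __packed;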
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c22a47..524bddce0b76 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,53 @@
18 .endm 18 .endm
19#endif 19#endif
20 20
21.macro altinstruction_entry orig alt feature orig_len alt_len 21.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
22 .long \orig - . 22 .long \orig - .
23 .long \alt - . 23 .long \alt - .
24 .word \feature 24 .word \feature
25 .byte \orig_len 25 .byte \orig_len
26 .byte \alt_len 26 .byte \alt_len
27 .byte \pad_len
28.endm
29
30.macro ALTERNATIVE oldinstr, newinstr, feature
31140:
32 \oldinstr
33141:
34 .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
35142:
36
37 .pushsection .altinstructions,"a"
38 altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
39 .popsection
40
41 .pushsection .altinstr_replacement,"ax"
42143:
43 \newinstr
44144:
45 .popsection
46.endm
47
48.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
49140:
50 \oldinstr
51141:
52 .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
53 .skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90
54142:
55
56 .pushsection .altinstructions,"a"
57 altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
58 altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
59 .popsection
60
61 .pushsection .altinstr_replacement,"ax"
62143:
63 \newinstr1
64144:
65 \newinstr2
66145:
67 .popsection
27.endm 68.endm
28 69
29#endif /* __ASSEMBLY__ */ 70#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 473bdbee378a..5aef6a97d80e 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -48,8 +48,9 @@ struct alt_instr {
48 s32 repl_offset; /* offset to replacement instruction */ 48 s32 repl_offset; /* offset to replacement instruction */
49 u16 cpuid; /* cpuid bit set for replacement */ 49 u16 cpuid; /* cpuid bit set for replacement */
50 u8 instrlen; /* length of original instruction */ 50 u8 instrlen; /* length of original instruction */
51 u8 replacementlen; /* length of new instruction, <= instrlen */ 51 u8 replacementlen; /* length of new instruction */
52}; 52 u8 padlen; /* length of build-time padding */
53} __packed;
53 54
54extern void alternative_instructions(void); 55extern void alternative_instructions(void);
55extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); 56extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end)
76} 77}
77#endif /* CONFIG_SMP */ 78#endif /* CONFIG_SMP */
78 79
79#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" 80#define b_replacement(num) "664"#num
81#define e_replacement(num) "665"#num
80 82
81#define b_replacement(number) "663"#number 83#define alt_end_marker "663"
82#define e_replacement(number) "664"#number 84#define alt_slen "662b-661b"
85#define alt_pad_len alt_end_marker"b-662b"
86#define alt_total_slen alt_end_marker"b-661b"
87#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
83 88
84#define alt_slen "662b-661b" 89#define __OLDINSTR(oldinstr, num) \
85#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" 90 "661:\n\t" oldinstr "\n662:\n" \
91 ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
92 "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
86 93
87#define ALTINSTR_ENTRY(feature, number) \ 94#define OLDINSTR(oldinstr, num) \
95 __OLDINSTR(oldinstr, num) \
96 alt_end_marker ":\n"
97
98/*
99 * Pad the second replacement alternative with additional NOPs if it is
100 * additionally longer than the first replacement alternative.
101 */
102#define OLDINSTR_2(oldinstr, num1, num2) \
103 __OLDINSTR(oldinstr, num1) \
104 ".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \
105 "((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n" \
106 alt_end_marker ":\n"
107
108#define ALTINSTR_ENTRY(feature, num) \
88 " .long 661b - .\n" /* label */ \ 109 " .long 661b - .\n" /* label */ \
89 " .long " b_replacement(number)"f - .\n" /* new instruction */ \ 110 " .long " b_replacement(num)"f - .\n" /* new instruction */ \
90 " .word " __stringify(feature) "\n" /* feature bit */ \ 111 " .word " __stringify(feature) "\n" /* feature bit */ \
91 " .byte " alt_slen "\n" /* source len */ \ 112 " .byte " alt_total_slen "\n" /* source len */ \
92 " .byte " alt_rlen(number) "\n" /* replacement len */ 113 " .byte " alt_rlen(num) "\n" /* replacement len */ \
114 " .byte " alt_pad_len "\n" /* pad len */
93 115
94#define DISCARD_ENTRY(number) /* rlen <= slen */ \ 116#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
95 " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" 117 b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
96
97#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \
98 b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
99 118
100/* alternative assembly primitive: */ 119/* alternative assembly primitive: */
101#define ALTERNATIVE(oldinstr, newinstr, feature) \ 120#define ALTERNATIVE(oldinstr, newinstr, feature) \
102 OLDINSTR(oldinstr) \ 121 OLDINSTR(oldinstr, 1) \
103 ".pushsection .altinstructions,\"a\"\n" \ 122 ".pushsection .altinstructions,\"a\"\n" \
104 ALTINSTR_ENTRY(feature, 1) \ 123 ALTINSTR_ENTRY(feature, 1) \
105 ".popsection\n" \ 124 ".popsection\n" \
106 ".pushsection .discard,\"aw\",@progbits\n" \
107 DISCARD_ENTRY(1) \
108 ".popsection\n" \
109 ".pushsection .altinstr_replacement, \"ax\"\n" \ 125 ".pushsection .altinstr_replacement, \"ax\"\n" \
110 ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ 126 ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
111 ".popsection" 127 ".popsection"
112 128
113#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ 129#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
114 OLDINSTR(oldinstr) \ 130 OLDINSTR_2(oldinstr, 1, 2) \
115 ".pushsection .altinstructions,\"a\"\n" \ 131 ".pushsection .altinstructions,\"a\"\n" \
116 ALTINSTR_ENTRY(feature1, 1) \ 132 ALTINSTR_ENTRY(feature1, 1) \
117 ALTINSTR_ENTRY(feature2, 2) \ 133 ALTINSTR_ENTRY(feature2, 2) \
118 ".popsection\n" \ 134 ".popsection\n" \
119 ".pushsection .discard,\"aw\",@progbits\n" \
120 DISCARD_ENTRY(1) \
121 DISCARD_ENTRY(2) \
122 ".popsection\n" \
123 ".pushsection .altinstr_replacement, \"ax\"\n" \ 135 ".pushsection .altinstr_replacement, \"ax\"\n" \
124 ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ 136 ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
125 ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ 137 ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
@@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
146#define alternative(oldinstr, newinstr, feature) \ 158#define alternative(oldinstr, newinstr, feature) \
147 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") 159 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
148 160
161#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
162 asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
163
149/* 164/*
150 * Alternative inline assembly with input. 165 * Alternative inline assembly with input.
151 * 166 *
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efc3b22d896e..8118e94d50ab 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
91{ 91{
92 volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); 92 volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
93 93
94 alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, 94 alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
95 ASM_OUTPUT2("=r" (v), "=m" (*addr)), 95 ASM_OUTPUT2("=r" (v), "=m" (*addr)),
96 ASM_OUTPUT2("0" (v), "m" (*addr))); 96 ASM_OUTPUT2("0" (v), "m" (*addr)));
97} 97}
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 2ab1eb33106e..959e45b81fe2 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -95,13 +95,11 @@ do { \
95 * Stop RDTSC speculation. This is needed when you need to use RDTSC 95 * Stop RDTSC speculation. This is needed when you need to use RDTSC
96 * (or get_cycles or vread that possibly accesses the TSC) in a defined 96 * (or get_cycles or vread that possibly accesses the TSC) in a defined
97 * code region. 97 * code region.
98 *
99 * (Could use an alternative three way for this if there was one.)
100 */ 98 */
101static __always_inline void rdtsc_barrier(void) 99static __always_inline void rdtsc_barrier(void)
102{ 100{
103 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); 101 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
104 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 102 "lfence", X86_FEATURE_LFENCE_RDTSC);
105} 103}
106 104
107#endif /* _ASM_X86_BARRIER_H */ 105#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index d6428ea5d316..0f7a5a1a8db2 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -419,6 +419,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
419 " .word %P0\n" /* 1: do replace */ 419 " .word %P0\n" /* 1: do replace */
420 " .byte 2b - 1b\n" /* source len */ 420 " .byte 2b - 1b\n" /* source len */
421 " .byte 0\n" /* replacement len */ 421 " .byte 0\n" /* replacement len */
422 " .byte 0\n" /* pad len */
422 ".previous\n" 423 ".previous\n"
423 /* skipping size check since replacement size = 0 */ 424 /* skipping size check since replacement size = 0 */
424 : : "i" (X86_FEATURE_ALWAYS) : : t_warn); 425 : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -433,6 +434,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
433 " .word %P0\n" /* feature bit */ 434 " .word %P0\n" /* feature bit */
434 " .byte 2b - 1b\n" /* source len */ 435 " .byte 2b - 1b\n" /* source len */
435 " .byte 0\n" /* replacement len */ 436 " .byte 0\n" /* replacement len */
437 " .byte 0\n" /* pad len */
436 ".previous\n" 438 ".previous\n"
437 /* skipping size check since replacement size = 0 */ 439 /* skipping size check since replacement size = 0 */
438 : : "i" (bit) : : t_no); 440 : : "i" (bit) : : t_no);
@@ -458,6 +460,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
458 " .word %P1\n" /* feature bit */ 460 " .word %P1\n" /* feature bit */
459 " .byte 2b - 1b\n" /* source len */ 461 " .byte 2b - 1b\n" /* source len */
460 " .byte 4f - 3f\n" /* replacement len */ 462 " .byte 4f - 3f\n" /* replacement len */
463 " .byte 0\n" /* pad len */
461 ".previous\n" 464 ".previous\n"
462 ".section .discard,\"aw\",@progbits\n" 465 ".section .discard,\"aw\",@progbits\n"
463 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ 466 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -484,31 +487,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
484static __always_inline __pure bool _static_cpu_has_safe(u16 bit) 487static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
485{ 488{
486#ifdef CC_HAVE_ASM_GOTO 489#ifdef CC_HAVE_ASM_GOTO
487/* 490 asm_volatile_goto("1: jmp %l[t_dynamic]\n"
488 * We need to spell the jumps to the compiler because, depending on the offset,
489 * the replacement jump can be bigger than the original jump, and this we cannot
490 * have. Thus, we force the jump to the widest, 4-byte, signed relative
491 * offset even though the last would often fit in less bytes.
492 */
493 asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
494 "2:\n" 491 "2:\n"
492 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
493 "((5f-4f) - (2b-1b)),0x90\n"
494 "3:\n"
495 ".section .altinstructions,\"a\"\n" 495 ".section .altinstructions,\"a\"\n"
496 " .long 1b - .\n" /* src offset */ 496 " .long 1b - .\n" /* src offset */
497 " .long 3f - .\n" /* repl offset */ 497 " .long 4f - .\n" /* repl offset */
498 " .word %P1\n" /* always replace */ 498 " .word %P1\n" /* always replace */
499 " .byte 2b - 1b\n" /* src len */ 499 " .byte 3b - 1b\n" /* src len */
500 " .byte 4f - 3f\n" /* repl len */ 500 " .byte 5f - 4f\n" /* repl len */
501 " .byte 3b - 2b\n" /* pad len */
501 ".previous\n" 502 ".previous\n"
502 ".section .altinstr_replacement,\"ax\"\n" 503 ".section .altinstr_replacement,\"ax\"\n"
503 "3: .byte 0xe9\n .long %l[t_no] - 2b\n" 504 "4: jmp %l[t_no]\n"
504 "4:\n" 505 "5:\n"
505 ".previous\n" 506 ".previous\n"
506 ".section .altinstructions,\"a\"\n" 507 ".section .altinstructions,\"a\"\n"
507 " .long 1b - .\n" /* src offset */ 508 " .long 1b - .\n" /* src offset */
508 " .long 0\n" /* no replacement */ 509 " .long 0\n" /* no replacement */
509 " .word %P0\n" /* feature bit */ 510 " .word %P0\n" /* feature bit */
510 " .byte 2b - 1b\n" /* src len */ 511 " .byte 3b - 1b\n" /* src len */
511 " .byte 0\n" /* repl len */ 512 " .byte 0\n" /* repl len */
513 " .byte 0\n" /* pad len */
512 ".previous\n" 514 ".previous\n"
513 : : "i" (bit), "i" (X86_FEATURE_ALWAYS) 515 : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
514 : : t_dynamic, t_no); 516 : : t_dynamic, t_no);
@@ -528,6 +530,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
528 " .word %P2\n" /* always replace */ 530 " .word %P2\n" /* always replace */
529 " .byte 2b - 1b\n" /* source len */ 531 " .byte 2b - 1b\n" /* source len */
530 " .byte 4f - 3f\n" /* replacement len */ 532 " .byte 4f - 3f\n" /* replacement len */
533 " .byte 0\n" /* pad len */
531 ".previous\n" 534 ".previous\n"
532 ".section .discard,\"aw\",@progbits\n" 535 ".section .discard,\"aw\",@progbits\n"
533 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ 536 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -542,6 +545,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
542 " .word %P1\n" /* feature bit */ 545 " .word %P1\n" /* feature bit */
543 " .byte 4b - 3b\n" /* src len */ 546 " .byte 4b - 3b\n" /* src len */
544 " .byte 6f - 5f\n" /* repl len */ 547 " .byte 6f - 5f\n" /* repl len */
548 " .byte 0\n" /* pad len */
545 ".previous\n" 549 ".previous\n"
546 ".section .discard,\"aw\",@progbits\n" 550 ".section .discard,\"aw\",@progbits\n"
547 " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ 551 " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c93588cef..7be2c9a6caba 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -761,10 +761,10 @@ extern char ignore_fpu_irq;
761#define ARCH_HAS_SPINLOCK_PREFETCH 761#define ARCH_HAS_SPINLOCK_PREFETCH
762 762
763#ifdef CONFIG_X86_32 763#ifdef CONFIG_X86_32
764# define BASE_PREFETCH ASM_NOP4 764# define BASE_PREFETCH ""
765# define ARCH_HAS_PREFETCH 765# define ARCH_HAS_PREFETCH
766#else 766#else
767# define BASE_PREFETCH "prefetcht0 (%1)" 767# define BASE_PREFETCH "prefetcht0 %P1"
768#endif 768#endif
769 769
770/* 770/*
@@ -775,10 +775,9 @@ extern char ignore_fpu_irq;
775 */ 775 */
776static inline void prefetch(const void *x) 776static inline void prefetch(const void *x)
777{ 777{
778 alternative_input(BASE_PREFETCH, 778 alternative_input(BASE_PREFETCH, "prefetchnta %P1",
779 "prefetchnta (%1)",
780 X86_FEATURE_XMM, 779 X86_FEATURE_XMM,
781 "r" (x)); 780 "m" (*(const char *)x));
782} 781}
783 782
784/* 783/*
@@ -788,10 +787,9 @@ static inline void prefetch(const void *x)
788 */ 787 */
789static inline void prefetchw(const void *x) 788static inline void prefetchw(const void *x)
790{ 789{
791 alternative_input(BASE_PREFETCH, 790 alternative_input(BASE_PREFETCH, "prefetchw %P1",
792 "prefetchw (%1)", 791 X86_FEATURE_3DNOWPREFETCH,
793 X86_FEATURE_3DNOW, 792 "m" (*(const char *)x));
794 "r" (x));
795} 793}
796 794
797static inline void spin_lock_prefetch(const void *x) 795static inline void spin_lock_prefetch(const void *x)
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8d3120f4e270..ba665ebd17bb 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -27,23 +27,11 @@
27 27
28#ifdef CONFIG_X86_SMAP 28#ifdef CONFIG_X86_SMAP
29 29
30#define ASM_CLAC \ 30#define ASM_CLAC \
31 661: ASM_NOP3 ; \ 31 ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
32 .pushsection .altinstr_replacement, "ax" ; \ 32
33 662: __ASM_CLAC ; \ 33#define ASM_STAC \
34 .popsection ; \ 34 ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
35 .pushsection .altinstructions, "a" ; \
36 altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
37 .popsection
38
39#define ASM_STAC \
40 661: ASM_NOP3 ; \
41 .pushsection .altinstr_replacement, "ax" ; \
42 662: __ASM_STAC ; \
43 .popsection ; \
44 .pushsection .altinstructions, "a" ; \
45 altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
46 .popsection
47 35
48#else /* CONFIG_X86_SMAP */ 36#else /* CONFIG_X86_SMAP */
49 37
@@ -61,20 +49,20 @@
61static __always_inline void clac(void) 49static __always_inline void clac(void)
62{ 50{
63 /* Note: a barrier is implicit in alternative() */ 51 /* Note: a barrier is implicit in alternative() */
64 alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); 52 alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
65} 53}
66 54
67static __always_inline void stac(void) 55static __always_inline void stac(void)
68{ 56{
69 /* Note: a barrier is implicit in alternative() */ 57 /* Note: a barrier is implicit in alternative() */
70 alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); 58 alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
71} 59}
72 60
73/* These macros can be used in asm() statements */ 61/* These macros can be used in asm() statements */
74#define ASM_CLAC \ 62#define ASM_CLAC \
75 ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) 63 ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
76#define ASM_STAC \ 64#define ASM_STAC \
77 ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) 65 ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
78 66
79#else /* CONFIG_X86_SMAP */ 67#else /* CONFIG_X86_SMAP */
80 68
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..af397cc98d05 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
52__setup("noreplace-paravirt", setup_noreplace_paravirt); 52__setup("noreplace-paravirt", setup_noreplace_paravirt);
53#endif 53#endif
54 54
55#define DPRINTK(fmt, ...) \ 55#define DPRINTK(fmt, args...) \
56do { \ 56do { \
57 if (debug_alternative) \ 57 if (debug_alternative) \
58 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ 58 printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
59} while (0)
60
61#define DUMP_BYTES(buf, len, fmt, args...) \
62do { \
63 if (unlikely(debug_alternative)) { \
64 int j; \
65 \
66 if (!(len)) \
67 break; \
68 \
69 printk(KERN_DEBUG fmt, ##args); \
70 for (j = 0; j < (len) - 1; j++) \
71 printk(KERN_CONT "%02hhx ", buf[j]); \
72 printk(KERN_CONT "%02hhx\n", buf[j]); \
73 } \
59} while (0) 74} while (0)
60 75
61/* 76/*
@@ -243,12 +258,86 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
243extern s32 __smp_locks[], __smp_locks_end[]; 258extern s32 __smp_locks[], __smp_locks_end[];
244void *text_poke_early(void *addr, const void *opcode, size_t len); 259void *text_poke_early(void *addr, const void *opcode, size_t len);
245 260
246/* Replace instructions with better alternatives for this CPU type. 261/*
247 This runs before SMP is initialized to avoid SMP problems with 262 * Are we looking at a near JMP with a 1 or 4-byte displacement.
248 self modifying code. This implies that asymmetric systems where 263 */
249 APs have less capabilities than the boot processor are not handled. 264static inline bool is_jmp(const u8 opcode)
250 Tough. Make sure you disable such features by hand. */ 265{
266 return opcode == 0xeb || opcode == 0xe9;
267}
268
269static void __init_or_module
270recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
271{
272 u8 *next_rip, *tgt_rip;
273 s32 n_dspl, o_dspl;
274 int repl_len;
275
276 if (a->replacementlen != 5)
277 return;
278
279 o_dspl = *(s32 *)(insnbuf + 1);
280
281 /* next_rip of the replacement JMP */
282 next_rip = repl_insn + a->replacementlen;
283 /* target rip of the replacement JMP */
284 tgt_rip = next_rip + o_dspl;
285 n_dspl = tgt_rip - orig_insn;
286
287 DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
288
289 if (tgt_rip - orig_insn >= 0) {
290 if (n_dspl - 2 <= 127)
291 goto two_byte_jmp;
292 else
293 goto five_byte_jmp;
294 /* negative offset */
295 } else {
296 if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
297 goto two_byte_jmp;
298 else
299 goto five_byte_jmp;
300 }
301
302two_byte_jmp:
303 n_dspl -= 2;
304
305 insnbuf[0] = 0xeb;
306 insnbuf[1] = (s8)n_dspl;
307 add_nops(insnbuf + 2, 3);
308
309 repl_len = 2;
310 goto done;
311
312five_byte_jmp:
313 n_dspl -= 5;
251 314
315 insnbuf[0] = 0xe9;
316 *(s32 *)&insnbuf[1] = n_dspl;
317
318 repl_len = 5;
319
320done:
321
322 DPRINTK("final displ: 0x%08x, JMP 0x%lx",
323 n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
324}
325
326static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
327{
328 add_nops(instr + (a->instrlen - a->padlen), a->padlen);
329
330 DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
331 instr, a->instrlen - a->padlen, a->padlen);
332}
333
334/*
335 * Replace instructions with better alternatives for this CPU type. This runs
336 * before SMP is initialized to avoid SMP problems with self modifying code.
337 * This implies that asymmetric systems where APs have less capabilities than
338 * the boot processor are not handled. Tough. Make sure you disable such
339 * features by hand.
340 */
252void __init_or_module apply_alternatives(struct alt_instr *start, 341void __init_or_module apply_alternatives(struct alt_instr *start,
253 struct alt_instr *end) 342 struct alt_instr *end)
254{ 343{
@@ -256,10 +345,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
256 u8 *instr, *replacement; 345 u8 *instr, *replacement;
257 u8 insnbuf[MAX_PATCH_LEN]; 346 u8 insnbuf[MAX_PATCH_LEN];
258 347
259 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 348 DPRINTK("alt table %p -> %p", start, end);
260 /* 349 /*
261 * The scan order should be from start to end. A later scanned 350 * The scan order should be from start to end. A later scanned
262 * alternative code can overwrite a previous scanned alternative code. 351 * alternative code can overwrite previously scanned alternative code.
263 * Some kernel functions (e.g. memcpy, memset, etc) use this order to 352 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
264 * patch code. 353 * patch code.
265 * 354 *
@@ -267,29 +356,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
267 * order. 356 * order.
268 */ 357 */
269 for (a = start; a < end; a++) { 358 for (a = start; a < end; a++) {
359 int insnbuf_sz = 0;
360
270 instr = (u8 *)&a->instr_offset + a->instr_offset; 361 instr = (u8 *)&a->instr_offset + a->instr_offset;
271 replacement = (u8 *)&a->repl_offset + a->repl_offset; 362 replacement = (u8 *)&a->repl_offset + a->repl_offset;
272 BUG_ON(a->replacementlen > a->instrlen);
273 BUG_ON(a->instrlen > sizeof(insnbuf)); 363 BUG_ON(a->instrlen > sizeof(insnbuf));
274 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); 364 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
275 if (!boot_cpu_has(a->cpuid)) 365 if (!boot_cpu_has(a->cpuid)) {
366 if (a->padlen > 1)
367 optimize_nops(a, instr);
368
276 continue; 369 continue;
370 }
371
372 DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)",
373 a->cpuid >> 5,
374 a->cpuid & 0x1f,
375 instr, a->instrlen,
376 replacement, a->replacementlen);
377
378 DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
379 DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
277 380
278 memcpy(insnbuf, replacement, a->replacementlen); 381 memcpy(insnbuf, replacement, a->replacementlen);
382 insnbuf_sz = a->replacementlen;
279 383
280 /* 0xe8 is a relative jump; fix the offset. */ 384 /* 0xe8 is a relative jump; fix the offset. */
281 if (*insnbuf == 0xe8 && a->replacementlen == 5) 385 if (*insnbuf == 0xe8 && a->replacementlen == 5) {
282 *(s32 *)(insnbuf + 1) += replacement - instr; 386 *(s32 *)(insnbuf + 1) += replacement - instr;
387 DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
388 *(s32 *)(insnbuf + 1),
389 (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
390 }
391
392 if (a->replacementlen && is_jmp(replacement[0]))
393 recompute_jump(a, instr, replacement, insnbuf);
283 394
284 add_nops(insnbuf + a->replacementlen, 395 if (a->instrlen > a->replacementlen) {
285 a->instrlen - a->replacementlen); 396 add_nops(insnbuf + a->replacementlen,
397 a->instrlen - a->replacementlen);
398 insnbuf_sz += a->instrlen - a->replacementlen;
399 }
400 DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
286 401
287 text_poke_early(instr, insnbuf, a->instrlen); 402 text_poke_early(instr, insnbuf, insnbuf_sz);
288 } 403 }
289} 404}
290 405
291#ifdef CONFIG_SMP 406#ifdef CONFIG_SMP
292
293static void alternatives_smp_lock(const s32 *start, const s32 *end, 407static void alternatives_smp_lock(const s32 *start, const s32 *end,
294 u8 *text, u8 *text_end) 408 u8 *text, u8 *text_end)
295{ 409{
@@ -371,8 +485,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
371 smp->locks_end = locks_end; 485 smp->locks_end = locks_end;
372 smp->text = text; 486 smp->text = text;
373 smp->text_end = text_end; 487 smp->text_end = text_end;
374 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", 488 DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
375 __func__, smp->locks, smp->locks_end, 489 smp->locks, smp->locks_end,
376 smp->text, smp->text_end, smp->name); 490 smp->text, smp->text_end, smp->name);
377 491
378 list_add_tail(&smp->next, &smp_alt_modules); 492 list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +554,7 @@ int alternatives_text_reserved(void *start, void *end)
440 554
441 return 0; 555 return 0;
442} 556}
443#endif 557#endif /* CONFIG_SMP */
444 558
445#ifdef CONFIG_PARAVIRT 559#ifdef CONFIG_PARAVIRT
446void __init_or_module apply_paravirt(struct paravirt_patch_site *start, 560void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..dd9e50500297 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c)
711 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); 711 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
712 712
713 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 713 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
714
715 /* 3DNow or LM implies PREFETCHW */
716 if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
717 if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
718 set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
714} 719}
715 720
716#ifdef CONFIG_X86_32 721#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5bf3e38..7e0323cc0b7d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error)
816 pushl_cfi $0 816 pushl_cfi $0
817#ifdef CONFIG_X86_INVD_BUG 817#ifdef CONFIG_X86_INVD_BUG
818 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 818 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
819661: pushl_cfi $do_general_protection 819 ALTERNATIVE "pushl_cfi $do_general_protection", \
820662: 820 "pushl $do_simd_coprocessor_error", \
821.section .altinstructions,"a" 821 X86_FEATURE_XMM
822 altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
823.previous
824.section .altinstr_replacement,"ax"
825663: pushl $do_simd_coprocessor_error
826664:
827.previous
828#else 822#else
829 pushl_cfi $do_simd_coprocessor_error 823 pushl_cfi $do_simd_coprocessor_error
830#endif 824#endif
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f2145cfa12a6..e67e579c93bd 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,31 +1,35 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <asm/dwarf2.h> 2#include <asm/dwarf2.h>
3#include <asm/cpufeature.h>
3#include <asm/alternative-asm.h> 4#include <asm/alternative-asm.h>
4 5
5/* 6/*
6 * Zero a page. 7 * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
7 * rdi page 8 * recommended to use this when possible and we do use them by default.
8 */ 9 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
9ENTRY(clear_page_c) 10 * Otherwise, use original.
11 */
12
13/*
14 * Zero a page.
15 * %rdi - page
16 */
17ENTRY(clear_page)
10 CFI_STARTPROC 18 CFI_STARTPROC
19
20 ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
21 "jmp clear_page_c_e", X86_FEATURE_ERMS
22
11 movl $4096/8,%ecx 23 movl $4096/8,%ecx
12 xorl %eax,%eax 24 xorl %eax,%eax
13 rep stosq 25 rep stosq
14 ret 26 ret
15 CFI_ENDPROC 27 CFI_ENDPROC
16ENDPROC(clear_page_c) 28ENDPROC(clear_page)
17 29
18ENTRY(clear_page_c_e) 30ENTRY(clear_page_orig)
19 CFI_STARTPROC 31 CFI_STARTPROC
20 movl $4096,%ecx
21 xorl %eax,%eax
22 rep stosb
23 ret
24 CFI_ENDPROC
25ENDPROC(clear_page_c_e)
26 32
27ENTRY(clear_page)
28 CFI_STARTPROC
29 xorl %eax,%eax 33 xorl %eax,%eax
30 movl $4096/64,%ecx 34 movl $4096/64,%ecx
31 .p2align 4 35 .p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
45 nop 49 nop
46 ret 50 ret
47 CFI_ENDPROC 51 CFI_ENDPROC
48.Lclear_page_end: 52ENDPROC(clear_page_orig)
49ENDPROC(clear_page)
50
51 /*
52 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
53 * It is recommended to use this when possible.
54 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
55 * Otherwise, use original function.
56 *
57 */
58 53
59#include <asm/cpufeature.h> 54ENTRY(clear_page_c_e)
60 55 CFI_STARTPROC
61 .section .altinstr_replacement,"ax" 56 movl $4096,%ecx
621: .byte 0xeb /* jmp <disp8> */ 57 xorl %eax,%eax
63 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ 58 rep stosb
642: .byte 0xeb /* jmp <disp8> */ 59 ret
65 .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ 60 CFI_ENDPROC
663: 61ENDPROC(clear_page_c_e)
67 .previous
68 .section .altinstructions,"a"
69 altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
70 .Lclear_page_end-clear_page, 2b-1b
71 altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
72 .Lclear_page_end-clear_page,3b-2b
73 .previous
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 176cca67212b..8239dbcbf984 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4#include <asm/dwarf2.h>
5#include <asm/cpufeature.h>
5#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
6 7
8/*
9 * Some CPUs run faster using the string copy instructions (sane microcode).
10 * It is also a lot simpler. Use this when possible. But, don't use streaming
11 * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
12 * prefetch distance based on SMP/UP.
13 */
7 ALIGN 14 ALIGN
8copy_page_rep: 15ENTRY(copy_page)
9 CFI_STARTPROC 16 CFI_STARTPROC
17 ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
10 movl $4096/8, %ecx 18 movl $4096/8, %ecx
11 rep movsq 19 rep movsq
12 ret 20 ret
13 CFI_ENDPROC 21 CFI_ENDPROC
14ENDPROC(copy_page_rep) 22ENDPROC(copy_page)
15
16/*
17 * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
18 * Could vary the prefetch distance based on SMP/UP.
19*/
20 23
21ENTRY(copy_page) 24ENTRY(copy_page_regs)
22 CFI_STARTPROC 25 CFI_STARTPROC
23 subq $2*8, %rsp 26 subq $2*8, %rsp
24 CFI_ADJUST_CFA_OFFSET 2*8 27 CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
90 addq $2*8, %rsp 93 addq $2*8, %rsp
91 CFI_ADJUST_CFA_OFFSET -2*8 94 CFI_ADJUST_CFA_OFFSET -2*8
92 ret 95 ret
93.Lcopy_page_end:
94 CFI_ENDPROC 96 CFI_ENDPROC
95ENDPROC(copy_page) 97ENDPROC(copy_page_regs)
96
97 /* Some CPUs run faster using the string copy instructions.
98 It is also a lot simpler. Use this when possible */
99
100#include <asm/cpufeature.h>
101
102 .section .altinstr_replacement,"ax"
1031: .byte 0xeb /* jmp <disp8> */
104 .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */
1052:
106 .previous
107 .section .altinstructions,"a"
108 altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
109 .Lcopy_page_end-copy_page, 2b-1b
110 .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dee945d55594..fa997dfaef24 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,9 +8,6 @@
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
11
12#define FIX_ALIGNMENT 1
13
14#include <asm/current.h> 11#include <asm/current.h>
15#include <asm/asm-offsets.h> 12#include <asm/asm-offsets.h>
16#include <asm/thread_info.h> 13#include <asm/thread_info.h>
@@ -19,33 +16,7 @@
19#include <asm/asm.h> 16#include <asm/asm.h>
20#include <asm/smap.h> 17#include <asm/smap.h>
21 18
22/*
23 * By placing feature2 after feature1 in altinstructions section, we logically
24 * implement:
25 * If CPU has feature2, jmp to alt2 is used
26 * else if CPU has feature1, jmp to alt1 is used
27 * else jmp to orig is used.
28 */
29 .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
300:
31 .byte 0xe9 /* 32bit jump */
32 .long \orig-1f /* by default jump to orig */
331:
34 .section .altinstr_replacement,"ax"
352: .byte 0xe9 /* near jump with 32bit immediate */
36 .long \alt1-1b /* offset */ /* or alternatively to alt1 */
373: .byte 0xe9 /* near jump with 32bit immediate */
38 .long \alt2-1b /* offset */ /* or alternatively to alt2 */
39 .previous
40
41 .section .altinstructions,"a"
42 altinstruction_entry 0b,2b,\feature1,5,5
43 altinstruction_entry 0b,3b,\feature2,5,5
44 .previous
45 .endm
46
47 .macro ALIGN_DESTINATION 19 .macro ALIGN_DESTINATION
48#ifdef FIX_ALIGNMENT
49 /* check for bad alignment of destination */ 20 /* check for bad alignment of destination */
50 movl %edi,%ecx 21 movl %edi,%ecx
51 andl $7,%ecx 22 andl $7,%ecx
@@ -67,7 +38,6 @@
67 38
68 _ASM_EXTABLE(100b,103b) 39 _ASM_EXTABLE(100b,103b)
69 _ASM_EXTABLE(101b,103b) 40 _ASM_EXTABLE(101b,103b)
70#endif
71 .endm 41 .endm
72 42
73/* Standard copy_to_user with segment limit checking */ 43/* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
79 jc bad_to_user 49 jc bad_to_user
80 cmpq TI_addr_limit(%rax),%rcx 50 cmpq TI_addr_limit(%rax),%rcx
81 ja bad_to_user 51 ja bad_to_user
82 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ 52 ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
83 copy_user_generic_unrolled,copy_user_generic_string, \ 53 "jmp copy_user_generic_string", \
84 copy_user_enhanced_fast_string 54 X86_FEATURE_REP_GOOD, \
55 "jmp copy_user_enhanced_fast_string", \
56 X86_FEATURE_ERMS
85 CFI_ENDPROC 57 CFI_ENDPROC
86ENDPROC(_copy_to_user) 58ENDPROC(_copy_to_user)
87 59
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
94 jc bad_from_user 66 jc bad_from_user
95 cmpq TI_addr_limit(%rax),%rcx 67 cmpq TI_addr_limit(%rax),%rcx
96 ja bad_from_user 68 ja bad_from_user
97 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ 69 ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
98 copy_user_generic_unrolled,copy_user_generic_string, \ 70 "jmp copy_user_generic_string", \
99 copy_user_enhanced_fast_string 71 X86_FEATURE_REP_GOOD, \
72 "jmp copy_user_enhanced_fast_string", \
73 X86_FEATURE_ERMS
100 CFI_ENDPROC 74 CFI_ENDPROC
101ENDPROC(_copy_from_user) 75ENDPROC(_copy_from_user)
102 76
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 89b53c9968e7..b046664f5a1c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,20 @@
1/* Copyright 2002 Andi Kleen */ 1/* Copyright 2002 Andi Kleen */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4
5#include <asm/cpufeature.h> 4#include <asm/cpufeature.h>
6#include <asm/dwarf2.h> 5#include <asm/dwarf2.h>
7#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
8 7
9/* 8/*
9 * We build a jump to memcpy_orig by default which gets NOPped out on
10 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
11 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
12 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
13 */
14
15.weak memcpy
16
17/*
10 * memcpy - Copy a memory block. 18 * memcpy - Copy a memory block.
11 * 19 *
12 * Input: 20 * Input:
@@ -17,15 +25,11 @@
17 * Output: 25 * Output:
18 * rax original destination 26 * rax original destination
19 */ 27 */
28ENTRY(__memcpy)
29ENTRY(memcpy)
30 ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
31 "jmp memcpy_erms", X86_FEATURE_ERMS
20 32
21/*
22 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
23 *
24 * This gets patched over the unrolled variant (below) via the
25 * alternative instructions framework:
26 */
27 .section .altinstr_replacement, "ax", @progbits
28.Lmemcpy_c:
29 movq %rdi, %rax 33 movq %rdi, %rax
30 movq %rdx, %rcx 34 movq %rdx, %rcx
31 shrq $3, %rcx 35 shrq $3, %rcx
@@ -34,29 +38,21 @@
34 movl %edx, %ecx 38 movl %edx, %ecx
35 rep movsb 39 rep movsb
36 ret 40 ret
37.Lmemcpy_e: 41ENDPROC(memcpy)
38 .previous 42ENDPROC(__memcpy)
39 43
40/* 44/*
41 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than 45 * memcpy_erms() - enhanced fast string memcpy. This is faster and
42 * memcpy_c. Use memcpy_c_e when possible. 46 * simpler than memcpy. Use memcpy_erms when possible.
43 *
44 * This gets patched over the unrolled variant (below) via the
45 * alternative instructions framework:
46 */ 47 */
47 .section .altinstr_replacement, "ax", @progbits 48ENTRY(memcpy_erms)
48.Lmemcpy_c_e:
49 movq %rdi, %rax 49 movq %rdi, %rax
50 movq %rdx, %rcx 50 movq %rdx, %rcx
51 rep movsb 51 rep movsb
52 ret 52 ret
53.Lmemcpy_e_e: 53ENDPROC(memcpy_erms)
54 .previous
55
56.weak memcpy
57 54
58ENTRY(__memcpy) 55ENTRY(memcpy_orig)
59ENTRY(memcpy)
60 CFI_STARTPROC 56 CFI_STARTPROC
61 movq %rdi, %rax 57 movq %rdi, %rax
62 58
@@ -183,26 +179,4 @@ ENTRY(memcpy)
183.Lend: 179.Lend:
184 retq 180 retq
185 CFI_ENDPROC 181 CFI_ENDPROC
186ENDPROC(memcpy) 182ENDPROC(memcpy_orig)
187ENDPROC(__memcpy)
188
189 /*
190 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
191 * If the feature is supported, memcpy_c_e() is the first choice.
192 * If enhanced rep movsb copy is not available, use fast string copy
193 * memcpy_c() when possible. This is faster and code is simpler than
194 * original memcpy().
195 * Otherwise, original memcpy() is used.
196 * In .altinstructions section, ERMS feature is placed after REG_GOOD
197 * feature to implement the right patch order.
198 *
199 * Replace only beginning, memcpy is used to apply alternatives,
200 * so it is silly to overwrite itself with nops - reboot is the
201 * only outcome...
202 */
203 .section .altinstructions, "a"
204 altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
205 .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
206 altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
207 .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
208 .previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 9c4b530575da..0f8a0d0331b9 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -5,7 +5,6 @@
5 * This assembly file is re-written from memmove_64.c file. 5 * This assembly file is re-written from memmove_64.c file.
6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> 6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
7 */ 7 */
8#define _STRING_C
9#include <linux/linkage.h> 8#include <linux/linkage.h>
10#include <asm/dwarf2.h> 9#include <asm/dwarf2.h>
11#include <asm/cpufeature.h> 10#include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
44 jg 2f 43 jg 2f
45 44
46.Lmemmove_begin_forward: 45.Lmemmove_begin_forward:
46 ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
47
47 /* 48 /*
48 * movsq instruction have many startup latency 49 * movsq instruction have many startup latency
49 * so we handle small size by general register. 50 * so we handle small size by general register.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
20713: 20813:
208 retq 209 retq
209 CFI_ENDPROC 210 CFI_ENDPROC
210
211 .section .altinstr_replacement,"ax"
212.Lmemmove_begin_forward_efs:
213 /* Forward moving data. */
214 movq %rdx, %rcx
215 rep movsb
216 retq
217.Lmemmove_end_forward_efs:
218 .previous
219
220 .section .altinstructions,"a"
221 altinstruction_entry .Lmemmove_begin_forward, \
222 .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
223 .Lmemmove_end_forward-.Lmemmove_begin_forward, \
224 .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
225 .previous
226ENDPROC(__memmove) 211ENDPROC(__memmove)
227ENDPROC(memmove) 212ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 6f44935c6a60..93118fb23976 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -5,19 +5,30 @@
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
7 7
8.weak memset
9
8/* 10/*
9 * ISO C memset - set a memory block to a byte value. This function uses fast 11 * ISO C memset - set a memory block to a byte value. This function uses fast
10 * string to get better performance than the original function. The code is 12 * string to get better performance than the original function. The code is
11 * simpler and shorter than the orignal function as well. 13 * simpler and shorter than the orignal function as well.
12 * 14 *
13 * rdi destination 15 * rdi destination
14 * rsi value (char) 16 * rsi value (char)
15 * rdx count (bytes) 17 * rdx count (bytes)
16 * 18 *
17 * rax original destination 19 * rax original destination
18 */ 20 */
19 .section .altinstr_replacement, "ax", @progbits 21ENTRY(memset)
20.Lmemset_c: 22ENTRY(__memset)
23 /*
24 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
25 * to use it when possible. If not available, use fast string instructions.
26 *
27 * Otherwise, use original memset function.
28 */
29 ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
30 "jmp memset_erms", X86_FEATURE_ERMS
31
21 movq %rdi,%r9 32 movq %rdi,%r9
22 movq %rdx,%rcx 33 movq %rdx,%rcx
23 andl $7,%edx 34 andl $7,%edx
@@ -31,8 +42,8 @@
31 rep stosb 42 rep stosb
32 movq %r9,%rax 43 movq %r9,%rax
33 ret 44 ret
34.Lmemset_e: 45ENDPROC(memset)
35 .previous 46ENDPROC(__memset)
36 47
37/* 48/*
38 * ISO C memset - set a memory block to a byte value. This function uses 49 * ISO C memset - set a memory block to a byte value. This function uses
@@ -45,21 +56,16 @@
45 * 56 *
46 * rax original destination 57 * rax original destination
47 */ 58 */
48 .section .altinstr_replacement, "ax", @progbits 59ENTRY(memset_erms)
49.Lmemset_c_e:
50 movq %rdi,%r9 60 movq %rdi,%r9
51 movb %sil,%al 61 movb %sil,%al
52 movq %rdx,%rcx 62 movq %rdx,%rcx
53 rep stosb 63 rep stosb
54 movq %r9,%rax 64 movq %r9,%rax
55 ret 65 ret
56.Lmemset_e_e: 66ENDPROC(memset_erms)
57 .previous
58
59.weak memset
60 67
61ENTRY(memset) 68ENTRY(memset_orig)
62ENTRY(__memset)
63 CFI_STARTPROC 69 CFI_STARTPROC
64 movq %rdi,%r10 70 movq %rdi,%r10
65 71
@@ -134,23 +140,4 @@ ENTRY(__memset)
134 jmp .Lafter_bad_alignment 140 jmp .Lafter_bad_alignment
135.Lfinal: 141.Lfinal:
136 CFI_ENDPROC 142 CFI_ENDPROC
137ENDPROC(memset) 143ENDPROC(memset_orig)
138ENDPROC(__memset)
139
140 /* Some CPUs support enhanced REP MOVSB/STOSB feature.
141 * It is recommended to use this when possible.
142 *
143 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
144 * instructions.
145 *
146 * Otherwise, use original memset function.
147 *
148 * In .altinstructions section, ERMS feature is placed after REG_GOOD
149 * feature to implement the right patch order.
150 */
151 .section .altinstructions,"a"
152 altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
153 .Lfinal-__memset,.Lmemset_e-.Lmemset_c
154 altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
155 .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
156 .previous
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..8ffd2146fa6a 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -64,8 +64,8 @@
64 */ 64 */
65static inline void rdtsc_barrier(void) 65static inline void rdtsc_barrier(void)
66{ 66{
67 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); 67 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
68 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 68 "lfence", X86_FEATURE_LFENCE_RDTSC);
69} 69}
70 70
71#endif 71#endif
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
index d66ab799b35f..8c0c1a2770c8 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -1,12 +1,12 @@
1 1
2MEMCPY_FN(__memcpy, 2MEMCPY_FN(memcpy_orig,
3 "x86-64-unrolled", 3 "x86-64-unrolled",
4 "unrolled memcpy() in arch/x86/lib/memcpy_64.S") 4 "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
5 5
6MEMCPY_FN(memcpy_c, 6MEMCPY_FN(__memcpy,
7 "x86-64-movsq", 7 "x86-64-movsq",
8 "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") 8 "movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
9 9
10MEMCPY_FN(memcpy_c_e, 10MEMCPY_FN(memcpy_erms,
11 "x86-64-movsb", 11 "x86-64-movsb",
12 "movsb-based memcpy() in arch/x86/lib/memcpy_64.S") 12 "movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
index fcd9cf00600a..e4c2c30143b9 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -1,8 +1,6 @@
1#define memcpy MEMCPY /* don't hide glibc's memcpy() */ 1#define memcpy MEMCPY /* don't hide glibc's memcpy() */
2#define altinstr_replacement text 2#define altinstr_replacement text
3#define globl p2align 4; .globl 3#define globl p2align 4; .globl
4#define Lmemcpy_c globl memcpy_c; memcpy_c
5#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
6#include "../../../arch/x86/lib/memcpy_64.S" 4#include "../../../arch/x86/lib/memcpy_64.S"
7/* 5/*
8 * We need to provide note.GNU-stack section, saying that we want 6 * We need to provide note.GNU-stack section, saying that we want
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index db1d3a29d97f..d3dfb7936dcd 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -36,7 +36,7 @@ static const struct option options[] = {
36 "Specify length of memory to copy. " 36 "Specify length of memory to copy. "
37 "Available units: B, KB, MB, GB and TB (upper and lower)"), 37 "Available units: B, KB, MB, GB and TB (upper and lower)"),
38 OPT_STRING('r', "routine", &routine, "default", 38 OPT_STRING('r', "routine", &routine, "default",
39 "Specify routine to copy"), 39 "Specify routine to copy, \"all\" runs all available routines"),
40 OPT_INTEGER('i', "iterations", &iterations, 40 OPT_INTEGER('i', "iterations", &iterations,
41 "repeat memcpy() invocation this number of times"), 41 "repeat memcpy() invocation this number of times"),
42 OPT_BOOLEAN('c', "cycle", &use_cycle, 42 OPT_BOOLEAN('c', "cycle", &use_cycle,
@@ -135,55 +135,16 @@ struct bench_mem_info {
135 const char *const *usage; 135 const char *const *usage;
136}; 136};
137 137
138static int bench_mem_common(int argc, const char **argv, 138static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen)
139 const char *prefix __maybe_unused,
140 struct bench_mem_info *info)
141{ 139{
142 int i; 140 const struct routine *r = &info->routines[r_idx];
143 size_t len;
144 double totallen;
145 double result_bps[2]; 141 double result_bps[2];
146 u64 result_cycle[2]; 142 u64 result_cycle[2];
147 143
148 argc = parse_options(argc, argv, options,
149 info->usage, 0);
150
151 if (no_prefault && only_prefault) {
152 fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
153 return 1;
154 }
155
156 if (use_cycle)
157 init_cycle();
158
159 len = (size_t)perf_atoll((char *)length_str);
160 totallen = (double)len * iterations;
161
162 result_cycle[0] = result_cycle[1] = 0ULL; 144 result_cycle[0] = result_cycle[1] = 0ULL;
163 result_bps[0] = result_bps[1] = 0.0; 145 result_bps[0] = result_bps[1] = 0.0;
164 146
165 if ((s64)len <= 0) { 147 printf("Routine %s (%s)\n", r->name, r->desc);
166 fprintf(stderr, "Invalid length:%s\n", length_str);
167 return 1;
168 }
169
170 /* same to without specifying either of prefault and no-prefault */
171 if (only_prefault && no_prefault)
172 only_prefault = no_prefault = false;
173
174 for (i = 0; info->routines[i].name; i++) {
175 if (!strcmp(info->routines[i].name, routine))
176 break;
177 }
178 if (!info->routines[i].name) {
179 printf("Unknown routine:%s\n", routine);
180 printf("Available routines...\n");
181 for (i = 0; info->routines[i].name; i++) {
182 printf("\t%s ... %s\n",
183 info->routines[i].name, info->routines[i].desc);
184 }
185 return 1;
186 }
187 148
188 if (bench_format == BENCH_FORMAT_DEFAULT) 149 if (bench_format == BENCH_FORMAT_DEFAULT)
189 printf("# Copying %s Bytes ...\n\n", length_str); 150 printf("# Copying %s Bytes ...\n\n", length_str);
@@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv,
191 if (!only_prefault && !no_prefault) { 152 if (!only_prefault && !no_prefault) {
192 /* show both of results */ 153 /* show both of results */
193 if (use_cycle) { 154 if (use_cycle) {
194 result_cycle[0] = 155 result_cycle[0] = info->do_cycle(r, len, false);
195 info->do_cycle(&info->routines[i], len, false); 156 result_cycle[1] = info->do_cycle(r, len, true);
196 result_cycle[1] =
197 info->do_cycle(&info->routines[i], len, true);
198 } else { 157 } else {
199 result_bps[0] = 158 result_bps[0] = info->do_gettimeofday(r, len, false);
200 info->do_gettimeofday(&info->routines[i], 159 result_bps[1] = info->do_gettimeofday(r, len, true);
201 len, false);
202 result_bps[1] =
203 info->do_gettimeofday(&info->routines[i],
204 len, true);
205 } 160 }
206 } else { 161 } else {
207 if (use_cycle) { 162 if (use_cycle)
208 result_cycle[pf] = 163 result_cycle[pf] = info->do_cycle(r, len, only_prefault);
209 info->do_cycle(&info->routines[i], 164 else
210 len, only_prefault); 165 result_bps[pf] = info->do_gettimeofday(r, len, only_prefault);
211 } else {
212 result_bps[pf] =
213 info->do_gettimeofday(&info->routines[i],
214 len, only_prefault);
215 }
216 } 166 }
217 167
218 switch (bench_format) { 168 switch (bench_format) {
@@ -265,6 +215,60 @@ static int bench_mem_common(int argc, const char **argv,
265 die("unknown format: %d\n", bench_format); 215 die("unknown format: %d\n", bench_format);
266 break; 216 break;
267 } 217 }
218}
219
220static int bench_mem_common(int argc, const char **argv,
221 const char *prefix __maybe_unused,
222 struct bench_mem_info *info)
223{
224 int i;
225 size_t len;
226 double totallen;
227
228 argc = parse_options(argc, argv, options,
229 info->usage, 0);
230
231 if (no_prefault && only_prefault) {
232 fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
233 return 1;
234 }
235
236 if (use_cycle)
237 init_cycle();
238
239 len = (size_t)perf_atoll((char *)length_str);
240 totallen = (double)len * iterations;
241
242 if ((s64)len <= 0) {
243 fprintf(stderr, "Invalid length:%s\n", length_str);
244 return 1;
245 }
246
247 /* same to without specifying either of prefault and no-prefault */
248 if (only_prefault && no_prefault)
249 only_prefault = no_prefault = false;
250
251 if (!strncmp(routine, "all", 3)) {
252 for (i = 0; info->routines[i].name; i++)
253 __bench_mem_routine(info, i, len, totallen);
254 return 0;
255 }
256
257 for (i = 0; info->routines[i].name; i++) {
258 if (!strcmp(info->routines[i].name, routine))
259 break;
260 }
261 if (!info->routines[i].name) {
262 printf("Unknown routine:%s\n", routine);
263 printf("Available routines...\n");
264 for (i = 0; info->routines[i].name; i++) {
265 printf("\t%s ... %s\n",
266 info->routines[i].name, info->routines[i].desc);
267 }
268 return 1;
269 }
270
271 __bench_mem_routine(info, i, len, totallen);
268 272
269 return 0; 273 return 0;
270} 274}
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h
index a71dff97c1f5..f02d028771d9 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -1,12 +1,12 @@
1 1
2MEMSET_FN(__memset, 2MEMSET_FN(memset_orig,
3 "x86-64-unrolled", 3 "x86-64-unrolled",
4 "unrolled memset() in arch/x86/lib/memset_64.S") 4 "unrolled memset() in arch/x86/lib/memset_64.S")
5 5
6MEMSET_FN(memset_c, 6MEMSET_FN(__memset,
7 "x86-64-stosq", 7 "x86-64-stosq",
8 "movsq-based memset() in arch/x86/lib/memset_64.S") 8 "movsq-based memset() in arch/x86/lib/memset_64.S")
9 9
10MEMSET_FN(memset_c_e, 10MEMSET_FN(memset_erms,
11 "x86-64-stosb", 11 "x86-64-stosb",
12 "movsb-based memset() in arch/x86/lib/memset_64.S") 12 "movsb-based memset() in arch/x86/lib/memset_64.S")
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S
index 9e5af89ed13a..de278784c866 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm.S
+++ b/tools/perf/bench/mem-memset-x86-64-asm.S
@@ -1,8 +1,6 @@
1#define memset MEMSET /* don't hide glibc's memset() */ 1#define memset MEMSET /* don't hide glibc's memset() */
2#define altinstr_replacement text 2#define altinstr_replacement text
3#define globl p2align 4; .globl 3#define globl p2align 4; .globl
4#define Lmemset_c globl memset_c; memset_c
5#define Lmemset_c_e globl memset_c_e; memset_c_e
6#include "../../../arch/x86/lib/memset_64.S" 4#include "../../../arch/x86/lib/memset_64.S"
7 5
8/* 6/*
diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h
index 6789d788d494..3a3a0f16456a 100644
--- a/tools/perf/util/include/asm/alternative-asm.h
+++ b/tools/perf/util/include/asm/alternative-asm.h
@@ -4,5 +4,6 @@
4/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ 4/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
5 5
6#define altinstruction_entry # 6#define altinstruction_entry #
7#define ALTERNATIVE_2 #
7 8
8#endif 9#endif