author	Arnaldo Carvalho de Melo <acme@redhat.com>	2016-07-11 11:36:41 -0400
committer	Arnaldo Carvalho de Melo <acme@redhat.com>	2016-07-12 14:20:32 -0400
commit	7d7d1bf1d1dabe435ef50efb051724b8664749cb (patch)
tree	25b24227fb2d78e03262785ba893c1fb21306d1c
parent	c4b6014e8bb0c8d47fe5c71ebc604f31091e5d3f (diff)
perf bench: Copy kernel files needed to build mem{cpy,set} x86_64 benchmarks
We can't access kernel files directly from tools/, so copy the required
bits, and make sure that we detect when the original files, in the
kernel, get modified.
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/n/tip-z7e76274ch5j4nugv048qacb@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
-rw-r--r--	tools/arch/x86/include/asm/cpufeatures.h	316
-rw-r--r--	tools/arch/x86/include/asm/disabled-features.h	60
-rw-r--r--	tools/arch/x86/include/asm/required-features.h	103
-rw-r--r--	tools/arch/x86/lib/memcpy_64.S	297
-rw-r--r--	tools/arch/x86/lib/memset_64.S	138
-rw-r--r--	tools/include/asm/alternative-asm.h (renamed from tools/perf/util/include/asm/alternative-asm.h)	4
-rw-r--r--	tools/perf/MANIFEST	9
-rw-r--r--	tools/perf/Makefile.perf	15
-rw-r--r--	tools/perf/bench/mem-memcpy-x86-64-asm.S	2
-rw-r--r--	tools/perf/bench/mem-memset-x86-64-asm.S	2
10 files changed, 939 insertions(+), 7 deletions(-)
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
new file mode 100644
index 000000000000..4a413485f9eb
--- /dev/null
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -0,0 +1,316 @@
#ifndef _ASM_X86_CPUFEATURES_H
#define _ASM_X86_CPUFEATURES_H

#ifndef _ASM_X86_REQUIRED_FEATURES_H
#include <asm/required-features.h>
#endif

#ifndef _ASM_X86_DISABLED_FEATURES_H
#include <asm/disabled-features.h>
#endif

/*
 * Defines x86 CPU feature bits
 */
#define NCAPINTS	18	/* N 32-bit words worth of info */
#define NBUGINTS	1	/* N 32-bit bug flags */

/*
 * Note: If the comment begins with a quoted string, that string is used
 * in /proc/cpuinfo instead of the macro name. If the string is "",
 * this feature bit is not displayed in /proc/cpuinfo at all.
 */

/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
#define X86_FEATURE_FPU		( 0*32+ 0) /* Onboard FPU */
#define X86_FEATURE_VME		( 0*32+ 1) /* Virtual Mode Extensions */
#define X86_FEATURE_DE		( 0*32+ 2) /* Debugging Extensions */
#define X86_FEATURE_PSE		( 0*32+ 3) /* Page Size Extensions */
#define X86_FEATURE_TSC		( 0*32+ 4) /* Time Stamp Counter */
#define X86_FEATURE_MSR		( 0*32+ 5) /* Model-Specific Registers */
#define X86_FEATURE_PAE		( 0*32+ 6) /* Physical Address Extensions */
#define X86_FEATURE_MCE		( 0*32+ 7) /* Machine Check Exception */
#define X86_FEATURE_CX8		( 0*32+ 8) /* CMPXCHG8 instruction */
#define X86_FEATURE_APIC	( 0*32+ 9) /* Onboard APIC */
#define X86_FEATURE_SEP		( 0*32+11) /* SYSENTER/SYSEXIT */
#define X86_FEATURE_MTRR	( 0*32+12) /* Memory Type Range Registers */
#define X86_FEATURE_PGE		( 0*32+13) /* Page Global Enable */
#define X86_FEATURE_MCA		( 0*32+14) /* Machine Check Architecture */
#define X86_FEATURE_CMOV	( 0*32+15) /* CMOV instructions */
					   /* (plus FCMOVcc, FCOMI with FPU) */
#define X86_FEATURE_PAT		( 0*32+16) /* Page Attribute Table */
#define X86_FEATURE_PSE36	( 0*32+17) /* 36-bit PSEs */
#define X86_FEATURE_PN		( 0*32+18) /* Processor serial number */
#define X86_FEATURE_CLFLUSH	( 0*32+19) /* CLFLUSH instruction */
#define X86_FEATURE_DS		( 0*32+21) /* "dts" Debug Store */
#define X86_FEATURE_ACPI	( 0*32+22) /* ACPI via MSR */
#define X86_FEATURE_MMX		( 0*32+23) /* Multimedia Extensions */
#define X86_FEATURE_FXSR	( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
#define X86_FEATURE_XMM		( 0*32+25) /* "sse" */
#define X86_FEATURE_XMM2	( 0*32+26) /* "sse2" */
#define X86_FEATURE_SELFSNOOP	( 0*32+27) /* "ss" CPU self snoop */
#define X86_FEATURE_HT		( 0*32+28) /* Hyper-Threading */
#define X86_FEATURE_ACC		( 0*32+29) /* "tm" Automatic clock control */
#define X86_FEATURE_IA64	( 0*32+30) /* IA-64 processor */
#define X86_FEATURE_PBE		( 0*32+31) /* Pending Break Enable */

/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
/* Don't duplicate feature flags which are redundant with Intel! */
#define X86_FEATURE_SYSCALL	( 1*32+11) /* SYSCALL/SYSRET */
#define X86_FEATURE_MP		( 1*32+19) /* MP Capable. */
#define X86_FEATURE_NX		( 1*32+20) /* Execute Disable */
#define X86_FEATURE_MMXEXT	( 1*32+22) /* AMD MMX extensions */
#define X86_FEATURE_FXSR_OPT	( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
#define X86_FEATURE_GBPAGES	( 1*32+26) /* "pdpe1gb" GB pages */
#define X86_FEATURE_RDTSCP	( 1*32+27) /* RDTSCP */
#define X86_FEATURE_LM		( 1*32+29) /* Long Mode (x86-64) */
#define X86_FEATURE_3DNOWEXT	( 1*32+30) /* AMD 3DNow! extensions */
#define X86_FEATURE_3DNOW	( 1*32+31) /* 3DNow! */

/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
#define X86_FEATURE_RECOVERY	( 2*32+ 0) /* CPU in recovery mode */
#define X86_FEATURE_LONGRUN	( 2*32+ 1) /* Longrun power control */
#define X86_FEATURE_LRTI	( 2*32+ 3) /* LongRun table interface */

/* Other features, Linux-defined mapping, word 3 */
/* This range is used for feature bits which conflict or are synthesized */
#define X86_FEATURE_CXMMX	( 3*32+ 0) /* Cyrix MMX extensions */
#define X86_FEATURE_K6_MTRR	( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
#define X86_FEATURE_CYRIX_ARR	( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
#define X86_FEATURE_CENTAUR_MCR	( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
/* cpu types for specific tunings: */
#define X86_FEATURE_K8		( 3*32+ 4) /* "" Opteron, Athlon64 */
#define X86_FEATURE_K7		( 3*32+ 5) /* "" Athlon */
#define X86_FEATURE_P3		( 3*32+ 6) /* "" P3 */
#define X86_FEATURE_P4		( 3*32+ 7) /* "" P4 */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
#define X86_FEATURE_UP		( 3*32+ 9) /* smp kernel running on up */
#define X86_FEATURE_ART		( 3*32+10) /* Platform has always running timer (ART) */
#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
#define X86_FEATURE_PEBS	( 3*32+12) /* Precise-Event Based Sampling */
#define X86_FEATURE_BTS		( 3*32+13) /* Branch Trace Store */
#define X86_FEATURE_SYSCALL32	( 3*32+14) /* "" syscall in ia32 userspace */
#define X86_FEATURE_SYSENTER32	( 3*32+15) /* "" sysenter in ia32 userspace */
#define X86_FEATURE_REP_GOOD	( 3*32+16) /* rep microcode works well */
#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER	( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL	( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS	( 3*32+21) /* "" Always-present feature */
#define X86_FEATURE_XTOPOLOGY	( 3*32+22) /* cpu topology enum extensions */
#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
#define X86_FEATURE_NONSTOP_TSC	( 3*32+24) /* TSC does not stop in C states */
/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
#define X86_FEATURE_EXTD_APICID	( 3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM	( 3*32+27) /* multi-node processor */
#define X86_FEATURE_APERFMPERF	( 3*32+28) /* APERFMPERF */
#define X86_FEATURE_EAGER_FPU	( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */

/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3	( 4*32+ 0) /* "pni" SSE-3 */
#define X86_FEATURE_PCLMULQDQ	( 4*32+ 1) /* PCLMULQDQ instruction */
#define X86_FEATURE_DTES64	( 4*32+ 2) /* 64-bit Debug Store */
#define X86_FEATURE_MWAIT	( 4*32+ 3) /* "monitor" Monitor/Mwait support */
#define X86_FEATURE_DSCPL	( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
#define X86_FEATURE_VMX		( 4*32+ 5) /* Hardware virtualization */
#define X86_FEATURE_SMX		( 4*32+ 6) /* Safer mode */
#define X86_FEATURE_EST		( 4*32+ 7) /* Enhanced SpeedStep */
#define X86_FEATURE_TM2		( 4*32+ 8) /* Thermal Monitor 2 */
#define X86_FEATURE_SSSE3	( 4*32+ 9) /* Supplemental SSE-3 */
#define X86_FEATURE_CID		( 4*32+10) /* Context ID */
#define X86_FEATURE_SDBG	( 4*32+11) /* Silicon Debug */
#define X86_FEATURE_FMA		( 4*32+12) /* Fused multiply-add */
#define X86_FEATURE_CX16	( 4*32+13) /* CMPXCHG16B */
#define X86_FEATURE_XTPR	( 4*32+14) /* Send Task Priority Messages */
#define X86_FEATURE_PDCM	( 4*32+15) /* Performance Capabilities */
#define X86_FEATURE_PCID	( 4*32+17) /* Process Context Identifiers */
#define X86_FEATURE_DCA		( 4*32+18) /* Direct Cache Access */
#define X86_FEATURE_XMM4_1	( 4*32+19) /* "sse4_1" SSE-4.1 */
#define X86_FEATURE_XMM4_2	( 4*32+20) /* "sse4_2" SSE-4.2 */
#define X86_FEATURE_X2APIC	( 4*32+21) /* x2APIC */
#define X86_FEATURE_MOVBE	( 4*32+22) /* MOVBE instruction */
#define X86_FEATURE_POPCNT	( 4*32+23) /* POPCNT instruction */
#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
#define X86_FEATURE_AES		( 4*32+25) /* AES instructions */
#define X86_FEATURE_XSAVE	( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
#define X86_FEATURE_OSXSAVE	( 4*32+27) /* "" XSAVE enabled in the OS */
#define X86_FEATURE_AVX		( 4*32+28) /* Advanced Vector Extensions */
#define X86_FEATURE_F16C	( 4*32+29) /* 16-bit fp conversions */
#define X86_FEATURE_RDRAND	( 4*32+30) /* The RDRAND instruction */
#define X86_FEATURE_HYPERVISOR	( 4*32+31) /* Running on a hypervisor */

/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
#define X86_FEATURE_XSTORE	( 5*32+ 2) /* "rng" RNG present (xstore) */
#define X86_FEATURE_XSTORE_EN	( 5*32+ 3) /* "rng_en" RNG enabled */
#define X86_FEATURE_XCRYPT	( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
#define X86_FEATURE_XCRYPT_EN	( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
#define X86_FEATURE_ACE2	( 5*32+ 8) /* Advanced Cryptography Engine v2 */
#define X86_FEATURE_ACE2_EN	( 5*32+ 9) /* ACE v2 enabled */
#define X86_FEATURE_PHE		( 5*32+10) /* PadLock Hash Engine */
#define X86_FEATURE_PHE_EN	( 5*32+11) /* PHE enabled */
#define X86_FEATURE_PMM		( 5*32+12) /* PadLock Montgomery Multiplier */
#define X86_FEATURE_PMM_EN	( 5*32+13) /* PMM enabled */

/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
#define X86_FEATURE_LAHF_LM	( 6*32+ 0) /* LAHF/SAHF in long mode */
#define X86_FEATURE_CMP_LEGACY	( 6*32+ 1) /* If yes HyperThreading not valid */
#define X86_FEATURE_SVM		( 6*32+ 2) /* Secure virtual machine */
#define X86_FEATURE_EXTAPIC	( 6*32+ 3) /* Extended APIC space */
#define X86_FEATURE_CR8_LEGACY	( 6*32+ 4) /* CR8 in 32-bit mode */
#define X86_FEATURE_ABM		( 6*32+ 5) /* Advanced bit manipulation */
#define X86_FEATURE_SSE4A	( 6*32+ 6) /* SSE-4A */
#define X86_FEATURE_MISALIGNSSE	( 6*32+ 7) /* Misaligned SSE mode */
#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
#define X86_FEATURE_OSVW	( 6*32+ 9) /* OS Visible Workaround */
#define X86_FEATURE_IBS		( 6*32+10) /* Instruction Based Sampling */
#define X86_FEATURE_XOP		( 6*32+11) /* extended AVX instructions */
#define X86_FEATURE_SKINIT	( 6*32+12) /* SKINIT/STGI instructions */
#define X86_FEATURE_WDT		( 6*32+13) /* Watchdog timer */
#define X86_FEATURE_LWP		( 6*32+15) /* Light Weight Profiling */
#define X86_FEATURE_FMA4	( 6*32+16) /* 4 operands MAC instructions */
#define X86_FEATURE_TCE		( 6*32+17) /* translation cache extension */
#define X86_FEATURE_NODEID_MSR	( 6*32+19) /* NodeId MSR */
#define X86_FEATURE_TBM		( 6*32+21) /* trailing bit manipulations */
#define X86_FEATURE_TOPOEXT	( 6*32+22) /* topology extensions CPUID leafs */
#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
#define X86_FEATURE_PERFCTR_NB	( 6*32+24) /* NB performance counter extensions */
#define X86_FEATURE_BPEXT	( 6*32+26) /* data breakpoint extension */
#define X86_FEATURE_PTSC	( 6*32+27) /* performance time-stamp counter */
#define X86_FEATURE_PERFCTR_L2	( 6*32+28) /* L2 performance counter extensions */
#define X86_FEATURE_MWAITX	( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */

/*
 * Auxiliary flags: Linux defined - For features scattered in various
 * CPUID levels like 0x6, 0xA etc, word 7.
 *
 * Reuse free bits when adding new feature flags!
 */

#define X86_FEATURE_CPB		( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB		( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */

#define X86_FEATURE_HW_PSTATE	( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */

#define X86_FEATURE_INTEL_PT	( 7*32+15) /* Intel Processor Trace */

/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW	( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI	( 8*32+ 1) /* Intel Virtual NMI */
#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
#define X86_FEATURE_EPT		( 8*32+ 3) /* Intel Extended Page Table */
#define X86_FEATURE_VPID	( 8*32+ 4) /* Intel Virtual Processor ID */

#define X86_FEATURE_VMMCALL	( 8*32+15) /* Prefer vmmcall to vmcall */
#define X86_FEATURE_XENPV	( 8*32+16) /* "" Xen paravirtual guest */


/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
#define X86_FEATURE_FSGSBASE	( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
#define X86_FEATURE_TSC_ADJUST	( 9*32+ 1) /* TSC adjustment MSR 0x3b */
#define X86_FEATURE_BMI1	( 9*32+ 3) /* 1st group bit manipulation extensions */
#define X86_FEATURE_HLE		( 9*32+ 4) /* Hardware Lock Elision */
#define X86_FEATURE_AVX2	( 9*32+ 5) /* AVX2 instructions */
#define X86_FEATURE_SMEP	( 9*32+ 7) /* Supervisor Mode Execution Protection */
#define X86_FEATURE_BMI2	( 9*32+ 8) /* 2nd group bit manipulation extensions */
#define X86_FEATURE_ERMS	( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
#define X86_FEATURE_INVPCID	( 9*32+10) /* Invalidate Processor Context ID */
#define X86_FEATURE_RTM		( 9*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_CQM		( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_MPX		( 9*32+14) /* Memory Protection Extension */
#define X86_FEATURE_AVX512F	( 9*32+16) /* AVX-512 Foundation */
#define X86_FEATURE_AVX512DQ	( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
#define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX		( 9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP	( 9*32+20) /* Supervisor Mode Access Prevention */
#define X86_FEATURE_PCOMMIT	( 9*32+22) /* PCOMMIT instruction */
#define X86_FEATURE_CLFLUSHOPT	( 9*32+23) /* CLFLUSHOPT instruction */
#define X86_FEATURE_CLWB	( 9*32+24) /* CLWB instruction */
#define X86_FEATURE_AVX512PF	( 9*32+26) /* AVX-512 Prefetch */
#define X86_FEATURE_AVX512ER	( 9*32+27) /* AVX-512 Exponential and Reciprocal */
#define X86_FEATURE_AVX512CD	( 9*32+28) /* AVX-512 Conflict Detection */
#define X86_FEATURE_SHA_NI	( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
#define X86_FEATURE_AVX512BW	( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
#define X86_FEATURE_AVX512VL	( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */

/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
#define X86_FEATURE_XSAVEOPT	(10*32+ 0) /* XSAVEOPT */
#define X86_FEATURE_XSAVEC	(10*32+ 1) /* XSAVEC */
#define X86_FEATURE_XGETBV1	(10*32+ 2) /* XGETBV with ECX = 1 */
#define X86_FEATURE_XSAVES	(10*32+ 3) /* XSAVES/XRSTORS */

/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
#define X86_FEATURE_CQM_LLC	(11*32+ 1) /* LLC QoS if 1 */

/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */

/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
#define X86_FEATURE_CLZERO	(13*32+ 0) /* CLZERO instruction */
#define X86_FEATURE_IRPERF	(13*32+ 1) /* Instructions Retired Count */

/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
#define X86_FEATURE_DTHERM	(14*32+ 0) /* Digital Thermal Sensor */
#define X86_FEATURE_IDA		(14*32+ 1) /* Intel Dynamic Acceleration */
#define X86_FEATURE_ARAT	(14*32+ 2) /* Always Running APIC Timer */
#define X86_FEATURE_PLN		(14*32+ 4) /* Intel Power Limit Notification */
#define X86_FEATURE_PTS		(14*32+ 6) /* Intel Package Thermal Status */
#define X86_FEATURE_HWP		(14*32+ 7) /* Intel Hardware P-states */
#define X86_FEATURE_HWP_NOTIFY	(14*32+ 8) /* HWP Notification */
#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
#define X86_FEATURE_HWP_EPP	(14*32+10) /* HWP Energy Perf. Preference */
#define X86_FEATURE_HWP_PKG_REQ	(14*32+11) /* HWP Package Level Request */

/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
#define X86_FEATURE_NPT		(15*32+ 0) /* Nested Page Table support */
#define X86_FEATURE_LBRV	(15*32+ 1) /* LBR Virtualization support */
#define X86_FEATURE_SVML	(15*32+ 2) /* "svm_lock" SVM locking MSR */
#define X86_FEATURE_NRIPS	(15*32+ 3) /* "nrip_save" SVM next_rip save */
#define X86_FEATURE_TSCRATEMSR	(15*32+ 4) /* "tsc_scale" TSC scaling support */
#define X86_FEATURE_VMCBCLEAN	(15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
#define X86_FEATURE_FLUSHBYASID	(15*32+ 6) /* flush-by-ASID support */
#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
#define X86_FEATURE_PAUSEFILTER	(15*32+10) /* filtered pause intercept */
#define X86_FEATURE_PFTHRESHOLD	(15*32+12) /* pause filter threshold */
#define X86_FEATURE_AVIC	(15*32+13) /* Virtual Interrupt Controller */

/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
#define X86_FEATURE_PKU		(16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE	(16*32+ 4) /* OS Protection Keys Enable */

/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */
#define X86_FEATURE_SUCCOR	(17*32+ 1) /* Uncorrectable error containment and recovery */
#define X86_FEATURE_SMCA	(17*32+ 3) /* Scalable MCA */

/*
 * BUG word(s)
 */
#define X86_BUG(x)		(NCAPINTS*32 + (x))

#define X86_BUG_F00F		X86_BUG(0) /* Intel F00F */
#define X86_BUG_FDIV		X86_BUG(1) /* FPU FDIV */
#define X86_BUG_COMA		X86_BUG(2) /* Cyrix 6x86 coma */
#define X86_BUG_AMD_TLB_MMATCH	X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
#define X86_BUG_AMD_APIC_C1E	X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
#define X86_BUG_11AP		X86_BUG(5) /* Bad local APIC aka 11AP */
#define X86_BUG_FXSAVE_LEAK	X86_BUG(6) /* FXSAVE leaks FOP/FIP/FDP */
#define X86_BUG_CLFLUSH_MONITOR	X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
#define X86_BUG_SYSRET_SS_ATTRS	X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
#define X86_BUG_NULL_SEG	X86_BUG(10) /* Nulling a selector preserves the base */
#define X86_BUG_SWAPGS_FENCE	X86_BUG(11) /* SWAPGS without input dep on GS */

#ifdef CONFIG_X86_32
/*
 * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
 * to avoid confusion.
 */
#define X86_BUG_ESPFIX		X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
#endif

#endif /* _ASM_X86_CPUFEATURES_H */
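
Each define above packs a (word, bit) pair into one flat number, word*32 + bit, so word 9, bit 9 is ERMS. A minimal sketch in plain C of how such a define is consumed; the x86_capability array and cpu_has() helper are illustrative assumptions here, not the kernel's exact API:

#include <stdio.h>

#define NCAPINTS 18                  /* matches the header above */
#define X86_FEATURE_ERMS (9*32 + 9)  /* word 9, bit 9 */

/* In the kernel this array is filled from CPUID; here it is a stub. */
static unsigned int x86_capability[NCAPINTS];

static int cpu_has(unsigned int feature)
{
	return (x86_capability[feature / 32] >> (feature % 32)) & 1;
}

int main(void)
{
	x86_capability[X86_FEATURE_ERMS / 32] |= 1u << (X86_FEATURE_ERMS % 32);
	printf("ERMS: word %u, bit %u, present=%d\n",
	       X86_FEATURE_ERMS / 32, X86_FEATURE_ERMS % 32,
	       cpu_has(X86_FEATURE_ERMS));
	return 0;
}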
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
new file mode 100644
index 000000000000..911e9358ceb1
--- /dev/null
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -0,0 +1,60 @@
#ifndef _ASM_X86_DISABLED_FEATURES_H
#define _ASM_X86_DISABLED_FEATURES_H

/* These features, although they might be available in a CPU,
 * will not be used because the compile options to support
 * them are not present.
 *
 * This code allows them to be checked and disabled at
 * compile time without an explicit #ifdef. Use
 * cpu_feature_enabled().
 */

#ifdef CONFIG_X86_INTEL_MPX
# define DISABLE_MPX	0
#else
# define DISABLE_MPX	(1<<(X86_FEATURE_MPX & 31))
#endif

#ifdef CONFIG_X86_64
# define DISABLE_VME		(1<<(X86_FEATURE_VME & 31))
# define DISABLE_K6_MTRR	(1<<(X86_FEATURE_K6_MTRR & 31))
# define DISABLE_CYRIX_ARR	(1<<(X86_FEATURE_CYRIX_ARR & 31))
# define DISABLE_CENTAUR_MCR	(1<<(X86_FEATURE_CENTAUR_MCR & 31))
#else
# define DISABLE_VME		0
# define DISABLE_K6_MTRR	0
# define DISABLE_CYRIX_ARR	0
# define DISABLE_CENTAUR_MCR	0
#endif /* CONFIG_X86_64 */

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
# define DISABLE_PKU	0
# define DISABLE_OSPKE	0
#else
# define DISABLE_PKU	(1<<(X86_FEATURE_PKU & 31))
# define DISABLE_OSPKE	(1<<(X86_FEATURE_OSPKE & 31))
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */

/*
 * Make sure to add features to the correct mask
 */
#define DISABLED_MASK0	(DISABLE_VME)
#define DISABLED_MASK1	0
#define DISABLED_MASK2	0
#define DISABLED_MASK3	(DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
#define DISABLED_MASK4	0
#define DISABLED_MASK5	0
#define DISABLED_MASK6	0
#define DISABLED_MASK7	0
#define DISABLED_MASK8	0
#define DISABLED_MASK9	(DISABLE_MPX)
#define DISABLED_MASK10	0
#define DISABLED_MASK11	0
#define DISABLED_MASK12	0
#define DISABLED_MASK13	0
#define DISABLED_MASK14	0
#define DISABLED_MASK15	0
#define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE)

#endif /* _ASM_X86_DISABLED_FEATURES_H */
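
The point of the DISABLED_MASK* words is that a feature test can fold to a compile-time constant, so the compiler drops the guarded code without an #ifdef. A sketch of the idea, with assumed values mirroring the CONFIG_X86_INTEL_MPX-off branch above (the kernel's real cpu_feature_enabled() goes through more machinery):

#include <stdio.h>

/* Assumed config: CONFIG_X86_INTEL_MPX is off, as in the #else above. */
#define X86_FEATURE_MPX  (9*32 + 14)
#define DISABLE_MPX      (1u << (X86_FEATURE_MPX & 31))
#define DISABLED_MASK9   (DISABLE_MPX)

/* If the feature's bit is set in its word's disabled mask, this whole
 * expression is a compile-time constant 0. */
#define MPX_ENABLED (!(DISABLED_MASK9 & (1u << (X86_FEATURE_MPX & 31))))

int main(void)
{
	if (MPX_ENABLED)
		printf("MPX code path compiled in\n");
	else
		printf("MPX statically disabled; runtime CPUID is never consulted\n");
	return 0;
}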
diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h
new file mode 100644
index 000000000000..4916144e3c42
--- /dev/null
+++ b/tools/arch/x86/include/asm/required-features.h
@@ -0,0 +1,103 @@
#ifndef _ASM_X86_REQUIRED_FEATURES_H
#define _ASM_X86_REQUIRED_FEATURES_H

/* Define the minimum CPUID feature set for the kernel. These bits are
   checked really early to actually display a visible error message
   before the kernel dies. Make sure to assign features to the proper
   mask!

   Some requirements that are not in CPUID yet are also in
   CONFIG_X86_MINIMUM_CPU_FAMILY, which is checked too.

   The real information is in arch/x86/Kconfig.cpu; this just converts
   the CONFIGs into a bitmask */

#ifndef CONFIG_MATH_EMULATION
# define NEED_FPU	(1<<(X86_FEATURE_FPU & 31))
#else
# define NEED_FPU	0
#endif

#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
# define NEED_PAE	(1<<(X86_FEATURE_PAE & 31))
#else
# define NEED_PAE	0
#endif

#ifdef CONFIG_X86_CMPXCHG64
# define NEED_CX8	(1<<(X86_FEATURE_CX8 & 31))
#else
# define NEED_CX8	0
#endif

#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64)
# define NEED_CMOV	(1<<(X86_FEATURE_CMOV & 31))
#else
# define NEED_CMOV	0
#endif

#ifdef CONFIG_X86_USE_3DNOW
# define NEED_3DNOW	(1<<(X86_FEATURE_3DNOW & 31))
#else
# define NEED_3DNOW	0
#endif

#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
# define NEED_NOPL	(1<<(X86_FEATURE_NOPL & 31))
#else
# define NEED_NOPL	0
#endif

#ifdef CONFIG_MATOM
# define NEED_MOVBE	(1<<(X86_FEATURE_MOVBE & 31))
#else
# define NEED_MOVBE	0
#endif

#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT
/* Paravirtualized systems may not have PSE or PGE available */
#define NEED_PSE	0
#define NEED_PGE	0
#else
#define NEED_PSE	(1<<(X86_FEATURE_PSE & 31))
#define NEED_PGE	(1<<(X86_FEATURE_PGE & 31))
#endif
#define NEED_MSR	(1<<(X86_FEATURE_MSR & 31))
#define NEED_FXSR	(1<<(X86_FEATURE_FXSR & 31))
#define NEED_XMM	(1<<(X86_FEATURE_XMM & 31))
#define NEED_XMM2	(1<<(X86_FEATURE_XMM2 & 31))
#define NEED_LM		(1<<(X86_FEATURE_LM & 31))
#else
#define NEED_PSE	0
#define NEED_MSR	0
#define NEED_PGE	0
#define NEED_FXSR	0
#define NEED_XMM	0
#define NEED_XMM2	0
#define NEED_LM		0
#endif

#define REQUIRED_MASK0	(NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\
			 NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\
			 NEED_XMM|NEED_XMM2)
#define SSE_MASK	(NEED_XMM|NEED_XMM2)

#define REQUIRED_MASK1	(NEED_LM|NEED_3DNOW)

#define REQUIRED_MASK2	0
#define REQUIRED_MASK3	(NEED_NOPL)
#define REQUIRED_MASK4	(NEED_MOVBE)
#define REQUIRED_MASK5	0
#define REQUIRED_MASK6	0
#define REQUIRED_MASK7	0
#define REQUIRED_MASK8	0
#define REQUIRED_MASK9	0
#define REQUIRED_MASK10	0
#define REQUIRED_MASK11	0
#define REQUIRED_MASK12	0
#define REQUIRED_MASK13	0
#define REQUIRED_MASK14	0
#define REQUIRED_MASK15	0
#define REQUIRED_MASK16	0

#endif /* _ASM_X86_REQUIRED_FEATURES_H */
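
Conversely, the REQUIRED_MASK* words feed an early-boot check that compares the required bits against what CPUID actually reported, word by word. A hedged sketch of that comparison; verify_cpu_features() and the example mask values are made up for illustration, not the kernel's code:

#include <stdio.h>

#define NCAPINTS 18

/* Example only: word 0 requiring FPU (bit 0) and CX8 (bit 8). */
static const unsigned int required_mask[NCAPINTS] = {
	[0] = (1u << 0) | (1u << 8),
};

/* Report any required bit the CPU failed to advertise via CPUID. */
static int verify_cpu_features(const unsigned int *cpuid_words)
{
	int ok = 1;

	for (int w = 0; w < NCAPINTS; w++) {
		unsigned int missing = required_mask[w] & ~cpuid_words[w];
		if (missing) {
			printf("word %d: missing required bits 0x%08x\n",
			       w, missing);
			ok = 0;
		}
	}
	return ok;
}

int main(void)
{
	unsigned int cpuid_words[NCAPINTS] = { [0] = (1u << 0) };
	return verify_cpu_features(cpuid_words) ? 0 : 1; /* CX8 is missing */
}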
diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S
new file mode 100644
index 000000000000..2ec0b0abbfaa
--- /dev/null
+++ b/tools/arch/x86/lib/memcpy_64.S
@@ -0,0 +1,297 @@
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
ENTRY(memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64-bytes) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	mov %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe)

	.section .fixup, "ax"
	/* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
	mov $-EFAULT, %rax
	ret

	.previous

	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif
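
One subtlety worth spelling out: the `cmp %dil, %sil; jl .Lcopy_backward` test in memcpy_orig compares only the low bytes of the two pointers, as a cheap heuristic for overlap that would create store-to-load false dependences in a forward copy. A C rendering of that decision (a sketch, not the kernel's code; the int8_t casts mirror the signed `jl` on the low bytes):

#include <stddef.h>
#include <stdint.h>

static void *memcpy_direction_sketch(void *dst, const void *src, size_t n)
{
	char *d = dst;
	const char *s = src;

	/* Backward when the low byte of src is below the low byte of dst. */
	if ((int8_t)(uintptr_t)s < (int8_t)(uintptr_t)d) {
		while (n--)
			d[n] = s[n];            /* backward, from the tail */
	} else {
		for (size_t i = 0; i < n; i++)
			d[i] = s[i];            /* forward */
	}
	return dst;
}

int main(void)
{
	char buf[16] = "abcdefgh";
	memcpy_direction_sketch(buf + 1, buf, 8); /* overlapping, dst > src */
	return buf[1] == 'a' ? 0 : 1;
}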
diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S
new file mode 100644
index 000000000000..e1229ecd2a82
--- /dev/null
+++ b/tools/arch/x86/lib/memset_64.S
@@ -0,0 +1,138 @@
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

.weak memset

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string to get better performance than the original function. The code is
 * simpler and shorter than the original function as well.
 *
 * rdi destination
 * rsi value (char)
 * rdx count (bytes)
 *
 * rax original destination
 */
ENTRY(memset)
ENTRY(__memset)
	/*
	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
	 * to use it when possible. If not available, use fast string instructions.
	 *
	 * Otherwise, use original memset function.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq %rdi,%r9
	movq %rdx,%rcx
	andl $7,%edx
	shrq $3,%rcx
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
	rep stosq
	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	ret
ENDPROC(memset)
ENDPROC(__memset)

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi destination
 * rsi value (char)
 * rdx count (bytes)
 *
 * rax original destination
 */
ENTRY(memset_erms)
	movq %rdi,%r9
	movb %sil,%al
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	ret
ENDPROC(memset_erms)

ENTRY(memset_orig)
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	ret

.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
	movq %rax,(%rdi) /* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
ENDPROC(memset_orig)
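
The `movzbl %sil,%esi; movabs $0x0101010101010101,%rax; imulq` sequence above is the classic byte-broadcast trick: multiplying a zero-extended byte by 0x0101010101010101 replicates it into all eight byte lanes of a 64-bit word, which `rep stosq` then stores eight bytes at a time. A worked example in C:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t c = 0xab;

	/* Each of the eight 0x01 bytes in the multiplier contributes one
	 * shifted copy of c, so the product is c repeated in every lane. */
	uint64_t pattern = (uint64_t)c * 0x0101010101010101ULL;

	printf("0x%016llx\n", (unsigned long long)pattern);
	/* prints 0xabababababababab */
	return 0;
}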
diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/include/asm/alternative-asm.h
index 3a3a0f16456a..2a4d1bfa2988 100644
--- a/tools/perf/util/include/asm/alternative-asm.h
+++ b/tools/include/asm/alternative-asm.h
@@ -1,5 +1,5 @@
-#ifndef _PERF_ASM_ALTERNATIVE_ASM_H
-#define _PERF_ASM_ALTERNATIVE_ASM_H
+#ifndef _TOOLS_ASM_ALTERNATIVE_ASM_H
+#define _TOOLS_ASM_ALTERNATIVE_ASM_H
 
 /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
 
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST
index 0b1ebf3c08f6..cf85d1cd1c91 100644
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -12,6 +12,11 @@ tools/arch/sparc/include/asm/barrier_32.h
 tools/arch/sparc/include/asm/barrier_64.h
 tools/arch/tile/include/asm/barrier.h
 tools/arch/x86/include/asm/barrier.h
+tools/arch/x86/include/asm/cpufeatures.h
+tools/arch/x86/include/asm/disabled-features.h
+tools/arch/x86/include/asm/required-features.h
+tools/arch/x86/lib/memcpy_64.S
+tools/arch/x86/lib/memset_64.S
 tools/arch/xtensa/include/asm/barrier.h
 tools/scripts
 tools/build
@@ -31,6 +36,7 @@ tools/lib/find_bit.c
 tools/lib/bitmap.c
 tools/lib/str_error_r.c
 tools/lib/vsprintf.c
+tools/include/asm/alternative-asm.h
 tools/include/asm/atomic.h
 tools/include/asm/barrier.h
 tools/include/asm/bug.h
@@ -74,9 +80,6 @@ include/linux/swab.h
 arch/*/include/asm/unistd*.h
 arch/*/include/uapi/asm/unistd*.h
 arch/*/include/uapi/asm/perf_regs.h
-arch/*/lib/memcpy*.S
-arch/*/lib/memset*.S
-arch/*/include/asm/*features.h
 include/linux/poison.h
 include/linux/hw_breakpoint.h
 include/uapi/linux/bpf.h
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 5e5f8cb1dd83..809735c6cb26 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -348,6 +348,21 @@ $(PERF_IN): prepare FORCE
 	@(test -f ../../include/uapi/linux/perf_event.h && ( \
 	(diff -B ../include/uapi/linux/perf_event.h ../../include/uapi/linux/perf_event.h >/dev/null) \
 	|| echo "Warning: tools/include/uapi/linux/perf_event.h differs from kernel" >&2 )) || true
+	@(test -f ../../arch/x86/include/asm/disabled-features.h && ( \
+	(diff -B ../arch/x86/include/asm/disabled-features.h ../../arch/x86/include/asm/disabled-features.h >/dev/null) \
+	|| echo "Warning: tools/arch/x86/include/asm/disabled-features.h differs from kernel" >&2 )) || true
+	@(test -f ../../arch/x86/include/asm/required-features.h && ( \
+	(diff -B ../arch/x86/include/asm/required-features.h ../../arch/x86/include/asm/required-features.h >/dev/null) \
+	|| echo "Warning: tools/arch/x86/include/asm/required-features.h differs from kernel" >&2 )) || true
+	@(test -f ../../arch/x86/include/asm/cpufeatures.h && ( \
+	(diff -B ../arch/x86/include/asm/cpufeatures.h ../../arch/x86/include/asm/cpufeatures.h >/dev/null) \
+	|| echo "Warning: tools/arch/x86/include/asm/cpufeatures.h differs from kernel" >&2 )) || true
+	@(test -f ../../arch/x86/lib/memcpy_64.S && ( \
+	(diff -B ../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memcpy_64.S >/dev/null) \
+	|| echo "Warning: tools/arch/x86/lib/memcpy_64.S differs from kernel" >&2 )) || true
+	@(test -f ../../arch/x86/lib/memset_64.S && ( \
+	(diff -B ../arch/x86/lib/memset_64.S ../../arch/x86/lib/memset_64.S >/dev/null) \
+	|| echo "Warning: tools/arch/x86/lib/memset_64.S differs from kernel" >&2 )) || true
 	$(Q)$(MAKE) $(build)=perf
 
 $(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(LIBTRACEEVENT_DYNAMIC_LIST)
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
index 5c3cce082cb8..f700369bb0f6 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -6,7 +6,7 @@
 #define globl p2align 4; .globl
 #define _ASM_EXTABLE_FAULT(x, y)
 
-#include "../../../arch/x86/lib/memcpy_64.S"
+#include "../../arch/x86/lib/memcpy_64.S"
 /*
  * We need to provide note.GNU-stack section, saying that we want
  * NOT executable stack. Otherwise the final linking will assume that
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S
index de278784c866..58407aa24c1b 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm.S
+++ b/tools/perf/bench/mem-memset-x86-64-asm.S
@@ -1,7 +1,7 @@
 #define memset MEMSET /* don't hide glibc's memset() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
-#include "../../../arch/x86/lib/memset_64.S"
+#include "../../arch/x86/lib/memset_64.S"
 
 /*
  * We need to provide note.GNU-stack section, saying that we want