author		Ingo Molnar <mingo@kernel.org>	2019-07-04 04:36:20 -0400
committer	Ingo Molnar <mingo@kernel.org>	2019-07-04 04:36:20 -0400
commit		f584dd32edc5d4400d7ceb92111a89f0c1f6651f
tree		e31ee9615fc9f07e8791fca0e77cd35f2dd1041a
parent		a328a259ced0c0fa5aabcd29238779a536335884
parent		049331f277fef1c3f2527c2c9afa1d285e9a1247
Merge branch 'x86/cpu' into perf/core, to pick up revert

perf/core has an earlier version of the x86/cpu tree merged, to avoid
conflicts, and due to this we want to pick up this ABI impacting
revert as well:

  049331f277fe: ("x86/fsgsbase: Revert FSGSBASE support")

Signed-off-by: Ingo Molnar <mingo@kernel.org>
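The ABI impact is the removal of the HWCAP2_FSGSBASE bit from AT_HWCAP2 (see
the uapi hunk below): with the revert applied, the kernel never advertises
FSGSBASE to user space. A minimal probe, modeled on the example in the removed
fsgs.rst further down (the fallback message is illustrative, not from the
commit):

    #include <elf.h>
    #include <stdio.h>
    #include <sys/auxv.h>

    /* Mirrors the reverted uapi definition: bit 1 of AT_HWCAP2. */
    #ifndef HWCAP2_FSGSBASE
    # define HWCAP2_FSGSBASE (1 << 1)
    #endif

    int main(void)
    {
            /* On a kernel with this revert the bit is never set, so user
             * space must keep using arch_prctl() for FS/GS base access. */
            if (getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE)
                    printf("FSGSBASE enabled by the kernel\n");
            else
                    printf("FSGSBASE not enabled; fall back to arch_prctl()\n");
            return 0;
    }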
-rw-r--r--	Documentation/admin-guide/kernel-parameters.txt	2
-rw-r--r--	Documentation/x86/entry_64.rst	9
-rw-r--r--	Documentation/x86/x86_64/fsgs.rst	199
-rw-r--r--	Documentation/x86/x86_64/index.rst	1
-rw-r--r--	arch/x86/entry/calling.h	40
-rw-r--r--	arch/x86/entry/entry_64.S	115
-rw-r--r--	arch/x86/include/asm/fsgsbase.h	45
-rw-r--r--	arch/x86/include/asm/inst.h	15
-rw-r--r--	arch/x86/include/uapi/asm/hwcap2.h	3
-rw-r--r--	arch/x86/kernel/cpu/common.c	22
-rw-r--r--	arch/x86/kernel/cpu/intel.c	27
-rw-r--r--	arch/x86/kernel/cpu/mtrr/generic.c	15
-rw-r--r--	arch/x86/kernel/process_64.c	119
-rw-r--r--	tools/testing/selftests/x86/Makefile	5
-rw-r--r--	tools/testing/selftests/x86/fsgsbase.c	74
-rw-r--r--	tools/testing/selftests/x86/syscall_arg_fault.c	112
16 files changed, 235 insertions(+), 568 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 35bc3c3574c6..138f6664b2e2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2857,8 +2857,6 @@
 	no5lvl		[X86-64] Disable 5-level paging mode. Forces
 			kernel to use 4-level paging instead.
 
-	nofsgsbase	[X86] Disables FSGSBASE instructions.
-
 	no_console_suspend
 			[HW] Never suspend the console
 			Disable suspending of consoles during suspend and
diff --git a/Documentation/x86/entry_64.rst b/Documentation/x86/entry_64.rst
index b87c1d816aea..a48b3f6ebbe8 100644
--- a/Documentation/x86/entry_64.rst
+++ b/Documentation/x86/entry_64.rst
@@ -108,12 +108,3 @@ We try to only use IST entries and the paranoid entry code for vectors
 that absolutely need the more expensive check for the GS base - and we
 generate all 'normal' entry points with the regular (faster) paranoid=0
 variant.
-
-On a FSGSBASE system, however, user space can set GS without kernel
-interaction. It means the value of GS base itself does not imply anything,
-whether a kernel value or a user space value. So, there is no longer a safe
-way to check whether the exception is entering from user mode or kernel
-mode in the paranoid entry code path. So the GSBASE value needs to be read
-out, saved and the kernel GSBASE value written. On exit the saved GSBASE
-value needs to be restored unconditionally. The non paranoid entry/exit
-code still uses SWAPGS unconditionally as the state is known.
diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst
deleted file mode 100644
index 380c0b5ccca2..000000000000
--- a/Documentation/x86/x86_64/fsgs.rst
+++ /dev/null
@@ -1,199 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-Using FS and GS segments in user space applications
-===================================================
-
-The x86 architecture supports segmentation. Instructions which access
-memory can use segment register based addressing mode. The following
-notation is used to address a byte within a segment:
-
-  Segment-register:Byte-address
-
-The segment base address is added to the Byte-address to compute the
-resulting virtual address which is accessed. This allows to access multiple
-instances of data with the identical Byte-address, i.e. the same code. The
-selection of a particular instance is purely based on the base-address in
-the segment register.
-
-In 32-bit mode the CPU provides 6 segments, which also support segment
-limits. The limits can be used to enforce address space protections.
-
-In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is
-always 0 to provide a full 64bit address space. The FS and GS segments are
-still functional in 64-bit mode.
-
-Common FS and GS usage
-------------------------------
-
-The FS segment is commonly used to address Thread Local Storage (TLS). FS
-is usually managed by runtime code or a threading library. Variables
-declared with the '__thread' storage class specifier are instantiated per
-thread and the compiler emits the FS: address prefix for accesses to these
-variables. Each thread has its own FS base address so common code can be
-used without complex address offset calculations to access the per thread
-instances. Applications should not use FS for other purposes when they use
-runtimes or threading libraries which manage the per thread FS.
-
-The GS segment has no common use and can be used freely by
-applications. GCC and Clang support GS based addressing via address space
-identifiers.
-
-Reading and writing the FS/GS base address
-------------------------------------------
-
-There exist two mechanisms to read and write the FS/GS base address:
-
-  - the arch_prctl() system call
-
-  - the FSGSBASE instruction family
-
-Accessing FS/GS base with arch_prctl()
---------------------------------------
-
-  The arch_prctl(2) based mechanism is available on all 64bit CPUs and all
-  kernel versions.
-
-  Reading the base:
-
-    arch_prctl(ARCH_GET_FS, &fsbase);
-    arch_prctl(ARCH_GET_GS, &gsbase);
-
-  Writing the base:
-
-    arch_prctl(ARCH_SET_FS, fsbase);
-    arch_prctl(ARCH_SET_GS, gsbase);
-
-  The ARCH_SET_GS prctl may be disabled depending on kernel configuration
-  and security settings.
-
-Accessing FS/GS base with the FSGSBASE instructions
----------------------------------------------------
-
-  With the Ivy Bridge CPU generation Intel introduced a new set of
-  instructions to access the FS and GS base registers directly from user
-  space. These instructions are also supported on AMD Family 17H CPUs. The
-  following instructions are available:
-
-  =============== ===========================
-  RDFSBASE %reg   Read the FS base register
-  RDGSBASE %reg   Read the GS base register
-  WRFSBASE %reg   Write the FS base register
-  WRGSBASE %reg   Write the GS base register
-  =============== ===========================
-
-  The instructions avoid the overhead of the arch_prctl() syscall and allow
-  more flexible usage of the FS/GS addressing modes in user space
-  applications. This does not prevent conflicts between threading libraries
-  and runtimes which utilize FS and applications which want to use it for
-  their own purpose.
-
-FSGSBASE instructions enablement
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If
-  available /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs.
-
-  The availability of the instructions does not enable them
-  automatically. The kernel has to enable them explicitly in CR4. The
-  reason for this is that older kernels make assumptions about the values in
-  the GS register and enforce them when GS base is set via
-  arch_prctl(). Allowing user space to write arbitrary values to GS base
-  would violate these assumptions and cause malfunction.
-
-  On kernels which do not enable FSGSBASE the execution of the FSGSBASE
-  instructions will fault with a #UD exception.
-
-  The kernel provides reliable information about the enabled state in the
-  ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the
-  kernel has FSGSBASE instructions enabled and applications can use them.
-  The following code example shows how this detection works::
-
-   #include <sys/auxv.h>
-   #include <elf.h>
-
-   /* Will be eventually in asm/hwcap.h */
-   #ifndef HWCAP2_FSGSBASE
-   #define HWCAP2_FSGSBASE        (1 << 1)
-   #endif
-
-   ....
-
-   unsigned val = getauxval(AT_HWCAP2);
-
-   if (val & HWCAP2_FSGSBASE)
-        printf("FSGSBASE enabled\n");
-
-FSGSBASE instructions compiler support
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-GCC version 4.6.4 and newer provide intrinsics for the FSGSBASE
-instructions. Clang supports them as well.
-
-  =================== ===========================
-  _readfsbase_u64()   Read the FS base register
-  _readgsbase_u64()   Read the GS base register
-  _writefsbase_u64()  Write the FS base register
-  _writegsbase_u64()  Write the GS base register
-  =================== ===========================
-
-To utilize these intrinsics <immintrin.h> must be included in the source
-code and the compiler option -mfsgsbase has to be added.
-
-Compiler support for FS/GS based addressing
--------------------------------------------
-
-GCC version 6 and newer provide support for FS/GS based addressing via
-Named Address Spaces. GCC implements the following address space
-identifiers for x86:
-
-  ========= ====================================
-  __seg_fs  Variable is addressed relative to FS
-  __seg_gs  Variable is addressed relative to GS
-  ========= ====================================
-
-The preprocessor symbols __SEG_FS and __SEG_GS are defined when these
-address spaces are supported. Code which implements fallback modes should
-check whether these symbols are defined. Usage example::
-
-  #ifdef __SEG_GS
-
-  long data0 = 0;
-  long data1 = 1;
-
-  long __seg_gs *ptr;
-
-  /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */
-  ....
-
-  /* Set GS to point to data0 */
-  _writegsbase_u64(&data0);
-
-  /* Access offset 0 of GS */
-  ptr = 0;
-  printf("data0 = %ld\n", *ptr);
-
-  /* Set GS to point to data1 */
-  _writegsbase_u64(&data1);
-  /* ptr still addresses offset 0! */
-  printf("data1 = %ld\n", *ptr);
-
-
-Clang does not provide the GCC address space identifiers, but it provides
-address spaces via an attribute based mechanism in Clang 5 and newer
-versions:
-
-  ==================================== =====================================
-  __attribute__((address_space(256)))  Variable is addressed relative to GS
-  __attribute__((address_space(257)))  Variable is addressed relative to FS
-  ==================================== =====================================
-
-FS/GS based addressing with inline assembly
--------------------------------------------
-
-In case the compiler does not support address spaces, inline assembly can
-be used for FS/GS based addressing mode::
-
-	mov %fs:offset, %reg
-	mov %gs:offset, %reg
-
-	mov %reg, %fs:offset
-	mov %reg, %gs:offset
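For reference, the fragments in the removed document above combine into one
self-contained program. This is a sketch under the pre-revert assumptions
stated there (a kernel that still sets HWCAP2_FSGSBASE, GCC 6 or newer,
compiled with -mfsgsbase); the file name is hypothetical and the program is
not part of this commit:

    /* Build: gcc -mfsgsbase fsgs-demo.c */
    #include <elf.h>
    #include <immintrin.h>
    #include <stdio.h>
    #include <sys/auxv.h>

    #ifndef HWCAP2_FSGSBASE
    # define HWCAP2_FSGSBASE (1 << 1)
    #endif

    int main(void)
    {
    #ifdef __SEG_GS
            long data0 = 0, data1 = 1;
            long __seg_gs *ptr = 0;   /* offset 0 relative to the GS base */

            /* WRGSBASE faults with #UD unless the kernel enabled CR4.FSGSBASE. */
            if (!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE)) {
                    printf("FSGSBASE not enabled by the kernel\n");
                    return 1;
            }

            _writegsbase_u64((unsigned long long)&data0);
            printf("data0 = %ld\n", *ptr);   /* GS:0 now aliases data0 */

            _writegsbase_u64((unsigned long long)&data1);
            printf("data1 = %ld\n", *ptr);   /* same offset, new base */
    #endif
            return 0;
    }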
diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst
index a56070fc8e77..d6eaaa5a35fc 100644
--- a/Documentation/x86/x86_64/index.rst
+++ b/Documentation/x86/x86_64/index.rst
@@ -14,4 +14,3 @@ x86_64 Support
    fake-numa-for-cpusets
    cpu-hotplug-spec
    machinecheck
-   fsgs
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d3fbe2dc03ea..efb0d1b1f15f 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,7 +6,6 @@
 #include <asm/percpu.h>
 #include <asm/asm-offsets.h>
 #include <asm/processor-flags.h>
-#include <asm/inst.h>
 
 /*
 
@@ -338,12 +337,6 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
-.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
-	rdgsbase \save_reg
-	GET_PERCPU_BASE \scratch_reg
-	wrgsbase \scratch_reg
-.endm
-
 #endif /* CONFIG_X86_64 */
 
 .macro STACKLEAK_ERASE
@@ -352,39 +345,6 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
-#ifdef CONFIG_SMP
-
-/*
- * CPU/node NR is loaded from the limit (size) field of a special segment
- * descriptor entry in GDT.
- */
-.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
-	movq	$__CPUNODE_SEG, \reg
-	lsl	\reg, \reg
-.endm
-
-/*
- * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
- * We normally use %gs for accessing per-CPU data, but we are setting up
- * %gs here and obviously can not use %gs itself to access per-CPU data.
- */
-.macro GET_PERCPU_BASE reg:req
-	ALTERNATIVE \
-		"LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \
-		"RDPID	\reg", \
-		X86_FEATURE_RDPID
-	andq	$VDSO_CPUNODE_MASK, \reg
-	movq	__per_cpu_offset(, \reg, 8), \reg
-.endm
-
-#else
-
-.macro GET_PERCPU_BASE reg:req
-	movq	pcpu_unit_offsets(%rip), \reg
-.endm
-
-#endif /* CONFIG_SMP */
-
 /*
  * This does 'call enter_from_user_mode' unless we can avoid it based on
  * kernel config or using the static jump infrastructure.
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 7f9f5119d6b1..3b7a0e8d3bc0 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -38,7 +38,6 @@
 #include <asm/export.h>
 #include <asm/frame.h>
 #include <asm/nospec-branch.h>
-#include <asm/fsgsbase.h>
 #include <linux/err.h>
 
 #include "calling.h"
@@ -948,6 +947,7 @@ ENTRY(\sym)
 	addq	$\ist_offset, CPU_TSS_IST(\shift_ist)
 	.endif
 
+	/* these procedures expect "no swapgs" flag in ebx */
 	.if \paranoid
 	jmp	paranoid_exit
 	.else
@@ -1164,21 +1164,24 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
 #endif
 
 /*
- * Save all registers in pt_regs. Return GSBASE related information
- * in EBX depending on the availability of the FSGSBASE instructions:
- *
- * FSGSBASE	R/EBX
- *     N        0 -> SWAPGS on exit
- *              1 -> no SWAPGS on exit
- *
- *     Y        GSBASE value at entry, must be restored in paranoid_exit
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
 	PUSH_AND_CLEAR_REGS save_ret=1
 	ENCODE_FRAME_POINTER 8
+	movl	$1, %ebx
+	movl	$MSR_GS_BASE, %ecx
+	rdmsr
+	testl	%edx, %edx
+	js	1f				/* negative -> in kernel */
+	SWAPGS
+	xorl	%ebx, %ebx
 
+1:
 	/*
 	 * Always stash CR3 in %r14. This value will be restored,
 	 * verbatim, at exit. Needed if paranoid_entry interrupted
@@ -1188,49 +1191,9 @@ ENTRY(paranoid_entry)
 	 * This is also why CS (stashed in the "iret frame" by the
 	 * hardware at entry) can not be used: this may be a return
 	 * to kernel code, but with a user CR3 value.
-	 *
-	 * Switching CR3 does not depend on kernel GSBASE so it can
-	 * be done before switching to the kernel GSBASE. This is
-	 * required for FSGSBASE because the kernel GSBASE has to
-	 * be retrieved from a kernel internal table.
 	 */
 	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
 
-	/*
-	 * Handling GSBASE depends on the availability of FSGSBASE.
-	 *
-	 * Without FSGSBASE the kernel enforces that negative GSBASE
-	 * values indicate kernel GSBASE. With FSGSBASE no assumptions
-	 * can be made about the GSBASE value when entering from user
-	 * space.
-	 */
-	ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
-
-	/*
-	 * Read the current GSBASE and store it in in %rbx unconditionally,
-	 * retrieve and set the current CPUs kernel GSBASE. The stored value
-	 * has to be restored in paranoid_exit unconditionally.
-	 */
-	SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
-	ret
-
-.Lparanoid_entry_checkgs:
-	/* EBX = 1 -> kernel GSBASE active, no restore required */
-	movl	$1, %ebx
-	/*
-	 * The kernel-enforced convention is a negative GSBASE indicates
-	 * a kernel value. No SWAPGS needed on entry and exit.
-	 */
-	movl	$MSR_GS_BASE, %ecx
-	rdmsr
-	testl	%edx, %edx
-	jns	.Lparanoid_entry_swapgs
-	ret
-
-.Lparanoid_entry_swapgs:
-	SWAPGS
-	/* EBX = 0 -> SWAPGS required on exit */
-	xorl	%ebx, %ebx
 	ret
 END(paranoid_entry)
 
@@ -1241,47 +1204,28 @@ END(paranoid_entry)
  *
  * We may be returning to very strange contexts (e.g. very early
  * in syscall entry), so checking for preemption here would
- * be complicated.  Fortunately, there's no good reason to try
- * to handle preemption here.
- *
- * R/EBX contains the GSBASE related information depending on the
- * availability of the FSGSBASE instructions:
- *
- * FSGSBASE	R/EBX
- *     N        0 -> SWAPGS on exit
- *              1 -> no SWAPGS on exit
+ * be complicated.  Fortunately, we there's no good reason
+ * to try to handle preemption here.
  *
- *     Y        User space GSBASE, must be restored unconditionally
+ * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
  */
 ENTRY(paranoid_exit)
 	UNWIND_HINT_REGS
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF_DEBUG
-
-	/* Handle GS depending on FSGSBASE availability */
-	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "nop",X86_FEATURE_FSGSBASE
-
-	/* With FSGSBASE enabled, unconditionally restore GSBASE */
-	wrgsbase	%rbx
-	jmp	.Lparanoid_exit_no_swapgs;
-
-.Lparanoid_exit_checkgs:
-	/* On non-FSGSBASE systems, conditionally do SWAPGS */
-	testl	%ebx, %ebx
+	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
 	/* Always restore stashed CR3 value (see paranoid_entry) */
 	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 	SWAPGS_UNSAFE_STACK
 	jmp	.Lparanoid_exit_restore
-
 .Lparanoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG
 	/* Always restore stashed CR3 value (see paranoid_entry) */
 	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
-
 .Lparanoid_exit_restore:
 	jmp restore_regs_and_return_to_kernel
 END(paranoid_exit)
 
 /*
@@ -1692,27 +1636,10 @@ end_repeat_nmi:
 	/* Always restore stashed CR3 value (see paranoid_entry) */
 	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
 
-	/*
-	 * The above invocation of paranoid_entry stored the GSBASE
-	 * related information in R/EBX depending on the availability
-	 * of FSGSBASE.
-	 *
-	 * If FSGSBASE is enabled, restore the saved GSBASE value
-	 * unconditionally, otherwise take the conditional SWAPGS path.
-	 */
-	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
-
-	wrgsbase	%rbx
-	jmp	nmi_restore
-
-nmi_no_fsgsbase:
-	/* EBX == 0 -> invoke SWAPGS */
-	testl	%ebx, %ebx
+	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
-
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
-
 nmi_restore:
 	POP_REGS
 
@@ -1743,11 +1670,17 @@ nmi_restore:
 	iretq
 END(nmi)
 
+#ifndef CONFIG_IA32_EMULATION
+/*
+ * This handles SYSCALL from 32-bit code. There is no way to program
+ * MSRs to fully disable 32-bit SYSCALL.
+ */
 ENTRY(ignore_sysret)
 	UNWIND_HINT_EMPTY
 	mov	$-ENOSYS, %eax
 	sysret
 END(ignore_sysret)
+#endif
 
 ENTRY(rewind_stack_do_exit)
 	UNWIND_HINT_FUNC
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index aefd53767a5d..bca4c743de77 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -19,63 +19,36 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task);
 extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
 extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
 
-/* Must be protected by X86_FEATURE_FSGSBASE check. */
+/* Helper functions for reading/writing FS/GS base */
 
-static __always_inline unsigned long rdfsbase(void)
+static inline unsigned long x86_fsbase_read_cpu(void)
 {
 	unsigned long fsbase;
 
-	asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
+	rdmsrl(MSR_FS_BASE, fsbase);
 
 	return fsbase;
 }
 
-static __always_inline unsigned long rdgsbase(void)
+static inline unsigned long x86_gsbase_read_cpu_inactive(void)
 {
 	unsigned long gsbase;
 
-	asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");
+	rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
 
 	return gsbase;
 }
 
-static __always_inline void wrfsbase(unsigned long fsbase)
-{
-	asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory");
-}
-
-static __always_inline void wrgsbase(unsigned long gsbase)
-{
-	asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
-}
-
-#include <asm/cpufeature.h>
-
-/* Helper functions for reading/writing FS/GS base */
-
-static inline unsigned long x86_fsbase_read_cpu(void)
+static inline void x86_fsbase_write_cpu(unsigned long fsbase)
 {
-	unsigned long fsbase;
-
-	if (static_cpu_has(X86_FEATURE_FSGSBASE))
-		fsbase = rdfsbase();
-	else
-		rdmsrl(MSR_FS_BASE, fsbase);
-
-	return fsbase;
+	wrmsrl(MSR_FS_BASE, fsbase);
 }
 
-static inline void x86_fsbase_write_cpu(unsigned long fsbase)
+static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
 {
-	if (static_cpu_has(X86_FEATURE_FSGSBASE))
-		wrfsbase(fsbase);
-	else
-		wrmsrl(MSR_FS_BASE, fsbase);
+	wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
 }
 
-extern unsigned long x86_gsbase_read_cpu_inactive(void);
-extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
-
 #endif /* CONFIG_X86_64 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index d063841a17e3..f5a796da07f8 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -306,21 +306,6 @@
 	.endif
 	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
 	.endm
-
-.macro RDPID opd
-	REG_TYPE rdpid_opd_type \opd
-	.if rdpid_opd_type == REG_TYPE_R64
-	R64_NUM rdpid_opd \opd
-	.else
-	R32_NUM rdpid_opd \opd
-	.endif
-	.byte 0xf3
-	.if rdpid_opd > 7
-	PFX_REX rdpid_opd 0
-	.endif
-	.byte 0x0f, 0xc7
-	MODRM 0xc0 rdpid_opd 0x7
-.endm
 #endif
 
 #endif
diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h
index c5ce54e749f6..6ebaae90e207 100644
--- a/arch/x86/include/uapi/asm/hwcap2.h
+++ b/arch/x86/include/uapi/asm/hwcap2.h
@@ -5,7 +5,4 @@
 /* MONITOR/MWAIT enabled in Ring 3 */
 #define HWCAP2_RING3MWAIT		(1 << 0)
 
-/* Kernel allows FSGSBASE instructions available in Ring 3 */
-#define HWCAP2_FSGSBASE			BIT(1)
-
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0948fc25446a..482f74859fb7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -366,22 +366,6 @@ out:
 	cr4_clear_bits(X86_CR4_UMIP);
 }
 
-static __init int x86_nofsgsbase_setup(char *arg)
-{
-	/* Require an exact match without trailing characters. */
-	if (strlen(arg))
-		return 0;
-
-	/* Do not emit a message if the feature is not present. */
-	if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
-		return 1;
-
-	setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
-	pr_info("FSGSBASE disabled via kernel command line\n");
-	return 1;
-}
-__setup("nofsgsbase", x86_nofsgsbase_setup);
-
 /*
  * Protection Keys are not available in 32-bit mode.
  */
@@ -1387,12 +1371,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	setup_smap(c);
 	setup_umip(c);
 
-	/* Enable FSGSBASE instructions if available. */
-	if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
-		cr4_set_bits(X86_CR4_FSGSBASE);
-		elf_hwcap2 |= HWCAP2_FSGSBASE;
-	}
-
 	/*
 	 * The vendor-specific functions might have changed features.
 	 * Now we do "generic changes."
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index f17c1a714779..8d6d92ebeb54 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c)
 	}
 }
 
+/*
+ * Processors which have self-snooping capability can handle conflicting
+ * memory type across CPUs by snooping its own cache. However, there exists
+ * CPU models in which having conflicting memory types still leads to
+ * unpredictable behavior, machine check errors, or hangs. Clear this
+ * feature to prevent its use on machines with known erratas.
+ */
+static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
+{
+	switch (c->x86_model) {
+	case INTEL_FAM6_CORE_YONAH:
+	case INTEL_FAM6_CORE2_MEROM:
+	case INTEL_FAM6_CORE2_MEROM_L:
+	case INTEL_FAM6_CORE2_PENRYN:
+	case INTEL_FAM6_CORE2_DUNNINGTON:
+	case INTEL_FAM6_NEHALEM:
+	case INTEL_FAM6_NEHALEM_G:
+	case INTEL_FAM6_NEHALEM_EP:
+	case INTEL_FAM6_NEHALEM_EX:
+	case INTEL_FAM6_WESTMERE:
+	case INTEL_FAM6_WESTMERE_EP:
+	case INTEL_FAM6_SANDYBRIDGE:
+		setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
+	}
+}
+
 static bool ring3mwait_disabled __read_mostly;
 
 static int __init ring3mwait_disable(char *__unused)
@@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	}
 
 	check_mpx_erratum(c);
+	check_memory_type_self_snoop_errata(c);
 
 	/*
 	 * Get the number of SMT siblings early from the extended topology
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 9356c1c9024d..aa5c064a6a22 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -743,7 +743,15 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 	/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
 	cr0 = read_cr0() | X86_CR0_CD;
 	write_cr0(cr0);
-	wbinvd();
+
+	/*
+	 * Cache flushing is the most time-consuming step when programming
+	 * the MTRRs. Fortunately, as per the Intel Software Development
+	 * Manual, we can skip it if the processor supports cache self-
+	 * snooping.
+	 */
+	if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+		wbinvd();
 
 	/* Save value of CR4 and clear Page Global Enable (bit 7) */
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
@@ -760,7 +768,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 
 	/* Disable MTRRs, and set the default type to uncached */
 	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
-	wbinvd();
+
+	/* Again, only flush caches if we have to. */
+	if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+		wbinvd();
 }
 
 static void post_set(void) __releases(set_atomicity_lock)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 8f239091c15d..250e4c4ac6d9 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -162,40 +162,6 @@ enum which_selector {
 };
 
 /*
- * Out of line to be protected from kprobes. It is not used on Xen
- * paravirt. When paravirt support is needed, it needs to be renamed
- * with native_ prefix.
- */
-static noinline unsigned long __rdgsbase_inactive(void)
-{
-	unsigned long gsbase;
-
-	lockdep_assert_irqs_disabled();
-
-	native_swapgs();
-	gsbase = rdgsbase();
-	native_swapgs();
-
-	return gsbase;
-}
-NOKPROBE_SYMBOL(__rdgsbase_inactive);
-
-/*
- * Out of line to be protected from kprobes. It is not used on Xen
- * paravirt. When paravirt support is needed, it needs to be renamed
- * with native_ prefix.
- */
-static noinline void __wrgsbase_inactive(unsigned long gsbase)
-{
-	lockdep_assert_irqs_disabled();
-
-	native_swapgs();
-	wrgsbase(gsbase);
-	native_swapgs();
-}
-NOKPROBE_SYMBOL(__wrgsbase_inactive);
-
-/*
  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
  * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
  * It's forcibly inlined because it'll generate better code and this function
@@ -244,22 +210,8 @@ static __always_inline void save_fsgs(struct task_struct *task)
 {
 	savesegment(fs, task->thread.fsindex);
 	savesegment(gs, task->thread.gsindex);
-	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
-		unsigned long flags;
-
-		/*
-		 * If FSGSBASE is enabled, we can't make any useful guesses
-		 * about the base, and user code expects us to save the current
-		 * value.  Fortunately, reading the base directly is efficient.
-		 */
-		task->thread.fsbase = rdfsbase();
-		local_irq_save(flags);
-		task->thread.gsbase = __rdgsbase_inactive();
-		local_irq_restore(flags);
-	} else {
-		save_base_legacy(task, task->thread.fsindex, FS);
-		save_base_legacy(task, task->thread.gsindex, GS);
-	}
+	save_base_legacy(task, task->thread.fsindex, FS);
+	save_base_legacy(task, task->thread.gsindex, GS);
 }
 
 #if IS_ENABLED(CONFIG_KVM)
@@ -338,22 +290,10 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
 					      struct thread_struct *next)
 {
-	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
-		/* Update the FS and GS selectors if they could have changed. */
-		if (unlikely(prev->fsindex || next->fsindex))
-			loadseg(FS, next->fsindex);
-		if (unlikely(prev->gsindex || next->gsindex))
-			loadseg(GS, next->gsindex);
-
-		/* Update the bases. */
-		wrfsbase(next->fsbase);
-		__wrgsbase_inactive(next->gsbase);
-	} else {
-		load_seg_legacy(prev->fsindex, prev->fsbase,
-				next->fsindex, next->fsbase, FS);
-		load_seg_legacy(prev->gsindex, prev->gsbase,
-				next->gsindex, next->gsbase, GS);
-	}
+	load_seg_legacy(prev->fsindex, prev->fsbase,
+			next->fsindex, next->fsbase, FS);
+	load_seg_legacy(prev->gsindex, prev->gsbase,
+			next->gsindex, next->gsbase, GS);
 }
 
 static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
@@ -399,46 +339,13 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
 	return base;
 }
 
-unsigned long x86_gsbase_read_cpu_inactive(void)
-{
-	unsigned long gsbase;
-
-	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
-		unsigned long flags;
-
-		/* Interrupts are disabled here. */
-		local_irq_save(flags);
-		gsbase = __rdgsbase_inactive();
-		local_irq_restore(flags);
-	} else {
-		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
-	}
-
-	return gsbase;
-}
-
-void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
-{
-	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
-		unsigned long flags;
-
-		/* Interrupts are disabled here. */
-		local_irq_save(flags);
-		__wrgsbase_inactive(gsbase);
-		local_irq_restore(flags);
-	} else {
-		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
-	}
-}
-
 unsigned long x86_fsbase_read_task(struct task_struct *task)
 {
 	unsigned long fsbase;
 
 	if (task == current)
 		fsbase = x86_fsbase_read_cpu();
-	else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
-		 (task->thread.fsindex == 0))
+	else if (task->thread.fsindex == 0)
 		fsbase = task->thread.fsbase;
 	else
 		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
@@ -452,8 +359,7 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)
 
 	if (task == current)
 		gsbase = x86_gsbase_read_cpu_inactive();
-	else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
-		 (task->thread.gsindex == 0))
+	else if (task->thread.gsindex == 0)
 		gsbase = task->thread.gsbase;
 	else
 		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
@@ -493,11 +399,10 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 	p->thread.sp = (unsigned long) fork_frame;
 	p->thread.io_bitmap_ptr = NULL;
 
-	save_fsgs(me);
-	p->thread.fsindex = me->thread.fsindex;
-	p->thread.fsbase = me->thread.fsbase;
-	p->thread.gsindex = me->thread.gsindex;
-	p->thread.gsbase = me->thread.gsbase;
+	savesegment(gs, p->thread.gsindex);
+	p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
+	savesegment(fs, p->thread.fsindex);
+	p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 186520198de7..fa07d526fe39 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -12,8 +12,9 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie)
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
 			check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \
-			protection_keys test_vdso test_vsyscall mov_ss_trap
-TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
+			protection_keys test_vdso test_vsyscall mov_ss_trap \
+			syscall_arg_fault
+TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
 			vdso_restorer
 TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
index 21fd4f94b5b0..5ab4c60c100e 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -35,6 +35,8 @@
 static volatile sig_atomic_t want_segv;
 static volatile unsigned long segv_addr;
 
+static unsigned short *shared_scratch;
+
 static int nerrs;
 
 static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
@@ -242,16 +244,11 @@ static void do_remote_base()
 
 static __thread int set_thread_area_entry_number = -1;
 
-static void do_unexpected_base(void)
+static unsigned short load_gs(void)
 {
 	/*
-	 * The goal here is to try to arrange for GS == 0, GSBASE !=
-	 * 0, and for the the kernel the think that GSBASE == 0.
-	 *
-	 * To make the test as reliable as possible, this uses
-	 * explicit descriptors.  (This is not the only way.  This
-	 * could use ARCH_SET_GS with a low, nonzero base, but the
-	 * relevant side effect of ARCH_SET_GS could change.)
+	 * Sets GS != 0 and GSBASE != 0 but arranges for the kernel to think
+	 * that GSBASE == 0 (i.e. thread.gsbase == 0).
 	 */
 
 	/* Step 1: tell the kernel that we have GSBASE == 0. */
@@ -271,8 +268,9 @@ static void do_unexpected_base(void)
 		.useable		= 0
 	};
 	if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
-		printf("\tother thread: using LDT slot 0\n");
+		printf("\tusing LDT slot 0\n");
 		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+		return 0x7;
 	} else {
 		/* No modify_ldt for us (configured out, perhaps) */
 
@@ -294,20 +292,15 @@ static void do_unexpected_base(void)
 
 		if (ret != 0) {
 			printf("[NOTE]\tcould not create a segment -- test won't do anything\n");
-			return;
+			return 0;
 		}
-		printf("\tother thread: using GDT slot %d\n", desc.entry_number);
+		printf("\tusing GDT slot %d\n", desc.entry_number);
 		set_thread_area_entry_number = desc.entry_number;
 
-		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3)));
+		unsigned short gs = (unsigned short)((desc.entry_number << 3) | 0x3);
+		asm volatile ("mov %0, %%gs" : : "rm" (gs));
+		return gs;
 	}
-
-	/*
-	 * Step 3: set the selector back to zero.  On AMD chips, this will
-	 * preserve GSBASE.
-	 */
-
-	asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
 }
 
 void test_wrbase(unsigned short index, unsigned long base)
@@ -346,12 +339,19 @@ static void *threadproc(void *ctx)
 	if (ftx == 3)
 		return NULL;
 
-	if (ftx == 1)
+	if (ftx == 1) {
 		do_remote_base();
-	else if (ftx == 2)
-		do_unexpected_base();
-	else
+	} else if (ftx == 2) {
+		/*
+		 * On AMD chips, this causes GSBASE != 0, GS == 0, and
+		 * thread.gsbase == 0.
+		 */
+
+		load_gs();
+		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+	} else {
 		errx(1, "helper thread got bad command");
+	}
 
 	ftx = 0;
 	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
@@ -453,12 +453,7 @@ static void test_ptrace_write_gsbase(void)
 	if (child == 0) {
 		printf("[RUN]\tPTRACE_POKE(), write GSBASE from ptracer\n");
 
-		/*
-		 * Use the LDT setup and fetch the GSBASE from the LDT
-		 * by switching to the (nonzero) selector (again)
-		 */
-		do_unexpected_base();
-		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+		*shared_scratch = load_gs();
 
 		if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0)
 			err(1, "PTRACE_TRACEME");
@@ -476,7 +471,7 @@ static void test_ptrace_write_gsbase(void)
 
 	gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
 
-	if (gs != 0x7) {
+	if (gs != *shared_scratch) {
 		nerrs++;
 		printf("[FAIL]\tGS is not prepared with nonzero\n");
 		goto END;
@@ -494,16 +489,24 @@ static void test_ptrace_write_gsbase(void)
 	 * selector value is changed or not by the GSBASE write in
 	 * a ptracer.
 	 */
-	if (gs != 0x7) {
+	if (gs != *shared_scratch) {
 		nerrs++;
 		printf("[FAIL]\tGS changed to %lx\n", gs);
+
+		/*
+		 * On older kernels, poking a nonzero value into the
+		 * base would zero the selector.  On newer kernels,
+		 * this behavior has changed -- poking the base
+		 * changes only the base and, if FSGSBASE is not
+		 * available, this may have no effect.
+		 */
+		if (gs == 0)
+			printf("\tNote: this is expected behavior on older kernels.\n");
 	} else if (have_fsgsbase && (base != 0xFF)) {
 		nerrs++;
 		printf("[FAIL]\tGSBASE changed to %lx\n", base);
 	} else {
-		printf("[OK]\tGS remained 0x7 %s");
-		if (have_fsgsbase)
-			printf("and GSBASE changed to 0xFF");
+		printf("[OK]\tGS remained 0x%hx%s", *shared_scratch, have_fsgsbase ? " and GSBASE changed to 0xFF" : "");
 		printf("\n");
 	}
 }
@@ -516,6 +519,9 @@ int main()
 {
 	pthread_t thread;
 
+	shared_scratch = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+			      MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+
 	/* Probe FSGSBASE */
 	sethandler(SIGILL, sigill, 0);
 	if (sigsetjmp(jmpbuf, 1) == 0) {
diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c
index 4e25d38c8bbd..bc0ecc2e862e 100644
--- a/tools/testing/selftests/x86/syscall_arg_fault.c
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
@@ -15,9 +15,30 @@
 #include <setjmp.h>
 #include <errno.h>
 
+#ifdef __x86_64__
+# define WIDTH "q"
+#else
+# define WIDTH "l"
+#endif
+
 /* Our sigaltstack scratch space. */
 static unsigned char altstack_data[SIGSTKSZ];
 
+static unsigned long get_eflags(void)
+{
+	unsigned long eflags;
+	asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
+	return eflags;
+}
+
+static void set_eflags(unsigned long eflags)
+{
+	asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
+		      : : "rm" (eflags) : "flags");
+}
+
+#define X86_EFLAGS_TF (1UL << 8)
+
 static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
 		       int flags)
 {
@@ -35,13 +56,22 @@ static sigjmp_buf jmpbuf;
 
 static volatile sig_atomic_t n_errs;
 
+#ifdef __x86_64__
+#define REG_AX	REG_RAX
+#define REG_IP	REG_RIP
+#else
+#define REG_AX	REG_EAX
+#define REG_IP	REG_EIP
+#endif
+
 static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
 {
 	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	long ax = (long)ctx->uc_mcontext.gregs[REG_AX];
 
-	if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) {
-		printf("[FAIL]\tAX had the wrong value: 0x%x\n",
-		       ctx->uc_mcontext.gregs[REG_EAX]);
+	if (ax != -EFAULT && ax != -ENOSYS) {
+		printf("[FAIL]\tAX had the wrong value: 0x%lx\n",
+		       (unsigned long)ax);
 		n_errs++;
 	} else {
 		printf("[OK]\tSeems okay\n");
@@ -50,9 +80,42 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
 	siglongjmp(jmpbuf, 1);
 }
 
+static volatile sig_atomic_t sigtrap_consecutive_syscalls;
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+	/*
+	 * KVM has some bugs that can cause us to stop making progress.
+	 * detect them and complain, but don't infinite loop or fail the
+	 * test.
+	 */
+
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];
+
+	if (*ip == 0x340f || *ip == 0x050f) {
+		/* The trap was on SYSCALL or SYSENTER */
+		sigtrap_consecutive_syscalls++;
+		if (sigtrap_consecutive_syscalls > 3) {
+			printf("[WARN]\tGot stuck single-stepping -- you probably have a KVM bug\n");
+			siglongjmp(jmpbuf, 1);
+		}
+	} else {
+		sigtrap_consecutive_syscalls = 0;
+	}
+}
+
 static void sigill(int sig, siginfo_t *info, void *ctx_void)
 {
-	printf("[SKIP]\tIllegal instruction\n");
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];
+
+	if (*ip == 0x0b0f) {
+		/* one of the ud2 instructions faulted */
+		printf("[OK]\tSYSCALL returned normally\n");
+	} else {
+		printf("[SKIP]\tIllegal instruction\n");
+	}
 	siglongjmp(jmpbuf, 1);
 }
 
@@ -120,9 +183,48 @@ int main()
120 "movl $-1, %%ebp\n\t" 183 "movl $-1, %%ebp\n\t"
121 "movl $-1, %%esp\n\t" 184 "movl $-1, %%esp\n\t"
122 "syscall\n\t" 185 "syscall\n\t"
123 "pushl $0" /* make sure we segfault cleanly */ 186 "ud2" /* make sure we recover cleanly */
187 : : : "memory", "flags");
188 }
189
190 printf("[RUN]\tSYSENTER with TF and invalid state\n");
191 sethandler(SIGTRAP, sigtrap, SA_ONSTACK);
192
193 if (sigsetjmp(jmpbuf, 1) == 0) {
194 sigtrap_consecutive_syscalls = 0;
195 set_eflags(get_eflags() | X86_EFLAGS_TF);
196 asm volatile (
197 "movl $-1, %%eax\n\t"
198 "movl $-1, %%ebx\n\t"
199 "movl $-1, %%ecx\n\t"
200 "movl $-1, %%edx\n\t"
201 "movl $-1, %%esi\n\t"
202 "movl $-1, %%edi\n\t"
203 "movl $-1, %%ebp\n\t"
204 "movl $-1, %%esp\n\t"
205 "sysenter"
206 : : : "memory", "flags");
207 }
208 set_eflags(get_eflags() & ~X86_EFLAGS_TF);
209
210 printf("[RUN]\tSYSCALL with TF and invalid state\n");
211 if (sigsetjmp(jmpbuf, 1) == 0) {
212 sigtrap_consecutive_syscalls = 0;
213 set_eflags(get_eflags() | X86_EFLAGS_TF);
214 asm volatile (
215 "movl $-1, %%eax\n\t"
216 "movl $-1, %%ebx\n\t"
217 "movl $-1, %%ecx\n\t"
218 "movl $-1, %%edx\n\t"
219 "movl $-1, %%esi\n\t"
220 "movl $-1, %%edi\n\t"
221 "movl $-1, %%ebp\n\t"
222 "movl $-1, %%esp\n\t"
223 "syscall\n\t"
224 "ud2" /* make sure we recover cleanly */
124 : : : "memory", "flags"); 225 : : : "memory", "flags");
125 } 226 }
227 set_eflags(get_eflags() & ~X86_EFLAGS_TF);
126 228
127 return 0; 229 return 0;
128} 230}