author     Linus Torvalds <torvalds@linux-foundation.org>    2018-01-14 12:51:25 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2018-01-14 12:51:25 -0500
commit     40548c6b6c134275c750eb372dc2cf8ee1bbc3d4 (patch)
tree       3bfc6943c3b43f1e345ddb7c88996e7b2f121fcd
parent     2c1cfa49901839136e578ca516a7e230182da024 (diff)
parent     99a9dc98ba52267ce5e062b52de88ea1f1b2a7d8 (diff)
Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 pti updates from Thomas Gleixner:
 "This contains:

  - a PTI bugfix to avoid setting reserved CR3 bits when PCID is
    disabled. This seems to cause issues on a virtual machine at least
    and is incorrect according to the AMD manual.

  - a PTI bugfix which disables the perf BTS facility if PTI is
    enabled. The BTS AUX buffer is not globally visible and causes the
    CPU to fault when the mapping disappears on switching CR3 to user
    space. A full fix which restores BTS on PTI is non trivial and will
    be worked on.

  - PTI bugfixes for EFI and trusted boot which make sure that the user
    space visible page table entries have the NX bit cleared

  - removal of dead code in the PTI pagetable setup functions

  - add PTI documentation

  - add a selftest for vsyscall to verify that the kernel actually
    implements what it advertises.

  - a sysfs interface to expose vulnerability and mitigation
    information so there is a coherent way for users to retrieve the
    status.

  - the initial spectre_v2 mitigations, aka retpoline:

     + The necessary ASM thunk and compiler support

     + The ASM variants of retpoline and the conversion of affected ASM
       code

     + Make LFENCE serializing on AMD so it can be used as speculation
       trap

     + The RSB fill after vmexit

  - initial objtool support for retpoline

  As I said in the status mail this is the most of the set of patches
  which should go into 4.15 except two straight forward patches still
  on hold:

  - the retpoline add on of LFENCE which waits for ACKs

  - the RSB fill after context switch

  Both should be ready to go early next week and with that we'll have
  covered the major holes of spectre_v2 and go back to normality"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (28 commits)
  x86,perf: Disable intel_bts when PTI
  security/Kconfig: Correct the Documentation reference for PTI
  x86/pti: Fix !PCID and sanitize defines
  selftests/x86: Add test_vsyscall
  x86/retpoline: Fill return stack buffer on vmexit
  x86/retpoline/irq32: Convert assembler indirect jumps
  x86/retpoline/checksum32: Convert assembler indirect jumps
  x86/retpoline/xen: Convert Xen hypercall indirect jumps
  x86/retpoline/hyperv: Convert assembler indirect jumps
  x86/retpoline/ftrace: Convert ftrace assembler indirect jumps
  x86/retpoline/entry: Convert entry assembler indirect jumps
  x86/retpoline/crypto: Convert crypto assembler indirect jumps
  x86/spectre: Add boot time option to select Spectre v2 mitigation
  x86/retpoline: Add initial retpoline support
  objtool: Allow alternatives to be ignored
  objtool: Detect jumps to retpoline thunks
  x86/pti: Make unpoison of pgd for trusted boot work for real
  x86/alternatives: Fix optimize_nops() checking
  sysfs/cpu: Fix typos in vulnerability documentation
  x86/cpu/AMD: Use LFENCE_RDTSC in preference to MFENCE_RDTSC
  ...
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-system-cpu |  16
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt    |  49
-rw-r--r--  Documentation/x86/pti.txt                          | 186
-rw-r--r--  arch/x86/Kconfig                                   |  14
-rw-r--r--  arch/x86/Makefile                                  |  10
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S                  |   5
-rw-r--r--  arch/x86/crypto/camellia-aesni-avx-asm_64.S        |   3
-rw-r--r--  arch/x86/crypto/camellia-aesni-avx2-asm_64.S       |   3
-rw-r--r--  arch/x86/crypto/crc32c-pcl-intel-asm_64.S          |   3
-rw-r--r--  arch/x86/entry/calling.h                           |  36
-rw-r--r--  arch/x86/entry/entry_32.S                          |   5
-rw-r--r--  arch/x86/entry/entry_64.S                          |  12
-rw-r--r--  arch/x86/events/intel/bts.c                        |  18
-rw-r--r--  arch/x86/include/asm/asm-prototypes.h              |  25
-rw-r--r--  arch/x86/include/asm/cpufeatures.h                 |   4
-rw-r--r--  arch/x86/include/asm/mshyperv.h                    |  18
-rw-r--r--  arch/x86/include/asm/msr-index.h                   |   3
-rw-r--r--  arch/x86/include/asm/nospec-branch.h               | 214
-rw-r--r--  arch/x86/include/asm/processor-flags.h             |   2
-rw-r--r--  arch/x86/include/asm/tlbflush.h                    |   6
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h               |   5
-rw-r--r--  arch/x86/kernel/alternative.c                      |   7
-rw-r--r--  arch/x86/kernel/cpu/amd.c                          |  28
-rw-r--r--  arch/x86/kernel/cpu/bugs.c                         | 185
-rw-r--r--  arch/x86/kernel/cpu/common.c                       |   3
-rw-r--r--  arch/x86/kernel/ftrace_32.S                        |   6
-rw-r--r--  arch/x86/kernel/ftrace_64.S                        |   8
-rw-r--r--  arch/x86/kernel/irq_32.c                           |   9
-rw-r--r--  arch/x86/kernel/tboot.c                            |  11
-rw-r--r--  arch/x86/kvm/svm.c                                 |   4
-rw-r--r--  arch/x86/kvm/vmx.c                                 |   4
-rw-r--r--  arch/x86/lib/Makefile                              |   1
-rw-r--r--  arch/x86/lib/checksum_32.S                         |   7
-rw-r--r--  arch/x86/lib/retpoline.S                           |  48
-rw-r--r--  arch/x86/mm/pti.c                                  |  32
-rw-r--r--  arch/x86/platform/efi/efi_64.c                     |   2
-rw-r--r--  drivers/base/Kconfig                               |   3
-rw-r--r--  drivers/base/cpu.c                                 |  48
-rw-r--r--  include/linux/cpu.h                                |   7
-rw-r--r--  security/Kconfig                                   |   2
-rw-r--r--  tools/objtool/check.c                              |  69
-rw-r--r--  tools/objtool/check.h                              |   2
-rw-r--r--  tools/testing/selftests/x86/Makefile               |   2
-rw-r--r--  tools/testing/selftests/x86/test_vsyscall.c        | 500
44 files changed, 1525 insertions, 100 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index d6d862db3b5d..bfd29bc8d37a 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -375,3 +375,19 @@ Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
375Description: information about CPUs heterogeneity. 375Description: information about CPUs heterogeneity.
376 376
377 cpu_capacity: capacity of cpu#. 377 cpu_capacity: capacity of cpu#.
378
379What: /sys/devices/system/cpu/vulnerabilities
380 /sys/devices/system/cpu/vulnerabilities/meltdown
381 /sys/devices/system/cpu/vulnerabilities/spectre_v1
382 /sys/devices/system/cpu/vulnerabilities/spectre_v2
383Date: January 2018
384Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
385Description: Information about CPU vulnerabilities
386
387 The files are named after the code names of CPU
388 vulnerabilities. The output of those files reflects the
389 state of the CPUs in the system. Possible output values:
390
391 "Not affected" CPU is not affected by the vulnerability
392 "Vulnerable" CPU is affected and no mitigation in effect
393 "Mitigation: $M" CPU is affected and mitigation $M is in effect
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index faccc582c0f9..46b26bfee27b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2623,6 +2623,11 @@
2623 nosmt [KNL,S390] Disable symmetric multithreading (SMT). 2623 nosmt [KNL,S390] Disable symmetric multithreading (SMT).
2624 Equivalent to smt=1. 2624 Equivalent to smt=1.
2625 2625
2626 nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2
2627 (indirect branch prediction) vulnerability. System may
2628 allow data leaks with this option, which is equivalent
2629 to spectre_v2=off.
2630
2626 noxsave [BUGS=X86] Disables x86 extended register state save 2631 noxsave [BUGS=X86] Disables x86 extended register state save
2627 and restore using xsave. The kernel will fallback to 2632 and restore using xsave. The kernel will fallback to
2628 enabling legacy floating-point and sse state. 2633 enabling legacy floating-point and sse state.
@@ -2709,8 +2714,6 @@
2709 steal time is computed, but won't influence scheduler 2714 steal time is computed, but won't influence scheduler
2710 behaviour 2715 behaviour
2711 2716
2712 nopti [X86-64] Disable kernel page table isolation
2713
2714 nolapic [X86-32,APIC] Do not enable or use the local APIC. 2717 nolapic [X86-32,APIC] Do not enable or use the local APIC.
2715 2718
2716 nolapic_timer [X86-32,APIC] Do not use the local APIC timer. 2719 nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
@@ -3291,11 +3294,20 @@
3291 pt. [PARIDE] 3294 pt. [PARIDE]
3292 See Documentation/blockdev/paride.txt. 3295 See Documentation/blockdev/paride.txt.
3293 3296
3294 pti= [X86_64] 3297 pti= [X86_64] Control Page Table Isolation of user and
3295 Control user/kernel address space isolation: 3298 kernel address spaces. Disabling this feature
3296 on - enable 3299 removes hardening, but improves performance of
3297 off - disable 3300 system calls and interrupts.
3298 auto - default setting 3301
3302 on - unconditionally enable
3303 off - unconditionally disable
3304 auto - kernel detects whether your CPU model is
3305 vulnerable to issues that PTI mitigates
3306
3307 Not specifying this option is equivalent to pti=auto.
3308
3309 nopti [X86_64]
3310 Equivalent to pti=off
3299 3311
3300 pty.legacy_count= 3312 pty.legacy_count=
3301 [KNL] Number of legacy pty's. Overwrites compiled-in 3313 [KNL] Number of legacy pty's. Overwrites compiled-in
@@ -3946,6 +3958,29 @@
3946 sonypi.*= [HW] Sony Programmable I/O Control Device driver 3958 sonypi.*= [HW] Sony Programmable I/O Control Device driver
3947 See Documentation/laptops/sonypi.txt 3959 See Documentation/laptops/sonypi.txt
3948 3960
3961 spectre_v2= [X86] Control mitigation of Spectre variant 2
3962 (indirect branch speculation) vulnerability.
3963
3964 on - unconditionally enable
3965 off - unconditionally disable
3966 auto - kernel detects whether your CPU model is
3967 vulnerable
3968
3969 Selecting 'on' will, and 'auto' may, choose a
3970 mitigation method at run time according to the
3971 CPU, the available microcode, the setting of the
3972 CONFIG_RETPOLINE configuration option, and the
3973 compiler with which the kernel was built.
3974
3975 Specific mitigations can also be selected manually:
3976
3977 retpoline - replace indirect branches
3978 retpoline,generic - google's original retpoline
3979 retpoline,amd - AMD-specific minimal thunk
3980
3981 Not specifying this option is equivalent to
3982 spectre_v2=auto.
3983
3949 spia_io_base= [HW,MTD] 3984 spia_io_base= [HW,MTD]
3950 spia_fio_base= 3985 spia_fio_base=
3951 spia_pedr= 3986 spia_pedr=
diff --git a/Documentation/x86/pti.txt b/Documentation/x86/pti.txt
new file mode 100644
index 000000000000..d11eff61fc9a
--- /dev/null
+++ b/Documentation/x86/pti.txt
@@ -0,0 +1,186 @@
1Overview
2========
3
4Page Table Isolation (pti, previously known as KAISER[1]) is a
5countermeasure against attacks on the shared user/kernel address
6space such as the "Meltdown" approach[2].
7
8To mitigate this class of attacks, we create an independent set of
9page tables for use only when running userspace applications. When
10the kernel is entered via syscalls, interrupts or exceptions, the
11page tables are switched to the full "kernel" copy. When the system
12switches back to user mode, the user copy is used again.
13
14The userspace page tables contain only a minimal amount of kernel
15data: only what is needed to enter/exit the kernel such as the
16entry/exit functions themselves and the interrupt descriptor table
17(IDT). There are a few strictly unnecessary things that get mapped
18such as the first C function when entering an interrupt (see
19comments in pti.c).
20
21This approach helps to ensure that side-channel attacks leveraging
22the paging structures do not function when PTI is enabled. It can be
23enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time.
24Once enabled at compile-time, it can be disabled at boot with the
25'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt).
26
27Page Table Management
28=====================
29
30When PTI is enabled, the kernel manages two sets of page tables.
31The first set is very similar to the single set which is present in
32kernels without PTI. This includes a complete mapping of userspace
33that the kernel can use for things like copy_to_user().
34
35Although _complete_, the user portion of the kernel page tables is
36crippled by setting the NX bit in the top level. This ensures
37that any missed kernel->user CR3 switch will immediately crash
38userspace upon executing its first instruction.
39
40The userspace page tables map only the kernel data needed to enter
41and exit the kernel. This data is entirely contained in the 'struct
42cpu_entry_area' structure which is placed in the fixmap which gives
43each CPU's copy of the area a compile-time-fixed virtual address.
44
45For new userspace mappings, the kernel makes the entries in its
46page tables like normal. The only difference is when the kernel
47makes entries in the top (PGD) level. In addition to setting the
48entry in the main kernel PGD, a copy of the entry is made in the
49userspace page tables' PGD.
50
51This sharing at the PGD level also inherently shares all the lower
52layers of the page tables. This leaves a single, shared set of
53userspace page tables to manage. One PTE to lock, one set of
54accessed bits, dirty bits, etc...
55
56Overhead
57========
58
59Protection against side-channel attacks is important. But,
60this protection comes at a cost:
61
621. Increased Memory Use
63 a. Each process now needs an order-1 PGD instead of order-0.
64 (Consumes an additional 4k per process).
65 b. The 'cpu_entry_area' structure must be 2MB in size and 2MB
66 aligned so that it can be mapped by setting a single PMD
67 entry. This consumes nearly 2MB of RAM once the kernel
68 is decompressed, but no space in the kernel image itself.
69
702. Runtime Cost
71 a. CR3 manipulation to switch between the page table copies
72 must be done at interrupt, syscall, and exception entry
73 and exit (it can be skipped when the kernel is interrupted,
74 though.) Moves to CR3 are on the order of a hundred
75 cycles, and are required at every entry and exit.
76 b. A "trampoline" must be used for SYSCALL entry. This
77 trampoline depends on a smaller set of resources than the
78 non-PTI SYSCALL entry code, so requires mapping fewer
79 things into the userspace page tables. The downside is
80 that stacks must be switched at entry time.
 81 c. Global pages are disabled for all kernel structures not
82 mapped into both kernel and userspace page tables. This
83 feature of the MMU allows different processes to share TLB
84 entries mapping the kernel. Losing the feature means more
85 TLB misses after a context switch. The actual loss of
86 performance is very small, however, never exceeding 1%.
87 d. Process Context IDentifiers (PCID) is a CPU feature that
88 allows us to skip flushing the entire TLB when switching page
89 tables by setting a special bit in CR3 when the page tables
90 are changed. This makes switching the page tables (at context
91 switch, or kernel entry/exit) cheaper. But, on systems with
92 PCID support, the context switch code must flush both the user
93 and kernel entries out of the TLB. The user PCID TLB flush is
94 deferred until the exit to userspace, minimizing the cost.
95 See intel.com/sdm for the gory PCID/INVPCID details.
96 e. The userspace page tables must be populated for each new
97 process. Even without PTI, the shared kernel mappings
98 are created by copying top-level (PGD) entries into each
99 new process. But, with PTI, there are now *two* kernel
100 mappings: one in the kernel page tables that maps everything
101 and one for the entry/exit structures. At fork(), we need to
102 copy both.
103 f. In addition to the fork()-time copying, there must also
104 be an update to the userspace PGD any time a set_pgd() is done
105 on a PGD used to map userspace. This ensures that the kernel
106 and userspace copies always map the same userspace
107 memory.
108 g. On systems without PCID support, each CR3 write flushes
109 the entire TLB. That means that each syscall, interrupt
110 or exception flushes the TLB.
111 h. INVPCID is a TLB-flushing instruction which allows flushing
112 of TLB entries for non-current PCIDs. Some systems support
113 PCIDs, but do not support INVPCID. On these systems, addresses
114 can only be flushed from the TLB for the current PCID. When
115 flushing a kernel address, we need to flush all PCIDs, so a
116 single kernel address flush will require a TLB-flushing CR3
117 write upon the next use of every PCID.
118
119Possible Future Work
120====================
1211. We can be more careful about not actually writing to CR3
122 unless its value is actually changed.
1232. Allow PTI to be enabled/disabled at runtime in addition to the
124 boot-time switching.
125
126Testing
127========
128
129To test stability of PTI, the following test procedure is recommended,
130ideally doing all of these in parallel:
131
1321. Set CONFIG_DEBUG_ENTRY=y
1332. Run several copies of all of the tools/testing/selftests/x86/ tests
134 (excluding MPX and protection_keys) in a loop on multiple CPUs for
135 several minutes. These tests frequently uncover corner cases in the
136 kernel entry code. In general, old kernels might cause these tests
137 themselves to crash, but they should never crash the kernel.
1383. Run the 'perf' tool in a mode (top or record) that generates many
139 frequent performance monitoring non-maskable interrupts (see "NMI"
140 in /proc/interrupts). This exercises the NMI entry/exit code which
141 is known to trigger bugs in code paths that did not expect to be
142 interrupted, including nested NMIs. Using "-c" boosts the rate of
143 NMIs, and using two -c with separate counters encourages nested NMIs
144 and less deterministic behavior.
145
146 while true; do perf record -c 10000 -e instructions,cycles -a sleep 10; done
147
1484. Launch a KVM virtual machine.
1495. Run 32-bit binaries on systems supporting the SYSCALL instruction.
150 This has been a lightly-tested code path and needs extra scrutiny.
151
152Debugging
153=========
154
155Bugs in PTI cause a few different signatures of crashes
156that are worth noting here.
157
158 * Failures of the selftests/x86 code. Usually a bug in one of the
159 more obscure corners of entry_64.S
160 * Crashes in early boot, especially around CPU bringup. Bugs
161 in the trampoline code or mappings cause these.
162 * Crashes at the first interrupt. Caused by bugs in entry_64.S,
163 like screwing up a page table switch. Also caused by
164 incorrectly mapping the IRQ handler entry code.
165 * Crashes at the first NMI. The NMI code is separate from main
166 interrupt handlers and can have bugs that do not affect
167 normal interrupts. Also caused by incorrectly mapping NMI
168 code. NMIs that interrupt the entry code must be very
169 careful and can be the cause of crashes that show up when
170 running perf.
171 * Kernel crashes at the first exit to userspace. entry_64.S
172 bugs, or failing to map some of the exit code.
173 * Crashes at first interrupt that interrupts userspace. The paths
174 in entry_64.S that return to userspace are sometimes separate
175 from the ones that return to the kernel.
176 * Double faults: overflowing the kernel stack because of page
177 faults upon page faults. Caused by touching non-pti-mapped
178 data in the entry code, or forgetting to switch to kernel
179 CR3 before calling into C functions which are not pti-mapped.
180 * Userspace segfaults early in boot, sometimes manifesting
181 as mount(8) failing to mount the rootfs. These have
182 tended to be TLB invalidation issues. Usually invalidating
183 the wrong PCID, or otherwise missing an invalidation.
184
1851. https://gruss.cc/files/kaiser.pdf
1862. https://meltdownattack.com/meltdown.pdf
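
To make the PGD sharing described in the "Page Table Management" section of this document concrete, here is a deliberately simplified userspace model (plain C, not kernel code; all names are invented for illustration) of writing a new userspace top-level entry into both the kernel and user copies so the lower levels stay shared:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define PTRS_PER_PGD 512

	/* One process: a full kernel PGD and a stripped-down user PGD. */
	struct mm_model {
		uint64_t kernel_pgd[PTRS_PER_PGD];	/* maps userspace + all of kernel */
		uint64_t user_pgd[PTRS_PER_PGD];	/* maps userspace + entry/exit only */
	};

	/* Conceptual set_pgd() for a userspace address: mirror into both copies. */
	static void set_user_pgd(struct mm_model *mm, unsigned int idx, uint64_t val)
	{
		mm->kernel_pgd[idx] = val;
		mm->user_pgd[idx] = val;	/* same entry => shared lower page-table levels */
	}

	int main(void)
	{
		struct mm_model mm;

		memset(&mm, 0, sizeof(mm));
		set_user_pgd(&mm, 0, 0x1000 | 0x67);	/* fake present+user PGD entry */
		printf("kernel copy %#llx == user copy %#llx\n",
		       (unsigned long long)mm.kernel_pgd[0],
		       (unsigned long long)mm.user_pgd[0]);
		return 0;
	}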
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ff4e9cd99854..20da391b5f32 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,6 +88,7 @@ config X86
88 select GENERIC_CLOCKEVENTS_MIN_ADJUST 88 select GENERIC_CLOCKEVENTS_MIN_ADJUST
89 select GENERIC_CMOS_UPDATE 89 select GENERIC_CMOS_UPDATE
90 select GENERIC_CPU_AUTOPROBE 90 select GENERIC_CPU_AUTOPROBE
91 select GENERIC_CPU_VULNERABILITIES
91 select GENERIC_EARLY_IOREMAP 92 select GENERIC_EARLY_IOREMAP
92 select GENERIC_FIND_FIRST_BIT 93 select GENERIC_FIND_FIRST_BIT
93 select GENERIC_IOMAP 94 select GENERIC_IOMAP
@@ -428,6 +429,19 @@ config GOLDFISH
428 def_bool y 429 def_bool y
429 depends on X86_GOLDFISH 430 depends on X86_GOLDFISH
430 431
432config RETPOLINE
433 bool "Avoid speculative indirect branches in kernel"
434 default y
435 help
436 Compile kernel with the retpoline compiler options to guard against
437 kernel-to-user data leaks by avoiding speculative indirect
438 branches. Requires a compiler with -mindirect-branch=thunk-extern
439 support for full protection. The kernel may run slower.
440
441 Without compiler support, at least indirect branches in assembler
442 code are eliminated. Since this includes the syscall entry path,
443 it is not entirely pointless.
444
431config INTEL_RDT 445config INTEL_RDT
432 bool "Intel Resource Director Technology support" 446 bool "Intel Resource Director Technology support"
433 default n 447 default n
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 3e73bc255e4e..e98f8e2e3708 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -230,6 +230,16 @@ KBUILD_CFLAGS += -Wno-sign-compare
230# 230#
231KBUILD_CFLAGS += -fno-asynchronous-unwind-tables 231KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
232 232
233# Avoid indirect branches in kernel to deal with Spectre
234ifdef CONFIG_RETPOLINE
235 RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
236 ifneq ($(RETPOLINE_CFLAGS),)
237 KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
238 else
239 $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.)
240 endif
241endif
242
233archscripts: scripts_basic 243archscripts: scripts_basic
234 $(Q)$(MAKE) $(build)=arch/x86/tools relocs 244 $(Q)$(MAKE) $(build)=arch/x86/tools relocs
235 245
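
The effect of -mindirect-branch=thunk-extern can be seen on any ordinary indirect call. The toy function below (an illustrative call site only, not kernel code) normally compiles to an indirect call such as "call *%rax"; with the flags above the compiler instead emits "call __x86_indirect_thunk_rax", and the thunks themselves are supplied by arch/x86/lib/retpoline.S from this series:

	static int add_one(int x)
	{
		return x + 1;
	}

	int call_indirect(int (*op)(int), int x)
	{
		return op(x);	/* indirect call: becomes a retpoline thunk call when enabled */
	}

	int demo(void)
	{
		return call_indirect(add_one, 41);
	}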
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 16627fec80b2..3d09e3aca18d 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,6 +32,7 @@
32#include <linux/linkage.h> 32#include <linux/linkage.h>
33#include <asm/inst.h> 33#include <asm/inst.h>
34#include <asm/frame.h> 34#include <asm/frame.h>
35#include <asm/nospec-branch.h>
35 36
36/* 37/*
37 * The following macros are used to move an (un)aligned 16 byte value to/from 38 * The following macros are used to move an (un)aligned 16 byte value to/from
@@ -2884,7 +2885,7 @@ ENTRY(aesni_xts_crypt8)
2884 pxor INC, STATE4 2885 pxor INC, STATE4
2885 movdqu IV, 0x30(OUTP) 2886 movdqu IV, 0x30(OUTP)
2886 2887
2887 call *%r11 2888 CALL_NOSPEC %r11
2888 2889
2889 movdqu 0x00(OUTP), INC 2890 movdqu 0x00(OUTP), INC
2890 pxor INC, STATE1 2891 pxor INC, STATE1
@@ -2929,7 +2930,7 @@ ENTRY(aesni_xts_crypt8)
2929 _aesni_gf128mul_x_ble() 2930 _aesni_gf128mul_x_ble()
2930 movups IV, (IVP) 2931 movups IV, (IVP)
2931 2932
2932 call *%r11 2933 CALL_NOSPEC %r11
2933 2934
2934 movdqu 0x40(OUTP), INC 2935 movdqu 0x40(OUTP), INC
2935 pxor INC, STATE1 2936 pxor INC, STATE1
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index f7c495e2863c..a14af6eb09cb 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -17,6 +17,7 @@
17 17
18#include <linux/linkage.h> 18#include <linux/linkage.h>
19#include <asm/frame.h> 19#include <asm/frame.h>
20#include <asm/nospec-branch.h>
20 21
21#define CAMELLIA_TABLE_BYTE_LEN 272 22#define CAMELLIA_TABLE_BYTE_LEN 272
22 23
@@ -1227,7 +1228,7 @@ camellia_xts_crypt_16way:
1227 vpxor 14 * 16(%rax), %xmm15, %xmm14; 1228 vpxor 14 * 16(%rax), %xmm15, %xmm14;
1228 vpxor 15 * 16(%rax), %xmm15, %xmm15; 1229 vpxor 15 * 16(%rax), %xmm15, %xmm15;
1229 1230
1230 call *%r9; 1231 CALL_NOSPEC %r9;
1231 1232
1232 addq $(16 * 16), %rsp; 1233 addq $(16 * 16), %rsp;
1233 1234
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index eee5b3982cfd..b66bbfa62f50 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -12,6 +12,7 @@
12 12
13#include <linux/linkage.h> 13#include <linux/linkage.h>
14#include <asm/frame.h> 14#include <asm/frame.h>
15#include <asm/nospec-branch.h>
15 16
16#define CAMELLIA_TABLE_BYTE_LEN 272 17#define CAMELLIA_TABLE_BYTE_LEN 272
17 18
@@ -1343,7 +1344,7 @@ camellia_xts_crypt_32way:
1343 vpxor 14 * 32(%rax), %ymm15, %ymm14; 1344 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1344 vpxor 15 * 32(%rax), %ymm15, %ymm15; 1345 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1345 1346
1346 call *%r9; 1347 CALL_NOSPEC %r9;
1347 1348
1348 addq $(16 * 32), %rsp; 1349 addq $(16 * 32), %rsp;
1349 1350
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 7a7de27c6f41..d9b734d0c8cc 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -45,6 +45,7 @@
45 45
46#include <asm/inst.h> 46#include <asm/inst.h>
47#include <linux/linkage.h> 47#include <linux/linkage.h>
48#include <asm/nospec-branch.h>
48 49
49## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction 50## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
50 51
@@ -172,7 +173,7 @@ continue_block:
172 movzxw (bufp, %rax, 2), len 173 movzxw (bufp, %rax, 2), len
173 lea crc_array(%rip), bufp 174 lea crc_array(%rip), bufp
174 lea (bufp, len, 1), bufp 175 lea (bufp, len, 1), bufp
175 jmp *bufp 176 JMP_NOSPEC bufp
176 177
177 ################################################################ 178 ################################################################
178 ## 2a) PROCESS FULL BLOCKS: 179 ## 2a) PROCESS FULL BLOCKS:
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 45a63e00a6af..3f48f695d5e6 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -198,8 +198,11 @@ For 32-bit we have the following conventions - kernel is built with
198 * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two 198 * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
199 * halves: 199 * halves:
200 */ 200 */
201#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT) 201#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
202#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT)) 202#define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT)
203#define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT
204#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT)
205#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
203 206
204.macro SET_NOFLUSH_BIT reg:req 207.macro SET_NOFLUSH_BIT reg:req
205 bts $X86_CR3_PCID_NOFLUSH_BIT, \reg 208 bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
@@ -208,7 +211,7 @@ For 32-bit we have the following conventions - kernel is built with
208.macro ADJUST_KERNEL_CR3 reg:req 211.macro ADJUST_KERNEL_CR3 reg:req
209 ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID 212 ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
210 /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ 213 /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
211 andq $(~PTI_SWITCH_MASK), \reg 214 andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
212.endm 215.endm
213 216
214.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req 217.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
@@ -239,15 +242,19 @@ For 32-bit we have the following conventions - kernel is built with
239 /* Flush needed, clear the bit */ 242 /* Flush needed, clear the bit */
240 btr \scratch_reg, THIS_CPU_user_pcid_flush_mask 243 btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
241 movq \scratch_reg2, \scratch_reg 244 movq \scratch_reg2, \scratch_reg
242 jmp .Lwrcr3_\@ 245 jmp .Lwrcr3_pcid_\@
243 246
244.Lnoflush_\@: 247.Lnoflush_\@:
245 movq \scratch_reg2, \scratch_reg 248 movq \scratch_reg2, \scratch_reg
246 SET_NOFLUSH_BIT \scratch_reg 249 SET_NOFLUSH_BIT \scratch_reg
247 250
251.Lwrcr3_pcid_\@:
252 /* Flip the ASID to the user version */
253 orq $(PTI_USER_PCID_MASK), \scratch_reg
254
248.Lwrcr3_\@: 255.Lwrcr3_\@:
249 /* Flip the PGD and ASID to the user version */ 256 /* Flip the PGD to the user version */
250 orq $(PTI_SWITCH_MASK), \scratch_reg 257 orq $(PTI_USER_PGTABLE_MASK), \scratch_reg
251 mov \scratch_reg, %cr3 258 mov \scratch_reg, %cr3
252.Lend_\@: 259.Lend_\@:
253.endm 260.endm
@@ -263,17 +270,12 @@ For 32-bit we have the following conventions - kernel is built with
263 movq %cr3, \scratch_reg 270 movq %cr3, \scratch_reg
264 movq \scratch_reg, \save_reg 271 movq \scratch_reg, \save_reg
265 /* 272 /*
266 * Is the "switch mask" all zero? That means that both of 273 * Test the user pagetable bit. If set, then the user page tables
267 * these are zero: 274 * are active. If clear CR3 already has the kernel page table
268 * 275 * active.
269 * 1. The user/kernel PCID bit, and
270 * 2. The user/kernel "bit" that points CR3 to the
271 * bottom half of the 8k PGD
272 *
273 * That indicates a kernel CR3 value, not a user CR3.
274 */ 276 */
275 testq $(PTI_SWITCH_MASK), \scratch_reg 277 bt $PTI_USER_PGTABLE_BIT, \scratch_reg
276 jz .Ldone_\@ 278 jnc .Ldone_\@
277 279
278 ADJUST_KERNEL_CR3 \scratch_reg 280 ADJUST_KERNEL_CR3 \scratch_reg
279 movq \scratch_reg, %cr3 281 movq \scratch_reg, %cr3
@@ -290,7 +292,7 @@ For 32-bit we have the following conventions - kernel is built with
290 * KERNEL pages can always resume with NOFLUSH as we do 292 * KERNEL pages can always resume with NOFLUSH as we do
291 * explicit flushes. 293 * explicit flushes.
292 */ 294 */
293 bt $X86_CR3_PTI_SWITCH_BIT, \save_reg 295 bt $PTI_USER_PGTABLE_BIT, \save_reg
294 jnc .Lnoflush_\@ 296 jnc .Lnoflush_\@
295 297
296 /* 298 /*
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index ace8f321a5a1..a1f28a54f23a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -44,6 +44,7 @@
44#include <asm/asm.h> 44#include <asm/asm.h>
45#include <asm/smap.h> 45#include <asm/smap.h>
46#include <asm/frame.h> 46#include <asm/frame.h>
47#include <asm/nospec-branch.h>
47 48
48 .section .entry.text, "ax" 49 .section .entry.text, "ax"
49 50
@@ -290,7 +291,7 @@ ENTRY(ret_from_fork)
290 291
291 /* kernel thread */ 292 /* kernel thread */
2921: movl %edi, %eax 2931: movl %edi, %eax
293 call *%ebx 294 CALL_NOSPEC %ebx
294 /* 295 /*
295 * A kernel thread is allowed to return here after successfully 296 * A kernel thread is allowed to return here after successfully
296 * calling do_execve(). Exit to userspace to complete the execve() 297 * calling do_execve(). Exit to userspace to complete the execve()
@@ -919,7 +920,7 @@ common_exception:
919 movl %ecx, %es 920 movl %ecx, %es
920 TRACE_IRQS_OFF 921 TRACE_IRQS_OFF
921 movl %esp, %eax # pt_regs pointer 922 movl %esp, %eax # pt_regs pointer
922 call *%edi 923 CALL_NOSPEC %edi
923 jmp ret_from_exception 924 jmp ret_from_exception
924END(common_exception) 925END(common_exception)
925 926
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f048e384ff54..4f8e1d35a97c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -37,6 +37,7 @@
37#include <asm/pgtable_types.h> 37#include <asm/pgtable_types.h>
38#include <asm/export.h> 38#include <asm/export.h>
39#include <asm/frame.h> 39#include <asm/frame.h>
40#include <asm/nospec-branch.h>
40#include <linux/err.h> 41#include <linux/err.h>
41 42
42#include "calling.h" 43#include "calling.h"
@@ -191,7 +192,7 @@ ENTRY(entry_SYSCALL_64_trampoline)
191 */ 192 */
192 pushq %rdi 193 pushq %rdi
193 movq $entry_SYSCALL_64_stage2, %rdi 194 movq $entry_SYSCALL_64_stage2, %rdi
194 jmp *%rdi 195 JMP_NOSPEC %rdi
195END(entry_SYSCALL_64_trampoline) 196END(entry_SYSCALL_64_trampoline)
196 197
197 .popsection 198 .popsection
@@ -270,7 +271,12 @@ entry_SYSCALL_64_fastpath:
270 * It might end up jumping to the slow path. If it jumps, RAX 271 * It might end up jumping to the slow path. If it jumps, RAX
271 * and all argument registers are clobbered. 272 * and all argument registers are clobbered.
272 */ 273 */
274#ifdef CONFIG_RETPOLINE
275 movq sys_call_table(, %rax, 8), %rax
276 call __x86_indirect_thunk_rax
277#else
273 call *sys_call_table(, %rax, 8) 278 call *sys_call_table(, %rax, 8)
279#endif
274.Lentry_SYSCALL_64_after_fastpath_call: 280.Lentry_SYSCALL_64_after_fastpath_call:
275 281
276 movq %rax, RAX(%rsp) 282 movq %rax, RAX(%rsp)
@@ -442,7 +448,7 @@ ENTRY(stub_ptregs_64)
442 jmp entry_SYSCALL64_slow_path 448 jmp entry_SYSCALL64_slow_path
443 449
4441: 4501:
445 jmp *%rax /* Called from C */ 451 JMP_NOSPEC %rax /* Called from C */
446END(stub_ptregs_64) 452END(stub_ptregs_64)
447 453
448.macro ptregs_stub func 454.macro ptregs_stub func
@@ -521,7 +527,7 @@ ENTRY(ret_from_fork)
5211: 5271:
522 /* kernel thread */ 528 /* kernel thread */
523 movq %r12, %rdi 529 movq %r12, %rdi
524 call *%rbx 530 CALL_NOSPEC %rbx
525 /* 531 /*
526 * A kernel thread is allowed to return here after successfully 532 * A kernel thread is allowed to return here after successfully
527 * calling do_execve(). Exit to userspace to complete the execve() 533 * calling do_execve(). Exit to userspace to complete the execve()
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 141e07b06216..24ffa1e88cf9 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -582,6 +582,24 @@ static __init int bts_init(void)
582 if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) 582 if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
583 return -ENODEV; 583 return -ENODEV;
584 584
585 if (boot_cpu_has(X86_FEATURE_PTI)) {
586 /*
587 * BTS hardware writes through a virtual memory map we must
588 * either use the kernel physical map, or the user mapping of
589 * the AUX buffer.
590 *
591 * However, since this driver supports per-CPU and per-task inherit
 592 * we cannot use the user mapping since it will not be available
593 * if we're not running the owning process.
594 *
 595 * With PTI we can't use the kernel map either, because it's not
596 * there when we run userspace.
597 *
598 * For now, disable this driver when using PTI.
599 */
600 return -ENODEV;
601 }
602
585 bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | 603 bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
586 PERF_PMU_CAP_EXCLUSIVE; 604 PERF_PMU_CAP_EXCLUSIVE;
587 bts_pmu.task_ctx_nr = perf_sw_context; 605 bts_pmu.task_ctx_nr = perf_sw_context;
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index ff700d81e91e..0927cdc4f946 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -11,7 +11,32 @@
11#include <asm/pgtable.h> 11#include <asm/pgtable.h>
12#include <asm/special_insns.h> 12#include <asm/special_insns.h>
13#include <asm/preempt.h> 13#include <asm/preempt.h>
14#include <asm/asm.h>
14 15
15#ifndef CONFIG_X86_CMPXCHG64 16#ifndef CONFIG_X86_CMPXCHG64
16extern void cmpxchg8b_emu(void); 17extern void cmpxchg8b_emu(void);
17#endif 18#endif
19
20#ifdef CONFIG_RETPOLINE
21#ifdef CONFIG_X86_32
22#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void);
23#else
24#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void);
25INDIRECT_THUNK(8)
26INDIRECT_THUNK(9)
27INDIRECT_THUNK(10)
28INDIRECT_THUNK(11)
29INDIRECT_THUNK(12)
30INDIRECT_THUNK(13)
31INDIRECT_THUNK(14)
32INDIRECT_THUNK(15)
33#endif
34INDIRECT_THUNK(ax)
35INDIRECT_THUNK(bx)
36INDIRECT_THUNK(cx)
37INDIRECT_THUNK(dx)
38INDIRECT_THUNK(si)
39INDIRECT_THUNK(di)
40INDIRECT_THUNK(bp)
41INDIRECT_THUNK(sp)
42#endif /* CONFIG_RETPOLINE */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 21ac898df2d8..f275447862f4 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -203,6 +203,8 @@
203#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ 203#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
204#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ 204#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
205#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ 205#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
206#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
207#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */
206#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ 208#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
207#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ 209#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
208#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ 210#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -342,5 +344,7 @@
342#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ 344#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
343#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ 345#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
344#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ 346#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
347#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
348#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
345 349
346#endif /* _ASM_X86_CPUFEATURES_H */ 350#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 5400add2885b..8bf450b13d9f 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -7,6 +7,7 @@
7#include <linux/nmi.h> 7#include <linux/nmi.h>
8#include <asm/io.h> 8#include <asm/io.h>
9#include <asm/hyperv.h> 9#include <asm/hyperv.h>
10#include <asm/nospec-branch.h>
10 11
11/* 12/*
12 * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent 13 * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
@@ -186,10 +187,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
186 return U64_MAX; 187 return U64_MAX;
187 188
188 __asm__ __volatile__("mov %4, %%r8\n" 189 __asm__ __volatile__("mov %4, %%r8\n"
189 "call *%5" 190 CALL_NOSPEC
190 : "=a" (hv_status), ASM_CALL_CONSTRAINT, 191 : "=a" (hv_status), ASM_CALL_CONSTRAINT,
191 "+c" (control), "+d" (input_address) 192 "+c" (control), "+d" (input_address)
192 : "r" (output_address), "m" (hv_hypercall_pg) 193 : "r" (output_address),
194 THUNK_TARGET(hv_hypercall_pg)
193 : "cc", "memory", "r8", "r9", "r10", "r11"); 195 : "cc", "memory", "r8", "r9", "r10", "r11");
194#else 196#else
195 u32 input_address_hi = upper_32_bits(input_address); 197 u32 input_address_hi = upper_32_bits(input_address);
@@ -200,13 +202,13 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
200 if (!hv_hypercall_pg) 202 if (!hv_hypercall_pg)
201 return U64_MAX; 203 return U64_MAX;
202 204
203 __asm__ __volatile__("call *%7" 205 __asm__ __volatile__(CALL_NOSPEC
204 : "=A" (hv_status), 206 : "=A" (hv_status),
205 "+c" (input_address_lo), ASM_CALL_CONSTRAINT 207 "+c" (input_address_lo), ASM_CALL_CONSTRAINT
206 : "A" (control), 208 : "A" (control),
207 "b" (input_address_hi), 209 "b" (input_address_hi),
208 "D"(output_address_hi), "S"(output_address_lo), 210 "D"(output_address_hi), "S"(output_address_lo),
209 "m" (hv_hypercall_pg) 211 THUNK_TARGET(hv_hypercall_pg)
210 : "cc", "memory"); 212 : "cc", "memory");
211#endif /* !x86_64 */ 213#endif /* !x86_64 */
212 return hv_status; 214 return hv_status;
@@ -227,10 +229,10 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
227 229
228#ifdef CONFIG_X86_64 230#ifdef CONFIG_X86_64
229 { 231 {
230 __asm__ __volatile__("call *%4" 232 __asm__ __volatile__(CALL_NOSPEC
231 : "=a" (hv_status), ASM_CALL_CONSTRAINT, 233 : "=a" (hv_status), ASM_CALL_CONSTRAINT,
232 "+c" (control), "+d" (input1) 234 "+c" (control), "+d" (input1)
233 : "m" (hv_hypercall_pg) 235 : THUNK_TARGET(hv_hypercall_pg)
234 : "cc", "r8", "r9", "r10", "r11"); 236 : "cc", "r8", "r9", "r10", "r11");
235 } 237 }
236#else 238#else
@@ -238,13 +240,13 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
238 u32 input1_hi = upper_32_bits(input1); 240 u32 input1_hi = upper_32_bits(input1);
239 u32 input1_lo = lower_32_bits(input1); 241 u32 input1_lo = lower_32_bits(input1);
240 242
241 __asm__ __volatile__ ("call *%5" 243 __asm__ __volatile__ (CALL_NOSPEC
242 : "=A"(hv_status), 244 : "=A"(hv_status),
243 "+c"(input1_lo), 245 "+c"(input1_lo),
244 ASM_CALL_CONSTRAINT 246 ASM_CALL_CONSTRAINT
245 : "A" (control), 247 : "A" (control),
246 "b" (input1_hi), 248 "b" (input1_hi),
247 "m" (hv_hypercall_pg) 249 THUNK_TARGET(hv_hypercall_pg)
248 : "cc", "edi", "esi"); 250 : "cc", "edi", "esi");
249 } 251 }
250#endif 252#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 34c4922bbc3f..e7b983a35506 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -355,6 +355,9 @@
355#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL 355#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
356#define FAM10H_MMIO_CONF_BASE_SHIFT 20 356#define FAM10H_MMIO_CONF_BASE_SHIFT 20
357#define MSR_FAM10H_NODE_ID 0xc001100c 357#define MSR_FAM10H_NODE_ID 0xc001100c
358#define MSR_F10H_DECFG 0xc0011029
359#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1
360#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT)
358 361
359/* K8 MSRs */ 362/* K8 MSRs */
360#define MSR_K8_TOP_MEM1 0xc001001a 363#define MSR_K8_TOP_MEM1 0xc001001a
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
new file mode 100644
index 000000000000..402a11c803c3
--- /dev/null
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -0,0 +1,214 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2
3#ifndef __NOSPEC_BRANCH_H__
4#define __NOSPEC_BRANCH_H__
5
6#include <asm/alternative.h>
7#include <asm/alternative-asm.h>
8#include <asm/cpufeatures.h>
9
10/*
11 * Fill the CPU return stack buffer.
12 *
13 * Each entry in the RSB, if used for a speculative 'ret', contains an
14 * infinite 'pause; jmp' loop to capture speculative execution.
15 *
16 * This is required in various cases for retpoline and IBRS-based
17 * mitigations for the Spectre variant 2 vulnerability. Sometimes to
18 * eliminate potentially bogus entries from the RSB, and sometimes
19 * purely to ensure that it doesn't get empty, which on some CPUs would
20 * allow predictions from other (unwanted!) sources to be used.
21 *
22 * We define a CPP macro such that it can be used from both .S files and
23 * inline assembly. It's possible to do a .macro and then include that
24 * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
25 */
26
27#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
28#define RSB_FILL_LOOPS 16 /* To avoid underflow */
29
30/*
31 * Google experimented with loop-unrolling and this turned out to be
32 * the optimal version — two calls, each with their own speculation
33 * trap should their return address end up getting used, in a loop.
34 */
35#define __FILL_RETURN_BUFFER(reg, nr, sp) \
36 mov $(nr/2), reg; \
37771: \
38 call 772f; \
39773: /* speculation trap */ \
40 pause; \
41 jmp 773b; \
42772: \
43 call 774f; \
44775: /* speculation trap */ \
45 pause; \
46 jmp 775b; \
47774: \
48 dec reg; \
49 jnz 771b; \
50 add $(BITS_PER_LONG/8) * nr, sp;
51
52#ifdef __ASSEMBLY__
53
54/*
55 * This should be used immediately before a retpoline alternative. It tells
56 * objtool where the retpolines are so that it can make sense of the control
57 * flow by just reading the original instruction(s) and ignoring the
58 * alternatives.
59 */
60.macro ANNOTATE_NOSPEC_ALTERNATIVE
61 .Lannotate_\@:
62 .pushsection .discard.nospec
63 .long .Lannotate_\@ - .
64 .popsection
65.endm
66
67/*
68 * These are the bare retpoline primitives for indirect jmp and call.
69 * Do not use these directly; they only exist to make the ALTERNATIVE
70 * invocation below less ugly.
71 */
72.macro RETPOLINE_JMP reg:req
73 call .Ldo_rop_\@
74.Lspec_trap_\@:
75 pause
76 jmp .Lspec_trap_\@
77.Ldo_rop_\@:
78 mov \reg, (%_ASM_SP)
79 ret
80.endm
81
82/*
83 * This is a wrapper around RETPOLINE_JMP so the called function in reg
84 * returns to the instruction after the macro.
85 */
86.macro RETPOLINE_CALL reg:req
87 jmp .Ldo_call_\@
88.Ldo_retpoline_jmp_\@:
89 RETPOLINE_JMP \reg
90.Ldo_call_\@:
91 call .Ldo_retpoline_jmp_\@
92.endm
93
94/*
95 * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
96 * indirect jmp/call which may be susceptible to the Spectre variant 2
97 * attack.
98 */
99.macro JMP_NOSPEC reg:req
100#ifdef CONFIG_RETPOLINE
101 ANNOTATE_NOSPEC_ALTERNATIVE
102 ALTERNATIVE_2 __stringify(jmp *\reg), \
103 __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \
104 __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
105#else
106 jmp *\reg
107#endif
108.endm
109
110.macro CALL_NOSPEC reg:req
111#ifdef CONFIG_RETPOLINE
112 ANNOTATE_NOSPEC_ALTERNATIVE
113 ALTERNATIVE_2 __stringify(call *\reg), \
114 __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\
115 __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD
116#else
117 call *\reg
118#endif
119.endm
120
121 /*
122 * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
123 * monstrosity above, manually.
124 */
125.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
126#ifdef CONFIG_RETPOLINE
127 ANNOTATE_NOSPEC_ALTERNATIVE
128 ALTERNATIVE "jmp .Lskip_rsb_\@", \
129 __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \
130 \ftr
131.Lskip_rsb_\@:
132#endif
133.endm
134
135#else /* __ASSEMBLY__ */
136
137#define ANNOTATE_NOSPEC_ALTERNATIVE \
138 "999:\n\t" \
139 ".pushsection .discard.nospec\n\t" \
140 ".long 999b - .\n\t" \
141 ".popsection\n\t"
142
143#if defined(CONFIG_X86_64) && defined(RETPOLINE)
144
145/*
146 * Since the inline asm uses the %V modifier which is only in newer GCC,
147 * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE.
148 */
149# define CALL_NOSPEC \
150 ANNOTATE_NOSPEC_ALTERNATIVE \
151 ALTERNATIVE( \
152 "call *%[thunk_target]\n", \
153 "call __x86_indirect_thunk_%V[thunk_target]\n", \
154 X86_FEATURE_RETPOLINE)
155# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
156
157#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE)
158/*
159 * For i386 we use the original ret-equivalent retpoline, because
160 * otherwise we'll run out of registers. We don't care about CET
161 * here, anyway.
162 */
163# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \
164 " jmp 904f;\n" \
165 " .align 16\n" \
166 "901: call 903f;\n" \
167 "902: pause;\n" \
168 " jmp 902b;\n" \
169 " .align 16\n" \
170 "903: addl $4, %%esp;\n" \
171 " pushl %[thunk_target];\n" \
172 " ret;\n" \
173 " .align 16\n" \
174 "904: call 901b;\n", \
175 X86_FEATURE_RETPOLINE)
176
177# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
178#else /* No retpoline for C / inline asm */
179# define CALL_NOSPEC "call *%[thunk_target]\n"
180# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
181#endif
182
183/* The Spectre V2 mitigation variants */
184enum spectre_v2_mitigation {
185 SPECTRE_V2_NONE,
186 SPECTRE_V2_RETPOLINE_MINIMAL,
187 SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
188 SPECTRE_V2_RETPOLINE_GENERIC,
189 SPECTRE_V2_RETPOLINE_AMD,
190 SPECTRE_V2_IBRS,
191};
192
193/*
194 * On VMEXIT we must ensure that no RSB predictions learned in the guest
195 * can be followed in the host, by overwriting the RSB completely. Both
196 * retpoline and IBRS mitigations for Spectre v2 need this; only on future
197 * CPUs with IBRS_ATT *might* it be avoided.
198 */
199static inline void vmexit_fill_RSB(void)
200{
201#ifdef CONFIG_RETPOLINE
202 unsigned long loops = RSB_CLEAR_LOOPS / 2;
203
204 asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
205 ALTERNATIVE("jmp 910f",
206 __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
207 X86_FEATURE_RETPOLINE)
208 "910:"
209 : "=&r" (loops), ASM_CALL_CONSTRAINT
210 : "r" (loops) : "memory" );
211#endif
212}
213#endif /* __ASSEMBLY__ */
214#endif /* __NOSPEC_BRANCH_H__ */
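
For C code on x86-64, the CALL_NOSPEC/THUNK_TARGET pair defined above replaces an open-coded "call *%[target]" in inline assembly, which is exactly what the mshyperv.h and Xen hypercall changes in this merge do. A kernel-context sketch of the pattern (the function and pointer names here are hypothetical, not part of the series):

	#include <asm/asm.h>
	#include <asm/nospec-branch.h>

	extern void *my_indirect_target;	/* hypothetical code pointer */

	static inline unsigned long call_target_nospec(unsigned long arg)
	{
		unsigned long ret;

		/* Indirect call through the retpoline thunk when X86_FEATURE_RETPOLINE is set. */
		asm volatile(CALL_NOSPEC
			     : "=a" (ret), "+D" (arg), ASM_CALL_CONSTRAINT
			     : THUNK_TARGET(my_indirect_target)
			     : "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11",
			       "cc", "memory");
		return ret;
	}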
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 6a60fea90b9d..625a52a5594f 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -40,7 +40,7 @@
40#define CR3_NOFLUSH BIT_ULL(63) 40#define CR3_NOFLUSH BIT_ULL(63)
41 41
42#ifdef CONFIG_PAGE_TABLE_ISOLATION 42#ifdef CONFIG_PAGE_TABLE_ISOLATION
43# define X86_CR3_PTI_SWITCH_BIT 11 43# define X86_CR3_PTI_PCID_USER_BIT 11
44#endif 44#endif
45 45
46#else 46#else
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4a08dd2ab32a..d33e4a26dc7e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -81,13 +81,13 @@ static inline u16 kern_pcid(u16 asid)
 81 * Make sure that the dynamic ASID space does not conflict with the 81 * Make sure that the dynamic ASID space does not conflict with the
82 * bit we are using to switch between user and kernel ASIDs. 82 * bit we are using to switch between user and kernel ASIDs.
83 */ 83 */
84 BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT)); 84 BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
85 85
86 /* 86 /*
87 * The ASID being passed in here should have respected the 87 * The ASID being passed in here should have respected the
88 * MAX_ASID_AVAILABLE and thus never have the switch bit set. 88 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
89 */ 89 */
90 VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); 90 VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
91#endif 91#endif
92 /* 92 /*
93 * The dynamically-assigned ASIDs that get passed in are small 93 * The dynamically-assigned ASIDs that get passed in are small
@@ -112,7 +112,7 @@ static inline u16 user_pcid(u16 asid)
112{ 112{
113 u16 ret = kern_pcid(asid); 113 u16 ret = kern_pcid(asid);
114#ifdef CONFIG_PAGE_TABLE_ISOLATION 114#ifdef CONFIG_PAGE_TABLE_ISOLATION
115 ret |= 1 << X86_CR3_PTI_SWITCH_BIT; 115 ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
116#endif 116#endif
117 return ret; 117 return ret;
118} 118}
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 7cb282e9e587..bfd882617613 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -44,6 +44,7 @@
44#include <asm/page.h> 44#include <asm/page.h>
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/smap.h> 46#include <asm/smap.h>
47#include <asm/nospec-branch.h>
47 48
48#include <xen/interface/xen.h> 49#include <xen/interface/xen.h>
49#include <xen/interface/sched.h> 50#include <xen/interface/sched.h>
@@ -217,9 +218,9 @@ privcmd_call(unsigned call,
217 __HYPERCALL_5ARG(a1, a2, a3, a4, a5); 218 __HYPERCALL_5ARG(a1, a2, a3, a4, a5);
218 219
219 stac(); 220 stac();
220 asm volatile("call *%[call]" 221 asm volatile(CALL_NOSPEC
221 : __HYPERCALL_5PARAM 222 : __HYPERCALL_5PARAM
222 : [call] "a" (&hypercall_page[call]) 223 : [thunk_target] "a" (&hypercall_page[call])
223 : __HYPERCALL_CLOBBER5); 224 : __HYPERCALL_CLOBBER5);
224 clac(); 225 clac();
225 226
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index dbaf14d69ebd..4817d743c263 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -344,9 +344,12 @@ done:
344static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) 344static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
345{ 345{
346 unsigned long flags; 346 unsigned long flags;
347 int i;
347 348
348 if (instr[0] != 0x90) 349 for (i = 0; i < a->padlen; i++) {
349 return; 350 if (instr[i] != 0x90)
351 return;
352 }
350 353
351 local_irq_save(flags); 354 local_irq_save(flags);
352 add_nops(instr + (a->instrlen - a->padlen), a->padlen); 355 add_nops(instr + (a->instrlen - a->padlen), a->padlen);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bcb75dc97d44..ea831c858195 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -829,8 +829,32 @@ static void init_amd(struct cpuinfo_x86 *c)
829 set_cpu_cap(c, X86_FEATURE_K8); 829 set_cpu_cap(c, X86_FEATURE_K8);
830 830
831 if (cpu_has(c, X86_FEATURE_XMM2)) { 831 if (cpu_has(c, X86_FEATURE_XMM2)) {
832 /* MFENCE stops RDTSC speculation */ 832 unsigned long long val;
833 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); 833 int ret;
834
835 /*
836 * A serializing LFENCE has less overhead than MFENCE, so
837 * use it for execution serialization. On families which
838 * don't have that MSR, LFENCE is already serializing.
839 * msr_set_bit() uses the safe accessors, too, even if the MSR
840 * is not present.
841 */
842 msr_set_bit(MSR_F10H_DECFG,
843 MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
844
845 /*
846 * Verify that the MSR write was successful (could be running
847 * under a hypervisor) and only then assume that LFENCE is
848 * serializing.
849 */
850 ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
851 if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
852 /* A serializing LFENCE stops RDTSC speculation */
853 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
854 } else {
855 /* MFENCE stops RDTSC speculation */
856 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
857 }
834 } 858 }
835 859
836 /* 860 /*
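
Whether the LFENCE-serializing bit actually stuck can also be checked from userspace through the standard /dev/cpu/N/msr interface (requires root and the msr module loaded); a pread() at the MSR address returns its value. A small illustrative reader for MSR 0xc0011029 (MSR_F10H_DECFG):

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		uint64_t val;
		int fd = open("/dev/cpu/0/msr", O_RDONLY);

		/* pread() at the MSR number reads that MSR on CPU 0. */
		if (fd < 0 || pread(fd, &val, sizeof(val), 0xc0011029) != sizeof(val)) {
			perror("MSR_F10H_DECFG");
			return 1;
		}
		printf("DECFG=%#llx, LFENCE serializing: %s\n",
		       (unsigned long long)val,
		       (val & 0x2) ? "yes" : "no");	/* bit 1 = LFENCE_SERIALIZE */
		return 0;
	}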
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ba0b2424c9b0..e4dc26185aa7 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -10,6 +10,10 @@
10 */ 10 */
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/utsname.h> 12#include <linux/utsname.h>
13#include <linux/cpu.h>
14
15#include <asm/nospec-branch.h>
16#include <asm/cmdline.h>
13#include <asm/bugs.h> 17#include <asm/bugs.h>
14#include <asm/processor.h> 18#include <asm/processor.h>
15#include <asm/processor-flags.h> 19#include <asm/processor-flags.h>
@@ -20,6 +24,8 @@
20#include <asm/pgtable.h> 24#include <asm/pgtable.h>
21#include <asm/set_memory.h> 25#include <asm/set_memory.h>
22 26
27static void __init spectre_v2_select_mitigation(void);
28
23void __init check_bugs(void) 29void __init check_bugs(void)
24{ 30{
25 identify_boot_cpu(); 31 identify_boot_cpu();
@@ -29,6 +35,9 @@ void __init check_bugs(void)
29 print_cpu_info(&boot_cpu_data); 35 print_cpu_info(&boot_cpu_data);
30 } 36 }
31 37
38 /* Select the proper spectre mitigation before patching alternatives */
39 spectre_v2_select_mitigation();
40
32#ifdef CONFIG_X86_32 41#ifdef CONFIG_X86_32
33 /* 42 /*
34 * Check whether we are able to run this kernel safely on SMP. 43 * Check whether we are able to run this kernel safely on SMP.
@@ -60,3 +69,179 @@ void __init check_bugs(void)
60 set_memory_4k((unsigned long)__va(0), 1); 69 set_memory_4k((unsigned long)__va(0), 1);
61#endif 70#endif
62} 71}
72
73/* The kernel command line selection */
74enum spectre_v2_mitigation_cmd {
75 SPECTRE_V2_CMD_NONE,
76 SPECTRE_V2_CMD_AUTO,
77 SPECTRE_V2_CMD_FORCE,
78 SPECTRE_V2_CMD_RETPOLINE,
79 SPECTRE_V2_CMD_RETPOLINE_GENERIC,
80 SPECTRE_V2_CMD_RETPOLINE_AMD,
81};
82
83static const char *spectre_v2_strings[] = {
84 [SPECTRE_V2_NONE] = "Vulnerable",
85 [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
86 [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
87 [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
88 [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
89};
90
91#undef pr_fmt
92#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt
93
94static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
95
96static void __init spec2_print_if_insecure(const char *reason)
97{
98 if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
99 pr_info("%s\n", reason);
100}
101
102static void __init spec2_print_if_secure(const char *reason)
103{
104 if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
105 pr_info("%s\n", reason);
106}
107
108static inline bool retp_compiler(void)
109{
110 return __is_defined(RETPOLINE);
111}
112
113static inline bool match_option(const char *arg, int arglen, const char *opt)
114{
115 int len = strlen(opt);
116
117 return len == arglen && !strncmp(arg, opt, len);
118}
119
120static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
121{
122 char arg[20];
123 int ret;
124
125 ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
126 sizeof(arg));
127 if (ret > 0) {
128 if (match_option(arg, ret, "off")) {
129 goto disable;
130 } else if (match_option(arg, ret, "on")) {
131 spec2_print_if_secure("force enabled on command line.");
132 return SPECTRE_V2_CMD_FORCE;
133 } else if (match_option(arg, ret, "retpoline")) {
134 spec2_print_if_insecure("retpoline selected on command line.");
135 return SPECTRE_V2_CMD_RETPOLINE;
136 } else if (match_option(arg, ret, "retpoline,amd")) {
137 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
138 pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
139 return SPECTRE_V2_CMD_AUTO;
140 }
141 spec2_print_if_insecure("AMD retpoline selected on command line.");
142 return SPECTRE_V2_CMD_RETPOLINE_AMD;
143 } else if (match_option(arg, ret, "retpoline,generic")) {
144 spec2_print_if_insecure("generic retpoline selected on command line.");
145 return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
146 } else if (match_option(arg, ret, "auto")) {
147 return SPECTRE_V2_CMD_AUTO;
148 }
149 }
150
151 if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
152 return SPECTRE_V2_CMD_AUTO;
153disable:
154 spec2_print_if_insecure("disabled on command line.");
155 return SPECTRE_V2_CMD_NONE;
156}
157
158static void __init spectre_v2_select_mitigation(void)
159{
160 enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
161 enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
162
163 /*
164 * If the CPU is not affected and the command line mode is NONE or AUTO
165 * then nothing to do.
166 */
167 if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
168 (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO))
169 return;
170
171 switch (cmd) {
172 case SPECTRE_V2_CMD_NONE:
173 return;
174
175 case SPECTRE_V2_CMD_FORCE:
 176		/* FALLTHRU */
177 case SPECTRE_V2_CMD_AUTO:
178 goto retpoline_auto;
179
180 case SPECTRE_V2_CMD_RETPOLINE_AMD:
181 if (IS_ENABLED(CONFIG_RETPOLINE))
182 goto retpoline_amd;
183 break;
184 case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
185 if (IS_ENABLED(CONFIG_RETPOLINE))
186 goto retpoline_generic;
187 break;
188 case SPECTRE_V2_CMD_RETPOLINE:
189 if (IS_ENABLED(CONFIG_RETPOLINE))
190 goto retpoline_auto;
191 break;
192 }
 193	pr_err("kernel not compiled with retpoline; no mitigation available!\n");
194 return;
195
196retpoline_auto:
197 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
198 retpoline_amd:
199 if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
200 pr_err("LFENCE not serializing. Switching to generic retpoline\n");
201 goto retpoline_generic;
202 }
203 mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD :
204 SPECTRE_V2_RETPOLINE_MINIMAL_AMD;
205 setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
206 setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
207 } else {
208 retpoline_generic:
209 mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC :
210 SPECTRE_V2_RETPOLINE_MINIMAL;
211 setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
212 }
213
214 spectre_v2_enabled = mode;
215 pr_info("%s\n", spectre_v2_strings[mode]);
216}
217
218#undef pr_fmt
219
220#ifdef CONFIG_SYSFS
221ssize_t cpu_show_meltdown(struct device *dev,
222 struct device_attribute *attr, char *buf)
223{
224 if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
225 return sprintf(buf, "Not affected\n");
226 if (boot_cpu_has(X86_FEATURE_PTI))
227 return sprintf(buf, "Mitigation: PTI\n");
228 return sprintf(buf, "Vulnerable\n");
229}
230
231ssize_t cpu_show_spectre_v1(struct device *dev,
232 struct device_attribute *attr, char *buf)
233{
234 if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
235 return sprintf(buf, "Not affected\n");
236 return sprintf(buf, "Vulnerable\n");
237}
238
239ssize_t cpu_show_spectre_v2(struct device *dev,
240 struct device_attribute *attr, char *buf)
241{
242 if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
243 return sprintf(buf, "Not affected\n");
244
245 return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]);
246}
247#endif
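
The spectre_v2= handling above commits to a mitigation mode only after match_option() has compared the argument against each keyword by exact length, so "retpoline" cannot accidentally match "retpoline,amd". A minimal standalone sketch of that matching logic (a hypothetical userspace harness for illustration, not kernel code):

    #include <stdio.h>
    #include <string.h>

    /* Same idea as match_option() in bugs.c: the argument must match the
     * option exactly, including its length. */
    static int match_option(const char *arg, int arglen, const char *opt)
    {
            int len = strlen(opt);

            return len == arglen && !strncmp(arg, opt, len);
    }

    int main(void)
    {
            const char *arg = "retpoline,amd";
            int arglen = strlen(arg);

            printf("matches \"retpoline\":     %d\n", match_option(arg, arglen, "retpoline"));
            printf("matches \"retpoline,amd\": %d\n", match_option(arg, arglen, "retpoline,amd"));
            return 0;
    }
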
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 39d7ea865207..ef29ad001991 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -926,6 +926,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
926 if (c->x86_vendor != X86_VENDOR_AMD) 926 if (c->x86_vendor != X86_VENDOR_AMD)
927 setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); 927 setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
928 928
929 setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
930 setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
931
929 fpu__init_system(c); 932 fpu__init_system(c);
930 933
931#ifdef CONFIG_X86_32 934#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index b6c6468e10bc..4c8440de3355 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -8,6 +8,7 @@
8#include <asm/segment.h> 8#include <asm/segment.h>
9#include <asm/export.h> 9#include <asm/export.h>
10#include <asm/ftrace.h> 10#include <asm/ftrace.h>
11#include <asm/nospec-branch.h>
11 12
12#ifdef CC_USING_FENTRY 13#ifdef CC_USING_FENTRY
13# define function_hook __fentry__ 14# define function_hook __fentry__
@@ -197,7 +198,8 @@ ftrace_stub:
197 movl 0x4(%ebp), %edx 198 movl 0x4(%ebp), %edx
198 subl $MCOUNT_INSN_SIZE, %eax 199 subl $MCOUNT_INSN_SIZE, %eax
199 200
200 call *ftrace_trace_function 201 movl ftrace_trace_function, %ecx
202 CALL_NOSPEC %ecx
201 203
202 popl %edx 204 popl %edx
203 popl %ecx 205 popl %ecx
@@ -241,5 +243,5 @@ return_to_handler:
241 movl %eax, %ecx 243 movl %eax, %ecx
242 popl %edx 244 popl %edx
243 popl %eax 245 popl %eax
244 jmp *%ecx 246 JMP_NOSPEC %ecx
245#endif 247#endif
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index c832291d948a..7cb8ba08beb9 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -7,7 +7,7 @@
7#include <asm/ptrace.h> 7#include <asm/ptrace.h>
8#include <asm/ftrace.h> 8#include <asm/ftrace.h>
9#include <asm/export.h> 9#include <asm/export.h>
10 10#include <asm/nospec-branch.h>
11 11
12 .code64 12 .code64
13 .section .entry.text, "ax" 13 .section .entry.text, "ax"
@@ -286,8 +286,8 @@ trace:
286 * ip and parent ip are used and the list function is called when 286 * ip and parent ip are used and the list function is called when
287 * function tracing is enabled. 287 * function tracing is enabled.
288 */ 288 */
289 call *ftrace_trace_function 289 movq ftrace_trace_function, %r8
290 290 CALL_NOSPEC %r8
291 restore_mcount_regs 291 restore_mcount_regs
292 292
293 jmp fgraph_trace 293 jmp fgraph_trace
@@ -329,5 +329,5 @@ GLOBAL(return_to_handler)
329 movq 8(%rsp), %rdx 329 movq 8(%rsp), %rdx
330 movq (%rsp), %rax 330 movq (%rsp), %rax
331 addq $24, %rsp 331 addq $24, %rsp
332 jmp *%rdi 332 JMP_NOSPEC %rdi
333#endif 333#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a83b3346a0e1..c1bdbd3d3232 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -20,6 +20,7 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/nospec-branch.h>
23 24
24#ifdef CONFIG_DEBUG_STACKOVERFLOW 25#ifdef CONFIG_DEBUG_STACKOVERFLOW
25 26
@@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
55static void call_on_stack(void *func, void *stack) 56static void call_on_stack(void *func, void *stack)
56{ 57{
57 asm volatile("xchgl %%ebx,%%esp \n" 58 asm volatile("xchgl %%ebx,%%esp \n"
58 "call *%%edi \n" 59 CALL_NOSPEC
59 "movl %%ebx,%%esp \n" 60 "movl %%ebx,%%esp \n"
60 : "=b" (stack) 61 : "=b" (stack)
61 : "0" (stack), 62 : "0" (stack),
62 "D"(func) 63 [thunk_target] "D"(func)
63 : "memory", "cc", "edx", "ecx", "eax"); 64 : "memory", "cc", "edx", "ecx", "eax");
64} 65}
65 66
@@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
95 call_on_stack(print_stack_overflow, isp); 96 call_on_stack(print_stack_overflow, isp);
96 97
97 asm volatile("xchgl %%ebx,%%esp \n" 98 asm volatile("xchgl %%ebx,%%esp \n"
98 "call *%%edi \n" 99 CALL_NOSPEC
99 "movl %%ebx,%%esp \n" 100 "movl %%ebx,%%esp \n"
100 : "=a" (arg1), "=b" (isp) 101 : "=a" (arg1), "=b" (isp)
101 : "0" (desc), "1" (isp), 102 : "0" (desc), "1" (isp),
102 "D" (desc->handle_irq) 103 [thunk_target] "D" (desc->handle_irq)
103 : "memory", "cc", "ecx"); 104 : "memory", "cc", "ecx");
104 return 1; 105 return 1;
105} 106}
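
The two conversions above replace the open-coded "call *%%edi" with CALL_NOSPEC and pass the target through a named [thunk_target] operand, which lets the same asm template resolve the operand whether the alternative expands to a plain indirect call or to a retpoline thunk. A minimal sketch of GNU C named asm operands (illustration only, not the kernel's CALL_NOSPEC macro):

    /* Named operands let the template say %[in] and %[out] instead of
     * positional %0/%1; CALL_NOSPEC relies on the same mechanism to find
     * its [thunk_target] input. x86-only example. */
    static unsigned long add_one(unsigned long x)
    {
            unsigned long out;

            asm("lea 1(%[in]), %[out]"
                : [out] "=r" (out)
                : [in] "r" (x));
            return out;
    }
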
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index a4eb27918ceb..a2486f444073 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -138,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
138 return -1; 138 return -1;
139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); 139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
140 pte_unmap(pte); 140 pte_unmap(pte);
141
142 /*
143 * PTI poisons low addresses in the kernel page tables in the
144 * name of making them unusable for userspace. To execute
145 * code at such a low address, the poison must be cleared.
146 *
147 * Note: 'pgd' actually gets set in p4d_alloc() _or_
148 * pud_alloc() depending on 4/5-level paging.
149 */
150 pgd->pgd &= ~_PAGE_NX;
151
141 return 0; 152 return 0;
142} 153}
143 154
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3158dac87f82..f40d0da1f1d3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -45,6 +45,7 @@
45#include <asm/debugreg.h> 45#include <asm/debugreg.h>
46#include <asm/kvm_para.h> 46#include <asm/kvm_para.h>
47#include <asm/irq_remapping.h> 47#include <asm/irq_remapping.h>
48#include <asm/nospec-branch.h>
48 49
49#include <asm/virtext.h> 50#include <asm/virtext.h>
50#include "trace.h" 51#include "trace.h"
@@ -5027,6 +5028,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
5027#endif 5028#endif
5028 ); 5029 );
5029 5030
5031 /* Eliminate branch target predictions from guest mode */
5032 vmexit_fill_RSB();
5033
5030#ifdef CONFIG_X86_64 5034#ifdef CONFIG_X86_64
5031 wrmsrl(MSR_GS_BASE, svm->host.gs_base); 5035 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
5032#else 5036#else
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3f89f6783aa5..c829d89e2e63 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -50,6 +50,7 @@
50#include <asm/apic.h> 50#include <asm/apic.h>
51#include <asm/irq_remapping.h> 51#include <asm/irq_remapping.h>
52#include <asm/mmu_context.h> 52#include <asm/mmu_context.h>
53#include <asm/nospec-branch.h>
53 54
54#include "trace.h" 55#include "trace.h"
55#include "pmu.h" 56#include "pmu.h"
@@ -9490,6 +9491,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
9490#endif 9491#endif
9491 ); 9492 );
9492 9493
9494 /* Eliminate branch target predictions from guest mode */
9495 vmexit_fill_RSB();
9496
9493 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 9497 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
9494 if (debugctlmsr) 9498 if (debugctlmsr)
9495 update_debugctlmsr(debugctlmsr); 9499 update_debugctlmsr(debugctlmsr);
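
Both vmexit paths above now call vmexit_fill_RSB() so that return-stack-buffer entries planted by the guest cannot steer the host's ret speculation. The kernel's fill macro is not part of this hunk; the following is only a conceptual x86-64 sketch of the technique, unrolled to two entries (the real sequence fills many more):

    /* Each call pushes a return address whose RSB entry points at a harmless
     * pause/lfence speculation trap; the architectural return addresses are
     * then discarded by moving %rsp back up, leaving the stack unchanged. */
    static inline void rsb_fill_sketch(void)
    {
            asm volatile("call 771f\n\t"
                         "772:   pause; lfence; jmp 772b\n\t"
                         "771:   call 773f\n\t"
                         "774:   pause; lfence; jmp 774b\n\t"
                         "773:   add $16, %%rsp\n\t"    /* 2 entries * 8 bytes */
                         ::: "memory", "cc");
    }
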
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 7b181b61170e..f23934bbaf4e 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o
26lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o 26lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
27lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o 27lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
28lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o 28lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
29lib-$(CONFIG_RETPOLINE) += retpoline.o
29 30
30obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o 31obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
31 32
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 4d34bb548b41..46e71a74e612 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -29,7 +29,8 @@
29#include <asm/errno.h> 29#include <asm/errno.h>
30#include <asm/asm.h> 30#include <asm/asm.h>
31#include <asm/export.h> 31#include <asm/export.h>
32 32#include <asm/nospec-branch.h>
33
33/* 34/*
34 * computes a partial checksum, e.g. for TCP/UDP fragments 35 * computes a partial checksum, e.g. for TCP/UDP fragments
35 */ 36 */
@@ -156,7 +157,7 @@ ENTRY(csum_partial)
156 negl %ebx 157 negl %ebx
157 lea 45f(%ebx,%ebx,2), %ebx 158 lea 45f(%ebx,%ebx,2), %ebx
158 testl %esi, %esi 159 testl %esi, %esi
159 jmp *%ebx 160 JMP_NOSPEC %ebx
160 161
161 # Handle 2-byte-aligned regions 162 # Handle 2-byte-aligned regions
16220: addw (%esi), %ax 16320: addw (%esi), %ax
@@ -439,7 +440,7 @@ ENTRY(csum_partial_copy_generic)
439 andl $-32,%edx 440 andl $-32,%edx
440 lea 3f(%ebx,%ebx), %ebx 441 lea 3f(%ebx,%ebx), %ebx
441 testl %esi, %esi 442 testl %esi, %esi
442 jmp *%ebx 443 JMP_NOSPEC %ebx
4431: addl $64,%esi 4441: addl $64,%esi
444 addl $64,%edi 445 addl $64,%edi
445 SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) 446 SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
new file mode 100644
index 000000000000..cb45c6cb465f
--- /dev/null
+++ b/arch/x86/lib/retpoline.S
@@ -0,0 +1,48 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2
3#include <linux/stringify.h>
4#include <linux/linkage.h>
5#include <asm/dwarf2.h>
6#include <asm/cpufeatures.h>
7#include <asm/alternative-asm.h>
8#include <asm/export.h>
9#include <asm/nospec-branch.h>
10
11.macro THUNK reg
12 .section .text.__x86.indirect_thunk.\reg
13
14ENTRY(__x86_indirect_thunk_\reg)
15 CFI_STARTPROC
16 JMP_NOSPEC %\reg
17 CFI_ENDPROC
18ENDPROC(__x86_indirect_thunk_\reg)
19.endm
20
21/*
22 * Despite being an assembler file we can't just use .irp here
23 * because __KSYM_DEPS__ only uses the C preprocessor and would
24 * only see one instance of "__x86_indirect_thunk_\reg" rather
25 * than one per register with the correct names. So we do it
26 * the simple and nasty way...
27 */
28#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg)
29#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg)
30
31GENERATE_THUNK(_ASM_AX)
32GENERATE_THUNK(_ASM_BX)
33GENERATE_THUNK(_ASM_CX)
34GENERATE_THUNK(_ASM_DX)
35GENERATE_THUNK(_ASM_SI)
36GENERATE_THUNK(_ASM_DI)
37GENERATE_THUNK(_ASM_BP)
38GENERATE_THUNK(_ASM_SP)
39#ifdef CONFIG_64BIT
40GENERATE_THUNK(r8)
41GENERATE_THUNK(r9)
42GENERATE_THUNK(r10)
43GENERATE_THUNK(r11)
44GENERATE_THUNK(r12)
45GENERATE_THUNK(r13)
46GENERATE_THUNK(r14)
47GENERATE_THUNK(r15)
48#endif
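
Each generated thunk above is just JMP_NOSPEC applied to one register. When CONFIG_RETPOLINE is active that macro expands to the published retpoline construction, which the following file-scope-asm sketch reproduces for a %rax target (an illustration of the technique, not the exact asm/nospec-branch.h implementation):

    /* The call pushes a safe return address; any speculative 'ret' is caught
     * in the pause/lfence loop, while the architectural path overwrites the
     * saved return address with the real target so 'ret' jumps through %rax. */
    __asm__(
            ".pushsection .text\n"
            ".globl retpoline_rax_sketch\n"
            "retpoline_rax_sketch:\n"
            "       call    1f\n"
            "2:     pause\n"
            "       lfence\n"
            "       jmp     2b\n"
            "1:     mov     %rax, (%rsp)\n"
            "       ret\n"
            ".popsection\n"
    );
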
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 43d4a4a29037..ce38f165489b 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -149,7 +149,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
149 * 149 *
150 * Returns a pointer to a P4D on success, or NULL on failure. 150 * Returns a pointer to a P4D on success, or NULL on failure.
151 */ 151 */
152static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) 152static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
153{ 153{
154 pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); 154 pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
155 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 155 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
@@ -164,12 +164,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
164 if (!new_p4d_page) 164 if (!new_p4d_page)
165 return NULL; 165 return NULL;
166 166
167 if (pgd_none(*pgd)) { 167 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
168 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
169 new_p4d_page = 0;
170 }
171 if (new_p4d_page)
172 free_page(new_p4d_page);
173 } 168 }
174 BUILD_BUG_ON(pgd_large(*pgd) != 0); 169 BUILD_BUG_ON(pgd_large(*pgd) != 0);
175 170
@@ -182,7 +177,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
182 * 177 *
183 * Returns a pointer to a PMD on success, or NULL on failure. 178 * Returns a pointer to a PMD on success, or NULL on failure.
184 */ 179 */
185static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) 180static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
186{ 181{
187 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 182 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
188 p4d_t *p4d = pti_user_pagetable_walk_p4d(address); 183 p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
@@ -194,12 +189,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
194 if (!new_pud_page) 189 if (!new_pud_page)
195 return NULL; 190 return NULL;
196 191
197 if (p4d_none(*p4d)) { 192 set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
198 set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
199 new_pud_page = 0;
200 }
201 if (new_pud_page)
202 free_page(new_pud_page);
203 } 193 }
204 194
205 pud = pud_offset(p4d, address); 195 pud = pud_offset(p4d, address);
@@ -213,12 +203,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
213 if (!new_pmd_page) 203 if (!new_pmd_page)
214 return NULL; 204 return NULL;
215 205
216 if (pud_none(*pud)) { 206 set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
217 set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
218 new_pmd_page = 0;
219 }
220 if (new_pmd_page)
221 free_page(new_pmd_page);
222 } 207 }
223 208
224 return pmd_offset(pud, address); 209 return pmd_offset(pud, address);
@@ -251,12 +236,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
251 if (!new_pte_page) 236 if (!new_pte_page)
252 return NULL; 237 return NULL;
253 238
254 if (pmd_none(*pmd)) { 239 set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
255 set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
256 new_pte_page = 0;
257 }
258 if (new_pte_page)
259 free_page(new_pte_page);
260 } 240 }
261 241
262 pte = pte_offset_kernel(pmd, address); 242 pte = pte_offset_kernel(pmd, address);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index d87ac96e37ed..2dd15e967c3f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -135,7 +135,9 @@ pgd_t * __init efi_call_phys_prolog(void)
135 pud[j] = *pud_offset(p4d_k, vaddr); 135 pud[j] = *pud_offset(p4d_k, vaddr);
136 } 136 }
137 } 137 }
138 pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX;
138 } 139 }
140
139out: 141out:
140 __flush_tlb_all(); 142 __flush_tlb_all();
141 143
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index bdc87907d6a1..2415ad9f6dd4 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -236,6 +236,9 @@ config GENERIC_CPU_DEVICES
236config GENERIC_CPU_AUTOPROBE 236config GENERIC_CPU_AUTOPROBE
237 bool 237 bool
238 238
239config GENERIC_CPU_VULNERABILITIES
240 bool
241
239config SOC_BUS 242config SOC_BUS
240 bool 243 bool
241 select GLOB 244 select GLOB
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 58a9b608d821..d99038487a0d 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -511,10 +511,58 @@ static void __init cpu_dev_register_generic(void)
511#endif 511#endif
512} 512}
513 513
514#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES
515
516ssize_t __weak cpu_show_meltdown(struct device *dev,
517 struct device_attribute *attr, char *buf)
518{
519 return sprintf(buf, "Not affected\n");
520}
521
522ssize_t __weak cpu_show_spectre_v1(struct device *dev,
523 struct device_attribute *attr, char *buf)
524{
525 return sprintf(buf, "Not affected\n");
526}
527
528ssize_t __weak cpu_show_spectre_v2(struct device *dev,
529 struct device_attribute *attr, char *buf)
530{
531 return sprintf(buf, "Not affected\n");
532}
533
534static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
535static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
536static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
537
538static struct attribute *cpu_root_vulnerabilities_attrs[] = {
539 &dev_attr_meltdown.attr,
540 &dev_attr_spectre_v1.attr,
541 &dev_attr_spectre_v2.attr,
542 NULL
543};
544
545static const struct attribute_group cpu_root_vulnerabilities_group = {
546 .name = "vulnerabilities",
547 .attrs = cpu_root_vulnerabilities_attrs,
548};
549
550static void __init cpu_register_vulnerabilities(void)
551{
552 if (sysfs_create_group(&cpu_subsys.dev_root->kobj,
553 &cpu_root_vulnerabilities_group))
554 pr_err("Unable to register CPU vulnerabilities\n");
555}
556
557#else
558static inline void cpu_register_vulnerabilities(void) { }
559#endif
560
514void __init cpu_dev_init(void) 561void __init cpu_dev_init(void)
515{ 562{
516 if (subsys_system_register(&cpu_subsys, cpu_root_attr_groups)) 563 if (subsys_system_register(&cpu_subsys, cpu_root_attr_groups))
517 panic("Failed to register CPU subsystem"); 564 panic("Failed to register CPU subsystem");
518 565
519 cpu_dev_register_generic(); 566 cpu_dev_register_generic();
567 cpu_register_vulnerabilities();
520} 568}
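
With the attribute group registered above, each issue shows up as a read-only file under /sys/devices/system/cpu/vulnerabilities/, answered either by the __weak "Not affected" stubs here or by the x86 overrides added in bugs.c. A minimal userspace reader (a hypothetical example, not part of this series):

    #include <stdio.h>

    int main(void)
    {
            static const char *files[] = { "meltdown", "spectre_v1", "spectre_v2" };
            char path[128], buf[128];

            for (int i = 0; i < 3; i++) {
                    FILE *f;

                    snprintf(path, sizeof(path),
                             "/sys/devices/system/cpu/vulnerabilities/%s", files[i]);
                    f = fopen(path, "r");
                    if (!f || !fgets(buf, sizeof(buf), f)) {
                            printf("%-10s: <unavailable>\n", files[i]);
                            if (f)
                                    fclose(f);
                            continue;
                    }
                    printf("%-10s: %s", files[i], buf);   /* sysfs value already ends in '\n' */
                    fclose(f);
            }
            return 0;
    }
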
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index a04ef7c15c6a..7b01bc11c692 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -47,6 +47,13 @@ extern void cpu_remove_dev_attr(struct device_attribute *attr);
47extern int cpu_add_dev_attr_group(struct attribute_group *attrs); 47extern int cpu_add_dev_attr_group(struct attribute_group *attrs);
48extern void cpu_remove_dev_attr_group(struct attribute_group *attrs); 48extern void cpu_remove_dev_attr_group(struct attribute_group *attrs);
49 49
50extern ssize_t cpu_show_meltdown(struct device *dev,
51 struct device_attribute *attr, char *buf);
52extern ssize_t cpu_show_spectre_v1(struct device *dev,
53 struct device_attribute *attr, char *buf);
54extern ssize_t cpu_show_spectre_v2(struct device *dev,
55 struct device_attribute *attr, char *buf);
56
50extern __printf(4, 5) 57extern __printf(4, 5)
51struct device *cpu_device_create(struct device *parent, void *drvdata, 58struct device *cpu_device_create(struct device *parent, void *drvdata,
52 const struct attribute_group **groups, 59 const struct attribute_group **groups,
diff --git a/security/Kconfig b/security/Kconfig
index 3d4debd0257e..b0cb9a5f9448 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -63,7 +63,7 @@ config PAGE_TABLE_ISOLATION
63 ensuring that the majority of kernel addresses are not mapped 63 ensuring that the majority of kernel addresses are not mapped
64 into userspace. 64 into userspace.
65 65
66 See Documentation/x86/pagetable-isolation.txt for more details. 66 See Documentation/x86/pti.txt for more details.
67 67
68config SECURITY_INFINIBAND 68config SECURITY_INFINIBAND
69 bool "Infiniband Security Hooks" 69 bool "Infiniband Security Hooks"
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 9b341584eb1b..f40d46e24bcc 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -428,6 +428,40 @@ static void add_ignores(struct objtool_file *file)
428} 428}
429 429
430/* 430/*
431 * FIXME: For now, just ignore any alternatives which add retpolines. This is
432 * a temporary hack, as it doesn't allow ORC to unwind from inside a retpoline.
433 * But it at least allows objtool to understand the control flow *around* the
434 * retpoline.
435 */
436static int add_nospec_ignores(struct objtool_file *file)
437{
438 struct section *sec;
439 struct rela *rela;
440 struct instruction *insn;
441
442 sec = find_section_by_name(file->elf, ".rela.discard.nospec");
443 if (!sec)
444 return 0;
445
446 list_for_each_entry(rela, &sec->rela_list, list) {
447 if (rela->sym->type != STT_SECTION) {
448 WARN("unexpected relocation symbol type in %s", sec->name);
449 return -1;
450 }
451
452 insn = find_insn(file, rela->sym->sec, rela->addend);
453 if (!insn) {
454 WARN("bad .discard.nospec entry");
455 return -1;
456 }
457
458 insn->ignore_alts = true;
459 }
460
461 return 0;
462}
463
464/*
431 * Find the destination instructions for all jumps. 465 * Find the destination instructions for all jumps.
432 */ 466 */
433static int add_jump_destinations(struct objtool_file *file) 467static int add_jump_destinations(struct objtool_file *file)
@@ -456,6 +490,13 @@ static int add_jump_destinations(struct objtool_file *file)
456 } else if (rela->sym->sec->idx) { 490 } else if (rela->sym->sec->idx) {
457 dest_sec = rela->sym->sec; 491 dest_sec = rela->sym->sec;
458 dest_off = rela->sym->sym.st_value + rela->addend + 4; 492 dest_off = rela->sym->sym.st_value + rela->addend + 4;
493 } else if (strstr(rela->sym->name, "_indirect_thunk_")) {
494 /*
495 * Retpoline jumps are really dynamic jumps in
496 * disguise, so convert them accordingly.
497 */
498 insn->type = INSN_JUMP_DYNAMIC;
499 continue;
459 } else { 500 } else {
460 /* sibling call */ 501 /* sibling call */
461 insn->jump_dest = 0; 502 insn->jump_dest = 0;
@@ -502,11 +543,18 @@ static int add_call_destinations(struct objtool_file *file)
502 dest_off = insn->offset + insn->len + insn->immediate; 543 dest_off = insn->offset + insn->len + insn->immediate;
503 insn->call_dest = find_symbol_by_offset(insn->sec, 544 insn->call_dest = find_symbol_by_offset(insn->sec,
504 dest_off); 545 dest_off);
546 /*
547 * FIXME: Thanks to retpolines, it's now considered
548 * normal for a function to call within itself. So
549 * disable this warning for now.
550 */
551#if 0
505 if (!insn->call_dest) { 552 if (!insn->call_dest) {
506 WARN_FUNC("can't find call dest symbol at offset 0x%lx", 553 WARN_FUNC("can't find call dest symbol at offset 0x%lx",
507 insn->sec, insn->offset, dest_off); 554 insn->sec, insn->offset, dest_off);
508 return -1; 555 return -1;
509 } 556 }
557#endif
510 } else if (rela->sym->type == STT_SECTION) { 558 } else if (rela->sym->type == STT_SECTION) {
511 insn->call_dest = find_symbol_by_offset(rela->sym->sec, 559 insn->call_dest = find_symbol_by_offset(rela->sym->sec,
512 rela->addend+4); 560 rela->addend+4);
@@ -671,12 +719,6 @@ static int add_special_section_alts(struct objtool_file *file)
671 return ret; 719 return ret;
672 720
673 list_for_each_entry_safe(special_alt, tmp, &special_alts, list) { 721 list_for_each_entry_safe(special_alt, tmp, &special_alts, list) {
674 alt = malloc(sizeof(*alt));
675 if (!alt) {
676 WARN("malloc failed");
677 ret = -1;
678 goto out;
679 }
680 722
681 orig_insn = find_insn(file, special_alt->orig_sec, 723 orig_insn = find_insn(file, special_alt->orig_sec,
682 special_alt->orig_off); 724 special_alt->orig_off);
@@ -687,6 +729,10 @@ static int add_special_section_alts(struct objtool_file *file)
687 goto out; 729 goto out;
688 } 730 }
689 731
732 /* Ignore retpoline alternatives. */
733 if (orig_insn->ignore_alts)
734 continue;
735
690 new_insn = NULL; 736 new_insn = NULL;
691 if (!special_alt->group || special_alt->new_len) { 737 if (!special_alt->group || special_alt->new_len) {
692 new_insn = find_insn(file, special_alt->new_sec, 738 new_insn = find_insn(file, special_alt->new_sec,
@@ -712,6 +758,13 @@ static int add_special_section_alts(struct objtool_file *file)
712 goto out; 758 goto out;
713 } 759 }
714 760
761 alt = malloc(sizeof(*alt));
762 if (!alt) {
763 WARN("malloc failed");
764 ret = -1;
765 goto out;
766 }
767
715 alt->insn = new_insn; 768 alt->insn = new_insn;
716 list_add_tail(&alt->list, &orig_insn->alts); 769 list_add_tail(&alt->list, &orig_insn->alts);
717 770
@@ -1028,6 +1081,10 @@ static int decode_sections(struct objtool_file *file)
1028 1081
1029 add_ignores(file); 1082 add_ignores(file);
1030 1083
1084 ret = add_nospec_ignores(file);
1085 if (ret)
1086 return ret;
1087
1031 ret = add_jump_destinations(file); 1088 ret = add_jump_destinations(file);
1032 if (ret) 1089 if (ret)
1033 return ret; 1090 return ret;
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index 47d9ea70a83d..dbadb304a410 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -44,7 +44,7 @@ struct instruction {
44 unsigned int len; 44 unsigned int len;
45 unsigned char type; 45 unsigned char type;
46 unsigned long immediate; 46 unsigned long immediate;
47 bool alt_group, visited, dead_end, ignore, hint, save, restore; 47 bool alt_group, visited, dead_end, ignore, hint, save, restore, ignore_alts;
48 struct symbol *call_dest; 48 struct symbol *call_dest;
49 struct instruction *jump_dest; 49 struct instruction *jump_dest;
50 struct list_head alts; 50 struct list_head alts;
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 939a337128db..5d4f10ac2af2 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -7,7 +7,7 @@ include ../lib.mk
7 7
8TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \ 8TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
9 check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ioperm \ 9 check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ioperm \
10 protection_keys test_vdso 10 protection_keys test_vdso test_vsyscall
11TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ 11TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
12 test_FCMOV test_FCOMI test_FISTTP \ 12 test_FCMOV test_FCOMI test_FISTTP \
13 vdso_restorer 13 vdso_restorer
diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
new file mode 100644
index 000000000000..7a744fa7b786
--- /dev/null
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -0,0 +1,500 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2
3#define _GNU_SOURCE
4
5#include <stdio.h>
6#include <sys/time.h>
7#include <time.h>
8#include <stdlib.h>
9#include <sys/syscall.h>
10#include <unistd.h>
11#include <dlfcn.h>
12#include <string.h>
13#include <inttypes.h>
14#include <signal.h>
15#include <sys/ucontext.h>
16#include <errno.h>
17#include <err.h>
18#include <sched.h>
19#include <stdbool.h>
20#include <setjmp.h>
21
22#ifdef __x86_64__
23# define VSYS(x) (x)
24#else
25# define VSYS(x) 0
26#endif
27
28#ifndef SYS_getcpu
29# ifdef __x86_64__
30# define SYS_getcpu 309
31# else
32# define SYS_getcpu 318
33# endif
34#endif
35
36static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
37 int flags)
38{
39 struct sigaction sa;
40 memset(&sa, 0, sizeof(sa));
41 sa.sa_sigaction = handler;
42 sa.sa_flags = SA_SIGINFO | flags;
43 sigemptyset(&sa.sa_mask);
44 if (sigaction(sig, &sa, 0))
45 err(1, "sigaction");
46}
47
48/* vsyscalls and vDSO */
49bool should_read_vsyscall = false;
50
51typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
52gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
53gtod_t vdso_gtod;
54
55typedef int (*vgettime_t)(clockid_t, struct timespec *);
56vgettime_t vdso_gettime;
57
58typedef long (*time_func_t)(time_t *t);
59time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
60time_func_t vdso_time;
61
62typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
63getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
64getcpu_t vdso_getcpu;
65
66static void init_vdso(void)
67{
68 void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
69 if (!vdso)
70 vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
71 if (!vdso) {
72 printf("[WARN]\tfailed to find vDSO\n");
73 return;
74 }
75
76 vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday");
77 if (!vdso_gtod)
78 printf("[WARN]\tfailed to find gettimeofday in vDSO\n");
79
80 vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime");
81 if (!vdso_gettime)
82 printf("[WARN]\tfailed to find clock_gettime in vDSO\n");
83
84 vdso_time = (time_func_t)dlsym(vdso, "__vdso_time");
85 if (!vdso_time)
86 printf("[WARN]\tfailed to find time in vDSO\n");
87
88 vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
89 if (!vdso_getcpu) {
90 /* getcpu() was never wired up in the 32-bit vDSO. */
91 printf("[%s]\tfailed to find getcpu in vDSO\n",
92 sizeof(long) == 8 ? "WARN" : "NOTE");
93 }
94}
95
96static int init_vsys(void)
97{
98#ifdef __x86_64__
99 int nerrs = 0;
100 FILE *maps;
101 char line[128];
102 bool found = false;
103
104 maps = fopen("/proc/self/maps", "r");
105 if (!maps) {
106 printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
107 should_read_vsyscall = true;
108 return 0;
109 }
110
111 while (fgets(line, sizeof(line), maps)) {
112 char r, x;
113 void *start, *end;
114 char name[128];
115 if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
116 &start, &end, &r, &x, name) != 5)
117 continue;
118
119 if (strcmp(name, "[vsyscall]"))
120 continue;
121
122 printf("\tvsyscall map: %s", line);
123
124 if (start != (void *)0xffffffffff600000 ||
125 end != (void *)0xffffffffff601000) {
126 printf("[FAIL]\taddress range is nonsense\n");
127 nerrs++;
128 }
129
130 printf("\tvsyscall permissions are %c-%c\n", r, x);
131 should_read_vsyscall = (r == 'r');
132 if (x != 'x') {
133 vgtod = NULL;
134 vtime = NULL;
135 vgetcpu = NULL;
136 }
137
138 found = true;
139 break;
140 }
141
142 fclose(maps);
143
144 if (!found) {
145 printf("\tno vsyscall map in /proc/self/maps\n");
146 should_read_vsyscall = false;
147 vgtod = NULL;
148 vtime = NULL;
149 vgetcpu = NULL;
150 }
151
152 return nerrs;
153#else
154 return 0;
155#endif
156}
157
158/* syscalls */
159static inline long sys_gtod(struct timeval *tv, struct timezone *tz)
160{
161 return syscall(SYS_gettimeofday, tv, tz);
162}
163
164static inline int sys_clock_gettime(clockid_t id, struct timespec *ts)
165{
166 return syscall(SYS_clock_gettime, id, ts);
167}
168
169static inline long sys_time(time_t *t)
170{
171 return syscall(SYS_time, t);
172}
173
174static inline long sys_getcpu(unsigned * cpu, unsigned * node,
175 void* cache)
176{
177 return syscall(SYS_getcpu, cpu, node, cache);
178}
179
180static jmp_buf jmpbuf;
181
182static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
183{
184 siglongjmp(jmpbuf, 1);
185}
186
187static double tv_diff(const struct timeval *a, const struct timeval *b)
188{
189 return (double)(a->tv_sec - b->tv_sec) +
190 (double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6;
191}
192
193static int check_gtod(const struct timeval *tv_sys1,
194 const struct timeval *tv_sys2,
195 const struct timezone *tz_sys,
196 const char *which,
197 const struct timeval *tv_other,
198 const struct timezone *tz_other)
199{
200 int nerrs = 0;
201 double d1, d2;
202
203 if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) {
204 printf("[FAIL] %s tz mismatch\n", which);
205 nerrs++;
206 }
207
208 d1 = tv_diff(tv_other, tv_sys1);
209 d2 = tv_diff(tv_sys2, tv_other);
210 printf("\t%s time offsets: %lf %lf\n", which, d1, d2);
211
212 if (d1 < 0 || d2 < 0) {
213 printf("[FAIL]\t%s time was inconsistent with the syscall\n", which);
214 nerrs++;
215 } else {
216 printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which);
217 }
218
219 return nerrs;
220}
221
222static int test_gtod(void)
223{
224 struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys;
225 struct timezone tz_sys, tz_vdso, tz_vsys;
226 long ret_vdso = -1;
227 long ret_vsys = -1;
228 int nerrs = 0;
229
230 printf("[RUN]\ttest gettimeofday()\n");
231
232 if (sys_gtod(&tv_sys1, &tz_sys) != 0)
233 err(1, "syscall gettimeofday");
234 if (vdso_gtod)
235 ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
236 if (vgtod)
237 ret_vsys = vgtod(&tv_vsys, &tz_vsys);
238 if (sys_gtod(&tv_sys2, &tz_sys) != 0)
239 err(1, "syscall gettimeofday");
240
241 if (vdso_gtod) {
242 if (ret_vdso == 0) {
243 nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso);
244 } else {
245 printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso);
246 nerrs++;
247 }
248 }
249
250 if (vgtod) {
251 if (ret_vsys == 0) {
252 nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
253 } else {
254 printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys);
255 nerrs++;
256 }
257 }
258
259 return nerrs;
260}
261
262static int test_time(void) {
263 int nerrs = 0;
264
265 printf("[RUN]\ttest time()\n");
266 long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0;
267 long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1;
268 t_sys1 = sys_time(&t2_sys1);
269 if (vdso_time)
270 t_vdso = vdso_time(&t2_vdso);
271 if (vtime)
272 t_vsys = vtime(&t2_vsys);
273 t_sys2 = sys_time(&t2_sys2);
274 if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
275 printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2);
276 nerrs++;
277 return nerrs;
278 }
279
280 if (vdso_time) {
281 if (t_vdso < 0 || t_vdso != t2_vdso) {
282 printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso);
283 nerrs++;
284 } else if (t_vdso < t_sys1 || t_vdso > t_sys2) {
285 printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2);
286 nerrs++;
287 } else {
288 printf("[OK]\tvDSO time() is okay\n");
289 }
290 }
291
292 if (vtime) {
293 if (t_vsys < 0 || t_vsys != t2_vsys) {
294 printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
295 nerrs++;
296 } else if (t_vsys < t_sys1 || t_vsys > t_sys2) {
297 printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2);
298 nerrs++;
299 } else {
300 printf("[OK]\tvsyscall time() is okay\n");
301 }
302 }
303
304 return nerrs;
305}
306
307static int test_getcpu(int cpu)
308{
309 int nerrs = 0;
310 long ret_sys, ret_vdso = -1, ret_vsys = -1;
311
312 printf("[RUN]\tgetcpu() on CPU %d\n", cpu);
313
314 cpu_set_t cpuset;
315 CPU_ZERO(&cpuset);
316 CPU_SET(cpu, &cpuset);
317 if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
318 printf("[SKIP]\tfailed to force CPU %d\n", cpu);
319 return nerrs;
320 }
321
322 unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys;
323 unsigned node = 0;
324 bool have_node = false;
325 ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
326 if (vdso_getcpu)
327 ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
328 if (vgetcpu)
329 ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
330
331 if (ret_sys == 0) {
332 if (cpu_sys != cpu) {
333 printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu);
334 nerrs++;
335 }
336
337 have_node = true;
338 node = node_sys;
339 }
340
341 if (vdso_getcpu) {
342 if (ret_vdso) {
343 printf("[FAIL]\tvDSO getcpu() failed\n");
344 nerrs++;
345 } else {
346 if (!have_node) {
347 have_node = true;
348 node = node_vdso;
349 }
350
351 if (cpu_vdso != cpu) {
352 printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu);
353 nerrs++;
354 } else {
355 printf("[OK]\tvDSO reported correct CPU\n");
356 }
357
358 if (node_vdso != node) {
359 printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node);
360 nerrs++;
361 } else {
362 printf("[OK]\tvDSO reported correct node\n");
363 }
364 }
365 }
366
367 if (vgetcpu) {
368 if (ret_vsys) {
369 printf("[FAIL]\tvsyscall getcpu() failed\n");
370 nerrs++;
371 } else {
372 if (!have_node) {
373 have_node = true;
374 node = node_vsys;
375 }
376
377 if (cpu_vsys != cpu) {
378 printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu);
379 nerrs++;
380 } else {
381 printf("[OK]\tvsyscall reported correct CPU\n");
382 }
383
384 if (node_vsys != node) {
385 printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node);
386 nerrs++;
387 } else {
388 printf("[OK]\tvsyscall reported correct node\n");
389 }
390 }
391 }
392
393 return nerrs;
394}
395
396static int test_vsys_r(void)
397{
398#ifdef __x86_64__
399 printf("[RUN]\tChecking read access to the vsyscall page\n");
400 bool can_read;
401 if (sigsetjmp(jmpbuf, 1) == 0) {
402 *(volatile int *)0xffffffffff600000;
403 can_read = true;
404 } else {
405 can_read = false;
406 }
407
408 if (can_read && !should_read_vsyscall) {
409 printf("[FAIL]\tWe have read access, but we shouldn't\n");
410 return 1;
411 } else if (!can_read && should_read_vsyscall) {
412 printf("[FAIL]\tWe don't have read access, but we should\n");
413 return 1;
414 } else {
415 printf("[OK]\tgot expected result\n");
416 }
417#endif
418
419 return 0;
420}
421
422
423#ifdef __x86_64__
424#define X86_EFLAGS_TF (1UL << 8)
425static volatile sig_atomic_t num_vsyscall_traps;
426
427static unsigned long get_eflags(void)
428{
429 unsigned long eflags;
430 asm volatile ("pushfq\n\tpopq %0" : "=rm" (eflags));
431 return eflags;
432}
433
434static void set_eflags(unsigned long eflags)
435{
436 asm volatile ("pushq %0\n\tpopfq" : : "rm" (eflags) : "flags");
437}
438
439static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
440{
441 ucontext_t *ctx = (ucontext_t *)ctx_void;
442 unsigned long ip = ctx->uc_mcontext.gregs[REG_RIP];
443
444 if (((ip ^ 0xffffffffff600000UL) & ~0xfffUL) == 0)
445 num_vsyscall_traps++;
446}
447
448static int test_native_vsyscall(void)
449{
450 time_t tmp;
451 bool is_native;
452
453 if (!vtime)
454 return 0;
455
456 printf("[RUN]\tchecking for native vsyscall\n");
457 sethandler(SIGTRAP, sigtrap, 0);
458 set_eflags(get_eflags() | X86_EFLAGS_TF);
459 vtime(&tmp);
460 set_eflags(get_eflags() & ~X86_EFLAGS_TF);
461
462 /*
463 * If vsyscalls are emulated, we expect a single trap in the
464 * vsyscall page -- the call instruction will trap with RIP
465 * pointing to the entry point before emulation takes over.
466 * In native mode, we expect two traps, since whatever code
467 * the vsyscall page contains will be more than just a ret
468 * instruction.
469 */
470 is_native = (num_vsyscall_traps > 1);
471
472 printf("\tvsyscalls are %s (%d instructions in vsyscall page)\n",
473 (is_native ? "native" : "emulated"),
474 (int)num_vsyscall_traps);
475
476 return 0;
477}
478#endif
479
480int main(int argc, char **argv)
481{
482 int nerrs = 0;
483
484 init_vdso();
485 nerrs += init_vsys();
486
487 nerrs += test_gtod();
488 nerrs += test_time();
489 nerrs += test_getcpu(0);
490 nerrs += test_getcpu(1);
491
492 sethandler(SIGSEGV, sigsegv, 0);
493 nerrs += test_vsys_r();
494
495#ifdef __x86_64__
496 nerrs += test_native_vsyscall();
497#endif
498
499 return nerrs ? 1 : 0;
500}