-rw-r--r--  Documentation/ABI/testing/sysfs-devices-system-cpu  2
-rw-r--r--  Documentation/admin-guide/hw-vuln/index.rst  2
-rw-r--r--  Documentation/admin-guide/hw-vuln/multihit.rst  163
-rw-r--r--  Documentation/admin-guide/hw-vuln/tsx_async_abort.rst  276
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt  92
-rw-r--r--  Documentation/x86/index.rst  1
-rw-r--r--  Documentation/x86/tsx_async_abort.rst  117
-rw-r--r--  arch/x86/Kconfig  45
-rw-r--r--  arch/x86/include/asm/cpufeatures.h  2
-rw-r--r--  arch/x86/include/asm/kvm_host.h  6
-rw-r--r--  arch/x86/include/asm/msr-index.h  16
-rw-r--r--  arch/x86/include/asm/nospec-branch.h  4
-rw-r--r--  arch/x86/include/asm/processor.h  7
-rw-r--r--  arch/x86/kernel/cpu/Makefile  2
-rw-r--r--  arch/x86/kernel/cpu/bugs.c  159
-rw-r--r--  arch/x86/kernel/cpu/common.c  99
-rw-r--r--  arch/x86/kernel/cpu/cpu.h  18
-rw-r--r--  arch/x86/kernel/cpu/intel.c  5
-rw-r--r--  arch/x86/kernel/cpu/tsx.c  140
-rw-r--r--  arch/x86/kvm/mmu.c  270
-rw-r--r--  arch/x86/kvm/mmu.h  4
-rw-r--r--  arch/x86/kvm/paging_tmpl.h  29
-rw-r--r--  arch/x86/kvm/x86.c  39
-rw-r--r--  drivers/base/cpu.c  17
-rw-r--r--  drivers/gpu/drm/i915/gem/i915_gem_context.c  5
-rw-r--r--  drivers/gpu/drm/i915/gem/i915_gem_context_types.h  7
-rw-r--r--  drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c  111
-rw-r--r--  drivers/gpu/drm/i915/gt/intel_engine_types.h  13
-rw-r--r--  drivers/gpu/drm/i915/gt/intel_gt_pm.c  8
-rw-r--r--  drivers/gpu/drm/i915/i915_cmd_parser.c  435
-rw-r--r--  drivers/gpu/drm/i915/i915_drv.c  4
-rw-r--r--  drivers/gpu/drm/i915/i915_drv.h  31
-rw-r--r--  drivers/gpu/drm/i915/i915_gem.c  16
-rw-r--r--  drivers/gpu/drm/i915/i915_getparam.c  2
-rw-r--r--  drivers/gpu/drm/i915/i915_reg.h  10
-rw-r--r--  drivers/gpu/drm/i915/intel_pm.c  122
-rw-r--r--  drivers/gpu/drm/i915/intel_pm.h  3
-rw-r--r--  drivers/scsi/qla2xxx/qla_mid.c  8
-rw-r--r--  drivers/scsi/qla2xxx/qla_os.c  8
-rw-r--r--  drivers/scsi/scsi_lib.c  3
-rw-r--r--  drivers/scsi/sd_zbc.c  29
-rw-r--r--  include/linux/cpu.h  30
-rw-r--r--  include/linux/kvm_host.h  6
-rw-r--r--  kernel/cpu.c  27
-rw-r--r--  kernel/signal.c  2
-rw-r--r--  virt/kvm/kvm_main.c  112
46 files changed, 2231 insertions, 276 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 06d0931119cc..fc20cde63d1e 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -486,6 +486,8 @@ What: /sys/devices/system/cpu/vulnerabilities
486 /sys/devices/system/cpu/vulnerabilities/spec_store_bypass 486 /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
487 /sys/devices/system/cpu/vulnerabilities/l1tf 487 /sys/devices/system/cpu/vulnerabilities/l1tf
488 /sys/devices/system/cpu/vulnerabilities/mds 488 /sys/devices/system/cpu/vulnerabilities/mds
489 /sys/devices/system/cpu/vulnerabilities/tsx_async_abort
490 /sys/devices/system/cpu/vulnerabilities/itlb_multihit
489Date: January 2018 491Date: January 2018
490Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> 492Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
491Description: Information about CPU vulnerabilities 493Description: Information about CPU vulnerabilities
diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index 49311f3da6f2..0795e3c2643f 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -12,3 +12,5 @@ are configurable at compile, boot or run time.
12 spectre 12 spectre
13 l1tf 13 l1tf
14 mds 14 mds
15 tsx_async_abort
16 multihit.rst
diff --git a/Documentation/admin-guide/hw-vuln/multihit.rst b/Documentation/admin-guide/hw-vuln/multihit.rst
new file mode 100644
index 000000000000..ba9988d8bce5
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/multihit.rst
@@ -0,0 +1,163 @@
1iTLB multihit
2=============
3
4iTLB multihit is an erratum where some processors may incur a machine check
5error, possibly resulting in an unrecoverable CPU lockup, when an
6instruction fetch hits multiple entries in the instruction TLB. This can
7occur when the page size is changed along with either the physical address
8or cache type. A malicious guest running on a virtualized system can
9exploit this erratum to perform a denial of service attack.
10
11
12Affected processors
13-------------------
14
15Variations of this erratum are present on most Intel Core and Xeon processor
16models. The erratum is not present on:
17
18 - non-Intel processors
19
20 - Some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell, Silvermont)
21
22 - Intel processors that have the PSCHANGE_MC_NO bit set in the
23 IA32_ARCH_CAPABILITIES MSR.
24
25
26Related CVEs
27------------
28
29The following CVE entry is related to this issue:
30
31 ============== =================================================
32 CVE-2018-12207 Machine Check Error Avoidance on Page Size Change
33 ============== =================================================
34
35
36Problem
37-------
38
39Privileged software, including OS and virtual machine managers (VMM), is in
40charge of memory management. A key component in memory management is the control
41of the page tables. Modern processors use virtual memory, a technique that creates
42the illusion of a very large memory for processors. This virtual space is split
43into pages of a given size. Page tables translate virtual addresses to physical
44addresses.
45
46To reduce latency when performing a virtual to physical address translation,
47processors include a structure, called TLB, that caches recent translations.
48There are separate TLBs for instruction (iTLB) and data (dTLB).
49
50Under this erratum, instructions are fetched from a linear address translated
51using a 4 KB translation cached in the iTLB. Privileged software modifies the
52paging structure so that the same linear address is mapped using a large page
53size (2 MB, 4 MB, 1 GB) with a different physical address or memory type. After
54the page structure modification but before the software invalidates any iTLB
55entries for the linear address, a code fetch on the same linear address may
56cause a machine-check error which can result in a system hang or shutdown.
57
58
59Attack scenarios
60----------------
61
62Attacks against the iTLB multihit erratum can be mounted from malicious
63guests in a virtualized system.
64
65
66iTLB multihit system information
67--------------------------------
68
69The Linux kernel provides a sysfs interface to enumerate the current iTLB
70multihit status of the system: whether the system is vulnerable and which
71mitigations are active. The relevant sysfs file is:
72
73/sys/devices/system/cpu/vulnerabilities/itlb_multihit
74
75The possible values in this file are:
76
77.. list-table::
78
79 * - Not affected
80 - The processor is not vulnerable.
81 * - KVM: Mitigation: Split huge pages
82 - Software changes mitigate this issue.
83 * - KVM: Vulnerable
84 - The processor is vulnerable, but no mitigation enabled
85
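
As a quick check, the value documented above can be read straight from sysfs; a
minimal user-space sketch in plain C (nothing assumed beyond the sysfs file
itself)::

  #include <stdio.h>

  int main(void)
  {
      char status[128];
      FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/itlb_multihit", "r");

      if (!f) {
          perror("itlb_multihit");        /* file absent on older kernels */
          return 1;
      }
      if (fgets(status, sizeof(status), f))
          fputs(status, stdout);          /* e.g. "KVM: Mitigation: Split huge pages" */
      fclose(f);
      return 0;
  }
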
86
87Enumeration of the erratum
88--------------------------------
89
90A new bit has been allocated in the IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) MSR
91and will be set on CPUs which are mitigated against this issue.
92
93 ======================================= =========== ===============================
94 IA32_ARCH_CAPABILITIES MSR Not present Possibly vulnerable, check model
95 IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '0' Likely vulnerable, check model
96 IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '1' Not vulnerable
97 ======================================= =========== ===============================
98
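
The same enumeration can be probed from user space through the msr driver; a
hedged sketch (assumes the msr module is loaded and root privileges; a failed
read corresponds to the "Not present" row above)::

  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <unistd.h>

  #define MSR_IA32_ARCH_CAPABILITIES  0x10a
  #define ARCH_CAP_PSCHANGE_MC_NO     (1ULL << 6)

  int main(void)
  {
      uint64_t cap;
      int fd = open("/dev/cpu/0/msr", O_RDONLY);

      if (fd < 0) {
          perror("open /dev/cpu/0/msr");
          return 1;
      }
      if (pread(fd, &cap, sizeof(cap), MSR_IA32_ARCH_CAPABILITIES) != sizeof(cap))
          puts("IA32_ARCH_CAPABILITIES not present: possibly vulnerable, check model");
      else if (cap & ARCH_CAP_PSCHANGE_MC_NO)
          puts("PSCHANGE_MC_NO=1: not vulnerable");
      else
          puts("PSCHANGE_MC_NO=0: likely vulnerable, check model");
      close(fd);
      return 0;
  }
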
99
100Mitigation mechanism
101-------------------------
102
103This erratum can be mitigated by restricting the use of large page sizes to
104non-executable pages. This forces all iTLB entries to be 4K, and removes
105the possibility of multiple hits.
106
107In order to mitigate the vulnerability, KVM initially marks all huge pages
108as non-executable. If the guest attempts to execute in one of those pages,
109the page is broken down into 4K pages, which are then marked executable.
110
111If EPT is disabled or not available on the host, KVM is in control of TLB
112flushes and the problematic situation cannot happen. However, the shadow
113EPT paging mechanism used by nested virtualization is vulnerable, because
114the nested guest can trigger multiple iTLB hits by modifying its own
115(non-nested) page tables. For simplicity, KVM will make large pages
116non-executable in all shadow paging modes.
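
The split-on-execute behaviour can be pictured with a deliberately simplified,
stand-alone model (illustration only, not KVM code; the structure and helper
name are invented)::

  #include <stdbool.h>
  #include <stdio.h>

  #define PTES_PER_HUGE_PAGE 512            /* 2 MB / 4 KB */

  struct mapping {
      bool huge;                            /* still mapped by one large entry?   */
      bool exec_4k[PTES_PER_HUGE_PAGE];     /* per-4K executable bits after split */
  };

  /* A huge mapping is never executable; the first instruction fetch breaks it
   * into 4K entries which are then marked executable, so every iTLB entry used
   * for code is a 4K entry. */
  static void instruction_fetch(struct mapping *m, unsigned int idx)
  {
      if (m->huge) {
          m->huge = false;
          for (unsigned int i = 0; i < PTES_PER_HUGE_PAGE; i++)
              m->exec_4k[i] = true;
          printf("huge page split on exec fault (4K index %u)\n", idx);
      }
  }

  int main(void)
  {
      struct mapping m = { .huge = true };

      instruction_fetch(&m, 3);             /* first fetch triggers the split */
      instruction_fetch(&m, 7);             /* later fetches hit 4K entries   */
      return 0;
  }
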
117
118Mitigation control on the kernel command line and KVM - module parameter
119------------------------------------------------------------------------
120
121The KVM hypervisor mitigation mechanism for marking huge pages as
122non-executable can be controlled with a module parameter "nx_huge_pages=".
123The kernel command line allows controlling the iTLB multihit mitigations at
124boot time with the option "kvm.nx_huge_pages=".
125
126The valid arguments for these options are:
127
128 ========== ================================================================
129 force Mitigation is enabled. In this case, the mitigation implements
130 non-executable huge pages in Linux kernel KVM module. All huge
131 pages in the EPT are marked as non-executable.
132 If a guest attempts to execute in one of those pages, the page is
133 broken down into 4K pages, which are then marked executable.
134
135 off Mitigation is disabled.
136
137 auto Enable mitigation only if the platform is affected and the kernel
138 was not booted with the "mitigations=off" command line parameter.
139 This is the default option.
140 ========== ================================================================
141
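
Whether the workaround is currently active can be read back at run time; a
small sketch, assuming the usual sysfs location for module parameters and that
the value prints as a short string such as Y or N::

  #include <stdio.h>

  int main(void)
  {
      char value[32];
      FILE *f = fopen("/sys/module/kvm/parameters/nx_huge_pages", "r");

      if (!f) {
          perror("kvm.nx_huge_pages");      /* kvm module not loaded? */
          return 1;
      }
      if (fgets(value, sizeof(value), f))
          printf("kvm.nx_huge_pages = %s", value);
      fclose(f);
      return 0;
  }
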
142
143Mitigation selection guide
144--------------------------
145
1461. No virtualization in use
147^^^^^^^^^^^^^^^^^^^^^^^^^^^
148
149 The system is protected by the kernel unconditionally and no further
150 action is required.
151
1522. Virtualization with trusted guests
153^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
154
155 If the guest comes from a trusted source, you may assume that the guest will
156 not attempt to maliciously exploit these errata and no further action is
157 required.
158
1593. Virtualization with untrusted guests
160^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
161 If the guest comes from an untrusted source, the host kernel will need
162 to apply iTLB multihit mitigation via the kernel command line or kvm
163 module parameter.
diff --git a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
new file mode 100644
index 000000000000..fddbd7579c53
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
@@ -0,0 +1,276 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3TAA - TSX Asynchronous Abort
4======================================
5
6TAA is a hardware vulnerability that allows unprivileged speculative access to
7data which is available in various CPU internal buffers by using asynchronous
8aborts within an Intel TSX transactional region.
9
10Affected processors
11-------------------
12
13This vulnerability only affects Intel processors that support Intel
14Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8)
15is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit
16(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations
17also mitigate against TAA.
18
19Whether a processor is affected or not can be read out from the TAA
20vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`.
21
22Related CVEs
23------------
24
25The following CVE entry is related to this TAA issue:
26
27 ============== ===== ===================================================
28 CVE-2019-11135 TAA TSX Asynchronous Abort (TAA) condition on some
29 microprocessors utilizing speculative execution may
30 allow an authenticated user to potentially enable
31 information disclosure via a side channel with
32 local access.
33 ============== ===== ===================================================
34
35Problem
36-------
37
38When performing store, load or L1 refill operations, processors write
39data into temporary microarchitectural structures (buffers). The data in
40those buffers can be forwarded to load operations as an optimization.
41
42Intel TSX is an extension to the x86 instruction set architecture that adds
43hardware transactional memory support to improve performance of multi-threaded
44software. TSX lets the processor expose and exploit concurrency hidden in an
45application by dynamically avoiding unnecessary synchronization.
46
47TSX supports atomic memory transactions that are either committed (success) or
48aborted. During an abort, operations that happened within the transactional region
49are rolled back. An asynchronous abort takes place, among other options, when a
50different thread accesses a cache line that is also used within the transactional
51region when that access might lead to a data race.
52
53Immediately after an uncompleted asynchronous abort, certain speculatively
54executed loads may read data from those internal buffers and pass it to dependent
55operations. This can be then used to infer the value via a cache side channel
56attack.
57
58Because the buffers are potentially shared between Hyper-Threads, cross
59Hyper-Thread attacks are possible.
60
61The victim of a malicious actor does not need to make use of TSX. Only the
62attacker needs to begin a TSX transaction and raise an asynchronous abort
63which in turn potentially leaks data stored in the buffers.
64
65More detailed technical information is available in the TAA specific x86
66architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`.
67
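
As a concrete illustration of such a transactional region, a minimal RTM
example (requires a TSX-capable CPU with TSX enabled and compilation with
-mrtm; it only demonstrates transactions and aborts, not the attack)::

  #include <immintrin.h>
  #include <stdio.h>

  static int counter;

  int main(void)
  {
      unsigned int status = _xbegin();      /* open a transactional region */

      if (status == _XBEGIN_STARTED) {
          counter++;                        /* executed transactionally */
          _xend();                          /* commit */
          puts("transaction committed");
      } else {
          /* Any abort - including an asynchronous one - lands here with all
           * transactional updates rolled back. */
          counter++;                        /* non-transactional fallback path */
          printf("transaction aborted, status=0x%x\n", status);
      }
      return 0;
  }
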
68
69Attack scenarios
70----------------
71
72Attacks against the TAA vulnerability can be implemented from unprivileged
73applications running on hosts or guests.
74
75As for MDS, the attacker has no control over the memory addresses that can
76be leaked. Only the victim is responsible for bringing data to the CPU. As
77a result, the malicious actor has to sample as much data as possible and
78then postprocess it to try to infer any useful information from it.
79
80A potential attacker only has read access to the data. Also, there is no direct
81privilege escalation by using this technique.
82
83
84.. _tsx_async_abort_sys_info:
85
86TAA system information
87-----------------------
88
89The Linux kernel provides a sysfs interface to enumerate the current TAA status
90of mitigated systems. The relevant sysfs file is:
91
92/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
93
94The possible values in this file are:
95
96.. list-table::
97
98 * - 'Vulnerable'
99 - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied.
100 * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
101 - The system tries to clear the buffers but the microcode might not support the operation.
102 * - 'Mitigation: Clear CPU buffers'
103 - The microcode has been updated to clear the buffers. TSX is still enabled.
104 * - 'Mitigation: TSX disabled'
105 - TSX is disabled.
106 * - 'Not affected'
107 - The CPU is not affected by this issue.
108
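
The TAA entry is one of several files in the same directory; a small sketch
that dumps every vulnerability file at once (plain POSIX C, no assumptions
beyond the sysfs layout shown above)::

  #include <dirent.h>
  #include <stdio.h>

  int main(void)
  {
      const char *dir = "/sys/devices/system/cpu/vulnerabilities";
      char path[512], line[256];
      struct dirent *de;
      DIR *d = opendir(dir);

      if (!d) {
          perror(dir);
          return 1;
      }
      while ((de = readdir(d)) != NULL) {
          FILE *f;

          if (de->d_name[0] == '.')
              continue;                     /* skip "." and ".." */
          snprintf(path, sizeof(path), "%s/%s", dir, de->d_name);
          f = fopen(path, "r");
          if (f && fgets(line, sizeof(line), f))
              printf("%-20s %s", de->d_name, line);
          if (f)
              fclose(f);
      }
      closedir(d);
      return 0;
  }
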
109.. _ucode_needed:
110
111Best effort mitigation mode
112^^^^^^^^^^^^^^^^^^^^^^^^^^^
113
114If the processor is vulnerable but the availability of the microcode-based
115mitigation mechanism is not advertised via CPUID, the kernel selects a best
116effort mitigation mode. This mode invokes the mitigation instructions
117without a guarantee that they clear the CPU buffers.
118
119This is done to address virtualization scenarios where the host has the
120microcode update applied, but the hypervisor is not yet updated to expose the
121CPUID to the guest. If the host has updated microcode the protection takes
122effect; otherwise a few CPU cycles are wasted pointlessly.
123
124The state in the tsx_async_abort sysfs file reflects this situation
125accordingly.
126
127
128Mitigation mechanism
129--------------------
130
131The kernel detects the affected CPUs and the presence of the microcode which is
132required. If a CPU is affected and the microcode is available, then the kernel
133enables the mitigation by default.
134
135
136The mitigation can be controlled at boot time via a kernel command line option.
137See :ref:`taa_mitigation_control_command_line`.
138
139.. _virt_mechanism:
140
141Virtualization mitigation
142^^^^^^^^^^^^^^^^^^^^^^^^^
143
144Affected systems where the host has the TAA microcode and TAA is mitigated by
145having disabled TSX previously are not vulnerable, regardless of the status
146of the VMs.
147
148In all other cases, if the host either does not have the TAA microcode or
149the kernel is not mitigated, the system might be vulnerable.
150
151
152.. _taa_mitigation_control_command_line:
153
154Mitigation control on the kernel command line
155---------------------------------------------
156
157The kernel command line allows controlling the TAA mitigations at boot time with
158the option "tsx_async_abort=". The valid arguments for this option are:
159
160 ============ =============================================================
161 off This option disables the TAA mitigation on affected platforms.
162 If the system has TSX enabled (see next parameter) and the CPU
163 is affected, the system is vulnerable.
164
165 full TAA mitigation is enabled. If TSX is enabled, on an affected
166 system it will clear CPU buffers on ring transitions. On
167 systems which are MDS-affected and deploy MDS mitigation,
168 TAA is also mitigated. Specifying this option on those
169 systems will have no effect.
170
171 full,nosmt The same as tsx_async_abort=full, with SMT disabled on
172 vulnerable CPUs that have TSX enabled. This is the complete
173 mitigation. When TSX is disabled, SMT is not disabled because
174 the CPU is not vulnerable to cross-thread TAA attacks.
175 ============ =============================================================
176
177Not specifying this option is equivalent to "tsx_async_abort=full".
178
179The kernel command line also allows controlling the TSX feature using the
180parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used
181to control the TSX feature and the enumeration of the TSX feature bits (RTM
182and HLE) in CPUID.
183
184The valid options are:
185
186 ============ =============================================================
187 off Disables TSX on the system.
188
189 Note that this option takes effect only on newer CPUs which are
190 not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1
191 and which get the new IA32_TSX_CTRL MSR through a microcode
192 update. This new MSR allows for the reliable deactivation of
193 the TSX functionality.
194
195 on Enables TSX.
196
197 Although there are mitigations for all known security
198 vulnerabilities, TSX has been known to be an accelerator for
199 several previous speculation-related CVEs, and so there may be
200 unknown security risks associated with leaving it enabled.
201
202 auto Disables TSX if X86_BUG_TAA is present, otherwise enables TSX
203 on the system.
204 ============ =============================================================
205
206Not specifying this option is equivalent to "tsx=off".
207
208The following combinations of "tsx_async_abort" and "tsx" are possible. For
209affected platforms, tsx=auto is equivalent to tsx=off and the result will be:
210
211 ========= ========================== =========================================
212 tsx=on tsx_async_abort=full The system will use VERW to clear CPU
213 buffers. Cross-thread attacks are still
214 possible on SMT machines.
215 tsx=on tsx_async_abort=full,nosmt As above, cross-thread attacks on SMT
216 mitigated.
217 tsx=on tsx_async_abort=off The system is vulnerable.
218 tsx=off tsx_async_abort=full TSX might be disabled if microcode
219 provides a TSX control MSR. If so,
220 system is not vulnerable.
221 tsx=off tsx_async_abort=full,nosmt Ditto
222 tsx=off tsx_async_abort=off Ditto
223 ========= ========================== =========================================
224
225
226For unaffected platforms "tsx=on" and "tsx_async_abort=full" do not clear CPU
227buffers. For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0)
228the "tsx" command line argument has no effect.
229
230For the affected platforms, the table below indicates the mitigation status for the
231combinations of CPUID bit MD_CLEAR and IA32_ARCH_CAPABILITIES MSR bits MDS_NO
232and TSX_CTRL_MSR.
233
234 ======= ========= ============= ========================================
235 MDS_NO MD_CLEAR TSX_CTRL_MSR Status
236 ======= ========= ============= ========================================
237 0 0 0 Vulnerable (needs microcode)
238 0 1 0 MDS and TAA mitigated via VERW
239 1 1 0 MDS fixed, TAA vulnerable if TSX enabled
240 because MD_CLEAR has no meaning and
241 VERW is not guaranteed to clear buffers
242 1 X 1 MDS fixed, TAA can be mitigated by
243 VERW or TSX_CTRL_MSR
244 ======= ========= ============= ========================================
245
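
The table can equivalently be written as a small decision function; an
illustrative sketch whose strings mirror the rows above (the helper name is
made up, and bit combinations not listed in the table fall back to the MDS
rows)::

  #include <stdbool.h>
  #include <stdio.h>

  static const char *taa_status(bool mds_no, bool md_clear, bool tsx_ctrl_msr)
  {
      if (!mds_no)
          return md_clear ? "MDS and TAA mitigated via VERW"
                          : "Vulnerable (needs microcode)";
      if (tsx_ctrl_msr)
          return "MDS fixed, TAA can be mitigated by VERW or TSX_CTRL_MSR";
      return "MDS fixed, TAA vulnerable if TSX enabled";
  }

  int main(void)
  {
      printf("%s\n", taa_status(true, true, false));
      return 0;
  }
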
246Mitigation selection guide
247--------------------------
248
2491. Trusted userspace and guests
250^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
251
252If all user space applications are from a trusted source and do not execute
253untrusted code which is supplied externally, then the mitigation can be
254disabled. The same applies to virtualized environments with trusted guests.
255
256
2572. Untrusted userspace and guests
258^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
259
260If there are untrusted applications or guests on the system, enabling TSX
261might allow a malicious actor to leak data from the host or from other
262processes running on the same physical core.
263
264If the microcode is available and TSX is disabled on the host, attacks
265are prevented in a virtualized environment as well, even if the VMs do not
266explicitly enable the mitigation.
267
268
269.. _taa_default_mitigations:
270
271Default mitigations
272-------------------
273
274The kernel's default action for vulnerable processors is:
275
276 - Deploy TSX disable mitigation (tsx_async_abort=full tsx=off).
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a84a83f8881e..8dee8f68fe15 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2055,6 +2055,25 @@
2055 KVM MMU at runtime. 2055 KVM MMU at runtime.
2056 Default is 0 (off) 2056 Default is 0 (off)
2057 2057
2058 kvm.nx_huge_pages=
2059 [KVM] Controls the software workaround for the
2060 X86_BUG_ITLB_MULTIHIT bug.
2061 force : Always deploy workaround.
2062 off : Never deploy workaround.
2063 auto : Deploy workaround based on the presence of
2064 X86_BUG_ITLB_MULTIHIT.
2065
2066 Default is 'auto'.
2067
2068 If the software workaround is enabled for the host,
2069 guests need not enable it for nested guests.
2070
2071 kvm.nx_huge_pages_recovery_ratio=
2072 [KVM] Controls how many 4KiB pages are periodically zapped
2073 back to huge pages. 0 disables the recovery; otherwise, if
2074 the value is N, KVM will zap 1/Nth of the 4KiB pages every
2075 minute. The default is 60.
2076
2058 kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. 2077 kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
2059 Default is 1 (enabled) 2078 Default is 1 (enabled)
2060 2079
@@ -2636,6 +2655,13 @@
2636 ssbd=force-off [ARM64] 2655 ssbd=force-off [ARM64]
2637 l1tf=off [X86] 2656 l1tf=off [X86]
2638 mds=off [X86] 2657 mds=off [X86]
2658 tsx_async_abort=off [X86]
2659 kvm.nx_huge_pages=off [X86]
2660
2661 Exceptions:
2662 This does not have any effect on
2663 kvm.nx_huge_pages when
2664 kvm.nx_huge_pages=force.
2639 2665
2640 auto (default) 2666 auto (default)
2641 Mitigate all CPU vulnerabilities, but leave SMT 2667 Mitigate all CPU vulnerabilities, but leave SMT
@@ -2651,6 +2677,7 @@
2651 be fully mitigated, even if it means losing SMT. 2677 be fully mitigated, even if it means losing SMT.
2652 Equivalent to: l1tf=flush,nosmt [X86] 2678 Equivalent to: l1tf=flush,nosmt [X86]
2653 mds=full,nosmt [X86] 2679 mds=full,nosmt [X86]
2680 tsx_async_abort=full,nosmt [X86]
2654 2681
2655 mminit_loglevel= 2682 mminit_loglevel=
2656 [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this 2683 [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
@@ -4848,6 +4875,71 @@
4848 interruptions from clocksource watchdog are not 4875 interruptions from clocksource watchdog are not
4849 acceptable). 4876 acceptable).
4850 4877
4878 tsx= [X86] Control Transactional Synchronization
4879 Extensions (TSX) feature in Intel processors that
4880 support TSX control.
4881
4882 This parameter controls the TSX feature. The options are:
4883
4884 on - Enable TSX on the system. Although there are
4885 mitigations for all known security vulnerabilities,
4886 TSX has been known to be an accelerator for
4887 several previous speculation-related CVEs, and
4888 so there may be unknown security risks associated
4889 with leaving it enabled.
4890
4891 off - Disable TSX on the system. (Note that this
4892 option takes effect only on newer CPUs which are
4893 not vulnerable to MDS, i.e., have
4894 MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get
4895 the new IA32_TSX_CTRL MSR through a microcode
4896 update. This new MSR allows for the reliable
4897 deactivation of the TSX functionality.)
4898
4899 auto - Disable TSX if X86_BUG_TAA is present,
4900 otherwise enable TSX on the system.
4901
4902 Not specifying this option is equivalent to tsx=off.
4903
4904 See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
4905 for more details.
4906
4907 tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async
4908 Abort (TAA) vulnerability.
4909
4910 Similar to Micro-architectural Data Sampling (MDS),
4911 certain CPUs that support Transactional
4912 Synchronization Extensions (TSX) are vulnerable to an
4913 exploit against CPU internal buffers which can forward
4914 information to a disclosure gadget under certain
4915 conditions.
4916
4917 In vulnerable processors, the speculatively forwarded
4918 data can be used in a cache side channel attack, to
4919 access data to which the attacker does not have direct
4920 access.
4921
4922 This parameter controls the TAA mitigation. The
4923 options are:
4924
4925 full - Enable TAA mitigation on vulnerable CPUs
4926 if TSX is enabled.
4927
4928 full,nosmt - Enable TAA mitigation and disable SMT on
4929 vulnerable CPUs. If TSX is disabled, SMT
4930 is not disabled because the CPU is not
4931 vulnerable to cross-thread TAA attacks.
4932 off - Unconditionally disable TAA mitigation
4933
4934 Not specifying this option is equivalent to
4935 tsx_async_abort=full. On CPUs which are MDS affected
4936 and deploy MDS mitigation, TAA mitigation is not
4937 required and doesn't provide any additional
4938 mitigation.
4939
4940 For details see:
4941 Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
4942
4851 turbografx.map[2|3]= [HW,JOY] 4943 turbografx.map[2|3]= [HW,JOY]
4852 TurboGraFX parallel port interface 4944 TurboGraFX parallel port interface
4853 Format: 4945 Format:
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index af64c4bb4447..a8de2fbc1caa 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -27,6 +27,7 @@ x86-specific Documentation
27 mds 27 mds
28 microcode 28 microcode
29 resctrl_ui 29 resctrl_ui
30 tsx_async_abort
30 usb-legacy-support 31 usb-legacy-support
31 i386/index 32 i386/index
32 x86_64/index 33 x86_64/index
diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst
new file mode 100644
index 000000000000..583ddc185ba2
--- /dev/null
+++ b/Documentation/x86/tsx_async_abort.rst
@@ -0,0 +1,117 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3TSX Async Abort (TAA) mitigation
4================================
5
6.. _tsx_async_abort:
7
8Overview
9--------
10
11TSX Async Abort (TAA) is a side channel attack on internal buffers in some
12Intel processors similar to Microarchitectural Data Sampling (MDS). In this
13case certain loads may speculatively pass invalid data to dependent operations
14when an asynchronous abort condition is pending in a Transactional
15Synchronization Extensions (TSX) transaction. This includes loads with no
16fault or assist condition. Such loads may speculatively expose stale data from
17the same uarch data structures as in MDS, with the same scope of exposure, i.e.
18same-thread and cross-thread. This issue affects all current processors that
19support TSX.
20
21Mitigation strategy
22-------------------
23
24a) TSX disable - one of the mitigations is to disable TSX. A new MSR,
25IA32_TSX_CTRL, is available in future processors and in current processors
26after a microcode update, and can be used to disable TSX. In addition, it
27controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID.
28
29b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this
30vulnerability. More details on this approach can be found in
31:ref:`Documentation/admin-guide/hw-vuln/mds.rst <mds>`.
32
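
Approach b) relies on the VERW instruction, which with the updated microcode
also overwrites the affected buffers when executed with a memory operand
holding a valid, writable segment selector. A hedged user-space sketch of the
idiom (illustration only; the kernel issues this on its own privilege
transitions with its own selector, here the current %ds is simply reused)::

  #include <stdint.h>
  #include <stdio.h>

  static void verw_clear_cpu_buffers(void)
  {
      uint16_t ds;

      /* With MD_CLEAR microcode the memory-operand form of VERW clears the
       * affected CPU buffers; otherwise it is just a segment check. */
      __asm__ volatile("mov %%ds, %0" : "=r" (ds));
      __asm__ volatile("verw %0" : : "m" (ds) : "cc");
  }

  int main(void)
  {
      verw_clear_cpu_buffers();
      puts("executed VERW");
      return 0;
  }
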
33Kernel internal mitigation modes
34--------------------------------
35
36 ============= ============================================================
37 off Mitigation is disabled. Either the CPU is not affected or
38 tsx_async_abort=off is supplied on the kernel command line.
39
40 tsx disabled Mitigation is enabled. TSX feature is disabled by default at
41 bootup on processors that support TSX control.
42
43 verw Mitigation is enabled. CPU is affected and MD_CLEAR is
44 advertised in CPUID.
45
46 ucode needed Mitigation is enabled. CPU is affected and MD_CLEAR is not
47 advertised in CPUID. That is mainly for virtualization
48 scenarios where the host has the updated microcode but the
49 hypervisor does not expose MD_CLEAR in CPUID. It's a best
50 effort approach without guarantee.
51 ============= ============================================================
52
53If the CPU is affected and the "tsx_async_abort" kernel command line parameter is
54not provided then the kernel selects an appropriate mitigation depending on the
55status of RTM and MD_CLEAR CPUID bits.
56
57The tables below indicate the impact of the tsx=on|off|auto cmdline options on the
58state of TAA mitigation, VERW behavior and TSX feature for various combinations of
59MSR_IA32_ARCH_CAPABILITIES bits.
60
611. "tsx=off"
62
63========= ========= ============ ============ ============== =================== ======================
64MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=off
65---------------------------------- -------------------------------------------------------------------------
66TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
67 after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
68========= ========= ============ ============ ============== =================== ======================
69 0 0 0 HW default Yes Same as MDS Same as MDS
70 0 0 1 Invalid case Invalid case Invalid case Invalid case
71 0 1 0 HW default No Need ucode update Need ucode update
72 0 1 1 Disabled Yes TSX disabled TSX disabled
73 1 X 1 Disabled X None needed None needed
74========= ========= ============ ============ ============== =================== ======================
75
762. "tsx=on"
77
78========= ========= ============ ============ ============== =================== ======================
79MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=on
80---------------------------------- -------------------------------------------------------------------------
81TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
82 after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
83========= ========= ============ ============ ============== =================== ======================
84 0 0 0 HW default Yes Same as MDS Same as MDS
85 0 0 1 Invalid case Invalid case Invalid case Invalid case
86 0 1 0 HW default No Need ucode update Need ucode update
87 0 1 1 Enabled Yes None Same as MDS
88 1 X 1 Enabled X None needed None needed
89========= ========= ============ ============ ============== =================== ======================
90
913. "tsx=auto"
92
93========= ========= ============ ============ ============== =================== ======================
94MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=auto
95---------------------------------- -------------------------------------------------------------------------
96TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
97 after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
98========= ========= ============ ============ ============== =================== ======================
99 0 0 0 HW default Yes Same as MDS Same as MDS
100 0 0 1 Invalid case Invalid case Invalid case Invalid case
101 0 1 0 HW default No Need ucode update Need ucode update
102 0 1 1 Disabled Yes TSX disabled TSX disabled
103 1 X 1 Enabled X None needed None needed
104========= ========= ============ ============ ============== =================== ======================
105
106In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that
107indicates whether MSR_IA32_TSX_CTRL is supported.
108
109There are two control bits in IA32_TSX_CTRL MSR:
110
111 Bit 0: When set it disables the Restricted Transactional Memory (RTM)
112 sub-feature of TSX (will force all transactions to abort on the
113 XBEGIN instruction).
114
115 Bit 1: When set it disables the enumeration of the RTM and HLE feature
116 (i.e. it will make CPUID(EAX=7).EBX{bit4} and
117 CPUID(EAX=7).EBX{bit11} read as 0).
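
A hedged sketch for inspecting these two bits from user space via the msr
driver (assumes the msr module is loaded, root privileges, and that
IA32_TSX_CTRL support was confirmed beforehand, e.g. via the TSX_CTRL_MSR
enumeration bit described above)::

  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <unistd.h>

  #define MSR_IA32_TSX_CTRL      0x122
  #define TSX_CTRL_RTM_DISABLE   (1ULL << 0)   /* force RTM transactions to abort */
  #define TSX_CTRL_CPUID_CLEAR   (1ULL << 1)   /* hide RTM/HLE in CPUID */

  int main(void)
  {
      uint64_t ctrl;
      int fd = open("/dev/cpu/0/msr", O_RDONLY);

      if (fd < 0 || pread(fd, &ctrl, sizeof(ctrl), MSR_IA32_TSX_CTRL) != sizeof(ctrl)) {
          perror("IA32_TSX_CTRL");             /* unsupported MSR or no permission */
          return 1;
      }
      printf("RTM_DISABLE=%d CPUID_CLEAR=%d\n",
             !!(ctrl & TSX_CTRL_RTM_DISABLE),
             !!(ctrl & TSX_CTRL_CPUID_CLEAR));
      close(fd);
      return 0;
  }
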
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d6e1faa28c58..8ef85139553f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1940,6 +1940,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
1940 1940
1941 If unsure, say y. 1941 If unsure, say y.
1942 1942
1943choice
1944 prompt "TSX enable mode"
1945 depends on CPU_SUP_INTEL
1946 default X86_INTEL_TSX_MODE_OFF
1947 help
1948 Intel's TSX (Transactional Synchronization Extensions) feature
1949 allows optimizing locking protocols through lock elision, which
1950 can lead to a noticeable performance boost.
1951
1952 On the other hand it has been shown that TSX can be exploited
1953 to form side channel attacks (e.g. TAA) and chances are there
1954 will be more of those attacks discovered in the future.
1955
1956 Therefore TSX is not enabled by default (aka tsx=off). An admin
1957 might override this decision with the tsx=on command line parameter.
1958 Even with TSX enabled, the kernel will attempt to enable the best
1959 possible TAA mitigation setting depending on the microcode available
1960 for the particular machine.
1961
1962 This option allows setting the default tsx mode between tsx=on, =off
1963 and =auto. See Documentation/admin-guide/kernel-parameters.txt for more
1964 details.
1965
1966 Say off if not sure, auto if TSX is in use but should only be used on safe
1967 platforms, or on if TSX is in use and the security aspect of tsx is not
1968 relevant.
1969
1970config X86_INTEL_TSX_MODE_OFF
1971 bool "off"
1972 help
1973 TSX is disabled if possible - equals the tsx=off command line parameter.
1974
1975config X86_INTEL_TSX_MODE_ON
1976 bool "on"
1977 help
1978 TSX is always enabled on TSX capable HW - equals the tsx=on command
1979 line parameter.
1980
1981config X86_INTEL_TSX_MODE_AUTO
1982 bool "auto"
1983 help
1984 TSX is enabled on TSX capable HW that is believed to be safe against
1985 side channel attacks - equals the tsx=auto command line parameter.
1986endchoice
1987
1943config EFI 1988config EFI
1944 bool "EFI runtime service support" 1989 bool "EFI runtime service support"
1945 depends on ACPI 1990 depends on ACPI
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 0652d3eed9bd..c4fbe379cc0b 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -399,5 +399,7 @@
399#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ 399#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
400#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ 400#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */
401#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ 401#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
402#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
403#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
402 404
403#endif /* _ASM_X86_CPUFEATURES_H */ 405#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 24d6598dea29..4fc61483919a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -312,9 +312,12 @@ struct kvm_rmap_head {
312struct kvm_mmu_page { 312struct kvm_mmu_page {
313 struct list_head link; 313 struct list_head link;
314 struct hlist_node hash_link; 314 struct hlist_node hash_link;
315 struct list_head lpage_disallowed_link;
316
315 bool unsync; 317 bool unsync;
316 u8 mmu_valid_gen; 318 u8 mmu_valid_gen;
317 bool mmio_cached; 319 bool mmio_cached;
320 bool lpage_disallowed; /* Can't be replaced by an equiv large page */
318 321
319 /* 322 /*
320 * The following two entries are used to key the shadow page in the 323 * The following two entries are used to key the shadow page in the
@@ -859,6 +862,7 @@ struct kvm_arch {
859 */ 862 */
860 struct list_head active_mmu_pages; 863 struct list_head active_mmu_pages;
861 struct list_head zapped_obsolete_pages; 864 struct list_head zapped_obsolete_pages;
865 struct list_head lpage_disallowed_mmu_pages;
862 struct kvm_page_track_notifier_node mmu_sp_tracker; 866 struct kvm_page_track_notifier_node mmu_sp_tracker;
863 struct kvm_page_track_notifier_head track_notifier_head; 867 struct kvm_page_track_notifier_head track_notifier_head;
864 868
@@ -933,6 +937,7 @@ struct kvm_arch {
933 bool exception_payload_enabled; 937 bool exception_payload_enabled;
934 938
935 struct kvm_pmu_event_filter *pmu_event_filter; 939 struct kvm_pmu_event_filter *pmu_event_filter;
940 struct task_struct *nx_lpage_recovery_thread;
936}; 941};
937 942
938struct kvm_vm_stat { 943struct kvm_vm_stat {
@@ -946,6 +951,7 @@ struct kvm_vm_stat {
946 ulong mmu_unsync; 951 ulong mmu_unsync;
947 ulong remote_tlb_flush; 952 ulong remote_tlb_flush;
948 ulong lpages; 953 ulong lpages;
954 ulong nx_lpage_splits;
949 ulong max_mmu_page_hash_collisions; 955 ulong max_mmu_page_hash_collisions;
950}; 956};
951 957
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 20ce682a2540..6a3124664289 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -93,6 +93,18 @@
93 * Microarchitectural Data 93 * Microarchitectural Data
94 * Sampling (MDS) vulnerabilities. 94 * Sampling (MDS) vulnerabilities.
95 */ 95 */
96#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /*
97 * The processor is not susceptible to a
98 * machine check error due to modifying the
99 * code page size along with either the
100 * physical address or cache type
101 * without TLB invalidation.
102 */
103#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */
104#define ARCH_CAP_TAA_NO BIT(8) /*
105 * Not susceptible to
106 * TSX Async Abort (TAA) vulnerabilities.
107 */
96 108
97#define MSR_IA32_FLUSH_CMD 0x0000010b 109#define MSR_IA32_FLUSH_CMD 0x0000010b
98#define L1D_FLUSH BIT(0) /* 110#define L1D_FLUSH BIT(0) /*
@@ -103,6 +115,10 @@
103#define MSR_IA32_BBL_CR_CTL 0x00000119 115#define MSR_IA32_BBL_CR_CTL 0x00000119
104#define MSR_IA32_BBL_CR_CTL3 0x0000011e 116#define MSR_IA32_BBL_CR_CTL3 0x0000011e
105 117
118#define MSR_IA32_TSX_CTRL 0x00000122
119#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
120#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
121
106#define MSR_IA32_SYSENTER_CS 0x00000174 122#define MSR_IA32_SYSENTER_CS 0x00000174
107#define MSR_IA32_SYSENTER_ESP 0x00000175 123#define MSR_IA32_SYSENTER_ESP 0x00000175
108#define MSR_IA32_SYSENTER_EIP 0x00000176 124#define MSR_IA32_SYSENTER_EIP 0x00000176
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 80bc209c0708..5c24a7b35166 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -314,7 +314,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
314#include <asm/segment.h> 314#include <asm/segment.h>
315 315
316/** 316/**
317 * mds_clear_cpu_buffers - Mitigation for MDS vulnerability 317 * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
318 * 318 *
319 * This uses the otherwise unused and obsolete VERW instruction in 319 * This uses the otherwise unused and obsolete VERW instruction in
320 * combination with microcode which triggers a CPU buffer flush when the 320 * combination with microcode which triggers a CPU buffer flush when the
@@ -337,7 +337,7 @@ static inline void mds_clear_cpu_buffers(void)
337} 337}
338 338
339/** 339/**
340 * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability 340 * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
341 * 341 *
342 * Clear CPU buffers if the corresponding static key is enabled 342 * Clear CPU buffers if the corresponding static key is enabled
343 */ 343 */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6e0a3b43d027..54f5d54280f6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -988,4 +988,11 @@ enum mds_mitigations {
988 MDS_MITIGATION_VMWERV, 988 MDS_MITIGATION_VMWERV,
989}; 989};
990 990
991enum taa_mitigations {
992 TAA_MITIGATION_OFF,
993 TAA_MITIGATION_UCODE_NEEDED,
994 TAA_MITIGATION_VERW,
995 TAA_MITIGATION_TSX_DISABLED,
996};
997
991#endif /* _ASM_X86_PROCESSOR_H */ 998#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d7a1e5a9331c..890f60083eca 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
30obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o 30obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
31 31
32ifdef CONFIG_CPU_SUP_INTEL 32ifdef CONFIG_CPU_SUP_INTEL
33obj-y += intel.o intel_pconfig.o 33obj-y += intel.o intel_pconfig.o tsx.o
34obj-$(CONFIG_PM) += intel_epb.o 34obj-$(CONFIG_PM) += intel_epb.o
35endif 35endif
36obj-$(CONFIG_CPU_SUP_AMD) += amd.o 36obj-$(CONFIG_CPU_SUP_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 91c2561b905f..4c7b0fa15a19 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -39,6 +39,7 @@ static void __init spectre_v2_select_mitigation(void);
39static void __init ssb_select_mitigation(void); 39static void __init ssb_select_mitigation(void);
40static void __init l1tf_select_mitigation(void); 40static void __init l1tf_select_mitigation(void);
41static void __init mds_select_mitigation(void); 41static void __init mds_select_mitigation(void);
42static void __init taa_select_mitigation(void);
42 43
43/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ 44/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
44u64 x86_spec_ctrl_base; 45u64 x86_spec_ctrl_base;
@@ -105,6 +106,7 @@ void __init check_bugs(void)
105 ssb_select_mitigation(); 106 ssb_select_mitigation();
106 l1tf_select_mitigation(); 107 l1tf_select_mitigation();
107 mds_select_mitigation(); 108 mds_select_mitigation();
109 taa_select_mitigation();
108 110
109 arch_smt_update(); 111 arch_smt_update();
110 112
@@ -269,6 +271,100 @@ static int __init mds_cmdline(char *str)
269early_param("mds", mds_cmdline); 271early_param("mds", mds_cmdline);
270 272
271#undef pr_fmt 273#undef pr_fmt
274#define pr_fmt(fmt) "TAA: " fmt
275
276/* Default mitigation for TAA-affected CPUs */
277static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
278static bool taa_nosmt __ro_after_init;
279
280static const char * const taa_strings[] = {
281 [TAA_MITIGATION_OFF] = "Vulnerable",
282 [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
283 [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
284 [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
285};
286
287static void __init taa_select_mitigation(void)
288{
289 u64 ia32_cap;
290
291 if (!boot_cpu_has_bug(X86_BUG_TAA)) {
292 taa_mitigation = TAA_MITIGATION_OFF;
293 return;
294 }
295
296 /* TSX previously disabled by tsx=off */
297 if (!boot_cpu_has(X86_FEATURE_RTM)) {
298 taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
299 goto out;
300 }
301
302 if (cpu_mitigations_off()) {
303 taa_mitigation = TAA_MITIGATION_OFF;
304 return;
305 }
306
307 /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */
308 if (taa_mitigation == TAA_MITIGATION_OFF)
309 goto out;
310
311 if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
312 taa_mitigation = TAA_MITIGATION_VERW;
313 else
314 taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
315
316 /*
317 * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
318 * A microcode update fixes this behavior to clear CPU buffers. It also
319 * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
320 * ARCH_CAP_TSX_CTRL_MSR bit.
321 *
322 * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
323 * update is required.
324 */
325 ia32_cap = x86_read_arch_cap_msr();
326 if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
327 !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
328 taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
329
330 /*
331 * TSX is enabled, select alternate mitigation for TAA which is
332 * the same as MDS. Enable MDS static branch to clear CPU buffers.
333 *
334 * For guests that can't determine whether the correct microcode is
335 * present on host, enable the mitigation for UCODE_NEEDED as well.
336 */
337 static_branch_enable(&mds_user_clear);
338
339 if (taa_nosmt || cpu_mitigations_auto_nosmt())
340 cpu_smt_disable(false);
341
342out:
343 pr_info("%s\n", taa_strings[taa_mitigation]);
344}
345
346static int __init tsx_async_abort_parse_cmdline(char *str)
347{
348 if (!boot_cpu_has_bug(X86_BUG_TAA))
349 return 0;
350
351 if (!str)
352 return -EINVAL;
353
354 if (!strcmp(str, "off")) {
355 taa_mitigation = TAA_MITIGATION_OFF;
356 } else if (!strcmp(str, "full")) {
357 taa_mitigation = TAA_MITIGATION_VERW;
358 } else if (!strcmp(str, "full,nosmt")) {
359 taa_mitigation = TAA_MITIGATION_VERW;
360 taa_nosmt = true;
361 }
362
363 return 0;
364}
365early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
366
367#undef pr_fmt
272#define pr_fmt(fmt) "Spectre V1 : " fmt 368#define pr_fmt(fmt) "Spectre V1 : " fmt
273 369
274enum spectre_v1_mitigation { 370enum spectre_v1_mitigation {
@@ -786,13 +882,10 @@ static void update_mds_branch_idle(void)
786} 882}
787 883
788#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" 884#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
885#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
789 886
790void cpu_bugs_smt_update(void) 887void cpu_bugs_smt_update(void)
791{ 888{
792 /* Enhanced IBRS implies STIBP. No update required. */
793 if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
794 return;
795
796 mutex_lock(&spec_ctrl_mutex); 889 mutex_lock(&spec_ctrl_mutex);
797 890
798 switch (spectre_v2_user) { 891 switch (spectre_v2_user) {
@@ -819,6 +912,17 @@ void cpu_bugs_smt_update(void)
819 break; 912 break;
820 } 913 }
821 914
915 switch (taa_mitigation) {
916 case TAA_MITIGATION_VERW:
917 case TAA_MITIGATION_UCODE_NEEDED:
918 if (sched_smt_active())
919 pr_warn_once(TAA_MSG_SMT);
920 break;
921 case TAA_MITIGATION_TSX_DISABLED:
922 case TAA_MITIGATION_OFF:
923 break;
924 }
925
822 mutex_unlock(&spec_ctrl_mutex); 926 mutex_unlock(&spec_ctrl_mutex);
823} 927}
824 928
@@ -1149,6 +1253,9 @@ void x86_spec_ctrl_setup_ap(void)
1149 x86_amd_ssb_disable(); 1253 x86_amd_ssb_disable();
1150} 1254}
1151 1255
1256bool itlb_multihit_kvm_mitigation;
1257EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
1258
1152#undef pr_fmt 1259#undef pr_fmt
1153#define pr_fmt(fmt) "L1TF: " fmt 1260#define pr_fmt(fmt) "L1TF: " fmt
1154 1261
@@ -1304,11 +1411,24 @@ static ssize_t l1tf_show_state(char *buf)
1304 l1tf_vmx_states[l1tf_vmx_mitigation], 1411 l1tf_vmx_states[l1tf_vmx_mitigation],
1305 sched_smt_active() ? "vulnerable" : "disabled"); 1412 sched_smt_active() ? "vulnerable" : "disabled");
1306} 1413}
1414
1415static ssize_t itlb_multihit_show_state(char *buf)
1416{
1417 if (itlb_multihit_kvm_mitigation)
1418 return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
1419 else
1420 return sprintf(buf, "KVM: Vulnerable\n");
1421}
1307#else 1422#else
1308static ssize_t l1tf_show_state(char *buf) 1423static ssize_t l1tf_show_state(char *buf)
1309{ 1424{
1310 return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); 1425 return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
1311} 1426}
1427
1428static ssize_t itlb_multihit_show_state(char *buf)
1429{
1430 return sprintf(buf, "Processor vulnerable\n");
1431}
1312#endif 1432#endif
1313 1433
1314static ssize_t mds_show_state(char *buf) 1434static ssize_t mds_show_state(char *buf)
@@ -1328,6 +1448,21 @@ static ssize_t mds_show_state(char *buf)
1328 sched_smt_active() ? "vulnerable" : "disabled"); 1448 sched_smt_active() ? "vulnerable" : "disabled");
1329} 1449}
1330 1450
1451static ssize_t tsx_async_abort_show_state(char *buf)
1452{
1453 if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
1454 (taa_mitigation == TAA_MITIGATION_OFF))
1455 return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
1456
1457 if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
1458 return sprintf(buf, "%s; SMT Host state unknown\n",
1459 taa_strings[taa_mitigation]);
1460 }
1461
1462 return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
1463 sched_smt_active() ? "vulnerable" : "disabled");
1464}
1465
1331static char *stibp_state(void) 1466static char *stibp_state(void)
1332{ 1467{
1333 if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) 1468 if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
@@ -1398,6 +1533,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
1398 case X86_BUG_MDS: 1533 case X86_BUG_MDS:
1399 return mds_show_state(buf); 1534 return mds_show_state(buf);
1400 1535
1536 case X86_BUG_TAA:
1537 return tsx_async_abort_show_state(buf);
1538
1539 case X86_BUG_ITLB_MULTIHIT:
1540 return itlb_multihit_show_state(buf);
1541
1401 default: 1542 default:
1402 break; 1543 break;
1403 } 1544 }
@@ -1434,4 +1575,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu
1434{ 1575{
1435 return cpu_show_common(dev, attr, buf, X86_BUG_MDS); 1576 return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
1436} 1577}
1578
1579ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
1580{
1581 return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
1582}
1583
1584ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
1585{
1586 return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
1587}
1437#endif 1588#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9ae7d1bcd4f4..fffe21945374 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1016,13 +1016,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
1016#endif 1016#endif
1017} 1017}
1018 1018
1019#define NO_SPECULATION BIT(0) 1019#define NO_SPECULATION BIT(0)
1020#define NO_MELTDOWN BIT(1) 1020#define NO_MELTDOWN BIT(1)
1021#define NO_SSB BIT(2) 1021#define NO_SSB BIT(2)
1022#define NO_L1TF BIT(3) 1022#define NO_L1TF BIT(3)
1023#define NO_MDS BIT(4) 1023#define NO_MDS BIT(4)
1024#define MSBDS_ONLY BIT(5) 1024#define MSBDS_ONLY BIT(5)
1025#define NO_SWAPGS BIT(6) 1025#define NO_SWAPGS BIT(6)
1026#define NO_ITLB_MULTIHIT BIT(7)
1026 1027
1027#define VULNWL(_vendor, _family, _model, _whitelist) \ 1028#define VULNWL(_vendor, _family, _model, _whitelist) \
1028 { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } 1029 { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -1043,27 +1044,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
1043 VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), 1044 VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
1044 1045
1045 /* Intel Family 6 */ 1046 /* Intel Family 6 */
1046 VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), 1047 VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
1047 VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), 1048 VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
1048 VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), 1049 VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
1049 VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), 1050 VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
1050 VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), 1051 VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
1051 1052
1052 VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), 1053 VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1053 VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), 1054 VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1054 VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), 1055 VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1055 VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), 1056 VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1056 VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), 1057 VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1057 VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), 1058 VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1058 1059
1059 VULNWL_INTEL(CORE_YONAH, NO_SSB), 1060 VULNWL_INTEL(CORE_YONAH, NO_SSB),
1060 1061
1061 VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), 1062 VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1062 VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS), 1063 VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
1063 1064
1064 VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), 1065 VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
1065 VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS), 1066 VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
1066 VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), 1067 VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
1067 1068
1068 /* 1069 /*
1069 * Technically, swapgs isn't serializing on AMD (despite it previously 1070 * Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1073,15 +1074,17 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
1073 * good enough for our purposes. 1074 * good enough for our purposes.
1074 */ 1075 */
1075 1076
1077 VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT),
1078
1076 /* AMD Family 0xf - 0x12 */ 1079 /* AMD Family 0xf - 0x12 */
1077 VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), 1080 VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1078 VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), 1081 VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1079 VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), 1082 VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1080 VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), 1083 VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1081 1084
1082 /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ 1085 /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
1083 VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), 1086 VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1084 VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), 1087 VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1085 {} 1088 {}
1086}; 1089};
1087 1090
@@ -1092,19 +1095,30 @@ static bool __init cpu_matches(unsigned long which)
1092 return m && !!(m->driver_data & which); 1095 return m && !!(m->driver_data & which);
1093} 1096}
1094 1097
1095static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) 1098u64 x86_read_arch_cap_msr(void)
1096{ 1099{
1097 u64 ia32_cap = 0; 1100 u64 ia32_cap = 0;
1098 1101
1102 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
1103 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
1104
1105 return ia32_cap;
1106}
1107
1108static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
1109{
1110 u64 ia32_cap = x86_read_arch_cap_msr();
1111
1112 /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
1113 if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
1114 setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
1115
1099 if (cpu_matches(NO_SPECULATION)) 1116 if (cpu_matches(NO_SPECULATION))
1100 return; 1117 return;
1101 1118
1102 setup_force_cpu_bug(X86_BUG_SPECTRE_V1); 1119 setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
1103 setup_force_cpu_bug(X86_BUG_SPECTRE_V2); 1120 setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
1104 1121
1105 if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
1106 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
1107
1108 if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && 1122 if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
1109 !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) 1123 !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
1110 setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); 1124 setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
@@ -1121,6 +1135,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
1121 if (!cpu_matches(NO_SWAPGS)) 1135 if (!cpu_matches(NO_SWAPGS))
1122 setup_force_cpu_bug(X86_BUG_SWAPGS); 1136 setup_force_cpu_bug(X86_BUG_SWAPGS);
1123 1137
1138 /*
 1139 * When the CPU is not mitigated for TAA (TAA_NO=0), set the TAA bug when:
1140 * - TSX is supported or
1141 * - TSX_CTRL is present
1142 *
1143 * TSX_CTRL check is needed for cases when TSX could be disabled before
1144 * the kernel boot e.g. kexec.
 1145 * The TSX_CTRL check alone is not sufficient when the microcode update
 1146 * is not present, or when running as a guest that doesn't get TSX_CTRL.
1147 */
1148 if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
1149 (cpu_has(c, X86_FEATURE_RTM) ||
1150 (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
1151 setup_force_cpu_bug(X86_BUG_TAA);
1152
1124 if (cpu_matches(NO_MELTDOWN)) 1153 if (cpu_matches(NO_MELTDOWN))
1125 return; 1154 return;
1126 1155
@@ -1554,6 +1583,8 @@ void __init identify_boot_cpu(void)
1554#endif 1583#endif
1555 cpu_detect_tlb(&boot_cpu_data); 1584 cpu_detect_tlb(&boot_cpu_data);
1556 setup_cr_pinning(); 1585 setup_cr_pinning();
1586
1587 tsx_init();
1557} 1588}
1558 1589
1559void identify_secondary_cpu(struct cpuinfo_x86 *c) 1590void identify_secondary_cpu(struct cpuinfo_x86 *c)
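
For reference, the TAA detection added to cpu_set_bug_bits() above reduces to a small predicate over the ARCH_CAPABILITIES bits and the RTM CPUID flag. The sketch below is a minimal userspace restatement of that logic, not kernel code; the three boolean inputs stand in for ARCH_CAP_TAA_NO, X86_FEATURE_RTM and ARCH_CAP_TSX_CTRL_MSR.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the X86_BUG_TAA condition in cpu_set_bug_bits(). */
static bool cpu_has_taa_bug(bool taa_no, bool has_rtm, bool has_tsx_ctrl_msr)
{
    /* Hardware says it is not affected: no bug. */
    if (taa_no)
        return false;

    /*
     * Affected if TSX is currently enumerated, or if the TSX_CTRL MSR
     * exists (TSX may have been turned off earlier, e.g. before kexec).
     */
    return has_rtm || has_tsx_ctrl_msr;
}

int main(void)
{
    printf("TAA_NO=0, RTM=1, TSX_CTRL=0 -> %d\n", cpu_has_taa_bug(false, true, false));
    printf("TAA_NO=1, RTM=1, TSX_CTRL=1 -> %d\n", cpu_has_taa_bug(true, true, true));
    return 0;
}
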
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index c0e2407abdd6..38ab6e115eac 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -44,6 +44,22 @@ struct _tlb_table {
44extern const struct cpu_dev *const __x86_cpu_dev_start[], 44extern const struct cpu_dev *const __x86_cpu_dev_start[],
45 *const __x86_cpu_dev_end[]; 45 *const __x86_cpu_dev_end[];
46 46
47#ifdef CONFIG_CPU_SUP_INTEL
48enum tsx_ctrl_states {
49 TSX_CTRL_ENABLE,
50 TSX_CTRL_DISABLE,
51 TSX_CTRL_NOT_SUPPORTED,
52};
53
54extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
55
56extern void __init tsx_init(void);
57extern void tsx_enable(void);
58extern void tsx_disable(void);
59#else
60static inline void tsx_init(void) { }
61#endif /* CONFIG_CPU_SUP_INTEL */
62
47extern void get_cpu_cap(struct cpuinfo_x86 *c); 63extern void get_cpu_cap(struct cpuinfo_x86 *c);
48extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); 64extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
49extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 65extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
@@ -62,4 +78,6 @@ unsigned int aperfmperf_get_khz(int cpu);
62 78
63extern void x86_spec_ctrl_setup_ap(void); 79extern void x86_spec_ctrl_setup_ap(void);
64 80
81extern u64 x86_read_arch_cap_msr(void);
82
65#endif /* ARCH_X86_CPU_H */ 83#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index c2fdc00df163..11d5c5950e2d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -762,6 +762,11 @@ static void init_intel(struct cpuinfo_x86 *c)
762 detect_tme(c); 762 detect_tme(c);
763 763
764 init_intel_misc_features(c); 764 init_intel_misc_features(c);
765
766 if (tsx_ctrl_state == TSX_CTRL_ENABLE)
767 tsx_enable();
768 if (tsx_ctrl_state == TSX_CTRL_DISABLE)
769 tsx_disable();
765} 770}
766 771
767#ifdef CONFIG_X86_32 772#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000000..3e20d322bc98
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,140 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Intel Transactional Synchronization Extensions (TSX) control.
4 *
5 * Copyright (C) 2019 Intel Corporation
6 *
7 * Author:
8 * Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
9 */
10
11#include <linux/cpufeature.h>
12
13#include <asm/cmdline.h>
14
15#include "cpu.h"
16
17enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
18
19void tsx_disable(void)
20{
21 u64 tsx;
22
23 rdmsrl(MSR_IA32_TSX_CTRL, tsx);
24
25 /* Force all transactions to immediately abort */
26 tsx |= TSX_CTRL_RTM_DISABLE;
27
28 /*
29 * Ensure TSX support is not enumerated in CPUID.
30 * This is visible to userspace and will ensure they
31 * do not waste resources trying TSX transactions that
32 * will always abort.
33 */
34 tsx |= TSX_CTRL_CPUID_CLEAR;
35
36 wrmsrl(MSR_IA32_TSX_CTRL, tsx);
37}
38
39void tsx_enable(void)
40{
41 u64 tsx;
42
43 rdmsrl(MSR_IA32_TSX_CTRL, tsx);
44
45 /* Enable the RTM feature in the cpu */
46 tsx &= ~TSX_CTRL_RTM_DISABLE;
47
48 /*
49 * Ensure TSX support is enumerated in CPUID.
50 * This is visible to userspace and will ensure they
51 * can enumerate and use the TSX feature.
52 */
53 tsx &= ~TSX_CTRL_CPUID_CLEAR;
54
55 wrmsrl(MSR_IA32_TSX_CTRL, tsx);
56}
57
58static bool __init tsx_ctrl_is_supported(void)
59{
60 u64 ia32_cap = x86_read_arch_cap_msr();
61
62 /*
63 * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
64 * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
65 *
66 * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
67 * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
68 * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
69 * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
70 * tsx= cmdline requests will do nothing on CPUs without
71 * MSR_IA32_TSX_CTRL support.
72 */
73 return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
74}
75
76static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
77{
78 if (boot_cpu_has_bug(X86_BUG_TAA))
79 return TSX_CTRL_DISABLE;
80
81 return TSX_CTRL_ENABLE;
82}
83
84void __init tsx_init(void)
85{
86 char arg[5] = {};
87 int ret;
88
89 if (!tsx_ctrl_is_supported())
90 return;
91
92 ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
93 if (ret >= 0) {
94 if (!strcmp(arg, "on")) {
95 tsx_ctrl_state = TSX_CTRL_ENABLE;
96 } else if (!strcmp(arg, "off")) {
97 tsx_ctrl_state = TSX_CTRL_DISABLE;
98 } else if (!strcmp(arg, "auto")) {
99 tsx_ctrl_state = x86_get_tsx_auto_mode();
100 } else {
101 tsx_ctrl_state = TSX_CTRL_DISABLE;
102 pr_err("tsx: invalid option, defaulting to off\n");
103 }
104 } else {
105 /* tsx= not provided */
106 if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
107 tsx_ctrl_state = x86_get_tsx_auto_mode();
108 else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
109 tsx_ctrl_state = TSX_CTRL_DISABLE;
110 else
111 tsx_ctrl_state = TSX_CTRL_ENABLE;
112 }
113
114 if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
115 tsx_disable();
116
117 /*
118 * tsx_disable() will change the state of the
119 * RTM CPUID bit. Clear it here since it is now
120 * expected to be not set.
121 */
122 setup_clear_cpu_cap(X86_FEATURE_RTM);
123 } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
124
125 /*
126 * HW defaults TSX to be enabled at bootup.
127 * We may still need the TSX enable support
128 * during init for special cases like
129 * kexec after TSX is disabled.
130 */
131 tsx_enable();
132
133 /*
134 * tsx_enable() will change the state of the
135 * RTM CPUID bit. Force it here since it is now
136 * expected to be set.
137 */
138 setup_force_cpu_cap(X86_FEATURE_RTM);
139 }
140}
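
The decision tree in tsx_init() above (command-line override first, then the compile-time default, with "auto" keyed off X86_BUG_TAA) can be restated as a small pure function. This is a hedged illustration only, not the kernel code; it assumes MSR_IA32_TSX_CTRL is available, since tsx_init() bails out early otherwise.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

enum tsx_state { TSX_ENABLE, TSX_DISABLE };

static enum tsx_state tsx_policy(const char *arg, bool kconfig_auto,
                                 bool kconfig_off, bool has_taa_bug)
{
    enum tsx_state auto_mode = has_taa_bug ? TSX_DISABLE : TSX_ENABLE;

    if (arg) {
        if (!strcmp(arg, "on"))
            return TSX_ENABLE;
        if (!strcmp(arg, "off"))
            return TSX_DISABLE;
        if (!strcmp(arg, "auto"))
            return auto_mode;
        return TSX_DISABLE;    /* unknown option: fail safe, like the pr_err() path */
    }

    /* No tsx= given: fall back to the compile-time default. */
    if (kconfig_auto)
        return auto_mode;
    return kconfig_off ? TSX_DISABLE : TSX_ENABLE;
}

int main(void)
{
    printf("tsx=auto on a TAA-affected CPU -> %s\n",
           tsx_policy("auto", false, false, true) == TSX_DISABLE ? "disable" : "enable");
    return 0;
}
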
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bf82b1f2e834..fd6012eef9c9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,7 @@
37#include <linux/uaccess.h> 37#include <linux/uaccess.h>
38#include <linux/hash.h> 38#include <linux/hash.h>
39#include <linux/kern_levels.h> 39#include <linux/kern_levels.h>
40#include <linux/kthread.h>
40 41
41#include <asm/page.h> 42#include <asm/page.h>
42#include <asm/pat.h> 43#include <asm/pat.h>
@@ -47,6 +48,30 @@
47#include <asm/kvm_page_track.h> 48#include <asm/kvm_page_track.h>
48#include "trace.h" 49#include "trace.h"
49 50
51extern bool itlb_multihit_kvm_mitigation;
52
53static int __read_mostly nx_huge_pages = -1;
54static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
55
56static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
57static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
58
59static struct kernel_param_ops nx_huge_pages_ops = {
60 .set = set_nx_huge_pages,
61 .get = param_get_bool,
62};
63
64static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
65 .set = set_nx_huge_pages_recovery_ratio,
66 .get = param_get_uint,
67};
68
69module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
70__MODULE_PARM_TYPE(nx_huge_pages, "bool");
71module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
72 &nx_huge_pages_recovery_ratio, 0644);
73__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
74
50/* 75/*
51 * When setting this variable to true it enables Two-Dimensional-Paging 76 * When setting this variable to true it enables Two-Dimensional-Paging
52 * where the hardware walks 2 page tables: 77 * where the hardware walks 2 page tables:
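
Because nx_huge_pages and nx_huge_pages_recovery_ratio are registered with module_param_cb() and mode 0644, they show up under /sys/module/kvm/parameters/ and can be inspected (or, as root, changed) at runtime. A minimal userspace read, assuming the kvm module is loaded:

#include <stdio.h>

int main(void)
{
    char buf[16] = "";
    FILE *f = fopen("/sys/module/kvm/parameters/nx_huge_pages", "r");

    if (!f) {
        perror("fopen");   /* kvm not loaded, or an older kernel */
        return 1;
    }
    if (fgets(buf, sizeof(buf), f))
        printf("nx_huge_pages = %s", buf);
    fclose(f);
    return 0;
}
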
@@ -352,6 +377,11 @@ static inline bool spte_ad_need_write_protect(u64 spte)
352 return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; 377 return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
353} 378}
354 379
380static bool is_nx_huge_page_enabled(void)
381{
382 return READ_ONCE(nx_huge_pages);
383}
384
355static inline u64 spte_shadow_accessed_mask(u64 spte) 385static inline u64 spte_shadow_accessed_mask(u64 spte)
356{ 386{
357 MMU_WARN_ON(is_mmio_spte(spte)); 387 MMU_WARN_ON(is_mmio_spte(spte));
@@ -1190,6 +1220,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1190 kvm_mmu_gfn_disallow_lpage(slot, gfn); 1220 kvm_mmu_gfn_disallow_lpage(slot, gfn);
1191} 1221}
1192 1222
1223static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1224{
1225 if (sp->lpage_disallowed)
1226 return;
1227
1228 ++kvm->stat.nx_lpage_splits;
1229 list_add_tail(&sp->lpage_disallowed_link,
1230 &kvm->arch.lpage_disallowed_mmu_pages);
1231 sp->lpage_disallowed = true;
1232}
1233
1193static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) 1234static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1194{ 1235{
1195 struct kvm_memslots *slots; 1236 struct kvm_memslots *slots;
@@ -1207,6 +1248,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1207 kvm_mmu_gfn_allow_lpage(slot, gfn); 1248 kvm_mmu_gfn_allow_lpage(slot, gfn);
1208} 1249}
1209 1250
1251static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1252{
1253 --kvm->stat.nx_lpage_splits;
1254 sp->lpage_disallowed = false;
1255 list_del(&sp->lpage_disallowed_link);
1256}
1257
1210static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, 1258static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1211 struct kvm_memory_slot *slot) 1259 struct kvm_memory_slot *slot)
1212{ 1260{
@@ -2792,6 +2840,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2792 kvm_reload_remote_mmus(kvm); 2840 kvm_reload_remote_mmus(kvm);
2793 } 2841 }
2794 2842
2843 if (sp->lpage_disallowed)
2844 unaccount_huge_nx_page(kvm, sp);
2845
2795 sp->role.invalid = 1; 2846 sp->role.invalid = 1;
2796 return list_unstable; 2847 return list_unstable;
2797} 2848}
@@ -3013,6 +3064,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
3013 if (!speculative) 3064 if (!speculative)
3014 spte |= spte_shadow_accessed_mask(spte); 3065 spte |= spte_shadow_accessed_mask(spte);
3015 3066
3067 if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
3068 is_nx_huge_page_enabled()) {
3069 pte_access &= ~ACC_EXEC_MASK;
3070 }
3071
3016 if (pte_access & ACC_EXEC_MASK) 3072 if (pte_access & ACC_EXEC_MASK)
3017 spte |= shadow_x_mask; 3073 spte |= shadow_x_mask;
3018 else 3074 else
@@ -3233,9 +3289,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3233 __direct_pte_prefetch(vcpu, sp, sptep); 3289 __direct_pte_prefetch(vcpu, sp, sptep);
3234} 3290}
3235 3291
3292static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
3293 gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
3294{
3295 int level = *levelp;
3296 u64 spte = *it.sptep;
3297
3298 if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
3299 is_nx_huge_page_enabled() &&
3300 is_shadow_present_pte(spte) &&
3301 !is_large_pte(spte)) {
3302 /*
3303 * A small SPTE exists for this pfn, but FNAME(fetch)
3304 * and __direct_map would like to create a large PTE
3305 * instead: just force them to go down another level,
 3306 * patching the next 9 bits of the address back into
 3307 * pfn for them.
3308 */
3309 u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
3310 *pfnp |= gfn & page_mask;
3311 (*levelp)--;
3312 }
3313}
3314
3236static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, 3315static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3237 int map_writable, int level, kvm_pfn_t pfn, 3316 int map_writable, int level, kvm_pfn_t pfn,
3238 bool prefault) 3317 bool prefault, bool lpage_disallowed)
3239{ 3318{
3240 struct kvm_shadow_walk_iterator it; 3319 struct kvm_shadow_walk_iterator it;
3241 struct kvm_mmu_page *sp; 3320 struct kvm_mmu_page *sp;
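
The page_mask computation in disallowed_hugepage_adjust() is just the difference between the page counts of adjacent mapping levels, i.e. the "next 9 bits" mentioned in the comment. A standalone illustration with 4KiB base pages (the values are hypothetical and only show the arithmetic):

#include <stdio.h>
#include <stdint.h>

static uint64_t pages_per_hpage(int level)
{
    /* level 1 = 4KiB, level 2 = 2MiB, level 3 = 1GiB */
    return 1ULL << ((level - 1) * 9);
}

int main(void)
{
    int level = 2;    /* dropping from a 2MiB mapping to 4KiB */
    uint64_t mask = pages_per_hpage(level) - pages_per_hpage(level - 1);

    /* 0x1ff: the next 9 bits of the gfn get folded back into the pfn. */
    printf("page_mask for level %d = 0x%llx\n", level,
           (unsigned long long)mask);
    return 0;
}
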
@@ -3248,6 +3327,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3248 3327
3249 trace_kvm_mmu_spte_requested(gpa, level, pfn); 3328 trace_kvm_mmu_spte_requested(gpa, level, pfn);
3250 for_each_shadow_entry(vcpu, gpa, it) { 3329 for_each_shadow_entry(vcpu, gpa, it) {
3330 /*
3331 * We cannot overwrite existing page tables with an NX
3332 * large page, as the leaf could be executable.
3333 */
3334 disallowed_hugepage_adjust(it, gfn, &pfn, &level);
3335
3251 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); 3336 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3252 if (it.level == level) 3337 if (it.level == level)
3253 break; 3338 break;
@@ -3258,6 +3343,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3258 it.level - 1, true, ACC_ALL); 3343 it.level - 1, true, ACC_ALL);
3259 3344
3260 link_shadow_page(vcpu, it.sptep, sp); 3345 link_shadow_page(vcpu, it.sptep, sp);
3346 if (lpage_disallowed)
3347 account_huge_nx_page(vcpu->kvm, sp);
3261 } 3348 }
3262 } 3349 }
3263 3350
@@ -3550,11 +3637,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3550{ 3637{
3551 int r; 3638 int r;
3552 int level; 3639 int level;
3553 bool force_pt_level = false; 3640 bool force_pt_level;
3554 kvm_pfn_t pfn; 3641 kvm_pfn_t pfn;
3555 unsigned long mmu_seq; 3642 unsigned long mmu_seq;
3556 bool map_writable, write = error_code & PFERR_WRITE_MASK; 3643 bool map_writable, write = error_code & PFERR_WRITE_MASK;
3644 bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
3645 is_nx_huge_page_enabled();
3557 3646
3647 force_pt_level = lpage_disallowed;
3558 level = mapping_level(vcpu, gfn, &force_pt_level); 3648 level = mapping_level(vcpu, gfn, &force_pt_level);
3559 if (likely(!force_pt_level)) { 3649 if (likely(!force_pt_level)) {
3560 /* 3650 /*
@@ -3588,7 +3678,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3588 goto out_unlock; 3678 goto out_unlock;
3589 if (likely(!force_pt_level)) 3679 if (likely(!force_pt_level))
3590 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); 3680 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
3591 r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); 3681 r = __direct_map(vcpu, v, write, map_writable, level, pfn,
3682 prefault, false);
3592out_unlock: 3683out_unlock:
3593 spin_unlock(&vcpu->kvm->mmu_lock); 3684 spin_unlock(&vcpu->kvm->mmu_lock);
3594 kvm_release_pfn_clean(pfn); 3685 kvm_release_pfn_clean(pfn);
@@ -4174,6 +4265,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4174 unsigned long mmu_seq; 4265 unsigned long mmu_seq;
4175 int write = error_code & PFERR_WRITE_MASK; 4266 int write = error_code & PFERR_WRITE_MASK;
4176 bool map_writable; 4267 bool map_writable;
4268 bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
4269 is_nx_huge_page_enabled();
4177 4270
4178 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); 4271 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4179 4272
@@ -4184,8 +4277,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4184 if (r) 4277 if (r)
4185 return r; 4278 return r;
4186 4279
4187 force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, 4280 force_pt_level =
4188 PT_DIRECTORY_LEVEL); 4281 lpage_disallowed ||
4282 !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
4189 level = mapping_level(vcpu, gfn, &force_pt_level); 4283 level = mapping_level(vcpu, gfn, &force_pt_level);
4190 if (likely(!force_pt_level)) { 4284 if (likely(!force_pt_level)) {
4191 if (level > PT_DIRECTORY_LEVEL && 4285 if (level > PT_DIRECTORY_LEVEL &&
@@ -4214,7 +4308,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4214 goto out_unlock; 4308 goto out_unlock;
4215 if (likely(!force_pt_level)) 4309 if (likely(!force_pt_level))
4216 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); 4310 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
4217 r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); 4311 r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
4312 prefault, lpage_disallowed);
4218out_unlock: 4313out_unlock:
4219 spin_unlock(&vcpu->kvm->mmu_lock); 4314 spin_unlock(&vcpu->kvm->mmu_lock);
4220 kvm_release_pfn_clean(pfn); 4315 kvm_release_pfn_clean(pfn);
@@ -6155,10 +6250,60 @@ static void kvm_set_mmio_spte_mask(void)
6155 kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); 6250 kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
6156} 6251}
6157 6252
6253static bool get_nx_auto_mode(void)
6254{
6255 /* Return true when CPU has the bug, and mitigations are ON */
6256 return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6257}
6258
6259static void __set_nx_huge_pages(bool val)
6260{
6261 nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6262}
6263
6264static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6265{
6266 bool old_val = nx_huge_pages;
6267 bool new_val;
6268
 6269 /* In "auto" mode, deploy the workaround only if the CPU has the bug. */
6270 if (sysfs_streq(val, "off"))
6271 new_val = 0;
6272 else if (sysfs_streq(val, "force"))
6273 new_val = 1;
6274 else if (sysfs_streq(val, "auto"))
6275 new_val = get_nx_auto_mode();
6276 else if (strtobool(val, &new_val) < 0)
6277 return -EINVAL;
6278
6279 __set_nx_huge_pages(new_val);
6280
6281 if (new_val != old_val) {
6282 struct kvm *kvm;
6283 int idx;
6284
6285 mutex_lock(&kvm_lock);
6286
6287 list_for_each_entry(kvm, &vm_list, vm_list) {
6288 idx = srcu_read_lock(&kvm->srcu);
6289 kvm_mmu_zap_all_fast(kvm);
6290 srcu_read_unlock(&kvm->srcu, idx);
6291
6292 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6293 }
6294 mutex_unlock(&kvm_lock);
6295 }
6296
6297 return 0;
6298}
6299
6158int kvm_mmu_module_init(void) 6300int kvm_mmu_module_init(void)
6159{ 6301{
6160 int ret = -ENOMEM; 6302 int ret = -ENOMEM;
6161 6303
6304 if (nx_huge_pages == -1)
6305 __set_nx_huge_pages(get_nx_auto_mode());
6306
6162 /* 6307 /*
6163 * MMU roles use union aliasing which is, generally speaking, an 6308 * MMU roles use union aliasing which is, generally speaking, an
6164 * undefined behavior. However, we supposedly know how compilers behave 6309 * undefined behavior. However, we supposedly know how compilers behave
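
set_nx_huge_pages() accepts "off", "force", "auto", or an ordinary boolean spelling; "auto" only turns the workaround on when the CPU is affected and mitigations are not globally disabled. A simplified restatement follows (it uses plain strcmp instead of sysfs_streq/strtobool, so it is a sketch rather than the exact kernel behaviour):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static int parse_nx_huge_pages(const char *val, bool cpu_has_bug,
                               bool mitigations_off, bool *out)
{
    if (!strcmp(val, "off"))
        *out = false;
    else if (!strcmp(val, "force"))
        *out = true;
    else if (!strcmp(val, "auto"))
        *out = cpu_has_bug && !mitigations_off;    /* get_nx_auto_mode() */
    else if (!strcmp(val, "1") || !strcmp(val, "y") || !strcmp(val, "Y"))
        *out = true;
    else if (!strcmp(val, "0") || !strcmp(val, "n") || !strcmp(val, "N"))
        *out = false;
    else
        return -1;    /* -EINVAL in the kernel */
    return 0;
}

int main(void)
{
    bool v;

    if (!parse_nx_huge_pages("auto", true, false, &v))
        printf("auto on an affected CPU -> %d\n", v);
    return 0;
}
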
@@ -6238,3 +6383,116 @@ void kvm_mmu_module_exit(void)
6238 unregister_shrinker(&mmu_shrinker); 6383 unregister_shrinker(&mmu_shrinker);
6239 mmu_audit_disable(); 6384 mmu_audit_disable();
6240} 6385}
6386
6387static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
6388{
6389 unsigned int old_val;
6390 int err;
6391
6392 old_val = nx_huge_pages_recovery_ratio;
6393 err = param_set_uint(val, kp);
6394 if (err)
6395 return err;
6396
6397 if (READ_ONCE(nx_huge_pages) &&
6398 !old_val && nx_huge_pages_recovery_ratio) {
6399 struct kvm *kvm;
6400
6401 mutex_lock(&kvm_lock);
6402
6403 list_for_each_entry(kvm, &vm_list, vm_list)
6404 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6405
6406 mutex_unlock(&kvm_lock);
6407 }
6408
6409 return err;
6410}
6411
6412static void kvm_recover_nx_lpages(struct kvm *kvm)
6413{
6414 int rcu_idx;
6415 struct kvm_mmu_page *sp;
6416 unsigned int ratio;
6417 LIST_HEAD(invalid_list);
6418 ulong to_zap;
6419
6420 rcu_idx = srcu_read_lock(&kvm->srcu);
6421 spin_lock(&kvm->mmu_lock);
6422
6423 ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6424 to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
6425 while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
6426 /*
6427 * We use a separate list instead of just using active_mmu_pages
6428 * because the number of lpage_disallowed pages is expected to
6429 * be relatively small compared to the total.
6430 */
6431 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6432 struct kvm_mmu_page,
6433 lpage_disallowed_link);
6434 WARN_ON_ONCE(!sp->lpage_disallowed);
6435 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6436 WARN_ON_ONCE(sp->lpage_disallowed);
6437
6438 if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
6439 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6440 if (to_zap)
6441 cond_resched_lock(&kvm->mmu_lock);
6442 }
6443 }
6444
6445 spin_unlock(&kvm->mmu_lock);
6446 srcu_read_unlock(&kvm->srcu, rcu_idx);
6447}
6448
6449static long get_nx_lpage_recovery_timeout(u64 start_time)
6450{
6451 return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
6452 ? start_time + 60 * HZ - get_jiffies_64()
6453 : MAX_SCHEDULE_TIMEOUT;
6454}
6455
6456static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6457{
6458 u64 start_time;
6459 long remaining_time;
6460
6461 while (true) {
6462 start_time = get_jiffies_64();
6463 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6464
6465 set_current_state(TASK_INTERRUPTIBLE);
6466 while (!kthread_should_stop() && remaining_time > 0) {
6467 schedule_timeout(remaining_time);
6468 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6469 set_current_state(TASK_INTERRUPTIBLE);
6470 }
6471
6472 set_current_state(TASK_RUNNING);
6473
6474 if (kthread_should_stop())
6475 return 0;
6476
6477 kvm_recover_nx_lpages(kvm);
6478 }
6479}
6480
6481int kvm_mmu_post_init_vm(struct kvm *kvm)
6482{
6483 int err;
6484
6485 err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6486 "kvm-nx-lpage-recovery",
6487 &kvm->arch.nx_lpage_recovery_thread);
6488 if (!err)
6489 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6490
6491 return err;
6492}
6493
6494void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6495{
6496 if (kvm->arch.nx_lpage_recovery_thread)
6497 kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6498}
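
Taken together, kvm_nx_lpage_recovery_worker() wakes roughly once a minute (60 * HZ) and kvm_recover_nx_lpages() zaps about 1/ratio of the currently split huge pages per wakeup. With the default ratio of 60, an instance that has accumulated a hypothetical 10,000 splits reclaims about 167 shadow pages per pass:

#include <stdio.h>

int main(void)
{
    unsigned long splits = 10000;    /* hypothetical nx_lpage_splits value */
    unsigned int ratio = 60;         /* default nx_huge_pages_recovery_ratio */
    unsigned long to_zap = (splits + ratio - 1) / ratio;    /* DIV_ROUND_UP */

    /* ~167 shadow pages reclaimed per wakeup, i.e. per minute. */
    printf("to_zap per wakeup = %lu\n", to_zap);
    return 0;
}
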
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 11f8ec89433b..d55674f44a18 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -210,4 +210,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
210bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, 210bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
211 struct kvm_memory_slot *slot, u64 gfn); 211 struct kvm_memory_slot *slot, u64 gfn);
212int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); 212int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
213
214int kvm_mmu_post_init_vm(struct kvm *kvm);
215void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
216
213#endif 217#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7d5cdb3af594..97b21e7fd013 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -614,13 +614,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
614static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 614static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
615 struct guest_walker *gw, 615 struct guest_walker *gw,
616 int write_fault, int hlevel, 616 int write_fault, int hlevel,
617 kvm_pfn_t pfn, bool map_writable, bool prefault) 617 kvm_pfn_t pfn, bool map_writable, bool prefault,
618 bool lpage_disallowed)
618{ 619{
619 struct kvm_mmu_page *sp = NULL; 620 struct kvm_mmu_page *sp = NULL;
620 struct kvm_shadow_walk_iterator it; 621 struct kvm_shadow_walk_iterator it;
621 unsigned direct_access, access = gw->pt_access; 622 unsigned direct_access, access = gw->pt_access;
622 int top_level, ret; 623 int top_level, ret;
623 gfn_t base_gfn; 624 gfn_t gfn, base_gfn;
624 625
625 direct_access = gw->pte_access; 626 direct_access = gw->pte_access;
626 627
@@ -665,13 +666,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
665 link_shadow_page(vcpu, it.sptep, sp); 666 link_shadow_page(vcpu, it.sptep, sp);
666 } 667 }
667 668
668 base_gfn = gw->gfn; 669 /*
670 * FNAME(page_fault) might have clobbered the bottom bits of
671 * gw->gfn, restore them from the virtual address.
672 */
673 gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
674 base_gfn = gfn;
669 675
670 trace_kvm_mmu_spte_requested(addr, gw->level, pfn); 676 trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
671 677
672 for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { 678 for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
673 clear_sp_write_flooding_count(it.sptep); 679 clear_sp_write_flooding_count(it.sptep);
674 base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); 680
681 /*
682 * We cannot overwrite existing page tables with an NX
683 * large page, as the leaf could be executable.
684 */
685 disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
686
687 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
675 if (it.level == hlevel) 688 if (it.level == hlevel)
676 break; 689 break;
677 690
@@ -683,6 +696,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
683 sp = kvm_mmu_get_page(vcpu, base_gfn, addr, 696 sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
684 it.level - 1, true, direct_access); 697 it.level - 1, true, direct_access);
685 link_shadow_page(vcpu, it.sptep, sp); 698 link_shadow_page(vcpu, it.sptep, sp);
699 if (lpage_disallowed)
700 account_huge_nx_page(vcpu->kvm, sp);
686 } 701 }
687 } 702 }
688 703
@@ -759,9 +774,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
759 int r; 774 int r;
760 kvm_pfn_t pfn; 775 kvm_pfn_t pfn;
761 int level = PT_PAGE_TABLE_LEVEL; 776 int level = PT_PAGE_TABLE_LEVEL;
762 bool force_pt_level = false;
763 unsigned long mmu_seq; 777 unsigned long mmu_seq;
764 bool map_writable, is_self_change_mapping; 778 bool map_writable, is_self_change_mapping;
779 bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
780 is_nx_huge_page_enabled();
781 bool force_pt_level = lpage_disallowed;
765 782
766 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 783 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
767 784
@@ -851,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
851 if (!force_pt_level) 868 if (!force_pt_level)
852 transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); 869 transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
853 r = FNAME(fetch)(vcpu, addr, &walker, write_fault, 870 r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
854 level, pfn, map_writable, prefault); 871 level, pfn, map_writable, prefault, lpage_disallowed);
855 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 872 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
856 873
857out_unlock: 874out_unlock:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8c8a5e20ea06..7db5c8ef35dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -213,6 +213,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
213 { "mmu_unsync", VM_STAT(mmu_unsync) }, 213 { "mmu_unsync", VM_STAT(mmu_unsync) },
214 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 214 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
215 { "largepages", VM_STAT(lpages, .mode = 0444) }, 215 { "largepages", VM_STAT(lpages, .mode = 0444) },
216 { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
216 { "max_mmu_page_hash_collisions", 217 { "max_mmu_page_hash_collisions",
217 VM_STAT(max_mmu_page_hash_collisions) }, 218 VM_STAT(max_mmu_page_hash_collisions) },
218 { NULL } 219 { NULL }
@@ -1285,6 +1286,14 @@ static u64 kvm_get_arch_capabilities(void)
1285 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); 1286 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1286 1287
1287 /* 1288 /*
1289 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
1290 * the nested hypervisor runs with NX huge pages. If it is not,
 1291 * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
1292 * L1 guests, so it need not worry about its own (L2) guests.
1293 */
1294 data |= ARCH_CAP_PSCHANGE_MC_NO;
1295
1296 /*
1288 * If we're doing cache flushes (either "always" or "cond") 1297 * If we're doing cache flushes (either "always" or "cond")
1289 * we will do one whenever the guest does a vmlaunch/vmresume. 1298 * we will do one whenever the guest does a vmlaunch/vmresume.
1290 * If an outer hypervisor is doing the cache flush for us 1299 * If an outer hypervisor is doing the cache flush for us
@@ -1303,6 +1312,25 @@ static u64 kvm_get_arch_capabilities(void)
1303 if (!boot_cpu_has_bug(X86_BUG_MDS)) 1312 if (!boot_cpu_has_bug(X86_BUG_MDS))
1304 data |= ARCH_CAP_MDS_NO; 1313 data |= ARCH_CAP_MDS_NO;
1305 1314
1315 /*
1316 * On TAA affected systems, export MDS_NO=0 when:
1317 * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
1318 * - Updated microcode is present. This is detected by
1319 * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
1320 * that VERW clears CPU buffers.
1321 *
1322 * When MDS_NO=0 is exported, guests deploy clear CPU buffer
1323 * mitigation and don't complain:
1324 *
1325 * "Vulnerable: Clear CPU buffers attempted, no microcode"
1326 *
1327 * If TSX is disabled on the system, guests are also mitigated against
1328 * TAA and clear CPU buffer mitigation is not required for guests.
1329 */
1330 if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
1331 (data & ARCH_CAP_TSX_CTRL_MSR))
1332 data &= ~ARCH_CAP_MDS_NO;
1333
1306 return data; 1334 return data;
1307} 1335}
1308 1336
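
The MDS_NO adjustment above boils down to: on a host that is TAA-affected, still has TSX enabled (X86_FEATURE_RTM) and exposes TSX_CTRL via updated microcode, hide MDS_NO from the guest so it keeps using the VERW buffer-clearing mitigation. A hedged boolean restatement, not the kernel code itself:

#include <stdbool.h>
#include <stdio.h>

static bool expose_mds_no(bool host_mds_no, bool host_taa_bug,
                          bool host_rtm, bool host_tsx_ctrl_msr)
{
    /* Matches the condition that clears ARCH_CAP_MDS_NO above. */
    if (host_taa_bug && host_rtm && host_tsx_ctrl_msr)
        return false;
    return host_mds_no;
}

int main(void)
{
    printf("%d\n", expose_mds_no(true, true, true, true));    /* prints 0 */
    return 0;
}
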
@@ -9424,6 +9452,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
9424 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); 9452 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
9425 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 9453 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
9426 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); 9454 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
9455 INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
9427 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 9456 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
9428 atomic_set(&kvm->arch.noncoherent_dma_count, 0); 9457 atomic_set(&kvm->arch.noncoherent_dma_count, 0);
9429 9458
@@ -9452,6 +9481,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
9452 return kvm_x86_ops->vm_init(kvm); 9481 return kvm_x86_ops->vm_init(kvm);
9453} 9482}
9454 9483
9484int kvm_arch_post_init_vm(struct kvm *kvm)
9485{
9486 return kvm_mmu_post_init_vm(kvm);
9487}
9488
9455static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 9489static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
9456{ 9490{
9457 vcpu_load(vcpu); 9491 vcpu_load(vcpu);
@@ -9553,6 +9587,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
9553} 9587}
9554EXPORT_SYMBOL_GPL(x86_set_memory_region); 9588EXPORT_SYMBOL_GPL(x86_set_memory_region);
9555 9589
9590void kvm_arch_pre_destroy_vm(struct kvm *kvm)
9591{
9592 kvm_mmu_pre_destroy_vm(kvm);
9593}
9594
9556void kvm_arch_destroy_vm(struct kvm *kvm) 9595void kvm_arch_destroy_vm(struct kvm *kvm)
9557{ 9596{
9558 if (current->mm == kvm->mm) { 9597 if (current->mm == kvm->mm) {
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index cc37511de866..6265871a4af2 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -554,12 +554,27 @@ ssize_t __weak cpu_show_mds(struct device *dev,
554 return sprintf(buf, "Not affected\n"); 554 return sprintf(buf, "Not affected\n");
555} 555}
556 556
557ssize_t __weak cpu_show_tsx_async_abort(struct device *dev,
558 struct device_attribute *attr,
559 char *buf)
560{
561 return sprintf(buf, "Not affected\n");
562}
563
564ssize_t __weak cpu_show_itlb_multihit(struct device *dev,
565 struct device_attribute *attr, char *buf)
566{
567 return sprintf(buf, "Not affected\n");
568}
569
557static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); 570static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
558static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); 571static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
559static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); 572static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
560static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); 573static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
561static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); 574static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
562static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); 575static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
576static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL);
577static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL);
563 578
564static struct attribute *cpu_root_vulnerabilities_attrs[] = { 579static struct attribute *cpu_root_vulnerabilities_attrs[] = {
565 &dev_attr_meltdown.attr, 580 &dev_attr_meltdown.attr,
@@ -568,6 +583,8 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
568 &dev_attr_spec_store_bypass.attr, 583 &dev_attr_spec_store_bypass.attr,
569 &dev_attr_l1tf.attr, 584 &dev_attr_l1tf.attr,
570 &dev_attr_mds.attr, 585 &dev_attr_mds.attr,
586 &dev_attr_tsx_async_abort.attr,
587 &dev_attr_itlb_multihit.attr,
571 NULL 588 NULL
572}; 589};
573 590
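
With the two DEVICE_ATTR entries added above, the new reports are plain text files under /sys/devices/system/cpu/vulnerabilities/. A quick userspace check (the files simply do not exist on kernels without this series):

#include <stdio.h>

static void show(const char *path)
{
    char line[128] = "";
    FILE *f = fopen(path, "r");

    if (!f)
        return;    /* attribute absent on older kernels */
    if (fgets(line, sizeof(line), f))
        printf("%s: %s", path, line);
    fclose(f);
}

int main(void)
{
    show("/sys/devices/system/cpu/vulnerabilities/tsx_async_abort");
    show("/sys/devices/system/cpu/vulnerabilities/itlb_multihit");
    return 0;
}
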
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 1cdfe05514c3..e41fd94ae5a9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -319,6 +319,8 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
319 free_engines(rcu_access_pointer(ctx->engines)); 319 free_engines(rcu_access_pointer(ctx->engines));
320 mutex_destroy(&ctx->engines_mutex); 320 mutex_destroy(&ctx->engines_mutex);
321 321
322 kfree(ctx->jump_whitelist);
323
322 if (ctx->timeline) 324 if (ctx->timeline)
323 intel_timeline_put(ctx->timeline); 325 intel_timeline_put(ctx->timeline);
324 326
@@ -441,6 +443,9 @@ __create_context(struct drm_i915_private *i915)
441 for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++) 443 for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
442 ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES; 444 ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
443 445
446 ctx->jump_whitelist = NULL;
447 ctx->jump_whitelist_cmds = 0;
448
444 return ctx; 449 return ctx;
445 450
446err_free: 451err_free:
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index 260d59cc3de8..00537b9d7006 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -192,6 +192,13 @@ struct i915_gem_context {
192 * per vm, which may be one per context or shared with the global GTT) 192 * per vm, which may be one per context or shared with the global GTT)
193 */ 193 */
194 struct radix_tree_root handles_vma; 194 struct radix_tree_root handles_vma;
195
196 /** jump_whitelist: Bit array for tracking cmds during cmdparsing
197 * Guarded by struct_mutex
198 */
199 unsigned long *jump_whitelist;
 200 /** jump_whitelist_cmds: Number of cmd slots available */
201 u32 jump_whitelist_cmds;
195}; 202};
196 203
197#endif /* __I915_GEM_CONTEXT_TYPES_H__ */ 204#endif /* __I915_GEM_CONTEXT_TYPES_H__ */
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index b5f6937369ea..e635e1e5f4d3 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -296,7 +296,9 @@ static inline u64 gen8_noncanonical_addr(u64 address)
296 296
297static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) 297static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
298{ 298{
299 return intel_engine_needs_cmd_parser(eb->engine) && eb->batch_len; 299 return intel_engine_requires_cmd_parser(eb->engine) ||
300 (intel_engine_using_cmd_parser(eb->engine) &&
301 eb->args->batch_len);
300} 302}
301 303
302static int eb_create(struct i915_execbuffer *eb) 304static int eb_create(struct i915_execbuffer *eb)
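
The reworked eb_use_cmdparser() distinguishes engines that merely keep *using* the parser (legacy register whitelisting, only when a batch length is supplied) from engines that now *require* it. A boolean restatement of that policy, for illustration only:

#include <stdbool.h>
#include <stdio.h>

static bool use_cmdparser(bool requires_parser, bool using_parser,
                          unsigned long batch_len)
{
    return requires_parser || (using_parser && batch_len);
}

int main(void)
{
    /* An engine that requires parsing is parsed even with batch_len == 0. */
    printf("%d %d\n", use_cmdparser(true, false, 0),
           use_cmdparser(false, true, 0));    /* prints: 1 0 */
    return 0;
}
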
@@ -1955,40 +1957,94 @@ static int i915_reset_gen7_sol_offsets(struct i915_request *rq)
1955 return 0; 1957 return 0;
1956} 1958}
1957 1959
1958static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master) 1960static struct i915_vma *
1961shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj)
1962{
1963 struct drm_i915_private *dev_priv = eb->i915;
1964 struct i915_vma * const vma = *eb->vma;
1965 struct i915_address_space *vm;
1966 u64 flags;
1967
1968 /*
1969 * PPGTT backed shadow buffers must be mapped RO, to prevent
1970 * post-scan tampering
1971 */
1972 if (CMDPARSER_USES_GGTT(dev_priv)) {
1973 flags = PIN_GLOBAL;
1974 vm = &dev_priv->ggtt.vm;
1975 } else if (vma->vm->has_read_only) {
1976 flags = PIN_USER;
1977 vm = vma->vm;
1978 i915_gem_object_set_readonly(obj);
1979 } else {
1980 DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n");
1981 return ERR_PTR(-EINVAL);
1982 }
1983
1984 return i915_gem_object_pin(obj, vm, NULL, 0, 0, flags);
1985}
1986
1987static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
1959{ 1988{
1960 struct intel_engine_pool_node *pool; 1989 struct intel_engine_pool_node *pool;
1961 struct i915_vma *vma; 1990 struct i915_vma *vma;
1991 u64 batch_start;
1992 u64 shadow_batch_start;
1962 int err; 1993 int err;
1963 1994
1964 pool = intel_engine_pool_get(&eb->engine->pool, eb->batch_len); 1995 pool = intel_engine_pool_get(&eb->engine->pool, eb->batch_len);
1965 if (IS_ERR(pool)) 1996 if (IS_ERR(pool))
1966 return ERR_CAST(pool); 1997 return ERR_CAST(pool);
1967 1998
1968 err = intel_engine_cmd_parser(eb->engine, 1999 vma = shadow_batch_pin(eb, pool->obj);
2000 if (IS_ERR(vma))
2001 goto err;
2002
2003 batch_start = gen8_canonical_addr(eb->batch->node.start) +
2004 eb->batch_start_offset;
2005
2006 shadow_batch_start = gen8_canonical_addr(vma->node.start);
2007
2008 err = intel_engine_cmd_parser(eb->gem_context,
2009 eb->engine,
1969 eb->batch->obj, 2010 eb->batch->obj,
1970 pool->obj, 2011 batch_start,
1971 eb->batch_start_offset, 2012 eb->batch_start_offset,
1972 eb->batch_len, 2013 eb->batch_len,
1973 is_master); 2014 pool->obj,
2015 shadow_batch_start);
2016
1974 if (err) { 2017 if (err) {
1975 if (err == -EACCES) /* unhandled chained batch */ 2018 i915_vma_unpin(vma);
2019
2020 /*
2021 * Unsafe GGTT-backed buffers can still be submitted safely
2022 * as non-secure.
 2023 * For PPGTT backing, however, we have no choice but to forcibly
 2024 * reject unsafe buffers.
2025 */
2026 if (CMDPARSER_USES_GGTT(eb->i915) && (err == -EACCES))
2027 /* Execute original buffer non-secure */
1976 vma = NULL; 2028 vma = NULL;
1977 else 2029 else
1978 vma = ERR_PTR(err); 2030 vma = ERR_PTR(err);
1979 goto err; 2031 goto err;
1980 } 2032 }
1981 2033
1982 vma = i915_gem_object_ggtt_pin(pool->obj, NULL, 0, 0, 0);
1983 if (IS_ERR(vma))
1984 goto err;
1985
1986 eb->vma[eb->buffer_count] = i915_vma_get(vma); 2034 eb->vma[eb->buffer_count] = i915_vma_get(vma);
1987 eb->flags[eb->buffer_count] = 2035 eb->flags[eb->buffer_count] =
1988 __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF; 2036 __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF;
1989 vma->exec_flags = &eb->flags[eb->buffer_count]; 2037 vma->exec_flags = &eb->flags[eb->buffer_count];
1990 eb->buffer_count++; 2038 eb->buffer_count++;
1991 2039
2040 eb->batch_start_offset = 0;
2041 eb->batch = vma;
2042
2043 if (CMDPARSER_USES_GGTT(eb->i915))
2044 eb->batch_flags |= I915_DISPATCH_SECURE;
2045
2046 /* eb->batch_len unchanged */
2047
1992 vma->private = pool; 2048 vma->private = pool;
1993 return vma; 2049 return vma;
1994 2050
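
shadow_batch_pin() picks where the parsed copy of the batch lives: on GGTT-based parsing platforms it pins into the global GTT, on newer platforms it requires a PPGTT that supports read-only mappings (and marks the shadow object RO), and otherwise it refuses to parse at all. A condensed restatement of that choice, using hypothetical enum names:

#include <stdbool.h>
#include <stdio.h>

enum shadow_home { SHADOW_GGTT, SHADOW_PPGTT_RO, SHADOW_REJECT };

static enum shadow_home pick_shadow_home(bool cmdparser_uses_ggtt,
                                         bool vm_has_read_only)
{
    if (cmdparser_uses_ggtt)
        return SHADOW_GGTT;       /* pinned PIN_GLOBAL */
    if (vm_has_read_only)
        return SHADOW_PPGTT_RO;   /* pinned PIN_USER, object made read-only */
    return SHADOW_REJECT;         /* -EINVAL: cannot prevent post-scan tampering */
}

int main(void)
{
    printf("%d\n", pick_shadow_home(false, true));    /* prints 1 */
    return 0;
}
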
@@ -2421,6 +2477,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
2421 struct drm_i915_gem_exec_object2 *exec, 2477 struct drm_i915_gem_exec_object2 *exec,
2422 struct drm_syncobj **fences) 2478 struct drm_syncobj **fences)
2423{ 2479{
2480 struct drm_i915_private *i915 = to_i915(dev);
2424 struct i915_execbuffer eb; 2481 struct i915_execbuffer eb;
2425 struct dma_fence *in_fence = NULL; 2482 struct dma_fence *in_fence = NULL;
2426 struct dma_fence *exec_fence = NULL; 2483 struct dma_fence *exec_fence = NULL;
@@ -2432,7 +2489,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
2432 BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & 2489 BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS &
2433 ~__EXEC_OBJECT_UNKNOWN_FLAGS); 2490 ~__EXEC_OBJECT_UNKNOWN_FLAGS);
2434 2491
2435 eb.i915 = to_i915(dev); 2492 eb.i915 = i915;
2436 eb.file = file; 2493 eb.file = file;
2437 eb.args = args; 2494 eb.args = args;
2438 if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) 2495 if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
@@ -2452,8 +2509,15 @@ i915_gem_do_execbuffer(struct drm_device *dev,
2452 2509
2453 eb.batch_flags = 0; 2510 eb.batch_flags = 0;
2454 if (args->flags & I915_EXEC_SECURE) { 2511 if (args->flags & I915_EXEC_SECURE) {
2512 if (INTEL_GEN(i915) >= 11)
2513 return -ENODEV;
2514
2515 /* Return -EPERM to trigger fallback code on old binaries. */
2516 if (!HAS_SECURE_BATCHES(i915))
2517 return -EPERM;
2518
2455 if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN)) 2519 if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN))
2456 return -EPERM; 2520 return -EPERM;
2457 2521
2458 eb.batch_flags |= I915_DISPATCH_SECURE; 2522 eb.batch_flags |= I915_DISPATCH_SECURE;
2459 } 2523 }
@@ -2530,34 +2594,19 @@ i915_gem_do_execbuffer(struct drm_device *dev,
2530 goto err_vma; 2594 goto err_vma;
2531 } 2595 }
2532 2596
2597 if (eb.batch_len == 0)
2598 eb.batch_len = eb.batch->size - eb.batch_start_offset;
2599
2533 if (eb_use_cmdparser(&eb)) { 2600 if (eb_use_cmdparser(&eb)) {
2534 struct i915_vma *vma; 2601 struct i915_vma *vma;
2535 2602
2536 vma = eb_parse(&eb, drm_is_current_master(file)); 2603 vma = eb_parse(&eb);
2537 if (IS_ERR(vma)) { 2604 if (IS_ERR(vma)) {
2538 err = PTR_ERR(vma); 2605 err = PTR_ERR(vma);
2539 goto err_vma; 2606 goto err_vma;
2540 } 2607 }
2541
2542 if (vma) {
2543 /*
2544 * Batch parsed and accepted:
2545 *
2546 * Set the DISPATCH_SECURE bit to remove the NON_SECURE
2547 * bit from MI_BATCH_BUFFER_START commands issued in
2548 * the dispatch_execbuffer implementations. We
2549 * specifically don't want that set on batches the
2550 * command parser has accepted.
2551 */
2552 eb.batch_flags |= I915_DISPATCH_SECURE;
2553 eb.batch_start_offset = 0;
2554 eb.batch = vma;
2555 }
2556 } 2608 }
2557 2609
2558 if (eb.batch_len == 0)
2559 eb.batch_len = eb.batch->size - eb.batch_start_offset;
2560
2561 /* 2610 /*
2562 * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure 2611 * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
2563 * batch" bit. Hence we need to pin secure batches into the global gtt. 2612 * batch" bit. Hence we need to pin secure batches into the global gtt.
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index a82cea95c2f2..9dd8c299cb2d 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -475,12 +475,13 @@ struct intel_engine_cs {
475 475
476 struct intel_engine_hangcheck hangcheck; 476 struct intel_engine_hangcheck hangcheck;
477 477
478#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) 478#define I915_ENGINE_USING_CMD_PARSER BIT(0)
479#define I915_ENGINE_SUPPORTS_STATS BIT(1) 479#define I915_ENGINE_SUPPORTS_STATS BIT(1)
480#define I915_ENGINE_HAS_PREEMPTION BIT(2) 480#define I915_ENGINE_HAS_PREEMPTION BIT(2)
481#define I915_ENGINE_HAS_SEMAPHORES BIT(3) 481#define I915_ENGINE_HAS_SEMAPHORES BIT(3)
482#define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4) 482#define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4)
483#define I915_ENGINE_IS_VIRTUAL BIT(5) 483#define I915_ENGINE_IS_VIRTUAL BIT(5)
484#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(7)
484 unsigned int flags; 485 unsigned int flags;
485 486
486 /* 487 /*
@@ -541,9 +542,15 @@ struct intel_engine_cs {
541}; 542};
542 543
543static inline bool 544static inline bool
544intel_engine_needs_cmd_parser(const struct intel_engine_cs *engine) 545intel_engine_using_cmd_parser(const struct intel_engine_cs *engine)
545{ 546{
546 return engine->flags & I915_ENGINE_NEEDS_CMD_PARSER; 547 return engine->flags & I915_ENGINE_USING_CMD_PARSER;
548}
549
550static inline bool
551intel_engine_requires_cmd_parser(const struct intel_engine_cs *engine)
552{
553 return engine->flags & I915_ENGINE_REQUIRES_CMD_PARSER;
547} 554}
548 555
549static inline bool 556static inline bool
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 1363e069ec83..fac75afed35b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -38,6 +38,9 @@ static int __gt_unpark(struct intel_wakeref *wf)
38 gt->awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ); 38 gt->awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
39 GEM_BUG_ON(!gt->awake); 39 GEM_BUG_ON(!gt->awake);
40 40
41 if (NEEDS_RC6_CTX_CORRUPTION_WA(i915))
42 intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
43
41 intel_enable_gt_powersave(i915); 44 intel_enable_gt_powersave(i915);
42 45
43 i915_update_gfx_val(i915); 46 i915_update_gfx_val(i915);
@@ -67,6 +70,11 @@ static int __gt_park(struct intel_wakeref *wf)
67 if (INTEL_GEN(i915) >= 6) 70 if (INTEL_GEN(i915) >= 6)
68 gen6_rps_idle(i915); 71 gen6_rps_idle(i915);
69 72
73 if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) {
74 i915_rc6_ctx_wa_check(i915);
75 intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
76 }
77
70 /* Everything switched off, flush any residual interrupt just in case */ 78 /* Everything switched off, flush any residual interrupt just in case */
71 intel_synchronize_irq(i915); 79 intel_synchronize_irq(i915);
72 80
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 24555102e198..f24096e27bef 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -53,13 +53,11 @@
53 * granting userspace undue privileges. There are three categories of privilege. 53 * granting userspace undue privileges. There are three categories of privilege.
54 * 54 *
55 * First, commands which are explicitly defined as privileged or which should 55 * First, commands which are explicitly defined as privileged or which should
56 * only be used by the kernel driver. The parser generally rejects such 56 * only be used by the kernel driver. The parser rejects such commands
57 * commands, though it may allow some from the drm master process.
58 * 57 *
59 * Second, commands which access registers. To support correct/enhanced 58 * Second, commands which access registers. To support correct/enhanced
60 * userspace functionality, particularly certain OpenGL extensions, the parser 59 * userspace functionality, particularly certain OpenGL extensions, the parser
61 * provides a whitelist of registers which userspace may safely access (for both 60 * provides a whitelist of registers which userspace may safely access
62 * normal and drm master processes).
63 * 61 *
64 * Third, commands which access privileged memory (i.e. GGTT, HWS page, etc). 62 * Third, commands which access privileged memory (i.e. GGTT, HWS page, etc).
65 * The parser always rejects such commands. 63 * The parser always rejects such commands.
@@ -84,9 +82,9 @@
84 * in the per-engine command tables. 82 * in the per-engine command tables.
85 * 83 *
86 * Other command table entries map fairly directly to high level categories 84 * Other command table entries map fairly directly to high level categories
87 * mentioned above: rejected, master-only, register whitelist. The parser 85 * mentioned above: rejected, register whitelist. The parser implements a number
88 * implements a number of checks, including the privileged memory checks, via a 86 * of checks, including the privileged memory checks, via a general bitmasking
89 * general bitmasking mechanism. 87 * mechanism.
90 */ 88 */
91 89
92/* 90/*
@@ -104,8 +102,6 @@ struct drm_i915_cmd_descriptor {
104 * CMD_DESC_REJECT: The command is never allowed 102 * CMD_DESC_REJECT: The command is never allowed
105 * CMD_DESC_REGISTER: The command should be checked against the 103 * CMD_DESC_REGISTER: The command should be checked against the
106 * register whitelist for the appropriate ring 104 * register whitelist for the appropriate ring
107 * CMD_DESC_MASTER: The command is allowed if the submitting process
108 * is the DRM master
109 */ 105 */
110 u32 flags; 106 u32 flags;
111#define CMD_DESC_FIXED (1<<0) 107#define CMD_DESC_FIXED (1<<0)
@@ -113,7 +109,6 @@ struct drm_i915_cmd_descriptor {
113#define CMD_DESC_REJECT (1<<2) 109#define CMD_DESC_REJECT (1<<2)
114#define CMD_DESC_REGISTER (1<<3) 110#define CMD_DESC_REGISTER (1<<3)
115#define CMD_DESC_BITMASK (1<<4) 111#define CMD_DESC_BITMASK (1<<4)
116#define CMD_DESC_MASTER (1<<5)
117 112
118 /* 113 /*
119 * The command's unique identification bits and the bitmask to get them. 114 * The command's unique identification bits and the bitmask to get them.
@@ -194,7 +189,7 @@ struct drm_i915_cmd_table {
194#define CMD(op, opm, f, lm, fl, ...) \ 189#define CMD(op, opm, f, lm, fl, ...) \
195 { \ 190 { \
196 .flags = (fl) | ((f) ? CMD_DESC_FIXED : 0), \ 191 .flags = (fl) | ((f) ? CMD_DESC_FIXED : 0), \
197 .cmd = { (op), ~0u << (opm) }, \ 192 .cmd = { (op & ~0u << (opm)), ~0u << (opm) }, \
198 .length = { (lm) }, \ 193 .length = { (lm) }, \
199 __VA_ARGS__ \ 194 __VA_ARGS__ \
200 } 195 }
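
The CMD() macro now masks the opcode into .cmd.value as well, so command definitions that carry low-order flag/length bits still match once the incoming header has been masked. A small illustration, assuming a mask-and-compare lookup like the parser's and a hypothetical 0x31 << 23 | 1 style definition:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t op = (0x31u << 23) | 1;        /* definition includes low flag/length bits */
    uint32_t mask = ~0u << 23;              /* opcode-only mask (SMI shift) */
    uint32_t header = (0x31u << 23) | 0x3f; /* a header as seen in a batch */

    uint32_t old_value = op;                /* before the fix */
    uint32_t new_value = op & mask;         /* after the fix */

    printf("old match: %d\n", (header & mask) == old_value);    /* 0: never matches */
    printf("new match: %d\n", (header & mask) == new_value);    /* 1: matches */
    return 0;
}
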
@@ -209,14 +204,13 @@ struct drm_i915_cmd_table {
209#define R CMD_DESC_REJECT 204#define R CMD_DESC_REJECT
210#define W CMD_DESC_REGISTER 205#define W CMD_DESC_REGISTER
211#define B CMD_DESC_BITMASK 206#define B CMD_DESC_BITMASK
212#define M CMD_DESC_MASTER
213 207
214/* Command Mask Fixed Len Action 208/* Command Mask Fixed Len Action
215 ---------------------------------------------------------- */ 209 ---------------------------------------------------------- */
216static const struct drm_i915_cmd_descriptor common_cmds[] = { 210static const struct drm_i915_cmd_descriptor gen7_common_cmds[] = {
217 CMD( MI_NOOP, SMI, F, 1, S ), 211 CMD( MI_NOOP, SMI, F, 1, S ),
218 CMD( MI_USER_INTERRUPT, SMI, F, 1, R ), 212 CMD( MI_USER_INTERRUPT, SMI, F, 1, R ),
219 CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, M ), 213 CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, R ),
220 CMD( MI_ARB_CHECK, SMI, F, 1, S ), 214 CMD( MI_ARB_CHECK, SMI, F, 1, S ),
221 CMD( MI_REPORT_HEAD, SMI, F, 1, S ), 215 CMD( MI_REPORT_HEAD, SMI, F, 1, S ),
222 CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ), 216 CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ),
@@ -246,7 +240,7 @@ static const struct drm_i915_cmd_descriptor common_cmds[] = {
246 CMD( MI_BATCH_BUFFER_START, SMI, !F, 0xFF, S ), 240 CMD( MI_BATCH_BUFFER_START, SMI, !F, 0xFF, S ),
247}; 241};
248 242
249static const struct drm_i915_cmd_descriptor render_cmds[] = { 243static const struct drm_i915_cmd_descriptor gen7_render_cmds[] = {
250 CMD( MI_FLUSH, SMI, F, 1, S ), 244 CMD( MI_FLUSH, SMI, F, 1, S ),
251 CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), 245 CMD( MI_ARB_ON_OFF, SMI, F, 1, R ),
252 CMD( MI_PREDICATE, SMI, F, 1, S ), 246 CMD( MI_PREDICATE, SMI, F, 1, S ),
@@ -313,7 +307,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
313 CMD( MI_URB_ATOMIC_ALLOC, SMI, F, 1, S ), 307 CMD( MI_URB_ATOMIC_ALLOC, SMI, F, 1, S ),
314 CMD( MI_SET_APPID, SMI, F, 1, S ), 308 CMD( MI_SET_APPID, SMI, F, 1, S ),
315 CMD( MI_RS_CONTEXT, SMI, F, 1, S ), 309 CMD( MI_RS_CONTEXT, SMI, F, 1, S ),
316 CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ), 310 CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, R ),
317 CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ), 311 CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ),
318 CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W, 312 CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W,
319 .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ), 313 .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ),
@@ -330,7 +324,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
330 CMD( GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS, S3D, !F, 0x1FF, S ), 324 CMD( GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS, S3D, !F, 0x1FF, S ),
331}; 325};
332 326
333static const struct drm_i915_cmd_descriptor video_cmds[] = { 327static const struct drm_i915_cmd_descriptor gen7_video_cmds[] = {
334 CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), 328 CMD( MI_ARB_ON_OFF, SMI, F, 1, R ),
335 CMD( MI_SET_APPID, SMI, F, 1, S ), 329 CMD( MI_SET_APPID, SMI, F, 1, S ),
336 CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B, 330 CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B,
@@ -374,7 +368,7 @@ static const struct drm_i915_cmd_descriptor video_cmds[] = {
374 CMD( MFX_WAIT, SMFX, F, 1, S ), 368 CMD( MFX_WAIT, SMFX, F, 1, S ),
375}; 369};
376 370
377static const struct drm_i915_cmd_descriptor vecs_cmds[] = { 371static const struct drm_i915_cmd_descriptor gen7_vecs_cmds[] = {
378 CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), 372 CMD( MI_ARB_ON_OFF, SMI, F, 1, R ),
379 CMD( MI_SET_APPID, SMI, F, 1, S ), 373 CMD( MI_SET_APPID, SMI, F, 1, S ),
380 CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B, 374 CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B,
@@ -412,7 +406,7 @@ static const struct drm_i915_cmd_descriptor vecs_cmds[] = {
412 }}, ), 406 }}, ),
413}; 407};
414 408
415static const struct drm_i915_cmd_descriptor blt_cmds[] = { 409static const struct drm_i915_cmd_descriptor gen7_blt_cmds[] = {
416 CMD( MI_DISPLAY_FLIP, SMI, !F, 0xFF, R ), 410 CMD( MI_DISPLAY_FLIP, SMI, !F, 0xFF, R ),
417 CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, B, 411 CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, B,
418 .bits = {{ 412 .bits = {{
@@ -446,10 +440,64 @@ static const struct drm_i915_cmd_descriptor blt_cmds[] = {
446}; 440};
447 441
448static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = { 442static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = {
449 CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ), 443 CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, R ),
450 CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ), 444 CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ),
451}; 445};
452 446
447/*
448 * For Gen9 we can still rely on the h/w to enforce cmd security, and only
449 * need to re-enforce the register access checks. We therefore only need to
450 * teach the cmdparser how to find the end of each command, and identify
451 * register accesses. The table doesn't need to reject any commands, and so
452 * the only commands listed here are:
453 * 1) Those that touch registers
454 * 2) Those that do not have the default 8-bit length
455 *
456 * Note that the default MI length mask chosen for this table is 0xFF, not
457 * the 0x3F used on older devices. This is because the vast majority of MI
458 * cmds on Gen9 use a standard 8-bit Length field.
 459 * All the Gen9 blitter instructions use the standard 0xFF length mask, and
460 * none allow access to non-general registers, so in fact no BLT cmds are
461 * included in the table at all.
462 *
463 */
464static const struct drm_i915_cmd_descriptor gen9_blt_cmds[] = {
465 CMD( MI_NOOP, SMI, F, 1, S ),
466 CMD( MI_USER_INTERRUPT, SMI, F, 1, S ),
467 CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, S ),
468 CMD( MI_FLUSH, SMI, F, 1, S ),
469 CMD( MI_ARB_CHECK, SMI, F, 1, S ),
470 CMD( MI_REPORT_HEAD, SMI, F, 1, S ),
471 CMD( MI_ARB_ON_OFF, SMI, F, 1, S ),
472 CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ),
473 CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, S ),
474 CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, S ),
475 CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, S ),
476 CMD( MI_LOAD_REGISTER_IMM(1), SMI, !F, 0xFF, W,
477 .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 2 } ),
478 CMD( MI_UPDATE_GTT, SMI, !F, 0x3FF, S ),
479 CMD( MI_STORE_REGISTER_MEM_GEN8, SMI, F, 4, W,
480 .reg = { .offset = 1, .mask = 0x007FFFFC } ),
481 CMD( MI_FLUSH_DW, SMI, !F, 0x3F, S ),
482 CMD( MI_LOAD_REGISTER_MEM_GEN8, SMI, F, 4, W,
483 .reg = { .offset = 1, .mask = 0x007FFFFC } ),
484 CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W,
485 .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ),
486
487 /*
488 * We allow BB_START but apply further checks. We just sanitize the
489 * basic fields here.
490 */
491#define MI_BB_START_OPERAND_MASK GENMASK(SMI-1, 0)
492#define MI_BB_START_OPERAND_EXPECT (MI_BATCH_PPGTT_HSW | 1)
493 CMD( MI_BATCH_BUFFER_START_GEN8, SMI, !F, 0xFF, B,
494 .bits = {{
495 .offset = 0,
496 .mask = MI_BB_START_OPERAND_MASK,
497 .expected = MI_BB_START_OPERAND_EXPECT,
498 }}, ),
499};
500
453static const struct drm_i915_cmd_descriptor noop_desc = 501static const struct drm_i915_cmd_descriptor noop_desc =
454 CMD(MI_NOOP, SMI, F, 1, S); 502 CMD(MI_NOOP, SMI, F, 1, S);
455 503
@@ -463,40 +511,44 @@ static const struct drm_i915_cmd_descriptor noop_desc =
463#undef R 511#undef R
464#undef W 512#undef W
465#undef B 513#undef B
466#undef M
467 514
468static const struct drm_i915_cmd_table gen7_render_cmds[] = { 515static const struct drm_i915_cmd_table gen7_render_cmd_table[] = {
469 { common_cmds, ARRAY_SIZE(common_cmds) }, 516 { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
470 { render_cmds, ARRAY_SIZE(render_cmds) }, 517 { gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) },
471}; 518};
472 519
473static const struct drm_i915_cmd_table hsw_render_ring_cmds[] = { 520static const struct drm_i915_cmd_table hsw_render_ring_cmd_table[] = {
474 { common_cmds, ARRAY_SIZE(common_cmds) }, 521 { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
475 { render_cmds, ARRAY_SIZE(render_cmds) }, 522 { gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) },
476 { hsw_render_cmds, ARRAY_SIZE(hsw_render_cmds) }, 523 { hsw_render_cmds, ARRAY_SIZE(hsw_render_cmds) },
477}; 524};
478 525
479static const struct drm_i915_cmd_table gen7_video_cmds[] = { 526static const struct drm_i915_cmd_table gen7_video_cmd_table[] = {
480 { common_cmds, ARRAY_SIZE(common_cmds) }, 527 { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
481 { video_cmds, ARRAY_SIZE(video_cmds) }, 528 { gen7_video_cmds, ARRAY_SIZE(gen7_video_cmds) },
482}; 529};
483 530
484static const struct drm_i915_cmd_table hsw_vebox_cmds[] = { 531static const struct drm_i915_cmd_table hsw_vebox_cmd_table[] = {
485 { common_cmds, ARRAY_SIZE(common_cmds) }, 532 { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
486 { vecs_cmds, ARRAY_SIZE(vecs_cmds) }, 533 { gen7_vecs_cmds, ARRAY_SIZE(gen7_vecs_cmds) },
487}; 534};
488 535
489static const struct drm_i915_cmd_table gen7_blt_cmds[] = { 536static const struct drm_i915_cmd_table gen7_blt_cmd_table[] = {
490 { common_cmds, ARRAY_SIZE(common_cmds) }, 537 { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
491 { blt_cmds, ARRAY_SIZE(blt_cmds) }, 538 { gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) },
492}; 539};
493 540
494static const struct drm_i915_cmd_table hsw_blt_ring_cmds[] = { 541static const struct drm_i915_cmd_table hsw_blt_ring_cmd_table[] = {
495 { common_cmds, ARRAY_SIZE(common_cmds) }, 542 { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
496 { blt_cmds, ARRAY_SIZE(blt_cmds) }, 543 { gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) },
497 { hsw_blt_cmds, ARRAY_SIZE(hsw_blt_cmds) }, 544 { hsw_blt_cmds, ARRAY_SIZE(hsw_blt_cmds) },
498}; 545};
499 546
547static const struct drm_i915_cmd_table gen9_blt_cmd_table[] = {
548 { gen9_blt_cmds, ARRAY_SIZE(gen9_blt_cmds) },
549};
550
551
500/* 552/*
501 * Register whitelists, sorted by increasing register offset. 553 * Register whitelists, sorted by increasing register offset.
502 */ 554 */
@@ -612,17 +664,27 @@ static const struct drm_i915_reg_descriptor gen7_blt_regs[] = {
612 REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), 664 REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE),
613}; 665};
614 666
615static const struct drm_i915_reg_descriptor ivb_master_regs[] = { 667static const struct drm_i915_reg_descriptor gen9_blt_regs[] = {
616 REG32(FORCEWAKE_MT), 668 REG64_IDX(RING_TIMESTAMP, RENDER_RING_BASE),
617 REG32(DERRMR), 669 REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE),
618 REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_A)), 670 REG32(BCS_SWCTRL),
619 REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_B)), 671 REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE),
620 REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_C)), 672 REG64_IDX(BCS_GPR, 0),
621}; 673 REG64_IDX(BCS_GPR, 1),
622 674 REG64_IDX(BCS_GPR, 2),
623static const struct drm_i915_reg_descriptor hsw_master_regs[] = { 675 REG64_IDX(BCS_GPR, 3),
624 REG32(FORCEWAKE_MT), 676 REG64_IDX(BCS_GPR, 4),
625 REG32(DERRMR), 677 REG64_IDX(BCS_GPR, 5),
678 REG64_IDX(BCS_GPR, 6),
679 REG64_IDX(BCS_GPR, 7),
680 REG64_IDX(BCS_GPR, 8),
681 REG64_IDX(BCS_GPR, 9),
682 REG64_IDX(BCS_GPR, 10),
683 REG64_IDX(BCS_GPR, 11),
684 REG64_IDX(BCS_GPR, 12),
685 REG64_IDX(BCS_GPR, 13),
686 REG64_IDX(BCS_GPR, 14),
687 REG64_IDX(BCS_GPR, 15),
626}; 688};
627 689
628#undef REG64 690#undef REG64
@@ -631,28 +693,27 @@ static const struct drm_i915_reg_descriptor hsw_master_regs[] = {
631struct drm_i915_reg_table { 693struct drm_i915_reg_table {
632 const struct drm_i915_reg_descriptor *regs; 694 const struct drm_i915_reg_descriptor *regs;
633 int num_regs; 695 int num_regs;
634 bool master;
635}; 696};
636 697
637static const struct drm_i915_reg_table ivb_render_reg_tables[] = { 698static const struct drm_i915_reg_table ivb_render_reg_tables[] = {
638 { gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false }, 699 { gen7_render_regs, ARRAY_SIZE(gen7_render_regs) },
639 { ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true },
640}; 700};
641 701
642static const struct drm_i915_reg_table ivb_blt_reg_tables[] = { 702static const struct drm_i915_reg_table ivb_blt_reg_tables[] = {
643 { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false }, 703 { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) },
644 { ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true },
645}; 704};
646 705
647static const struct drm_i915_reg_table hsw_render_reg_tables[] = { 706static const struct drm_i915_reg_table hsw_render_reg_tables[] = {
648 { gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false }, 707 { gen7_render_regs, ARRAY_SIZE(gen7_render_regs) },
649 { hsw_render_regs, ARRAY_SIZE(hsw_render_regs), false }, 708 { hsw_render_regs, ARRAY_SIZE(hsw_render_regs) },
650 { hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true },
651}; 709};
652 710
653static const struct drm_i915_reg_table hsw_blt_reg_tables[] = { 711static const struct drm_i915_reg_table hsw_blt_reg_tables[] = {
654 { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false }, 712 { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) },
655 { hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true }, 713};
714
715static const struct drm_i915_reg_table gen9_blt_reg_tables[] = {
716 { gen9_blt_regs, ARRAY_SIZE(gen9_blt_regs) },
656}; 717};
657 718
658static u32 gen7_render_get_cmd_length_mask(u32 cmd_header) 719static u32 gen7_render_get_cmd_length_mask(u32 cmd_header)
@@ -710,6 +771,17 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header)
710 return 0; 771 return 0;
711} 772}
712 773
774static u32 gen9_blt_get_cmd_length_mask(u32 cmd_header)
775{
776 u32 client = cmd_header >> INSTR_CLIENT_SHIFT;
777
778 if (client == INSTR_MI_CLIENT || client == INSTR_BC_CLIENT)
779 return 0xFF;
780
781 DRM_DEBUG_DRIVER("CMD: Abnormal blt cmd length! 0x%08X\n", cmd_header);
782 return 0;
783}
784
713static bool validate_cmds_sorted(const struct intel_engine_cs *engine, 785static bool validate_cmds_sorted(const struct intel_engine_cs *engine,
714 const struct drm_i915_cmd_table *cmd_tables, 786 const struct drm_i915_cmd_table *cmd_tables,
715 int cmd_table_count) 787 int cmd_table_count)
@@ -867,18 +939,19 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
867 int cmd_table_count; 939 int cmd_table_count;
868 int ret; 940 int ret;
869 941
870 if (!IS_GEN(engine->i915, 7)) 942 if (!IS_GEN(engine->i915, 7) && !(IS_GEN(engine->i915, 9) &&
943 engine->class == COPY_ENGINE_CLASS))
871 return; 944 return;
872 945
873 switch (engine->class) { 946 switch (engine->class) {
874 case RENDER_CLASS: 947 case RENDER_CLASS:
875 if (IS_HASWELL(engine->i915)) { 948 if (IS_HASWELL(engine->i915)) {
876 cmd_tables = hsw_render_ring_cmds; 949 cmd_tables = hsw_render_ring_cmd_table;
877 cmd_table_count = 950 cmd_table_count =
878 ARRAY_SIZE(hsw_render_ring_cmds); 951 ARRAY_SIZE(hsw_render_ring_cmd_table);
879 } else { 952 } else {
880 cmd_tables = gen7_render_cmds; 953 cmd_tables = gen7_render_cmd_table;
881 cmd_table_count = ARRAY_SIZE(gen7_render_cmds); 954 cmd_table_count = ARRAY_SIZE(gen7_render_cmd_table);
882 } 955 }
883 956
884 if (IS_HASWELL(engine->i915)) { 957 if (IS_HASWELL(engine->i915)) {
@@ -888,36 +961,46 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
888 engine->reg_tables = ivb_render_reg_tables; 961 engine->reg_tables = ivb_render_reg_tables;
889 engine->reg_table_count = ARRAY_SIZE(ivb_render_reg_tables); 962 engine->reg_table_count = ARRAY_SIZE(ivb_render_reg_tables);
890 } 963 }
891
892 engine->get_cmd_length_mask = gen7_render_get_cmd_length_mask; 964 engine->get_cmd_length_mask = gen7_render_get_cmd_length_mask;
893 break; 965 break;
894 case VIDEO_DECODE_CLASS: 966 case VIDEO_DECODE_CLASS:
895 cmd_tables = gen7_video_cmds; 967 cmd_tables = gen7_video_cmd_table;
896 cmd_table_count = ARRAY_SIZE(gen7_video_cmds); 968 cmd_table_count = ARRAY_SIZE(gen7_video_cmd_table);
897 engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; 969 engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
898 break; 970 break;
899 case COPY_ENGINE_CLASS: 971 case COPY_ENGINE_CLASS:
900 if (IS_HASWELL(engine->i915)) { 972 engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
901 cmd_tables = hsw_blt_ring_cmds; 973 if (IS_GEN(engine->i915, 9)) {
902 cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds); 974 cmd_tables = gen9_blt_cmd_table;
975 cmd_table_count = ARRAY_SIZE(gen9_blt_cmd_table);
976 engine->get_cmd_length_mask =
977 gen9_blt_get_cmd_length_mask;
978
979 /* BCS Engine unsafe without parser */
980 engine->flags |= I915_ENGINE_REQUIRES_CMD_PARSER;
981 } else if (IS_HASWELL(engine->i915)) {
982 cmd_tables = hsw_blt_ring_cmd_table;
983 cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmd_table);
903 } else { 984 } else {
904 cmd_tables = gen7_blt_cmds; 985 cmd_tables = gen7_blt_cmd_table;
905 cmd_table_count = ARRAY_SIZE(gen7_blt_cmds); 986 cmd_table_count = ARRAY_SIZE(gen7_blt_cmd_table);
906 } 987 }
907 988
908 if (IS_HASWELL(engine->i915)) { 989 if (IS_GEN(engine->i915, 9)) {
990 engine->reg_tables = gen9_blt_reg_tables;
991 engine->reg_table_count =
992 ARRAY_SIZE(gen9_blt_reg_tables);
993 } else if (IS_HASWELL(engine->i915)) {
909 engine->reg_tables = hsw_blt_reg_tables; 994 engine->reg_tables = hsw_blt_reg_tables;
910 engine->reg_table_count = ARRAY_SIZE(hsw_blt_reg_tables); 995 engine->reg_table_count = ARRAY_SIZE(hsw_blt_reg_tables);
911 } else { 996 } else {
912 engine->reg_tables = ivb_blt_reg_tables; 997 engine->reg_tables = ivb_blt_reg_tables;
913 engine->reg_table_count = ARRAY_SIZE(ivb_blt_reg_tables); 998 engine->reg_table_count = ARRAY_SIZE(ivb_blt_reg_tables);
914 } 999 }
915
916 engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
917 break; 1000 break;
918 case VIDEO_ENHANCEMENT_CLASS: 1001 case VIDEO_ENHANCEMENT_CLASS:
919 cmd_tables = hsw_vebox_cmds; 1002 cmd_tables = hsw_vebox_cmd_table;
920 cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds); 1003 cmd_table_count = ARRAY_SIZE(hsw_vebox_cmd_table);
921 /* VECS can use the same length_mask function as VCS */ 1004 /* VECS can use the same length_mask function as VCS */
922 engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; 1005 engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
923 break; 1006 break;
@@ -943,7 +1026,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
943 return; 1026 return;
944 } 1027 }
945 1028
946 engine->flags |= I915_ENGINE_NEEDS_CMD_PARSER; 1029 engine->flags |= I915_ENGINE_USING_CMD_PARSER;
947} 1030}
948 1031
949/** 1032/**
@@ -955,7 +1038,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
955 */ 1038 */
956void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine) 1039void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine)
957{ 1040{
958 if (!intel_engine_needs_cmd_parser(engine)) 1041 if (!intel_engine_using_cmd_parser(engine))
959 return; 1042 return;
960 1043
961 fini_hash_table(engine); 1044 fini_hash_table(engine);
@@ -1029,22 +1112,16 @@ __find_reg(const struct drm_i915_reg_descriptor *table, int count, u32 addr)
1029} 1112}
1030 1113
1031static const struct drm_i915_reg_descriptor * 1114static const struct drm_i915_reg_descriptor *
1032find_reg(const struct intel_engine_cs *engine, bool is_master, u32 addr) 1115find_reg(const struct intel_engine_cs *engine, u32 addr)
1033{ 1116{
1034 const struct drm_i915_reg_table *table = engine->reg_tables; 1117 const struct drm_i915_reg_table *table = engine->reg_tables;
1118 const struct drm_i915_reg_descriptor *reg = NULL;
1035 int count = engine->reg_table_count; 1119 int count = engine->reg_table_count;
1036 1120
1037 for (; count > 0; ++table, --count) { 1121 for (; !reg && (count > 0); ++table, --count)
1038 if (!table->master || is_master) { 1122 reg = __find_reg(table->regs, table->num_regs, addr);
1039 const struct drm_i915_reg_descriptor *reg;
1040 1123
1041 reg = __find_reg(table->regs, table->num_regs, addr); 1124 return reg;
1042 if (reg != NULL)
1043 return reg;
1044 }
1045 }
1046
1047 return NULL;
1048} 1125}
1049 1126
1050/* Returns a vmap'd pointer to dst_obj, which the caller must unmap */ 1127/* Returns a vmap'd pointer to dst_obj, which the caller must unmap */
@@ -1128,8 +1205,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
1128 1205
1129static bool check_cmd(const struct intel_engine_cs *engine, 1206static bool check_cmd(const struct intel_engine_cs *engine,
1130 const struct drm_i915_cmd_descriptor *desc, 1207 const struct drm_i915_cmd_descriptor *desc,
1131 const u32 *cmd, u32 length, 1208 const u32 *cmd, u32 length)
1132 const bool is_master)
1133{ 1209{
1134 if (desc->flags & CMD_DESC_SKIP) 1210 if (desc->flags & CMD_DESC_SKIP)
1135 return true; 1211 return true;
@@ -1139,12 +1215,6 @@ static bool check_cmd(const struct intel_engine_cs *engine,
1139 return false; 1215 return false;
1140 } 1216 }
1141 1217
1142 if ((desc->flags & CMD_DESC_MASTER) && !is_master) {
1143 DRM_DEBUG_DRIVER("CMD: Rejected master-only command: 0x%08X\n",
1144 *cmd);
1145 return false;
1146 }
1147
1148 if (desc->flags & CMD_DESC_REGISTER) { 1218 if (desc->flags & CMD_DESC_REGISTER) {
1149 /* 1219 /*
1150 * Get the distance between individual register offset 1220 * Get the distance between individual register offset
@@ -1158,7 +1228,7 @@ static bool check_cmd(const struct intel_engine_cs *engine,
1158 offset += step) { 1228 offset += step) {
1159 const u32 reg_addr = cmd[offset] & desc->reg.mask; 1229 const u32 reg_addr = cmd[offset] & desc->reg.mask;
1160 const struct drm_i915_reg_descriptor *reg = 1230 const struct drm_i915_reg_descriptor *reg =
1161 find_reg(engine, is_master, reg_addr); 1231 find_reg(engine, reg_addr);
1162 1232
1163 if (!reg) { 1233 if (!reg) {
1164 DRM_DEBUG_DRIVER("CMD: Rejected register 0x%08X in command: 0x%08X (%s)\n", 1234 DRM_DEBUG_DRIVER("CMD: Rejected register 0x%08X in command: 0x%08X (%s)\n",
@@ -1236,16 +1306,112 @@ static bool check_cmd(const struct intel_engine_cs *engine,
1236 return true; 1306 return true;
1237} 1307}
1238 1308
1309static int check_bbstart(const struct i915_gem_context *ctx,
1310 u32 *cmd, u32 offset, u32 length,
1311 u32 batch_len,
1312 u64 batch_start,
1313 u64 shadow_batch_start)
1314{
1315 u64 jump_offset, jump_target;
1316 u32 target_cmd_offset, target_cmd_index;
1317
1318 /* For igt compatibility on older platforms */
1319 if (CMDPARSER_USES_GGTT(ctx->i915)) {
1320 DRM_DEBUG("CMD: Rejecting BB_START for ggtt based submission\n");
1321 return -EACCES;
1322 }
1323
1324 if (length != 3) {
1325 DRM_DEBUG("CMD: Recursive BB_START with bad length(%u)\n",
1326 length);
1327 return -EINVAL;
1328 }
1329
1330 jump_target = *(u64*)(cmd+1);
1331 jump_offset = jump_target - batch_start;
1332
1333 /*
 1334 * Any jump_target below batch_start underflows jump_offset to a value far
 1335 * outside the range of a u32, so the >= test catches both too large and too small
1336 */
1337 if (jump_offset >= batch_len) {
1338 DRM_DEBUG("CMD: BB_START to 0x%llx jumps out of BB\n",
1339 jump_target);
1340 return -EINVAL;
1341 }
1342
1343 /*
1344 * This cannot overflow a u32 because we already checked jump_offset
1345 * is within the BB, and the batch_len is a u32
1346 */
1347 target_cmd_offset = lower_32_bits(jump_offset);
1348 target_cmd_index = target_cmd_offset / sizeof(u32);
1349
1350 *(u64*)(cmd + 1) = shadow_batch_start + target_cmd_offset;
1351
1352 if (target_cmd_index == offset)
1353 return 0;
1354
1355 if (ctx->jump_whitelist_cmds <= target_cmd_index) {
1356 DRM_DEBUG("CMD: Rejecting BB_START - truncated whitelist array\n");
1357 return -EINVAL;
1358 } else if (!test_bit(target_cmd_index, ctx->jump_whitelist)) {
1359 DRM_DEBUG("CMD: BB_START to 0x%llx not a previously executed cmd\n",
1360 jump_target);
1361 return -EINVAL;
1362 }
1363
1364 return 0;
1365}
1366
1367static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len)
1368{
1369 const u32 batch_cmds = DIV_ROUND_UP(batch_len, sizeof(u32));
1370 const u32 exact_size = BITS_TO_LONGS(batch_cmds);
1371 u32 next_size = BITS_TO_LONGS(roundup_pow_of_two(batch_cmds));
1372 unsigned long *next_whitelist;
1373
1374 if (CMDPARSER_USES_GGTT(ctx->i915))
1375 return;
1376
1377 if (batch_cmds <= ctx->jump_whitelist_cmds) {
1378 bitmap_zero(ctx->jump_whitelist, batch_cmds);
1379 return;
1380 }
1381
1382again:
1383 next_whitelist = kcalloc(next_size, sizeof(long), GFP_KERNEL);
1384 if (next_whitelist) {
1385 kfree(ctx->jump_whitelist);
1386 ctx->jump_whitelist = next_whitelist;
1387 ctx->jump_whitelist_cmds =
1388 next_size * BITS_PER_BYTE * sizeof(long);
1389 return;
1390 }
1391
1392 if (next_size > exact_size) {
1393 next_size = exact_size;
1394 goto again;
1395 }
1396
1397 DRM_DEBUG("CMD: Failed to extend whitelist. BB_START may be disallowed\n");
1398 bitmap_zero(ctx->jump_whitelist, ctx->jump_whitelist_cmds);
1399
1400 return;
1401}
1402
1239#define LENGTH_BIAS 2 1403#define LENGTH_BIAS 2
1240 1404
1241/** 1405/**
1242 * i915_parse_cmds() - parse a submitted batch buffer for privilege violations 1406 * i915_parse_cmds() - parse a submitted batch buffer for privilege violations
1407 * @ctx: the context in which the batch is to execute
1243 * @engine: the engine on which the batch is to execute 1408 * @engine: the engine on which the batch is to execute
1244 * @batch_obj: the batch buffer in question 1409 * @batch_obj: the batch buffer in question
1245 * @shadow_batch_obj: copy of the batch buffer in question 1410 * @batch_start: Canonical base address of batch
1246 * @batch_start_offset: byte offset in the batch at which execution starts 1411 * @batch_start_offset: byte offset in the batch at which execution starts
1247 * @batch_len: length of the commands in batch_obj 1412 * @batch_len: length of the commands in batch_obj
1248 * @is_master: is the submitting process the drm master? 1413 * @shadow_batch_obj: copy of the batch buffer in question
1414 * @shadow_batch_start: Canonical base address of shadow_batch_obj
1249 * 1415 *
1250 * Parses the specified batch buffer looking for privilege violations as 1416 * Parses the specified batch buffer looking for privilege violations as
1251 * described in the overview. 1417 * described in the overview.
@@ -1253,14 +1419,17 @@ static bool check_cmd(const struct intel_engine_cs *engine,
1253 * Return: non-zero if the parser finds violations or otherwise fails; -EACCES 1419 * Return: non-zero if the parser finds violations or otherwise fails; -EACCES
1254 * if the batch appears legal but should use hardware parsing 1420 * if the batch appears legal but should use hardware parsing
1255 */ 1421 */
1256int intel_engine_cmd_parser(struct intel_engine_cs *engine, 1422
1423int intel_engine_cmd_parser(struct i915_gem_context *ctx,
1424 struct intel_engine_cs *engine,
1257 struct drm_i915_gem_object *batch_obj, 1425 struct drm_i915_gem_object *batch_obj,
1258 struct drm_i915_gem_object *shadow_batch_obj, 1426 u64 batch_start,
1259 u32 batch_start_offset, 1427 u32 batch_start_offset,
1260 u32 batch_len, 1428 u32 batch_len,
1261 bool is_master) 1429 struct drm_i915_gem_object *shadow_batch_obj,
1430 u64 shadow_batch_start)
1262{ 1431{
1263 u32 *cmd, *batch_end; 1432 u32 *cmd, *batch_end, offset = 0;
1264 struct drm_i915_cmd_descriptor default_desc = noop_desc; 1433 struct drm_i915_cmd_descriptor default_desc = noop_desc;
1265 const struct drm_i915_cmd_descriptor *desc = &default_desc; 1434 const struct drm_i915_cmd_descriptor *desc = &default_desc;
1266 bool needs_clflush_after = false; 1435 bool needs_clflush_after = false;
@@ -1274,6 +1443,8 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
1274 return PTR_ERR(cmd); 1443 return PTR_ERR(cmd);
1275 } 1444 }
1276 1445
1446 init_whitelist(ctx, batch_len);
1447
1277 /* 1448 /*
1278 * We use the batch length as size because the shadow object is as 1449 * We use the batch length as size because the shadow object is as
1279 * large or larger and copy_batch() will write MI_NOPs to the extra 1450 * large or larger and copy_batch() will write MI_NOPs to the extra
@@ -1283,31 +1454,15 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
1283 do { 1454 do {
1284 u32 length; 1455 u32 length;
1285 1456
1286 if (*cmd == MI_BATCH_BUFFER_END) { 1457 if (*cmd == MI_BATCH_BUFFER_END)
1287 if (needs_clflush_after) {
1288 void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping);
1289 drm_clflush_virt_range(ptr,
1290 (void *)(cmd + 1) - ptr);
1291 }
1292 break; 1458 break;
1293 }
1294 1459
1295 desc = find_cmd(engine, *cmd, desc, &default_desc); 1460 desc = find_cmd(engine, *cmd, desc, &default_desc);
1296 if (!desc) { 1461 if (!desc) {
1297 DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n", 1462 DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
1298 *cmd); 1463 *cmd);
1299 ret = -EINVAL; 1464 ret = -EINVAL;
1300 break; 1465 goto err;
1301 }
1302
1303 /*
1304 * If the batch buffer contains a chained batch, return an
1305 * error that tells the caller to abort and dispatch the
1306 * workload as a non-secure batch.
1307 */
1308 if (desc->cmd.value == MI_BATCH_BUFFER_START) {
1309 ret = -EACCES;
1310 break;
1311 } 1466 }
1312 1467
1313 if (desc->flags & CMD_DESC_FIXED) 1468 if (desc->flags & CMD_DESC_FIXED)
@@ -1321,22 +1476,43 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
1321 length, 1476 length,
1322 batch_end - cmd); 1477 batch_end - cmd);
1323 ret = -EINVAL; 1478 ret = -EINVAL;
1324 break; 1479 goto err;
1325 } 1480 }
1326 1481
1327 if (!check_cmd(engine, desc, cmd, length, is_master)) { 1482 if (!check_cmd(engine, desc, cmd, length)) {
1328 ret = -EACCES; 1483 ret = -EACCES;
1484 goto err;
1485 }
1486
1487 if (desc->cmd.value == MI_BATCH_BUFFER_START) {
1488 ret = check_bbstart(ctx, cmd, offset, length,
1489 batch_len, batch_start,
1490 shadow_batch_start);
1491
1492 if (ret)
1493 goto err;
1329 break; 1494 break;
1330 } 1495 }
1331 1496
1497 if (ctx->jump_whitelist_cmds > offset)
1498 set_bit(offset, ctx->jump_whitelist);
1499
1332 cmd += length; 1500 cmd += length;
1501 offset += length;
1333 if (cmd >= batch_end) { 1502 if (cmd >= batch_end) {
1334 DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n"); 1503 DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
1335 ret = -EINVAL; 1504 ret = -EINVAL;
1336 break; 1505 goto err;
1337 } 1506 }
1338 } while (1); 1507 } while (1);
1339 1508
1509 if (needs_clflush_after) {
1510 void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping);
1511
1512 drm_clflush_virt_range(ptr, (void *)(cmd + 1) - ptr);
1513 }
1514
1515err:
1340 i915_gem_object_unpin_map(shadow_batch_obj); 1516 i915_gem_object_unpin_map(shadow_batch_obj);
1341 return ret; 1517 return ret;
1342} 1518}
@@ -1357,7 +1533,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv)
1357 1533
1358 /* If the command parser is not enabled, report 0 - unsupported */ 1534 /* If the command parser is not enabled, report 0 - unsupported */
1359 for_each_uabi_engine(engine, dev_priv) { 1535 for_each_uabi_engine(engine, dev_priv) {
1360 if (intel_engine_needs_cmd_parser(engine)) { 1536 if (intel_engine_using_cmd_parser(engine)) {
1361 active = true; 1537 active = true;
1362 break; 1538 break;
1363 } 1539 }
@@ -1382,6 +1558,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv)
1382 * the parser enabled. 1558 * the parser enabled.
1383 * 9. Don't whitelist or handle oacontrol specially, as ownership 1559 * 9. Don't whitelist or handle oacontrol specially, as ownership
1384 * for oacontrol state is moving to i915-perf. 1560 * for oacontrol state is moving to i915-perf.
1561 * 10. Support for Gen9 BCS Parsing
1385 */ 1562 */
1386 return 9; 1563 return 10;
1387} 1564}
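
A minimal standalone sketch of the jump-target bounds check that check_bbstart() adds above. The helper and the test values are illustrative only; the point is that computing the offset as a u64 lets a single comparison reject targets both below and beyond the batch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Mirror of the check in check_bbstart(): a BB_START target below
 * batch_start wraps jump_offset around to a huge u64, so the single
 * "jump_offset < batch_len" test rejects both out-of-range directions.
 */
static bool jump_target_in_batch(uint64_t batch_start, uint32_t batch_len,
				 uint64_t jump_target)
{
	uint64_t jump_offset = jump_target - batch_start;

	return jump_offset < batch_len;
}

int main(void)
{
	/* 4 KiB batch at 0x10000: in-range, one-past-the-end, below-start. */
	printf("%d %d %d\n",
	       jump_target_in_batch(0x10000, 0x1000, 0x10800),  /* 1 */
	       jump_target_in_batch(0x10000, 0x1000, 0x11000),  /* 0 */
	       jump_target_in_batch(0x10000, 0x1000, 0xff00));  /* 0 */
	return 0;
}
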
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index bb6f86c7067a..fe4d7cabfdf1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1850,6 +1850,8 @@ static int i915_drm_suspend_late(struct drm_device *dev, bool hibernation)
1850 1850
1851 i915_gem_suspend_late(dev_priv); 1851 i915_gem_suspend_late(dev_priv);
1852 1852
1853 i915_rc6_ctx_wa_suspend(dev_priv);
1854
1853 intel_uncore_suspend(&dev_priv->uncore); 1855 intel_uncore_suspend(&dev_priv->uncore);
1854 1856
1855 intel_power_domains_suspend(dev_priv, 1857 intel_power_domains_suspend(dev_priv,
@@ -2053,6 +2055,8 @@ static int i915_drm_resume_early(struct drm_device *dev)
2053 2055
2054 intel_power_domains_resume(dev_priv); 2056 intel_power_domains_resume(dev_priv);
2055 2057
2058 i915_rc6_ctx_wa_resume(dev_priv);
2059
2056 intel_gt_sanitize(&dev_priv->gt, true); 2060 intel_gt_sanitize(&dev_priv->gt, true);
2057 2061
2058 enable_rpm_wakeref_asserts(&dev_priv->runtime_pm); 2062 enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 953e1d12c23c..89b6112bd66b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -593,6 +593,8 @@ struct intel_rps {
593 593
594struct intel_rc6 { 594struct intel_rc6 {
595 bool enabled; 595 bool enabled;
596 bool ctx_corrupted;
597 intel_wakeref_t ctx_corrupted_wakeref;
596 u64 prev_hw_residency[4]; 598 u64 prev_hw_residency[4];
597 u64 cur_residency[4]; 599 u64 cur_residency[4];
598}; 600};
@@ -2075,9 +2077,16 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
2075#define VEBOX_MASK(dev_priv) \ 2077#define VEBOX_MASK(dev_priv) \
2076 ENGINE_INSTANCES_MASK(dev_priv, VECS0, I915_MAX_VECS) 2078 ENGINE_INSTANCES_MASK(dev_priv, VECS0, I915_MAX_VECS)
2077 2079
2080/*
2081 * The Gen7 cmdparser copies the scanned buffer to the ggtt for execution
2082 * All later gens can run the final buffer from the ppgtt
2083 */
2084#define CMDPARSER_USES_GGTT(dev_priv) IS_GEN(dev_priv, 7)
2085
2078#define HAS_LLC(dev_priv) (INTEL_INFO(dev_priv)->has_llc) 2086#define HAS_LLC(dev_priv) (INTEL_INFO(dev_priv)->has_llc)
2079#define HAS_SNOOP(dev_priv) (INTEL_INFO(dev_priv)->has_snoop) 2087#define HAS_SNOOP(dev_priv) (INTEL_INFO(dev_priv)->has_snoop)
2080#define HAS_EDRAM(dev_priv) ((dev_priv)->edram_size_mb) 2088#define HAS_EDRAM(dev_priv) ((dev_priv)->edram_size_mb)
2089#define HAS_SECURE_BATCHES(dev_priv) (INTEL_GEN(dev_priv) < 6)
2081#define HAS_WT(dev_priv) ((IS_HASWELL(dev_priv) || \ 2090#define HAS_WT(dev_priv) ((IS_HASWELL(dev_priv) || \
2082 IS_BROADWELL(dev_priv)) && HAS_EDRAM(dev_priv)) 2091 IS_BROADWELL(dev_priv)) && HAS_EDRAM(dev_priv))
2083 2092
@@ -2110,10 +2119,12 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
2110/* Early gen2 have a totally busted CS tlb and require pinned batches. */ 2119/* Early gen2 have a totally busted CS tlb and require pinned batches. */
2111#define HAS_BROKEN_CS_TLB(dev_priv) (IS_I830(dev_priv) || IS_I845G(dev_priv)) 2120#define HAS_BROKEN_CS_TLB(dev_priv) (IS_I830(dev_priv) || IS_I845G(dev_priv))
2112 2121
2122#define NEEDS_RC6_CTX_CORRUPTION_WA(dev_priv) \
2123 (IS_BROADWELL(dev_priv) || IS_GEN(dev_priv, 9))
2124
2113/* WaRsDisableCoarsePowerGating:skl,cnl */ 2125/* WaRsDisableCoarsePowerGating:skl,cnl */
2114#define NEEDS_WaRsDisableCoarsePowerGating(dev_priv) \ 2126#define NEEDS_WaRsDisableCoarsePowerGating(dev_priv) \
2115 (IS_CANNONLAKE(dev_priv) || \ 2127 (IS_CANNONLAKE(dev_priv) || IS_GEN(dev_priv, 9))
2116 IS_SKL_GT3(dev_priv) || IS_SKL_GT4(dev_priv))
2117 2128
2118#define HAS_GMBUS_IRQ(dev_priv) (INTEL_GEN(dev_priv) >= 4) 2129#define HAS_GMBUS_IRQ(dev_priv) (INTEL_GEN(dev_priv) >= 4)
2119#define HAS_GMBUS_BURST_READ(dev_priv) (INTEL_GEN(dev_priv) >= 10 || \ 2130#define HAS_GMBUS_BURST_READ(dev_priv) (INTEL_GEN(dev_priv) >= 10 || \
@@ -2284,6 +2295,14 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj,
2284 unsigned long flags); 2295 unsigned long flags);
2285#define I915_GEM_OBJECT_UNBIND_ACTIVE BIT(0) 2296#define I915_GEM_OBJECT_UNBIND_ACTIVE BIT(0)
2286 2297
2298struct i915_vma * __must_check
2299i915_gem_object_pin(struct drm_i915_gem_object *obj,
2300 struct i915_address_space *vm,
2301 const struct i915_ggtt_view *view,
2302 u64 size,
2303 u64 alignment,
2304 u64 flags);
2305
2287void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv); 2306void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv);
2288 2307
2289static inline int __must_check 2308static inline int __must_check
@@ -2393,12 +2412,14 @@ const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
2393int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv); 2412int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv);
2394void intel_engine_init_cmd_parser(struct intel_engine_cs *engine); 2413void intel_engine_init_cmd_parser(struct intel_engine_cs *engine);
2395void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine); 2414void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine);
2396int intel_engine_cmd_parser(struct intel_engine_cs *engine, 2415int intel_engine_cmd_parser(struct i915_gem_context *cxt,
2416 struct intel_engine_cs *engine,
2397 struct drm_i915_gem_object *batch_obj, 2417 struct drm_i915_gem_object *batch_obj,
2398 struct drm_i915_gem_object *shadow_batch_obj, 2418 u64 user_batch_start,
2399 u32 batch_start_offset, 2419 u32 batch_start_offset,
2400 u32 batch_len, 2420 u32 batch_len,
2401 bool is_master); 2421 struct drm_i915_gem_object *shadow_batch_obj,
2422 u64 shadow_batch_start);
2402 2423
2403/* intel_device_info.c */ 2424/* intel_device_info.c */
2404static inline struct intel_device_info * 2425static inline struct intel_device_info *
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d0f94f239919..98305d987ac1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -964,6 +964,20 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
964{ 964{
965 struct drm_i915_private *dev_priv = to_i915(obj->base.dev); 965 struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
966 struct i915_address_space *vm = &dev_priv->ggtt.vm; 966 struct i915_address_space *vm = &dev_priv->ggtt.vm;
967
968 return i915_gem_object_pin(obj, vm, view, size, alignment,
969 flags | PIN_GLOBAL);
970}
971
972struct i915_vma *
973i915_gem_object_pin(struct drm_i915_gem_object *obj,
974 struct i915_address_space *vm,
975 const struct i915_ggtt_view *view,
976 u64 size,
977 u64 alignment,
978 u64 flags)
979{
980 struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
967 struct i915_vma *vma; 981 struct i915_vma *vma;
968 int ret; 982 int ret;
969 983
@@ -1038,7 +1052,7 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
1038 return ERR_PTR(ret); 1052 return ERR_PTR(ret);
1039 } 1053 }
1040 1054
1041 ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL); 1055 ret = i915_vma_pin(vma, size, alignment, flags);
1042 if (ret) 1056 if (ret)
1043 return ERR_PTR(ret); 1057 return ERR_PTR(ret);
1044 1058
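
The split above is what lets the command parser's shadow batch be pinned into either address space. The execbuffer changes that consume it are not shown in this excerpt, so the helper below is only a sketch of the intended call pattern; its name and the ppgtt_vm parameter are made up:

#include "i915_drv.h"

static struct i915_vma *
example_pin_shadow(struct drm_i915_private *i915,
		   struct i915_address_space *ppgtt_vm,
		   struct drm_i915_gem_object *shadow)
{
	/* The gen7 parser still executes the scanned copy from the ggtt ... */
	if (CMDPARSER_USES_GGTT(i915))
		return i915_gem_object_pin(shadow, &i915->ggtt.vm, NULL,
					   0, 0, PIN_GLOBAL);

	/* ... while gen9 can run it from the context's ppgtt after scanning. */
	return i915_gem_object_pin(shadow, ppgtt_vm, NULL, 0, 0, PIN_USER);
}
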
diff --git a/drivers/gpu/drm/i915/i915_getparam.c b/drivers/gpu/drm/i915/i915_getparam.c
index 5d9101376a3d..9f1517af5b7f 100644
--- a/drivers/gpu/drm/i915/i915_getparam.c
+++ b/drivers/gpu/drm/i915/i915_getparam.c
@@ -62,7 +62,7 @@ int i915_getparam_ioctl(struct drm_device *dev, void *data,
62 value = !!(i915->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES); 62 value = !!(i915->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES);
63 break; 63 break;
64 case I915_PARAM_HAS_SECURE_BATCHES: 64 case I915_PARAM_HAS_SECURE_BATCHES:
65 value = capable(CAP_SYS_ADMIN); 65 value = HAS_SECURE_BATCHES(i915) && capable(CAP_SYS_ADMIN);
66 break; 66 break;
67 case I915_PARAM_CMD_PARSER_VERSION: 67 case I915_PARAM_CMD_PARSER_VERSION:
68 value = i915_cmd_parser_get_version(i915); 68 value = i915_cmd_parser_get_version(i915);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 2abd199093c5..f8ee9aba3955 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -471,6 +471,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
471#define ECOCHK_PPGTT_WT_HSW (0x2 << 3) 471#define ECOCHK_PPGTT_WT_HSW (0x2 << 3)
472#define ECOCHK_PPGTT_WB_HSW (0x3 << 3) 472#define ECOCHK_PPGTT_WB_HSW (0x3 << 3)
473 473
474#define GEN8_RC6_CTX_INFO _MMIO(0x8504)
475
474#define GAC_ECO_BITS _MMIO(0x14090) 476#define GAC_ECO_BITS _MMIO(0x14090)
475#define ECOBITS_SNB_BIT (1 << 13) 477#define ECOBITS_SNB_BIT (1 << 13)
476#define ECOBITS_PPGTT_CACHE64B (3 << 8) 478#define ECOBITS_PPGTT_CACHE64B (3 << 8)
@@ -555,6 +557,10 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
555 */ 557 */
556#define BCS_SWCTRL _MMIO(0x22200) 558#define BCS_SWCTRL _MMIO(0x22200)
557 559
560/* There are 16 GPR registers */
561#define BCS_GPR(n) _MMIO(0x22600 + (n) * 8)
562#define BCS_GPR_UDW(n) _MMIO(0x22600 + (n) * 8 + 4)
563
558#define GPGPU_THREADS_DISPATCHED _MMIO(0x2290) 564#define GPGPU_THREADS_DISPATCHED _MMIO(0x2290)
559#define GPGPU_THREADS_DISPATCHED_UDW _MMIO(0x2290 + 4) 565#define GPGPU_THREADS_DISPATCHED_UDW _MMIO(0x2290 + 4)
560#define HS_INVOCATION_COUNT _MMIO(0x2300) 566#define HS_INVOCATION_COUNT _MMIO(0x2300)
@@ -7211,6 +7217,10 @@ enum {
7211#define TGL_DMC_DEBUG_DC5_COUNT _MMIO(0x101084) 7217#define TGL_DMC_DEBUG_DC5_COUNT _MMIO(0x101084)
7212#define TGL_DMC_DEBUG_DC6_COUNT _MMIO(0x101088) 7218#define TGL_DMC_DEBUG_DC6_COUNT _MMIO(0x101088)
7213 7219
7220/* Display Internal Timeout Register */
7221#define RM_TIMEOUT _MMIO(0x42060)
7222#define MMIO_TIMEOUT_US(us) ((us) << 0)
7223
7214/* interrupts */ 7224/* interrupts */
7215#define DE_MASTER_IRQ_CONTROL (1 << 31) 7225#define DE_MASTER_IRQ_CONTROL (1 << 31)
7216#define DE_SPRITEB_FLIP_DONE (1 << 29) 7226#define DE_SPRITEB_FLIP_DONE (1 << 29)
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 75ee027abb80..2efe1d12d5a9 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -126,6 +126,14 @@ static void bxt_init_clock_gating(struct drm_i915_private *dev_priv)
126 */ 126 */
127 I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) | 127 I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) |
128 PWM1_GATING_DIS | PWM2_GATING_DIS); 128 PWM1_GATING_DIS | PWM2_GATING_DIS);
129
130 /*
131 * Lower the display internal timeout.
132 * This is needed to avoid any hard hangs when DSI port PLL
 133 * is off and an MMIO access is attempted by any privileged
134 * application, using batch buffers or any other means.
135 */
136 I915_WRITE(RM_TIMEOUT, MMIO_TIMEOUT_US(950));
129} 137}
130 138
131static void glk_init_clock_gating(struct drm_i915_private *dev_priv) 139static void glk_init_clock_gating(struct drm_i915_private *dev_priv)
@@ -8544,6 +8552,100 @@ static void intel_init_emon(struct drm_i915_private *dev_priv)
8544 dev_priv->ips.corr = (lcfuse & LCFUSE_HIV_MASK); 8552 dev_priv->ips.corr = (lcfuse & LCFUSE_HIV_MASK);
8545} 8553}
8546 8554
8555static bool i915_rc6_ctx_corrupted(struct drm_i915_private *dev_priv)
8556{
8557 return !I915_READ(GEN8_RC6_CTX_INFO);
8558}
8559
8560static void i915_rc6_ctx_wa_init(struct drm_i915_private *i915)
8561{
8562 if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915))
8563 return;
8564
8565 if (i915_rc6_ctx_corrupted(i915)) {
8566 DRM_INFO("RC6 context corrupted, disabling runtime power management\n");
8567 i915->gt_pm.rc6.ctx_corrupted = true;
8568 i915->gt_pm.rc6.ctx_corrupted_wakeref =
8569 intel_runtime_pm_get(&i915->runtime_pm);
8570 }
8571}
8572
8573static void i915_rc6_ctx_wa_cleanup(struct drm_i915_private *i915)
8574{
8575 if (i915->gt_pm.rc6.ctx_corrupted) {
8576 intel_runtime_pm_put(&i915->runtime_pm,
8577 i915->gt_pm.rc6.ctx_corrupted_wakeref);
8578 i915->gt_pm.rc6.ctx_corrupted = false;
8579 }
8580}
8581
8582/**
8583 * i915_rc6_ctx_wa_suspend - system suspend sequence for the RC6 CTX WA
8584 * @i915: i915 device
8585 *
8586 * Perform any steps needed to clean up the RC6 CTX WA before system suspend.
8587 */
8588void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915)
8589{
8590 if (i915->gt_pm.rc6.ctx_corrupted)
8591 intel_runtime_pm_put(&i915->runtime_pm,
8592 i915->gt_pm.rc6.ctx_corrupted_wakeref);
8593}
8594
8595/**
8596 * i915_rc6_ctx_wa_resume - system resume sequence for the RC6 CTX WA
8597 * @i915: i915 device
8598 *
8599 * Perform any steps needed to re-init the RC6 CTX WA after system resume.
8600 */
8601void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915)
8602{
8603 if (!i915->gt_pm.rc6.ctx_corrupted)
8604 return;
8605
8606 if (i915_rc6_ctx_corrupted(i915)) {
8607 i915->gt_pm.rc6.ctx_corrupted_wakeref =
8608 intel_runtime_pm_get(&i915->runtime_pm);
8609 return;
8610 }
8611
8612 DRM_INFO("RC6 context restored, re-enabling runtime power management\n");
8613 i915->gt_pm.rc6.ctx_corrupted = false;
8614}
8615
8616static void intel_disable_rc6(struct drm_i915_private *dev_priv);
8617
8618/**
8619 * i915_rc6_ctx_wa_check - check for a new RC6 CTX corruption
8620 * @i915: i915 device
8621 *
8622 * Check if an RC6 CTX corruption has happened since the last check and if so
8623 * disable RC6 and runtime power management.
8624 *
8625 * Return false if no context corruption has happened since the last call of
8626 * this function, true otherwise.
8627*/
8628bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915)
8629{
8630 if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915))
8631 return false;
8632
8633 if (i915->gt_pm.rc6.ctx_corrupted)
8634 return false;
8635
8636 if (!i915_rc6_ctx_corrupted(i915))
8637 return false;
8638
8639 DRM_NOTE("RC6 context corruption, disabling runtime power management\n");
8640
8641 intel_disable_rc6(i915);
8642 i915->gt_pm.rc6.ctx_corrupted = true;
8643 i915->gt_pm.rc6.ctx_corrupted_wakeref =
8644 intel_runtime_pm_get_noresume(&i915->runtime_pm);
8645
8646 return true;
8647}
8648
8547void intel_init_gt_powersave(struct drm_i915_private *dev_priv) 8649void intel_init_gt_powersave(struct drm_i915_private *dev_priv)
8548{ 8650{
8549 struct intel_rps *rps = &dev_priv->gt_pm.rps; 8651 struct intel_rps *rps = &dev_priv->gt_pm.rps;
@@ -8557,6 +8659,8 @@ void intel_init_gt_powersave(struct drm_i915_private *dev_priv)
8557 pm_runtime_get(&dev_priv->drm.pdev->dev); 8659 pm_runtime_get(&dev_priv->drm.pdev->dev);
8558 } 8660 }
8559 8661
8662 i915_rc6_ctx_wa_init(dev_priv);
8663
8560 /* Initialize RPS limits (for userspace) */ 8664 /* Initialize RPS limits (for userspace) */
8561 if (IS_CHERRYVIEW(dev_priv)) 8665 if (IS_CHERRYVIEW(dev_priv))
8562 cherryview_init_gt_powersave(dev_priv); 8666 cherryview_init_gt_powersave(dev_priv);
@@ -8595,6 +8699,8 @@ void intel_cleanup_gt_powersave(struct drm_i915_private *dev_priv)
8595 if (IS_VALLEYVIEW(dev_priv)) 8699 if (IS_VALLEYVIEW(dev_priv))
8596 valleyview_cleanup_gt_powersave(dev_priv); 8700 valleyview_cleanup_gt_powersave(dev_priv);
8597 8701
8702 i915_rc6_ctx_wa_cleanup(dev_priv);
8703
8598 if (!HAS_RC6(dev_priv)) 8704 if (!HAS_RC6(dev_priv))
8599 pm_runtime_put(&dev_priv->drm.pdev->dev); 8705 pm_runtime_put(&dev_priv->drm.pdev->dev);
8600} 8706}
@@ -8623,7 +8729,7 @@ static inline void intel_disable_llc_pstate(struct drm_i915_private *i915)
8623 i915->gt_pm.llc_pstate.enabled = false; 8729 i915->gt_pm.llc_pstate.enabled = false;
8624} 8730}
8625 8731
8626static void intel_disable_rc6(struct drm_i915_private *dev_priv) 8732static void __intel_disable_rc6(struct drm_i915_private *dev_priv)
8627{ 8733{
8628 lockdep_assert_held(&dev_priv->gt_pm.rps.lock); 8734 lockdep_assert_held(&dev_priv->gt_pm.rps.lock);
8629 8735
@@ -8642,6 +8748,15 @@ static void intel_disable_rc6(struct drm_i915_private *dev_priv)
8642 dev_priv->gt_pm.rc6.enabled = false; 8748 dev_priv->gt_pm.rc6.enabled = false;
8643} 8749}
8644 8750
8751static void intel_disable_rc6(struct drm_i915_private *dev_priv)
8752{
8753 struct intel_rps *rps = &dev_priv->gt_pm.rps;
8754
8755 mutex_lock(&rps->lock);
8756 __intel_disable_rc6(dev_priv);
8757 mutex_unlock(&rps->lock);
8758}
8759
8645static void intel_disable_rps(struct drm_i915_private *dev_priv) 8760static void intel_disable_rps(struct drm_i915_private *dev_priv)
8646{ 8761{
8647 lockdep_assert_held(&dev_priv->gt_pm.rps.lock); 8762 lockdep_assert_held(&dev_priv->gt_pm.rps.lock);
@@ -8667,7 +8782,7 @@ void intel_disable_gt_powersave(struct drm_i915_private *dev_priv)
8667{ 8782{
8668 mutex_lock(&dev_priv->gt_pm.rps.lock); 8783 mutex_lock(&dev_priv->gt_pm.rps.lock);
8669 8784
8670 intel_disable_rc6(dev_priv); 8785 __intel_disable_rc6(dev_priv);
8671 intel_disable_rps(dev_priv); 8786 intel_disable_rps(dev_priv);
8672 if (HAS_LLC(dev_priv)) 8787 if (HAS_LLC(dev_priv))
8673 intel_disable_llc_pstate(dev_priv); 8788 intel_disable_llc_pstate(dev_priv);
@@ -8694,6 +8809,9 @@ static void intel_enable_rc6(struct drm_i915_private *dev_priv)
8694 if (dev_priv->gt_pm.rc6.enabled) 8809 if (dev_priv->gt_pm.rc6.enabled)
8695 return; 8810 return;
8696 8811
8812 if (dev_priv->gt_pm.rc6.ctx_corrupted)
8813 return;
8814
8697 if (IS_CHERRYVIEW(dev_priv)) 8815 if (IS_CHERRYVIEW(dev_priv))
8698 cherryview_enable_rc6(dev_priv); 8816 cherryview_enable_rc6(dev_priv);
8699 else if (IS_VALLEYVIEW(dev_priv)) 8817 else if (IS_VALLEYVIEW(dev_priv))
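
i915_rc6_ctx_wa_check() is defined above, but its callers (the GT power-management and hang-handling paths touched elsewhere in this series, e.g. intel_gt_pm.c) are not shown here. A hypothetical call site would look roughly like:

#include "i915_drv.h"
#include "intel_pm.h"

/*
 * Hypothetical call site: run the check before the GT would re-enter RC6,
 * so a corrupted RC6 context gets RC6 and runtime PM disabled first.
 */
static void example_before_rc6(struct drm_i915_private *i915)
{
	if (i915_rc6_ctx_wa_check(i915))
		DRM_DEBUG_DRIVER("RC6 context corruption handled, RC6 stays off\n");
}
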
diff --git a/drivers/gpu/drm/i915/intel_pm.h b/drivers/gpu/drm/i915/intel_pm.h
index e3573e1e16e3..0f7390c850ec 100644
--- a/drivers/gpu/drm/i915/intel_pm.h
+++ b/drivers/gpu/drm/i915/intel_pm.h
@@ -36,6 +36,9 @@ void intel_cleanup_gt_powersave(struct drm_i915_private *dev_priv);
36void intel_sanitize_gt_powersave(struct drm_i915_private *dev_priv); 36void intel_sanitize_gt_powersave(struct drm_i915_private *dev_priv);
37void intel_enable_gt_powersave(struct drm_i915_private *dev_priv); 37void intel_enable_gt_powersave(struct drm_i915_private *dev_priv);
38void intel_disable_gt_powersave(struct drm_i915_private *dev_priv); 38void intel_disable_gt_powersave(struct drm_i915_private *dev_priv);
39bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915);
40void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915);
41void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915);
39void gen6_rps_busy(struct drm_i915_private *dev_priv); 42void gen6_rps_busy(struct drm_i915_private *dev_priv);
40void gen6_rps_idle(struct drm_i915_private *dev_priv); 43void gen6_rps_idle(struct drm_i915_private *dev_priv);
41void gen6_rps_boost(struct i915_request *rq); 44void gen6_rps_boost(struct i915_request *rq);
diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c
index 6afad68e5ba2..238240984bc1 100644
--- a/drivers/scsi/qla2xxx/qla_mid.c
+++ b/drivers/scsi/qla2xxx/qla_mid.c
@@ -76,9 +76,11 @@ qla24xx_deallocate_vp_id(scsi_qla_host_t *vha)
76 * ensures no active vp_list traversal while the vport is removed 76 * ensures no active vp_list traversal while the vport is removed
77 * from the queue) 77 * from the queue)
78 */ 78 */
79 for (i = 0; i < 10 && atomic_read(&vha->vref_count); i++) 79 for (i = 0; i < 10; i++) {
80 wait_event_timeout(vha->vref_waitq, 80 if (wait_event_timeout(vha->vref_waitq,
81 atomic_read(&vha->vref_count), HZ); 81 !atomic_read(&vha->vref_count), HZ) > 0)
82 break;
83 }
82 84
83 spin_lock_irqsave(&ha->vport_slock, flags); 85 spin_lock_irqsave(&ha->vport_slock, flags);
84 if (atomic_read(&vha->vref_count)) { 86 if (atomic_read(&vha->vref_count)) {
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 337162ac3a77..726ad4cbf4a6 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -1119,9 +1119,11 @@ qla2x00_wait_for_sess_deletion(scsi_qla_host_t *vha)
1119 1119
1120 qla2x00_mark_all_devices_lost(vha, 0); 1120 qla2x00_mark_all_devices_lost(vha, 0);
1121 1121
1122 for (i = 0; i < 10; i++) 1122 for (i = 0; i < 10; i++) {
1123 wait_event_timeout(vha->fcport_waitQ, test_fcport_count(vha), 1123 if (wait_event_timeout(vha->fcport_waitQ,
1124 HZ); 1124 test_fcport_count(vha), HZ) > 0)
1125 break;
1126 }
1125 1127
1126 flush_workqueue(vha->hw->wq); 1128 flush_workqueue(vha->hw->wq);
1127} 1129}
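
Both qla2xxx hunks converge on the same retry idiom: wait in one-second slices, at most ten times, and break out as soon as wait_event_timeout() reports the condition true (return value > 0) rather than relying on the loop guard. A kernel-style sketch of that idiom, with made-up names:

#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/wait.h>

struct example_ctx {
	wait_queue_head_t waitq;
	atomic_t ref_count;
};

/* Wait up to ~10s for all references to drop, returning early on success. */
static void example_wait_for_refs(struct example_ctx *ctx)
{
	int i;

	for (i = 0; i < 10; i++) {
		if (wait_event_timeout(ctx->waitq,
				       !atomic_read(&ctx->ref_count), HZ) > 0)
			break;
	}
}
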
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5447738906ac..91c007d26c1e 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1883,7 +1883,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
1883{ 1883{
1884 unsigned int cmd_size, sgl_size; 1884 unsigned int cmd_size, sgl_size;
1885 1885
1886 sgl_size = scsi_mq_inline_sgl_size(shost); 1886 sgl_size = max_t(unsigned int, sizeof(struct scatterlist),
1887 scsi_mq_inline_sgl_size(shost));
1887 cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + sgl_size; 1888 cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + sgl_size;
1888 if (scsi_host_get_prot(shost)) 1889 if (scsi_host_get_prot(shost))
1889 cmd_size += sizeof(struct scsi_data_buffer) + 1890 cmd_size += sizeof(struct scsi_data_buffer) +
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index de4019dc0f0b..1efc69e194f8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -263,25 +263,16 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
263 int result = cmd->result; 263 int result = cmd->result;
264 struct request *rq = cmd->request; 264 struct request *rq = cmd->request;
265 265
266 switch (req_op(rq)) { 266 if (req_op(rq) == REQ_OP_ZONE_RESET &&
267 case REQ_OP_ZONE_RESET: 267 result &&
268 case REQ_OP_ZONE_RESET_ALL: 268 sshdr->sense_key == ILLEGAL_REQUEST &&
269 269 sshdr->asc == 0x24) {
270 if (result && 270 /*
271 sshdr->sense_key == ILLEGAL_REQUEST && 271 * INVALID FIELD IN CDB error: reset of a conventional
272 sshdr->asc == 0x24) 272 * zone was attempted. Nothing to worry about, so be
273 /* 273 * quiet about the error.
274 * INVALID FIELD IN CDB error: reset of a conventional 274 */
275 * zone was attempted. Nothing to worry about, so be 275 rq->rq_flags |= RQF_QUIET;
276 * quiet about the error.
277 */
278 rq->rq_flags |= RQF_QUIET;
279 break;
280
281 case REQ_OP_WRITE:
282 case REQ_OP_WRITE_ZEROES:
283 case REQ_OP_WRITE_SAME:
284 break;
285 } 276 }
286} 277}
287 278
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d0633ebdaa9c..bc6c879bd110 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -59,6 +59,11 @@ extern ssize_t cpu_show_l1tf(struct device *dev,
59 struct device_attribute *attr, char *buf); 59 struct device_attribute *attr, char *buf);
60extern ssize_t cpu_show_mds(struct device *dev, 60extern ssize_t cpu_show_mds(struct device *dev,
61 struct device_attribute *attr, char *buf); 61 struct device_attribute *attr, char *buf);
62extern ssize_t cpu_show_tsx_async_abort(struct device *dev,
63 struct device_attribute *attr,
64 char *buf);
65extern ssize_t cpu_show_itlb_multihit(struct device *dev,
66 struct device_attribute *attr, char *buf);
62 67
63extern __printf(4, 5) 68extern __printf(4, 5)
64struct device *cpu_device_create(struct device *parent, void *drvdata, 69struct device *cpu_device_create(struct device *parent, void *drvdata,
@@ -213,28 +218,7 @@ static inline int cpuhp_smt_enable(void) { return 0; }
213static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } 218static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; }
214#endif 219#endif
215 220
216/* 221extern bool cpu_mitigations_off(void);
217 * These are used for a global "mitigations=" cmdline option for toggling 222extern bool cpu_mitigations_auto_nosmt(void);
218 * optional CPU mitigations.
219 */
220enum cpu_mitigations {
221 CPU_MITIGATIONS_OFF,
222 CPU_MITIGATIONS_AUTO,
223 CPU_MITIGATIONS_AUTO_NOSMT,
224};
225
226extern enum cpu_mitigations cpu_mitigations;
227
228/* mitigations=off */
229static inline bool cpu_mitigations_off(void)
230{
231 return cpu_mitigations == CPU_MITIGATIONS_OFF;
232}
233
234/* mitigations=auto,nosmt */
235static inline bool cpu_mitigations_auto_nosmt(void)
236{
237 return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
238}
239 223
240#endif /* _LINUX_CPU_H_ */ 224#endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 290dbe353a47..d41c521a39da 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1383,4 +1383,10 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
1383} 1383}
1384#endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ 1384#endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */
1385 1385
1386typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
1387
1388int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
1389 uintptr_t data, const char *name,
1390 struct task_struct **thread_ptr);
1391
1386#endif 1392#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fc28e17940e0..e2cad3ee2ead 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2373,7 +2373,18 @@ void __init boot_cpu_hotplug_init(void)
2373 this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); 2373 this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
2374} 2374}
2375 2375
2376enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; 2376/*
2377 * These are used for a global "mitigations=" cmdline option for toggling
2378 * optional CPU mitigations.
2379 */
2380enum cpu_mitigations {
2381 CPU_MITIGATIONS_OFF,
2382 CPU_MITIGATIONS_AUTO,
2383 CPU_MITIGATIONS_AUTO_NOSMT,
2384};
2385
2386static enum cpu_mitigations cpu_mitigations __ro_after_init =
2387 CPU_MITIGATIONS_AUTO;
2377 2388
2378static int __init mitigations_parse_cmdline(char *arg) 2389static int __init mitigations_parse_cmdline(char *arg)
2379{ 2390{
@@ -2390,3 +2401,17 @@ static int __init mitigations_parse_cmdline(char *arg)
2390 return 0; 2401 return 0;
2391} 2402}
2392early_param("mitigations", mitigations_parse_cmdline); 2403early_param("mitigations", mitigations_parse_cmdline);
2404
2405/* mitigations=off */
2406bool cpu_mitigations_off(void)
2407{
2408 return cpu_mitigations == CPU_MITIGATIONS_OFF;
2409}
2410EXPORT_SYMBOL_GPL(cpu_mitigations_off);
2411
2412/* mitigations=auto,nosmt */
2413bool cpu_mitigations_auto_nosmt(void)
2414{
2415 return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
2416}
2417EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
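
With the enum now private to kernel/cpu.c, other code only sees the two boolean accessors. A hypothetical arch-side consumer (not part of this series) would use them roughly like:

#include <linux/cpu.h>
#include <linux/types.h>

/* Illustrative only: how a mitigation-selection path consults the accessors. */
static void example_select_mitigation(bool *mitigate, bool *disable_smt)
{
	if (cpu_mitigations_off()) {
		/* "mitigations=off": leave the vulnerability unmitigated. */
		*mitigate = false;
		*disable_smt = false;
		return;
	}

	*mitigate = true;
	/* "mitigations=auto,nosmt": additionally request SMT off. */
	*disable_smt = cpu_mitigations_auto_nosmt();
}
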
diff --git a/kernel/signal.c b/kernel/signal.c
index c4da1ef56fdf..bcd46f547db3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2205,8 +2205,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
2205 */ 2205 */
2206 preempt_disable(); 2206 preempt_disable();
2207 read_unlock(&tasklist_lock); 2207 read_unlock(&tasklist_lock);
2208 preempt_enable_no_resched();
2209 cgroup_enter_frozen(); 2208 cgroup_enter_frozen();
2209 preempt_enable_no_resched();
2210 freezable_schedule(); 2210 freezable_schedule();
2211 cgroup_leave_frozen(true); 2211 cgroup_leave_frozen(true);
2212 } else { 2212 } else {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0dac149ead16..524cff24a68d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -50,6 +50,7 @@
50#include <linux/bsearch.h> 50#include <linux/bsearch.h>
51#include <linux/io.h> 51#include <linux/io.h>
52#include <linux/lockdep.h> 52#include <linux/lockdep.h>
53#include <linux/kthread.h>
53 54
54#include <asm/processor.h> 55#include <asm/processor.h>
55#include <asm/ioctl.h> 56#include <asm/ioctl.h>
@@ -645,6 +646,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
645 return 0; 646 return 0;
646} 647}
647 648
649/*
650 * Called after the VM is otherwise initialized, but just before adding it to
651 * the vm_list.
652 */
653int __weak kvm_arch_post_init_vm(struct kvm *kvm)
654{
655 return 0;
656}
657
658/*
659 * Called just after removing the VM from the vm_list, but before doing any
660 * other destruction.
661 */
662void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
663{
664}
665
648static struct kvm *kvm_create_vm(unsigned long type) 666static struct kvm *kvm_create_vm(unsigned long type)
649{ 667{
650 struct kvm *kvm = kvm_arch_alloc_vm(); 668 struct kvm *kvm = kvm_arch_alloc_vm();
@@ -702,6 +720,10 @@ static struct kvm *kvm_create_vm(unsigned long type)
702 720
703 r = kvm_init_mmu_notifier(kvm); 721 r = kvm_init_mmu_notifier(kvm);
704 if (r) 722 if (r)
723 goto out_err_no_mmu_notifier;
724
725 r = kvm_arch_post_init_vm(kvm);
726 if (r)
705 goto out_err; 727 goto out_err;
706 728
707 mutex_lock(&kvm_lock); 729 mutex_lock(&kvm_lock);
@@ -713,6 +735,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
713 return kvm; 735 return kvm;
714 736
715out_err: 737out_err:
738#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
739 if (kvm->mmu_notifier.ops)
740 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
741#endif
742out_err_no_mmu_notifier:
716 hardware_disable_all(); 743 hardware_disable_all();
717out_err_no_disable: 744out_err_no_disable:
718 kvm_arch_destroy_vm(kvm); 745 kvm_arch_destroy_vm(kvm);
@@ -757,6 +784,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
757 mutex_lock(&kvm_lock); 784 mutex_lock(&kvm_lock);
758 list_del(&kvm->vm_list); 785 list_del(&kvm->vm_list);
759 mutex_unlock(&kvm_lock); 786 mutex_unlock(&kvm_lock);
787 kvm_arch_pre_destroy_vm(kvm);
788
760 kvm_free_irq_routing(kvm); 789 kvm_free_irq_routing(kvm);
761 for (i = 0; i < KVM_NR_BUSES; i++) { 790 for (i = 0; i < KVM_NR_BUSES; i++) {
762 struct kvm_io_bus *bus = kvm_get_bus(kvm, i); 791 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -4391,3 +4420,86 @@ void kvm_exit(void)
4391 kvm_vfio_ops_exit(); 4420 kvm_vfio_ops_exit();
4392} 4421}
4393EXPORT_SYMBOL_GPL(kvm_exit); 4422EXPORT_SYMBOL_GPL(kvm_exit);
4423
4424struct kvm_vm_worker_thread_context {
4425 struct kvm *kvm;
4426 struct task_struct *parent;
4427 struct completion init_done;
4428 kvm_vm_thread_fn_t thread_fn;
4429 uintptr_t data;
4430 int err;
4431};
4432
4433static int kvm_vm_worker_thread(void *context)
4434{
4435 /*
4436 * The init_context is allocated on the stack of the parent thread, so
4437 * we have to locally copy anything that is needed beyond initialization
4438 */
4439 struct kvm_vm_worker_thread_context *init_context = context;
4440 struct kvm *kvm = init_context->kvm;
4441 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
4442 uintptr_t data = init_context->data;
4443 int err;
4444
4445 err = kthread_park(current);
4446 /* kthread_park(current) is never supposed to return an error */
4447 WARN_ON(err != 0);
4448 if (err)
4449 goto init_complete;
4450
4451 err = cgroup_attach_task_all(init_context->parent, current);
4452 if (err) {
4453 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
4454 __func__, err);
4455 goto init_complete;
4456 }
4457
4458 set_user_nice(current, task_nice(init_context->parent));
4459
4460init_complete:
4461 init_context->err = err;
4462 complete(&init_context->init_done);
4463 init_context = NULL;
4464
4465 if (err)
4466 return err;
4467
4468 /* Wait to be woken up by the spawner before proceeding. */
4469 kthread_parkme();
4470
4471 if (!kthread_should_stop())
4472 err = thread_fn(kvm, data);
4473
4474 return err;
4475}
4476
4477int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
4478 uintptr_t data, const char *name,
4479 struct task_struct **thread_ptr)
4480{
4481 struct kvm_vm_worker_thread_context init_context = {};
4482 struct task_struct *thread;
4483
4484 *thread_ptr = NULL;
4485 init_context.kvm = kvm;
4486 init_context.parent = current;
4487 init_context.thread_fn = thread_fn;
4488 init_context.data = data;
4489 init_completion(&init_context.init_done);
4490
4491 thread = kthread_run(kvm_vm_worker_thread, &init_context,
4492 "%s-%d", name, task_pid_nr(current));
4493 if (IS_ERR(thread))
4494 return PTR_ERR(thread);
4495
4496 /* kthread_run is never supposed to return NULL */
4497 WARN_ON(thread == NULL);
4498
4499 wait_for_completion(&init_context.init_done);
4500
4501 if (!init_context.err)
4502 *thread_ptr = thread;
4503
4504 return init_context.err;
4505}
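
kvm_vm_create_worker_thread() hands back a parked kthread, so the caller must unpark it once the VM is ready; the real user in this series is the x86 MMU's NX-hugepage recovery thread, which is outside this excerpt. A hypothetical caller with made-up names:

#include <linux/kvm_host.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Hypothetical per-VM maintenance loop, passed as the kvm_vm_thread_fn_t. */
static int example_worker_fn(struct kvm *kvm, uintptr_t data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	return 0;
}

static int example_start_worker(struct kvm *kvm, struct task_struct **out)
{
	int err;

	err = kvm_vm_create_worker_thread(kvm, example_worker_fn, 0,
					  "kvm-example", out);
	if (err)
		return err;

	/* The worker parks itself after init; wake it once the VM is live. */
	kthread_unpark(*out);
	return 0;
}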