author    Linus Torvalds <torvalds@linux-foundation.org>  2019-03-31 11:55:59 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2019-03-31 11:55:59 -0400
commit    63fc9c23488d6cf34e4c233e24ba59b7e5548412 (patch)
tree      08188f35d8625be520730e4ae106e8af2ee7b058
parent    915ee0da5ecb7ac7fd023ae36f01c47ce47a45d1 (diff)
parent    690edec54cbaa0e98dc592aae6864272f48f3c84 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini:
 "A collection of x86 and ARM bugfixes, and some improvements to
  documentation.

  On top of this, a cleanup of kvm_para.h headers, which were exported
  by some architectures even though they do not support KVM at all.
  This is responsible for all the Kbuild changes in the diffstat"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
  Documentation: kvm: clarify KVM_SET_USER_MEMORY_REGION
  KVM: doc: Document the life cycle of a VM and its resources
  KVM: selftests: complete IO before migrating guest state
  KVM: selftests: disable stack protector for all KVM tests
  KVM: selftests: explicitly disable PIE for tests
  KVM: selftests: assert on exit reason in CR4/cpuid sync test
  KVM: x86: update %rip after emulating IO
  x86/kvm/hyper-v: avoid spurious pending stimer on vCPU init
  kvm/x86: Move MSR_IA32_ARCH_CAPABILITIES to array emulated_msrs
  KVM: x86: Emulate MSR_IA32_ARCH_CAPABILITIES on AMD hosts
  kvm: don't redefine flags as something else
  kvm: mmu: Used range based flushing in slot_handle_level_range
  KVM: export <linux/kvm_para.h> and <asm/kvm_para.h> iif KVM is supported
  KVM: x86: remove check on nr_mmu_pages in kvm_arch_commit_memory_region()
  kvm: nVMX: Add a vmentry check for HOST_SYSENTER_ESP and HOST_SYSENTER_EIP fields
  KVM: SVM: Workaround errata#1096 (insn_len maybe zero on SMAP violation)
  KVM: Reject device ioctls from processes other than the VM's creator
  KVM: doc: Fix incorrect word ordering regarding supported use of APIs
  KVM: x86: fix handling of role.cr4_pae and rename it to 'gpte_size'
  KVM: nVMX: Do not inherit quadrant and invalid for the root shadow EPT
  ...
-rw-r--r--  Documentation/virtual/kvm/api.txt | 77
-rw-r--r--  Documentation/virtual/kvm/mmu.txt | 11
-rw-r--r--  arch/alpha/include/asm/Kbuild | 1
-rw-r--r--  arch/alpha/include/uapi/asm/kvm_para.h | 2
-rw-r--r--  arch/arc/include/asm/Kbuild | 1
-rw-r--r--  arch/arc/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h | 11
-rw-r--r--  arch/arm/include/asm/stage2_pgtable.h | 2
-rw-r--r--  arch/arm/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/arm/include/uapi/asm/kvm_para.h | 2
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 11
-rw-r--r--  arch/arm64/kvm/reset.c | 6
-rw-r--r--  arch/c6x/include/asm/Kbuild | 1
-rw-r--r--  arch/c6x/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/h8300/include/asm/Kbuild | 1
-rw-r--r--  arch/h8300/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/hexagon/include/asm/Kbuild | 1
-rw-r--r--  arch/hexagon/include/uapi/asm/kvm_para.h | 2
-rw-r--r--  arch/ia64/include/asm/Kbuild | 1
-rw-r--r--  arch/ia64/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/m68k/include/asm/Kbuild | 1
-rw-r--r--  arch/m68k/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/microblaze/include/asm/Kbuild | 1
-rw-r--r--  arch/microblaze/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/nios2/include/asm/Kbuild | 1
-rw-r--r--  arch/nios2/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/openrisc/include/asm/Kbuild | 1
-rw-r--r--  arch/openrisc/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/parisc/include/asm/Kbuild | 1
-rw-r--r--  arch/parisc/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/sh/include/asm/Kbuild | 1
-rw-r--r--  arch/sh/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/sparc/include/asm/Kbuild | 1
-rw-r--r--  arch/sparc/include/uapi/asm/kvm_para.h | 2
-rw-r--r--  arch/unicore32/include/asm/Kbuild | 1
-rw-r--r--  arch/unicore32/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 10
-rw-r--r--  arch/x86/kvm/hyperv.c | 9
-rw-r--r--  arch/x86/kvm/mmu.c | 54
-rw-r--r--  arch/x86/kvm/mmutrace.h | 4
-rw-r--r--  arch/x86/kvm/svm.c | 32
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 5
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 19
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 1
-rw-r--r--  arch/x86/kvm/x86.c | 59
-rw-r--r--  arch/xtensa/include/asm/Kbuild | 1
-rw-r--r--  arch/xtensa/include/uapi/asm/Kbuild | 1
-rw-r--r--  include/uapi/linux/Kbuild | 2
-rw-r--r--  tools/testing/selftests/kvm/Makefile | 4
-rw-r--r--  tools/testing/selftests/kvm/include/kvm_util.h | 1
-rw-r--r--  tools/testing/selftests/kvm/lib/kvm_util.c | 16
-rw-r--r--  tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 35
-rw-r--r--  tools/testing/selftests/kvm/x86_64/state_test.c | 18
-rw-r--r--  virt/kvm/arm/hyp/vgic-v3-sr.c | 4
-rw-r--r--  virt/kvm/arm/mmu.c | 125
-rw-r--r--  virt/kvm/arm/vgic/vgic-its.c | 31
-rw-r--r--  virt/kvm/arm/vgic/vgic-v3.c | 4
-rw-r--r--  virt/kvm/arm/vgic/vgic.c | 14
-rw-r--r--  virt/kvm/eventfd.c | 6
-rw-r--r--  virt/kvm/kvm_main.c | 3
60 files changed, 409 insertions(+), 201 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 7de9eee73fcd..67068c47c591 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -5,25 +5,32 @@ The Definitive KVM (Kernel-based Virtual Machine) API Documentation
 ----------------------
 
 The kvm API is a set of ioctls that are issued to control various aspects
-of a virtual machine. The ioctls belong to three classes
+of a virtual machine. The ioctls belong to three classes:
 
  - System ioctls: These query and set global attributes which affect the
    whole kvm subsystem. In addition a system ioctl is used to create
-   virtual machines
+   virtual machines.
 
  - VM ioctls: These query and set attributes that affect an entire virtual
    machine, for example memory layout. In addition a VM ioctl is used to
-   create virtual cpus (vcpus).
+   create virtual cpus (vcpus) and devices.
 
-   Only run VM ioctls from the same process (address space) that was used
-   to create the VM.
+   VM ioctls must be issued from the same process (address space) that was
+   used to create the VM.
 
  - vcpu ioctls: These query and set attributes that control the operation
    of a single virtual cpu.
 
-   Only run vcpu ioctls from the same thread that was used to create the
-   vcpu.
+   vcpu ioctls should be issued from the same thread that was used to create
+   the vcpu, except for asynchronous vcpu ioctls that are marked as such in
+   the documentation. Otherwise, the first ioctl after switching threads
+   could see a performance impact.
 
+ - device ioctls: These query and set attributes that control the operation
+   of a single device.
+
+   device ioctls must be issued from the same process (address space) that
+   was used to create the VM.
 
 2. File descriptors
 -------------------
@@ -32,17 +39,34 @@ The kvm API is centered around file descriptors. An initial
32open("/dev/kvm") obtains a handle to the kvm subsystem; this handle 39open("/dev/kvm") obtains a handle to the kvm subsystem; this handle
33can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this 40can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this
34handle will create a VM file descriptor which can be used to issue VM 41handle will create a VM file descriptor which can be used to issue VM
35ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu 42ioctls. A KVM_CREATE_VCPU or KVM_CREATE_DEVICE ioctl on a VM fd will
36and return a file descriptor pointing to it. Finally, ioctls on a vcpu 43create a virtual cpu or device and return a file descriptor pointing to
37fd can be used to control the vcpu, including the important task of 44the new resource. Finally, ioctls on a vcpu or device fd can be used
38actually running guest code. 45to control the vcpu or device. For vcpus, this includes the important
46task of actually running guest code.
39 47
40In general file descriptors can be migrated among processes by means 48In general file descriptors can be migrated among processes by means
41of fork() and the SCM_RIGHTS facility of unix domain socket. These 49of fork() and the SCM_RIGHTS facility of unix domain socket. These
42kinds of tricks are explicitly not supported by kvm. While they will 50kinds of tricks are explicitly not supported by kvm. While they will
43not cause harm to the host, their actual behavior is not guaranteed by 51not cause harm to the host, their actual behavior is not guaranteed by
44the API. The only supported use is one virtual machine per process, 52the API. See "General description" for details on the ioctl usage
45and one vcpu per thread. 53model that is supported by KVM.
54
55It is important to note that althought VM ioctls may only be issued from
56the process that created the VM, a VM's lifecycle is associated with its
57file descriptor, not its creator (process). In other words, the VM and
58its resources, *including the associated address space*, are not freed
59until the last reference to the VM's file descriptor has been released.
60For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will
61not be freed until both the parent (original) process and its child have
62put their references to the VM's file descriptor.
63
64Because a VM's resources are not freed until the last reference to its
65file descriptor is released, creating additional references to a VM via
66via fork(), dup(), etc... without careful consideration is strongly
67discouraged and may have unwanted side effects, e.g. memory allocated
68by and on behalf of the VM's process may not be freed/unaccounted when
69the VM is shut down.
46 70
47 71
48It is important to note that althought VM ioctls may only be issued from 72It is important to note that althought VM ioctls may only be issued from
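[Editor's note: the fd hierarchy this hunk documents, as a minimal userspace
sketch. Error handling is elided; the ioctl names are the real ones from
<linux/kvm.h>, everything else is illustrative.]

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
        int sys_fd  = open("/dev/kvm", O_RDWR);          /* system ioctls */
        int vm_fd   = ioctl(sys_fd, KVM_CREATE_VM, 0);   /* VM ioctls */
        int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);  /* vcpu ioctls */

        /* Per the text above: issue all further vcpu ioctls from this
         * thread, and all VM/device ioctls from this process. */
        return vcpu_fd < 0;
    }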
@@ -515,11 +539,15 @@ c) KVM_INTERRUPT_SET_LEVEL
 Note that any value for 'irq' other than the ones stated above is invalid
 and incurs unexpected behavior.
 
+This is an asynchronous vcpu ioctl and can be invoked from any thread.
+
 MIPS:
 
 Queues an external interrupt to be injected into the virtual CPU. A negative
 interrupt number dequeues the interrupt.
 
+This is an asynchronous vcpu ioctl and can be invoked from any thread.
+
 
 4.17 KVM_DEBUG_GUEST
 
@@ -1086,14 +1114,12 @@ struct kvm_userspace_memory_region {
 #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
 #define KVM_MEM_READONLY	(1UL << 1)
 
-This ioctl allows the user to create or modify a guest physical memory
-slot.  When changing an existing slot, it may be moved in the guest
-physical memory space, or its flags may be modified.  It may not be
-resized.  Slots may not overlap in guest physical address space.
-Bits 0-15 of "slot" specifies the slot id and this value should be
-less than the maximum number of user memory slots supported per VM.
-The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS,
-if this capability is supported by the architecture.
+This ioctl allows the user to create, modify or delete a guest physical
+memory slot.  Bits 0-15 of "slot" specify the slot id and this value
+should be less than the maximum number of user memory slots supported per
+VM.  The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS,
+if this capability is supported by the architecture.  Slots may not
+overlap in guest physical address space.
 
 If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot"
 specifies the address space which is being modified.  They must be
@@ -1102,6 +1128,10 @@ KVM_CAP_MULTI_ADDRESS_SPACE capability. Slots in separate address spaces
 are unrelated; the restriction on overlapping slots only applies within
 each address space.
 
+Deleting a slot is done by passing zero for memory_size.  When changing
+an existing slot, it may be moved in the guest physical memory space,
+or its flags may be modified, but it may not be resized.
+
 Memory for the region is taken starting at the address denoted by the
 field userspace_addr, which must point at user addressable memory for
 the entire memory slot size.  Any object may back this memory, including
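[Editor's note: a hedged sketch of the deletion rule just added. vm_fd, the
slot number and the guest physical address are illustrative.]

    struct kvm_userspace_memory_region region = {
        .slot            = 3,
        .guest_phys_addr = 0x100000,
        .memory_size     = 0,   /* zero size deletes the slot */
    };
    ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);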
@@ -2493,7 +2523,7 @@ KVM_S390_MCHK (vm, vcpu) - machine check interrupt; cr 14 bits in parm,
 			    machine checks needing further payload are not
 			    supported by this ioctl)
 
-Note that the vcpu ioctl is asynchronous to vcpu execution.
+This is an asynchronous vcpu ioctl and can be invoked from any thread.
 
 4.78 KVM_PPC_GET_HTAB_FD
 
@@ -3042,8 +3072,7 @@ KVM_S390_INT_EMERGENCY - sigp emergency; parameters in .emerg
 KVM_S390_INT_EXTERNAL_CALL - sigp external call; parameters in .extcall
 KVM_S390_MCHK - machine check interrupt; parameters in .mchk
 
-
-Note that the vcpu ioctl is asynchronous to vcpu execution.
+This is an asynchronous vcpu ioctl and can be invoked from any thread.
 
 4.94 KVM_S390_GET_IRQ_STATE
 
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index f365102c80f5..2efe0efc516e 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -142,7 +142,7 @@ Shadow pages contain the following information:
   If clear, this page corresponds to a guest page table denoted by the gfn
   field.
  role.quadrant:
-  When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit
+  When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit
   sptes. That means a guest page table contains more ptes than the host,
   so multiple shadow pages are needed to shadow one guest page.
   For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the
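[Editor's note: worked out, the mismatch is a factor of two per level — a
4 KiB guest page holds 4096 / 4 = 1024 32-bit gptes, while a 4 KiB shadow
page holds 4096 / 8 = 512 64-bit sptes, so one first-level guest page table
needs two shadow pages, selected by quadrant 0 or 1.]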
@@ -158,9 +158,9 @@ Shadow pages contain the following information:
   The page is invalid and should not be used.  It is a root page that is
   currently pinned (by a cpu hardware register pointing to it); once it is
   unpinned it will be destroyed.
- role.cr4_pae:
-  Contains the value of cr4.pae for which the page is valid (e.g. whether
-  32-bit or 64-bit gptes are in use).
+ role.gpte_is_8_bytes:
+  Reflects the size of the guest PTE for which the page is valid, i.e. '1'
+  if 64-bit gptes are in use, '0' if 32-bit gptes are in use.
  role.nxe:
   Contains the value of efer.nxe for which the page is valid.
  role.cr0_wp:
@@ -173,6 +173,9 @@ Shadow pages contain the following information:
   Contains the value of cr4.smap && !cr0.wp for which the page is valid
   (pages for which this is true are different from other pages; see the
   treatment of cr0.wp=0 below).
+ role.ept_sp:
+  This is a virtual flag to denote a shadowed nested EPT page.  ept_sp
+  is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination.
  role.smm:
   Is 1 if the page is valid in system management mode.  This field
   determines which of the kvm_memslots array was used to build this
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index dc0ab28baca1..70b783333965 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -6,6 +6,7 @@ generic-y += exec.h
 generic-y += export.h
 generic-y += fb.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
diff --git a/arch/alpha/include/uapi/asm/kvm_para.h b/arch/alpha/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/alpha/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index b41f8881ecc8..decc306a3b52 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -11,6 +11,7 @@ generic-y += hardirq.h
 generic-y += hw_irq.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/arc/include/uapi/asm/Kbuild
+++ b/arch/arc/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 2de96a180166..31de4ab93005 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -381,6 +381,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm,
 	return ret;
 }
 
+static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+				       const void *data, unsigned long len)
+{
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+	int ret = kvm_write_guest(kvm, gpa, data, len);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	return ret;
+}
+
 static inline void *kvm_get_hyp_vector(void)
 {
 	switch(read_cpuid_part()) {
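[Editor's note: a hedged sketch of how a caller outside vcpu context would
use the new helper — presumably the vgic/ITS save paths touched elsewhere in
this merge; 'kvm', 'gpa' and 'entry' are illustrative locals.]

    u64 entry = val;   /* table entry to persist to guest RAM */
    int ret = kvm_write_guest_lock(kvm, gpa, &entry, sizeof(entry));
    if (ret)
        return ret;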
diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h
index de2089501b8b..9e11dce55e06 100644
--- a/arch/arm/include/asm/stage2_pgtable.h
+++ b/arch/arm/include/asm/stage2_pgtable.h
@@ -75,6 +75,8 @@ static inline bool kvm_stage2_has_pud(struct kvm *kvm)
 
 #define S2_PMD_MASK				PMD_MASK
 #define S2_PMD_SIZE				PMD_SIZE
+#define S2_PUD_MASK				PUD_MASK
+#define S2_PUD_SIZE				PUD_SIZE
 
 static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
 {
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index 23b4464c0995..ce8573157774 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -3,3 +3,4 @@
 generated-y += unistd-common.h
 generated-y += unistd-oabi.h
 generated-y += unistd-eabi.h
+generic-y += kvm_para.h
diff --git a/arch/arm/include/uapi/asm/kvm_para.h b/arch/arm/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/arm/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b0742a16c6c9..ebeefcf835e8 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -445,6 +445,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm,
 	return ret;
 }
 
+static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+				       const void *data, unsigned long len)
+{
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+	int ret = kvm_write_guest(kvm, gpa, data, len);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	return ret;
+}
+
 #ifdef CONFIG_KVM_INDIRECT_VECTORS
 /*
  * EL2 vectors can be mapped and rerouted in a number of ways,
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index f16a5f8ff2b4..e2a0500cd7a2 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -123,6 +123,9 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	int ret = -EINVAL;
 	bool loaded;
 
+	/* Reset PMU outside of the non-preemptible section */
+	kvm_pmu_vcpu_reset(vcpu);
+
 	preempt_disable();
 	loaded = (vcpu->cpu != -1);
 	if (loaded)
@@ -170,9 +173,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 		vcpu->arch.reset_state.reset = false;
 	}
 
-	/* Reset PMU */
-	kvm_pmu_vcpu_reset(vcpu);
-
 	/* Default workaround setup is enabled (if supported) */
 	if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL)
 		vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 63b4a1705182..249c9f6f26dc 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/c6x/include/uapi/asm/Kbuild
+++ b/arch/c6x/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index 3e7c8ecf151e..e3dead402e5f 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -23,6 +23,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += linkage.h
 generic-y += local.h
 generic-y += local64.h
diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/h8300/include/uapi/asm/Kbuild
+++ b/arch/h8300/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index b25fd42aa0f4..d046e8ccdf78 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/hexagon/include/uapi/asm/kvm_para.h b/arch/hexagon/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/hexagon/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 43e21fe3499c..11f191689c9e 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -2,6 +2,7 @@ generated-y += syscall_table.h
 generic-y += compat.h
 generic-y += exec.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index 20018cb883a9..62a9522af51e 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
 generated-y += unistd_64.h
-generic-y += kvm_para.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 95f8f631c4df..2c359d9e80f6 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -13,6 +13,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild
index 8a7ad40be463..7417847dc438 100644
--- a/arch/m68k/include/uapi/asm/Kbuild
+++ b/arch/m68k/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
 generated-y += unistd_32.h
-generic-y += kvm_para.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 791cc8d54d0a..1a8285c3f693 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += linkage.h
 generic-y += local.h
 generic-y += local64.h
diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild
index 3ce84fbb2678..13f59631c576 100644
--- a/arch/microblaze/include/uapi/asm/Kbuild
+++ b/arch/microblaze/include/uapi/asm/Kbuild
@@ -1,3 +1,2 @@
 generated-y += unistd_32.h
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 8fde4fa2c34f..88a667d12aaa 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -23,6 +23,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/nios2/include/uapi/asm/Kbuild
+++ b/arch/nios2/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 5a73e2956ac4..22aa97136c01 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -20,6 +20,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/openrisc/include/uapi/asm/Kbuild
+++ b/arch/openrisc/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 6f49e77d82a2..9bcd0c903dbb 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -11,6 +11,7 @@ generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild
index 22fdbd08cdc8..2bd5b392277c 100644
--- a/arch/parisc/include/uapi/asm/Kbuild
+++ b/arch/parisc/include/uapi/asm/Kbuild
@@ -1,3 +1,2 @@
 generated-y += unistd_32.h
 generated-y += unistd_64.h
-generic-y += kvm_para.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index a6ef3fee5f85..7bf2cb680d32 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild
index ecfbd40924dd..b8812c74c1de 100644
--- a/arch/sh/include/uapi/asm/Kbuild
+++ b/arch/sh/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 
 generated-y += unistd_32.h
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index b82f64e28f55..a22cfd5c0ee8 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += exec.h
 generic-y += export.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += linkage.h
 generic-y += local.h
 generic-y += local64.h
diff --git a/arch/sparc/include/uapi/asm/kvm_para.h b/arch/sparc/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/sparc/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 1d1544b6ca74..d77d953c04c1 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -18,6 +18,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ b/arch/unicore32/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a5db4475e72d..159b5988292f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -253,14 +253,14 @@ struct kvm_mmu_memory_cache {
  * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
  * by indirect shadow page can not be more than 15 bits.
  *
- * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access,
+ * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access,
  * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
  */
 union kvm_mmu_page_role {
 	u32 word;
 	struct {
 		unsigned level:4;
-		unsigned cr4_pae:1;
+		unsigned gpte_is_8_bytes:1;
 		unsigned quadrant:2;
 		unsigned direct:1;
 		unsigned access:3;
@@ -350,6 +350,7 @@ struct kvm_mmu_page {
 };
 
 struct kvm_pio_request {
+	unsigned long linear_rip;
 	unsigned long count;
 	int in;
 	int port;
@@ -568,6 +569,7 @@ struct kvm_vcpu_arch {
 	bool tpr_access_reporting;
 	u64 ia32_xss;
 	u64 microcode_version;
+	u64 arch_capabilities;
 
 	/*
 	 * Paging state of the vcpu
@@ -1192,6 +1194,8 @@ struct kvm_x86_ops {
 	int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
 				   uint16_t *vmcs_version);
 	uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
+
+	bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
@@ -1252,7 +1256,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				     gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 27c43525a05f..421899f6ad7b 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -526,7 +526,9 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
 		new_config.enable = 0;
 	stimer->config.as_uint64 = new_config.as_uint64;
 
-	stimer_mark_pending(stimer, false);
+	if (stimer->config.enable)
+		stimer_mark_pending(stimer, false);
+
 	return 0;
 }
 
@@ -542,7 +544,10 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
 		stimer->config.enable = 0;
 	else if (stimer->config.auto_enable)
 		stimer->config.enable = 1;
-	stimer_mark_pending(stimer, false);
+
+	if (stimer->config.enable)
+		stimer_mark_pending(stimer, false);
+
 	return 0;
 }
 
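[Editor's note: a sketch of the guest-visible sequence this fixes. The MSRs
are the real Hyper-V synthetic timer MSRs; treating this zeroing pattern as
the vCPU-init trigger is an assumption.]

    wrmsr(HV_X64_MSR_STIMER0_CONFIG, 0); /* stimer_set_config(): enable == 0,
                                            stimer_mark_pending() now skipped */
    wrmsr(HV_X64_MSR_STIMER0_COUNT, 0);  /* stimer_set_count(): count == 0
                                            clears enable, pending skipped too */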
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7837ab001d80..eee455a8a612 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -182,7 +182,7 @@ struct kvm_shadow_walk_iterator {
 
 static const union kvm_mmu_page_role mmu_base_role_mask = {
 	.cr0_wp = 1,
-	.cr4_pae = 1,
+	.gpte_is_8_bytes = 1,
 	.nxe = 1,
 	.smep_andnot_wp = 1,
 	.smap_andnot_wp = 1,
@@ -2205,6 +2205,7 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list);
 
+
 #define for_each_valid_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -2215,12 +2216,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	for_each_valid_sp(_kvm, _sp, _gfn)				\
 		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
 
+static inline bool is_ept_sp(struct kvm_mmu_page *sp)
+{
+	return sp->role.cr0_wp && sp->role.smap_andnot_wp;
+}
+
 /* @sp->gfn should be write-protected at the call site */
 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			    struct list_head *invalid_list)
 {
-	if (sp->role.cr4_pae != !!is_pae(vcpu)
-	    || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+	if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
+	    vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return false;
 	}
@@ -2423,7 +2429,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	role.level = level;
 	role.direct = direct;
 	if (role.direct)
-		role.cr4_pae = 0;
+		role.gpte_is_8_bytes = true;
 	role.access = access;
 	if (!vcpu->arch.mmu->direct_map
 	    && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
@@ -4794,7 +4800,6 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
 
 	role.base.access = ACC_ALL;
 	role.base.nxe = !!is_nx(vcpu);
-	role.base.cr4_pae = !!is_pae(vcpu);
 	role.base.cr0_wp = is_write_protection(vcpu);
 	role.base.smm = is_smm(vcpu);
 	role.base.guest_mode = is_guest_mode(vcpu);
@@ -4815,6 +4820,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
 	role.base.ad_disabled = (shadow_accessed_mask == 0);
 	role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
 	role.base.direct = true;
+	role.base.gpte_is_8_bytes = true;
 
 	return role;
 }
@@ -4879,6 +4885,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
 	role.base.smap_andnot_wp = role.ext.cr4_smap &&
 				   !is_write_protection(vcpu);
 	role.base.direct = !is_paging(vcpu);
+	role.base.gpte_is_8_bytes = !!is_pae(vcpu);
 
 	if (!is_long_mode(vcpu))
 		role.base.level = PT32E_ROOT_LEVEL;
@@ -4918,18 +4925,26 @@ static union kvm_mmu_role
 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
 				   bool execonly)
 {
-	union kvm_mmu_role role;
+	union kvm_mmu_role role = {0};
 
-	/* Base role is inherited from root_mmu */
-	role.base.word = vcpu->arch.root_mmu.mmu_role.base.word;
-	role.ext = kvm_calc_mmu_role_ext(vcpu);
+	/* SMM flag is inherited from root_mmu */
+	role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
 
 	role.base.level = PT64_ROOT_4LEVEL;
+	role.base.gpte_is_8_bytes = true;
 	role.base.direct = false;
 	role.base.ad_disabled = !accessed_dirty;
 	role.base.guest_mode = true;
 	role.base.access = ACC_ALL;
 
+	/*
+	 * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
+	 * SMAP variation to denote shadow EPT entries.
+	 */
+	role.base.cr0_wp = true;
+	role.base.smap_andnot_wp = true;
+
+	role.ext = kvm_calc_mmu_role_ext(vcpu);
 	role.ext.execonly = execonly;
 
 	return role;
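[Editor's note: the "impossible combination" claim can be checked against the
shadow-role calculation earlier in this patch: smap_andnot_wp is computed as
cr4.smap && !cr0.wp, so any legitimately computed role with cr0_wp == 1 has
smap_andnot_wp == 0. Setting both to 1 therefore tags shadow EPT pages
without colliding with any reachable non-EPT role, which is exactly what the
new is_ept_sp() helper tests.]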
@@ -5179,7 +5194,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
 		 gpa, bytes, sp->role.word);
 
 	offset = offset_in_page(gpa);
-	pte_size = sp->role.cr4_pae ? 8 : 4;
+	pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
 
 	/*
 	 * Sometimes, the OS only writes the last one bytes to update status
@@ -5203,7 +5218,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
 	page_offset = offset_in_page(gpa);
 	level = sp->role.level;
 	*nspte = 1;
-	if (!sp->role.cr4_pae) {
+	if (!sp->role.gpte_is_8_bytes) {
 		page_offset <<= 1;	/* 32->64 */
 		/*
 		 * A 32-bit pde maps 4MB while the shadow pdes map
@@ -5393,10 +5408,12 @@ emulate:
 	 * This can happen if a guest gets a page-fault on data access but the HW
 	 * table walker is not able to read the instruction page (e.g instruction
 	 * page is not present in memory). In those cases we simply restart the
-	 * guest.
+	 * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
 	 */
-	if (unlikely(insn && !insn_len))
-		return 1;
+	if (unlikely(insn && !insn_len)) {
+		if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
+			return 1;
+	}
 
 	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
 
@@ -5509,7 +5526,9 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
 			if (flush && lock_flush_tlb) {
-				kvm_flush_remote_tlbs(kvm);
+				kvm_flush_remote_tlbs_with_address(kvm,
+						start_gfn,
+						iterator.gfn - start_gfn + 1);
 				flush = false;
 			}
 			cond_resched_lock(&kvm->mmu_lock);
@@ -5517,7 +5536,8 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	}
 
 	if (flush && lock_flush_tlb) {
-		kvm_flush_remote_tlbs(kvm);
+		kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
+				end_gfn - start_gfn + 1);
 		flush = false;
 	}
 
@@ -6011,7 +6031,7 @@ out:
 /*
  * Calculate mmu pages needed for kvm.
  */
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
 {
 	unsigned int nr_mmu_pages;
 	unsigned int nr_pages = 0;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 9f6c855a0043..dd30dccd2ad5 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -29,10 +29,10 @@
 								\
 	role.word = __entry->role;				\
 								\
-	trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s"	\
+	trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \
 			 " %snxe %sad root %u %s%c",		\
 			 __entry->gfn, role.level,		\
-			 role.cr4_pae ? " pae" : "",		\
+			 role.gpte_is_8_bytes ? 8 : 4,		\
 			 role.quadrant,				\
 			 role.direct ? " direct" : "",		\
 			 access_str[role.access],		\
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b5b128a0a051..426039285fd1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -7098,6 +7098,36 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
 	return -ENODEV;
 }
 
+static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+	bool is_user, smap;
+
+	is_user = svm_get_cpl(vcpu) == 3;
+	smap = !kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+
+	/*
+	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh
+	 *
+	 * In a non-SEV guest, the hypervisor is able to read guest
+	 * memory to decode the instruction pointer when insn_len is zero,
+	 * so we return true to indicate that decoding is possible.
+	 *
+	 * But in an SEV guest, the guest memory is encrypted with the
+	 * guest-specific key and the hypervisor is not able to decode the
+	 * instruction pointer, so we cannot work around it. Let's
+	 * print the error and request to kill the guest.
+	 */
+	if (is_user && smap) {
+		if (!sev_guest(vcpu->kvm))
+			return true;
+
+		pr_err_ratelimited("KVM: Guest triggered AMD Erratum 1096\n");
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+	}
+
+	return false;
+}
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
@@ -7231,6 +7261,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
 	.nested_enable_evmcs = nested_enable_evmcs,
 	.nested_get_evmcs_version = nested_get_evmcs_version,
+
+	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f24a2c225070..153e539c29c9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2585,6 +2585,11 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
 	    !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
 	    !nested_cr3_valid(vcpu, vmcs12->host_cr3))
 		return -EINVAL;
+
+	if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) ||
+	    is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
+		return -EINVAL;
+
 	/*
 	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
 	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c73375e01ab8..ab432a930ae8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1683,12 +1683,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 		msr_info->data = to_vmx(vcpu)->spec_ctrl;
 		break;
-	case MSR_IA32_ARCH_CAPABILITIES:
-		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
-			return 1;
-		msr_info->data = to_vmx(vcpu)->arch_capabilities;
-		break;
 	case MSR_IA32_SYSENTER_CS:
 		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
 		break;
@@ -1895,11 +1889,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
 					      MSR_TYPE_W);
 		break;
-	case MSR_IA32_ARCH_CAPABILITIES:
-		if (!msr_info->host_initiated)
-			return 1;
-		vmx->arch_capabilities = data;
-		break;
 	case MSR_IA32_CR_PAT:
 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
 			if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -4088,8 +4077,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		++vmx->nmsrs;
 	}
 
-	vmx->arch_capabilities = kvm_get_arch_capabilities();
-
 	vm_exit_controls_init(vmx, vmx_vmexit_ctrl());
 
 	/* 22.2.1, 20.8.1 */
@@ -7409,6 +7396,11 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 static __init int hardware_setup(void)
 {
 	unsigned long host_bndcfgs;
@@ -7711,6 +7703,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.set_nested_state = NULL,
 	.get_vmcs12_pages = NULL,
 	.nested_enable_evmcs = NULL,
+	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
 };
 
 static void vmx_cleanup_l1d_flush(void)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 1554cb45b393..a1e00d0a2482 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -190,7 +190,6 @@ struct vcpu_vmx {
 	u64		      msr_guest_kernel_gs_base;
 #endif
 
-	u64		      arch_capabilities;
 	u64		      spec_ctrl;
 
 	u32 vm_entry_controls_shadow;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 65e4559eef2f..099b851dabaf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1125,7 +1125,7 @@ static u32 msrs_to_save[] = {
 #endif
 	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 	MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
-	MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES,
+	MSR_IA32_SPEC_CTRL,
 	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
 	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
 	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
@@ -1158,6 +1158,7 @@ static u32 emulated_msrs[] = {
 
 	MSR_IA32_TSC_ADJUST,
 	MSR_IA32_TSCDEADLINE,
+	MSR_IA32_ARCH_CAPABILITIES,
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
 	MSR_IA32_MCG_CTL,
@@ -2443,6 +2444,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (msr_info->host_initiated)
 			vcpu->arch.microcode_version = data;
 		break;
+	case MSR_IA32_ARCH_CAPABILITIES:
+		if (!msr_info->host_initiated)
+			return 1;
+		vcpu->arch.arch_capabilities = data;
+		break;
 	case MSR_EFER:
 		return set_efer(vcpu, data);
 	case MSR_K7_HWCR:
@@ -2747,6 +2753,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_UCODE_REV:
 		msr_info->data = vcpu->arch.microcode_version;
 		break;
+	case MSR_IA32_ARCH_CAPABILITIES:
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+			return 1;
+		msr_info->data = vcpu->arch.arch_capabilities;
+		break;
 	case MSR_IA32_TSC:
 		msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
 		break;
@@ -6523,14 +6535,27 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
 
+static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.pio.count = 0;
+
+	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+		return 1;
+
+	return kvm_skip_emulated_instruction(vcpu);
+}
+
 static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
 			    unsigned short port)
 {
 	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
 					    size, port, &val, 1);
-	/* do not return to emulator after return from userspace */
-	vcpu->arch.pio.count = 0;
+
+	if (!ret) {
+		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+		vcpu->arch.complete_userspace_io = complete_fast_pio_out;
+	}
 	return ret;
 }
 
@@ -6541,6 +6566,11 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
 	/* We should only ever be called with arch.pio.count equal to 1 */
 	BUG_ON(vcpu->arch.pio.count != 1);
 
+	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+		vcpu->arch.pio.count = 0;
+		return 1;
+	}
+
 	/* For size less than 4 we merge, else we zero extend */
 	val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
 					: 0;
@@ -6553,7 +6583,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
 				 vcpu->arch.pio.port, &val, 1);
 	kvm_register_write(vcpu, VCPU_REGS_RAX, val);
 
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
@@ -6572,6 +6602,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
 		return ret;
 	}
 
+	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
 	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
 
 	return 0;
@@ -6579,16 +6610,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
 
 int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
 {
-	int ret = kvm_skip_emulated_instruction(vcpu);
+	int ret;
 
-	/*
-	 * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
-	 * KVM_EXIT_DEBUG here.
-	 */
 	if (in)
-		return kvm_fast_pio_in(vcpu, size, port) && ret;
+		ret = kvm_fast_pio_in(vcpu, size, port);
 	else
-		return kvm_fast_pio_out(vcpu, size, port) && ret;
+		ret = kvm_fast_pio_out(vcpu, size, port);
+	return ret && kvm_skip_emulated_instruction(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_fast_pio);
 
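
The fast-PIO rework above changes when %rip is advanced: instead of skipping the instruction before exiting to userspace, KVM records the linear RIP and skips the instruction in complete_fast_pio_in()/complete_fast_pio_out() once userspace re-enters KVM_RUN. The userspace contract is unchanged but worth spelling out: after KVM_EXIT_IO, guest state (notably RIP) is only consistent once KVM_RUN has been re-entered. A hedged sketch of the relevant userspace loop, with handle_pio() as a hypothetical device-emulation helper:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static void handle_pio(struct kvm_run *run)
{
	/* hypothetical: decode run->io and touch the emulated device */
}

/* 'run' is the mmap()ed kvm_run area of vcpu_fd */
static void vcpu_loop(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		ioctl(vcpu_fd, KVM_RUN, NULL);

		switch (run->exit_reason) {
		case KVM_EXIT_IO:
			/*
			 * Service the port access.  RIP is updated only when
			 * the next KVM_RUN invokes the pending
			 * complete_userspace_io callback in the kernel.
			 */
			handle_pio(run);
			break;
		default:
			return;
		}
	}
}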
@@ -8733,6 +8761,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
+	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
 	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
 	kvm_vcpu_mtrr_init(vcpu);
 	vcpu_load(vcpu);
@@ -9429,13 +9458,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change)
 {
-	int nr_mmu_pages = 0;
-
 	if (!kvm->arch.n_requested_mmu_pages)
-		nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
-
-	if (nr_mmu_pages)
-		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+		kvm_mmu_change_mmu_pages(kvm,
+				kvm_mmu_calculate_default_mmu_pages(kvm));
 
 	/*
 	 * Dirty logging tracks sptes in 4k granularity, meaning that large
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 42b6cb3d16f7..3843198e03d4 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -15,6 +15,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild
index 8a7ad40be463..7417847dc438 100644
--- a/arch/xtensa/include/uapi/asm/Kbuild
+++ b/arch/xtensa/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
 generated-y += unistd_32.h
-generic-y += kvm_para.h
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 5f24b50c9e88..059dc2bedaf6 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -7,5 +7,7 @@ no-export-headers += kvm.h
 endif
 
 ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm_para.h),)
+ifeq ($(wildcard $(objtree)/arch/$(SRCARCH)/include/generated/uapi/asm/kvm_para.h),)
 no-export-headers += kvm_para.h
 endif
+endif
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 3c1f4bdf9000..7514fcea91a7 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -29,8 +29,8 @@ LIBKVM += $(LIBKVM_$(UNAME_M))
 INSTALL_HDR_PATH = $(top_srcdir)/usr
 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
 LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include
-CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(UNAME_M) -I..
-LDFLAGS += -pthread
+CFLAGS += -O2 -g -std=gnu99 -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(UNAME_M) -I..
+LDFLAGS += -pthread -no-pie
 
 # After inclusion, $(OUTPUT) is defined and
 # $(TEST_GEN_PROGS) starts with $(OUTPUT)/
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index a84785b02557..07b71ad9734a 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -102,6 +102,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva);
 struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
 int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
 		       struct kvm_mp_state *mp_state);
 void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index b52cfdefecbf..efa0aad8b3c6 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1121,6 +1121,22 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
 	return rc;
 }
 
+void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	int ret;
+
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	vcpu->state->immediate_exit = 1;
+	ret = ioctl(vcpu->fd, KVM_RUN, NULL);
+	vcpu->state->immediate_exit = 0;
+
+	TEST_ASSERT(ret == -1 && errno == EINTR,
+		    "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i",
+		    ret, errno);
+}
+
 /*
  * VM VCPU Set MP State
  *
diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
index d503a51fad30..7c2c4d4055a8 100644
--- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
@@ -87,22 +87,25 @@ int main(int argc, char *argv[])
 	while (1) {
 		rc = _vcpu_run(vm, VCPU_ID);
 
-		if (run->exit_reason == KVM_EXIT_IO) {
-			switch (get_ucall(vm, VCPU_ID, &uc)) {
-			case UCALL_SYNC:
-				/* emulate hypervisor clearing CR4.OSXSAVE */
-				vcpu_sregs_get(vm, VCPU_ID, &sregs);
-				sregs.cr4 &= ~X86_CR4_OSXSAVE;
-				vcpu_sregs_set(vm, VCPU_ID, &sregs);
-				break;
-			case UCALL_ABORT:
-				TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit.");
-				break;
-			case UCALL_DONE:
-				goto done;
-			default:
-				TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
-			}
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "Unexpected exit reason: %u (%s),\n",
+			    run->exit_reason,
+			    exit_reason_str(run->exit_reason));
+
+		switch (get_ucall(vm, VCPU_ID, &uc)) {
+		case UCALL_SYNC:
+			/* emulate hypervisor clearing CR4.OSXSAVE */
+			vcpu_sregs_get(vm, VCPU_ID, &sregs);
+			sregs.cr4 &= ~X86_CR4_OSXSAVE;
+			vcpu_sregs_set(vm, VCPU_ID, &sregs);
+			break;
+		case UCALL_ABORT:
+			TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit.");
+			break;
+		case UCALL_DONE:
+			goto done;
+		default:
+			TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
 		}
 	}
 
diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c
index 4b3f556265f1..30f75856cf39 100644
--- a/tools/testing/selftests/kvm/x86_64/state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/state_test.c
@@ -134,6 +134,11 @@ int main(int argc, char *argv[])
 
 	struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
 
+	if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) {
+		fprintf(stderr, "immediate_exit not available, skipping test\n");
+		exit(KSFT_SKIP);
+	}
+
 	/* Create VM */
 	vm = vm_create_default(VCPU_ID, 0, guest_code);
 	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
@@ -156,8 +161,6 @@ int main(int argc, char *argv[])
 			    stage, run->exit_reason,
 			    exit_reason_str(run->exit_reason));
 
-		memset(&regs1, 0, sizeof(regs1));
-		vcpu_regs_get(vm, VCPU_ID, &regs1);
 		switch (get_ucall(vm, VCPU_ID, &uc)) {
 		case UCALL_ABORT:
 			TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0],
@@ -176,6 +179,17 @@ int main(int argc, char *argv[])
 			    uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx",
 			    stage, (ulong)uc.args[1]);
 
+		/*
+		 * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees
+		 * guest state is consistent only after userspace re-enters the
+		 * kernel with KVM_RUN.  Complete IO prior to migrating state
+		 * to a new VM.
+		 */
+		vcpu_run_complete_io(vm, VCPU_ID);
+
+		memset(&regs1, 0, sizeof(regs1));
+		vcpu_regs_get(vm, VCPU_ID, &regs1);
+
 		state = vcpu_save_state(vm, VCPU_ID);
 		kvm_vm_release(vm);
 
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index 264d92da3240..370bd6c5e6cb 100644
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -222,7 +222,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	if (used_lrs) {
+	if (used_lrs || cpu_if->its_vpe.its_vm) {
 		int i;
 		u32 elrsr;
 
@@ -247,7 +247,7 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
 	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
 	int i;
 
-	if (used_lrs) {
+	if (used_lrs || cpu_if->its_vpe.its_vm) {
 		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
 
 		for (i = 0; i < used_lrs; i++)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index ffd7acdceac7..27c958306449 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -102,8 +102,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
  * @addr:	IPA
  * @pmd:	pmd pointer for IPA
  *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
  */
 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 {
@@ -121,8 +120,7 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
  * @addr:	IPA
  * @pud:	pud pointer for IPA
  *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
  */
 static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
 {
@@ -899,9 +897,8 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
  * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
  * @kvm:	The KVM struct pointer for the VM.
  *
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
  *
  * Note we don't need locking here as this is only called when the VM is
  * created, which can only be done once.
@@ -1067,25 +1064,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 {
 	pmd_t *pmd, old_pmd;
 
+retry:
 	pmd = stage2_get_pmd(kvm, cache, addr);
 	VM_BUG_ON(!pmd);
 
 	old_pmd = *pmd;
+	/*
+	 * Multiple vcpus faulting on the same PMD entry, can
+	 * lead to them sequentially updating the PMD with the
+	 * same value. Following the break-before-make
+	 * (pmd_clear() followed by tlb_flush()) process can
+	 * hinder forward progress due to refaults generated
+	 * on missing translations.
+	 *
+	 * Skip updating the page table if the entry is
+	 * unchanged.
+	 */
+	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+		return 0;
+
 	if (pmd_present(old_pmd)) {
 		/*
-		 * Multiple vcpus faulting on the same PMD entry, can
-		 * lead to them sequentially updating the PMD with the
-		 * same value. Following the break-before-make
-		 * (pmd_clear() followed by tlb_flush()) process can
-		 * hinder forward progress due to refaults generated
-		 * on missing translations.
+		 * If we already have PTE level mapping for this block,
+		 * we must unmap it to avoid inconsistent TLB state and
+		 * leaking the table page. We could end up in this situation
+		 * if the memory slot was marked for dirty logging and was
+		 * reverted, leaving PTE level mappings for the pages accessed
+		 * during the period. So, unmap the PTE level mapping for this
+		 * block and retry, as we could have released the upper level
+		 * table in the process.
 		 *
-		 * Skip updating the page table if the entry is
-		 * unchanged.
+		 * Normal THP split/merge follows mmu_notifier callbacks and do
+		 * get handled accordingly.
 		 */
-		if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-			return 0;
-
+		if (!pmd_thp_or_huge(old_pmd)) {
+			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+			goto retry;
+		}
 		/*
 		 * Mapping in huge pages should only happen through a
 		 * fault.  If a page is merged into a transparent huge
@@ -1097,8 +1112,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 		 * should become splitting first, unmapped, merged,
 		 * and mapped back in on-demand.
 		 */
-		VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
 		pmd_clear(pmd);
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
 	} else {
@@ -1114,6 +1128,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
 {
 	pud_t *pudp, old_pud;
 
+retry:
 	pudp = stage2_get_pud(kvm, cache, addr);
 	VM_BUG_ON(!pudp);
 
@@ -1121,14 +1136,23 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
 
 	/*
 	 * A large number of vcpus faulting on the same stage 2 entry,
-	 * can lead to a refault due to the
-	 * stage2_pud_clear()/tlb_flush(). Skip updating the page
-	 * tables if there is no change.
+	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+	 * Skip updating the page tables if there is no change.
 	 */
 	if (pud_val(old_pud) == pud_val(*new_pudp))
 		return 0;
 
 	if (stage2_pud_present(kvm, old_pud)) {
+		/*
+		 * If we already have table level mapping for this block, unmap
+		 * the range for this block and retry.
+		 */
+		if (!stage2_pud_huge(kvm, old_pud)) {
+			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+			goto retry;
+		}
+
+		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
 		stage2_pud_clear(kvm, pudp);
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
 	} else {
@@ -1451,13 +1475,11 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
 }
 
 /**
  * stage2_wp_puds - write protect PGD range
  * @pgd:	pointer to pgd entry
  * @addr:	range start address
  * @end:	range end address
- *
- * Process PUD entries, for a huge PUD we cause a panic.
  */
 static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
 			   phys_addr_t addr, phys_addr_t end)
 {
@@ -1594,8 +1616,9 @@ static void kvm_send_hwpoison_signal(unsigned long address,
 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
 }
 
-static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
-					       unsigned long hva)
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+					       unsigned long hva,
+					       unsigned long map_size)
 {
 	gpa_t gpa_start;
 	hva_t uaddr_start, uaddr_end;
@@ -1610,34 +1633,34 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
 
 	/*
 	 * Pages belonging to memslots that don't have the same alignment
-	 * within a PMD for userspace and IPA cannot be mapped with stage-2
-	 * PMD entries, because we'll end up mapping the wrong pages.
+	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
 	 *
 	 * Consider a layout like the following:
 	 *
 	 *    memslot->userspace_addr:
 	 *    +-----+--------------------+--------------------+---+
-	 *    |abcde|fgh  Stage-1 PMD    |    Stage-1 PMD   tv|xyz|
+	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
 	 *    +-----+--------------------+--------------------+---+
 	 *
 	 *    memslot->base_gfn << PAGE_SIZE:
 	 *      +---+--------------------+--------------------+-----+
-	 *      |abc|def  Stage-2 PMD    |    Stage-2 PMD     |tvxyz|
+	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
 	 *      +---+--------------------+--------------------+-----+
 	 *
-	 * If we create those stage-2 PMDs, we'll end up with this incorrect
+	 * If we create those stage-2 blocks, we'll end up with this incorrect
 	 * mapping:
 	 *   d -> f
 	 *   e -> g
 	 *   f -> h
 	 */
-	if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
+	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
 		return false;
 
 	/*
 	 * Next, let's make sure we're not trying to map anything not covered
-	 * by the memslot. This means we have to prohibit PMD size mappings
-	 * for the beginning and end of a non-PMD aligned and non-PMD sized
+	 * by the memslot. This means we have to prohibit block size mappings
+	 * for the beginning and end of a non-block aligned and non-block sized
 	 * memory slot (illustrated by the head and tail parts of the
 	 * userspace view above containing pages 'abcde' and 'xyz',
 	 * respectively).
@@ -1646,8 +1669,8 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
 	 * userspace_addr or the base_gfn, as both are equally aligned (per
 	 * the check above) and equally sized.
 	 */
-	return (hva & S2_PMD_MASK) >= uaddr_start &&
-	       (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
+	return (hva & ~(map_size - 1)) >= uaddr_start &&
+	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
 }
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1676,12 +1699,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	if (!fault_supports_stage2_pmd_mappings(memslot, hva))
-		force_pte = true;
-
-	if (logging_active)
-		force_pte = true;
-
 	/* Let's check if we will get back a huge page backed by hugetlbfs */
 	down_read(&current->mm->mmap_sem);
 	vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1692,6 +1709,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	}
 
 	vma_pagesize = vma_kernel_pagesize(vma);
+	if (logging_active ||
+	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+		force_pte = true;
+		vma_pagesize = PAGE_SIZE;
+	}
+
 	/*
 	 * The stage2 has a minimum of 2 level table (For arm64 see
 	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
@@ -1699,11 +1722,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 * As for PUD huge maps, we must make sure that we have at least
 	 * 3 levels, i.e, PMD is not folded.
 	 */
-	if ((vma_pagesize == PMD_SIZE ||
-	     (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) &&
-	    !force_pte) {
+	if (vma_pagesize == PMD_SIZE ||
+	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
 		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
-	}
 	up_read(&current->mm->mmap_sem);
 
 	/* We need minimum second+third level pages */
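
fault_supports_stage2_huge_mapping() now takes the candidate block size, so the old S2_PMD_MASK tests become plain power-of-two mask arithmetic on (map_size - 1). A standalone sketch of that check with hypothetical addresses (2MiB block; the second pair is misaligned by 512KiB within the block and would be forced to PTE mappings):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * A block mapping of size map_size (a power of two) only works if the
 * IPA and the userspace VA have the same offset within one block.
 */
static bool same_block_alignment(uint64_t gpa_start, uint64_t uaddr_start,
				 uint64_t map_size)
{
	return (gpa_start & (map_size - 1)) == (uaddr_start & (map_size - 1));
}

int main(void)
{
	uint64_t blk = UINT64_C(2) << 20;	/* 2MiB */

	printf("%d\n", same_block_alignment(0x40100000, 0x7f3300100000, blk)); /* 1 */
	printf("%d\n", same_block_alignment(0x40100000, 0x7f3300180000, blk)); /* 0 */
	return 0;
}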
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index ab3f47745d9c..44ceaccb18cf 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -754,8 +754,9 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
 	u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
 	phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
 	int esz = GITS_BASER_ENTRY_SIZE(baser);
-	int index;
+	int index, idx;
 	gfn_t gfn;
+	bool ret;
 
 	switch (type) {
 	case GITS_BASER_TYPE_DEVICE:
@@ -782,7 +783,8 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
 
 		if (eaddr)
 			*eaddr = addr;
-		return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+		goto out;
 	}
 
 	/* calculate and check the index into the 1st level */
@@ -812,7 +814,12 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
 
 	if (eaddr)
 		*eaddr = indirect_ptr;
-	return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+out:
+	idx = srcu_read_lock(&its->dev->kvm->srcu);
+	ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
+	srcu_read_unlock(&its->dev->kvm->srcu, idx);
+	return ret;
 }
 
 static int vgic_its_alloc_collection(struct vgic_its *its,
@@ -1729,8 +1736,8 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev)
 	kfree(its);
 }
 
-int vgic_its_has_attr_regs(struct kvm_device *dev,
-			   struct kvm_device_attr *attr)
+static int vgic_its_has_attr_regs(struct kvm_device *dev,
+				  struct kvm_device_attr *attr)
 {
 	const struct vgic_register_region *region;
 	gpa_t offset = attr->attr;
@@ -1750,9 +1757,9 @@ int vgic_its_has_attr_regs(struct kvm_device *dev,
 	return 0;
 }
 
-int vgic_its_attr_regs_access(struct kvm_device *dev,
-			      struct kvm_device_attr *attr,
-			      u64 *reg, bool is_write)
+static int vgic_its_attr_regs_access(struct kvm_device *dev,
+				     struct kvm_device_attr *attr,
+				     u64 *reg, bool is_write)
 {
 	const struct vgic_register_region *region;
 	struct vgic_its *its;
@@ -1919,7 +1926,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
 	       ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
 		ite->collection->collection_id;
 	val = cpu_to_le64(val);
-	return kvm_write_guest(kvm, gpa, &val, ite_esz);
+	return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
 }
 
 /**
@@ -2066,7 +2073,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
 	       (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
 		(dev->num_eventid_bits - 1));
 	val = cpu_to_le64(val);
-	return kvm_write_guest(kvm, ptr, &val, dte_esz);
+	return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
 }
 
 /**
@@ -2246,7 +2253,7 @@ static int vgic_its_save_cte(struct vgic_its *its,
 	       ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
 	       collection->collection_id);
 	val = cpu_to_le64(val);
-	return kvm_write_guest(its->dev->kvm, gpa, &val, esz);
+	return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
 }
 
 static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
@@ -2317,7 +2324,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its)
 	 */
 	val = 0;
 	BUG_ON(cte_esz > sizeof(val));
-	ret = kvm_write_guest(its->dev->kvm, gpa, &val, cte_esz);
+	ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
 	return ret;
 }
 
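
The kvm_write_guest() calls replaced above can run without the kvm->srcu read lock that protects the memslot array, hence the switch to kvm_write_guest_lock(). A sketch of what such a wrapper looks like (assumes kernel context; the actual helper added by this series may differ in detail):

static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
				       const void *data, unsigned long len)
{
	/* Take the SRCU read lock protecting the memslots, write, unlock. */
	int srcu_idx = srcu_read_lock(&kvm->srcu);
	int ret = kvm_write_guest(kvm, gpa, data, len);

	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return ret;
}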
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 408a78eb6a97..9f87e58dbd4a 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -358,7 +358,7 @@ retry:
 		if (status) {
 			/* clear consumed data */
 			val &= ~(1 << bit_nr);
-			ret = kvm_write_guest(kvm, ptr, &val, 1);
+			ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
 			if (ret)
 				return ret;
 		}
@@ -409,7 +409,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
 		else
 			val &= ~(1 << bit_nr);
 
-		ret = kvm_write_guest(kvm, ptr, &val, 1);
+		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
 		if (ret)
 			return ret;
 	}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index abd9c7352677..3af69f2a3866 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -867,15 +867,21 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
 	 * either observe the new interrupt before or after doing this check,
 	 * and introducing additional synchronization mechanism doesn't change
 	 * this.
+	 *
+	 * Note that we still need to go through the whole thing if anything
+	 * can be directly injected (GICv4).
 	 */
-	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
+	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
+	    !vgic_supports_direct_msis(vcpu->kvm))
 		return;
 
 	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
 
-	raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
-	vgic_flush_lr_state(vcpu);
-	raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+	if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
+		raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+		vgic_flush_lr_state(vcpu);
+		raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+	}
 
 	if (can_access_vgic_from_kernel())
 		vgic_restore_state(vcpu);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 4325250afd72..001aeda4c154 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -214,9 +214,9 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 
 	if (flags & EPOLLHUP) {
 		/* The eventfd is closing, detach from KVM */
-		unsigned long flags;
+		unsigned long iflags;
 
-		spin_lock_irqsave(&kvm->irqfds.lock, flags);
+		spin_lock_irqsave(&kvm->irqfds.lock, iflags);
 
 		/*
 		 * We must check if someone deactivated the irqfd before
@@ -230,7 +230,7 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 		if (irqfd_is_active(irqfd))
 			irqfd_deactivate(irqfd);
 
-		spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
+		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
 	}
 
 	return 0;
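
The rename matters because the inner declaration shadowed the function's existing flags local (the poll event bits decoded from key), redefining it with a different meaning and type. A minimal standalone illustration of the shadowing pattern being removed:

#include <stdio.h>

int main(void)
{
	unsigned long flags = 0x2a;	/* outer meaning: event bits */

	{
		unsigned long flags;	/* shadows the outer variable */

		flags = 0xff;		/* never touches the outer one */
		(void)flags;
	}

	printf("%#lx\n", flags);	/* prints 0x2a */
	return 0;
}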
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f25aa98a94df..55fe8e20d8fd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2905,6 +2905,9 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
 {
 	struct kvm_device *dev = filp->private_data;
 
+	if (dev->kvm->mm != current->mm)
+		return -EIO;
+
 	switch (ioctl) {
 	case KVM_SET_DEVICE_ATTR:
 		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
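
With this check, every KVM device ioctl fails with EIO unless the caller shares the mm of the VM's creator, so a device fd that leaks to another process (by fork or fd passing) is useless there. A hypothetical userspace illustration, where device_fd came from KVM_CREATE_DEVICE in the parent and attr describes some device attribute:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <unistd.h>
#include <linux/kvm.h>

static void demo(int device_fd, struct kvm_device_attr *attr)
{
	pid_t pid = fork();

	if (pid == 0) {
		/* The child has its own mm, so the ioctl is now rejected. */
		if (ioctl(device_fd, KVM_SET_DEVICE_ATTR, attr) < 0 &&
		    errno == EIO)
			printf("rejected: caller is not the VM creator\n");
		_exit(0);
	}
	waitpid(pid, NULL, 0);
}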