author     Linus Torvalds <torvalds@linux-foundation.org>   2017-09-08 18:18:36 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-09-08 18:18:36 -0400
commit     0756b7fbb696d2cb18785da9cab13ec164017f64 (patch)
tree       d06242e3f35a7623e00068d7c95d06824f396df3
parent     6d6218976df142ba5594371f8dbd56650151c56f (diff)
parent     5f54c8b2d4fad95d1f8ecbe023ebe6038e6d3760 (diff)
Merge tag 'kvm-4.14-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Radim Krčmář:
 "First batch of KVM changes for 4.14

  Common:
   - improve heuristic for boosting preempted spinlocks by ignoring
     VCPUs in user mode

  ARM:
   - fix for decoding external abort types from guests
   - added support for migrating the active priority of interrupts when
     running a GICv2 guest on a GICv3 host
   - minor cleanup

  PPC:
   - expose storage keys to userspace
   - merge kvm-ppc-fixes with a fix that missed 4.13 because of
     vacations
   - fixes

  s390:
   - merge of kvm/master to avoid conflicts with additional sthyi fixes
   - wire up the no-dat enhancements in KVM
   - multiple epoch facility (z14 feature)
   - Configuration z/Architecture Mode
   - more sthyi fixes
   - gdb server range checking fix
   - small code cleanups

  x86:
   - emulate Hyper-V TSC frequency MSRs
   - add nested INVPCID
   - emulate EPTP switching VMFUNC
   - support Virtual GIF
   - support 5 level page tables
   - speedup nested VM exits by packing byte operations
   - speedup MMIO by using hardware provided physical address
   - a lot of fixes and cleanups, especially nested"

* tag 'kvm-4.14-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (67 commits)
  KVM: arm/arm64: Support uaccess of GICC_APRn
  KVM: arm/arm64: Extract GICv3 max APRn index calculation
  KVM: arm/arm64: vITS: Drop its_ite->lpi field
  KVM: arm/arm64: vgic: constify seq_operations and file_operations
  KVM: arm/arm64: Fix guest external abort matching
  KVM: PPC: Book3S HV: Fix memory leak in kvm_vm_ioctl_get_htab_fd
  KVM: s390: vsie: cleanup mcck reinjection
  KVM: s390: use WARN_ON_ONCE only for checking
  KVM: s390: guestdbg: fix range check
  KVM: PPC: Book3S HV: Report storage key support to userspace
  KVM: PPC: Book3S HV: Fix case where HDEC is treated as 32-bit on POWER9
  KVM: PPC: Book3S HV: Fix invalid use of register expression
  KVM: PPC: Book3S HV: Fix H_REGISTER_VPA VPA size validation
  KVM: PPC: Book3S HV: Fix setting of storage key in H_ENTER
  KVM: PPC: e500mc: Fix a NULL dereference
  KVM: PPC: e500: Fix some NULL dereferences on error
  KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list
  KVM: s390: we are always in czam mode
  KVM: s390: expose no-DAT to guest and migration support
  KVM: s390: sthyi: remove invalid guest write access
  ...
-rw-r--r--  Documentation/virtual/kvm/devices/arm-vgic.txt | 5
-rw-r--r--  Documentation/virtual/kvm/devices/vm.txt | 14
-rw-r--r--  MAINTAINERS | 37
-rw-r--r--  arch/arm/include/asm/kvm_arm.h | 1
-rw-r--r--  arch/arm/include/asm/kvm_emulate.h | 24
-rw-r--r--  arch/arm/kvm/handle_exit.c | 2
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h | 24
-rw-r--r--  arch/arm64/kvm/handle_exit.c | 2
-rw-r--r--  arch/arm64/kvm/vgic-sys-reg-v3.c | 23
-rw-r--r--  arch/mips/kvm/mips.c | 5
-rw-r--r--  arch/powerpc/include/asm/book3s/64/mmu-hash.h | 1
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 1
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c | 21
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 16
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 5
-rw-r--r--  arch/powerpc/kvm/e500.c | 8
-rw-r--r--  arch/powerpc/kvm/e500mc.c | 4
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 5
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 6
-rw-r--r--  arch/s390/include/asm/page-states.h | 2
-rw-r--r--  arch/s390/include/uapi/asm/kvm.h | 6
-rw-r--r--  arch/s390/kvm/diag.c | 2
-rw-r--r--  arch/s390/kvm/guestdbg.c | 2
-rw-r--r--  arch/s390/kvm/interrupt.c | 6
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 118
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 2
-rw-r--r--  arch/s390/kvm/priv.c | 6
-rw-r--r--  arch/s390/kvm/sigp.c | 36
-rw-r--r--  arch/s390/kvm/sthyi.c | 8
-rw-r--r--  arch/s390/kvm/vsie.c | 16
-rw-r--r--  arch/s390/mm/pgtable.c | 6
-rw-r--r--  arch/s390/tools/gen_facilities.c | 1
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 4
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 40
-rw-r--r--  arch/x86/include/asm/svm.h | 6
-rw-r--r--  arch/x86/include/asm/vmx.h | 22
-rw-r--r--  arch/x86/kvm/cpuid.c | 34
-rw-r--r--  arch/x86/kvm/cpuid.h | 186
-rw-r--r--  arch/x86/kvm/emulate.c | 42
-rw-r--r--  arch/x86/kvm/hyperv.c | 8
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 2
-rw-r--r--  arch/x86/kvm/lapic.h | 3
-rw-r--r--  arch/x86/kvm/mmu.c | 267
-rw-r--r--  arch/x86/kvm/mmu.h | 23
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 4
-rw-r--r--  arch/x86/kvm/mtrr.c | 2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 6
-rw-r--r--  arch/x86/kvm/svm.c | 139
-rw-r--r--  arch/x86/kvm/trace.h | 11
-rw-r--r--  arch/x86/kvm/vmx.c | 620
-rw-r--r--  arch/x86/kvm/x86.c | 213
-rw-r--r--  arch/x86/kvm/x86.h | 54
-rw-r--r--  include/linux/kvm_host.h | 9
-rw-r--r--  include/uapi/linux/kvm.h | 3
-rw-r--r--  virt/kvm/arm/arm.c | 5
-rw-r--r--  virt/kvm/arm/mmu.c | 40
-rw-r--r--  virt/kvm/arm/vgic/vgic-debug.c | 4
-rw-r--r--  virt/kvm/arm/vgic/vgic-its.c | 10
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio-v2.c | 47
-rw-r--r--  virt/kvm/arm/vgic/vgic.h | 16
-rw-r--r--  virt/kvm/kvm_main.c | 7
64 files changed, 1479 insertions, 768 deletions
diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt
index b2f60ca8b60c..b3ce12643553 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -83,6 +83,11 @@ Groups:
83 83
84 Bits for undefined preemption levels are RAZ/WI. 84 Bits for undefined preemption levels are RAZ/WI.
85 85
86 Note that this differs from a CPU's view of the APRs on hardware in which
87 a GIC without the security extensions expose group 0 and group 1 active
88 priorities in separate register groups, whereas we show a combined view
89 similar to GICv2's GICH_APR.
90
86 For historical reasons and to provide ABI compatibility with userspace we 91 For historical reasons and to provide ABI compatibility with userspace we
87 export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask 92 export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask
88 field in the lower 5 bits of a word, meaning that userspace must always 93 field in the lower 5 bits of a word, meaning that userspace must always
diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index 903fc926860b..95ca68d663a4 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -176,7 +176,8 @@ Architectures: s390
176 176
1773.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH 1773.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH
178 178
179Allows user space to set/get the TOD clock extension (u8). 179Allows user space to set/get the TOD clock extension (u8) (superseded by
180KVM_S390_VM_TOD_EXT).
180 181
181Parameters: address of a buffer in user space to store the data (u8) to 182Parameters: address of a buffer in user space to store the data (u8) to
182Returns: -EFAULT if the given address is not accessible from kernel space 183Returns: -EFAULT if the given address is not accessible from kernel space
@@ -190,6 +191,17 @@ the POP (u64).
190Parameters: address of a buffer in user space to store the data (u64) to 191Parameters: address of a buffer in user space to store the data (u64) to
191Returns: -EFAULT if the given address is not accessible from kernel space 192Returns: -EFAULT if the given address is not accessible from kernel space
192 193
1943.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT
195Allows user space to set/get bits 0-63 of the TOD clock register as defined in
196the POP (u64). If the guest CPU model supports the TOD clock extension (u8), it
197also allows user space to get/set it. If the guest CPU model does not support
198it, it is stored as 0 and not allowed to be set to a value != 0.
199
200Parameters: address of a buffer in user space to store the data
201 (kvm_s390_vm_tod_clock) to
202Returns: -EFAULT if the given address is not accessible from kernel space
203 -EINVAL if setting the TOD clock extension to != 0 is not supported
204
1934. GROUP: KVM_S390_VM_CRYPTO 2054. GROUP: KVM_S390_VM_CRYPTO
194Architectures: s390 206Architectures: s390
195 207
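
[Editor's note] The KVM_S390_VM_TOD_EXT attribute documented above is driven through the usual VM device-attribute ioctls. Below is a minimal userspace sketch, assuming an already open VM file descriptor vm_fd and uapi headers that carry the KVM_S390_VM_TOD / KVM_S390_VM_TOD_EXT definitions and struct kvm_s390_vm_tod_clock added by this series; it is illustrative only, not part of the patch.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Set the guest's extended TOD clock. Returns 0 on success; fails with
 * EINVAL when epoch_idx != 0 is not supported by the guest CPU model. */
static int set_guest_tod_ext(int vm_fd, __u8 epoch_idx, __u64 tod)
{
	struct kvm_s390_vm_tod_clock gtod = {
		.epoch_idx = epoch_idx,	/* multiple epoch facility index */
		.tod       = tod,	/* bits 0-63 of the TOD clock */
	};
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_TOD,
		.attr  = KVM_S390_VM_TOD_EXT,
		.addr  = (__u64)(unsigned long)&gtod,
	};

	return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}

Reading the clock back works the same way with KVM_GET_DEVICE_ATTR; on machines without the TOD clock extension the kernel reports epoch_idx as 0.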
diff --git a/MAINTAINERS b/MAINTAINERS
index bf206bd9f056..722c7aec88c2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7464,18 +7464,30 @@ L: kvm@vger.kernel.org
7464W: http://www.linux-kvm.org 7464W: http://www.linux-kvm.org
7465T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git 7465T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
7466S: Supported 7466S: Supported
7467F: Documentation/*/kvm*.txt
7468F: Documentation/virtual/kvm/ 7467F: Documentation/virtual/kvm/
7469F: arch/*/kvm/ 7468F: include/trace/events/kvm.h
7470F: arch/x86/kernel/kvm.c 7469F: include/uapi/asm-generic/kvm*
7471F: arch/x86/kernel/kvmclock.c
7472F: arch/*/include/asm/kvm*
7473F: include/linux/kvm*
7474F: include/uapi/linux/kvm* 7470F: include/uapi/linux/kvm*
7475F: virt/kvm/ 7471F: include/asm-generic/kvm*
7472F: include/linux/kvm*
7473F: include/kvm/iodev.h
7474F: virt/kvm/*
7476F: tools/kvm/ 7475F: tools/kvm/
7477 7476
7478KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V 7477KERNEL VIRTUAL MACHINE FOR X86 (KVM/x86)
7478M: Paolo Bonzini <pbonzini@redhat.com>
7479M: Radim Krčmář <rkrcmar@redhat.com>
7480L: kvm@vger.kernel.org
7481W: http://www.linux-kvm.org
7482T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
7483S: Supported
7484F: arch/x86/kvm/
7485F: arch/x86/include/uapi/asm/kvm*
7486F: arch/x86/include/asm/kvm*
7487F: arch/x86/kernel/kvm.c
7488F: arch/x86/kernel/kvmclock.c
7489
7490KERNEL VIRTUAL MACHINE FOR AMD-V (KVM/amd)
7479M: Joerg Roedel <joro@8bytes.org> 7491M: Joerg Roedel <joro@8bytes.org>
7480L: kvm@vger.kernel.org 7492L: kvm@vger.kernel.org
7481W: http://www.linux-kvm.org/ 7493W: http://www.linux-kvm.org/
@@ -7483,7 +7495,7 @@ S: Maintained
7483F: arch/x86/include/asm/svm.h 7495F: arch/x86/include/asm/svm.h
7484F: arch/x86/kvm/svm.c 7496F: arch/x86/kvm/svm.c
7485 7497
7486KERNEL VIRTUAL MACHINE (KVM) FOR ARM 7498KERNEL VIRTUAL MACHINE FOR ARM (KVM/arm)
7487M: Christoffer Dall <christoffer.dall@linaro.org> 7499M: Christoffer Dall <christoffer.dall@linaro.org>
7488M: Marc Zyngier <marc.zyngier@arm.com> 7500M: Marc Zyngier <marc.zyngier@arm.com>
7489L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) 7501L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@ -7497,14 +7509,16 @@ F: arch/arm/kvm/
7497F: virt/kvm/arm/ 7509F: virt/kvm/arm/
7498F: include/kvm/arm_* 7510F: include/kvm/arm_*
7499 7511
7500KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC 7512KERNEL VIRTUAL MACHINE FOR POWERPC (KVM/powerpc)
7501M: Alexander Graf <agraf@suse.com> 7513M: Alexander Graf <agraf@suse.com>
7502L: kvm-ppc@vger.kernel.org 7514L: kvm-ppc@vger.kernel.org
7503W: http://www.linux-kvm.org/ 7515W: http://www.linux-kvm.org/
7504T: git git://github.com/agraf/linux-2.6.git 7516T: git git://github.com/agraf/linux-2.6.git
7505S: Supported 7517S: Supported
7518F: arch/powerpc/include/uapi/asm/kvm*
7506F: arch/powerpc/include/asm/kvm* 7519F: arch/powerpc/include/asm/kvm*
7507F: arch/powerpc/kvm/ 7520F: arch/powerpc/kvm/
7521F: arch/powerpc/kernel/kvm*
7508 7522
7509KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64) 7523KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
7510M: Christoffer Dall <christoffer.dall@linaro.org> 7524M: Christoffer Dall <christoffer.dall@linaro.org>
@@ -7531,7 +7545,8 @@ L: linux-s390@vger.kernel.org
7531W: http://www.ibm.com/developerworks/linux/linux390/ 7545W: http://www.ibm.com/developerworks/linux/linux390/
7532T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git 7546T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
7533S: Supported 7547S: Supported
7534F: Documentation/s390/kvm.txt 7548F: arch/s390/include/uapi/asm/kvm*
7549F: arch/s390/include/asm/gmap.h
7535F: arch/s390/include/asm/kvm* 7550F: arch/s390/include/asm/kvm*
7536F: arch/s390/kvm/ 7551F: arch/s390/kvm/
7537F: arch/s390/mm/gmap.c 7552F: arch/s390/mm/gmap.c
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index ebf020b02bc8..c8781450905b 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -227,7 +227,6 @@
227 227
228#define HSR_DABT_S1PTW (_AC(1, UL) << 7) 228#define HSR_DABT_S1PTW (_AC(1, UL) << 7)
229#define HSR_DABT_CM (_AC(1, UL) << 8) 229#define HSR_DABT_CM (_AC(1, UL) << 8)
230#define HSR_DABT_EA (_AC(1, UL) << 9)
231 230
232#define kvm_arm_exception_type \ 231#define kvm_arm_exception_type \
233 {0, "RESET" }, \ 232 {0, "RESET" }, \
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 9a8a45aaf19a..98089ffd91bb 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -149,11 +149,6 @@ static inline int kvm_vcpu_dabt_get_rd(struct kvm_vcpu *vcpu)
149 return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT; 149 return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT;
150} 150}
151 151
152static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu)
153{
154 return kvm_vcpu_get_hsr(vcpu) & HSR_DABT_EA;
155}
156
157static inline bool kvm_vcpu_dabt_iss1tw(struct kvm_vcpu *vcpu) 152static inline bool kvm_vcpu_dabt_iss1tw(struct kvm_vcpu *vcpu)
158{ 153{
159 return kvm_vcpu_get_hsr(vcpu) & HSR_DABT_S1PTW; 154 return kvm_vcpu_get_hsr(vcpu) & HSR_DABT_S1PTW;
@@ -206,6 +201,25 @@ static inline u8 kvm_vcpu_trap_get_fault_type(struct kvm_vcpu *vcpu)
206 return kvm_vcpu_get_hsr(vcpu) & HSR_FSC_TYPE; 201 return kvm_vcpu_get_hsr(vcpu) & HSR_FSC_TYPE;
207} 202}
208 203
204static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu)
205{
206 switch (kvm_vcpu_trap_get_fault_type(vcpu)) {
207 case FSC_SEA:
208 case FSC_SEA_TTW0:
209 case FSC_SEA_TTW1:
210 case FSC_SEA_TTW2:
211 case FSC_SEA_TTW3:
212 case FSC_SECC:
213 case FSC_SECC_TTW0:
214 case FSC_SECC_TTW1:
215 case FSC_SECC_TTW2:
216 case FSC_SECC_TTW3:
217 return true;
218 default:
219 return false;
220 }
221}
222
209static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) 223static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu)
210{ 224{
211 return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; 225 return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK;
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index 54442e375354..cf8bf6bf87c4 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -67,7 +67,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
67 if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) { 67 if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) {
68 trace_kvm_wfx(*vcpu_pc(vcpu), true); 68 trace_kvm_wfx(*vcpu_pc(vcpu), true);
69 vcpu->stat.wfe_exit_stat++; 69 vcpu->stat.wfe_exit_stat++;
70 kvm_vcpu_on_spin(vcpu); 70 kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu));
71 } else { 71 } else {
72 trace_kvm_wfx(*vcpu_pc(vcpu), false); 72 trace_kvm_wfx(*vcpu_pc(vcpu), false);
73 vcpu->stat.wfi_exit_stat++; 73 vcpu->stat.wfi_exit_stat++;
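
[Editor's note] The WFE path above now passes vcpu_mode_priv(vcpu) as a second argument to kvm_vcpu_on_spin(), matching the series-wide change that lets the directed-yield heuristic skip vCPUs that were preempted in guest user mode (see the kvm_arch_vcpu_in_kernel() hooks added for mips, powerpc and s390 further down). The generic virt/kvm/kvm_main.c side is not shown in this hunk, so the filter below is only an illustrative sketch of the idea, not the actual change.

/* Illustrative only -- not the actual kvm_main.c change from this series. */
static bool boost_candidate(struct kvm_vcpu *target, bool yield_to_kernel_mode)
{
	/*
	 * A vCPU preempted while running guest user code cannot be holding
	 * the in-kernel spinlock the spinning vCPU is waiting on, so do not
	 * spend a directed yield on it.
	 */
	if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(target))
		return false;
	return true;
}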
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index fe39e6841326..e5df3fce0008 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -188,11 +188,6 @@ static inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu)
188 return (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT; 188 return (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
189} 189}
190 190
191static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu)
192{
193 return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_EA);
194}
195
196static inline bool kvm_vcpu_dabt_iss1tw(const struct kvm_vcpu *vcpu) 191static inline bool kvm_vcpu_dabt_iss1tw(const struct kvm_vcpu *vcpu)
197{ 192{
198 return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_S1PTW); 193 return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_S1PTW);
@@ -240,6 +235,25 @@ static inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu)
240 return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_FSC_TYPE; 235 return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_FSC_TYPE;
241} 236}
242 237
238static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu)
239{
240 switch (kvm_vcpu_trap_get_fault_type(vcpu)) {
241 case FSC_SEA:
242 case FSC_SEA_TTW0:
243 case FSC_SEA_TTW1:
244 case FSC_SEA_TTW2:
245 case FSC_SEA_TTW3:
246 case FSC_SECC:
247 case FSC_SECC_TTW0:
248 case FSC_SECC_TTW1:
249 case FSC_SECC_TTW2:
250 case FSC_SECC_TTW3:
251 return true;
252 default:
253 return false;
254 }
255}
256
243static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) 257static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
244{ 258{
245 u32 esr = kvm_vcpu_get_hsr(vcpu); 259 u32 esr = kvm_vcpu_get_hsr(vcpu);
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 17d8a1677a0b..7debb74843a0 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -84,7 +84,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
84 if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) { 84 if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) {
85 trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); 85 trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true);
86 vcpu->stat.wfe_exit_stat++; 86 vcpu->stat.wfe_exit_stat++;
87 kvm_vcpu_on_spin(vcpu); 87 kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu));
88 } else { 88 } else {
89 trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); 89 trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false);
90 vcpu->stat.wfi_exit_stat++; 90 vcpu->stat.wfi_exit_stat++;
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index 116786d2e8e8..c77d508b7462 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -208,29 +208,12 @@ static void vgic_v3_access_apr_reg(struct kvm_vcpu *vcpu,
208static bool access_gic_aprn(struct kvm_vcpu *vcpu, struct sys_reg_params *p, 208static bool access_gic_aprn(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
209 const struct sys_reg_desc *r, u8 apr) 209 const struct sys_reg_desc *r, u8 apr)
210{ 210{
211 struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu;
212 u8 idx = r->Op2 & 3; 211 u8 idx = r->Op2 & 3;
213 212
214 /* 213 if (idx > vgic_v3_max_apr_idx(vcpu))
215 * num_pri_bits are initialized with HW supported values. 214 goto err;
216 * We can rely safely on num_pri_bits even if VM has not
217 * restored ICC_CTLR_EL1 before restoring APnR registers.
218 */
219 switch (vgic_v3_cpu->num_pri_bits) {
220 case 7:
221 vgic_v3_access_apr_reg(vcpu, p, apr, idx);
222 break;
223 case 6:
224 if (idx > 1)
225 goto err;
226 vgic_v3_access_apr_reg(vcpu, p, apr, idx);
227 break;
228 default:
229 if (idx > 0)
230 goto err;
231 vgic_v3_access_apr_reg(vcpu, p, apr, idx);
232 }
233 215
216 vgic_v3_access_apr_reg(vcpu, p, apr, idx);
234 return true; 217 return true;
235err: 218err:
236 if (!p->is_write) 219 if (!p->is_write)
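
[Editor's note] The per-priority-bits switch removed above is now hidden behind vgic_v3_max_apr_idx(), which the "Extract GICv3 max APRn index calculation" commit adds to virt/kvm/arm/vgic/vgic.h (the header itself is not shown in this hunk). Based on the mapping the deleted switch encoded, the helper is presumably equivalent to the following sketch; treat it as an assumption.

/* Sketch derived from the removed switch; the real helper lives in
 * virt/kvm/arm/vgic/vgic.h. */
static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
{
	struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;

	switch (cpu_if->num_pri_bits) {
	case 7:
		return 3;	/* all four AP<n>R registers implemented */
	case 6:
		return 1;
	default:
		return 0;	/* 5 priority bits or fewer: only APR0 */
	}
}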
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index d4b2ad18eef2..bce2a6431430 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -98,6 +98,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
98 return !!(vcpu->arch.pending_exceptions); 98 return !!(vcpu->arch.pending_exceptions);
99} 99}
100 100
101bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
102{
103 return false;
104}
105
101int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 106int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
102{ 107{
103 return 1; 108 return 1;
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index f28d21c69f79..508275bb05d5 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -104,6 +104,7 @@
104#define HPTE_R_C ASM_CONST(0x0000000000000080) 104#define HPTE_R_C ASM_CONST(0x0000000000000080)
105#define HPTE_R_R ASM_CONST(0x0000000000000100) 105#define HPTE_R_R ASM_CONST(0x0000000000000100)
106#define HPTE_R_KEY_LO ASM_CONST(0x0000000000000e00) 106#define HPTE_R_KEY_LO ASM_CONST(0x0000000000000e00)
107#define HPTE_R_KEY (HPTE_R_KEY_LO | HPTE_R_KEY_HI)
107 108
108#define HPTE_V_1TB_SEG ASM_CONST(0x4000000000000000) 109#define HPTE_V_1TB_SEG ASM_CONST(0x4000000000000000)
109#define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000) 110#define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 67075e065ef2..7c62967d672c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1941,6 +1941,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
1941 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; 1941 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
1942 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); 1942 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
1943 if (ret < 0) { 1943 if (ret < 0) {
1944 kfree(ctx);
1944 kvm_put_kvm(kvm); 1945 kvm_put_kvm(kvm);
1945 return ret; 1946 return ret;
1946 } 1947 }
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 53766e2bc029..8f2da8bba737 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -265,8 +265,11 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
265{ 265{
266 struct kvmppc_spapr_tce_table *stt = filp->private_data; 266 struct kvmppc_spapr_tce_table *stt = filp->private_data;
267 struct kvmppc_spapr_tce_iommu_table *stit, *tmp; 267 struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
268 struct kvm *kvm = stt->kvm;
268 269
270 mutex_lock(&kvm->lock);
269 list_del_rcu(&stt->list); 271 list_del_rcu(&stt->list);
272 mutex_unlock(&kvm->lock);
270 273
271 list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { 274 list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
272 WARN_ON(!kref_read(&stit->kref)); 275 WARN_ON(!kref_read(&stit->kref));
@@ -298,7 +301,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
298 unsigned long npages, size; 301 unsigned long npages, size;
299 int ret = -ENOMEM; 302 int ret = -ENOMEM;
300 int i; 303 int i;
301 int fd = -1;
302 304
303 if (!args->size) 305 if (!args->size)
304 return -EINVAL; 306 return -EINVAL;
@@ -328,11 +330,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
328 goto fail; 330 goto fail;
329 } 331 }
330 332
331 ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
332 stt, O_RDWR | O_CLOEXEC);
333 if (ret < 0)
334 goto fail;
335
336 mutex_lock(&kvm->lock); 333 mutex_lock(&kvm->lock);
337 334
338 /* Check this LIOBN hasn't been previously allocated */ 335 /* Check this LIOBN hasn't been previously allocated */
@@ -344,17 +341,19 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
344 } 341 }
345 } 342 }
346 343
347 if (!ret) { 344 if (!ret)
345 ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
346 stt, O_RDWR | O_CLOEXEC);
347
348 if (ret >= 0) {
348 list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); 349 list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
349 kvm_get_kvm(kvm); 350 kvm_get_kvm(kvm);
350 } 351 }
351 352
352 mutex_unlock(&kvm->lock); 353 mutex_unlock(&kvm->lock);
353 354
354 if (!ret) 355 if (ret >= 0)
355 return fd; 356 return ret;
356
357 put_unused_fd(fd);
358 357
359 fail: 358 fail:
360 for (i = 0; i < npages; i++) 359 for (i = 0; i < npages; i++)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ebcf97cb5c98..18e974a34fce 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -485,7 +485,13 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
485 485
486 switch (subfunc) { 486 switch (subfunc) {
487 case H_VPA_REG_VPA: /* register VPA */ 487 case H_VPA_REG_VPA: /* register VPA */
488 if (len < sizeof(struct lppaca)) 488 /*
489 * The size of our lppaca is 1kB because of the way we align
490 * it for the guest to avoid crossing a 4kB boundary. We only
491 * use 640 bytes of the structure though, so we should accept
492 * clients that set a size of 640.
493 */
494 if (len < 640)
489 break; 495 break;
490 vpap = &tvcpu->arch.vpa; 496 vpap = &tvcpu->arch.vpa;
491 err = 0; 497 err = 0;
@@ -3336,6 +3342,14 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
3336 if (radix_enabled()) 3342 if (radix_enabled())
3337 return -EINVAL; 3343 return -EINVAL;
3338 3344
3345 /*
3346 * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
3347 * POWER7 doesn't support keys for instruction accesses,
3348 * POWER8 and POWER9 do.
3349 */
3350 info->data_keys = 32;
3351 info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
3352
3339 info->flags = KVM_PPC_PAGE_SIZES_REAL; 3353 info->flags = KVM_PPC_PAGE_SIZES_REAL;
3340 if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) 3354 if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
3341 info->flags |= KVM_PPC_1T_SEGMENTS; 3355 info->flags |= KVM_PPC_1T_SEGMENTS;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fedb0139524c..4efe364f1188 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -269,7 +269,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
269 if (!realmode) 269 if (!realmode)
270 local_irq_restore(irq_flags); 270 local_irq_restore(irq_flags);
271 271
272 ptel &= ~(HPTE_R_PP0 - psize); 272 ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1);
273 ptel |= pa; 273 ptel |= pa;
274 274
275 if (pa) 275 if (pa)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 2259b6cde119..663a4a861e7f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -982,7 +982,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
982#ifdef CONFIG_KVM_XICS 982#ifdef CONFIG_KVM_XICS
983 /* We are entering the guest on that thread, push VCPU to XIVE */ 983 /* We are entering the guest on that thread, push VCPU to XIVE */
984 ld r10, HSTATE_XIVE_TIMA_PHYS(r13) 984 ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
985 cmpldi cr0, r10, r0 985 cmpldi cr0, r10, 0
986 beq no_xive 986 beq no_xive
987 ld r11, VCPU_XIVE_SAVED_STATE(r4) 987 ld r11, VCPU_XIVE_SAVED_STATE(r4)
988 li r9, TM_QW1_OS 988 li r9, TM_QW1_OS
@@ -1286,7 +1286,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
1286 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER 1286 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
1287 bne 2f 1287 bne 2f
1288 mfspr r3,SPRN_HDEC 1288 mfspr r3,SPRN_HDEC
1289 cmpwi r3,0 1289 EXTEND_HDEC(r3)
1290 cmpdi r3,0
1290 mr r4,r9 1291 mr r4,r9
1291 bge fast_guest_return 1292 bge fast_guest_return
12922: 12932:
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 32fdab57d604..f9f6468f4171 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -455,16 +455,20 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm,
455 if (err) 455 if (err)
456 goto free_vcpu; 456 goto free_vcpu;
457 457
458 if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) 458 if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) {
459 err = -ENOMEM;
459 goto uninit_vcpu; 460 goto uninit_vcpu;
461 }
460 462
461 err = kvmppc_e500_tlb_init(vcpu_e500); 463 err = kvmppc_e500_tlb_init(vcpu_e500);
462 if (err) 464 if (err)
463 goto uninit_id; 465 goto uninit_id;
464 466
465 vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO); 467 vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
466 if (!vcpu->arch.shared) 468 if (!vcpu->arch.shared) {
469 err = -ENOMEM;
467 goto uninit_tlb; 470 goto uninit_tlb;
471 }
468 472
469 return vcpu; 473 return vcpu;
470 474
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index f48a0c22e8f9..d0b6b5788afc 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -331,8 +331,10 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm,
331 goto uninit_vcpu; 331 goto uninit_vcpu;
332 332
333 vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 333 vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
334 if (!vcpu->arch.shared) 334 if (!vcpu->arch.shared) {
335 err = -ENOMEM;
335 goto uninit_tlb; 336 goto uninit_tlb;
337 }
336 338
337 return vcpu; 339 return vcpu;
338 340
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1a75c0b5f4ca..3480faaf1ef8 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -58,6 +58,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
58 return !!(v->arch.pending_exceptions) || kvm_request_pending(v); 58 return !!(v->arch.pending_exceptions) || kvm_request_pending(v);
59} 59}
60 60
61bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
62{
63 return false;
64}
65
61int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 66int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
62{ 67{
63 return 1; 68 return 1;
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index a409d5991934..51375e766e90 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -226,7 +226,9 @@ struct kvm_s390_sie_block {
226#define ECB3_RI 0x01 226#define ECB3_RI 0x01
227 __u8 ecb3; /* 0x0063 */ 227 __u8 ecb3; /* 0x0063 */
228 __u32 scaol; /* 0x0064 */ 228 __u32 scaol; /* 0x0064 */
229 __u8 reserved68[4]; /* 0x0068 */ 229 __u8 reserved68; /* 0x0068 */
230 __u8 epdx; /* 0x0069 */
231 __u8 reserved6a[2]; /* 0x006a */
230 __u32 todpr; /* 0x006c */ 232 __u32 todpr; /* 0x006c */
231 __u8 reserved70[16]; /* 0x0070 */ 233 __u8 reserved70[16]; /* 0x0070 */
232 __u64 mso; /* 0x0080 */ 234 __u64 mso; /* 0x0080 */
@@ -265,6 +267,7 @@ struct kvm_s390_sie_block {
265 __u64 cbrlo; /* 0x01b8 */ 267 __u64 cbrlo; /* 0x01b8 */
266 __u8 reserved1c0[8]; /* 0x01c0 */ 268 __u8 reserved1c0[8]; /* 0x01c0 */
267#define ECD_HOSTREGMGMT 0x20000000 269#define ECD_HOSTREGMGMT 0x20000000
270#define ECD_MEF 0x08000000
268 __u32 ecd; /* 0x01c8 */ 271 __u32 ecd; /* 0x01c8 */
269 __u8 reserved1cc[18]; /* 0x01cc */ 272 __u8 reserved1cc[18]; /* 0x01cc */
270 __u64 pp; /* 0x01de */ 273 __u64 pp; /* 0x01de */
@@ -739,6 +742,7 @@ struct kvm_arch{
739 struct kvm_s390_cpu_model model; 742 struct kvm_s390_cpu_model model;
740 struct kvm_s390_crypto crypto; 743 struct kvm_s390_crypto crypto;
741 struct kvm_s390_vsie vsie; 744 struct kvm_s390_vsie vsie;
745 u8 epdx;
742 u64 epoch; 746 u64 epoch;
743 struct kvm_s390_migration_state *migration_state; 747 struct kvm_s390_migration_state *migration_state;
744 /* subset of available cpu features enabled by user space */ 748 /* subset of available cpu features enabled by user space */
diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h
index ca21b28a7b17..22b0f49e87c1 100644
--- a/arch/s390/include/asm/page-states.h
+++ b/arch/s390/include/asm/page-states.h
@@ -15,6 +15,6 @@
15#define ESSA_SET_STABLE_IF_RESIDENT 6 15#define ESSA_SET_STABLE_IF_RESIDENT 6
16#define ESSA_SET_STABLE_NODAT 7 16#define ESSA_SET_STABLE_NODAT 7
17 17
18#define ESSA_MAX ESSA_SET_STABLE_IF_RESIDENT 18#define ESSA_MAX ESSA_SET_STABLE_NODAT
19 19
20#endif 20#endif
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 69d09c39bbcd..cd7359e23d86 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -88,6 +88,12 @@ struct kvm_s390_io_adapter_req {
88/* kvm attributes for KVM_S390_VM_TOD */ 88/* kvm attributes for KVM_S390_VM_TOD */
89#define KVM_S390_VM_TOD_LOW 0 89#define KVM_S390_VM_TOD_LOW 0
90#define KVM_S390_VM_TOD_HIGH 1 90#define KVM_S390_VM_TOD_HIGH 1
91#define KVM_S390_VM_TOD_EXT 2
92
93struct kvm_s390_vm_tod_clock {
94 __u8 epoch_idx;
95 __u64 tod;
96};
91 97
92/* kvm attributes for KVM_S390_VM_CPU_MODEL */ 98/* kvm attributes for KVM_S390_VM_CPU_MODEL */
93/* processor related attributes are r/w */ 99/* processor related attributes are r/w */
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index e4d36094aceb..d93a2c0474bf 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -150,7 +150,7 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
150{ 150{
151 VCPU_EVENT(vcpu, 5, "%s", "diag time slice end"); 151 VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
152 vcpu->stat.diagnose_44++; 152 vcpu->stat.diagnose_44++;
153 kvm_vcpu_on_spin(vcpu); 153 kvm_vcpu_on_spin(vcpu, true);
154 return 0; 154 return 0;
155} 155}
156 156
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index c2e0ddc1356e..bcbd86621d01 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -308,7 +308,7 @@ static inline int in_addr_range(u64 addr, u64 a, u64 b)
308 return (addr >= a) && (addr <= b); 308 return (addr >= a) && (addr <= b);
309 else 309 else
310 /* "overflowing" interval */ 310 /* "overflowing" interval */
311 return (addr <= a) && (addr >= b); 311 return (addr >= a) || (addr <= b);
312} 312}
313 313
314#define end_of_range(bp_info) (bp_info->addr + bp_info->len - 1) 314#define end_of_range(bp_info) (bp_info->addr + bp_info->len - 1)
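
[Editor's note] The guestdbg fix above concerns ranges that wrap past the top of the address space (a > b): an address is inside such a range when it lies at or above the start or at or below the wrapped end, so the old AND selected exactly the complement of the interval. A small worked example of the corrected check:

/* Same logic as the fixed in_addr_range() above, with a worked example. */
static inline int wrapped_in_range(__u64 addr, __u64 a, __u64 b)
{
	if (a <= b)
		return addr >= a && addr <= b;
	/* "overflowing" interval, e.g. a = 0xfffffffffffff000, b = 0x0fff */
	return addr >= a || addr <= b;
}
/*
 * With a = 0xfffffffffffff000 and b = 0x0fff:
 *   addr = 0xfffffffffffff800 -> in range      (addr >= a)
 *   addr = 0x0000000000000800 -> in range      (addr <= b)
 *   addr = 0x0000000000123456 -> not in range
 * The previous "(addr <= a) && (addr >= b)" accepted only the middle block,
 * i.e. precisely the addresses outside the wrapped interval.
 */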
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index a619ddae610d..a832ad031cee 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2479,6 +2479,7 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
2479 struct kvm_s390_mchk_info *mchk; 2479 struct kvm_s390_mchk_info *mchk;
2480 union mci mci; 2480 union mci mci;
2481 __u64 cr14 = 0; /* upper bits are not used */ 2481 __u64 cr14 = 0; /* upper bits are not used */
2482 int rc;
2482 2483
2483 mci.val = mcck_info->mcic; 2484 mci.val = mcck_info->mcic;
2484 if (mci.sr) 2485 if (mci.sr)
@@ -2496,12 +2497,13 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
2496 if (mci.ck) { 2497 if (mci.ck) {
2497 /* Inject the floating machine check */ 2498 /* Inject the floating machine check */
2498 inti.type = KVM_S390_MCHK; 2499 inti.type = KVM_S390_MCHK;
2499 WARN_ON_ONCE(__inject_vm(vcpu->kvm, &inti)); 2500 rc = __inject_vm(vcpu->kvm, &inti);
2500 } else { 2501 } else {
2501 /* Inject the machine check to specified vcpu */ 2502 /* Inject the machine check to specified vcpu */
2502 irq.type = KVM_S390_MCHK; 2503 irq.type = KVM_S390_MCHK;
2503 WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &irq)); 2504 rc = kvm_s390_inject_vcpu(vcpu, &irq);
2504 } 2505 }
2506 WARN_ON_ONCE(rc);
2505} 2507}
2506 2508
2507int kvm_set_routing_entry(struct kvm *kvm, 2509int kvm_set_routing_entry(struct kvm *kvm,
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index af09d3437631..40d0a1a97889 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -130,6 +130,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
130 { NULL } 130 { NULL }
131}; 131};
132 132
133struct kvm_s390_tod_clock_ext {
134 __u8 epoch_idx;
135 __u64 tod;
136 __u8 reserved[7];
137} __packed;
138
133/* allow nested virtualization in KVM (if enabled by user space) */ 139/* allow nested virtualization in KVM (if enabled by user space) */
134static int nested; 140static int nested;
135module_param(nested, int, S_IRUGO); 141module_param(nested, int, S_IRUGO);
@@ -874,6 +880,26 @@ static int kvm_s390_vm_get_migration(struct kvm *kvm,
874 return 0; 880 return 0;
875} 881}
876 882
883static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
884{
885 struct kvm_s390_vm_tod_clock gtod;
886
887 if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
888 return -EFAULT;
889
890 if (test_kvm_facility(kvm, 139))
891 kvm_s390_set_tod_clock_ext(kvm, &gtod);
892 else if (gtod.epoch_idx == 0)
893 kvm_s390_set_tod_clock(kvm, gtod.tod);
894 else
895 return -EINVAL;
896
897 VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx",
898 gtod.epoch_idx, gtod.tod);
899
900 return 0;
901}
902
877static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) 903static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
878{ 904{
879 u8 gtod_high; 905 u8 gtod_high;
@@ -909,6 +935,9 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
909 return -EINVAL; 935 return -EINVAL;
910 936
911 switch (attr->attr) { 937 switch (attr->attr) {
938 case KVM_S390_VM_TOD_EXT:
939 ret = kvm_s390_set_tod_ext(kvm, attr);
940 break;
912 case KVM_S390_VM_TOD_HIGH: 941 case KVM_S390_VM_TOD_HIGH:
913 ret = kvm_s390_set_tod_high(kvm, attr); 942 ret = kvm_s390_set_tod_high(kvm, attr);
914 break; 943 break;
@@ -922,6 +951,43 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
922 return ret; 951 return ret;
923} 952}
924 953
954static void kvm_s390_get_tod_clock_ext(struct kvm *kvm,
955 struct kvm_s390_vm_tod_clock *gtod)
956{
957 struct kvm_s390_tod_clock_ext htod;
958
959 preempt_disable();
960
961 get_tod_clock_ext((char *)&htod);
962
963 gtod->tod = htod.tod + kvm->arch.epoch;
964 gtod->epoch_idx = htod.epoch_idx + kvm->arch.epdx;
965
966 if (gtod->tod < htod.tod)
967 gtod->epoch_idx += 1;
968
969 preempt_enable();
970}
971
972static int kvm_s390_get_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
973{
974 struct kvm_s390_vm_tod_clock gtod;
975
976 memset(&gtod, 0, sizeof(gtod));
977
978 if (test_kvm_facility(kvm, 139))
979 kvm_s390_get_tod_clock_ext(kvm, &gtod);
980 else
981 gtod.tod = kvm_s390_get_tod_clock_fast(kvm);
982
983 if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
984 return -EFAULT;
985
986 VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x, TOD base: 0x%llx",
987 gtod.epoch_idx, gtod.tod);
988 return 0;
989}
990
925static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) 991static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
926{ 992{
927 u8 gtod_high = 0; 993 u8 gtod_high = 0;
@@ -954,6 +1020,9 @@ static int kvm_s390_get_tod(struct kvm *kvm, struct kvm_device_attr *attr)
954 return -EINVAL; 1020 return -EINVAL;
955 1021
956 switch (attr->attr) { 1022 switch (attr->attr) {
1023 case KVM_S390_VM_TOD_EXT:
1024 ret = kvm_s390_get_tod_ext(kvm, attr);
1025 break;
957 case KVM_S390_VM_TOD_HIGH: 1026 case KVM_S390_VM_TOD_HIGH:
958 ret = kvm_s390_get_tod_high(kvm, attr); 1027 ret = kvm_s390_get_tod_high(kvm, attr);
959 break; 1028 break;
@@ -1505,7 +1574,7 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
1505 if (r < 0) 1574 if (r < 0)
1506 pgstev = 0; 1575 pgstev = 0;
1507 /* save the value */ 1576 /* save the value */
1508 res[i++] = (pgstev >> 24) & 0x3; 1577 res[i++] = (pgstev >> 24) & 0x43;
1509 /* 1578 /*
1510 * if the next bit is too far away, stop. 1579 * if the next bit is too far away, stop.
1511 * if we reached the previous "next", find the next one 1580 * if we reached the previous "next", find the next one
@@ -1583,7 +1652,7 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
1583 1652
1584 pgstev = bits[i]; 1653 pgstev = bits[i];
1585 pgstev = pgstev << 24; 1654 pgstev = pgstev << 24;
1586 mask &= _PGSTE_GPS_USAGE_MASK; 1655 mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT;
1587 set_pgste_bits(kvm->mm, hva, mask, pgstev); 1656 set_pgste_bits(kvm->mm, hva, mask, pgstev);
1588 } 1657 }
1589 srcu_read_unlock(&kvm->srcu, srcu_idx); 1658 srcu_read_unlock(&kvm->srcu, srcu_idx);
@@ -1858,8 +1927,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
1858 memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask, 1927 memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
1859 S390_ARCH_FAC_LIST_SIZE_BYTE); 1928 S390_ARCH_FAC_LIST_SIZE_BYTE);
1860 1929
1930 /* we are always in czam mode - even on pre z14 machines */
1931 set_kvm_facility(kvm->arch.model.fac_mask, 138);
1932 set_kvm_facility(kvm->arch.model.fac_list, 138);
1933 /* we emulate STHYI in kvm */
1861 set_kvm_facility(kvm->arch.model.fac_mask, 74); 1934 set_kvm_facility(kvm->arch.model.fac_mask, 74);
1862 set_kvm_facility(kvm->arch.model.fac_list, 74); 1935 set_kvm_facility(kvm->arch.model.fac_list, 74);
1936 if (MACHINE_HAS_TLB_GUEST) {
1937 set_kvm_facility(kvm->arch.model.fac_mask, 147);
1938 set_kvm_facility(kvm->arch.model.fac_list, 147);
1939 }
1863 1940
1864 kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); 1941 kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
1865 kvm->arch.model.ibc = sclp.ibc & 0x0fff; 1942 kvm->arch.model.ibc = sclp.ibc & 0x0fff;
@@ -2369,6 +2446,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
2369 vcpu->arch.sie_block->eca |= ECA_VX; 2446 vcpu->arch.sie_block->eca |= ECA_VX;
2370 vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; 2447 vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT;
2371 } 2448 }
2449 if (test_kvm_facility(vcpu->kvm, 139))
2450 vcpu->arch.sie_block->ecd |= ECD_MEF;
2451
2372 vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) 2452 vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx)
2373 | SDNXC; 2453 | SDNXC;
2374 vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; 2454 vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
@@ -2447,6 +2527,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
2447 return kvm_s390_vcpu_has_irq(vcpu, 0); 2527 return kvm_s390_vcpu_has_irq(vcpu, 0);
2448} 2528}
2449 2529
2530bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
2531{
2532 return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE);
2533}
2534
2450void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu) 2535void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu)
2451{ 2536{
2452 atomic_or(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); 2537 atomic_or(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
@@ -2855,6 +2940,35 @@ retry:
2855 return 0; 2940 return 0;
2856} 2941}
2857 2942
2943void kvm_s390_set_tod_clock_ext(struct kvm *kvm,
2944 const struct kvm_s390_vm_tod_clock *gtod)
2945{
2946 struct kvm_vcpu *vcpu;
2947 struct kvm_s390_tod_clock_ext htod;
2948 int i;
2949
2950 mutex_lock(&kvm->lock);
2951 preempt_disable();
2952
2953 get_tod_clock_ext((char *)&htod);
2954
2955 kvm->arch.epoch = gtod->tod - htod.tod;
2956 kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx;
2957
2958 if (kvm->arch.epoch > gtod->tod)
2959 kvm->arch.epdx -= 1;
2960
2961 kvm_s390_vcpu_block_all(kvm);
2962 kvm_for_each_vcpu(i, vcpu, kvm) {
2963 vcpu->arch.sie_block->epoch = kvm->arch.epoch;
2964 vcpu->arch.sie_block->epdx = kvm->arch.epdx;
2965 }
2966
2967 kvm_s390_vcpu_unblock_all(kvm);
2968 preempt_enable();
2969 mutex_unlock(&kvm->lock);
2970}
2971
2858void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod) 2972void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod)
2859{ 2973{
2860 struct kvm_vcpu *vcpu; 2974 struct kvm_vcpu *vcpu;
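
[Editor's note] With the multiple epoch facility the guest TOD is effectively a wider quantity (epoch index : 64-bit TOD), but kvm_s390_set_tod_clock_ext() and kvm_s390_get_tod_clock_ext() above do the arithmetic in 64 bits and propagate the carry or borrow into the epoch index by hand. The helper below is an illustrative mirror of the read-side carry handling, not part of the patch.

/* Illustrative only: mirrors the carry handling in
 * kvm_s390_get_tod_clock_ext(). */
static void guest_tod_from_host(__u8 host_idx, __u64 host_tod,
				__u8 epdx, __u64 epoch,
				__u8 *guest_idx, __u64 *guest_tod)
{
	*guest_tod = host_tod + epoch;	/* wraps modulo 2^64 */
	*guest_idx = host_idx + epdx;
	if (*guest_tod < host_tod)	/* carry out of the 64-bit add */
		*guest_idx += 1;
	/*
	 * e.g. host_tod = 0xf000000000000000, epoch = 0x2000000000000000:
	 * the sum wraps to 0x1000000000000000 < host_tod, so the epoch
	 * index is bumped; the setter performs the mirror-image borrow.
	 */
}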
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 6fedc8bc7a37..9f8fdd7b2311 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -272,6 +272,8 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
272int handle_sthyi(struct kvm_vcpu *vcpu); 272int handle_sthyi(struct kvm_vcpu *vcpu);
273 273
274/* implemented in kvm-s390.c */ 274/* implemented in kvm-s390.c */
275void kvm_s390_set_tod_clock_ext(struct kvm *kvm,
276 const struct kvm_s390_vm_tod_clock *gtod);
275void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); 277void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
276long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); 278long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
277int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); 279int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 785ad028bde6..c954ac49eee4 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -988,6 +988,8 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
988 if (pgstev & _PGSTE_GPS_ZERO) 988 if (pgstev & _PGSTE_GPS_ZERO)
989 res |= 1; 989 res |= 1;
990 } 990 }
991 if (pgstev & _PGSTE_GPS_NODAT)
992 res |= 0x20;
991 vcpu->run->s.regs.gprs[r1] = res; 993 vcpu->run->s.regs.gprs[r1] = res;
992 /* 994 /*
993 * It is possible that all the normal 511 slots were full, in which case 995 * It is possible that all the normal 511 slots were full, in which case
@@ -1027,7 +1029,9 @@ static int handle_essa(struct kvm_vcpu *vcpu)
1027 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 1029 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
1028 /* Check for invalid operation request code */ 1030 /* Check for invalid operation request code */
1029 orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; 1031 orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
1030 if (orc > ESSA_MAX) 1032 /* ORCs 0-6 are always valid */
1033 if (orc > (test_kvm_facility(vcpu->kvm, 147) ? ESSA_SET_STABLE_NODAT
1034 : ESSA_SET_STABLE_IF_RESIDENT))
1031 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 1035 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
1032 1036
1033 if (likely(!vcpu->kvm->arch.migration_state)) { 1037 if (likely(!vcpu->kvm->arch.migration_state)) {
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 1a252f537081..9d592ef4104b 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -155,29 +155,26 @@ static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu,
155 return rc; 155 return rc;
156} 156}
157 157
158static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) 158static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter,
159 u64 *status_reg)
159{ 160{
160 int rc;
161 unsigned int i; 161 unsigned int i;
162 struct kvm_vcpu *v; 162 struct kvm_vcpu *v;
163 bool all_stopped = true;
163 164
164 switch (parameter & 0xff) { 165 kvm_for_each_vcpu(i, v, vcpu->kvm) {
165 case 0: 166 if (v == vcpu)
166 rc = SIGP_CC_NOT_OPERATIONAL; 167 continue;
167 break; 168 if (!is_vcpu_stopped(v))
168 case 1: 169 all_stopped = false;
169 case 2:
170 kvm_for_each_vcpu(i, v, vcpu->kvm) {
171 v->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
172 kvm_clear_async_pf_completion_queue(v);
173 }
174
175 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
176 break;
177 default:
178 rc = -EOPNOTSUPP;
179 } 170 }
180 return rc; 171
172 *status_reg &= 0xffffffff00000000UL;
173
174 /* Reject set arch order, with czam we're always in z/Arch mode. */
175 *status_reg |= (all_stopped ? SIGP_STATUS_INVALID_PARAMETER :
176 SIGP_STATUS_INCORRECT_STATE);
177 return SIGP_CC_STATUS_STORED;
181} 178}
182 179
183static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, 180static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
@@ -446,7 +443,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
446 switch (order_code) { 443 switch (order_code) {
447 case SIGP_SET_ARCHITECTURE: 444 case SIGP_SET_ARCHITECTURE:
448 vcpu->stat.instruction_sigp_arch++; 445 vcpu->stat.instruction_sigp_arch++;
449 rc = __sigp_set_arch(vcpu, parameter); 446 rc = __sigp_set_arch(vcpu, parameter,
447 &vcpu->run->s.regs.gprs[r1]);
450 break; 448 break;
451 default: 449 default:
452 rc = handle_sigp_dst(vcpu, order_code, cpu_addr, 450 rc = handle_sigp_dst(vcpu, order_code, cpu_addr,
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
index a2e5c24f47a7..395926b8c1ed 100644
--- a/arch/s390/kvm/sthyi.c
+++ b/arch/s390/kvm/sthyi.c
@@ -436,14 +436,6 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
436 if (addr & ~PAGE_MASK) 436 if (addr & ~PAGE_MASK)
437 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 437 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
438 438
439 /*
440 * If the page has not yet been faulted in, we want to do that
441 * now and not after all the expensive calculations.
442 */
443 r = write_guest(vcpu, addr, reg2, &cc, 1);
444 if (r)
445 return kvm_s390_inject_prog_cond(vcpu, r);
446
447 sctns = (void *)get_zeroed_page(GFP_KERNEL); 439 sctns = (void *)get_zeroed_page(GFP_KERNEL);
448 if (!sctns) 440 if (!sctns)
449 return -ENOMEM; 441 return -ENOMEM;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index ba8203e4d516..b18b5652e5c5 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -349,6 +349,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
349 scb_s->eca |= scb_o->eca & ECA_IB; 349 scb_s->eca |= scb_o->eca & ECA_IB;
350 if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) 350 if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
351 scb_s->eca |= scb_o->eca & ECA_CEI; 351 scb_s->eca |= scb_o->eca & ECA_CEI;
352 /* Epoch Extension */
353 if (test_kvm_facility(vcpu->kvm, 139))
354 scb_s->ecd |= scb_o->ecd & ECD_MEF;
352 355
353 prepare_ibc(vcpu, vsie_page); 356 prepare_ibc(vcpu, vsie_page);
354 rc = shadow_crycb(vcpu, vsie_page); 357 rc = shadow_crycb(vcpu, vsie_page);
@@ -806,8 +809,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
806{ 809{
807 struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; 810 struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
808 struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; 811 struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
809 struct mcck_volatile_info *mcck_info;
810 struct sie_page *sie_page;
811 int rc; 812 int rc;
812 813
813 handle_last_fault(vcpu, vsie_page); 814 handle_last_fault(vcpu, vsie_page);
@@ -831,9 +832,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
831 832
832 if (rc == -EINTR) { 833 if (rc == -EINTR) {
833 VCPU_EVENT(vcpu, 3, "%s", "machine check"); 834 VCPU_EVENT(vcpu, 3, "%s", "machine check");
834 sie_page = container_of(scb_s, struct sie_page, sie_block); 835 kvm_s390_reinject_machine_check(vcpu, &vsie_page->mcck_info);
835 mcck_info = &sie_page->mcck_info;
836 kvm_s390_reinject_machine_check(vcpu, mcck_info);
837 return 0; 836 return 0;
838 } 837 }
839 838
@@ -919,6 +918,13 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu,
919 */ 918 */
920 preempt_disable(); 919 preempt_disable();
921 scb_s->epoch += vcpu->kvm->arch.epoch; 920 scb_s->epoch += vcpu->kvm->arch.epoch;
921
922 if (scb_s->ecd & ECD_MEF) {
923 scb_s->epdx += vcpu->kvm->arch.epdx;
924 if (scb_s->epoch < vcpu->kvm->arch.epoch)
925 scb_s->epdx += 1;
926 }
927
922 preempt_enable(); 928 preempt_enable();
923} 929}
924 930
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 4198a71b8fdd..ae677f814bc0 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -919,7 +919,7 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
919 case ESSA_GET_STATE: 919 case ESSA_GET_STATE:
920 break; 920 break;
921 case ESSA_SET_STABLE: 921 case ESSA_SET_STABLE:
922 pgstev &= ~_PGSTE_GPS_USAGE_MASK; 922 pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
923 pgstev |= _PGSTE_GPS_USAGE_STABLE; 923 pgstev |= _PGSTE_GPS_USAGE_STABLE;
924 break; 924 break;
925 case ESSA_SET_UNUSED: 925 case ESSA_SET_UNUSED:
@@ -965,6 +965,10 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
965 pgstev |= _PGSTE_GPS_USAGE_STABLE; 965 pgstev |= _PGSTE_GPS_USAGE_STABLE;
966 } 966 }
967 break; 967 break;
968 case ESSA_SET_STABLE_NODAT:
969 pgstev &= ~_PGSTE_GPS_USAGE_MASK;
970 pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
971 break;
968 default: 972 default:
969 /* we should never get here! */ 973 /* we should never get here! */
970 break; 974 break;
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 29d72bf8ed2b..70dd8f17d054 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -83,6 +83,7 @@ static struct facility_def facility_defs[] = {
83 78, /* enhanced-DAT 2 */ 83 78, /* enhanced-DAT 2 */
84 130, /* instruction-execution-protection */ 84 130, /* instruction-execution-protection */
85 131, /* enhanced-SOP 2 and side-effect */ 85 131, /* enhanced-SOP 2 and side-effect */
86 139, /* multiple epoch facility */
86 146, /* msa extension 8 */ 87 146, /* msa extension 8 */
87 -1 /* END */ 88 -1 /* END */
88 } 89 }
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 42bbbf0f173d..2519c6c801c9 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -288,6 +288,7 @@
288#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ 288#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
289#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ 289#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
290#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ 290#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
291#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
291 292
292/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ 293/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
293#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ 294#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index fde36f189836..fa2558e12024 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -219,8 +219,8 @@ struct x86_emulate_ops {
219 struct x86_instruction_info *info, 219 struct x86_instruction_info *info,
220 enum x86_intercept_stage stage); 220 enum x86_intercept_stage stage);
221 221
222 void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, 222 bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
223 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 223 u32 *ecx, u32 *edx, bool check_limit);
224 void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); 224 void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
225 225
226 unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); 226 unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 369e41c23f07..8844eee290b2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,15 +79,14 @@
79 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 79 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
80 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 80 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
81 81
82#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
83#define CR3_PCID_INVD BIT_64(63) 82#define CR3_PCID_INVD BIT_64(63)
84#define CR4_RESERVED_BITS \ 83#define CR4_RESERVED_BITS \
85 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 84 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
86 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 85 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
87 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ 86 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
88 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ 87 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
89 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \ 88 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
90 | X86_CR4_PKE)) 89 | X86_CR4_SMAP | X86_CR4_PKE))
91 90
92#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 91#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
93 92
@@ -204,7 +203,6 @@ enum {
204#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) 203#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
205 204
206#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ 205#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
207 PFERR_USER_MASK | \
208 PFERR_WRITE_MASK | \ 206 PFERR_WRITE_MASK | \
209 PFERR_PRESENT_MASK) 207 PFERR_PRESENT_MASK)
210 208
@@ -317,15 +315,17 @@ struct kvm_pio_request {
317 int size; 315 int size;
318}; 316};
319 317
318#define PT64_ROOT_MAX_LEVEL 5
319
320struct rsvd_bits_validate { 320struct rsvd_bits_validate {
321 u64 rsvd_bits_mask[2][4]; 321 u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
322 u64 bad_mt_xwr; 322 u64 bad_mt_xwr;
323}; 323};
324 324
325/* 325/*
326 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level 326 * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
327 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu 327 * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the
328 * mode. 328 * current mmu mode.
329 */ 329 */
330struct kvm_mmu { 330struct kvm_mmu {
331 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); 331 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
@@ -548,8 +548,8 @@ struct kvm_vcpu_arch {
548 548
549 struct kvm_queued_exception { 549 struct kvm_queued_exception {
550 bool pending; 550 bool pending;
551 bool injected;
551 bool has_error_code; 552 bool has_error_code;
552 bool reinject;
553 u8 nr; 553 u8 nr;
554 u32 error_code; 554 u32 error_code;
555 u8 nested_apf; 555 u8 nested_apf;
@@ -687,8 +687,12 @@ struct kvm_vcpu_arch {
687 int pending_ioapic_eoi; 687 int pending_ioapic_eoi;
688 int pending_external_vector; 688 int pending_external_vector;
689 689
690 /* GPA available (AMD only) */ 690 /* GPA available */
691 bool gpa_available; 691 bool gpa_available;
692 gpa_t gpa_val;
693
694 /* be preempted when it's in kernel-mode(cpl=0) */
695 bool preempted_in_kernel;
692}; 696};
693 697
694struct kvm_lpage_info { 698struct kvm_lpage_info {
@@ -979,7 +983,7 @@ struct kvm_x86_ops {
979 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); 983 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
980 int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); 984 int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
981 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 985 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
982 int (*get_tdp_level)(void); 986 int (*get_tdp_level)(struct kvm_vcpu *vcpu);
983 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 987 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
984 int (*get_lpage_level)(void); 988 int (*get_lpage_level)(void);
985 bool (*rdtscp_supported)(void); 989 bool (*rdtscp_supported)(void);
@@ -1297,20 +1301,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
1297 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 1301 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
1298} 1302}
1299 1303
1300static inline u64 get_canonical(u64 la)
1301{
1302 return ((int64_t)la << 16) >> 16;
1303}
1304
1305static inline bool is_noncanonical_address(u64 la)
1306{
1307#ifdef CONFIG_X86_64
1308 return get_canonical(la) != la;
1309#else
1310 return false;
1311#endif
1312}
1313
1314#define TSS_IOPB_BASE_OFFSET 0x66 1304#define TSS_IOPB_BASE_OFFSET 0x66
1315#define TSS_BASE_SIZE 0x68 1305#define TSS_BASE_SIZE 0x68
1316#define TSS_IOPB_SIZE (65536 / 8) 1306#define TSS_IOPB_SIZE (65536 / 8)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 58fffe79e417..14835dd205a5 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -107,6 +107,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
107#define V_IRQ_SHIFT 8 107#define V_IRQ_SHIFT 8
108#define V_IRQ_MASK (1 << V_IRQ_SHIFT) 108#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
109 109
110#define V_GIF_SHIFT 9
111#define V_GIF_MASK (1 << V_GIF_SHIFT)
112
110#define V_INTR_PRIO_SHIFT 16 113#define V_INTR_PRIO_SHIFT 16
111#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) 114#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
112 115
@@ -116,6 +119,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
116#define V_INTR_MASKING_SHIFT 24 119#define V_INTR_MASKING_SHIFT 24
117#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) 120#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
118 121
122#define V_GIF_ENABLE_SHIFT 25
123#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT)
124
119#define AVIC_ENABLE_SHIFT 31 125#define AVIC_ENABLE_SHIFT 31
120#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) 126#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
121 127
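
These two pairs of defines wire up AMD Virtual GIF: V_GIF_ENABLE_MASK turns the feature on in the VMCB interrupt-control word and V_GIF_MASK is the virtualized GIF bit itself. A minimal sketch of how the bits might be driven through the existing int_ctl field (the helper names are illustrative, not part of this commit):

        /* Sketch: ctrl points at the guest's vmcb_control_area. */
        static void sketch_enable_vgif(struct vmcb_control_area *ctrl)
        {
                ctrl->int_ctl |= V_GIF_ENABLE_MASK;     /* hardware virtualizes GIF */
                ctrl->int_ctl |= V_GIF_MASK;            /* start with virtual GIF set */
        }

        static bool sketch_vgif_set(const struct vmcb_control_area *ctrl)
        {
                return !!(ctrl->int_ctl & V_GIF_MASK);
        }
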
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 35cd06f636ab..caec8417539f 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,6 +72,7 @@
72#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 72#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
73#define SECONDARY_EXEC_RDRAND 0x00000800 73#define SECONDARY_EXEC_RDRAND 0x00000800
74#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 74#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
75#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
75#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 76#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
76#define SECONDARY_EXEC_RDSEED 0x00010000 77#define SECONDARY_EXEC_RDSEED 0x00010000
77#define SECONDARY_EXEC_ENABLE_PML 0x00020000 78#define SECONDARY_EXEC_ENABLE_PML 0x00020000
@@ -114,6 +115,10 @@
114#define VMX_MISC_SAVE_EFER_LMA 0x00000020 115#define VMX_MISC_SAVE_EFER_LMA 0x00000020
115#define VMX_MISC_ACTIVITY_HLT 0x00000040 116#define VMX_MISC_ACTIVITY_HLT 0x00000040
116 117
118/* VMFUNC functions */
119#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001
120#define VMFUNC_EPTP_ENTRIES 512
121
117static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) 122static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
118{ 123{
119 return vmx_basic & GENMASK_ULL(30, 0); 124 return vmx_basic & GENMASK_ULL(30, 0);
@@ -187,6 +192,8 @@ enum vmcs_field {
187 APIC_ACCESS_ADDR_HIGH = 0x00002015, 192 APIC_ACCESS_ADDR_HIGH = 0x00002015,
188 POSTED_INTR_DESC_ADDR = 0x00002016, 193 POSTED_INTR_DESC_ADDR = 0x00002016,
189 POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, 194 POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
195 VM_FUNCTION_CONTROL = 0x00002018,
196 VM_FUNCTION_CONTROL_HIGH = 0x00002019,
190 EPT_POINTER = 0x0000201a, 197 EPT_POINTER = 0x0000201a,
191 EPT_POINTER_HIGH = 0x0000201b, 198 EPT_POINTER_HIGH = 0x0000201b,
192 EOI_EXIT_BITMAP0 = 0x0000201c, 199 EOI_EXIT_BITMAP0 = 0x0000201c,
@@ -197,6 +204,8 @@ enum vmcs_field {
197 EOI_EXIT_BITMAP2_HIGH = 0x00002021, 204 EOI_EXIT_BITMAP2_HIGH = 0x00002021,
198 EOI_EXIT_BITMAP3 = 0x00002022, 205 EOI_EXIT_BITMAP3 = 0x00002022,
199 EOI_EXIT_BITMAP3_HIGH = 0x00002023, 206 EOI_EXIT_BITMAP3_HIGH = 0x00002023,
207 EPTP_LIST_ADDRESS = 0x00002024,
208 EPTP_LIST_ADDRESS_HIGH = 0x00002025,
200 VMREAD_BITMAP = 0x00002026, 209 VMREAD_BITMAP = 0x00002026,
201 VMWRITE_BITMAP = 0x00002028, 210 VMWRITE_BITMAP = 0x00002028,
202 XSS_EXIT_BITMAP = 0x0000202C, 211 XSS_EXIT_BITMAP = 0x0000202C,
@@ -444,6 +453,7 @@ enum vmcs_field {
444 453
445#define VMX_EPT_EXECUTE_ONLY_BIT (1ull) 454#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
446#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) 455#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
456#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7)
447#define VMX_EPTP_UC_BIT (1ull << 8) 457#define VMX_EPTP_UC_BIT (1ull << 8)
448#define VMX_EPTP_WB_BIT (1ull << 14) 458#define VMX_EPTP_WB_BIT (1ull << 14)
449#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 459#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
@@ -459,12 +469,14 @@ enum vmcs_field {
459#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ 469#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
460#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ 470#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */
461 471
462#define VMX_EPT_DEFAULT_GAW 3
463#define VMX_EPT_MAX_GAW 0x4
464#define VMX_EPT_MT_EPTE_SHIFT 3 472#define VMX_EPT_MT_EPTE_SHIFT 3
465#define VMX_EPT_GAW_EPTP_SHIFT 3 473#define VMX_EPTP_PWL_MASK 0x38ull
466#define VMX_EPT_AD_ENABLE_BIT (1ull << 6) 474#define VMX_EPTP_PWL_4 0x18ull
467#define VMX_EPT_DEFAULT_MT 0x6ull 475#define VMX_EPTP_PWL_5 0x20ull
476#define VMX_EPTP_AD_ENABLE_BIT (1ull << 6)
477#define VMX_EPTP_MT_MASK 0x7ull
478#define VMX_EPTP_MT_WB 0x6ull
479#define VMX_EPTP_MT_UC 0x0ull
468#define VMX_EPT_READABLE_MASK 0x1ull 480#define VMX_EPT_READABLE_MASK 0x1ull
469#define VMX_EPT_WRITABLE_MASK 0x2ull 481#define VMX_EPT_WRITABLE_MASK 0x2ull
470#define VMX_EPT_EXECUTABLE_MASK 0x4ull 482#define VMX_EPT_EXECUTABLE_MASK 0x4ull
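
The EPTP constants are reworked above: the old guest-address-width (GAW) encoding is dropped in favour of explicit page-walk-length values (VMX_EPTP_PWL_4, VMX_EPTP_PWL_5) plus separate memory-type and accessed/dirty-enable bits. A hedged sketch of how an EPT pointer could be assembled from these pieces (root_hpa, five_level and ad_bits are assumed inputs for illustration):

        /* Sketch: build an EPTP value from a page-aligned EPT root. */
        static u64 sketch_construct_eptp(u64 root_hpa, bool five_level, bool ad_bits)
        {
                u64 eptp = VMX_EPTP_MT_WB;              /* write-back memory type */

                eptp |= five_level ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
                if (ad_bits)
                        eptp |= VMX_EPTP_AD_ENABLE_BIT;
                return eptp | (root_hpa & PAGE_MASK);
        }
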
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 19adbb418443..0099e10eb045 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -126,16 +126,20 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
126 best->ebx = xstate_required_size(vcpu->arch.xcr0, true); 126 best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
127 127
128 /* 128 /*
129 * The existing code assumes virtual address is 48-bit in the canonical 129 * The existing code assumes virtual address is 48-bit or 57-bit in the
130 * address checks; exit if it is ever changed. 130 * canonical address checks; exit if it is ever changed.
131 */ 131 */
132 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 132 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
133 if (best && ((best->eax & 0xff00) >> 8) != 48 && 133 if (best) {
134 ((best->eax & 0xff00) >> 8) != 0) 134 int vaddr_bits = (best->eax & 0xff00) >> 8;
135 return -EINVAL; 135
136 if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
137 return -EINVAL;
138 }
136 139
137 /* Update physical-address width */ 140 /* Update physical-address width */
138 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 141 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
142 kvm_mmu_reset_context(vcpu);
139 143
140 kvm_pmu_refresh(vcpu); 144 kvm_pmu_refresh(vcpu);
141 return 0; 145 return 0;
@@ -383,7 +387,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
383 387
384 /* cpuid 7.0.ecx*/ 388 /* cpuid 7.0.ecx*/
385 const u32 kvm_cpuid_7_0_ecx_x86_features = 389 const u32 kvm_cpuid_7_0_ecx_x86_features =
386 F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); 390 F(AVX512VBMI) | F(LA57) | F(PKU) |
391 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
387 392
388 /* cpuid 7.0.edx*/ 393 /* cpuid 7.0.edx*/
389 const u32 kvm_cpuid_7_0_edx_x86_features = 394 const u32 kvm_cpuid_7_0_edx_x86_features =
@@ -853,16 +858,24 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
853 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); 858 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
854} 859}
855 860
856void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 861bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
862 u32 *ecx, u32 *edx, bool check_limit)
857{ 863{
858 u32 function = *eax, index = *ecx; 864 u32 function = *eax, index = *ecx;
859 struct kvm_cpuid_entry2 *best; 865 struct kvm_cpuid_entry2 *best;
866 bool entry_found = true;
860 867
861 best = kvm_find_cpuid_entry(vcpu, function, index); 868 best = kvm_find_cpuid_entry(vcpu, function, index);
862 869
863 if (!best) 870 if (!best) {
871 entry_found = false;
872 if (!check_limit)
873 goto out;
874
864 best = check_cpuid_limit(vcpu, function, index); 875 best = check_cpuid_limit(vcpu, function, index);
876 }
865 877
878out:
866 if (best) { 879 if (best) {
867 *eax = best->eax; 880 *eax = best->eax;
868 *ebx = best->ebx; 881 *ebx = best->ebx;
@@ -870,7 +883,8 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
870 *edx = best->edx; 883 *edx = best->edx;
871 } else 884 } else
872 *eax = *ebx = *ecx = *edx = 0; 885 *eax = *ebx = *ecx = *edx = 0;
873 trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx); 886 trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
887 return entry_found;
874} 888}
875EXPORT_SYMBOL_GPL(kvm_cpuid); 889EXPORT_SYMBOL_GPL(kvm_cpuid);
876 890
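
kvm_cpuid() now reports whether the requested leaf exists in the guest's CPUID table and only applies the out-of-range-leaf redirection when check_limit is true. A caller that wants to probe one specific leaf can pass check_limit as false and act on the return value; a hedged sketch (the 36-bit fallback mirrors the emulator change further down, but the helper itself is illustrative):

        /* Sketch: read the guest's physical address width from leaf 0x80000008. */
        static u32 sketch_guest_phys_bits(struct kvm_vcpu *vcpu)
        {
                u32 eax = 0x80000008, ebx, ecx = 0, edx;

                if (!kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false))
                        return 36;              /* leaf not exposed to the guest */
                return eax & 0xff;              /* bits 7:0 hold MAXPHYADDR */
        }
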
@@ -883,7 +897,7 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
883 897
884 eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 898 eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
885 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 899 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
886 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); 900 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true);
887 kvm_register_write(vcpu, VCPU_REGS_RAX, eax); 901 kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
888 kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); 902 kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
889 kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); 903 kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index da6728383052..1ea3c0e1e3a9 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -3,6 +3,7 @@
3 3
4#include "x86.h" 4#include "x86.h"
5#include <asm/cpu.h> 5#include <asm/cpu.h>
6#include <asm/processor.h>
6 7
7int kvm_update_cpuid(struct kvm_vcpu *vcpu); 8int kvm_update_cpuid(struct kvm_vcpu *vcpu);
8bool kvm_mpx_supported(void); 9bool kvm_mpx_supported(void);
@@ -20,7 +21,8 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
20int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 21int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
21 struct kvm_cpuid2 *cpuid, 22 struct kvm_cpuid2 *cpuid,
22 struct kvm_cpuid_entry2 __user *entries); 23 struct kvm_cpuid_entry2 __user *entries);
23void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 24bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
25 u32 *ecx, u32 *edx, bool check_limit);
24 26
25int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); 27int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
26 28
@@ -29,95 +31,87 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
29 return vcpu->arch.maxphyaddr; 31 return vcpu->arch.maxphyaddr;
30} 32}
31 33
32static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) 34struct cpuid_reg {
33{ 35 u32 function;
34 struct kvm_cpuid_entry2 *best; 36 u32 index;
35 37 int reg;
36 if (!static_cpu_has(X86_FEATURE_XSAVE)) 38};
37 return false;
38
39 best = kvm_find_cpuid_entry(vcpu, 1, 0);
40 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
41}
42
43static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu)
44{
45 struct kvm_cpuid_entry2 *best;
46
47 best = kvm_find_cpuid_entry(vcpu, 1, 0);
48 return best && (best->edx & bit(X86_FEATURE_MTRR));
49}
50
51static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
52{
53 struct kvm_cpuid_entry2 *best;
54
55 best = kvm_find_cpuid_entry(vcpu, 7, 0);
56 return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
57}
58 39
59static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) 40static const struct cpuid_reg reverse_cpuid[] = {
60{ 41 [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
61 struct kvm_cpuid_entry2 *best; 42 [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
62 43 [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
63 best = kvm_find_cpuid_entry(vcpu, 7, 0); 44 [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
64 return best && (best->ebx & bit(X86_FEATURE_SMEP)); 45 [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
65} 46 [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
47 [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
48 [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
49 [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX},
50 [CPUID_F_1_EDX] = { 0xf, 1, CPUID_EDX},
51 [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
52 [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
53 [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
54 [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
55 [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
56};
66 57
67static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) 58static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
68{ 59{
69 struct kvm_cpuid_entry2 *best; 60 unsigned x86_leaf = x86_feature / 32;
70
71 best = kvm_find_cpuid_entry(vcpu, 7, 0);
72 return best && (best->ebx & bit(X86_FEATURE_SMAP));
73}
74 61
75static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) 62 BUILD_BUG_ON(!__builtin_constant_p(x86_leaf));
76{ 63 BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
77 struct kvm_cpuid_entry2 *best; 64 BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
78 65
79 best = kvm_find_cpuid_entry(vcpu, 7, 0); 66 return reverse_cpuid[x86_leaf];
80 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
81} 67}
82 68
83static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) 69static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature)
84{ 70{
85 struct kvm_cpuid_entry2 *best; 71 struct kvm_cpuid_entry2 *entry;
86 72 const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
87 best = kvm_find_cpuid_entry(vcpu, 7, 0);
88 return best && (best->ecx & bit(X86_FEATURE_PKU));
89}
90 73
91static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) 74 entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index);
92{ 75 if (!entry)
93 struct kvm_cpuid_entry2 *best; 76 return NULL;
94 77
95 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 78 switch (cpuid.reg) {
96 return best && (best->edx & bit(X86_FEATURE_LM)); 79 case CPUID_EAX:
80 return &entry->eax;
81 case CPUID_EBX:
82 return &entry->ebx;
83 case CPUID_ECX:
84 return &entry->ecx;
85 case CPUID_EDX:
86 return &entry->edx;
87 default:
88 BUILD_BUG();
89 return NULL;
90 }
97} 91}
98 92
99static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) 93static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature)
100{ 94{
101 struct kvm_cpuid_entry2 *best; 95 int *reg;
102 96
103 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 97 if (x86_feature == X86_FEATURE_XSAVE &&
104 return best && (best->ecx & bit(X86_FEATURE_OSVW)); 98 !static_cpu_has(X86_FEATURE_XSAVE))
105} 99 return false;
106 100
107static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) 101 reg = guest_cpuid_get_register(vcpu, x86_feature);
108{ 102 if (!reg)
109 struct kvm_cpuid_entry2 *best; 103 return false;
110 104
111 best = kvm_find_cpuid_entry(vcpu, 1, 0); 105 return *reg & bit(x86_feature);
112 return best && (best->ecx & bit(X86_FEATURE_PCID));
113} 106}
114 107
115static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) 108static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature)
116{ 109{
117 struct kvm_cpuid_entry2 *best; 110 int *reg;
118 111
119 best = kvm_find_cpuid_entry(vcpu, 1, 0); 112 reg = guest_cpuid_get_register(vcpu, x86_feature);
120 return best && (best->ecx & bit(X86_FEATURE_X2APIC)); 113 if (reg)
114 *reg &= ~bit(x86_feature);
121} 115}
122 116
123static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) 117static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
@@ -128,58 +122,6 @@ static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
128 return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx; 122 return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx;
129} 123}
130 124
131static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
132{
133 struct kvm_cpuid_entry2 *best;
134
135 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
136 return best && (best->edx & bit(X86_FEATURE_GBPAGES));
137}
138
139static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
140{
141 struct kvm_cpuid_entry2 *best;
142
143 best = kvm_find_cpuid_entry(vcpu, 7, 0);
144 return best && (best->ebx & bit(X86_FEATURE_RTM));
145}
146
147static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
148{
149 struct kvm_cpuid_entry2 *best;
150
151 best = kvm_find_cpuid_entry(vcpu, 7, 0);
152 return best && (best->ebx & bit(X86_FEATURE_MPX));
153}
154
155static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
156{
157 struct kvm_cpuid_entry2 *best;
158
159 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
160 return best && (best->edx & bit(X86_FEATURE_RDTSCP));
161}
162
163/*
164 * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
165 */
166#define BIT_NRIPS 3
167
168static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
169{
170 struct kvm_cpuid_entry2 *best;
171
172 best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0);
173
174 /*
175 * NRIPS is a scattered cpuid feature, so we can't use
176 * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit
177 * position 8, not 3).
178 */
179 return best && (best->edx & bit(BIT_NRIPS));
180}
181#undef BIT_NRIPS
182
183static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) 125static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
184{ 126{
185 struct kvm_cpuid_entry2 *best; 127 struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fb0055953fbc..16bf6655aa85 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -28,6 +28,7 @@
28 28
29#include "x86.h" 29#include "x86.h"
30#include "tss.h" 30#include "tss.h"
31#include "mmu.h"
31 32
32/* 33/*
33 * Operand types 34 * Operand types
@@ -688,16 +689,18 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
688 ulong la; 689 ulong la;
689 u32 lim; 690 u32 lim;
690 u16 sel; 691 u16 sel;
692 u8 va_bits;
691 693
692 la = seg_base(ctxt, addr.seg) + addr.ea; 694 la = seg_base(ctxt, addr.seg) + addr.ea;
693 *max_size = 0; 695 *max_size = 0;
694 switch (mode) { 696 switch (mode) {
695 case X86EMUL_MODE_PROT64: 697 case X86EMUL_MODE_PROT64:
696 *linear = la; 698 *linear = la;
697 if (is_noncanonical_address(la)) 699 va_bits = ctxt_virt_addr_bits(ctxt);
700 if (get_canonical(la, va_bits) != la)
698 goto bad; 701 goto bad;
699 702
700 *max_size = min_t(u64, ~0u, (1ull << 48) - la); 703 *max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
701 if (size > *max_size) 704 if (size > *max_size)
702 goto bad; 705 goto bad;
703 break; 706 break;
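
With LA57 a linear address is canonical when bits 63:57, rather than 63:48, replicate the top implemented bit, so the emulator can no longer hard-code 48 bits and instead asks the vCPU for its virtual address width. A sketch of the sign-extension trick such a width-aware check relies on (shown for illustration; the helper the emulator actually calls is defined outside the hunks shown here):

        /* Sketch: sign-extend from bit (va_bits - 1) and compare. */
        static inline u64 sketch_get_canonical(u64 la, u8 va_bits)
        {
                return ((int64_t)la << (64 - va_bits)) >> (64 - va_bits);
        }

        static inline bool sketch_is_noncanonical(u64 la, u8 va_bits)
        {
                return sketch_get_canonical(la, va_bits) != la;
        }
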
@@ -1748,8 +1751,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1748 sizeof(base3), &ctxt->exception); 1751 sizeof(base3), &ctxt->exception);
1749 if (ret != X86EMUL_CONTINUE) 1752 if (ret != X86EMUL_CONTINUE)
1750 return ret; 1753 return ret;
1751 if (is_noncanonical_address(get_desc_base(&seg_desc) | 1754 if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
1752 ((u64)base3 << 32))) 1755 ((u64)base3 << 32), ctxt))
1753 return emulate_gp(ctxt, 0); 1756 return emulate_gp(ctxt, 0);
1754 } 1757 }
1755load: 1758load:
@@ -2333,7 +2336,7 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
2333 2336
2334 eax = 0x80000001; 2337 eax = 0x80000001;
2335 ecx = 0; 2338 ecx = 0;
2336 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 2339 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
2337 return edx & bit(X86_FEATURE_LM); 2340 return edx & bit(X86_FEATURE_LM);
2338} 2341}
2339 2342
@@ -2636,7 +2639,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
2636 u32 eax, ebx, ecx, edx; 2639 u32 eax, ebx, ecx, edx;
2637 2640
2638 eax = ecx = 0; 2641 eax = ecx = 0;
2639 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 2642 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
2640 return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 2643 return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
2641 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 2644 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
2642 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; 2645 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
@@ -2656,7 +2659,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
2656 2659
2657 eax = 0x00000000; 2660 eax = 0x00000000;
2658 ecx = 0x00000000; 2661 ecx = 0x00000000;
2659 ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 2662 ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
2660 /* 2663 /*
2661 * Intel ("GenuineIntel") 2664 * Intel ("GenuineIntel")
2662 * remark: Intel CPUs only support "syscall" in 64bit 2665 * remark: Intel CPUs only support "syscall" in 64bit
@@ -2840,8 +2843,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
2840 ss_sel = cs_sel + 8; 2843 ss_sel = cs_sel + 8;
2841 cs.d = 0; 2844 cs.d = 0;
2842 cs.l = 1; 2845 cs.l = 1;
2843 if (is_noncanonical_address(rcx) || 2846 if (emul_is_noncanonical_address(rcx, ctxt) ||
2844 is_noncanonical_address(rdx)) 2847 emul_is_noncanonical_address(rdx, ctxt))
2845 return emulate_gp(ctxt, 0); 2848 return emulate_gp(ctxt, 0);
2846 break; 2849 break;
2847 } 2850 }
@@ -3551,7 +3554,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
3551 /* 3554 /*
3552 * Check MOVBE is set in the guest-visible CPUID leaf. 3555 * Check MOVBE is set in the guest-visible CPUID leaf.
3553 */ 3556 */
3554 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 3557 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
3555 if (!(ecx & FFL(MOVBE))) 3558 if (!(ecx & FFL(MOVBE)))
3556 return emulate_ud(ctxt); 3559 return emulate_ud(ctxt);
3557 3560
@@ -3756,7 +3759,7 @@ static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
3756 if (rc != X86EMUL_CONTINUE) 3759 if (rc != X86EMUL_CONTINUE)
3757 return rc; 3760 return rc;
3758 if (ctxt->mode == X86EMUL_MODE_PROT64 && 3761 if (ctxt->mode == X86EMUL_MODE_PROT64 &&
3759 is_noncanonical_address(desc_ptr.address)) 3762 emul_is_noncanonical_address(desc_ptr.address, ctxt))
3760 return emulate_gp(ctxt, 0); 3763 return emulate_gp(ctxt, 0);
3761 if (lgdt) 3764 if (lgdt)
3762 ctxt->ops->set_gdt(ctxt, &desc_ptr); 3765 ctxt->ops->set_gdt(ctxt, &desc_ptr);
@@ -3865,7 +3868,7 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3865 3868
3866 eax = reg_read(ctxt, VCPU_REGS_RAX); 3869 eax = reg_read(ctxt, VCPU_REGS_RAX);
3867 ecx = reg_read(ctxt, VCPU_REGS_RCX); 3870 ecx = reg_read(ctxt, VCPU_REGS_RCX);
3868 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 3871 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
3869 *reg_write(ctxt, VCPU_REGS_RAX) = eax; 3872 *reg_write(ctxt, VCPU_REGS_RAX) = eax;
3870 *reg_write(ctxt, VCPU_REGS_RBX) = ebx; 3873 *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
3871 *reg_write(ctxt, VCPU_REGS_RCX) = ecx; 3874 *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
@@ -3924,7 +3927,7 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
3924{ 3927{
3925 u32 eax = 1, ebx, ecx = 0, edx; 3928 u32 eax = 1, ebx, ecx = 0, edx;
3926 3929
3927 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 3930 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
3928 if (!(edx & FFL(FXSR))) 3931 if (!(edx & FFL(FXSR)))
3929 return emulate_ud(ctxt); 3932 return emulate_ud(ctxt);
3930 3933
@@ -4097,8 +4100,17 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
4097 u64 rsvd = 0; 4100 u64 rsvd = 0;
4098 4101
4099 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); 4102 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
4100 if (efer & EFER_LMA) 4103 if (efer & EFER_LMA) {
4101 rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD; 4104 u64 maxphyaddr;
4105 u32 eax = 0x80000008;
4106
4107 if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL,
4108 NULL, false))
4109 maxphyaddr = eax & 0xff;
4110 else
4111 maxphyaddr = 36;
4112 rsvd = rsvd_bits(maxphyaddr, 62);
4113 }
4102 4114
4103 if (new_val & rsvd) 4115 if (new_val & rsvd)
4104 return emulate_gp(ctxt, 0); 4116 return emulate_gp(ctxt, 0);
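
Instead of a fixed CR3_L_MODE_RESERVED_BITS mask, the emulator now derives the reserved CR3 bits from the guest's reported physical address width, falling back to 36 bits when leaf 0x80000008 is not exposed. A worked example of the resulting masks (rsvd_bits() itself appears in the mmu.h hunk further down, including a new e < s guard):

        /* MAXPHYADDR = 36: CR3 bits 62:36 are reserved. */
        u64 rsvd36 = rsvd_bits(36, 62);         /* 0x7ffffff000000000 */

        /* MAXPHYADDR = 52: only CR3 bits 62:52 remain reserved. */
        u64 rsvd52 = rsvd_bits(52, 62);         /* 0x7ff0000000000000 */
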
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 337b6d2730fa..dc97f2544b6f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1160,6 +1160,12 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1160 return stimer_get_count(vcpu_to_stimer(vcpu, timer_index), 1160 return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
1161 pdata); 1161 pdata);
1162 } 1162 }
1163 case HV_X64_MSR_TSC_FREQUENCY:
1164 data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
1165 break;
1166 case HV_X64_MSR_APIC_FREQUENCY:
1167 data = APIC_BUS_FREQUENCY;
1168 break;
1163 default: 1169 default:
1164 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1170 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1165 return 1; 1171 return 1;
@@ -1268,7 +1274,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
1268 1274
1269 switch (code) { 1275 switch (code) {
1270 case HVCALL_NOTIFY_LONG_SPIN_WAIT: 1276 case HVCALL_NOTIFY_LONG_SPIN_WAIT:
1271 kvm_vcpu_on_spin(vcpu); 1277 kvm_vcpu_on_spin(vcpu, true);
1272 break; 1278 break;
1273 case HVCALL_POST_MESSAGE: 1279 case HVCALL_POST_MESSAGE:
1274 case HVCALL_SIGNAL_EVENT: 1280 case HVCALL_SIGNAL_EVENT:
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index e1e89ee4af75..9add410f195f 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -4,7 +4,7 @@
4#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS 4#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
5#define KVM_POSSIBLE_CR4_GUEST_BITS \ 5#define KVM_POSSIBLE_CR4_GUEST_BITS \
6 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ 6 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
7 | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) 7 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE)
8 8
9static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, 9static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
10 enum kvm_reg reg) 10 enum kvm_reg reg)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 589dcc117086..aaf10b6f5380 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -54,8 +54,6 @@
54#define PRIu64 "u" 54#define PRIu64 "u"
55#define PRIo64 "o" 55#define PRIo64 "o"
56 56
57#define APIC_BUS_CYCLE_NS 1
58
59/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ 57/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
60#define apic_debug(fmt, arg...) 58#define apic_debug(fmt, arg...)
61 59
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 29caa2c3dff9..215721e1426a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,9 @@
12#define KVM_APIC_SHORT_MASK 0xc0000 12#define KVM_APIC_SHORT_MASK 0xc0000
13#define KVM_APIC_DEST_MASK 0x800 13#define KVM_APIC_DEST_MASK 0x800
14 14
15#define APIC_BUS_CYCLE_NS 1
16#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
17
15struct kvm_timer { 18struct kvm_timer {
16 struct hrtimer timer; 19 struct hrtimer timer;
17 s64 period; /* unit: ns */ 20 s64 period; /* unit: ns */
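
APIC_BUS_CYCLE_NS moves from lapic.c into lapic.h so that the derived APIC_BUS_FREQUENCY constant can be shared with the Hyper-V code above: with a 1 ns bus cycle the emulated APIC frequency is 1 GHz, and HV_X64_MSR_TSC_FREQUENCY simply converts the per-vCPU TSC rate from kHz to Hz. A small numeric illustration (the 2600000 kHz figure is an arbitrary example):

        u64 apic_hz = APIC_BUS_FREQUENCY;       /* 1000000000 / 1 = 1 GHz */
        u64 tsc_hz  = (u64)2600000 * 1000;      /* 2.6 GHz guest TSC reported via the MSR */
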
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 04d750813c9d..eca30c1eb1d9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2169,8 +2169,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2169} 2169}
2170 2170
2171struct mmu_page_path { 2171struct mmu_page_path {
2172 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL]; 2172 struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2173 unsigned int idx[PT64_ROOT_LEVEL]; 2173 unsigned int idx[PT64_ROOT_MAX_LEVEL];
2174}; 2174};
2175 2175
2176#define for_each_sp(pvec, sp, parents, i) \ 2176#define for_each_sp(pvec, sp, parents, i) \
@@ -2385,8 +2385,8 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2385 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 2385 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
2386 iterator->level = vcpu->arch.mmu.shadow_root_level; 2386 iterator->level = vcpu->arch.mmu.shadow_root_level;
2387 2387
2388 if (iterator->level == PT64_ROOT_LEVEL && 2388 if (iterator->level == PT64_ROOT_4LEVEL &&
2389 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && 2389 vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
2390 !vcpu->arch.mmu.direct_map) 2390 !vcpu->arch.mmu.direct_map)
2391 --iterator->level; 2391 --iterator->level;
2392 2392
@@ -2610,9 +2610,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2610 2610
2611 sp = list_last_entry(&kvm->arch.active_mmu_pages, 2611 sp = list_last_entry(&kvm->arch.active_mmu_pages,
2612 struct kvm_mmu_page, link); 2612 struct kvm_mmu_page, link);
2613 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2613 return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2614
2615 return true;
2616} 2614}
2617 2615
2618/* 2616/*
@@ -3262,7 +3260,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
3262 3260
3263static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3261static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3264 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); 3262 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
3265static void make_mmu_pages_available(struct kvm_vcpu *vcpu); 3263static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3266 3264
3267static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 3265static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3268 gfn_t gfn, bool prefault) 3266 gfn_t gfn, bool prefault)
@@ -3302,7 +3300,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3302 spin_lock(&vcpu->kvm->mmu_lock); 3300 spin_lock(&vcpu->kvm->mmu_lock);
3303 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3301 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3304 goto out_unlock; 3302 goto out_unlock;
3305 make_mmu_pages_available(vcpu); 3303 if (make_mmu_pages_available(vcpu) < 0)
3304 goto out_unlock;
3306 if (likely(!force_pt_level)) 3305 if (likely(!force_pt_level))
3307 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3306 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
3308 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); 3307 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
@@ -3326,8 +3325,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
3326 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3325 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3327 return; 3326 return;
3328 3327
3329 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && 3328 if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
3330 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || 3329 (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL ||
3331 vcpu->arch.mmu.direct_map)) { 3330 vcpu->arch.mmu.direct_map)) {
3332 hpa_t root = vcpu->arch.mmu.root_hpa; 3331 hpa_t root = vcpu->arch.mmu.root_hpa;
3333 3332
@@ -3379,10 +3378,14 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3379 struct kvm_mmu_page *sp; 3378 struct kvm_mmu_page *sp;
3380 unsigned i; 3379 unsigned i;
3381 3380
3382 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 3381 if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
3383 spin_lock(&vcpu->kvm->mmu_lock); 3382 spin_lock(&vcpu->kvm->mmu_lock);
3384 make_mmu_pages_available(vcpu); 3383 if (make_mmu_pages_available(vcpu) < 0) {
3385 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL); 3384 spin_unlock(&vcpu->kvm->mmu_lock);
3385 return 1;
3386 }
3387 sp = kvm_mmu_get_page(vcpu, 0, 0,
3388 vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
3386 ++sp->root_count; 3389 ++sp->root_count;
3387 spin_unlock(&vcpu->kvm->mmu_lock); 3390 spin_unlock(&vcpu->kvm->mmu_lock);
3388 vcpu->arch.mmu.root_hpa = __pa(sp->spt); 3391 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3392,7 +3395,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3392 3395
3393 MMU_WARN_ON(VALID_PAGE(root)); 3396 MMU_WARN_ON(VALID_PAGE(root));
3394 spin_lock(&vcpu->kvm->mmu_lock); 3397 spin_lock(&vcpu->kvm->mmu_lock);
3395 make_mmu_pages_available(vcpu); 3398 if (make_mmu_pages_available(vcpu) < 0) {
3399 spin_unlock(&vcpu->kvm->mmu_lock);
3400 return 1;
3401 }
3396 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), 3402 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3397 i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); 3403 i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3398 root = __pa(sp->spt); 3404 root = __pa(sp->spt);
@@ -3423,15 +3429,18 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3423 * Do we shadow a long mode page table? If so we need to 3429 * Do we shadow a long mode page table? If so we need to
3424 * write-protect the guests page table root. 3430 * write-protect the guests page table root.
3425 */ 3431 */
3426 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3432 if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
3427 hpa_t root = vcpu->arch.mmu.root_hpa; 3433 hpa_t root = vcpu->arch.mmu.root_hpa;
3428 3434
3429 MMU_WARN_ON(VALID_PAGE(root)); 3435 MMU_WARN_ON(VALID_PAGE(root));
3430 3436
3431 spin_lock(&vcpu->kvm->mmu_lock); 3437 spin_lock(&vcpu->kvm->mmu_lock);
3432 make_mmu_pages_available(vcpu); 3438 if (make_mmu_pages_available(vcpu) < 0) {
3433 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 3439 spin_unlock(&vcpu->kvm->mmu_lock);
3434 0, ACC_ALL); 3440 return 1;
3441 }
3442 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3443 vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
3435 root = __pa(sp->spt); 3444 root = __pa(sp->spt);
3436 ++sp->root_count; 3445 ++sp->root_count;
3437 spin_unlock(&vcpu->kvm->mmu_lock); 3446 spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3445,7 +3454,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3445 * the shadow page table may be a PAE or a long mode page table. 3454 * the shadow page table may be a PAE or a long mode page table.
3446 */ 3455 */
3447 pm_mask = PT_PRESENT_MASK; 3456 pm_mask = PT_PRESENT_MASK;
3448 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) 3457 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
3449 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 3458 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3450 3459
3451 for (i = 0; i < 4; ++i) { 3460 for (i = 0; i < 4; ++i) {
@@ -3463,7 +3472,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3463 return 1; 3472 return 1;
3464 } 3473 }
3465 spin_lock(&vcpu->kvm->mmu_lock); 3474 spin_lock(&vcpu->kvm->mmu_lock);
3466 make_mmu_pages_available(vcpu); 3475 if (make_mmu_pages_available(vcpu) < 0) {
3476 spin_unlock(&vcpu->kvm->mmu_lock);
3477 return 1;
3478 }
3467 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 3479 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3468 0, ACC_ALL); 3480 0, ACC_ALL);
3469 root = __pa(sp->spt); 3481 root = __pa(sp->spt);
@@ -3478,7 +3490,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3478 * If we shadow a 32 bit page table with a long mode page 3490 * If we shadow a 32 bit page table with a long mode page
3479 * table we enter this path. 3491 * table we enter this path.
3480 */ 3492 */
3481 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 3493 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
3482 if (vcpu->arch.mmu.lm_root == NULL) { 3494 if (vcpu->arch.mmu.lm_root == NULL) {
3483 /* 3495 /*
3484 * The additional page necessary for this is only 3496 * The additional page necessary for this is only
@@ -3523,7 +3535,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
3523 3535
3524 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 3536 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3525 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3537 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3526 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3538 if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
3527 hpa_t root = vcpu->arch.mmu.root_hpa; 3539 hpa_t root = vcpu->arch.mmu.root_hpa;
3528 sp = page_header(root); 3540 sp = page_header(root);
3529 mmu_sync_children(vcpu, sp); 3541 mmu_sync_children(vcpu, sp);
@@ -3588,6 +3600,13 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
3588 3600
3589static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3601static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3590{ 3602{
3603 /*
3604 * A nested guest cannot use the MMIO cache if it is using nested
3605 * page tables, because cr2 is a nGPA while the cache stores GPAs.
3606 */
3607 if (mmu_is_nested(vcpu))
3608 return false;
3609
3591 if (direct) 3610 if (direct)
3592 return vcpu_match_mmio_gpa(vcpu, addr); 3611 return vcpu_match_mmio_gpa(vcpu, addr);
3593 3612
@@ -3599,7 +3618,7 @@ static bool
3599walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) 3618walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3600{ 3619{
3601 struct kvm_shadow_walk_iterator iterator; 3620 struct kvm_shadow_walk_iterator iterator;
3602 u64 sptes[PT64_ROOT_LEVEL], spte = 0ull; 3621 u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
3603 int root, leaf; 3622 int root, leaf;
3604 bool reserved = false; 3623 bool reserved = false;
3605 3624
@@ -3640,7 +3659,23 @@ exit:
3640 return reserved; 3659 return reserved;
3641} 3660}
3642 3661
3643int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3662/*
3663 * Return values of handle_mmio_page_fault:
3664 * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
3665 * directly.
3666 * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
3667 * fault path update the mmio spte.
3668 * RET_MMIO_PF_RETRY: let CPU fault again on the address.
3669 * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
3670 */
3671enum {
3672 RET_MMIO_PF_EMULATE = 1,
3673 RET_MMIO_PF_INVALID = 2,
3674 RET_MMIO_PF_RETRY = 0,
3675 RET_MMIO_PF_BUG = -1
3676};
3677
3678static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3644{ 3679{
3645 u64 spte; 3680 u64 spte;
3646 bool reserved; 3681 bool reserved;
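
handle_mmio_page_fault() becomes static and its RET_MMIO_PF_* values are consumed only inside mmu.c (the public copy of the enum is removed from mmu.h below). A condensed sketch of how the caller in kvm_mmu_page_fault() dispatches on these values, based on the hunk later in this file:

        /* Sketch: only reached when the fault has PFERR_RSVD_MASK set. */
        r = handle_mmio_page_fault(vcpu, cr2, direct);
        if (r == RET_MMIO_PF_EMULATE)
                emulation_type = 0;     /* genuine MMIO: emulate the instruction */
        else if (r == RET_MMIO_PF_RETRY)
                return 1;               /* let the CPU take the fault again */
        else if (r < 0)
                return r;               /* RET_MMIO_PF_BUG */
        /* otherwise RET_MMIO_PF_INVALID: fall through to the regular fault path */
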
@@ -3872,7 +3907,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3872 spin_lock(&vcpu->kvm->mmu_lock); 3907 spin_lock(&vcpu->kvm->mmu_lock);
3873 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3908 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3874 goto out_unlock; 3909 goto out_unlock;
3875 make_mmu_pages_available(vcpu); 3910 if (make_mmu_pages_available(vcpu) < 0)
3911 goto out_unlock;
3876 if (likely(!force_pt_level)) 3912 if (likely(!force_pt_level))
3877 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3913 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
3878 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); 3914 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
@@ -4025,7 +4061,13 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4025 rsvd_check->rsvd_bits_mask[1][0] = 4061 rsvd_check->rsvd_bits_mask[1][0] =
4026 rsvd_check->rsvd_bits_mask[0][0]; 4062 rsvd_check->rsvd_bits_mask[0][0];
4027 break; 4063 break;
4028 case PT64_ROOT_LEVEL: 4064 case PT64_ROOT_5LEVEL:
4065 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4066 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4067 rsvd_bits(maxphyaddr, 51);
4068 rsvd_check->rsvd_bits_mask[1][4] =
4069 rsvd_check->rsvd_bits_mask[0][4];
4070 case PT64_ROOT_4LEVEL:
4029 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | 4071 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4030 nonleaf_bit8_rsvd | rsvd_bits(7, 7) | 4072 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4031 rsvd_bits(maxphyaddr, 51); 4073 rsvd_bits(maxphyaddr, 51);
@@ -4055,7 +4097,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4055{ 4097{
4056 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, 4098 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4057 cpuid_maxphyaddr(vcpu), context->root_level, 4099 cpuid_maxphyaddr(vcpu), context->root_level,
4058 context->nx, guest_cpuid_has_gbpages(vcpu), 4100 context->nx,
4101 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4059 is_pse(vcpu), guest_cpuid_is_amd(vcpu)); 4102 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4060} 4103}
4061 4104
@@ -4065,6 +4108,8 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4065{ 4108{
4066 u64 bad_mt_xwr; 4109 u64 bad_mt_xwr;
4067 4110
4111 rsvd_check->rsvd_bits_mask[0][4] =
4112 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4068 rsvd_check->rsvd_bits_mask[0][3] = 4113 rsvd_check->rsvd_bits_mask[0][3] =
4069 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); 4114 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4070 rsvd_check->rsvd_bits_mask[0][2] = 4115 rsvd_check->rsvd_bits_mask[0][2] =
@@ -4074,6 +4119,7 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4074 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); 4119 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4075 4120
4076 /* large page */ 4121 /* large page */
4122 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4077 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; 4123 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4078 rsvd_check->rsvd_bits_mask[1][2] = 4124 rsvd_check->rsvd_bits_mask[1][2] =
4079 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); 4125 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
@@ -4120,8 +4166,8 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4120 __reset_rsvds_bits_mask(vcpu, shadow_zero_check, 4166 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4121 boot_cpu_data.x86_phys_bits, 4167 boot_cpu_data.x86_phys_bits,
4122 context->shadow_root_level, uses_nx, 4168 context->shadow_root_level, uses_nx,
4123 guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), 4169 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4124 true); 4170 is_pse(vcpu), true);
4125 4171
4126 if (!shadow_me_mask) 4172 if (!shadow_me_mask)
4127 return; 4173 return;
@@ -4185,66 +4231,85 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4185 boot_cpu_data.x86_phys_bits, execonly); 4231 boot_cpu_data.x86_phys_bits, execonly);
4186} 4232}
4187 4233
4234#define BYTE_MASK(access) \
4235 ((1 & (access) ? 2 : 0) | \
4236 (2 & (access) ? 4 : 0) | \
4237 (3 & (access) ? 8 : 0) | \
4238 (4 & (access) ? 16 : 0) | \
4239 (5 & (access) ? 32 : 0) | \
4240 (6 & (access) ? 64 : 0) | \
4241 (7 & (access) ? 128 : 0))
4242
4243
4188static void update_permission_bitmask(struct kvm_vcpu *vcpu, 4244static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4189 struct kvm_mmu *mmu, bool ept) 4245 struct kvm_mmu *mmu, bool ept)
4190{ 4246{
4191 unsigned bit, byte, pfec; 4247 unsigned byte;
4192 u8 map; 4248
4193 bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; 4249 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4250 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4251 const u8 u = BYTE_MASK(ACC_USER_MASK);
4252
4253 bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4254 bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4255 bool cr0_wp = is_write_protection(vcpu);
4194 4256
4195 cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4196 cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4197 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { 4257 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4198 pfec = byte << 1; 4258 unsigned pfec = byte << 1;
4199 map = 0; 4259
4200 wf = pfec & PFERR_WRITE_MASK;
4201 uf = pfec & PFERR_USER_MASK;
4202 ff = pfec & PFERR_FETCH_MASK;
4203 /* 4260 /*
4204 * PFERR_RSVD_MASK bit is set in PFEC if the access is not 4261 * Each "*f" variable has a 1 bit for each UWX value
4205 * subject to SMAP restrictions, and cleared otherwise. The 4262 * that causes a fault with the given PFEC.
4206 * bit is only meaningful if the SMAP bit is set in CR4.
4207 */ 4263 */
4208 smapf = !(pfec & PFERR_RSVD_MASK);
4209 for (bit = 0; bit < 8; ++bit) {
4210 x = bit & ACC_EXEC_MASK;
4211 w = bit & ACC_WRITE_MASK;
4212 u = bit & ACC_USER_MASK;
4213
4214 if (!ept) {
4215 /* Not really needed: !nx will cause pte.nx to fault */
4216 x |= !mmu->nx;
4217 /* Allow supervisor writes if !cr0.wp */
4218 w |= !is_write_protection(vcpu) && !uf;
4219 /* Disallow supervisor fetches of user code if cr4.smep */
4220 x &= !(cr4_smep && u && !uf);
4221
4222 /*
4223 * SMAP:kernel-mode data accesses from user-mode
4224 * mappings should fault. A fault is considered
4225 * as a SMAP violation if all of the following
4226 * conditions are true:
4227 * - X86_CR4_SMAP is set in CR4
4228 * - A user page is accessed
4229 * - Page fault in kernel mode
4230 * - if CPL = 3 or X86_EFLAGS_AC is clear
4231 *
4232 * Here, we cover the first three conditions.
4233 * The fourth is computed dynamically in
4234 * permission_fault() and is in smapf.
4235 *
4236 * Also, SMAP does not affect instruction
4237 * fetches, add the !ff check here to make it
4238 * clearer.
4239 */
4240 smap = cr4_smap && u && !uf && !ff;
4241 }
4242 4264
4243 fault = (ff && !x) || (uf && !u) || (wf && !w) || 4265 /* Faults from writes to non-writable pages */
4244 (smapf && smap); 4266 u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0;
4245 map |= fault << bit; 4267 /* Faults from user mode accesses to supervisor pages */
4268 u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0;
4269 /* Faults from fetches of non-executable pages*/
4270 u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0;
4271 /* Faults from kernel mode fetches of user pages */
4272 u8 smepf = 0;
4273 /* Faults from kernel mode accesses of user pages */
4274 u8 smapf = 0;
4275
4276 if (!ept) {
4277 /* Faults from kernel mode accesses to user pages */
4278 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4279
4280 /* Not really needed: !nx will cause pte.nx to fault */
4281 if (!mmu->nx)
4282 ff = 0;
4283
4284 /* Allow supervisor writes if !cr0.wp */
4285 if (!cr0_wp)
4286 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4287
4288 /* Disallow supervisor fetches of user code if cr4.smep */
4289 if (cr4_smep)
4290 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4291
4292 /*
4293 * SMAP:kernel-mode data accesses from user-mode
4294 * mappings should fault. A fault is considered
4295 * as a SMAP violation if all of the following
4296 * conditions are true:
4297 * - X86_CR4_SMAP is set in CR4
4298 * - A user page is accessed
4299 * - The access is not a fetch
4300 * - Page fault in kernel mode
4301 * - if CPL = 3 or X86_EFLAGS_AC is clear
4302 *
4303 * Here, we cover the first three conditions.
4304 * The fourth is computed dynamically in permission_fault();
4305 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4306 * *not* subject to SMAP restrictions.
4307 */
4308 if (cr4_smap)
4309 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4246 } 4310 }
4247 mmu->permissions[byte] = map; 4311
4312 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4248 } 4313 }
4249} 4314}
4250 4315
@@ -4358,7 +4423,10 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4358static void paging64_init_context(struct kvm_vcpu *vcpu, 4423static void paging64_init_context(struct kvm_vcpu *vcpu,
4359 struct kvm_mmu *context) 4424 struct kvm_mmu *context)
4360{ 4425{
4361 paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); 4426 int root_level = is_la57_mode(vcpu) ?
4427 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4428
4429 paging64_init_context_common(vcpu, context, root_level);
4362} 4430}
4363 4431
4364static void paging32_init_context(struct kvm_vcpu *vcpu, 4432static void paging32_init_context(struct kvm_vcpu *vcpu,
@@ -4399,7 +4467,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4399 context->sync_page = nonpaging_sync_page; 4467 context->sync_page = nonpaging_sync_page;
4400 context->invlpg = nonpaging_invlpg; 4468 context->invlpg = nonpaging_invlpg;
4401 context->update_pte = nonpaging_update_pte; 4469 context->update_pte = nonpaging_update_pte;
4402 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 4470 context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
4403 context->root_hpa = INVALID_PAGE; 4471 context->root_hpa = INVALID_PAGE;
4404 context->direct_map = true; 4472 context->direct_map = true;
4405 context->set_cr3 = kvm_x86_ops->set_tdp_cr3; 4473 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
@@ -4413,7 +4481,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4413 context->root_level = 0; 4481 context->root_level = 0;
4414 } else if (is_long_mode(vcpu)) { 4482 } else if (is_long_mode(vcpu)) {
4415 context->nx = is_nx(vcpu); 4483 context->nx = is_nx(vcpu);
4416 context->root_level = PT64_ROOT_LEVEL; 4484 context->root_level = is_la57_mode(vcpu) ?
4485 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4417 reset_rsvds_bits_mask(vcpu, context); 4486 reset_rsvds_bits_mask(vcpu, context);
4418 context->gva_to_gpa = paging64_gva_to_gpa; 4487 context->gva_to_gpa = paging64_gva_to_gpa;
4419 } else if (is_pae(vcpu)) { 4488 } else if (is_pae(vcpu)) {
@@ -4470,7 +4539,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4470 4539
4471 MMU_WARN_ON(VALID_PAGE(context->root_hpa)); 4540 MMU_WARN_ON(VALID_PAGE(context->root_hpa));
4472 4541
4473 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 4542 context->shadow_root_level = PT64_ROOT_4LEVEL;
4474 4543
4475 context->nx = true; 4544 context->nx = true;
4476 context->ept_ad = accessed_dirty; 4545 context->ept_ad = accessed_dirty;
@@ -4479,7 +4548,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4479 context->sync_page = ept_sync_page; 4548 context->sync_page = ept_sync_page;
4480 context->invlpg = ept_invlpg; 4549 context->invlpg = ept_invlpg;
4481 context->update_pte = ept_update_pte; 4550 context->update_pte = ept_update_pte;
4482 context->root_level = context->shadow_root_level; 4551 context->root_level = PT64_ROOT_4LEVEL;
4483 context->root_hpa = INVALID_PAGE; 4552 context->root_hpa = INVALID_PAGE;
4484 context->direct_map = false; 4553 context->direct_map = false;
4485 context->base_role.ad_disabled = !accessed_dirty; 4554 context->base_role.ad_disabled = !accessed_dirty;
@@ -4524,7 +4593,8 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
4524 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; 4593 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
4525 } else if (is_long_mode(vcpu)) { 4594 } else if (is_long_mode(vcpu)) {
4526 g_context->nx = is_nx(vcpu); 4595 g_context->nx = is_nx(vcpu);
4527 g_context->root_level = PT64_ROOT_LEVEL; 4596 g_context->root_level = is_la57_mode(vcpu) ?
4597 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4528 reset_rsvds_bits_mask(vcpu, g_context); 4598 reset_rsvds_bits_mask(vcpu, g_context);
4529 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; 4599 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
4530 } else if (is_pae(vcpu)) { 4600 } else if (is_pae(vcpu)) {
@@ -4814,12 +4884,12 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
4814} 4884}
4815EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); 4885EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
4816 4886
4817static void make_mmu_pages_available(struct kvm_vcpu *vcpu) 4887static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
4818{ 4888{
4819 LIST_HEAD(invalid_list); 4889 LIST_HEAD(invalid_list);
4820 4890
4821 if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) 4891 if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
4822 return; 4892 return 0;
4823 4893
4824 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { 4894 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
4825 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) 4895 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
@@ -4828,6 +4898,10 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
4828 ++vcpu->kvm->stat.mmu_recycled; 4898 ++vcpu->kvm->stat.mmu_recycled;
4829 } 4899 }
4830 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 4900 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
4901
4902 if (!kvm_mmu_available_pages(vcpu->kvm))
4903 return -ENOSPC;
4904 return 0;
4831} 4905}
4832 4906
4833int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, 4907int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
@@ -4835,7 +4909,13 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
4835{ 4909{
4836 int r, emulation_type = EMULTYPE_RETRY; 4910 int r, emulation_type = EMULTYPE_RETRY;
4837 enum emulation_result er; 4911 enum emulation_result er;
4838 bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); 4912 bool direct = vcpu->arch.mmu.direct_map;
4913
4914 /* With shadow page tables, fault_address contains a GVA or nGPA. */
4915 if (vcpu->arch.mmu.direct_map) {
4916 vcpu->arch.gpa_available = true;
4917 vcpu->arch.gpa_val = cr2;
4918 }
4839 4919
4840 if (unlikely(error_code & PFERR_RSVD_MASK)) { 4920 if (unlikely(error_code & PFERR_RSVD_MASK)) {
4841 r = handle_mmio_page_fault(vcpu, cr2, direct); 4921 r = handle_mmio_page_fault(vcpu, cr2, direct);
@@ -4847,6 +4927,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
4847 return 1; 4927 return 1;
4848 if (r < 0) 4928 if (r < 0)
4849 return r; 4929 return r;
4930 /* Must be RET_MMIO_PF_INVALID. */
4850 } 4931 }
4851 4932
4852 r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), 4933 r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
@@ -4862,11 +4943,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
4862 * This can occur when using nested virtualization with nested 4943 * This can occur when using nested virtualization with nested
4863 * paging in both guests. If true, we simply unprotect the page 4944 * paging in both guests. If true, we simply unprotect the page
4864 * and resume the guest. 4945 * and resume the guest.
4865 *
4866 * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used
4867 * in PFERR_NEXT_GUEST_PAGE)
4868 */ 4946 */
4869 if (error_code == PFERR_NESTED_GUEST_PAGE) { 4947 if (vcpu->arch.mmu.direct_map &&
4948 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
4870 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); 4949 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
4871 return 1; 4950 return 1;
4872 } 4951 }
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 4b9a3ae6b725..64a2dbd2b1af 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,7 +37,8 @@
37#define PT32_DIR_PSE36_MASK \ 37#define PT32_DIR_PSE36_MASK \
38 (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) 38 (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
39 39
40#define PT64_ROOT_LEVEL 4 40#define PT64_ROOT_5LEVEL 5
41#define PT64_ROOT_4LEVEL 4
41#define PT32_ROOT_LEVEL 2 42#define PT32_ROOT_LEVEL 2
42#define PT32E_ROOT_LEVEL 3 43#define PT32E_ROOT_LEVEL 3
43 44
@@ -48,6 +49,9 @@
48 49
49static inline u64 rsvd_bits(int s, int e) 50static inline u64 rsvd_bits(int s, int e)
50{ 51{
52 if (e < s)
53 return 0;
54
51 return ((1ULL << (e - s + 1)) - 1) << s; 55 return ((1ULL << (e - s + 1)) - 1) << s;
52} 56}
53 57
@@ -56,23 +60,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
56void 60void
57reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 61reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
58 62
59/*
60 * Return values of handle_mmio_page_fault:
61 * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
62 * directly.
63 * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
64 * fault path update the mmio spte.
65 * RET_MMIO_PF_RETRY: let CPU fault again on the address.
66 * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
67 */
68enum {
69 RET_MMIO_PF_EMULATE = 1,
70 RET_MMIO_PF_INVALID = 2,
71 RET_MMIO_PF_RETRY = 0,
72 RET_MMIO_PF_BUG = -1
73};
74
75int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
76void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); 63void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
77void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, 64void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
78 bool accessed_dirty); 65 bool accessed_dirty);
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index dcce533d420c..d22ddbdf5e6e 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -62,11 +62,11 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
62 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 62 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
63 return; 63 return;
64 64
65 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 65 if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
66 hpa_t root = vcpu->arch.mmu.root_hpa; 66 hpa_t root = vcpu->arch.mmu.root_hpa;
67 67
68 sp = page_header(root); 68 sp = page_header(root);
69 __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); 69 __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
70 return; 70 return;
71 } 71 }
72 72
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 0149ac59c273..e9ea2d45ae66 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -130,7 +130,7 @@ static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
130 * enable MTRRs and it is obviously undesirable to run the 130 * enable MTRRs and it is obviously undesirable to run the
131 * guest entirely with UC memory and we use WB. 131 * guest entirely with UC memory and we use WB.
132 */ 132 */
133 if (guest_cpuid_has_mtrr(vcpu)) 133 if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
134 return MTRR_TYPE_UNCACHABLE; 134 return MTRR_TYPE_UNCACHABLE;
135 else 135 else
136 return MTRR_TYPE_WRBACK; 136 return MTRR_TYPE_WRBACK;
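
This mtrr.c change is one of many call sites in the series converted from a bespoke guest_cpuid_has_<feature>() helper to the generic guest_cpuid_has(vcpu, X86_FEATURE_*) lookup. A hypothetical miniature of such a table-driven query; the structures and names below are invented for illustration and do not reflect KVM's actual implementation:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Each feature is identified by (leaf, register, bit) instead of a helper. */
struct feature_desc {
        uint32_t leaf;
        int reg;        /* 0 = eax, 1 = ebx, 2 = ecx, 3 = edx */
        int bit;
};

struct cached_cpuid {
        uint32_t leaf;
        uint32_t regs[4];
};

static const struct feature_desc FEATURE_MTRR = { 0x1, 3, 12 };

static bool guest_has(const struct cached_cpuid *c, size_t n,
                      struct feature_desc f)
{
        for (size_t i = 0; i < n; i++)
                if (c[i].leaf == f.leaf)
                        return c[i].regs[f.reg] & (1u << f.bit);
        return false;
}

int main(void)
{
        struct cached_cpuid cpuid[] = {
                { .leaf = 0x1, .regs = { 0, 0, 0, 1u << 12 } },
        };

        printf("MTRR: %d\n", guest_has(cpuid, 1, FEATURE_MTRR));
        return 0;
}

The point of the conversion is that adding a new feature check means adding a table entry rather than another one-off helper.
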
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index b0454c7e4cff..86b68dc5a649 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -790,8 +790,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
790 &map_writable)) 790 &map_writable))
791 return 0; 791 return 0;
792 792
793 if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr, 793 if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
794 walker.gfn, pfn, walker.pte_access, &r))
795 return r; 794 return r;
796 795
797 /* 796 /*
@@ -819,7 +818,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
819 goto out_unlock; 818 goto out_unlock;
820 819
821 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 820 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
822 make_mmu_pages_available(vcpu); 821 if (make_mmu_pages_available(vcpu) < 0)
822 goto out_unlock;
823 if (!force_pt_level) 823 if (!force_pt_level)
824 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 824 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
825 r = FNAME(fetch)(vcpu, addr, &walker, write_fault, 825 r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
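
The call above now checks make_mmu_pages_available() for failure: the mmu.c hunk near the top of this diff makes it return -ENOSPC when zapping frees nothing, and the fault path unlocks and bails out rather than continuing with an empty shadow-page pool. A rough sketch of that contract, with invented names and a trivial stand-in for the zap loop:

#include <errno.h>
#include <stdio.h>

#define MIN_FREE_PAGES 4

static int free_pages_avail;            /* pretend the pool is exhausted */

static int zap_oldest_page(void)
{
        return 0;                       /* nothing left to zap */
}

/* Mirror of the new contract: 0 on success, -ENOSPC if the pool stays empty. */
static int make_pages_available(void)
{
        while (free_pages_avail < MIN_FREE_PAGES) {
                if (!zap_oldest_page())
                        break;
                free_pages_avail++;
        }
        return free_pages_avail ? 0 : -ENOSPC;
}

int main(void)
{
        if (make_pages_available() < 0) {
                fprintf(stderr, "no shadow pages available, aborting fault\n");
                return 1;
        }
        return 0;
}
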
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8dbd8dbc83eb..2c1cfe68a9af 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -280,9 +280,9 @@ module_param(avic, int, S_IRUGO);
280static int vls = true; 280static int vls = true;
281module_param(vls, int, 0444); 281module_param(vls, int, 0444);
282 282
283/* AVIC VM ID bit masks and lock */ 283/* enable/disable Virtual GIF */
284static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); 284static int vgif = true;
285static DEFINE_SPINLOCK(avic_vm_id_lock); 285module_param(vgif, int, 0444);
286 286
287static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 287static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
288static void svm_flush_tlb(struct kvm_vcpu *vcpu); 288static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -479,19 +479,33 @@ static inline void clr_intercept(struct vcpu_svm *svm, int bit)
479 recalc_intercepts(svm); 479 recalc_intercepts(svm);
480} 480}
481 481
482static inline bool vgif_enabled(struct vcpu_svm *svm)
483{
484 return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
485}
486
482static inline void enable_gif(struct vcpu_svm *svm) 487static inline void enable_gif(struct vcpu_svm *svm)
483{ 488{
484 svm->vcpu.arch.hflags |= HF_GIF_MASK; 489 if (vgif_enabled(svm))
490 svm->vmcb->control.int_ctl |= V_GIF_MASK;
491 else
492 svm->vcpu.arch.hflags |= HF_GIF_MASK;
485} 493}
486 494
487static inline void disable_gif(struct vcpu_svm *svm) 495static inline void disable_gif(struct vcpu_svm *svm)
488{ 496{
489 svm->vcpu.arch.hflags &= ~HF_GIF_MASK; 497 if (vgif_enabled(svm))
498 svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
499 else
500 svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
490} 501}
491 502
492static inline bool gif_set(struct vcpu_svm *svm) 503static inline bool gif_set(struct vcpu_svm *svm)
493{ 504{
494 return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); 505 if (vgif_enabled(svm))
506 return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
507 else
508 return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
495} 509}
496 510
497static unsigned long iopm_base; 511static unsigned long iopm_base;
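
With vGIF the guest's global interrupt flag is tracked in the VMCB int_ctl field, one bit marking that the feature is active and another holding the flag value, with a fall back to the software-maintained hflags bit otherwise. A compact userspace model of that dispatch, using invented bit positions rather than the real V_GIF_* definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented bit positions; the real masks live in arch/x86/include/asm/svm.h. */
#define FAKE_V_GIF_ENABLE (1u << 25)
#define FAKE_V_GIF        (1u << 9)
#define FAKE_HF_GIF       (1u << 0)

struct fake_svm {
        uint32_t int_ctl;       /* stands in for vmcb->control.int_ctl */
        uint32_t hflags;        /* stands in for vcpu.arch.hflags      */
};

static bool vgif_enabled(const struct fake_svm *s)
{
        return s->int_ctl & FAKE_V_GIF_ENABLE;
}

static void enable_gif(struct fake_svm *s)
{
        if (vgif_enabled(s))
                s->int_ctl |= FAKE_V_GIF;       /* hardware-tracked flag */
        else
                s->hflags |= FAKE_HF_GIF;       /* software-tracked flag */
}

static bool gif_set(const struct fake_svm *s)
{
        if (vgif_enabled(s))
                return s->int_ctl & FAKE_V_GIF;
        return s->hflags & FAKE_HF_GIF;
}

int main(void)
{
        struct fake_svm s = { .int_ctl = FAKE_V_GIF_ENABLE };

        enable_gif(&s);
        printf("gif_set: %d\n", gif_set(&s));   /* 1, tracked in int_ctl */
        return 0;
}

disable_gif() in the hunk above is the mirror image, clearing whichever bit is in use.
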
@@ -567,10 +581,10 @@ static inline void invlpga(unsigned long addr, u32 asid)
567 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 581 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
568} 582}
569 583
570static int get_npt_level(void) 584static int get_npt_level(struct kvm_vcpu *vcpu)
571{ 585{
572#ifdef CONFIG_X86_64 586#ifdef CONFIG_X86_64
573 return PT64_ROOT_LEVEL; 587 return PT64_ROOT_4LEVEL;
574#else 588#else
575 return PT32E_ROOT_LEVEL; 589 return PT32E_ROOT_LEVEL;
576#endif 590#endif
@@ -641,7 +655,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
641 struct vcpu_svm *svm = to_svm(vcpu); 655 struct vcpu_svm *svm = to_svm(vcpu);
642 unsigned nr = vcpu->arch.exception.nr; 656 unsigned nr = vcpu->arch.exception.nr;
643 bool has_error_code = vcpu->arch.exception.has_error_code; 657 bool has_error_code = vcpu->arch.exception.has_error_code;
644 bool reinject = vcpu->arch.exception.reinject; 658 bool reinject = vcpu->arch.exception.injected;
645 u32 error_code = vcpu->arch.exception.error_code; 659 u32 error_code = vcpu->arch.exception.error_code;
646 660
647 /* 661 /*
@@ -973,6 +987,7 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
973static void disable_nmi_singlestep(struct vcpu_svm *svm) 987static void disable_nmi_singlestep(struct vcpu_svm *svm)
974{ 988{
975 svm->nmi_singlestep = false; 989 svm->nmi_singlestep = false;
990
976 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { 991 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
977 /* Clear our flags if they were not set by the guest */ 992 /* Clear our flags if they were not set by the guest */
978 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) 993 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
@@ -989,6 +1004,8 @@ static void disable_nmi_singlestep(struct vcpu_svm *svm)
989 */ 1004 */
990#define SVM_VM_DATA_HASH_BITS 8 1005#define SVM_VM_DATA_HASH_BITS 8
991static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); 1006static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
1007static u32 next_vm_id = 0;
1008static bool next_vm_id_wrapped = 0;
992static DEFINE_SPINLOCK(svm_vm_data_hash_lock); 1009static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
993 1010
994/* Note: 1011/* Note:
@@ -1108,6 +1125,13 @@ static __init int svm_hardware_setup(void)
1108 } 1125 }
1109 } 1126 }
1110 1127
1128 if (vgif) {
1129 if (!boot_cpu_has(X86_FEATURE_VGIF))
1130 vgif = false;
1131 else
1132 pr_info("Virtual GIF supported\n");
1133 }
1134
1111 return 0; 1135 return 0;
1112 1136
1113err: 1137err:
@@ -1305,6 +1329,12 @@ static void init_vmcb(struct vcpu_svm *svm)
1305 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1329 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1306 } 1330 }
1307 1331
1332 if (vgif) {
1333 clr_intercept(svm, INTERCEPT_STGI);
1334 clr_intercept(svm, INTERCEPT_CLGI);
1335 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1336 }
1337
1308 mark_all_dirty(svm->vmcb); 1338 mark_all_dirty(svm->vmcb);
1309 1339
1310 enable_gif(svm); 1340 enable_gif(svm);
@@ -1387,34 +1417,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
1387 return 0; 1417 return 0;
1388} 1418}
1389 1419
1390static inline int avic_get_next_vm_id(void)
1391{
1392 int id;
1393
1394 spin_lock(&avic_vm_id_lock);
1395
1396 /* AVIC VM ID is one-based. */
1397 id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
1398 if (id <= AVIC_VM_ID_MASK)
1399 __set_bit(id, avic_vm_id_bitmap);
1400 else
1401 id = -EAGAIN;
1402
1403 spin_unlock(&avic_vm_id_lock);
1404 return id;
1405}
1406
1407static inline int avic_free_vm_id(int id)
1408{
1409 if (id <= 0 || id > AVIC_VM_ID_MASK)
1410 return -EINVAL;
1411
1412 spin_lock(&avic_vm_id_lock);
1413 __clear_bit(id, avic_vm_id_bitmap);
1414 spin_unlock(&avic_vm_id_lock);
1415 return 0;
1416}
1417
1418static void avic_vm_destroy(struct kvm *kvm) 1420static void avic_vm_destroy(struct kvm *kvm)
1419{ 1421{
1420 unsigned long flags; 1422 unsigned long flags;
@@ -1423,8 +1425,6 @@ static void avic_vm_destroy(struct kvm *kvm)
1423 if (!avic) 1425 if (!avic)
1424 return; 1426 return;
1425 1427
1426 avic_free_vm_id(vm_data->avic_vm_id);
1427
1428 if (vm_data->avic_logical_id_table_page) 1428 if (vm_data->avic_logical_id_table_page)
1429 __free_page(vm_data->avic_logical_id_table_page); 1429 __free_page(vm_data->avic_logical_id_table_page);
1430 if (vm_data->avic_physical_id_table_page) 1430 if (vm_data->avic_physical_id_table_page)
@@ -1438,19 +1438,16 @@ static void avic_vm_destroy(struct kvm *kvm)
1438static int avic_vm_init(struct kvm *kvm) 1438static int avic_vm_init(struct kvm *kvm)
1439{ 1439{
1440 unsigned long flags; 1440 unsigned long flags;
1441 int vm_id, err = -ENOMEM; 1441 int err = -ENOMEM;
1442 struct kvm_arch *vm_data = &kvm->arch; 1442 struct kvm_arch *vm_data = &kvm->arch;
1443 struct page *p_page; 1443 struct page *p_page;
1444 struct page *l_page; 1444 struct page *l_page;
1445 struct kvm_arch *ka;
1446 u32 vm_id;
1445 1447
1446 if (!avic) 1448 if (!avic)
1447 return 0; 1449 return 0;
1448 1450
1449 vm_id = avic_get_next_vm_id();
1450 if (vm_id < 0)
1451 return vm_id;
1452 vm_data->avic_vm_id = (u32)vm_id;
1453
1454 /* Allocating physical APIC ID table (4KB) */ 1451 /* Allocating physical APIC ID table (4KB) */
1455 p_page = alloc_page(GFP_KERNEL); 1452 p_page = alloc_page(GFP_KERNEL);
1456 if (!p_page) 1453 if (!p_page)
@@ -1468,6 +1465,22 @@ static int avic_vm_init(struct kvm *kvm)
1468 clear_page(page_address(l_page)); 1465 clear_page(page_address(l_page));
1469 1466
1470 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 1467 spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1468 again:
1469 vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
1470 if (vm_id == 0) { /* id is 1-based, zero is not okay */
1471 next_vm_id_wrapped = 1;
1472 goto again;
1473 }
1474 /* Is it still in use? Only possible if wrapped at least once */
1475 if (next_vm_id_wrapped) {
1476 hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
1477 struct kvm *k2 = container_of(ka, struct kvm, arch);
1478 struct kvm_arch *vd2 = &k2->arch;
1479 if (vd2->avic_vm_id == vm_id)
1480 goto again;
1481 }
1482 }
1483 vm_data->avic_vm_id = vm_id;
1471 hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); 1484 hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
1472 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 1485 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1473 1486
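
The bitmap allocator is replaced by a wrapping counter: IDs are handed out sequentially under svm_vm_data_hash_lock, zero is skipped because the ID space is 1-based, and only after the counter has wrapped at least once is each candidate checked against the VMs already present in the hash table. A small userspace sketch of that allocation loop, with a flat array standing in for the hash table and an arbitrary mask width:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_VM_ID_MASK 0xff    /* placeholder width, not AVIC_VM_ID_MASK */
#define MAX_LIVE_VMS    16

static uint32_t next_vm_id;
static bool next_vm_id_wrapped;
static uint32_t live_ids[MAX_LIVE_VMS];
static int nr_live;

static bool id_in_use(uint32_t id)
{
        for (int i = 0; i < nr_live; i++)
                if (live_ids[i] == id)
                        return true;
        return false;
}

/* Demo assumes the ID space is not exhausted and nr_live < MAX_LIVE_VMS. */
static uint32_t alloc_vm_id(void)
{
        uint32_t id;

again:
        id = next_vm_id = (next_vm_id + 1) & FAKE_VM_ID_MASK;
        if (id == 0) {                  /* IDs are 1-based */
                next_vm_id_wrapped = true;
                goto again;
        }
        /* Collisions are only possible once the counter has wrapped. */
        if (next_vm_id_wrapped && id_in_use(id))
                goto again;

        live_ids[nr_live++] = id;
        return id;
}

int main(void)
{
        for (int i = 0; i < 5; i++)
                printf("vm_id %u\n", alloc_vm_id());
        return 0;
}
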
@@ -1580,7 +1593,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1580 } 1593 }
1581 init_vmcb(svm); 1594 init_vmcb(svm);
1582 1595
1583 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); 1596 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
1584 kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 1597 kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1585 1598
1586 if (kvm_vcpu_apicv_active(vcpu) && !init_event) 1599 if (kvm_vcpu_apicv_active(vcpu) && !init_event)
@@ -2384,7 +2397,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2384 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; 2397 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
2385 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; 2398 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
2386 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; 2399 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
2387 vcpu->arch.mmu.shadow_root_level = get_npt_level(); 2400 vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
2388 reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu); 2401 reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
2389 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 2402 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
2390} 2403}
@@ -3147,6 +3160,13 @@ static int stgi_interception(struct vcpu_svm *svm)
3147 if (nested_svm_check_permissions(svm)) 3160 if (nested_svm_check_permissions(svm))
3148 return 1; 3161 return 1;
3149 3162
3163 /*
3164 * If VGIF is enabled, the STGI intercept is only added to
3165 * detect the opening of the NMI window; remove it now.
3166 */
3167 if (vgif_enabled(svm))
3168 clr_intercept(svm, INTERCEPT_STGI);
3169
3150 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3170 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3151 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3171 ret = kvm_skip_emulated_instruction(&svm->vcpu);
3152 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3172 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
@@ -3744,7 +3764,10 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
3744 3764
3745static int pause_interception(struct vcpu_svm *svm) 3765static int pause_interception(struct vcpu_svm *svm)
3746{ 3766{
3747 kvm_vcpu_on_spin(&(svm->vcpu)); 3767 struct kvm_vcpu *vcpu = &svm->vcpu;
3768 bool in_kernel = (svm_get_cpl(vcpu) == 0);
3769
3770 kvm_vcpu_on_spin(vcpu, in_kernel);
3748 return 1; 3771 return 1;
3749} 3772}
3750 3773
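
pause_interception() now tells kvm_vcpu_on_spin() whether the spinning vCPU was executing at CPL 0; the VMX handle_pause() hunk further down passes true unconditionally because PAUSE-loop exiting only triggers at CPL 0. A toy sketch of how such an in-kernel flag might feed a yield decision; the policy shown is invented for illustration and is not KVM's actual heuristic:

#include <stdbool.h>
#include <stdio.h>

struct toy_vcpu {
        int id;
        bool preempted;
        bool last_seen_in_kernel;       /* ~ "was at CPL 0 when it last ran" */
};

/*
 * Toy boost rule: a spinner in kernel mode only yields to preempted vCPUs
 * that were also in kernel mode, on the theory that they may hold a lock.
 */
static bool worth_yielding_to(const struct toy_vcpu *target,
                              bool spinner_in_kernel)
{
        if (!target->preempted)
                return false;
        return !spinner_in_kernel || target->last_seen_in_kernel;
}

int main(void)
{
        struct toy_vcpu user_task   = { 1, true, false };
        struct toy_vcpu lock_holder = { 2, true, true };

        printf("boost vcpu1: %d\n", worth_yielding_to(&user_task, true));   /* 0 */
        printf("boost vcpu2: %d\n", worth_yielding_to(&lock_holder, true)); /* 1 */
        return 0;
}
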
@@ -4228,8 +4251,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
4228 4251
4229 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); 4252 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
4230 4253
4231 vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF);
4232
4233 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) 4254 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
4234 vcpu->arch.cr0 = svm->vmcb->save.cr0; 4255 vcpu->arch.cr0 = svm->vmcb->save.cr0;
4235 if (npt_enabled) 4256 if (npt_enabled)
@@ -4682,9 +4703,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
4682 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes 4703 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
4683 * 1, because that's a separate STGI/VMRUN intercept. The next time we 4704 * 1, because that's a separate STGI/VMRUN intercept. The next time we
4684 * get that intercept, this function will be called again though and 4705 * get that intercept, this function will be called again though and
4685 * we'll get the vintr intercept. 4706 * we'll get the vintr intercept. However, if the vGIF feature is
4707 * enabled, the STGI interception will not occur. Enable the irq
4708 * window under the assumption that the hardware will set the GIF.
4686 */ 4709 */
4687 if (gif_set(svm) && nested_svm_intr(svm)) { 4710 if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
4688 svm_set_vintr(svm); 4711 svm_set_vintr(svm);
4689 svm_inject_irq(svm, 0x0); 4712 svm_inject_irq(svm, 0x0);
4690 } 4713 }
@@ -4698,8 +4721,11 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
4698 == HF_NMI_MASK) 4721 == HF_NMI_MASK)
4699 return; /* IRET will cause a vm exit */ 4722 return; /* IRET will cause a vm exit */
4700 4723
4701 if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0) 4724 if (!gif_set(svm)) {
4725 if (vgif_enabled(svm))
4726 set_intercept(svm, INTERCEPT_STGI);
4702 return; /* STGI will cause a vm exit */ 4727 return; /* STGI will cause a vm exit */
4728 }
4703 4729
4704 if (svm->nested.exit_required) 4730 if (svm->nested.exit_required)
4705 return; /* we're not going to run the guest yet */ 4731 return; /* we're not going to run the guest yet */
@@ -5071,17 +5097,14 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
5071static void svm_cpuid_update(struct kvm_vcpu *vcpu) 5097static void svm_cpuid_update(struct kvm_vcpu *vcpu)
5072{ 5098{
5073 struct vcpu_svm *svm = to_svm(vcpu); 5099 struct vcpu_svm *svm = to_svm(vcpu);
5074 struct kvm_cpuid_entry2 *entry;
5075 5100
5076 /* Update nrips enabled cache */ 5101 /* Update nrips enabled cache */
5077 svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); 5102 svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
5078 5103
5079 if (!kvm_vcpu_apicv_active(vcpu)) 5104 if (!kvm_vcpu_apicv_active(vcpu))
5080 return; 5105 return;
5081 5106
5082 entry = kvm_find_cpuid_entry(vcpu, 1, 0); 5107 guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
5083 if (entry)
5084 entry->ecx &= ~bit(X86_FEATURE_X2APIC);
5085} 5108}
5086 5109
5087static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 5110static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0a6cc6754ec5..8a202c49e2a0 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -151,8 +151,8 @@ TRACE_EVENT(kvm_fast_mmio,
151 */ 151 */
152TRACE_EVENT(kvm_cpuid, 152TRACE_EVENT(kvm_cpuid,
153 TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, 153 TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
154 unsigned long rcx, unsigned long rdx), 154 unsigned long rcx, unsigned long rdx, bool found),
155 TP_ARGS(function, rax, rbx, rcx, rdx), 155 TP_ARGS(function, rax, rbx, rcx, rdx, found),
156 156
157 TP_STRUCT__entry( 157 TP_STRUCT__entry(
158 __field( unsigned int, function ) 158 __field( unsigned int, function )
@@ -160,6 +160,7 @@ TRACE_EVENT(kvm_cpuid,
160 __field( unsigned long, rbx ) 160 __field( unsigned long, rbx )
161 __field( unsigned long, rcx ) 161 __field( unsigned long, rcx )
162 __field( unsigned long, rdx ) 162 __field( unsigned long, rdx )
163 __field( bool, found )
163 ), 164 ),
164 165
165 TP_fast_assign( 166 TP_fast_assign(
@@ -168,11 +169,13 @@ TRACE_EVENT(kvm_cpuid,
168 __entry->rbx = rbx; 169 __entry->rbx = rbx;
169 __entry->rcx = rcx; 170 __entry->rcx = rcx;
170 __entry->rdx = rdx; 171 __entry->rdx = rdx;
172 __entry->found = found;
171 ), 173 ),
172 174
173 TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", 175 TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s",
174 __entry->function, __entry->rax, 176 __entry->function, __entry->rax,
175 __entry->rbx, __entry->rcx, __entry->rdx) 177 __entry->rbx, __entry->rcx, __entry->rdx,
178 __entry->found ? "found" : "not found")
176); 179);
177 180
178#define AREG(x) { APIC_##x, "APIC_" #x } 181#define AREG(x) { APIC_##x, "APIC_" #x }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 70b90c0810d0..4253adef9044 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -122,7 +122,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
122 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 122 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
123#define KVM_CR4_GUEST_OWNED_BITS \ 123#define KVM_CR4_GUEST_OWNED_BITS \
124 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ 124 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
125 | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) 125 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
126 126
127#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 127#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
128#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 128#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
@@ -243,11 +243,13 @@ struct __packed vmcs12 {
243 u64 virtual_apic_page_addr; 243 u64 virtual_apic_page_addr;
244 u64 apic_access_addr; 244 u64 apic_access_addr;
245 u64 posted_intr_desc_addr; 245 u64 posted_intr_desc_addr;
246 u64 vm_function_control;
246 u64 ept_pointer; 247 u64 ept_pointer;
247 u64 eoi_exit_bitmap0; 248 u64 eoi_exit_bitmap0;
248 u64 eoi_exit_bitmap1; 249 u64 eoi_exit_bitmap1;
249 u64 eoi_exit_bitmap2; 250 u64 eoi_exit_bitmap2;
250 u64 eoi_exit_bitmap3; 251 u64 eoi_exit_bitmap3;
252 u64 eptp_list_address;
251 u64 xss_exit_bitmap; 253 u64 xss_exit_bitmap;
252 u64 guest_physical_address; 254 u64 guest_physical_address;
253 u64 vmcs_link_pointer; 255 u64 vmcs_link_pointer;
@@ -481,6 +483,7 @@ struct nested_vmx {
481 u64 nested_vmx_cr4_fixed0; 483 u64 nested_vmx_cr4_fixed0;
482 u64 nested_vmx_cr4_fixed1; 484 u64 nested_vmx_cr4_fixed1;
483 u64 nested_vmx_vmcs_enum; 485 u64 nested_vmx_vmcs_enum;
486 u64 nested_vmx_vmfunc_controls;
484}; 487};
485 488
486#define POSTED_INTR_ON 0 489#define POSTED_INTR_ON 0
@@ -573,6 +576,8 @@ struct vcpu_vmx {
573#endif 576#endif
574 u32 vm_entry_controls_shadow; 577 u32 vm_entry_controls_shadow;
575 u32 vm_exit_controls_shadow; 578 u32 vm_exit_controls_shadow;
579 u32 secondary_exec_control;
580
576 /* 581 /*
577 * loaded_vmcs points to the VMCS currently used in this vcpu. For a 582 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
578 * non-nested (L1) guest, it always points to vmcs01. For a nested 583 * non-nested (L1) guest, it always points to vmcs01. For a nested
@@ -761,11 +766,13 @@ static const unsigned short vmcs_field_to_offset_table[] = {
761 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), 766 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
762 FIELD64(APIC_ACCESS_ADDR, apic_access_addr), 767 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
763 FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), 768 FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
769 FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
764 FIELD64(EPT_POINTER, ept_pointer), 770 FIELD64(EPT_POINTER, ept_pointer),
765 FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), 771 FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
766 FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), 772 FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
767 FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), 773 FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
768 FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), 774 FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
775 FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
769 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), 776 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
770 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), 777 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
771 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), 778 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -889,25 +896,6 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
889 return to_vmx(vcpu)->nested.cached_vmcs12; 896 return to_vmx(vcpu)->nested.cached_vmcs12;
890} 897}
891 898
892static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
893{
894 struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT);
895 if (is_error_page(page))
896 return NULL;
897
898 return page;
899}
900
901static void nested_release_page(struct page *page)
902{
903 kvm_release_page_dirty(page);
904}
905
906static void nested_release_page_clean(struct page *page)
907{
908 kvm_release_page_clean(page);
909}
910
911static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); 899static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
912static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); 900static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
913static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); 901static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
@@ -1212,6 +1200,16 @@ static inline bool cpu_has_vmx_ept_4levels(void)
1212 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; 1200 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1213} 1201}
1214 1202
1203static inline bool cpu_has_vmx_ept_mt_wb(void)
1204{
1205 return vmx_capability.ept & VMX_EPTP_WB_BIT;
1206}
1207
1208static inline bool cpu_has_vmx_ept_5levels(void)
1209{
1210 return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
1211}
1212
1215static inline bool cpu_has_vmx_ept_ad_bits(void) 1213static inline bool cpu_has_vmx_ept_ad_bits(void)
1216{ 1214{
1217 return vmx_capability.ept & VMX_EPT_AD_BIT; 1215 return vmx_capability.ept & VMX_EPT_AD_BIT;
@@ -1317,6 +1315,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
1317 SECONDARY_EXEC_TSC_SCALING; 1315 SECONDARY_EXEC_TSC_SCALING;
1318} 1316}
1319 1317
1318static inline bool cpu_has_vmx_vmfunc(void)
1319{
1320 return vmcs_config.cpu_based_2nd_exec_ctrl &
1321 SECONDARY_EXEC_ENABLE_VMFUNC;
1322}
1323
1320static inline bool report_flexpriority(void) 1324static inline bool report_flexpriority(void)
1321{ 1325{
1322 return flexpriority_enabled; 1326 return flexpriority_enabled;
@@ -1357,8 +1361,7 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1357 1361
1358static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) 1362static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1359{ 1363{
1360 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) && 1364 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
1361 vmx_xsaves_supported();
1362} 1365}
1363 1366
1364static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) 1367static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
@@ -1391,6 +1394,18 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1391 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; 1394 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1392} 1395}
1393 1396
1397static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
1398{
1399 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
1400}
1401
1402static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
1403{
1404 return nested_cpu_has_vmfunc(vmcs12) &&
1405 (vmcs12->vm_function_control &
1406 VMX_VMFUNC_EPTP_SWITCHING);
1407}
1408
1394static inline bool is_nmi(u32 intr_info) 1409static inline bool is_nmi(u32 intr_info)
1395{ 1410{
1396 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1411 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2450,15 +2465,14 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
2450 * KVM wants to inject page-faults which it got to the guest. This function 2465 * KVM wants to inject page-faults which it got to the guest. This function
2451 * checks whether in a nested guest, we need to inject them to L1 or L2. 2466 * checks whether in a nested guest, we need to inject them to L1 or L2.
2452 */ 2467 */
2453static int nested_vmx_check_exception(struct kvm_vcpu *vcpu) 2468static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
2454{ 2469{
2455 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2470 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2456 unsigned int nr = vcpu->arch.exception.nr; 2471 unsigned int nr = vcpu->arch.exception.nr;
2457 2472
2458 if (nr == PF_VECTOR) { 2473 if (nr == PF_VECTOR) {
2459 if (vcpu->arch.exception.nested_apf) { 2474 if (vcpu->arch.exception.nested_apf) {
2460 nested_vmx_inject_exception_vmexit(vcpu, 2475 *exit_qual = vcpu->arch.apf.nested_apf_token;
2461 vcpu->arch.apf.nested_apf_token);
2462 return 1; 2476 return 1;
2463 } 2477 }
2464 /* 2478 /*
@@ -2472,16 +2486,15 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
2472 */ 2486 */
2473 if (nested_vmx_is_page_fault_vmexit(vmcs12, 2487 if (nested_vmx_is_page_fault_vmexit(vmcs12,
2474 vcpu->arch.exception.error_code)) { 2488 vcpu->arch.exception.error_code)) {
2475 nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2); 2489 *exit_qual = vcpu->arch.cr2;
2476 return 1; 2490 return 1;
2477 } 2491 }
2478 } else { 2492 } else {
2479 unsigned long exit_qual = 0;
2480 if (nr == DB_VECTOR)
2481 exit_qual = vcpu->arch.dr6;
2482
2483 if (vmcs12->exception_bitmap & (1u << nr)) { 2493 if (vmcs12->exception_bitmap & (1u << nr)) {
2484 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 2494 if (nr == DB_VECTOR)
2495 *exit_qual = vcpu->arch.dr6;
2496 else
2497 *exit_qual = 0;
2485 return 1; 2498 return 1;
2486 } 2499 }
2487 } 2500 }
@@ -2494,14 +2507,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
2494 struct vcpu_vmx *vmx = to_vmx(vcpu); 2507 struct vcpu_vmx *vmx = to_vmx(vcpu);
2495 unsigned nr = vcpu->arch.exception.nr; 2508 unsigned nr = vcpu->arch.exception.nr;
2496 bool has_error_code = vcpu->arch.exception.has_error_code; 2509 bool has_error_code = vcpu->arch.exception.has_error_code;
2497 bool reinject = vcpu->arch.exception.reinject;
2498 u32 error_code = vcpu->arch.exception.error_code; 2510 u32 error_code = vcpu->arch.exception.error_code;
2499 u32 intr_info = nr | INTR_INFO_VALID_MASK; 2511 u32 intr_info = nr | INTR_INFO_VALID_MASK;
2500 2512
2501 if (!reinject && is_guest_mode(vcpu) &&
2502 nested_vmx_check_exception(vcpu))
2503 return;
2504
2505 if (has_error_code) { 2513 if (has_error_code) {
2506 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 2514 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2507 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 2515 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -2600,7 +2608,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
2600 if (index >= 0) 2608 if (index >= 0)
2601 move_msr_up(vmx, index, save_nmsrs++); 2609 move_msr_up(vmx, index, save_nmsrs++);
2602 index = __find_msr_index(vmx, MSR_TSC_AUX); 2610 index = __find_msr_index(vmx, MSR_TSC_AUX);
2603 if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) 2611 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
2604 move_msr_up(vmx, index, save_nmsrs++); 2612 move_msr_up(vmx, index, save_nmsrs++);
2605 /* 2613 /*
2606 * MSR_STAR is only needed on long mode guests, and only 2614 * MSR_STAR is only needed on long mode guests, and only
@@ -2660,12 +2668,6 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2660 } 2668 }
2661} 2669}
2662 2670
2663static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
2664{
2665 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
2666 return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
2667}
2668
2669/* 2671/*
2670 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX 2672 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2671 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for 2673 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
@@ -2674,7 +2676,7 @@ static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
2674 */ 2676 */
2675static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) 2677static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2676{ 2678{
2677 return nested && guest_cpuid_has_vmx(vcpu); 2679 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
2678} 2680}
2679 2681
2680/* 2682/*
@@ -2797,21 +2799,21 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2797 vmx->nested.nested_vmx_procbased_ctls_low &= 2799 vmx->nested.nested_vmx_procbased_ctls_low &=
2798 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 2800 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2799 2801
2800 /* secondary cpu-based controls */ 2802 /*
2803 * secondary cpu-based controls. Do not include those that
2804 * depend on CPUID bits, they are added later by vmx_cpuid_update.
2805 */
2801 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 2806 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2802 vmx->nested.nested_vmx_secondary_ctls_low, 2807 vmx->nested.nested_vmx_secondary_ctls_low,
2803 vmx->nested.nested_vmx_secondary_ctls_high); 2808 vmx->nested.nested_vmx_secondary_ctls_high);
2804 vmx->nested.nested_vmx_secondary_ctls_low = 0; 2809 vmx->nested.nested_vmx_secondary_ctls_low = 0;
2805 vmx->nested.nested_vmx_secondary_ctls_high &= 2810 vmx->nested.nested_vmx_secondary_ctls_high &=
2806 SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED |
2807 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2811 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2808 SECONDARY_EXEC_RDTSCP |
2809 SECONDARY_EXEC_DESC | 2812 SECONDARY_EXEC_DESC |
2810 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2813 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2811 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2814 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2812 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2815 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2813 SECONDARY_EXEC_WBINVD_EXITING | 2816 SECONDARY_EXEC_WBINVD_EXITING;
2814 SECONDARY_EXEC_XSAVES;
2815 2817
2816 if (enable_ept) { 2818 if (enable_ept) {
2817 /* nested EPT: emulate EPT also to L1 */ 2819 /* nested EPT: emulate EPT also to L1 */
@@ -2834,6 +2836,17 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2834 } else 2836 } else
2835 vmx->nested.nested_vmx_ept_caps = 0; 2837 vmx->nested.nested_vmx_ept_caps = 0;
2836 2838
2839 if (cpu_has_vmx_vmfunc()) {
2840 vmx->nested.nested_vmx_secondary_ctls_high |=
2841 SECONDARY_EXEC_ENABLE_VMFUNC;
2842 /*
2843 * Advertise EPTP switching unconditionally
2844 * since we emulate it
2845 */
2846 vmx->nested.nested_vmx_vmfunc_controls =
2847 VMX_VMFUNC_EPTP_SWITCHING;
2848 }
2849
2837 /* 2850 /*
2838 * Old versions of KVM use the single-context version without 2851 * Old versions of KVM use the single-context version without
2839 * checking for support, so declare that it is supported even 2852 * checking for support, so declare that it is supported even
@@ -3203,6 +3216,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
3203 *pdata = vmx->nested.nested_vmx_ept_caps | 3216 *pdata = vmx->nested.nested_vmx_ept_caps |
3204 ((u64)vmx->nested.nested_vmx_vpid_caps << 32); 3217 ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
3205 break; 3218 break;
3219 case MSR_IA32_VMX_VMFUNC:
3220 *pdata = vmx->nested.nested_vmx_vmfunc_controls;
3221 break;
3206 default: 3222 default:
3207 return 1; 3223 return 1;
3208 } 3224 }
@@ -3256,7 +3272,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3256 break; 3272 break;
3257 case MSR_IA32_BNDCFGS: 3273 case MSR_IA32_BNDCFGS:
3258 if (!kvm_mpx_supported() || 3274 if (!kvm_mpx_supported() ||
3259 (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) 3275 (!msr_info->host_initiated &&
3276 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
3260 return 1; 3277 return 1;
3261 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 3278 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
3262 break; 3279 break;
@@ -3280,7 +3297,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3280 msr_info->data = vcpu->arch.ia32_xss; 3297 msr_info->data = vcpu->arch.ia32_xss;
3281 break; 3298 break;
3282 case MSR_TSC_AUX: 3299 case MSR_TSC_AUX:
3283 if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) 3300 if (!msr_info->host_initiated &&
3301 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
3284 return 1; 3302 return 1;
3285 /* Otherwise falls through */ 3303 /* Otherwise falls through */
3286 default: 3304 default:
@@ -3339,9 +3357,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3339 break; 3357 break;
3340 case MSR_IA32_BNDCFGS: 3358 case MSR_IA32_BNDCFGS:
3341 if (!kvm_mpx_supported() || 3359 if (!kvm_mpx_supported() ||
3342 (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) 3360 (!msr_info->host_initiated &&
3361 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
3343 return 1; 3362 return 1;
3344 if (is_noncanonical_address(data & PAGE_MASK) || 3363 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
3345 (data & MSR_IA32_BNDCFGS_RSVD)) 3364 (data & MSR_IA32_BNDCFGS_RSVD))
3346 return 1; 3365 return 1;
3347 vmcs_write64(GUEST_BNDCFGS, data); 3366 vmcs_write64(GUEST_BNDCFGS, data);
@@ -3402,7 +3421,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3402 clear_atomic_switch_msr(vmx, MSR_IA32_XSS); 3421 clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
3403 break; 3422 break;
3404 case MSR_TSC_AUX: 3423 case MSR_TSC_AUX:
3405 if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) 3424 if (!msr_info->host_initiated &&
3425 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
3406 return 1; 3426 return 1;
3407 /* Check reserved bit, higher 32 bits should be zero */ 3427 /* Check reserved bit, higher 32 bits should be zero */
3408 if ((data >> 32) != 0) 3428 if ((data >> 32) != 0)
@@ -3639,8 +3659,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3639 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3659 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3640 SECONDARY_EXEC_SHADOW_VMCS | 3660 SECONDARY_EXEC_SHADOW_VMCS |
3641 SECONDARY_EXEC_XSAVES | 3661 SECONDARY_EXEC_XSAVES |
3662 SECONDARY_EXEC_RDSEED |
3663 SECONDARY_EXEC_RDRAND |
3642 SECONDARY_EXEC_ENABLE_PML | 3664 SECONDARY_EXEC_ENABLE_PML |
3643 SECONDARY_EXEC_TSC_SCALING; 3665 SECONDARY_EXEC_TSC_SCALING |
3666 SECONDARY_EXEC_ENABLE_VMFUNC;
3644 if (adjust_vmx_controls(min2, opt2, 3667 if (adjust_vmx_controls(min2, opt2,
3645 MSR_IA32_VMX_PROCBASED_CTLS2, 3668 MSR_IA32_VMX_PROCBASED_CTLS2,
3646 &_cpu_based_2nd_exec_control) < 0) 3669 &_cpu_based_2nd_exec_control) < 0)
@@ -4272,16 +4295,22 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
4272 vmx->emulation_required = emulation_required(vcpu); 4295 vmx->emulation_required = emulation_required(vcpu);
4273} 4296}
4274 4297
4298static int get_ept_level(struct kvm_vcpu *vcpu)
4299{
4300 if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
4301 return 5;
4302 return 4;
4303}
4304
4275static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) 4305static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
4276{ 4306{
4277 u64 eptp; 4307 u64 eptp = VMX_EPTP_MT_WB;
4308
4309 eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
4278 4310
4279 /* TODO write the value reading from MSR */
4280 eptp = VMX_EPT_DEFAULT_MT |
4281 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
4282 if (enable_ept_ad_bits && 4311 if (enable_ept_ad_bits &&
4283 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 4312 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
4284 eptp |= VMX_EPT_AD_ENABLE_BIT; 4313 eptp |= VMX_EPTP_AD_ENABLE_BIT;
4285 eptp |= (root_hpa & PAGE_MASK); 4314 eptp |= (root_hpa & PAGE_MASK);
4286 4315
4287 return eptp; 4316 return eptp;
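
construct_eptp() now always asks for the write-back memory type and derives the page-walk length from get_ept_level(). The EPTP value being assembled packs the memory type into the low three bits, the walk length minus one into bits 5:3, the accessed/dirty enable into bit 6, and the page-aligned root table address above that. A hedged standalone sketch of the same packing; the constants are written out from that layout rather than taken from the kernel headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EPTP_MT_WB        6ULL          /* write-back memory type       */
#define EPTP_PWL_SHIFT    3             /* page-walk length field       */
#define EPTP_AD_ENABLE    (1ULL << 6)   /* accessed/dirty flags enable  */
#define PAGE_MASK_4K      (~0xfffULL)

static uint64_t build_eptp(uint64_t root_hpa, int levels, bool ad_bits)
{
        uint64_t eptp = EPTP_MT_WB;

        eptp |= (uint64_t)(levels - 1) << EPTP_PWL_SHIFT;
        if (ad_bits)
                eptp |= EPTP_AD_ENABLE;
        eptp |= root_hpa & PAGE_MASK_4K;

        return eptp;
}

int main(void)
{
        printf("%#llx\n", (unsigned long long)build_eptp(0x12345000, 4, true));
        printf("%#llx\n", (unsigned long long)build_eptp(0x12345000, 5, false));
        return 0;
}
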
@@ -5243,10 +5272,24 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
5243 return exec_control; 5272 return exec_control;
5244} 5273}
5245 5274
5246static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 5275static bool vmx_rdrand_supported(void)
5247{ 5276{
5277 return vmcs_config.cpu_based_2nd_exec_ctrl &
5278 SECONDARY_EXEC_RDRAND;
5279}
5280
5281static bool vmx_rdseed_supported(void)
5282{
5283 return vmcs_config.cpu_based_2nd_exec_ctrl &
5284 SECONDARY_EXEC_RDSEED;
5285}
5286
5287static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5288{
5289 struct kvm_vcpu *vcpu = &vmx->vcpu;
5290
5248 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 5291 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
5249 if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu)) 5292 if (!cpu_need_virtualize_apic_accesses(vcpu))
5250 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 5293 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
5251 if (vmx->vpid == 0) 5294 if (vmx->vpid == 0)
5252 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 5295 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
@@ -5260,7 +5303,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
5260 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 5303 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
5261 if (!ple_gap) 5304 if (!ple_gap)
5262 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 5305 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
5263 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 5306 if (!kvm_vcpu_apicv_active(vcpu))
5264 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 5307 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
5265 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 5308 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
5266 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 5309 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -5274,7 +5317,92 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
5274 if (!enable_pml) 5317 if (!enable_pml)
5275 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 5318 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
5276 5319
5277 return exec_control; 5320 if (vmx_xsaves_supported()) {
5321 /* Exposing XSAVES only when XSAVE is exposed */
5322 bool xsaves_enabled =
5323 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
5324 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
5325
5326 if (!xsaves_enabled)
5327 exec_control &= ~SECONDARY_EXEC_XSAVES;
5328
5329 if (nested) {
5330 if (xsaves_enabled)
5331 vmx->nested.nested_vmx_secondary_ctls_high |=
5332 SECONDARY_EXEC_XSAVES;
5333 else
5334 vmx->nested.nested_vmx_secondary_ctls_high &=
5335 ~SECONDARY_EXEC_XSAVES;
5336 }
5337 }
5338
5339 if (vmx_rdtscp_supported()) {
5340 bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
5341 if (!rdtscp_enabled)
5342 exec_control &= ~SECONDARY_EXEC_RDTSCP;
5343
5344 if (nested) {
5345 if (rdtscp_enabled)
5346 vmx->nested.nested_vmx_secondary_ctls_high |=
5347 SECONDARY_EXEC_RDTSCP;
5348 else
5349 vmx->nested.nested_vmx_secondary_ctls_high &=
5350 ~SECONDARY_EXEC_RDTSCP;
5351 }
5352 }
5353
5354 if (vmx_invpcid_supported()) {
5355 /* Exposing INVPCID only when PCID is exposed */
5356 bool invpcid_enabled =
5357 guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
5358 guest_cpuid_has(vcpu, X86_FEATURE_PCID);
5359
5360 if (!invpcid_enabled) {
5361 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
5362 guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
5363 }
5364
5365 if (nested) {
5366 if (invpcid_enabled)
5367 vmx->nested.nested_vmx_secondary_ctls_high |=
5368 SECONDARY_EXEC_ENABLE_INVPCID;
5369 else
5370 vmx->nested.nested_vmx_secondary_ctls_high &=
5371 ~SECONDARY_EXEC_ENABLE_INVPCID;
5372 }
5373 }
5374
5375 if (vmx_rdrand_supported()) {
5376 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
5377 if (rdrand_enabled)
5378 exec_control &= ~SECONDARY_EXEC_RDRAND;
5379
5380 if (nested) {
5381 if (rdrand_enabled)
5382 vmx->nested.nested_vmx_secondary_ctls_high |=
5383 SECONDARY_EXEC_RDRAND;
5384 else
5385 vmx->nested.nested_vmx_secondary_ctls_high &=
5386 ~SECONDARY_EXEC_RDRAND;
5387 }
5388 }
5389
5390 if (vmx_rdseed_supported()) {
5391 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
5392 if (rdseed_enabled)
5393 exec_control &= ~SECONDARY_EXEC_RDSEED;
5394
5395 if (nested) {
5396 if (rdseed_enabled)
5397 vmx->nested.nested_vmx_secondary_ctls_high |=
5398 SECONDARY_EXEC_RDSEED;
5399 else
5400 vmx->nested.nested_vmx_secondary_ctls_high &=
5401 ~SECONDARY_EXEC_RDSEED;
5402 }
5403 }
5404
5405 vmx->secondary_exec_control = exec_control;
5278} 5406}
5279 5407
5280static void ept_set_mmio_spte_mask(void) 5408static void ept_set_mmio_spte_mask(void)
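
Each feature handled in vmx_compute_secondary_exec_control() above follows the same pattern: decide from guest CPUID whether the feature is visible, clear the matching secondary exec control when it is not, and, when nested is enabled, mirror the result into the control value advertised to L1. (For RDRAND and RDSEED the sense is inverted, since those controls cause exits rather than enable an instruction.) A generic sketch of the non-inverted pattern with invented names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fake_vmx {
        uint32_t secondary_exec_control;
        uint32_t nested_secondary_ctls_high;
        bool nested;
};

/*
 * If the guest does not see the feature, strip the exec control; either
 * way keep the nested control value consistent with the guest's CPUID.
 */
static void adjust_secondary_control(struct fake_vmx *vmx, uint32_t ctrl_bit,
                                     bool guest_has_feature)
{
        if (!guest_has_feature)
                vmx->secondary_exec_control &= ~ctrl_bit;

        if (vmx->nested) {
                if (guest_has_feature)
                        vmx->nested_secondary_ctls_high |= ctrl_bit;
                else
                        vmx->nested_secondary_ctls_high &= ~ctrl_bit;
        }
}

int main(void)
{
        struct fake_vmx vmx = { .secondary_exec_control = 0xff, .nested = true };

        adjust_secondary_control(&vmx, 1u << 3, false); /* e.g. feature hidden  */
        adjust_secondary_control(&vmx, 1u << 12, true); /* e.g. feature exposed */
        printf("exec %#x nested %#x\n",
               (unsigned)vmx.secondary_exec_control,
               (unsigned)vmx.nested_secondary_ctls_high);
        return 0;
}
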
@@ -5318,8 +5446,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
5318 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); 5446 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
5319 5447
5320 if (cpu_has_secondary_exec_ctrls()) { 5448 if (cpu_has_secondary_exec_ctrls()) {
5449 vmx_compute_secondary_exec_control(vmx);
5321 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 5450 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
5322 vmx_secondary_exec_control(vmx)); 5451 vmx->secondary_exec_control);
5323 } 5452 }
5324 5453
5325 if (kvm_vcpu_apicv_active(&vmx->vcpu)) { 5454 if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
@@ -5357,6 +5486,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
5357 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 5486 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
5358#endif 5487#endif
5359 5488
5489 if (cpu_has_vmx_vmfunc())
5490 vmcs_write64(VM_FUNCTION_CONTROL, 0);
5491
5360 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 5492 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
5361 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 5493 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
5362 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); 5494 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
@@ -5835,6 +5967,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
5835static int handle_triple_fault(struct kvm_vcpu *vcpu) 5967static int handle_triple_fault(struct kvm_vcpu *vcpu)
5836{ 5968{
5837 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5969 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5970 vcpu->mmio_needed = 0;
5838 return 0; 5971 return 0;
5839} 5972}
5840 5973
@@ -6330,7 +6463,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
6330{ 6463{
6331 unsigned long exit_qualification; 6464 unsigned long exit_qualification;
6332 gpa_t gpa; 6465 gpa_t gpa;
6333 u32 error_code; 6466 u64 error_code;
6334 6467
6335 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6468 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6336 6469
@@ -6362,9 +6495,10 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
6362 EPT_VIOLATION_EXECUTABLE)) 6495 EPT_VIOLATION_EXECUTABLE))
6363 ? PFERR_PRESENT_MASK : 0; 6496 ? PFERR_PRESENT_MASK : 0;
6364 6497
6365 vcpu->arch.gpa_available = true; 6498 error_code |= (exit_qualification & 0x100) != 0 ?
6366 vcpu->arch.exit_qualification = exit_qualification; 6499 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
6367 6500
6501 vcpu->arch.exit_qualification = exit_qualification;
6368 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 6502 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
6369} 6503}
6370 6504
@@ -6373,23 +6507,20 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6373 int ret; 6507 int ret;
6374 gpa_t gpa; 6508 gpa_t gpa;
6375 6509
6510 /*
6511 * A nested guest cannot optimize MMIO vmexits, because we have an
6512 * nGPA here instead of the required GPA.
6513 */
6376 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 6514 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6377 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 6515 if (!is_guest_mode(vcpu) &&
6516 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
6378 trace_kvm_fast_mmio(gpa); 6517 trace_kvm_fast_mmio(gpa);
6379 return kvm_skip_emulated_instruction(vcpu); 6518 return kvm_skip_emulated_instruction(vcpu);
6380 } 6519 }
6381 6520
6382 ret = handle_mmio_page_fault(vcpu, gpa, true); 6521 ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
6383 vcpu->arch.gpa_available = true; 6522 if (ret >= 0)
6384 if (likely(ret == RET_MMIO_PF_EMULATE)) 6523 return ret;
6385 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
6386 EMULATE_DONE;
6387
6388 if (unlikely(ret == RET_MMIO_PF_INVALID))
6389 return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
6390
6391 if (unlikely(ret == RET_MMIO_PF_RETRY))
6392 return 1;
6393 6524
6394 /* It is the real ept misconfig */ 6525 /* It is the real ept misconfig */
6395 WARN_ON(1); 6526 WARN_ON(1);
@@ -6611,7 +6742,8 @@ static __init int hardware_setup(void)
6611 init_vmcs_shadow_fields(); 6742 init_vmcs_shadow_fields();
6612 6743
6613 if (!cpu_has_vmx_ept() || 6744 if (!cpu_has_vmx_ept() ||
6614 !cpu_has_vmx_ept_4levels()) { 6745 !cpu_has_vmx_ept_4levels() ||
6746 !cpu_has_vmx_ept_mt_wb()) {
6615 enable_ept = 0; 6747 enable_ept = 0;
6616 enable_unrestricted_guest = 0; 6748 enable_unrestricted_guest = 0;
6617 enable_ept_ad_bits = 0; 6749 enable_ept_ad_bits = 0;
@@ -6754,7 +6886,13 @@ static int handle_pause(struct kvm_vcpu *vcpu)
6754 if (ple_gap) 6886 if (ple_gap)
6755 grow_ple_window(vcpu); 6887 grow_ple_window(vcpu);
6756 6888
6757 kvm_vcpu_on_spin(vcpu); 6889 /*
6890 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
6891 * VM-execution control is ignored if CPL > 0. OTOH, KVM
6892 * never set PAUSE_EXITING and just set PLE if supported,
6893 * so the vcpu must be CPL=0 if it gets a PAUSE exit.
6894 */
6895 kvm_vcpu_on_spin(vcpu, true);
6758 return kvm_skip_emulated_instruction(vcpu); 6896 return kvm_skip_emulated_instruction(vcpu);
6759} 6897}
6760 6898
@@ -6769,6 +6907,12 @@ static int handle_mwait(struct kvm_vcpu *vcpu)
6769 return handle_nop(vcpu); 6907 return handle_nop(vcpu);
6770} 6908}
6771 6909
6910static int handle_invalid_op(struct kvm_vcpu *vcpu)
6911{
6912 kvm_queue_exception(vcpu, UD_VECTOR);
6913 return 1;
6914}
6915
6772static int handle_monitor_trap(struct kvm_vcpu *vcpu) 6916static int handle_monitor_trap(struct kvm_vcpu *vcpu)
6773{ 6917{
6774 return 1; 6918 return 1;
@@ -6985,7 +7129,7 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
6985 * non-canonical form. This is the only check on the memory 7129 * non-canonical form. This is the only check on the memory
6986 * destination for long mode! 7130 * destination for long mode!
6987 */ 7131 */
6988 exn = is_noncanonical_address(*ret); 7132 exn = is_noncanonical_address(*ret, vcpu);
6989 } else if (is_protmode(vcpu)) { 7133 } else if (is_protmode(vcpu)) {
6990 /* Protected mode: apply checks for segment validity in the 7134 /* Protected mode: apply checks for segment validity in the
6991 * following order: 7135 * following order:
@@ -7149,19 +7293,19 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
7149 return kvm_skip_emulated_instruction(vcpu); 7293 return kvm_skip_emulated_instruction(vcpu);
7150 } 7294 }
7151 7295
7152 page = nested_get_page(vcpu, vmptr); 7296 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
7153 if (page == NULL) { 7297 if (is_error_page(page)) {
7154 nested_vmx_failInvalid(vcpu); 7298 nested_vmx_failInvalid(vcpu);
7155 return kvm_skip_emulated_instruction(vcpu); 7299 return kvm_skip_emulated_instruction(vcpu);
7156 } 7300 }
7157 if (*(u32 *)kmap(page) != VMCS12_REVISION) { 7301 if (*(u32 *)kmap(page) != VMCS12_REVISION) {
7158 kunmap(page); 7302 kunmap(page);
7159 nested_release_page_clean(page); 7303 kvm_release_page_clean(page);
7160 nested_vmx_failInvalid(vcpu); 7304 nested_vmx_failInvalid(vcpu);
7161 return kvm_skip_emulated_instruction(vcpu); 7305 return kvm_skip_emulated_instruction(vcpu);
7162 } 7306 }
7163 kunmap(page); 7307 kunmap(page);
7164 nested_release_page_clean(page); 7308 kvm_release_page_clean(page);
7165 7309
7166 vmx->nested.vmxon_ptr = vmptr; 7310 vmx->nested.vmxon_ptr = vmptr;
7167 ret = enter_vmx_operation(vcpu); 7311 ret = enter_vmx_operation(vcpu);
@@ -7242,16 +7386,16 @@ static void free_nested(struct vcpu_vmx *vmx)
7242 kfree(vmx->nested.cached_vmcs12); 7386 kfree(vmx->nested.cached_vmcs12);
7243 /* Unpin physical memory we referred to in current vmcs02 */ 7387 /* Unpin physical memory we referred to in current vmcs02 */
7244 if (vmx->nested.apic_access_page) { 7388 if (vmx->nested.apic_access_page) {
7245 nested_release_page(vmx->nested.apic_access_page); 7389 kvm_release_page_dirty(vmx->nested.apic_access_page);
7246 vmx->nested.apic_access_page = NULL; 7390 vmx->nested.apic_access_page = NULL;
7247 } 7391 }
7248 if (vmx->nested.virtual_apic_page) { 7392 if (vmx->nested.virtual_apic_page) {
7249 nested_release_page(vmx->nested.virtual_apic_page); 7393 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
7250 vmx->nested.virtual_apic_page = NULL; 7394 vmx->nested.virtual_apic_page = NULL;
7251 } 7395 }
7252 if (vmx->nested.pi_desc_page) { 7396 if (vmx->nested.pi_desc_page) {
7253 kunmap(vmx->nested.pi_desc_page); 7397 kunmap(vmx->nested.pi_desc_page);
7254 nested_release_page(vmx->nested.pi_desc_page); 7398 kvm_release_page_dirty(vmx->nested.pi_desc_page);
7255 vmx->nested.pi_desc_page = NULL; 7399 vmx->nested.pi_desc_page = NULL;
7256 vmx->nested.pi_desc = NULL; 7400 vmx->nested.pi_desc = NULL;
7257 } 7401 }
@@ -7618,15 +7762,15 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
7618 if (vmx->nested.current_vmptr != vmptr) { 7762 if (vmx->nested.current_vmptr != vmptr) {
7619 struct vmcs12 *new_vmcs12; 7763 struct vmcs12 *new_vmcs12;
7620 struct page *page; 7764 struct page *page;
7621 page = nested_get_page(vcpu, vmptr); 7765 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
7622 if (page == NULL) { 7766 if (is_error_page(page)) {
7623 nested_vmx_failInvalid(vcpu); 7767 nested_vmx_failInvalid(vcpu);
7624 return kvm_skip_emulated_instruction(vcpu); 7768 return kvm_skip_emulated_instruction(vcpu);
7625 } 7769 }
7626 new_vmcs12 = kmap(page); 7770 new_vmcs12 = kmap(page);
7627 if (new_vmcs12->revision_id != VMCS12_REVISION) { 7771 if (new_vmcs12->revision_id != VMCS12_REVISION) {
7628 kunmap(page); 7772 kunmap(page);
7629 nested_release_page_clean(page); 7773 kvm_release_page_clean(page);
7630 nested_vmx_failValid(vcpu, 7774 nested_vmx_failValid(vcpu,
7631 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 7775 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
7632 return kvm_skip_emulated_instruction(vcpu); 7776 return kvm_skip_emulated_instruction(vcpu);
@@ -7639,7 +7783,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
7639 */ 7783 */
7640 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 7784 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
7641 kunmap(page); 7785 kunmap(page);
7642 nested_release_page_clean(page); 7786 kvm_release_page_clean(page);
7643 7787
7644 set_current_vmptr(vmx, vmptr); 7788 set_current_vmptr(vmx, vmptr);
7645 } 7789 }
@@ -7790,7 +7934,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
7790 7934
7791 switch (type) { 7935 switch (type) {
7792 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 7936 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
7793 if (is_noncanonical_address(operand.gla)) { 7937 if (is_noncanonical_address(operand.gla, vcpu)) {
7794 nested_vmx_failValid(vcpu, 7938 nested_vmx_failValid(vcpu,
7795 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 7939 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7796 return kvm_skip_emulated_instruction(vcpu); 7940 return kvm_skip_emulated_instruction(vcpu);
@@ -7847,6 +7991,124 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
7847 return 1; 7991 return 1;
7848} 7992}
7849 7993
7994static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
7995{
7996 struct vcpu_vmx *vmx = to_vmx(vcpu);
7997 int maxphyaddr = cpuid_maxphyaddr(vcpu);
7998
7999 /* Check for memory type validity */
8000 switch (address & VMX_EPTP_MT_MASK) {
8001 case VMX_EPTP_MT_UC:
8002 if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
8003 return false;
8004 break;
8005 case VMX_EPTP_MT_WB:
8006 if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT))
8007 return false;
8008 break;
8009 default:
8010 return false;
8011 }
8012
8013 /* only 4 levels page-walk length are valid */
8014 if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
8015 return false;
8016
8017 /* Reserved bits should not be set */
8018 if (address >> maxphyaddr || ((address >> 7) & 0x1f))
8019 return false;
8020
8021 /* AD, if set, should be supported */
8022 if (address & VMX_EPTP_AD_ENABLE_BIT) {
8023 if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT))
8024 return false;
8025 }
8026
8027 return true;
8028}
8029
8030static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
8031 struct vmcs12 *vmcs12)
8032{
8033 u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
8034 u64 address;
8035 bool accessed_dirty;
8036 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
8037
8038 if (!nested_cpu_has_eptp_switching(vmcs12) ||
8039 !nested_cpu_has_ept(vmcs12))
8040 return 1;
8041
8042 if (index >= VMFUNC_EPTP_ENTRIES)
8043 return 1;
8044
8045
8046 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
8047 &address, index * 8, 8))
8048 return 1;
8049
8050 accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
8051
8052 /*
8053 * If the (L2) guest does a vmfunc to the currently
8054 * active ept pointer, we don't have to do anything else
8055 */
8056 if (vmcs12->ept_pointer != address) {
8057 if (!valid_ept_address(vcpu, address))
8058 return 1;
8059
8060 kvm_mmu_unload(vcpu);
8061 mmu->ept_ad = accessed_dirty;
8062 mmu->base_role.ad_disabled = !accessed_dirty;
8063 vmcs12->ept_pointer = address;
8064 /*
8065		 * TODO: Check what the correct approach is in case
8066		 * the mmu reload fails. Currently, we just let the next
8067		 * reload potentially fail.
8068 */
8069 kvm_mmu_reload(vcpu);
8070 }
8071
8072 return 0;
8073}
8074
8075static int handle_vmfunc(struct kvm_vcpu *vcpu)
8076{
8077 struct vcpu_vmx *vmx = to_vmx(vcpu);
8078 struct vmcs12 *vmcs12;
8079 u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
8080
8081 /*
8082 * VMFUNC is only supported for nested guests, but we always enable the
8083 * secondary control for simplicity; for non-nested mode, fake that we
8084	 * didn't enable it by injecting #UD.
8085 */
8086 if (!is_guest_mode(vcpu)) {
8087 kvm_queue_exception(vcpu, UD_VECTOR);
8088 return 1;
8089 }
8090
8091 vmcs12 = get_vmcs12(vcpu);
8092 if ((vmcs12->vm_function_control & (1 << function)) == 0)
8093 goto fail;
8094
8095 switch (function) {
8096 case 0:
8097 if (nested_vmx_eptp_switching(vcpu, vmcs12))
8098 goto fail;
8099 break;
8100 default:
8101 goto fail;
8102 }
8103 return kvm_skip_emulated_instruction(vcpu);
8104
8105fail:
8106 nested_vmx_vmexit(vcpu, vmx->exit_reason,
8107 vmcs_read32(VM_EXIT_INTR_INFO),
8108 vmcs_readl(EXIT_QUALIFICATION));
8109 return 1;
8110}
8111
7850/* 8112/*
7851 * The exit handlers return 1 if the exit was handled fully and guest execution 8113 * The exit handlers return 1 if the exit was handled fully and guest execution
7852 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 8114 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7894,9 +8156,12 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7894 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, 8156 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
7895 [EXIT_REASON_INVEPT] = handle_invept, 8157 [EXIT_REASON_INVEPT] = handle_invept,
7896 [EXIT_REASON_INVVPID] = handle_invvpid, 8158 [EXIT_REASON_INVVPID] = handle_invvpid,
8159 [EXIT_REASON_RDRAND] = handle_invalid_op,
8160 [EXIT_REASON_RDSEED] = handle_invalid_op,
7897 [EXIT_REASON_XSAVES] = handle_xsaves, 8161 [EXIT_REASON_XSAVES] = handle_xsaves,
7898 [EXIT_REASON_XRSTORS] = handle_xrstors, 8162 [EXIT_REASON_XRSTORS] = handle_xrstors,
7899 [EXIT_REASON_PML_FULL] = handle_pml_full, 8163 [EXIT_REASON_PML_FULL] = handle_pml_full,
8164 [EXIT_REASON_VMFUNC] = handle_vmfunc,
7900 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 8165 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
7901}; 8166};
7902 8167
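
valid_ept_address() above accepts an EPTP only if the memory type is one the
CPU advertises (UC or WB), the page-walk length field encodes a 4-level walk,
bits 11:7 and all bits at or above the guest's MAXPHYADDR are clear, and the
accessed/dirty enable bit is set only when EPT A/D is supported. As a rough
illustration, a value that would pass those checks can be assembled like this
(a standalone sketch, not kernel code; the constants are local stand-ins for
the kernel's VMX_EPTP_* definitions and follow the SDM's EPTP layout):

#include <stdint.h>
#include <stdio.h>

#define EPTP_MT_WB	0x6ull		/* memory type, bits 2:0: write-back */
#define EPTP_PWL_4	(3ull << 3)	/* page-walk length minus 1, bits 5:3 */
#define EPTP_AD_ENABLE	(1ull << 6)	/* accessed/dirty flags enable */

int main(void)
{
	uint64_t pml4_pa = 0x12345000ull;	/* page-aligned PML4 address (example) */
	uint64_t eptp = pml4_pa | EPTP_MT_WB | EPTP_PWL_4 | EPTP_AD_ENABLE;

	/* Bits 11:7 stay zero, matching the "(address >> 7) & 0x1f" check. */
	printf("eptp = %#llx\n", (unsigned long long)eptp);
	return 0;
}

handle_vmfunc() then dispatches only VM function 0 (EPTP switching); an index
out of range or a function bit not set in vm_function_control is reflected
back to L1 as a nested vmexit through the fail path.
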
@@ -8212,6 +8477,10 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
8212 * table is L0's fault. 8477 * table is L0's fault.
8213 */ 8478 */
8214 return false; 8479 return false;
8480 case EXIT_REASON_INVPCID:
8481 return
8482 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
8483 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
8215 case EXIT_REASON_WBINVD: 8484 case EXIT_REASON_WBINVD:
8216 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 8485 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
8217 case EXIT_REASON_XSETBV: 8486 case EXIT_REASON_XSETBV:
@@ -8229,6 +8498,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
8229 case EXIT_REASON_PML_FULL: 8498 case EXIT_REASON_PML_FULL:
8230 /* We emulate PML support to L1. */ 8499 /* We emulate PML support to L1. */
8231 return false; 8500 return false;
8501 case EXIT_REASON_VMFUNC:
8502 /* VM functions are emulated through L2->L0 vmexits. */
8503 return false;
8232 default: 8504 default:
8233 return true; 8505 return true;
8234 } 8506 }
@@ -8487,7 +8759,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
8487 u32 vectoring_info = vmx->idt_vectoring_info; 8759 u32 vectoring_info = vmx->idt_vectoring_info;
8488 8760
8489 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); 8761 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
8490 vcpu->arch.gpa_available = false;
8491 8762
8492 /* 8763 /*
8493 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 8764 * Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -9341,11 +9612,6 @@ static void __init vmx_check_processor_compat(void *rtn)
9341 } 9612 }
9342} 9613}
9343 9614
9344static int get_ept_level(void)
9345{
9346 return VMX_EPT_DEFAULT_GAW + 1;
9347}
9348
9349static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 9615static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
9350{ 9616{
9351 u8 cache; 9617 u8 cache;
@@ -9462,39 +9728,13 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
9462 9728
9463static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 9729static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
9464{ 9730{
9465 struct kvm_cpuid_entry2 *best;
9466 struct vcpu_vmx *vmx = to_vmx(vcpu); 9731 struct vcpu_vmx *vmx = to_vmx(vcpu);
9467 u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
9468 9732
9469 if (vmx_rdtscp_supported()) { 9733 if (cpu_has_secondary_exec_ctrls()) {
9470 bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu); 9734 vmx_compute_secondary_exec_control(vmx);
9471 if (!rdtscp_enabled) 9735 vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
9472 secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
9473
9474 if (nested) {
9475 if (rdtscp_enabled)
9476 vmx->nested.nested_vmx_secondary_ctls_high |=
9477 SECONDARY_EXEC_RDTSCP;
9478 else
9479 vmx->nested.nested_vmx_secondary_ctls_high &=
9480 ~SECONDARY_EXEC_RDTSCP;
9481 }
9482 }
9483
9484 /* Exposing INVPCID only when PCID is exposed */
9485 best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
9486 if (vmx_invpcid_supported() &&
9487 (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
9488 !guest_cpuid_has_pcid(vcpu))) {
9489 secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
9490
9491 if (best)
9492 best->ebx &= ~bit(X86_FEATURE_INVPCID);
9493 } 9736 }
9494 9737
9495 if (cpu_has_secondary_exec_ctrls())
9496 vmcs_set_secondary_exec_control(secondary_exec_ctl);
9497
9498 if (nested_vmx_allowed(vcpu)) 9738 if (nested_vmx_allowed(vcpu))
9499 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 9739 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
9500 FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 9740 FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -9535,7 +9775,7 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
9535 9775
9536static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) 9776static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
9537{ 9777{
9538 return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT; 9778 return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
9539} 9779}
9540 9780
9541/* Callbacks for nested_ept_init_mmu_context: */ 9781/* Callbacks for nested_ept_init_mmu_context: */
@@ -9548,18 +9788,15 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
9548 9788
9549static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 9789static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
9550{ 9790{
9551 bool wants_ad;
9552
9553 WARN_ON(mmu_is_nested(vcpu)); 9791 WARN_ON(mmu_is_nested(vcpu));
9554 wants_ad = nested_ept_ad_enabled(vcpu); 9792 if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
9555 if (wants_ad && !enable_ept_ad_bits)
9556 return 1; 9793 return 1;
9557 9794
9558 kvm_mmu_unload(vcpu); 9795 kvm_mmu_unload(vcpu);
9559 kvm_init_shadow_ept_mmu(vcpu, 9796 kvm_init_shadow_ept_mmu(vcpu,
9560 to_vmx(vcpu)->nested.nested_vmx_ept_caps & 9797 to_vmx(vcpu)->nested.nested_vmx_ept_caps &
9561 VMX_EPT_EXECUTE_ONLY_BIT, 9798 VMX_EPT_EXECUTE_ONLY_BIT,
9562 wants_ad); 9799 nested_ept_ad_enabled(vcpu));
9563 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 9800 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
9564 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; 9801 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
9565 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; 9802 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -9610,6 +9847,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
9610 struct vmcs12 *vmcs12) 9847 struct vmcs12 *vmcs12)
9611{ 9848{
9612 struct vcpu_vmx *vmx = to_vmx(vcpu); 9849 struct vcpu_vmx *vmx = to_vmx(vcpu);
9850 struct page *page;
9613 u64 hpa; 9851 u64 hpa;
9614 9852
9615 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 9853 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -9619,17 +9857,19 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
9619 * physical address remains valid. We keep a reference 9857 * physical address remains valid. We keep a reference
9620 * to it so we can release it later. 9858 * to it so we can release it later.
9621 */ 9859 */
9622 if (vmx->nested.apic_access_page) /* shouldn't happen */ 9860 if (vmx->nested.apic_access_page) { /* shouldn't happen */
9623 nested_release_page(vmx->nested.apic_access_page); 9861 kvm_release_page_dirty(vmx->nested.apic_access_page);
9624 vmx->nested.apic_access_page = 9862 vmx->nested.apic_access_page = NULL;
9625 nested_get_page(vcpu, vmcs12->apic_access_addr); 9863 }
9864 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
9626 /* 9865 /*
9627 * If translation failed, no matter: This feature asks 9866 * If translation failed, no matter: This feature asks
9628 * to exit when accessing the given address, and if it 9867 * to exit when accessing the given address, and if it
9629 * can never be accessed, this feature won't do 9868 * can never be accessed, this feature won't do
9630 * anything anyway. 9869 * anything anyway.
9631 */ 9870 */
9632 if (vmx->nested.apic_access_page) { 9871 if (!is_error_page(page)) {
9872 vmx->nested.apic_access_page = page;
9633 hpa = page_to_phys(vmx->nested.apic_access_page); 9873 hpa = page_to_phys(vmx->nested.apic_access_page);
9634 vmcs_write64(APIC_ACCESS_ADDR, hpa); 9874 vmcs_write64(APIC_ACCESS_ADDR, hpa);
9635 } else { 9875 } else {
@@ -9644,10 +9884,11 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
9644 } 9884 }
9645 9885
9646 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 9886 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
9647 if (vmx->nested.virtual_apic_page) /* shouldn't happen */ 9887 if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
9648 nested_release_page(vmx->nested.virtual_apic_page); 9888 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
9649 vmx->nested.virtual_apic_page = 9889 vmx->nested.virtual_apic_page = NULL;
9650 nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); 9890 }
9891 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
9651 9892
9652 /* 9893 /*
9653 * If translation failed, VM entry will fail because 9894 * If translation failed, VM entry will fail because
@@ -9662,7 +9903,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
9662 * control. But such a configuration is useless, so 9903 * control. But such a configuration is useless, so
9663 * let's keep the code simple. 9904 * let's keep the code simple.
9664 */ 9905 */
9665 if (vmx->nested.virtual_apic_page) { 9906 if (!is_error_page(page)) {
9907 vmx->nested.virtual_apic_page = page;
9666 hpa = page_to_phys(vmx->nested.virtual_apic_page); 9908 hpa = page_to_phys(vmx->nested.virtual_apic_page);
9667 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); 9909 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
9668 } 9910 }
@@ -9671,16 +9913,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
9671 if (nested_cpu_has_posted_intr(vmcs12)) { 9913 if (nested_cpu_has_posted_intr(vmcs12)) {
9672 if (vmx->nested.pi_desc_page) { /* shouldn't happen */ 9914 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
9673 kunmap(vmx->nested.pi_desc_page); 9915 kunmap(vmx->nested.pi_desc_page);
9674 nested_release_page(vmx->nested.pi_desc_page); 9916 kvm_release_page_dirty(vmx->nested.pi_desc_page);
9917 vmx->nested.pi_desc_page = NULL;
9675 } 9918 }
9676 vmx->nested.pi_desc_page = 9919 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
9677 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); 9920 if (is_error_page(page))
9678 vmx->nested.pi_desc =
9679 (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
9680 if (!vmx->nested.pi_desc) {
9681 nested_release_page_clean(vmx->nested.pi_desc_page);
9682 return; 9921 return;
9683 } 9922 vmx->nested.pi_desc_page = page;
9923 vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
9684 vmx->nested.pi_desc = 9924 vmx->nested.pi_desc =
9685 (struct pi_desc *)((void *)vmx->nested.pi_desc + 9925 (struct pi_desc *)((void *)vmx->nested.pi_desc +
9686 (unsigned long)(vmcs12->posted_intr_desc_addr & 9926 (unsigned long)(vmcs12->posted_intr_desc_addr &
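
Because the posted-interrupt descriptor can sit at any offset inside a guest
page, the code above maps the whole page with kmap() and then adds the
sub-page offset of posted_intr_desc_addr. The underlying gpa-to-pointer
pattern, built on the kvm_vcpu_gpa_to_page() helper introduced in the
include/linux/kvm_host.h hunk further down, looks roughly like this (a sketch
with error handling trimmed, not a drop-in kernel function):

#include <linux/highmem.h>
#include <linux/kvm_host.h>
#include <linux/mm.h>

/* Sketch: translate a guest physical address into a host kernel pointer. */
static void *map_guest_struct(struct kvm_vcpu *vcpu, gpa_t gpa,
			      struct page **pagep)
{
	struct page *page = kvm_vcpu_gpa_to_page(vcpu, gpa);

	if (is_error_page(page))
		return NULL;

	*pagep = page;
	/* kmap() the page, then add the offset of gpa within that page. */
	return (char *)kmap(page) + offset_in_page(gpa);
}

The mapping is undone with kunmap() followed by kvm_release_page_dirty() or
kvm_release_page_clean(), as the unpin paths later in this diff show.
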
@@ -9746,6 +9986,18 @@ static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
9746 return 0; 9986 return 0;
9747} 9987}
9748 9988
9989static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
9990 struct vmcs12 *vmcs12)
9991{
9992 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
9993 return 0;
9994
9995 if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
9996 return -EINVAL;
9997
9998 return 0;
9999}
10000
9749/* 10001/*
9750 * Merge L0's and L1's MSR bitmap, return false to indicate that 10002 * Merge L0's and L1's MSR bitmap, return false to indicate that
9751 * we do not use the hardware. 10003 * we do not use the hardware.
@@ -9762,8 +10014,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
9762 if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) 10014 if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
9763 return false; 10015 return false;
9764 10016
9765 page = nested_get_page(vcpu, vmcs12->msr_bitmap); 10017 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
9766 if (!page) 10018 if (is_error_page(page))
9767 return false; 10019 return false;
9768 msr_bitmap_l1 = (unsigned long *)kmap(page); 10020 msr_bitmap_l1 = (unsigned long *)kmap(page);
9769 10021
@@ -9793,7 +10045,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
9793 } 10045 }
9794 } 10046 }
9795 kunmap(page); 10047 kunmap(page);
9796 nested_release_page_clean(page); 10048 kvm_release_page_clean(page);
9797 10049
9798 return true; 10050 return true;
9799} 10051}
@@ -10187,13 +10439,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10187 enable_ept ? vmcs12->page_fault_error_code_match : 0); 10439 enable_ept ? vmcs12->page_fault_error_code_match : 0);
10188 10440
10189 if (cpu_has_secondary_exec_ctrls()) { 10441 if (cpu_has_secondary_exec_ctrls()) {
10190 exec_control = vmx_secondary_exec_control(vmx); 10442 exec_control = vmx->secondary_exec_control;
10191 10443
10192 /* Take the following fields only from vmcs12 */ 10444 /* Take the following fields only from vmcs12 */
10193 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 10445 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
10446 SECONDARY_EXEC_ENABLE_INVPCID |
10194 SECONDARY_EXEC_RDTSCP | 10447 SECONDARY_EXEC_RDTSCP |
10448 SECONDARY_EXEC_XSAVES |
10195 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 10449 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
10196 SECONDARY_EXEC_APIC_REGISTER_VIRT); 10450 SECONDARY_EXEC_APIC_REGISTER_VIRT |
10451 SECONDARY_EXEC_ENABLE_VMFUNC);
10197 if (nested_cpu_has(vmcs12, 10452 if (nested_cpu_has(vmcs12,
10198 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { 10453 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
10199 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & 10454 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
@@ -10201,6 +10456,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10201 exec_control |= vmcs12_exec_ctrl; 10456 exec_control |= vmcs12_exec_ctrl;
10202 } 10457 }
10203 10458
10459 /* All VMFUNCs are currently emulated through L0 vmexits. */
10460 if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
10461 vmcs_write64(VM_FUNCTION_CONTROL, 0);
10462
10204 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 10463 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
10205 vmcs_write64(EOI_EXIT_BITMAP0, 10464 vmcs_write64(EOI_EXIT_BITMAP0,
10206 vmcs12->eoi_exit_bitmap0); 10465 vmcs12->eoi_exit_bitmap0);
@@ -10426,6 +10685,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10426 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) 10685 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
10427 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 10686 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10428 10687
10688 if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
10689 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10690
10429 if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) 10691 if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
10430 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 10692 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10431 10693
@@ -10453,6 +10715,18 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10453 vmx->nested.nested_vmx_entry_ctls_high)) 10715 vmx->nested.nested_vmx_entry_ctls_high))
10454 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 10716 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10455 10717
10718 if (nested_cpu_has_vmfunc(vmcs12)) {
10719 if (vmcs12->vm_function_control &
10720 ~vmx->nested.nested_vmx_vmfunc_controls)
10721 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10722
10723 if (nested_cpu_has_eptp_switching(vmcs12)) {
10724 if (!nested_cpu_has_ept(vmcs12) ||
10725 !page_address_valid(vcpu, vmcs12->eptp_list_address))
10726 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10727 }
10728 }
10729
10456 if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) 10730 if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
10457 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 10731 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10458 10732
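
Both the virtual-APIC page check added in nested_vmx_check_tpr_shadow_controls()
above and the eptp_list_address check here rely on page_address_valid(), whose
definition is not part of these hunks; presumably it just requires a
page-aligned guest physical address that fits within the guest's
physical-address width, roughly:

/* Assumed shape of the helper used above (not shown in this diff). */
static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}

With that in place, an L1 hypervisor that sets the EPTP-switching bit in
vm_function_control must also enable EPT and supply a valid EPTP list address,
or the vmentry fails with VMXERR_ENTRY_INVALID_CONTROL_FIELD.
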
@@ -10699,7 +10973,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
10699 u32 idt_vectoring; 10973 u32 idt_vectoring;
10700 unsigned int nr; 10974 unsigned int nr;
10701 10975
10702 if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { 10976 if (vcpu->arch.exception.injected) {
10703 nr = vcpu->arch.exception.nr; 10977 nr = vcpu->arch.exception.nr;
10704 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 10978 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
10705 10979
@@ -10738,12 +11012,20 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
10738static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) 11012static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
10739{ 11013{
10740 struct vcpu_vmx *vmx = to_vmx(vcpu); 11014 struct vcpu_vmx *vmx = to_vmx(vcpu);
11015 unsigned long exit_qual;
10741 11016
10742 if (vcpu->arch.exception.pending || 11017 if (kvm_event_needs_reinjection(vcpu))
10743 vcpu->arch.nmi_injected ||
10744 vcpu->arch.interrupt.pending)
10745 return -EBUSY; 11018 return -EBUSY;
10746 11019
11020 if (vcpu->arch.exception.pending &&
11021 nested_vmx_check_exception(vcpu, &exit_qual)) {
11022 if (vmx->nested.nested_run_pending)
11023 return -EBUSY;
11024 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
11025 vcpu->arch.exception.pending = false;
11026 return 0;
11027 }
11028
10747 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 11029 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
10748 vmx->nested.preemption_timer_expired) { 11030 vmx->nested.preemption_timer_expired) {
10749 if (vmx->nested.nested_run_pending) 11031 if (vmx->nested.nested_run_pending)
@@ -11184,16 +11466,16 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
11184 11466
11185 /* Unpin physical memory we referred to in vmcs02 */ 11467 /* Unpin physical memory we referred to in vmcs02 */
11186 if (vmx->nested.apic_access_page) { 11468 if (vmx->nested.apic_access_page) {
11187 nested_release_page(vmx->nested.apic_access_page); 11469 kvm_release_page_dirty(vmx->nested.apic_access_page);
11188 vmx->nested.apic_access_page = NULL; 11470 vmx->nested.apic_access_page = NULL;
11189 } 11471 }
11190 if (vmx->nested.virtual_apic_page) { 11472 if (vmx->nested.virtual_apic_page) {
11191 nested_release_page(vmx->nested.virtual_apic_page); 11473 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
11192 vmx->nested.virtual_apic_page = NULL; 11474 vmx->nested.virtual_apic_page = NULL;
11193 } 11475 }
11194 if (vmx->nested.pi_desc_page) { 11476 if (vmx->nested.pi_desc_page) {
11195 kunmap(vmx->nested.pi_desc_page); 11477 kunmap(vmx->nested.pi_desc_page);
11196 nested_release_page(vmx->nested.pi_desc_page); 11478 kvm_release_page_dirty(vmx->nested.pi_desc_page);
11197 vmx->nested.pi_desc_page = NULL; 11479 vmx->nested.pi_desc_page = NULL;
11198 vmx->nested.pi_desc = NULL; 11480 vmx->nested.pi_desc = NULL;
11199 } 11481 }
@@ -11369,14 +11651,14 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
11369 11651
11370 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; 11652 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
11371 11653
11372 page = nested_get_page(vcpu, vmcs12->pml_address); 11654 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
11373 if (!page) 11655 if (is_error_page(page))
11374 return 0; 11656 return 0;
11375 11657
11376 pml_address = kmap(page); 11658 pml_address = kmap(page);
11377 pml_address[vmcs12->guest_pml_index--] = gpa; 11659 pml_address[vmcs12->guest_pml_index--] = gpa;
11378 kunmap(page); 11660 kunmap(page);
11379 nested_release_page_clean(page); 11661 kvm_release_page_clean(page);
11380 } 11662 }
11381 11663
11382 return 0; 11664 return 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ef5102f80497..6069af86da3b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -311,13 +311,13 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
311 (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); 311 (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
312 u64 new_state = msr_info->data & 312 u64 new_state = msr_info->data &
313 (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); 313 (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
314 u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 314 u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
315 0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE); 315 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
316 316
317 if ((msr_info->data & reserved_bits) || new_state == X2APIC_ENABLE)
318 return 1;
317 if (!msr_info->host_initiated && 319 if (!msr_info->host_initiated &&
318 ((msr_info->data & reserved_bits) != 0 || 320 ((new_state == MSR_IA32_APICBASE_ENABLE &&
319 new_state == X2APIC_ENABLE ||
320 (new_state == MSR_IA32_APICBASE_ENABLE &&
321 old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) || 321 old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
322 (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) && 322 (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
323 old_state == 0))) 323 old_state == 0)))
@@ -390,15 +390,28 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
390 390
391 kvm_make_request(KVM_REQ_EVENT, vcpu); 391 kvm_make_request(KVM_REQ_EVENT, vcpu);
392 392
393 if (!vcpu->arch.exception.pending) { 393 if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
394 queue: 394 queue:
395 if (has_error && !is_protmode(vcpu)) 395 if (has_error && !is_protmode(vcpu))
396 has_error = false; 396 has_error = false;
397 vcpu->arch.exception.pending = true; 397 if (reinject) {
398 /*
399 * On vmentry, vcpu->arch.exception.pending is only
400 * true if an event injection was blocked by
401 * nested_run_pending. In that case, however,
402 * vcpu_enter_guest requests an immediate exit,
403 * and the guest shouldn't proceed far enough to
404 * need reinjection.
405 */
406 WARN_ON_ONCE(vcpu->arch.exception.pending);
407 vcpu->arch.exception.injected = true;
408 } else {
409 vcpu->arch.exception.pending = true;
410 vcpu->arch.exception.injected = false;
411 }
398 vcpu->arch.exception.has_error_code = has_error; 412 vcpu->arch.exception.has_error_code = has_error;
399 vcpu->arch.exception.nr = nr; 413 vcpu->arch.exception.nr = nr;
400 vcpu->arch.exception.error_code = error_code; 414 vcpu->arch.exception.error_code = error_code;
401 vcpu->arch.exception.reinject = reinject;
402 return; 415 return;
403 } 416 }
404 417
@@ -413,8 +426,13 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
413 class2 = exception_class(nr); 426 class2 = exception_class(nr);
414 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) 427 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
415 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { 428 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
416 /* generate double fault per SDM Table 5-5 */ 429 /*
430 * Generate double fault per SDM Table 5-5. Set
431 * exception.pending = true so that the double fault
432 * can trigger a nested vmexit.
433 */
417 vcpu->arch.exception.pending = true; 434 vcpu->arch.exception.pending = true;
435 vcpu->arch.exception.injected = false;
418 vcpu->arch.exception.has_error_code = true; 436 vcpu->arch.exception.has_error_code = true;
419 vcpu->arch.exception.nr = DF_VECTOR; 437 vcpu->arch.exception.nr = DF_VECTOR;
420 vcpu->arch.exception.error_code = 0; 438 vcpu->arch.exception.error_code = 0;
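
The queueing logic above is one half of a larger change in this pull: the old
exception.pending/reinject pair becomes two flags, where "pending" means the
exception was queued by the emulator but not yet delivered (so a nested
hypervisor can still turn it into a vmexit) and "injected" means it has
already been written to the VMCS/VMCB event-injection field and must only be
re-delivered as-is. A toy model of that invariant (plain C, not the kernel's
actual structures):

#include <assert.h>
#include <stdbool.h>

struct exception_state {
	bool pending;	/* queued, not yet delivered; may become a nested vmexit */
	bool injected;	/* committed to the VMCS/VMCB; redeliver unchanged */
	unsigned int nr;
};

int main(void)
{
	struct exception_state e = { .pending = true, .nr = 14 /* #PF */ };

	/* Delivering the exception flips pending -> injected. */
	e.pending = false;
	e.injected = true;

	/* At most one of the two flags should be set at any time. */
	assert(!(e.pending && e.injected));
	return 0;
}
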
@@ -755,19 +773,22 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
755 if (cr4 & CR4_RESERVED_BITS) 773 if (cr4 & CR4_RESERVED_BITS)
756 return 1; 774 return 1;
757 775
758 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) 776 if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
777 return 1;
778
779 if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
759 return 1; 780 return 1;
760 781
761 if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) 782 if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
762 return 1; 783 return 1;
763 784
764 if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP)) 785 if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
765 return 1; 786 return 1;
766 787
767 if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) 788 if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
768 return 1; 789 return 1;
769 790
770 if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE)) 791 if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
771 return 1; 792 return 1;
772 793
773 if (is_long_mode(vcpu)) { 794 if (is_long_mode(vcpu)) {
@@ -780,7 +801,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
780 return 1; 801 return 1;
781 802
782 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { 803 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
783 if (!guest_cpuid_has_pcid(vcpu)) 804 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
784 return 1; 805 return 1;
785 806
786 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ 807 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
@@ -814,10 +835,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
814 return 0; 835 return 0;
815 } 836 }
816 837
817 if (is_long_mode(vcpu)) { 838 if (is_long_mode(vcpu) &&
818 if (cr3 & CR3_L_MODE_RESERVED_BITS) 839 (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 62)))
819 return 1; 840 return 1;
820 } else if (is_pae(vcpu) && is_paging(vcpu) && 841 else if (is_pae(vcpu) && is_paging(vcpu) &&
821 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) 842 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
822 return 1; 843 return 1;
823 844
@@ -884,7 +905,7 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
884{ 905{
885 u64 fixed = DR6_FIXED_1; 906 u64 fixed = DR6_FIXED_1;
886 907
887 if (!guest_cpuid_has_rtm(vcpu)) 908 if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
888 fixed |= DR6_RTM; 909 fixed |= DR6_RTM;
889 return fixed; 910 return fixed;
890} 911}
@@ -994,6 +1015,7 @@ static u32 emulated_msrs[] = {
994 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 1015 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
995 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 1016 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
996 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 1017 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1018 HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
997 HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 1019 HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
998 HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 1020 HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
999 HV_X64_MSR_RESET, 1021 HV_X64_MSR_RESET,
@@ -1022,21 +1044,11 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1022 if (efer & efer_reserved_bits) 1044 if (efer & efer_reserved_bits)
1023 return false; 1045 return false;
1024 1046
1025 if (efer & EFER_FFXSR) { 1047 if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1026 struct kvm_cpuid_entry2 *feat;
1027
1028 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
1029 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
1030 return false; 1048 return false;
1031 }
1032 1049
1033 if (efer & EFER_SVME) { 1050 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1034 struct kvm_cpuid_entry2 *feat;
1035
1036 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
1037 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
1038 return false; 1051 return false;
1039 }
1040 1052
1041 return true; 1053 return true;
1042} 1054}
@@ -1084,7 +1096,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1084 case MSR_KERNEL_GS_BASE: 1096 case MSR_KERNEL_GS_BASE:
1085 case MSR_CSTAR: 1097 case MSR_CSTAR:
1086 case MSR_LSTAR: 1098 case MSR_LSTAR:
1087 if (is_noncanonical_address(msr->data)) 1099 if (is_noncanonical_address(msr->data, vcpu))
1088 return 1; 1100 return 1;
1089 break; 1101 break;
1090 case MSR_IA32_SYSENTER_EIP: 1102 case MSR_IA32_SYSENTER_EIP:
@@ -1101,7 +1113,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1101 * value, and that something deterministic happens if the guest 1113 * value, and that something deterministic happens if the guest
1102 * invokes 64-bit SYSENTER. 1114 * invokes 64-bit SYSENTER.
1103 */ 1115 */
1104 msr->data = get_canonical(msr->data); 1116 msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1105 } 1117 }
1106 return kvm_x86_ops->set_msr(vcpu, msr); 1118 return kvm_x86_ops->set_msr(vcpu, msr);
1107} 1119}
@@ -1534,8 +1546,9 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1534 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 1546 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1535 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 1547 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1536 1548
1537 if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) 1549 if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1538 update_ia32_tsc_adjust_msr(vcpu, offset); 1550 update_ia32_tsc_adjust_msr(vcpu, offset);
1551
1539 kvm_vcpu_write_tsc_offset(vcpu, offset); 1552 kvm_vcpu_write_tsc_offset(vcpu, offset);
1540 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1553 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1541 1554
@@ -2185,7 +2198,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2185 kvm_set_lapic_tscdeadline_msr(vcpu, data); 2198 kvm_set_lapic_tscdeadline_msr(vcpu, data);
2186 break; 2199 break;
2187 case MSR_IA32_TSC_ADJUST: 2200 case MSR_IA32_TSC_ADJUST:
2188 if (guest_cpuid_has_tsc_adjust(vcpu)) { 2201 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
2189 if (!msr_info->host_initiated) { 2202 if (!msr_info->host_initiated) {
2190 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; 2203 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
2191 adjust_tsc_offset_guest(vcpu, adj); 2204 adjust_tsc_offset_guest(vcpu, adj);
@@ -2307,12 +2320,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2307 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); 2320 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data);
2308 break; 2321 break;
2309 case MSR_AMD64_OSVW_ID_LENGTH: 2322 case MSR_AMD64_OSVW_ID_LENGTH:
2310 if (!guest_cpuid_has_osvw(vcpu)) 2323 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2311 return 1; 2324 return 1;
2312 vcpu->arch.osvw.length = data; 2325 vcpu->arch.osvw.length = data;
2313 break; 2326 break;
2314 case MSR_AMD64_OSVW_STATUS: 2327 case MSR_AMD64_OSVW_STATUS:
2315 if (!guest_cpuid_has_osvw(vcpu)) 2328 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2316 return 1; 2329 return 1;
2317 vcpu->arch.osvw.status = data; 2330 vcpu->arch.osvw.status = data;
2318 break; 2331 break;
@@ -2537,12 +2550,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2537 msr_info->data = 0xbe702111; 2550 msr_info->data = 0xbe702111;
2538 break; 2551 break;
2539 case MSR_AMD64_OSVW_ID_LENGTH: 2552 case MSR_AMD64_OSVW_ID_LENGTH:
2540 if (!guest_cpuid_has_osvw(vcpu)) 2553 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2541 return 1; 2554 return 1;
2542 msr_info->data = vcpu->arch.osvw.length; 2555 msr_info->data = vcpu->arch.osvw.length;
2543 break; 2556 break;
2544 case MSR_AMD64_OSVW_STATUS: 2557 case MSR_AMD64_OSVW_STATUS:
2545 if (!guest_cpuid_has_osvw(vcpu)) 2558 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2546 return 1; 2559 return 1;
2547 msr_info->data = vcpu->arch.osvw.status; 2560 msr_info->data = vcpu->arch.osvw.status;
2548 break; 2561 break;
@@ -2882,6 +2895,10 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
2882void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2895void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2883{ 2896{
2884 int idx; 2897 int idx;
2898
2899 if (vcpu->preempted)
2900 vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
2901
2885 /* 2902 /*
2886 * Disable page faults because we're in atomic context here. 2903 * Disable page faults because we're in atomic context here.
2887 * kvm_write_guest_offset_cached() would call might_fault() 2904 * kvm_write_guest_offset_cached() would call might_fault()
@@ -3074,8 +3091,14 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3074 struct kvm_vcpu_events *events) 3091 struct kvm_vcpu_events *events)
3075{ 3092{
3076 process_nmi(vcpu); 3093 process_nmi(vcpu);
3094 /*
3095 * FIXME: pass injected and pending separately. This is only
3096 * needed for nested virtualization, whose state cannot be
3097 * migrated yet. For now we can combine them.
3098 */
3077 events->exception.injected = 3099 events->exception.injected =
3078 vcpu->arch.exception.pending && 3100 (vcpu->arch.exception.pending ||
3101 vcpu->arch.exception.injected) &&
3079 !kvm_exception_is_soft(vcpu->arch.exception.nr); 3102 !kvm_exception_is_soft(vcpu->arch.exception.nr);
3080 events->exception.nr = vcpu->arch.exception.nr; 3103 events->exception.nr = vcpu->arch.exception.nr;
3081 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 3104 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
@@ -3130,6 +3153,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3130 return -EINVAL; 3153 return -EINVAL;
3131 3154
3132 process_nmi(vcpu); 3155 process_nmi(vcpu);
3156 vcpu->arch.exception.injected = false;
3133 vcpu->arch.exception.pending = events->exception.injected; 3157 vcpu->arch.exception.pending = events->exception.injected;
3134 vcpu->arch.exception.nr = events->exception.nr; 3158 vcpu->arch.exception.nr = events->exception.nr;
3135 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 3159 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -4671,25 +4695,18 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
4671 */ 4695 */
4672 if (vcpu->arch.gpa_available && 4696 if (vcpu->arch.gpa_available &&
4673 emulator_can_use_gpa(ctxt) && 4697 emulator_can_use_gpa(ctxt) &&
4674 vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && 4698 (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
4675 (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { 4699 gpa = vcpu->arch.gpa_val;
4676 gpa = exception->address; 4700 ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
4677 goto mmio; 4701 } else {
4702 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
4703 if (ret < 0)
4704 return X86EMUL_PROPAGATE_FAULT;
4678 } 4705 }
4679 4706
4680 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); 4707 if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
4681
4682 if (ret < 0)
4683 return X86EMUL_PROPAGATE_FAULT;
4684
4685 /* For APIC access vmexit */
4686 if (ret)
4687 goto mmio;
4688
4689 if (ops->read_write_emulate(vcpu, gpa, val, bytes))
4690 return X86EMUL_CONTINUE; 4708 return X86EMUL_CONTINUE;
4691 4709
4692mmio:
4693 /* 4710 /*
4694 * Is this MMIO handled locally? 4711 * Is this MMIO handled locally?
4695 */ 4712 */
@@ -5227,10 +5244,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
5227 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); 5244 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
5228} 5245}
5229 5246
5230static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, 5247static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
5231 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 5248 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
5232{ 5249{
5233 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); 5250 return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
5234} 5251}
5235 5252
5236static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) 5253static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
@@ -6362,11 +6379,42 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
6362 int r; 6379 int r;
6363 6380
6364 /* try to reinject previous events if any */ 6381 /* try to reinject previous events if any */
6382 if (vcpu->arch.exception.injected) {
6383 kvm_x86_ops->queue_exception(vcpu);
6384 return 0;
6385 }
6386
6387 /*
6388 * Exceptions must be injected immediately, or the exception
6389 * frame will have the address of the NMI or interrupt handler.
6390 */
6391 if (!vcpu->arch.exception.pending) {
6392 if (vcpu->arch.nmi_injected) {
6393 kvm_x86_ops->set_nmi(vcpu);
6394 return 0;
6395 }
6396
6397 if (vcpu->arch.interrupt.pending) {
6398 kvm_x86_ops->set_irq(vcpu);
6399 return 0;
6400 }
6401 }
6402
6403 if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
6404 r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
6405 if (r != 0)
6406 return r;
6407 }
6408
6409 /* try to inject new event if pending */
6365 if (vcpu->arch.exception.pending) { 6410 if (vcpu->arch.exception.pending) {
6366 trace_kvm_inj_exception(vcpu->arch.exception.nr, 6411 trace_kvm_inj_exception(vcpu->arch.exception.nr,
6367 vcpu->arch.exception.has_error_code, 6412 vcpu->arch.exception.has_error_code,
6368 vcpu->arch.exception.error_code); 6413 vcpu->arch.exception.error_code);
6369 6414
6415 vcpu->arch.exception.pending = false;
6416 vcpu->arch.exception.injected = true;
6417
6370 if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) 6418 if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
6371 __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | 6419 __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
6372 X86_EFLAGS_RF); 6420 X86_EFLAGS_RF);
@@ -6378,27 +6426,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
6378 } 6426 }
6379 6427
6380 kvm_x86_ops->queue_exception(vcpu); 6428 kvm_x86_ops->queue_exception(vcpu);
6381 return 0; 6429 } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
6382 }
6383
6384 if (vcpu->arch.nmi_injected) {
6385 kvm_x86_ops->set_nmi(vcpu);
6386 return 0;
6387 }
6388
6389 if (vcpu->arch.interrupt.pending) {
6390 kvm_x86_ops->set_irq(vcpu);
6391 return 0;
6392 }
6393
6394 if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
6395 r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
6396 if (r != 0)
6397 return r;
6398 }
6399
6400 /* try to inject new event if pending */
6401 if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
6402 vcpu->arch.smi_pending = false; 6430 vcpu->arch.smi_pending = false;
6403 enter_smm(vcpu); 6431 enter_smm(vcpu);
6404 } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { 6432 } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
@@ -6615,7 +6643,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
6615 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); 6643 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
6616 vcpu->arch.hflags |= HF_SMM_MASK; 6644 vcpu->arch.hflags |= HF_SMM_MASK;
6617 memset(buf, 0, 512); 6645 memset(buf, 0, 512);
6618 if (guest_cpuid_has_longmode(vcpu)) 6646 if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
6619 enter_smm_save_state_64(vcpu, buf); 6647 enter_smm_save_state_64(vcpu, buf);
6620 else 6648 else
6621 enter_smm_save_state_32(vcpu, buf); 6649 enter_smm_save_state_32(vcpu, buf);
@@ -6667,7 +6695,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
6667 kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); 6695 kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
6668 kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); 6696 kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
6669 6697
6670 if (guest_cpuid_has_longmode(vcpu)) 6698 if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
6671 kvm_x86_ops->set_efer(vcpu, 0); 6699 kvm_x86_ops->set_efer(vcpu, 0);
6672 6700
6673 kvm_update_cpuid(vcpu); 6701 kvm_update_cpuid(vcpu);
@@ -6774,6 +6802,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6774 } 6802 }
6775 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { 6803 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
6776 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 6804 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
6805 vcpu->mmio_needed = 0;
6777 r = 0; 6806 r = 0;
6778 goto out; 6807 goto out;
6779 } 6808 }
@@ -6862,6 +6891,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6862 kvm_x86_ops->enable_nmi_window(vcpu); 6891 kvm_x86_ops->enable_nmi_window(vcpu);
6863 if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) 6892 if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
6864 kvm_x86_ops->enable_irq_window(vcpu); 6893 kvm_x86_ops->enable_irq_window(vcpu);
6894 WARN_ON(vcpu->arch.exception.pending);
6865 } 6895 }
6866 6896
6867 if (kvm_lapic_enabled(vcpu)) { 6897 if (kvm_lapic_enabled(vcpu)) {
@@ -7004,6 +7034,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
7004 if (vcpu->arch.apic_attention) 7034 if (vcpu->arch.apic_attention)
7005 kvm_lapic_sync_from_vapic(vcpu); 7035 kvm_lapic_sync_from_vapic(vcpu);
7006 7036
7037 vcpu->arch.gpa_available = false;
7007 r = kvm_x86_ops->handle_exit(vcpu); 7038 r = kvm_x86_ops->handle_exit(vcpu);
7008 return r; 7039 return r;
7009 7040
@@ -7422,7 +7453,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
7422 int pending_vec, max_bits, idx; 7453 int pending_vec, max_bits, idx;
7423 struct desc_ptr dt; 7454 struct desc_ptr dt;
7424 7455
7425 if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) 7456 if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
7457 (sregs->cr4 & X86_CR4_OSXSAVE))
7458 return -EINVAL;
7459
7460 apic_base_msr.data = sregs->apic_base;
7461 apic_base_msr.host_initiated = true;
7462 if (kvm_set_apic_base(vcpu, &apic_base_msr))
7426 return -EINVAL; 7463 return -EINVAL;
7427 7464
7428 dt.size = sregs->idt.limit; 7465 dt.size = sregs->idt.limit;
@@ -7441,9 +7478,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
7441 7478
7442 mmu_reset_needed |= vcpu->arch.efer != sregs->efer; 7479 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
7443 kvm_x86_ops->set_efer(vcpu, sregs->efer); 7480 kvm_x86_ops->set_efer(vcpu, sregs->efer);
7444 apic_base_msr.data = sregs->apic_base;
7445 apic_base_msr.host_initiated = true;
7446 kvm_set_apic_base(vcpu, &apic_base_msr);
7447 7481
7448 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; 7482 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
7449 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 7483 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
@@ -7734,6 +7768,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
7734 vcpu->arch.nmi_injected = false; 7768 vcpu->arch.nmi_injected = false;
7735 kvm_clear_interrupt_queue(vcpu); 7769 kvm_clear_interrupt_queue(vcpu);
7736 kvm_clear_exception_queue(vcpu); 7770 kvm_clear_exception_queue(vcpu);
7771 vcpu->arch.exception.pending = false;
7737 7772
7738 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 7773 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
7739 kvm_update_dr0123(vcpu); 7774 kvm_update_dr0123(vcpu);
@@ -7993,6 +8028,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7993 kvm_pmu_init(vcpu); 8028 kvm_pmu_init(vcpu);
7994 8029
7995 vcpu->arch.pending_external_vector = -1; 8030 vcpu->arch.pending_external_vector = -1;
8031 vcpu->arch.preempted_in_kernel = false;
7996 8032
7997 kvm_hv_vcpu_init(vcpu); 8033 kvm_hv_vcpu_init(vcpu);
7998 8034
@@ -8440,6 +8476,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
8440 return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 8476 return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
8441} 8477}
8442 8478
8479bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
8480{
8481 return vcpu->arch.preempted_in_kernel;
8482}
8483
8443int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 8484int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
8444{ 8485{
8445 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; 8486 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 612067074905..51e349cf5f45 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -11,7 +11,7 @@
11 11
12static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) 12static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
13{ 13{
14 vcpu->arch.exception.pending = false; 14 vcpu->arch.exception.injected = false;
15} 15}
16 16
17static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, 17static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
@@ -29,7 +29,7 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
29 29
30static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) 30static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
31{ 31{
32 return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || 32 return vcpu->arch.exception.injected || vcpu->arch.interrupt.pending ||
33 vcpu->arch.nmi_injected; 33 vcpu->arch.nmi_injected;
34} 34}
35 35
@@ -62,6 +62,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
62 return cs_l; 62 return cs_l;
63} 63}
64 64
65static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
66{
67#ifdef CONFIG_X86_64
68 return (vcpu->arch.efer & EFER_LMA) &&
69 kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
70#else
71 return 0;
72#endif
73}
74
65static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) 75static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
66{ 76{
67 return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; 77 return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
@@ -87,10 +97,48 @@ static inline u32 bit(int bitno)
87 return 1 << (bitno & 31); 97 return 1 << (bitno & 31);
88} 98}
89 99
100static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
101{
102 return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
103}
104
105static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
106{
107 return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
108}
109
110static inline u64 get_canonical(u64 la, u8 vaddr_bits)
111{
112 return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
113}
114
115static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
116{
117#ifdef CONFIG_X86_64
118 return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
119#else
120 return false;
121#endif
122}
123
124static inline bool emul_is_noncanonical_address(u64 la,
125 struct x86_emulate_ctxt *ctxt)
126{
127#ifdef CONFIG_X86_64
128 return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
129#else
130 return false;
131#endif
132}
133
90static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, 134static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
91 gva_t gva, gfn_t gfn, unsigned access) 135 gva_t gva, gfn_t gfn, unsigned access)
92{ 136{
93 vcpu->arch.mmio_gva = gva & PAGE_MASK; 137 /*
138 * If this is a shadow nested page table, the "GVA" is
139 * actually a nGPA.
140 */
141 vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
94 vcpu->arch.access = access; 142 vcpu->arch.access = access;
95 vcpu->arch.mmio_gfn = gfn; 143 vcpu->arch.mmio_gfn = gfn;
96 vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; 144 vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
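
These helpers are why is_noncanonical_address() now takes a vcpu: with
CR4.LA57 set, linear addresses are 57 bits wide instead of 48, so canonicality
becomes a sign-extension check against a variable width. A standalone check of
the arithmetic, mirroring the shift pair in get_canonical() above:

#include <stdint.h>
#include <stdio.h>

static uint64_t get_canonical(uint64_t la, uint8_t vaddr_bits)
{
	return (uint64_t)(((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits));
}

int main(void)
{
	uint64_t la = 0x0000800000000000ULL;	/* bit 47 set, bits 63:48 clear */

	/* 48-bit mode: sign-extends to 0xffff800000000000, so la is non-canonical. */
	printf("48-bit canonical form: %#llx\n",
	       (unsigned long long)get_canonical(la, 48));

	/* 57-bit (LA57) mode: la comes back unchanged, so it is canonical. */
	printf("57-bit canonical form: %#llx\n",
	       (unsigned long long)get_canonical(la, 57));
	return 0;
}
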
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 21a6fd6c44af..6882538eda32 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -720,7 +720,7 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
720bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); 720bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu);
721void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 721void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
722int kvm_vcpu_yield_to(struct kvm_vcpu *target); 722int kvm_vcpu_yield_to(struct kvm_vcpu *target);
723void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); 723void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible);
724void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); 724void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
725void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); 725void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
726 726
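
kvm_vcpu_on_spin() growing a boolean pairs with the new
kvm_arch_vcpu_in_kernel() hook: when the flag says user-mode VCPUs are not
eligible, VCPUs that were preempted while running guest user code (tracked on
x86 via preempted_in_kernel, on arm via vcpu_mode_priv()) are skipped as boost
candidates. A self-contained toy illustration of that filter, not the kernel's
actual directed-yield loop:

#include <stdbool.h>
#include <stdio.h>

struct vcpu { bool in_kernel; bool preempted; };

/* Toy stand-in for kvm_arch_vcpu_in_kernel(). */
static bool vcpu_in_kernel(const struct vcpu *v) { return v->in_kernel; }

/* Pick a boost candidate, optionally skipping user-mode VCPUs; the flag name
 * follows the kvm_vcpu_on_spin() declaration above. */
static int pick_boost_candidate(const struct vcpu *vcpus, int n,
				bool usermode_vcpu_not_eligible)
{
	for (int i = 0; i < n; i++) {
		if (!vcpus[i].preempted)
			continue;
		if (usermode_vcpu_not_eligible && !vcpu_in_kernel(&vcpus[i]))
			continue;
		return i;
	}
	return -1;
}

int main(void)
{
	struct vcpu vcpus[] = {
		{ .in_kernel = false, .preempted = true },
		{ .in_kernel = true,  .preempted = true },
	};

	printf("candidate: %d\n", pick_boost_candidate(vcpus, 2, true)); /* prints 1 */
	return 0;
}
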
@@ -800,6 +800,7 @@ int kvm_arch_hardware_setup(void);
800void kvm_arch_hardware_unsetup(void); 800void kvm_arch_hardware_unsetup(void);
801void kvm_arch_check_processor_compat(void *rtn); 801void kvm_arch_check_processor_compat(void *rtn);
802int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); 802int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
803bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
803int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); 804int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
804 805
805#ifndef __KVM_HAVE_ARCH_VM_ALLOC 806#ifndef __KVM_HAVE_ARCH_VM_ALLOC
@@ -985,6 +986,12 @@ static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn)
985 return (hpa_t)pfn << PAGE_SHIFT; 986 return (hpa_t)pfn << PAGE_SHIFT;
986} 987}
987 988
989static inline struct page *kvm_vcpu_gpa_to_page(struct kvm_vcpu *vcpu,
990 gpa_t gpa)
991{
992 return kvm_vcpu_gfn_to_page(vcpu, gpa_to_gfn(gpa));
993}
994
988static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) 995static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa)
989{ 996{
990 unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); 997 unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
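
kvm_vcpu_gpa_to_page() is simply kvm_vcpu_gfn_to_page() applied to
gpa_to_gfn(gpa), so the sub-page offset is dropped and callers that need it
add it back themselves (as the posted-interrupt descriptor code in vmx.c above
does). Concretely, with x86's 4 KiB base pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12			/* 4 KiB pages */
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

int main(void)
{
	uint64_t gpa = 0xfee00f00ULL;	/* example guest physical address */

	printf("gfn    = %#llx\n", (unsigned long long)(gpa >> PAGE_SHIFT));     /* 0xfee00 */
	printf("offset = %#llx\n", (unsigned long long)(gpa & (PAGE_SIZE - 1))); /* 0xf00 */
	return 0;
}
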
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 6cd63c18708a..838887587411 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -711,7 +711,8 @@ struct kvm_ppc_one_seg_page_size {
711struct kvm_ppc_smmu_info { 711struct kvm_ppc_smmu_info {
712 __u64 flags; 712 __u64 flags;
713 __u32 slb_size; 713 __u32 slb_size;
714 __u32 pad; 714 __u16 data_keys; /* # storage keys supported for data */
715 __u16 instr_keys; /* # storage keys supported for instructions */
715 struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; 716 struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
716}; 717};
717 718
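
Splitting the __u32 pad into two __u16 counters keeps struct kvm_ppc_smmu_info
the same size and layout, so existing userspace that ignored the pad keeps
working while new userspace can read how many data and instruction storage
keys the host supports. A compile-time check of that equivalence (hypothetical
struct names, same field widths as above):

#include <stdint.h>

struct old_prefix { uint64_t flags; uint32_t slb_size; uint32_t pad; };
struct new_prefix { uint64_t flags; uint32_t slb_size;
		    uint16_t data_keys; uint16_t instr_keys; };

/* Two __u16 fields occupy exactly the bytes the old __u32 pad did. */
_Static_assert(sizeof(struct old_prefix) == sizeof(struct new_prefix),
	       "kvm_ppc_smmu_info layout must stay ABI-compatible");

int main(void) { return 0; }
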
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index a39a1e161e63..b9f68e4add71 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -416,6 +416,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
416 && !v->arch.power_off && !v->arch.pause); 416 && !v->arch.power_off && !v->arch.pause);
417} 417}
418 418
419bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
420{
421 return vcpu_mode_priv(vcpu);
422}
423
419/* Just ensure a guest exit from a particular CPU */ 424/* Just ensure a guest exit from a particular CPU */
420static void exit_vm_noop(void *info) 425static void exit_vm_noop(void *info)
421{ 426{
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 2ea21dac0b44..b36945d49986 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1454,25 +1454,6 @@ out:
1454 kvm_set_pfn_accessed(pfn); 1454 kvm_set_pfn_accessed(pfn);
1455} 1455}
1456 1456
1457static bool is_abort_sea(unsigned long fault_status)
1458{
1459 switch (fault_status) {
1460 case FSC_SEA:
1461 case FSC_SEA_TTW0:
1462 case FSC_SEA_TTW1:
1463 case FSC_SEA_TTW2:
1464 case FSC_SEA_TTW3:
1465 case FSC_SECC:
1466 case FSC_SECC_TTW0:
1467 case FSC_SECC_TTW1:
1468 case FSC_SECC_TTW2:
1469 case FSC_SECC_TTW3:
1470 return true;
1471 default:
1472 return false;
1473 }
1474}
1475
1476/** 1457/**
1477 * kvm_handle_guest_abort - handles all 2nd stage aborts 1458 * kvm_handle_guest_abort - handles all 2nd stage aborts
1478 * @vcpu: the VCPU pointer 1459 * @vcpu: the VCPU pointer
@@ -1498,20 +1479,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
1498 fault_status = kvm_vcpu_trap_get_fault_type(vcpu); 1479 fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1499 1480
1500 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1481 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1482 is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1501 1483
1502 /* 1484 /* Synchronous External Abort? */
1503 * The host kernel will handle the synchronous external abort. There 1485 if (kvm_vcpu_dabt_isextabt(vcpu)) {
1504 * is no need to pass the error into the guest. 1486 /*
1505 */ 1487 * For RAS the host kernel may handle this abort.
1506 if (is_abort_sea(fault_status)) { 1488 * There is no need to pass the error into the guest.
1489 */
1507 if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) 1490 if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
1508 return 1; 1491 return 1;
1509 }
1510 1492
1511 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1493 if (unlikely(!is_iabt)) {
1512 if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) { 1494 kvm_inject_vabt(vcpu);
1513 kvm_inject_vabt(vcpu); 1495 return 1;
1514 return 1; 1496 }
1515 } 1497 }
1516 1498
1517 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), 1499 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
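
The list in the removed is_abort_sea() does not go away: the same fault status
codes are now matched by kvm_vcpu_dabt_isextabt() (extended by the
kvm_emulate.h changes in this merge), so instruction- and data-side
synchronous external aborts take the same path before deciding whether the
host RAS code handles the abort or a virtual abort is injected for the data
case. The helper presumably looks roughly like this, reusing the FSC values
listed above:

/* Sketch, mirroring the FSC codes from the removed is_abort_sea(); assumes
 * the arm/arm64 asm/kvm_emulate.h context. */
static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu)
{
	switch (kvm_vcpu_trap_get_fault(vcpu)) {
	case FSC_SEA:
	case FSC_SEA_TTW0:
	case FSC_SEA_TTW1:
	case FSC_SEA_TTW2:
	case FSC_SEA_TTW3:
	case FSC_SECC:
	case FSC_SECC_TTW0:
	case FSC_SECC_TTW1:
	case FSC_SECC_TTW2:
	case FSC_SECC_TTW3:
		return true;
	default:
		return false;
	}
}
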
diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
index 7072ab743332..10b38178cff2 100644
--- a/virt/kvm/arm/vgic/vgic-debug.c
+++ b/virt/kvm/arm/vgic/vgic-debug.c
@@ -234,7 +234,7 @@ static int vgic_debug_show(struct seq_file *s, void *v)
234 return 0; 234 return 0;
235} 235}
236 236
237static struct seq_operations vgic_debug_seq_ops = { 237static const struct seq_operations vgic_debug_seq_ops = {
238 .start = vgic_debug_start, 238 .start = vgic_debug_start,
239 .next = vgic_debug_next, 239 .next = vgic_debug_next,
240 .stop = vgic_debug_stop, 240 .stop = vgic_debug_stop,
@@ -255,7 +255,7 @@ static int debug_open(struct inode *inode, struct file *file)
255 return ret; 255 return ret;
256}; 256};
257 257
258static struct file_operations vgic_debug_fops = { 258static const struct file_operations vgic_debug_fops = {
259 .owner = THIS_MODULE, 259 .owner = THIS_MODULE,
260 .open = debug_open, 260 .open = debug_open,
261 .read = seq_read, 261 .read = seq_read,
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index aa6b68db80b4..f51c1e1b3f70 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -144,7 +144,6 @@ struct its_ite {
144 144
145 struct vgic_irq *irq; 145 struct vgic_irq *irq;
146 struct its_collection *collection; 146 struct its_collection *collection;
147 u32 lpi;
148 u32 event_id; 147 u32 event_id;
149}; 148};
150 149
@@ -813,7 +812,7 @@ static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
813/* Must be called with its_lock mutex held */ 812/* Must be called with its_lock mutex held */
814static struct its_ite *vgic_its_alloc_ite(struct its_device *device, 813static struct its_ite *vgic_its_alloc_ite(struct its_device *device,
815 struct its_collection *collection, 814 struct its_collection *collection,
816 u32 lpi_id, u32 event_id) 815 u32 event_id)
817{ 816{
818 struct its_ite *ite; 817 struct its_ite *ite;
819 818
@@ -823,7 +822,6 @@ static struct its_ite *vgic_its_alloc_ite(struct its_device *device,
823 822
824 ite->event_id = event_id; 823 ite->event_id = event_id;
825 ite->collection = collection; 824 ite->collection = collection;
826 ite->lpi = lpi_id;
827 825
828 list_add_tail(&ite->ite_list, &device->itt_head); 826 list_add_tail(&ite->ite_list, &device->itt_head);
829 return ite; 827 return ite;
@@ -873,7 +871,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
 		new_coll = collection;
 	}
 
-	ite = vgic_its_alloc_ite(device, collection, lpi_nr, event_id);
+	ite = vgic_its_alloc_ite(device, collection, event_id);
 	if (IS_ERR(ite)) {
 		if (new_coll)
 			vgic_its_free_collection(its, coll_id);
@@ -1848,7 +1846,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
 
 	next_offset = compute_next_eventid_offset(&dev->itt_head, ite);
 	val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) |
-		((u64)ite->lpi << KVM_ITS_ITE_PINTID_SHIFT) |
+		((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
 		ite->collection->collection_id;
 	val = cpu_to_le64(val);
 	return kvm_write_guest(kvm, gpa, &val, ite_esz);
@@ -1895,7 +1893,7 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
 	if (!collection)
 		return -EINVAL;
 
-	ite = vgic_its_alloc_ite(dev, collection, lpi_id, event_id);
+	ite = vgic_its_alloc_ite(dev, collection, event_id);
 	if (IS_ERR(ite))
 		return PTR_ERR(ite);
 
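With the cached lpi field gone, the save path derives the physical INTID from the vgic_irq the ITE already points at (ite->irq->intid). The 64-bit entry written to guest memory packs a next-event offset, the physical INTID and the collection ID; the sketch below illustrates that packing with assumed field positions — the authoritative definitions are the KVM_ITS_ITE_* macros in the arm64 uapi headers, not the placeholder values used here:

    #include <stdint.h>

    /* Assumed layout for the sketch: next offset in the top 16 bits,
     * physical INTID in bits 47:16, collection ID in the low 16 bits. */
    #define ITE_NEXT_SHIFT   48
    #define ITE_PINTID_SHIFT 16
    #define ITE_ICID_MASK    0xffffULL

    static uint64_t pack_ite(uint16_t next_offset, uint32_t intid, uint16_t icid)
    {
    	return ((uint64_t)next_offset << ITE_NEXT_SHIFT) |
    	       ((uint64_t)intid << ITE_PINTID_SHIFT) |
    	       icid;
    }

    static void unpack_ite(uint64_t val, uint16_t *next, uint32_t *intid, uint16_t *icid)
    {
    	*next  = val >> ITE_NEXT_SHIFT;
    	*intid = (val >> ITE_PINTID_SHIFT) & 0xffffffffULL;
    	*icid  = val & ITE_ICID_MASK;
    }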
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index 37522e65eb53..b3d4a10f09a1 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -303,6 +303,51 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
 	vgic_set_vmcr(vcpu, &vmcr);
 }
 
+static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
+					gpa_t addr, unsigned int len)
+{
+	int n; /* which APRn is this */
+
+	n = (addr >> 2) & 0x3;
+
+	if (kvm_vgic_global_state.type == VGIC_V2) {
+		/* GICv2 hardware systems support max. 32 groups */
+		if (n != 0)
+			return 0;
+		return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr;
+	} else {
+		struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+		if (n > vgic_v3_max_apr_idx(vcpu))
+			return 0;
+		/* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
+		return vgicv3->vgic_ap1r[n];
+	}
+}
+
+static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val)
+{
+	int n; /* which APRn is this */
+
+	n = (addr >> 2) & 0x3;
+
+	if (kvm_vgic_global_state.type == VGIC_V2) {
+		/* GICv2 hardware systems support max. 32 groups */
+		if (n != 0)
+			return;
+		vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val;
+	} else {
+		struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+		if (n > vgic_v3_max_apr_idx(vcpu))
+			return;
+		/* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
+		vgicv3->vgic_ap1r[n] = val;
+	}
+}
+
 static const struct vgic_register_region vgic_v2_dist_registers[] = {
 	REGISTER_DESC_WITH_LENGTH(GIC_DIST_CTRL,
 		vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12,
@@ -364,7 +409,7 @@ static const struct vgic_register_region vgic_v2_cpu_registers[] = {
 		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 16,
+		vgic_mmio_read_apr, vgic_mmio_write_apr, 16,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
 		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
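Backing GIC_CPU_ACTIVEPRIO with real accessors is what lets userspace save and restore the active priorities of a GICv2 guest during migration. A rough arm64 userspace sketch using the vgic device-attribute interface; the 0xd0 base offset is the GICC_APR0 location per the GICv2 architecture spec, the device fd is assumed to exist already, and error handling is omitted:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Read one GICC_APRn register of vCPU 'cpu' through the vgic-v2 device fd. */
    static int read_gicc_apr(int vgic_fd, unsigned int cpu, unsigned int n, uint32_t *val)
    {
    	uint64_t reg = ((uint64_t)cpu << KVM_DEV_ARM_VGIC_CPUID_SHIFT) |
    		       ((0xd0 + 4 * n) & KVM_DEV_ARM_VGIC_OFFSET_MASK);
    	struct kvm_device_attr attr = {
    		.group = KVM_DEV_ARM_VGIC_GRP_CPU_REGS,
    		.attr  = reg,
    		.addr  = (uint64_t)(unsigned long)val,
    	};

    	/* The kernel side copies a 32-bit value to/from attr.addr. */
    	return ioctl(vgic_fd, KVM_GET_DEVICE_ATTR, &attr);
    }

Writing the registers back on the destination uses the same encoding with KVM_SET_DEVICE_ATTR; the attr layout is documented in Documentation/virtual/kvm/devices/arm-vgic.txt, which this merge also updates.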
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index bba7fa22a7f7..bf9ceab67c77 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -220,4 +220,20 @@ int vgic_debug_destroy(struct kvm *kvm);
 bool lock_all_vcpus(struct kvm *kvm);
 void unlock_all_vcpus(struct kvm *kvm);
 
+static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
+
+	/*
+	 * num_pri_bits are initialized with HW supported values.
+	 * We can rely safely on num_pri_bits even if VM has not
+	 * restored ICC_CTLR_EL1 before restoring APnR registers.
+	 */
+	switch (cpu_if->num_pri_bits) {
+	case 7: return 3;
+	case 6: return 1;
+	default: return 0;
+	}
+}
+
 #endif
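The switch in vgic_v3_max_apr_idx() follows from the GICv3 rule that each 32-bit ICH_AP1Rn register tracks 32 preemption levels: 7 priority bits give 128 levels and therefore four APRn registers (max index 3), 6 bits give 64 levels and two registers, and 5 bits fit in a single register. A closed-form equivalent, assuming num_pri_bits is in the architectural 5..7 range:

    /* Equivalent closed form for illustration: 2^bits priority levels,
     * 32 active-priority bits per APRn register. */
    static inline int max_apr_idx_from_pri_bits(unsigned int num_pri_bits)
    {
    	unsigned int nr_apr = (1u << num_pri_bits) / 32;	/* 1, 2 or 4 */

    	return nr_apr - 1;					/* 0, 1 or 3 */
    }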
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4d81f6ded88e..6ed1c2021198 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1609,7 +1609,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
 			   struct page **pages, int nr_pages)
 {
 	unsigned long addr;
-	gfn_t entry;
+	gfn_t entry = 0;
 
 	addr = gfn_to_hva_many(slot, gfn, &entry);
 	if (kvm_is_error_hva(addr))
@@ -1928,6 +1928,7 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
 	 * verify that the entire region is valid here.
 	 */
 	while (start_gfn <= end_gfn) {
+		nr_pages_avail = 0;
 		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
 		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
 					   &nr_pages_avail);
@@ -2275,7 +2276,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
 #endif
 }
 
-void kvm_vcpu_on_spin(struct kvm_vcpu *me)
+void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
 	struct kvm *kvm = me->kvm;
 	struct kvm_vcpu *vcpu;
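The new yield_to_kernel_mode flag implements the heuristic from the merge description: when it is set, directed yield skips candidate vCPUs that are running guest user space, since a vCPU spinning on a kernel lock wants to boost a lock holder, and lock holders run in kernel mode. A hypothetical caller is sketched below; the exit-handler name and the in-kernel predicate are placeholders, not the actual arch code, which decides the flag from the spinning vCPU's own mode via its arch-specific helpers:

    #include <linux/kvm_host.h>

    /* Placeholder predicate; a real architecture would inspect the guest's
     * current privilege level (CPL, PSTATE mode, MSR bits, ...) here. */
    static bool demo_vcpu_in_kernel(struct kvm_vcpu *vcpu)
    {
    	return true;
    }

    /* Hypothetical PAUSE/WFE-style exit handler: request the kernel-mode
     * filter only when the spinning vCPU itself was in kernel mode. */
    static int demo_handle_spin_exit(struct kvm_vcpu *vcpu)
    {
    	kvm_vcpu_on_spin(vcpu, demo_vcpu_in_kernel(vcpu));
    	return 1;
    }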
@@ -2306,6 +2307,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 			continue;
 		if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
 			continue;
+		if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
+			continue;
 		if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
 			continue;
 